Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
3a1860d4
Commit
3a1860d4
authored
Jul 11, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:爬取数据入库
parent
2531d166
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
4 deletions
+12
-4
pc_ptt.py
pc_ptt.py
+12
-4
No files found.
pc_ptt.py
View file @
3a1860d4
...
@@ -7,7 +7,7 @@ import loguru
...
@@ -7,7 +7,7 @@ import loguru
import
pymysql.cursors
import
pymysql.cursors
import
requests
import
requests
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
datetime
import
datetime
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
utils.Logger
import
log
from
utils.Logger
import
log
# from requests_toolbelt import *
# from requests_toolbelt import *
...
@@ -105,6 +105,15 @@ def reptile(browser=None, search_word=""):
...
@@ -105,6 +105,15 @@ def reptile(browser=None, search_word=""):
# 作者
# 作者
element_author
=
browser
.
find_element
(
'xpath'
,
element_author
=
browser
.
find_element
(
'xpath'
,
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]"
)
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]"
)
# 发布时间
element_release
=
browser
.
find_element
(
'xpath'
,
"//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]"
)
date_string
=
element_release
.
text
date_format
=
"
%
a
%
b
%
d
%
H:
%
M:
%
S
%
Y"
# 将日期字符串转换为datetime对象
date_time
=
datetime
.
strptime
(
date_string
,
date_format
)
# 将datetime对象转换为时间戳(以秒为单位)
release_time
=
int
(
date_time
.
timestamp
())
log
.
debug
(
'开始判断类型'
)
log
.
debug
(
'开始判断类型'
)
# ---------------- 判断类型 start ----------
# ---------------- 判断类型 start ----------
# 查找所有img标签
# 查找所有img标签
...
@@ -133,8 +142,6 @@ def reptile(browser=None, search_word=""):
...
@@ -133,8 +142,6 @@ def reptile(browser=None, search_word=""):
for
span
in
span_element
:
for
span
in
span_element
:
span
.
extract
()
span
.
extract
()
html
=
soup
.
prettify
()
.
replace
(
'amp;'
,
''
)
html
=
soup
.
prettify
()
.
replace
(
'amp;'
,
''
)
print
(
html
)
print
(
"aaaaa"
)
# ------------------ content 过滤 end--------------
# ------------------ content 过滤 end--------------
# --------------- 组装数据 start---------------------
# --------------- 组装数据 start---------------------
...
@@ -144,7 +151,8 @@ def reptile(browser=None, search_word=""):
...
@@ -144,7 +151,8 @@ def reptile(browser=None, search_word=""):
"link"
:
browser_current_url
,
"link"
:
browser_current_url
,
"reptileTime"
:
str
(
int
(
time
.
time
())),
"reptileTime"
:
str
(
int
(
time
.
time
())),
"type"
:
content_type
,
"type"
:
content_type
,
"author"
:
element_author
.
text
"author"
:
element_author
.
text
,
"releaseTime"
:
release_time
}
}
# --------------- 组装数据 end---------------------
# --------------- 组装数据 end---------------------
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment