Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
115c9564
Commit
115c9564
authored
Jul 11, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:爬取数据入库
parent
4b5870d4
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
24 additions
and
17 deletions
+24
-17
pc_ptt.py
pc_ptt.py
+24
-17
No files found.
pc_ptt.py
View file @
115c9564
...
...
@@ -61,9 +61,9 @@ def write_to_database(data):
def
reptile
(
browser
=
None
,
search_word
=
""
):
url
=
"https://www.ptt.cc/bbs/hotboards.html"
# 无头模式执行
#
browser = browser or create(['--headless'])
browser
=
browser
or
create
([
'--headless'
])
# 有头模式执行
browser
=
browser
or
create
()
#
browser = browser or create()
# 打开网页
browser
.
get
(
url
)
log
.
debug
(
"已打开浏览器"
)
...
...
@@ -130,20 +130,27 @@ def reptile(browser=None, search_word=""):
# ---------------- 判断类型 end ----------
log
.
debug
(
'开始内容过滤'
)
# ------------------ content 过滤 start--------------
# 查找所有的<a>标签
a_tags
=
soup
.
find_all
(
'a'
,
href
=
True
)
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for
tag
in
a_tags
:
tag
.
decompose
()
# 找到所有第一级标签为 `div` 的元素
div_elements
=
soup
.
find_all
(
'div'
)
# 逐个删除这些元素
for
div
in
div_elements
:
div
.
extract
()
# 删除第一级span
span_element
=
soup
.
find_all
(
'span'
)
for
span
in
span_element
:
span
.
extract
()
try
:
# 查找所有的<a>标签
a_tags
=
soup
.
find_all
(
'a'
,
href
=
True
)
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for
tag
in
a_tags
:
tag
.
decompose
()
except
:
log
.
debug
(
"查找所有的<a>标签失败"
)
try
:
# 找到所有第一级标签为 `div` 的元素
div_elements
=
soup
.
find_all
(
'div'
)
# 逐个删除这些元素
for
div
in
div_elements
:
div
.
extract
()
# 删除第一级span
span_element
=
soup
.
find_all
(
'span'
)
for
span
in
span_element
:
span
.
extract
()
except
:
log
.
debug
(
"删除第一级div失败"
)
html
=
soup
.
prettify
()
.
replace
(
'amp;'
,
''
)
# ------------------ content 过滤 end--------------
...
...
@@ -162,7 +169,7 @@ def reptile(browser=None, search_word=""):
data
.
append
(
obj
)
# 使用正则表达式进行匹配
# matches = re.findall(search_word, element_title.text)
#
#
打印匹配结果
# 打印匹配结果
# if matches:
# # log.debug(f"找到了匹配的字符串:{matches}")
# data.append(obj)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment