Project: liyang/network-assets-reptile

Commit 2732252c, authored Jul 28, 2023 by liyang
Commit message: fix:ptt执行效率 (fix: PTT crawler execution efficiency)
Parent: f71f84a0

Showing 1 changed file with 16 additions and 15 deletions:

pc_ptt.py (+16, -15)
@@ -33,17 +33,15 @@ from selenium.webdriver.support import expected_conditions as EC
 def reptile(browser=None, search_word=""):
     url = "https://www.ptt.cc/bbs/hotboards.html"
-    browser = browser or create(no_headless=True, using_user_data=True)
+    browser = browser or create(no_headless=False, using_user_data=True)
     # run in headed mode
     # browser = browser or create()
     # open the page
     browser.get(url)
     # log.debug("已打开浏览器")
     classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
     # log.debug(classify_item_list)
     # classify_item_list = item_list.copy()
-    length = len(classify_item_list)
-    for index in range(length):
+    for index, item_element in enumerate(classify_item_list):
         # for now, crawl only the 2nd category
         if 0 <= index < 4:
             type_title = classify_item_list[index].text
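This hunk flips the `no_headless` flag passed to the project's `create()` helper and replaces the `range(len(...))` loop over the board categories with `enumerate()`. A minimal sketch of that loop refactor, using placeholder board names rather than anything scraped from PTT:

    # Illustrative only: the board names are placeholders, not the repo's data.
    classify_item_list = ["Stock", "C_Chat", "NBA", "Baseball", "movie"]

    # Before: a separate length variable and index-based access.
    length = len(classify_item_list)
    for index in range(length):
        if 0 <= index < 4:
            type_title = classify_item_list[index]

    # After: enumerate() yields the index and the element together,
    # so the helper variable disappears.
    for index, item_element in enumerate(classify_item_list):
        if 0 <= index < 4:
            type_title = item_element

The committed code still reads `classify_item_list[index].text` inside the body, so the change mainly drops the intermediate `length` variable rather than the indexing itself.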
@@ -62,23 +60,23 @@ def reptile(browser=None, search_word=""):
         wait = WebDriverWait(browser, 10)
         wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
         element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
-        length_two = len(element_list)
-        for index_two in range(length_two):
+        for index_two, item in enumerate(element_list):
             # print(element_list[index_two].text)
             try:
-                re.findall("公告", element_list[index_two].text)
+                re.findall("公告", item.text)
             except IndexError:
                 log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
-                print("当前连接:" + str(browser.current_url))
-                print(data[len(data) - 1]["title"])
+                print("当前连接:" + str(browser.current_url))
+                print(data[len(data) - 1]["title"])
             # use a regular expression to match the keyword
-            if re.findall(search_word, element_list[index_two].text):
+            if re.findall(search_word, item.text):
                 # log.debug(f"找到了匹配的字符串:{matches}")
                 error = ""
             else:
                 # log.debug("未找到匹配的字符串")
-                # skip this iteration and move on to the next one
+                # the post title did not match the keyword; skip this iteration and move on to the next one
                 continue
             # the title must not contain "公告" or "看板"
             if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
                 a = 1
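The wait on `//div[@class='r-ent']` above is Selenium's explicit-wait pattern: block until the board's post rows exist before collecting the title links and filtering them by keyword. A self-contained sketch of that pattern; the headless Chrome setup, the Stock board URL, and the sample keyword are assumptions for illustration, since the project builds its driver through its own `create()` helper:

    import re

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")                     # assumption: plain headless Chrome
    browser = webdriver.Chrome(options=options)
    browser.get("https://www.ptt.cc/bbs/Stock/index.html")     # example board index page

    # Block for up to 10 seconds until at least one post row is present in the DOM.
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']"))
    )

    search_word = "台積電"   # placeholder keyword
    titles = [
        a.text
        for a in browser.find_elements(By.XPATH, "//div[@class='r-ent']//div[@class='title']//a")
        if re.findall(search_word, a.text) and not re.findall("公告", a.text)
    ]
    print(titles)
    browser.quit()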
@@ -201,10 +199,13 @@ def reptile(browser=None, search_word=""):
                    "picture_url": ",".join(picture_url)
                }
                # --------------- assemble data: end ---------------------
                data.append(obj)
                # go back to the previous page
                browser.back()
                time.sleep(0.1)
+                element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
            print("循环结束")
+        # go back to the previous page
+        browser.back()
        if index == 0:
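The added line re-queries `element_list` after `browser.back()`, which is the substance of this hunk: WebElement handles collected before a navigation go stale once the page changes, so later indexed access has to work from a fresh query. A sketch of that pattern under the same assumptions as above (plain Chrome driver, Stock board URL):

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    TITLE_XPATH = "//div[@class='r-ent']//div[@class='title']//a"

    browser = webdriver.Chrome()
    browser.get("https://www.ptt.cc/bbs/Stock/index.html")

    element_list = browser.find_elements(By.XPATH, TITLE_XPATH)
    for index_two in range(len(element_list)):
        element_list[index_two].click()   # opening the post loads a new page
        # ... scrape the post here ...
        browser.back()                    # return to the board index
        # Re-fetch after navigating back: without this, the next iteration would
        # touch handles from the previous DOM and raise StaleElementReferenceException.
        element_list = browser.find_elements(By.XPATH, TITLE_XPATH)
    browser.quit()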
@@ -224,11 +225,11 @@ def reptile(browser=None, search_word=""):
            log.debug('save file success')
        else:
            log.debug('save file failed')
-        # script_close(browser)
+        script_close(browser)
    else:
        # no data was crawled
        log.info("未爬取到数据")
-        # script_close(browser)
+        script_close(browser)


 def script_close(browser):
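Both branches now call `script_close(browser)` unconditionally instead of leaving the call commented out, so the WebDriver session is released whether or not any data was crawled. The helper's body is not part of this diff; a hypothetical sketch, assuming it simply tears the driver down:

    def script_close(browser):
        # Hypothetical implementation: the real body lives elsewhere in pc_ptt.py.
        try:
            browser.quit()   # closes all windows and ends the driver process
        except Exception as exc:
            print(f"failed to close browser: {exc}")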