Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
cc26717b
Commit
cc26717b
authored
Jul 26, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:ptt 后处理优化
parent
c712ff68
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
18 additions
and
2 deletions
+18
-2
pc_ptt.py
pc_ptt.py
+18
-2
No files found.
pc_ptt.py
View file @
cc26717b
...
...
@@ -18,7 +18,12 @@ from utils.filse import save_json
import
os
from
config.settings
import
get_base_file_url
from
utils.download_image
import
download_image
# --------------- selenium 依赖 start ----------------
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
# --------------- selenium 依赖 end ----------------
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
...
...
@@ -28,7 +33,7 @@ from utils.download_image import download_image
def
reptile
(
browser
=
None
,
search_word
=
""
):
url
=
"https://www.ptt.cc/bbs/hotboards.html"
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
# 有头模式执行
# browser = browser or create()
# 打开网页
...
...
@@ -39,10 +44,18 @@ def reptile(browser=None, search_word=""):
length
=
len
(
classify_item_list
)
for
index
in
range
(
length
):
# 暂时先爬取 第2个 分类
if
0
<
index
<
4
:
if
0
<
=
index
<
4
:
type_title
=
classify_item_list
[
index
]
.
text
classify_item_list
[
index
]
.
click
()
time
.
sleep
(
0.1
)
if
index
==
0
:
try
:
button
=
browser
.
find_element
(
"xpath"
,
"//form/div[1]//button"
)
button
.
click
()
except
:
error
=
""
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@class='r-ent']"
)))
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
length_two
=
len
(
element_list
)
for
index_two
in
range
(
length_two
):
...
...
@@ -183,6 +196,8 @@ def reptile(browser=None, search_word=""):
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
# 浏览器返回上一页
browser
.
back
()
if
index
==
0
:
browser
.
back
()
time
.
sleep
(
0.1
)
# 重新获取
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
...
...
@@ -214,6 +229,7 @@ def script_close(browser):
log
.
debug
(
"浏览器驱动关闭失败"
)
sys
.
exit
()
def
main
():
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment