Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
07abae43
Commit
07abae43
authored
Aug 10, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:dcard debug
parent
b2ea4b6d
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
22 additions
and
22 deletions
+22
-22
pc_dcard.py
pc_dcard.py
+22
-22
No files found.
pc_dcard.py
View file @
07abae43
...
@@ -6,7 +6,7 @@ from utils.createBrowserDriver import create
...
@@ -6,7 +6,7 @@ from utils.createBrowserDriver import create
from
utils.filse
import
save_json
from
utils.filse
import
save_json
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
utils.index
import
convert_to_traditional
,
yt_dlp_download
,
convert_string_to_time
,
parse_twitter_time_string
,
\
from
utils.index
import
convert_to_traditional
,
yt_dlp_download
,
convert_string_to_time
,
parse_twitter_time_string
,
\
is_base64_image
,
save_base64_image
,
get_screen_resolution
,
create_directory_if_not_exists
,
delete_directory
is_base64_image
,
save_base64_image
,
get_screen_resolution
,
create_directory_if_not_exists
,
delete_directory
# from pytube import YouTube
# from pytube import YouTube
from
selenium.common.exceptions
import
NoSuchElementException
from
selenium.common.exceptions
import
NoSuchElementException
import
os
import
os
...
@@ -35,55 +35,56 @@ def reptile(browser=None, search_word=""):
...
@@ -35,55 +35,56 @@ def reptile(browser=None, search_word=""):
"""
"""
print
(
f
"搜索词:{search_word}"
)
print
(
f
"搜索词:{search_word}"
)
base_url
=
"https://www.dcard.tw"
base_url
=
"https://www.dcard.tw"
browser
=
browser
or
create
(
no_headless
=
False
,
using_user_data
=
True
)
# Chrome 无痕模式选项
# option=["--incognito"]
browser
=
browser
or
create
(
no_headless
=
False
,
using_user_data
=
False
)
# 打开网页
# 打开网页
browser
.
get
(
f
"{base_url}/search?query={search_word}"
)
browser
.
get
(
f
"{base_url}/search?query={search_word}"
)
time
.
sleep
(
6
)
time
.
sleep
(
6
)
# 滚动一页
# 滚动一页
# 使用 JavaScript 将网页滚动到底部
# 使用 JavaScript 将网页滚动到底部
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(6)
# time.sleep(3)
base_xpath
=
"//div[@role='main']//div[@data-key]//article"
# 内容块
# 内容块
element_content_list
=
browser
.
find_elements
(
'xpath'
,
base_xpath
)
element_content_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='main']//article"
)
# 作者
element_authors_list
=
browser
.
find_elements
(
'xpath'
,
f
"{base_xpath}/div[1]/div[1]/div[2]/div/div[1]"
)
# 时间
element_time_list
=
browser
.
find_elements
(
'xpath'
,
f
"{base_xpath}/div[1]/div[1]/div[2]/div/div[2]/time"
)
# 标题
element_title_list
=
browser
.
find_elements
(
'xpath'
,
f
"{base_xpath}//h2"
)
# 点赞
# element_like_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[1]/div/div[2]")
# 评论
# element_comment_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[2]/div/span")
for
index
,
item
in
enumerate
(
element_content_list
):
for
index
,
item
in
enumerate
(
element_content_list
):
# 点赞
# element_like_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[1]/div/div[2]")
# 评论
# element_comment_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[2]/div/span")
# 时间
# element_time_list = browser.find_elements('xpath', f"{base_xpath}/div[1]/div[1]/div[2]/div/div[2]/time")
# 提取时间,并转为时间戳
# 提取时间,并转为时间戳
timestamp
=
datetime
.
fromisoformat
(
element_time_list
[
index
]
.
get_attribute
(
"datetime"
)[:
-
1
])
.
timestamp
()
tag
=
item
.
find_element
(
'xpath'
,
".//time"
)
timestamp_str
=
tag
.
get_attribute
(
"datetime"
)[:
-
1
]
timestamp
=
datetime
.
fromisoformat
(
timestamp_str
)
.
timestamp
()
# 过滤时间
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime
=
int
(
timestamp
)
new_releaseTime
=
int
(
timestamp
)
print
(
f
"开始时间:{beginFiltrationTime};结束时间:{endFiltrationTime};当前时间:{new_releaseTime}"
)
if
new_releaseTime
<
beginFiltrationTime
or
new_releaseTime
>
endFiltrationTime
:
if
new_releaseTime
<
beginFiltrationTime
or
new_releaseTime
>
endFiltrationTime
:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
continue
# 提取作者
# 提取作者
author
=
element_authors_list
[
index
]
.
text
author
=
item
.
find_element
(
'xpath'
,
f
"./div[1]/div[1]/div[2]/div/div[1]"
)
# 提取标题
# 提取标题
title
=
element_title_list
[
index
]
.
text
title
=
item
.
find_element
(
'xpath'
,
f
".//h2"
)
# 提取点赞
# 提取点赞
# like = element_like_list[index].text
# like = element_like_list[index].text
# 提取评论
# 提取评论
# comment = element_comment_list[index].text
# comment = element_comment_list[index].text
# -------------提取内容---------------
# -------------提取内容---------------
element_content_list
[
index
]
.
click
()
item
.
click
()
# browser.execute_script("arguments[0].click();", item)
# 等待弹窗内容出现,设置最长等待时间为10秒
# 等待弹窗内容出现,设置最长等待时间为10秒
wait
=
WebDriverWait
(
browser
,
10
)
wait
=
WebDriverWait
(
browser
,
10
)
# 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
# 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@data-testid='overlay']"
)))
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@data-testid='overlay']"
)))
time
.
sleep
(
3
)
#
time.sleep(3)
click_dom
=
browser
.
find_element
(
"xpath"
,
click_dom
=
browser
.
find_element
(
"xpath"
,
"//div[@data-testid='overlay']"
)
"//div[@data-testid='overlay']"
)
# 处理弹窗内容加载失败的情况
# 处理弹窗内容加载失败的情况
...
@@ -221,7 +222,6 @@ def script_close(browser):
...
@@ -221,7 +222,6 @@ def script_close(browser):
print
(
"sys.exit() 执行失败"
)
print
(
"sys.exit() 执行失败"
)
def
main
():
def
main
():
"""
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment