Commit 07abae43 authored by liyang's avatar liyang

feat: dcard debug

parent b2ea4b6d
......@@ -6,7 +6,7 @@ from utils.createBrowserDriver import create
from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string, \
is_base64_image, save_base64_image, get_screen_resolution,create_directory_if_not_exists, delete_directory
is_base64_image, save_base64_image, get_screen_resolution, create_directory_if_not_exists, delete_directory
# from pytube import YouTube
from selenium.common.exceptions import NoSuchElementException
import os
......@@ -35,55 +35,56 @@ def reptile(browser=None, search_word=""):
"""
print(f"搜索词:{search_word}")
base_url = "https://www.dcard.tw"
browser = browser or create(no_headless=False, using_user_data=True)
# Chrome 无痕模式选项
# option=["--incognito"]
browser = browser or create(no_headless=False, using_user_data=False)
# 打开网页
browser.get(f"{base_url}/search?query={search_word}")
time.sleep(6)
# 滚动一页
# 使用 JavaScript 将网页滚动到底部
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(6)
base_xpath = "//div[@role='main']//div[@data-key]//article"
# time.sleep(3)
# 内容块
element_content_list = browser.find_elements('xpath', base_xpath)
# 作者
element_authors_list = browser.find_elements('xpath', f"{base_xpath}/div[1]/div[1]/div[2]/div/div[1]")
# 时间
element_time_list = browser.find_elements('xpath', f"{base_xpath}/div[1]/div[1]/div[2]/div/div[2]/time")
# 标题
element_title_list = browser.find_elements('xpath', f"{base_xpath}//h2")
# 点赞
# element_like_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[1]/div/div[2]")
# 评论
# element_comment_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[2]/div/span")
element_content_list = browser.find_elements('xpath', "//div[@role='main']//article")
for index,item in enumerate(element_content_list):
for index, item in enumerate(element_content_list):
# 点赞
# element_like_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[1]/div/div[2]")
# 评论
# element_comment_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[2]/div/span")
# 时间
# element_time_list = browser.find_elements('xpath', f"{base_xpath}/div[1]/div[1]/div[2]/div/div[2]/time")
# 提取时间,并转为时间戳
timestamp = datetime.fromisoformat(element_time_list[index].get_attribute("datetime")[:-1]).timestamp()
tag = item.find_element('xpath', ".//time")
timestamp_str = tag.get_attribute("datetime")[:-1]
timestamp = datetime.fromisoformat(timestamp_str).timestamp()
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(timestamp)
print(f"开始时间:{beginFiltrationTime};结束时间:{endFiltrationTime};当前时间:{new_releaseTime}")
if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
# 提取作者
author = element_authors_list[index].text
author = item.find_element('xpath', f"./div[1]/div[1]/div[2]/div/div[1]")
# 提取标题
title = element_title_list[index].text
title = item.find_element('xpath', f".//h2")
# 提取点赞
# like = element_like_list[index].text
# 提取评论
# comment = element_comment_list[index].text
# -------------提取内容---------------
element_content_list[index].click()
item.click()
# browser.execute_script("arguments[0].click();", item)
# 等待弹窗内容出现,设置最长等待时间为10秒
wait = WebDriverWait(browser, 10)
# 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@data-testid='overlay']")))
time.sleep(3)
# time.sleep(3)
click_dom = browser.find_element("xpath",
"//div[@data-testid='overlay']")
# 处理弹窗内容加载失败的情况
......@@ -221,7 +222,6 @@ def script_close(browser):
print("sys.exit() 执行失败")
def main():
"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment