Commit c712ff68 authored by liyang's avatar liyang

feat:time.sleep()更换为 WebDriverWait

parent 37ffd734
......@@ -35,9 +35,10 @@ def reptile(browser=None, search_word=""):
"""
print(f"搜索词:{search_word}")
base_url = "https://www.dcard.tw"
browser = browser or create(no_headless=True,using_user_data=True)
browser = browser or create(no_headless=False, using_user_data=True)
# 打开网页
# browser.get(base_url)
# time.sleep(3)
browser.get(f"{base_url}/search?query={search_word}")
base_xpath = "//div[@role='main']//div[@data-key]//article"
# 内容块
......
......@@ -13,6 +13,12 @@ import os
from config.settings import get_base_file_url
from config.settings import get_account
import sys
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
# 工具函数-下载图片
'''
......@@ -23,7 +29,7 @@ import sys
def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}")
url = "https://www.facebook.com/"
browser = browser or create(no_headless=True,using_user_data=True)
browser = browser or create(no_headless=False,using_user_data=True)
# 打开网页
browser.get(url)
try:
......@@ -35,15 +41,19 @@ def reptile(browser=None, search_word=""):
# 获取登录按钮
button_login = browser.find_element('xpath', "//button[@name='login']")
button_login.click()
time.sleep(6)
wait = WebDriverWait(browser,10)
wait.until(EC.presence_of_element_located((By.XPATH,"//div[@role='main']")))
except:
print("已登录")
log.debug("facebook login complete")
url = f"https://www.facebook.com/search/top?q={search_word}"
browser.get(url)
# 使用 JavaScript 将网页滚动到底部
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
# 等待内容出现,设置最长等待时间为10秒
wait = WebDriverWait(browser, 10)
# 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='feed']")))
# 内容
element_content_list = browser.find_elements('xpath',
"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
......
......@@ -56,6 +56,7 @@ def reptile(browser=None, search_word=""):
except:
print("------")
# print("1111")
log.debug("instagram login complete")
url = f"{base_url}explore/tags/{search_word}/"
browser.get(url)
wait = WebDriverWait(browser, 10)
......@@ -98,6 +99,7 @@ def reptile(browser=None, search_word=""):
if len(title_str_list) >= 3:
title = title_str_list[1]
else:
# 提取图片中的文字
title = ""
img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
del img_soup["srcset"]
......
......@@ -33,7 +33,7 @@ def reptile(browser=None, search_word=""):
# browser = browser or create()
# 打开网页
browser.get(url)
log.debug("已打开浏览器")
# log.debug("已打开浏览器")
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# log.debug(classify_item_list)
length = len(classify_item_list)
......
......@@ -14,7 +14,12 @@ from utils.download_image import download_image
from config.settings import get_base_file_url
from config.settings import get_account
# 工具函数-下载图片
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
'''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
'''
......@@ -28,32 +33,35 @@ def reptile(browser=None, search_word=""):
"""
print(f"搜索词:{search_word}")
base_url = "https://twitter.com/"
browser = browser or create(no_headless=True,using_user_data=True)
browser = browser or create(no_headless=False, using_user_data=True)
# print(browser)
# 打开网页
browser.get(base_url)
time.sleep(3)
time.sleep(2)
try:
# wait = WebDriverWait(browser, 20)
# wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
# 检测是否要登录
login_input = browser.find_element('xpath', "//input[@autocomplete='username']")
login_input.send_keys(get_account("twitter")["name"])
# 获取下一步按钮
buttons = browser.find_element('xpath', "//div[@role='button'][2]")
buttons.click()
time.sleep(3)
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='current-password']")))
password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
password_input.send_keys(get_account("twitter")["password"])
# # 获取登录按钮
button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
button_login.click()
time.sleep(1)
except:
print("------")
time.sleep(2)
# print("1111")
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
browser.get(url)
time.sleep(4)
wait = WebDriverWait(browser, 10)
wait.until(
EC.presence_of_element_located((By.XPATH, "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")))
base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
# 内容块
element_content_list = browser.find_elements('xpath', base_xpath)
......
......@@ -12,8 +12,14 @@ import os
from config.settings import get_base_file_url
from selenium.webdriver.common.action_chains import ActionChains
import sys
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
def reptile(browser=None, search_word=""):
"""
......@@ -21,12 +27,14 @@ def reptile(browser=None, search_word=""):
:param search_word:
:return:
"""
browser = browser or create(no_headless=True,using_user_data=True)
browser = browser or create(no_headless=False, using_user_data=True)
# print(browser)
# 打开网页
url = f'https://www.youtube.com/results?search_query={search_word}'
browser.get(url)
# time.sleep(2)
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH,"//div[@id='contents']")))
log.debug("youtube login complete")
classify_video_list = browser.find_elements('xpath',
"//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a")
element_author_list = browser.find_elements('xpath',
......@@ -54,6 +62,7 @@ def reptile(browser=None, search_word=""):
# 下载视频
state_download = yt_dlp_download(url, 'youtube')
video_url.append(download_dir)
if state_download:
# 组装数据
obj = {
......@@ -68,8 +77,8 @@ def reptile(browser=None, search_word=""):
}
data.append(obj)
else:
print("")
# print("")
error = ""
if len(data) > 0:
# 保存json文件到本地
# log.debug(os.path.abspath("../"))
......
This diff is collapsed.
......@@ -87,7 +87,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
# options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置
# options.add_argument("--start-maximized") # 最大化窗口
if no_headless == True:
if platform.system() == "Linux" and platform.system() == "Darwin":
if platform.system() == "Linux" or platform.system() == "Darwin":
# 开启无头模式
options.add_argument("-headless")
elif platform.system() == "Windows" and web_browser == "firefox":
......
......@@ -166,19 +166,15 @@ def pytube_download(link, file_dir):
def yt_dlp_download(url, name):
file_dir = os.path.abspath("../")
options = f'-v'
network_options = f'-o "{os.path.join(file_dir, "network-assets-reptile", "reptile_data", name, "%(id)s.%(ext)s")}"'
geo = ""
# --get-url
video_selection = f''
# 清晰度
definition = f'18' # 360p
# definition = f'18' # 360p
# definition = f'18' # 720p
# definition = f'24' # 1080p
download_options = f'-f {definition} -vU'
other_options = f'--verbose'
# f'-f 18 -vU'
download_options = f'-f mp4'
# 要执行的 shell 命令
command = f'yt-dlp {options} {network_options} {geo} {video_selection} {download_options} {other_options} -- {url}'
command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
# 使用 subprocess 调用 shell 命令
result = subprocess.run(command, shell=True, capture_output=True, text=True)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment