Commit c712ff68 authored by liyang

feat:time.sleep()更换为 WebDriverWait

parent 37ffd734
...@@ -35,9 +35,10 @@ def reptile(browser=None, search_word=""): ...@@ -35,9 +35,10 @@ def reptile(browser=None, search_word=""):
""" """
print(f"搜索词:{search_word}") print(f"搜索词:{search_word}")
base_url = "https://www.dcard.tw" base_url = "https://www.dcard.tw"
browser = browser or create(no_headless=True,using_user_data=True) browser = browser or create(no_headless=False, using_user_data=True)
# 打开网页 # 打开网页
# browser.get(base_url) # browser.get(base_url)
# time.sleep(3)
browser.get(f"{base_url}/search?query={search_word}") browser.get(f"{base_url}/search?query={search_word}")
base_xpath = "//div[@role='main']//div[@data-key]//article" base_xpath = "//div[@role='main']//div[@data-key]//article"
# 内容块 # 内容块
......
...@@ -13,6 +13,12 @@ import os ...@@ -13,6 +13,12 @@ import os
from config.settings import get_base_file_url from config.settings import get_base_file_url
from config.settings import get_account from config.settings import get_account
import sys import sys
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
# 工具函数-下载图片 # 工具函数-下载图片
''' '''
...@@ -23,7 +29,7 @@ import sys ...@@ -23,7 +29,7 @@ import sys
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}") print(f"搜索词:{search_word}")
url = "https://www.facebook.com/" url = "https://www.facebook.com/"
browser = browser or create(no_headless=True,using_user_data=True) browser = browser or create(no_headless=False,using_user_data=True)
# 打开网页 # 打开网页
browser.get(url) browser.get(url)
try: try:
...@@ -35,15 +41,19 @@ def reptile(browser=None, search_word=""): ...@@ -35,15 +41,19 @@ def reptile(browser=None, search_word=""):
# 获取登录按钮 # 获取登录按钮
button_login = browser.find_element('xpath', "//button[@name='login']") button_login = browser.find_element('xpath', "//button[@name='login']")
button_login.click() button_login.click()
time.sleep(6) wait = WebDriverWait(browser,10)
wait.until(EC.presence_of_element_located((By.XPATH,"//div[@role='main']")))
except: except:
print("已登录") print("已登录")
log.debug("facebook login complete")
url = f"https://www.facebook.com/search/top?q={search_word}" url = f"https://www.facebook.com/search/top?q={search_word}"
browser.get(url) browser.get(url)
# 使用 JavaScript 将网页滚动到底部 # 使用 JavaScript 将网页滚动到底部
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3) # 等待内容出现,设置最长等待时间为10秒
wait = WebDriverWait(browser, 10)
# 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='feed']")))
# 内容 # 内容
element_content_list = browser.find_elements('xpath', element_content_list = browser.find_elements('xpath',
"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]") "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
......
...@@ -56,6 +56,7 @@ def reptile(browser=None, search_word=""): ...@@ -56,6 +56,7 @@ def reptile(browser=None, search_word=""):
except: except:
print("------") print("------")
# print("1111") # print("1111")
log.debug("instagram login complete")
url = f"{base_url}explore/tags/{search_word}/" url = f"{base_url}explore/tags/{search_word}/"
browser.get(url) browser.get(url)
wait = WebDriverWait(browser, 10) wait = WebDriverWait(browser, 10)
...@@ -98,6 +99,7 @@ def reptile(browser=None, search_word=""): ...@@ -98,6 +99,7 @@ def reptile(browser=None, search_word=""):
if len(title_str_list) >= 3: if len(title_str_list) >= 3:
title = title_str_list[1] title = title_str_list[1]
else: else:
# 提取图片中的文字
title = "" title = ""
img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img") img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
del img_soup["srcset"] del img_soup["srcset"]
......
...@@ -33,7 +33,7 @@ def reptile(browser=None, search_word=""): ...@@ -33,7 +33,7 @@ def reptile(browser=None, search_word=""):
# browser = browser or create() # browser = browser or create()
# 打开网页 # 打开网页
browser.get(url) browser.get(url)
log.debug("已打开浏览器") # log.debug("已打开浏览器")
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# log.debug(classify_item_list) # log.debug(classify_item_list)
length = len(classify_item_list) length = len(classify_item_list)
......
...@@ -14,7 +14,12 @@ from utils.download_image import download_image ...@@ -14,7 +14,12 @@ from utils.download_image import download_image
from config.settings import get_base_file_url from config.settings import get_base_file_url
from config.settings import get_account from config.settings import get_account
# 工具函数-下载图片 # --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
''' '''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。 打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
''' '''
...@@ -28,32 +33,35 @@ def reptile(browser=None, search_word=""): ...@@ -28,32 +33,35 @@ def reptile(browser=None, search_word=""):
""" """
print(f"搜索词:{search_word}") print(f"搜索词:{search_word}")
base_url = "https://twitter.com/" base_url = "https://twitter.com/"
browser = browser or create(no_headless=True,using_user_data=True) browser = browser or create(no_headless=False, using_user_data=True)
# print(browser) # print(browser)
# 打开网页 # 打开网页
browser.get(base_url) browser.get(base_url)
time.sleep(3) time.sleep(2)
try: try:
# wait = WebDriverWait(browser, 20)
# wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
# 检测是否要登录 # 检测是否要登录
login_input = browser.find_element('xpath', "//input[@autocomplete='username']") login_input = browser.find_element('xpath', "//input[@autocomplete='username']")
login_input.send_keys(get_account("twitter")["name"]) login_input.send_keys(get_account("twitter")["name"])
# 获取下一步按钮 # 获取下一步按钮
buttons = browser.find_element('xpath', "//div[@role='button'][2]") buttons = browser.find_element('xpath', "//div[@role='button'][2]")
buttons.click() buttons.click()
time.sleep(3) wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='current-password']")))
password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']") password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
password_input.send_keys(get_account("twitter")["password"]) password_input.send_keys(get_account("twitter")["password"])
# # 获取登录按钮 # # 获取登录按钮
button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']") button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
button_login.click() button_login.click()
time.sleep(1)
except: except:
print("------") print("------")
time.sleep(2)
# print("1111")
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query' url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
browser.get(url) browser.get(url)
time.sleep(4) wait = WebDriverWait(browser, 10)
wait.until(
EC.presence_of_element_located((By.XPATH, "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")))
base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]" base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
# 内容块 # 内容块
element_content_list = browser.find_elements('xpath', base_xpath) element_content_list = browser.find_elements('xpath', base_xpath)
......
...@@ -12,8 +12,14 @@ import os ...@@ -12,8 +12,14 @@ import os
from config.settings import get_base_file_url from config.settings import get_base_file_url
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
import sys import sys
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
""" """
...@@ -21,12 +27,14 @@ def reptile(browser=None, search_word=""): ...@@ -21,12 +27,14 @@ def reptile(browser=None, search_word=""):
:param search_word: :param search_word:
:return: :return:
""" """
browser = browser or create(no_headless=True,using_user_data=True) browser = browser or create(no_headless=False, using_user_data=True)
# print(browser) # print(browser)
# 打开网页 # 打开网页
url = f'https://www.youtube.com/results?search_query={search_word}' url = f'https://www.youtube.com/results?search_query={search_word}'
browser.get(url) browser.get(url)
# time.sleep(2) wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH,"//div[@id='contents']")))
log.debug("youtube login complete")
classify_video_list = browser.find_elements('xpath', classify_video_list = browser.find_elements('xpath',
"//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a") "//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a")
element_author_list = browser.find_elements('xpath', element_author_list = browser.find_elements('xpath',
...@@ -54,6 +62,7 @@ def reptile(browser=None, search_word=""): ...@@ -54,6 +62,7 @@ def reptile(browser=None, search_word=""):
# 下载视频 # 下载视频
state_download = yt_dlp_download(url, 'youtube') state_download = yt_dlp_download(url, 'youtube')
video_url.append(download_dir) video_url.append(download_dir)
if state_download: if state_download:
# 组装数据 # 组装数据
obj = { obj = {
...@@ -68,8 +77,8 @@ def reptile(browser=None, search_word=""): ...@@ -68,8 +77,8 @@ def reptile(browser=None, search_word=""):
} }
data.append(obj) data.append(obj)
else: else:
print("") # print("")
error = ""
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) # log.debug(os.path.abspath("../"))
......
This diff is collapsed.
...@@ -87,7 +87,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi ...@@ -87,7 +87,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
# options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置 # options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置
# options.add_argument("--start-maximized") # 最大化窗口 # options.add_argument("--start-maximized") # 最大化窗口
if no_headless == True: if no_headless == True:
if platform.system() == "Linux" and platform.system() == "Darwin": if platform.system() == "Linux" or platform.system() == "Darwin":
# 开启无头模式 # 开启无头模式
options.add_argument("-headless") options.add_argument("-headless")
elif platform.system() == "Windows" and web_browser == "firefox": elif platform.system() == "Windows" and web_browser == "firefox":
......
...@@ -166,19 +166,15 @@ def pytube_download(link, file_dir): ...@@ -166,19 +166,15 @@ def pytube_download(link, file_dir):
def yt_dlp_download(url, name): def yt_dlp_download(url, name):
file_dir = os.path.abspath("../") file_dir = os.path.abspath("../")
options = f'-v'
network_options = f'-o "{os.path.join(file_dir, "network-assets-reptile", "reptile_data", name, "%(id)s.%(ext)s")}"' network_options = f'-o "{os.path.join(file_dir, "network-assets-reptile", "reptile_data", name, "%(id)s.%(ext)s")}"'
geo = ""
# --get-url
video_selection = f''
# 清晰度 # 清晰度
definition = f'18' # 360p # definition = f'18' # 360p
# definition = f'18' # 720p # definition = f'18' # 720p
# definition = f'24' # 1080p # definition = f'24' # 1080p
download_options = f'-f {definition} -vU' # f'-f 18 -vU'
other_options = f'--verbose' download_options = f'-f mp4'
# 要执行的 shell 命令 # 要执行的 shell 命令
command = f'yt-dlp {options} {network_options} {geo} {video_selection} {download_options} {other_options} -- {url}' command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
# 使用 subprocess 调用 shell 命令 # 使用 subprocess 调用 shell 命令
result = subprocess.run(command, shell=True, capture_output=True, text=True) result = subprocess.run(command, shell=True, capture_output=True, text=True)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment