import json
import platform
import time
from bs4 import BeautifulSoup
from utils.Logger import log
from utils.createBrowserDriver import create
from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, \
    parse_time_string, create_directory_if_not_exists, delete_directory
# from pytube import YouTube
from datetime import datetime
from utils.download_image import download_image
import os
from config.settings import get_base_file_url
from config.settings import get_account
import sys
# --------------- selenium dependencies: start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium dependencies: end ----------------

'''
Open Facebook with Selenium and log in if the login form is shown. Load the
search results page for the given keyword, scroll to the bottom to trigger
lazy loading, expand truncated posts, and then parse the author, publish
time, text and media of every item in the feed.
'''


def reptile(browser=None, search_word=""):
    print(f"search keyword: {search_word}")
    url = "https://www.facebook.com/"
    browser = browser or create(no_headless=False, using_user_data=True)
    # Open the page
    browser.get(url)
    time.sleep(2)
    try:
        # time.sleep(3)
        # Check whether a login is required
        login_input = browser.find_element('xpath', "//input[@name='email']")
        password_input = browser.find_element('xpath', "//input[@name='pass']")
        login_input.send_keys(get_account("facebook")["name"])
        password_input.send_keys(get_account("facebook")["password"])
        # Get the login button
        button_login = browser.find_element('xpath', "//button[@name='login']")
        button_login.click()
        time.sleep(3)
    except Exception:
        # The login form is absent, so the saved session is still valid
        print("already logged in")
    log.debug("facebook login complete")

    url = f"https://www.facebook.com/search/top?q={search_word}"
    browser.get(url)
    time.sleep(2)
    # Scroll to the bottom of the page with JavaScript to trigger lazy loading
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(6)
    # Wait for the content to appear, with a maximum wait of 10 seconds
    wait = WebDriverWait(browser, 10)
    # Define the wait condition via expected_conditions: the feed container must be present
    wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='feed']")))
    # Post content
    element_content_list = browser.find_elements(
        'xpath',
        "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
    # Authors
    element_authors_list = browser.find_elements(
        'xpath',
        "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
    # Publish time
    element_release_list = browser.find_elements(
        'xpath',
        "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
    # Find all "展开" (expand) buttons and click each one before collecting the content
    elements_expand_list = browser.find_elements(
        'xpath',
        "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
    for element in elements_expand_list:
        try:
            # Perform the click via JavaScript
            browser.execute_script("arguments[0].click();", element)
        except Exception as e:
            print("Clicking element failed: " + str(e))
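    # ------------------------------------------------------------------
    # Per-post extraction loop (below). For each feed item collected above:
    #   1. parse the author name and the relative publish-time label out of
    #      the element HTML with BeautifulSoup,
    #   2. convert that label to a Unix timestamp via parse_time_string,
    #   3. skip posts outside [beginFiltrationTime, endFiltrationTime]
    #      (assumed here to be module-level Unix timestamps set elsewhere
    #      in this file by the task configuration),
    #   4. collect the post text and any <video>/<img> media for later use.
    # ------------------------------------------------------------------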
    length = len(element_content_list)
    for index in range(length):
        author_soup = BeautifulSoup(element_authors_list[index].get_attribute("outerHTML"), "html.parser")
        time_soup = BeautifulSoup(element_release_list[index].get_attribute("outerHTML"), "html.parser")
        # author = element_authors_list[index].text
        author = author_soup.find_all("a")[0].text
        time_text = time_soup.find_all("a")[0].text
        release_time_timestamp = int(parse_time_string(time_text))
        release_time = str(release_time_timestamp)
        # Time filtering
        # Convert 'releaseTime' to an integer if it is not one already
        new_releaseTime = int(release_time)
        if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
            # Skip this item if 'new_releaseTime' falls outside the configured range
            continue
        text = element_content_list[index].text
        soup = BeautifulSoup(element_content_list[index].get_attribute('outerHTML'), 'html.parser')
        soup_str = soup.prettify()
        # Check whether the post contains a video
        video_list = soup.find_all("video")
        image_list = soup.find_all("img")
        # lth = len(ignore_list)
        if len(video_list) > 0:
            # for key,element in enumerate(video_list):
            # remove the second child element
            # find the element that contains two