Commit 45591aa3 authored by liyang's avatar liyang

fix: selenium 驱动配置

parent 1b3d2164
...@@ -81,6 +81,7 @@ def reptile(browser=None, search_word=""): ...@@ -81,6 +81,7 @@ def reptile(browser=None, search_word=""):
image_list = soup.find_all("img") image_list = soup.find_all("img")
# lth = len(ignore_list) # lth = len(ignore_list)
if len(video_list) > 0: if len(video_list) > 0:
# for key,element in enumerate(video_list):
# 删除第二个子元素 # 删除第二个子元素
# 找到包含两个 <div> 元素的父级元素 # 找到包含两个 <div> 元素的父级元素
parent_div = soup.find('div') parent_div = soup.find('div')
......
...@@ -27,7 +27,7 @@ import os ...@@ -27,7 +27,7 @@ import os
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html" url = "https://www.ptt.cc/bbs/hotboards.html"
# 无头模式执行 # 无头模式执行
browser = browser or create(['--headless']) browser = browser or create(['--headless'],False)
# 有头模式执行 # 有头模式执行
# browser = browser or create() # browser = browser or create()
# 打开网页 # 打开网页
......
...@@ -9,6 +9,7 @@ from utils.index import convert_to_traditional, yt_dlp_download, convert_string_ ...@@ -9,6 +9,7 @@ from utils.index import convert_to_traditional, yt_dlp_download, convert_string_
# from pytube import YouTube # from pytube import YouTube
import os import os
from datetime import datetime from datetime import datetime
from utils.download_image import download_image
from config.settings import get_base_file_url from config.settings import get_base_file_url
# 工具函数-下载图片 # 工具函数-下载图片
...@@ -21,7 +22,7 @@ def reptile(browser=None, search_word=""): ...@@ -21,7 +22,7 @@ def reptile(browser=None, search_word=""):
base_url = "https://twitter.com/" base_url = "https://twitter.com/"
option = ['--headless'] option = ['--headless']
# ['--headless'] # ['--headless']
browser = browser or create(None, False) browser = browser or create(None, True)
# print(browser) # print(browser)
# 打开网页 # 打开网页
browser.get(base_url) browser.get(base_url)
...@@ -36,12 +37,13 @@ def reptile(browser=None, search_word=""): ...@@ -36,12 +37,13 @@ def reptile(browser=None, search_word=""):
time.sleep(3) time.sleep(3)
password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']") password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
password_input.send_keys("liyang19970814") password_input.send_keys("liyang19970814")
# 获取登录按钮 # # 获取登录按钮
button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']") button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
button_login.click() button_login.click()
except: except:
print("------") print("------")
time.sleep(2) time.sleep(2)
# print("1111")
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query' url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
browser.get(url) browser.get(url)
time.sleep(4) time.sleep(4)
...@@ -52,16 +54,10 @@ def reptile(browser=None, search_word=""): ...@@ -52,16 +54,10 @@ def reptile(browser=None, search_word=""):
element_content_list = browser.find_elements('xpath',base_xpath) element_content_list = browser.find_elements('xpath',base_xpath)
# 作者 # 作者
element_authors_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']") element_authors_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
# time.sleep(2)
# 发布时间
# element_release_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time")
# time_a_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time/..")
# print(element_content_list)
length = len(element_authors_list) length = len(element_authors_list)
for index in range(length): for index in range(length):
# print(index) # print(index)
content = element_content_list[index].get_attribute("outerHTML") soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"),"html.parser")
soup = BeautifulSoup(content,"html.parser")
# 查找time标签 # 查找time标签
time_soup = soup.find('time') time_soup = soup.find('time')
timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp() timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
...@@ -70,6 +66,52 @@ def reptile(browser=None, search_word=""): ...@@ -70,6 +66,52 @@ def reptile(browser=None, search_word=""):
author = element_authors_list[index].text author = element_authors_list[index].text
# 标题取:作者+日期 # 标题取:作者+日期
title = f"{author}-{datetime.fromtimestamp(int(timestamp))}" title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
video_list = soup.find_all("video")
image_list = soup.find_all("img")
# lth = len(ignore_list)
if len(video_list) > 0:
# for key,element in enumerate(video_list):
# 删除第二个子元素
# 找到包含两个 <div> 元素的父级元素
parent_div = soup.find('div')
# 找到所有的 <div> 子元素
div_elements = parent_div.find_all('div', recursive=False)
# div_tags = soup.find_all("div", recursive=False)
# 确保列表中至少有两个 <div> 子元素
if len(div_elements) >= 2:
# 获取第二个 <div> 元素,并将其从父级元素中移除
div_to_remove = div_elements[1]
div_to_remove.extract()
# 删除
# div.decompose()
# 创建video标签占位
custom_video = soup.new_tag("video")
custom_video["src"] = ""
parent_div.append(custom_video)
else:
print("")
picture_url = []
if len(image_list) > 0:
for key, element in enumerate(image_list):
# 下载图片至本地,替换标签中的src
id = str(int(time.time()))
# 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
# 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
# 下载状态
status = download_image(element['src'], download_dir)
if status:
element['src'] = access_address
picture_url.append(access_address)
else:
print("")
content = soup.prettify()
# ---------------- 判断类型 start ---------- # ---------------- 判断类型 start ----------
# 类型 # 类型
content_type = "" content_type = ""
......
"""Ad-hoc manual test: fetch a Toutiao article with Firefox and print its HTML."""
import time

from selenium import webdriver


def main():
    """Open the article page, wait briefly for JS rendering, dump page source.

    Raises:
        selenium.common.exceptions.WebDriverException: if Firefox/geckodriver
            cannot be started or the navigation fails.
    """
    driver = webdriver.Firefox()
    try:
        driver.get("https://www.toutiao.com/a6969138023774667264/")
        # Give the page a moment to render JS-driven content before grabbing it.
        time.sleep(2)
        html = driver.page_source
        print(html)
    finally:
        # Quit even when get() raises, so no orphaned Firefox/geckodriver
        # process is left behind (the original leaked it on failure).
        driver.quit()


if __name__ == "__main__":
    main()
\ No newline at end of file
import os.path import os
import platform
import sys import sys
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
...@@ -6,7 +7,8 @@ from selenium.webdriver.chrome.service import Service ...@@ -6,7 +7,8 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
import chromedriver_autoinstaller
# from mozprofile import FirefoxProfile
''' '''
创建浏览器实例 创建浏览器实例
''' '''
...@@ -19,18 +21,32 @@ def create(option=None, using_user_data=True): ...@@ -19,18 +21,32 @@ def create(option=None, using_user_data=True):
:param option: :param option:
:return: :return:
""" """
# 安装或升级 chromedriver
chromedriver_autoinstaller.install()
# 获取现有Chrome浏览器用户数据目录
# chrome_user_data_dir = ""
# if platform.system() == 'Windows':
# chrome_user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
# 'User Data')
# elif platform.system() == 'Linux':
# chrome_user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
# elif platform.system() == 'Darwin':
# chrome_user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google','Chrome')
# else:
# raise Exception('Unsupported operating system')
chrome_options = webdriver.ChromeOptions() chrome_options = webdriver.ChromeOptions()
if option is not None: if option is not None:
for value in option: for value in option:
chrome_options.add_argument(value) chrome_options.add_argument(value)
# 启用浏览器的持久性会话,可以保存登录状态和Cookie # 启用浏览器的持久性会话,可以保存登录状态和Cookie
# 使用本地
user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data') user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
script = f'--user-data-dir={user_data_dir}'
# print(script)
# log.debug(script)
if using_user_data: if using_user_data:
chrome_options.add_argument(script) # 设置一个自定义的用户配置文件路径 # 添加用户数据目录参数
chrome_options.add_argument(f'--user-data-dir={user_data_dir}')
if sys.platform.startswith('linux'): if sys.platform.startswith('linux'):
# print("当前系统是 Linux") # print("当前系统是 Linux")
......
...@@ -84,8 +84,8 @@ def parse_twitter_time_string(time_str): ...@@ -84,8 +84,8 @@ def parse_twitter_time_string(time_str):
""" """
times = parser.parse(time_str, fuzzy=True) times = parser.parse(time_str, fuzzy=True)
# a = datetime.datetime.strptime(time,"%Y-%m-%d %H:%M:%S") # a = datetime.datetime.strptime(time,"%Y-%m-%d %H:%M:%S")
b = datetime.datetime.strftime(times,"%Y-%m-%d %H:%M:%S") b = datetime.datetime.strftime(times, "%Y-%m-%d %H:%M:%S")
c = time.mktime(time.strptime(b,"%Y-%m-%d %H:%M:%S")) c = time.mktime(time.strptime(b, "%Y-%m-%d %H:%M:%S"))
# 解析相对时间字符串 # 解析相对时间字符串
return c return c
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment