Commit 45591aa3 authored by liyang's avatar liyang

fix: selenium 驱动配置

parent 1b3d2164
......@@ -81,6 +81,7 @@ def reptile(browser=None, search_word=""):
image_list = soup.find_all("img")
# lth = len(ignore_list)
if len(video_list) > 0:
# for key,element in enumerate(video_list):
# 删除第二个子元素
# 找到包含两个 <div> 元素的父级元素
parent_div = soup.find('div')
......
......@@ -27,7 +27,7 @@ import os
def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html"
# 无头模式执行
browser = browser or create(['--headless'])
browser = browser or create(['--headless'],False)
# 有头模式执行
# browser = browser or create()
# 打开网页
......
......@@ -9,6 +9,7 @@ from utils.index import convert_to_traditional, yt_dlp_download, convert_string_
# from pytube import YouTube
import os
from datetime import datetime
from utils.download_image import download_image
from config.settings import get_base_file_url
# 工具函数-下载图片
......@@ -21,7 +22,7 @@ def reptile(browser=None, search_word=""):
base_url = "https://twitter.com/"
option = ['--headless']
# ['--headless']
browser = browser or create(None, False)
browser = browser or create(None, True)
# print(browser)
# 打开网页
browser.get(base_url)
......@@ -36,12 +37,13 @@ def reptile(browser=None, search_word=""):
time.sleep(3)
password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
password_input.send_keys("liyang19970814")
# 获取登录按钮
# # 获取登录按钮
button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
button_login.click()
except:
print("------")
time.sleep(2)
# print("1111")
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
browser.get(url)
time.sleep(4)
......@@ -52,16 +54,10 @@ def reptile(browser=None, search_word=""):
element_content_list = browser.find_elements('xpath',base_xpath)
# 作者
element_authors_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
# time.sleep(2)
# 发布时间
# element_release_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time")
# time_a_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time/..")
# print(element_content_list)
length = len(element_authors_list)
for index in range(length):
# print(index)
content = element_content_list[index].get_attribute("outerHTML")
soup = BeautifulSoup(content,"html.parser")
soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"),"html.parser")
# 查找time标签
time_soup = soup.find('time')
timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
......@@ -70,6 +66,52 @@ def reptile(browser=None, search_word=""):
author = element_authors_list[index].text
# 标题取:作者+日期
title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
video_list = soup.find_all("video")
image_list = soup.find_all("img")
# lth = len(ignore_list)
if len(video_list) > 0:
# for key,element in enumerate(video_list):
# 删除第二个子元素
# 找到包含两个 <div> 元素的父级元素
parent_div = soup.find('div')
# 找到所有的 <div> 子元素
div_elements = parent_div.find_all('div', recursive=False)
# div_tags = soup.find_all("div", recursive=False)
# 确保列表中至少有两个 <div> 子元素
if len(div_elements) >= 2:
# 获取第二个 <div> 元素,并将其从父级元素中移除
div_to_remove = div_elements[1]
div_to_remove.extract()
# 删除
# div.decompose()
# 创建video标签占位
custom_video = soup.new_tag("video")
custom_video["src"] = ""
parent_div.append(custom_video)
else:
print("")
picture_url = []
if len(image_list) > 0:
for key, element in enumerate(image_list):
# 下载图片至本地,替换标签中的src
id = str(int(time.time()))
# 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
# 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
# 下载状态
status = download_image(element['src'], download_dir)
if status:
element['src'] = access_address
picture_url.append(access_address)
else:
print("")
content = soup.prettify()
# ---------------- 判断类型 start ----------
# 类型
content_type = ""
......
# from utils.index import yt_dlp_download
#
# status = yt_dlp_download("https://www.facebook.com/e5627ead-8b9a-48fd-820f-ee242cc08bbb", "facebook")
# print(status)
"""Ad-hoc check: fetch a Toutiao article with Selenium and dump the rendered HTML."""
import time
from selenium.webdriver import Firefox
from selenium import webdriver

# Launch a local Firefox instance (requires geckodriver on PATH).
driver = webdriver.Firefox()
try:
    driver.get("https://www.toutiao.com/a6969138023774667264/")
    # Give the page's JavaScript a moment to render before reading the DOM.
    time.sleep(2)
    html = driver.page_source
    print(html)
finally:
    # Always release the browser process, even if the fetch/render fails —
    # the original called quit() unconditionally after get(), so an exception
    # leaked a headful Firefox instance.
    driver.quit()
\ No newline at end of file
import os.path
import os
import platform
import sys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
......@@ -6,7 +7,8 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import chromedriver_autoinstaller
# from mozprofile import FirefoxProfile
'''
创建浏览器实例
'''
......@@ -19,18 +21,32 @@ def create(option=None, using_user_data=True):
:param option:
:return:
"""
# 安装或升级 chromedriver
chromedriver_autoinstaller.install()
# 获取现有Chrome浏览器用户数据目录
# chrome_user_data_dir = ""
# if platform.system() == 'Windows':
# chrome_user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
# 'User Data')
# elif platform.system() == 'Linux':
# chrome_user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
# elif platform.system() == 'Darwin':
# chrome_user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google','Chrome')
# else:
# raise Exception('Unsupported operating system')
chrome_options = webdriver.ChromeOptions()
if option is not None:
for value in option:
chrome_options.add_argument(value)
# 启用浏览器的持久性会话,可以保存登录状态和Cookie
# 使用本地
user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
script = f'--user-data-dir={user_data_dir}'
# print(script)
# log.debug(script)
if using_user_data:
chrome_options.add_argument(script) # 设置一个自定义的用户配置文件路径
# 添加用户数据目录参数
chrome_options.add_argument(f'--user-data-dir={user_data_dir}')
if sys.platform.startswith('linux'):
# print("当前系统是 Linux")
......
......@@ -84,8 +84,8 @@ def parse_twitter_time_string(time_str):
"""
times = parser.parse(time_str, fuzzy=True)
# a = datetime.datetime.strptime(time,"%Y-%m-%d %H:%M:%S")
b = datetime.datetime.strftime(times,"%Y-%m-%d %H:%M:%S")
c = time.mktime(time.strptime(b,"%Y-%m-%d %H:%M:%S"))
b = datetime.datetime.strftime(times, "%Y-%m-%d %H:%M:%S")
c = time.mktime(time.strptime(b, "%Y-%m-%d %H:%M:%S"))
# 解析相对时间字符串
return c
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment