Commit 1f24eff4 authored by liyang

fix: selenium 驱动配置 (selenium driver configuration)

parent 781ee034
......@@ -22,7 +22,7 @@ def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}")
url = "https://www.facebook.com/"
option = ['--headless']
browser = browser or create(option)
browser = browser or create(option, True)
# 打开网页
browser.get(url)
try:
......
......@@ -16,7 +16,8 @@ from utils.createBrowserDriver import create
import opencc
from utils.filse import save_json
import os
from config.settings import get_base_file_url
from utils.download_image import download_image
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
......@@ -27,7 +28,7 @@ import os
def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html"
# 无头模式执行
browser = browser or create(['--headless'],False)
browser = browser or create(['--headless'], True)
# 有头模式执行
# browser = browser or create()
# 打开网页
......@@ -97,15 +98,31 @@ def reptile(browser=None, search_word=""):
# ---------------- 判断类型 start ----------
# 类型
content_type = ""
# 查找所有img标签
image_list = soup.find_all('img')
try:
# 查找所有img标签
img_tags = soup.find_all('img')
if len(img_tags) > 0:
if len(image_list) > 0:
content_type = "图文"
else:
content_type = "文字"
except:
content_type = "文字"
picture_url = []
if len(image_list) > 0:
for key, element in enumerate(image_list):
# 下载图片至本地,替换标签中的src
id = str(int(time.time()))
# 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
# 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
# 下载状态
status = download_image(element['src'], download_dir)
if status:
element['src'] = access_address
picture_url.append(access_address)
else:
print("")
# ---------------- 判断类型 end ----------
# log.debug('开始内容过滤')
# ------------------ content 过滤 start--------------
......@@ -146,7 +163,8 @@ def reptile(browser=None, search_word=""):
"reptileTime": str(int(time.time())),
"type": content_type,
"author": element_author.text,
"releaseTime": release_time
"releaseTime": release_time,
"picture_url": ",".join(picture_url)
}
# --------------- 组装数据 end---------------------
......@@ -175,7 +193,7 @@ def reptile(browser=None, search_word=""):
if len(data) > 0:
# 保存json文件到本地
# log.debug(os.path.abspath("../"))
state_save = save_json(os.path.join(file_dir,str(int(time.time())) + ".json"), data)
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
if state_save:
log.debug('save file success')
else:
......@@ -220,7 +238,7 @@ def main():
# 全局变量
data = []
table_name = "pms_ptt"
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 是否启用
status_task = '0'
# 调用main函数
......
......@@ -19,10 +19,16 @@ from config.settings import get_base_file_url
def reptile(browser=None, search_word=""):
"""
:param browser:
:param search_word:
"""
print(f"搜索词:{search_word}")
base_url = "https://twitter.com/"
option = ['--headless']
# ['--headless']
browser = browser or create(None, True)
browser = browser or create(option, True)
# print(browser)
# 打开网页
browser.get(base_url)
......@@ -48,26 +54,29 @@ def reptile(browser=None, search_word=""):
browser.get(url)
time.sleep(4)
base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
# 内容块
element_content_list = browser.find_elements('xpath',base_xpath)
element_content_list = browser.find_elements('xpath', base_xpath)
# 作者
element_authors_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
element_authors_list = browser.find_elements('xpath',
f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
length = len(element_authors_list)
for index in range(length):
# print(index)
soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"),"html.parser")
soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser")
# 查找time标签
time_soup = soup.find('time')
timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
link_soup = time_soup.parent
link_str = base_url+link_soup["href"]
try:
time_soup = soup.find('time')
timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
link_soup = time_soup.parent
link_str = base_url + link_soup["href"]
except:
link_str = ""
timestamp = time.time()
author = element_authors_list[index].text
# 标题取:作者+日期
title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
video_list = soup.find_all("video")
image_list = soup.find_all("img")
# lth = len(ignore_list)
......@@ -111,7 +120,6 @@ def reptile(browser=None, search_word=""):
print("")
content = soup.prettify()
# ---------------- 判断类型 start ----------
# 类型
content_type = ""
......@@ -133,7 +141,8 @@ def reptile(browser=None, search_word=""):
"reptileTime": str(int(time.time())),
"type": content_type,
"author": author,
"releaseTime": str(int(timestamp))
"releaseTime": str(int(timestamp)),
"picture_url": ",".join(picture_url)
}
# --------------- 组装数据 end---------------------
data.append(obj)
......@@ -189,7 +198,7 @@ def main():
# 全局变量
data = []
table_name = "pms_twitter"
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 是否启用
status_task = '0'
# 调用main函数
......
......@@ -21,7 +21,7 @@ def reptile(browser=None, search_word=""):
:return:
"""
option = ['--headless']
browser = browser or create(['--headless'])
browser = browser or create(['--headless'],True)
# print(browser)
# 打开网页
url = f'https://www.youtube.com/results?search_query={search_word}'
......
......@@ -8,65 +8,99 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import chromedriver_autoinstaller
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
# from mozprofile import FirefoxProfile
'''
创建浏览器实例
'''
def create(option=None, using_user_data=True, web_browser="firefox"):
    """
    Create and return a Selenium browser driver instance.

    :param option: list of command-line argument strings for the browser
        (e.g. ['--headless']); None means no extra arguments.
    :param using_user_data: when True, attach a persistent user-data
        profile so login state and cookies survive between runs.
    :param web_browser: "firefox" (default) for a Firefox driver; any
        other value falls back to Chrome.
    :return: a ``webdriver.Firefox`` or ``webdriver.Chrome`` instance.
    """
    # chromedriver auto-install is disabled now that Firefox is the default driver
    # chromedriver_autoinstaller.install()

    def get_user_data_dir():
        """Return the project-local profile directory used for the persistent session."""
        # NOTE(review): a fixed project-local directory is used here instead of
        # the platform-specific browser profile paths — confirm this is intended.
        user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
        return user_data_dir

    # Build the options object matching the requested browser.
    if web_browser == "firefox":
        options = webdriver.FirefoxOptions()
    else:
        options = webdriver.ChromeOptions()

    # Forward caller-supplied command-line switches.
    if option is not None:
        for value in option:
            options.add_argument(value)

    # Enable a persistent browser session so login state and cookies are kept.
    if using_user_data:
        if web_browser == "firefox":
            # Replace with your own Firefox profile path if needed.
            firefox_profile_path = get_user_data_dir()
            profile = FirefoxProfile(profile_directory=firefox_profile_path)
            options.profile = profile
        else:
            options.add_argument(f'--user-data-dir={get_user_data_dir()}')

    # Common flags for stable (headless) operation, especially on Linux.
    options.add_argument("--window-size=1920x1080")  # explicit size for headless runs
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')

    # Instantiate the driver for the chosen browser.
    # On Linux the driver binary is typically looked up on PATH
    # (e.g. /usr/local/bin); it can also be customized via Service/executable_path.
    if web_browser == "firefox":
        browser = webdriver.Firefox(options=options)
    else:
        browser = webdriver.Chrome(options=options)
    return browser
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment