Commit 1f24eff4 authored by liyang

fix: selenium driver configuration

parent 781ee034
@@ -22,7 +22,7 @@ def reptile(browser=None, search_word=""):
     print(f"搜索词:{search_word}")
     url = "https://www.facebook.com/"
     option = ['--headless']
-    browser = browser or create(option)
+    browser = browser or create(option, True)
     # open the page
     browser.get(url)
     try:
...
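The second positional argument added to this call is `using_user_data` from `utils/createBrowserDriver.py` (last diff in this commit), so the Facebook crawler now reuses a persistent browser profile. A minimal sketch of the resulting call, under this commit's project layout:

    from utils.createBrowserDriver import create

    # Headless browser that reuses the persistent user_data profile,
    # so a previously saved Facebook login session is still available.
    browser = create(['--headless'], True)
    browser.get("https://www.facebook.com/")
    print(browser.title)
    browser.quit()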
@@ -16,7 +16,8 @@ from utils.createBrowserDriver import create
 import opencc
 from utils.filse import save_json
 import os
+from config.settings import get_base_file_url
+from utils.download_image import download_image
 '''
 Crawl hot posts from the Taiwan PTT forum, including each post's title and content (text, images, video)
@@ -27,7 +28,7 @@ import os
 def reptile(browser=None, search_word=""):
     url = "https://www.ptt.cc/bbs/hotboards.html"
     # run in headless mode
-    browser = browser or create(['--headless'],False)
+    browser = browser or create(['--headless'], True)
     # run with a visible window
     # browser = browser or create()
     # open the page
@@ -97,15 +98,31 @@ def reptile(browser=None, search_word=""):
         # ---------------- determine type: start ----------
         # type
         content_type = ""
-        try:
-            # find all img tags
-            img_tags = soup.find_all('img')
-            if len(img_tags) > 0:
+        # find all img tags
+        image_list = soup.find_all('img')
+        try:
+            if len(image_list) > 0:
                 content_type = "图文"
             else:
                 content_type = "文字"
         except:
             content_type = "文字"
+        picture_url = []
+        if len(image_list) > 0:
+            for key, element in enumerate(image_list):
+                # download the image locally and replace the tag's src
+                id = str(int(time.time()))
+                # local download path
+                download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
+                # public access URL
+                access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
+                # download status
+                status = download_image(element['src'], download_dir)
+                if status:
+                    element['src'] = access_address
+                    picture_url.append(access_address)
+                else:
+                    print("")
         # ---------------- determine type: end ----------
         # log.debug('开始内容过滤')
         # ------------------ content filtering: start --------------
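`download_image` is imported above but its body is not part of this diff. A plausible minimal sketch using `requests`, returning a boolean so the caller knows whether to rewrite the tag's `src` (note that `id = str(int(time.time()))` has one-second resolution, so images fetched within the same second share a filename):

    import requests

    def download_image(url, save_path):
        # Hypothetical stand-in for utils/download_image.py (not shown in this commit):
        # fetch the image and report success or failure to the caller.
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with open(save_path, 'wb') as file:
                file.write(response.content)
            return True
        except Exception:
            return False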
@@ -146,7 +163,8 @@ def reptile(browser=None, search_word=""):
             "reptileTime": str(int(time.time())),
             "type": content_type,
             "author": element_author.text,
-            "releaseTime": release_time
+            "releaseTime": release_time,
+            "picture_url": ",".join(picture_url)
         }
         # --------------- assemble data: end ---------------------
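The new `picture_url` field flattens the list of rewritten image URLs into a single comma-separated string, so a consumer can recover the list with `split`. A small illustration with hypothetical values:

    picture_url = ["https://files.example.com/ptt/1690000000.jpg",
                   "https://files.example.com/ptt/1690000001.jpg"]  # hypothetical URLs
    field = ",".join(picture_url)    # stored form, one string
    restored = field.split(",")      # list form on the consumer side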
@@ -175,7 +193,7 @@ def reptile(browser=None, search_word=""):
     if len(data) > 0:
         # save the json file locally
         # log.debug(os.path.abspath("../"))
-        state_save = save_json(os.path.join(file_dir,str(int(time.time())) + ".json"), data)
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
         else:
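Only the truthy return value of `save_json` (from `utils/filse.py`) is used here; its implementation is not in this diff. A minimal sketch of what such a helper might look like:

    import json
    import os

    def save_json(file_path, data):
        # Hypothetical sketch: write UTF-8 JSON and report success to the caller.
        try:
            os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
            with open(file_path, 'w', encoding='utf-8') as file:
                json.dump(data, file, ensure_ascii=False, indent=2)
            return True
        except OSError:
            return False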
@@ -220,7 +238,7 @@ def main():
 # global variables
 data = []
 table_name = "pms_ptt"
-file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
 # whether the task is enabled
 status_task = '0'
 # call the main function
...
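`table_name.split("_")[1]` derives the per-site folder under `reptile_data` from the table name; the f-string wrapper is redundant since `os.path.join` already returns a string, but it is harmless. For example:

    table_name = "pms_ptt"
    print(table_name.split("_")[1])  # -> "ptt", so files land in reptile_data/ptt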
@@ -19,10 +19,16 @@ from config.settings import get_base_file_url
 def reptile(browser=None, search_word=""):
+    """
+    :param browser:
+    :param search_word:
+    """
+    print(f"搜索词:{search_word}")
     base_url = "https://twitter.com/"
     option = ['--headless']
     # ['--headless']
-    browser = browser or create(None, True)
+    browser = browser or create(option, True)
     # print(browser)
     # open the page
     browser.get(base_url)
...@@ -48,26 +54,29 @@ def reptile(browser=None, search_word=""): ...@@ -48,26 +54,29 @@ def reptile(browser=None, search_word=""):
browser.get(url) browser.get(url)
time.sleep(4) time.sleep(4)
base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]" base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
# 内容块 # 内容块
element_content_list = browser.find_elements('xpath',base_xpath) element_content_list = browser.find_elements('xpath', base_xpath)
# 作者 # 作者
element_authors_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']") element_authors_list = browser.find_elements('xpath',
f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
length = len(element_authors_list) length = len(element_authors_list)
for index in range(length): for index in range(length):
# print(index) # print(index)
soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"),"html.parser") soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser")
# 查找time标签 # 查找time标签
try:
time_soup = soup.find('time') time_soup = soup.find('time')
timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp() timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
link_soup = time_soup.parent link_soup = time_soup.parent
link_str = base_url+link_soup["href"] link_str = base_url + link_soup["href"]
except:
link_str = ""
timestamp = time.time()
author = element_authors_list[index].text author = element_authors_list[index].text
# 标题取:作者+日期 # 标题取:作者+日期
title = f"{author}-{datetime.fromtimestamp(int(timestamp))}" title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
video_list = soup.find_all("video") video_list = soup.find_all("video")
image_list = soup.find_all("img") image_list = soup.find_all("img")
# lth = len(ignore_list) # lth = len(ignore_list)
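The `replace("Z", "+00:00")` shim in the new `try` block is needed because `datetime.fromisoformat()` only accepts a trailing "Z" UTC designator from Python 3.11 onward; on older interpreters it raises `ValueError`. For example:

    from datetime import datetime

    iso = "2023-07-21T08:30:00.000Z"  # shape of a tweet's <time datetime="..."> value
    ts = datetime.fromisoformat(iso.replace("Z", "+00:00")).timestamp()
    print(int(ts))                    # seconds since the epoch

The fallback to `time.time()` keeps the crawl going when a tweet has no `time` tag, at the cost of recording the scrape time instead of the publication time.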
@@ -111,7 +120,6 @@ def reptile(browser=None, search_word=""):
             print("")
         content = soup.prettify()
         # ---------------- determine type: start ----------
-        # type
         content_type = ""
@@ -133,7 +141,8 @@ def reptile(browser=None, search_word=""):
             "reptileTime": str(int(time.time())),
             "type": content_type,
             "author": author,
-            "releaseTime": str(int(timestamp))
+            "releaseTime": str(int(timestamp)),
+            "picture_url": ",".join(picture_url)
         }
         # --------------- assemble data: end ---------------------
         data.append(obj)
@@ -189,7 +198,7 @@ def main():
 # global variables
 data = []
 table_name = "pms_twitter"
-file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
 # whether the task is enabled
 status_task = '0'
 # call the main function
...
@@ -21,7 +21,7 @@ def reptile(browser=None, search_word=""):
     :return:
     """
     option = ['--headless']
-    browser = browser or create(['--headless'])
+    browser = browser or create(['--headless'], True)
     # print(browser)
     # open the page
     url = f'https://www.youtube.com/results?search_query={search_word}'
...
@@ -8,65 +8,99 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 import chromedriver_autoinstaller
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
 # from mozprofile import FirefoxProfile
 '''
 Create a browser instance
 '''
-def create(option=None, using_user_data=True):
+def create(option=None, using_user_data=True, web_browser="firefox"):
     """
+    :param web_browser:
     :param using_user_data:
     :param option:
     :return:
     """
     # install or upgrade chromedriver
-    chromedriver_autoinstaller.install()
+    # chromedriver_autoinstaller.install()
-    # get the existing Chrome user-data directory
-    # chrome_user_data_dir = ""
-    # if platform.system() == 'Windows':
-    #     chrome_user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
-    #                                         'User Data')
-    # elif platform.system() == 'Linux':
-    #     chrome_user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
-    # elif platform.system() == 'Darwin':
-    #     chrome_user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google','Chrome')
-    # else:
-    #     raise Exception('Unsupported operating system')
+    def get_user_data_dir():
+        """
+        :return:
+        """
+        # get the existing browser user-data directory
+        user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
+        # if platform.system() == 'Windows':
+        #     if web_browser == "firefox":
+        #         user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Mozilla', 'Firefox',
+        #                                      'Profiles')
+        #     else:
+        #         user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
+        #                                      'User Data')
+        # elif platform.system() == 'Linux':
+        #     if web_browser == "firefox":
+        #         user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'Firefox', 'Profiles', 'huqg7mpy.default-release')
+        #     else:
+        #         user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
+        # elif platform.system() == 'Darwin':
+        #     if web_browser == "firefox":
+        #         user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Firefox',
+        #                                      'Profiles', 'huqg7mpy.default-release')
+        #     else:
+        #         user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google',
+        #                                      'Chrome')
+        # else:
+        #     raise Exception('Unsupported operating system')
+        return user_data_dir
-    chrome_options = webdriver.ChromeOptions()
+    options = ""
+    browser = ""
+    if web_browser == "firefox":
+        options = webdriver.FirefoxOptions()
+    else:
+        options = webdriver.ChromeOptions()
     if option is not None:
         for value in option:
-            chrome_options.add_argument(value)
+            options.add_argument(value)
-    # enable a persistent browser session to keep login state and cookies
-    # use a local profile
-    user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
     if using_user_data:
-        # add the user-data-dir argument
-        chrome_options.add_argument(f'--user-data-dir={user_data_dir}')
+        # pass the user-data directory to enable a persistent session that keeps login state and cookies
+        if web_browser == "firefox":
+            firefox_profile_path = get_user_data_dir()  # replace this with your Firefox user-data directory path
+            profile = FirefoxProfile(profile_directory=firefox_profile_path)
+            options.profile = profile
+        else:
+            options.add_argument(f'--user-data-dir={get_user_data_dir()}')
-    if sys.platform.startswith('linux'):
-        # print("the current system is Linux")
-        # remember to add these flags when running on Linux ----------------------------
-        # chrome_options.add_argument('--headless')
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-gpu')
-        chrome_options.add_argument('--disable-dev-shm-usage')
+    # if sys.platform.startswith('linux'):
+    # print("the current system is Linux")
+    # remember to add these flags when running on Linux ----------------------------
+    # chrome_options.add_argument('--headless')
+    options.add_argument("--window-size=1920x1080")  # set the window size, a common setting for fully headless runs
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-gpu')
+    options.add_argument('--disable-dev-shm-usage')
     # load chromedriver -------------------------------------------------
     # on Windows, chromedriver is loaded from chromedriver.exe in the current directory by default
     # on Linux, the default chromedriver path is /usr/local/bin/chromedriver
     # a custom path can also be set via executable_path
-        browser = webdriver.Chrome(options=chrome_options)
-        # -----------------------------------------------------------------
-    else:
-        # print("the current system is not Linux")
-        # remember to add these flags when running on Linux ----------------------------
-        # chrome_options.add_argument('--headless')  # enable headless mode
-        chrome_options.add_argument('--no-sandbox')  # disable the sandbox
-        # create the browser driver object
-        browser = webdriver.Chrome(options=chrome_options)
+    if web_browser == "firefox":
+        browser = webdriver.Firefox(options=options)
+    else:
+        browser = webdriver.Chrome(options=options)
+    # -----------------------------------------------------------------
+    # else:
+    #     # print("the current system is not Linux")
+    #     # remember to add these flags when running on Linux ----------------------------
+    #     # chrome_options.add_argument('--headless')  # enable headless mode
+    #     options.add_argument('--no-sandbox')  # disable the sandbox
+    #     options.add_argument('--disable-gpu')
+    #     options.add_argument('--disable-dev-shm-usage')
+    #     # create the browser driver object
+    #     browser = webdriver.Chrome(options=options)
     return browser
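With this change the helper defaults to Firefox; geckodriver must be available (on PATH, or fetched automatically by Selenium Manager in Selenium 4.6+), since the chromedriver auto-install call is now commented out. Chrome remains reachable through the new `web_browser` parameter. Example usage under this commit's signature:

    from utils.createBrowserDriver import create

    firefox = create(['--headless'])                 # default: headless Firefox with the persistent profile
    chrome = create(['--headless'], True, "chrome")  # opt back into Chrome
    firefox.get("https://www.ptt.cc/bbs/hotboards.html")
    print(firefox.title)
    firefox.quit()
    chrome.quit()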