fix：selenium 驱动配置

1f24eff4 · liyang · 781ee034 · 1f24eff4 · 1f24eff4 · 1f24eff4
Commit 1f24eff4 authored Jul 20, 2023 by liyang
Showing 5 changed files with 123 additions and 62 deletions

pc_facebook.py pc_facebook.py +1 -1

pc_ptt.py pc_ptt.py +26 -8

pc_twitter.py pc_twitter.py +22 -13

pc_youtube.py pc_youtube.py +1 -1

createBrowserDriver.py utils/createBrowserDriver.py +73 -39

No files found.
--- a/pc_facebook.py
+++ b/pc_facebook.py
@@ -22,7 +22,7 @@ def reptile(browser=None, search_word=""):
    print(f"搜索词:{search_word}")
    url = "https://www.facebook.com/"
    option = ['--headless']
-    browser = browser or create(option)
+    browser = browser or create(option, True)
    # 打开网页
    browser.get(url)
    try:

--- a/pc_ptt.py
+++ b/pc_ptt.py
@@ -16,7 +16,8 @@ from utils.createBrowserDriver import create
 import opencc
 from utils.filse import save_json
 import os
-
+from config.settings import get_base_file_url
+from utils.download_image import download_image
 '''
 爬取台湾PTT论坛的热门帖子，包括帖子的标题、内容【文本、图片、视频】

@@ -27,7 +28,7 @@ import os
 def reptile(browser=None, search_word=""):
    url = "https://www.ptt.cc/bbs/hotboards.html"
    # 无头模式执行
-    browser = browser or create(['--headless'],False)
+    browser = browser or create(['--headless'], True)
    # 有头模式执行
    # browser = browser or create()
    # 打开网页
@@ -97,15 +98,31 @@ def reptile(browser=None, search_word=""):
                    # ---------------- 判断类型 start ----------
                    # 类型
                    content_type = ""
+                    # 查找所有img标签
+                    image_list = soup.find_all('img')
                    try:
-                        # 查找所有img标签
-                        img_tags = soup.find_all('img')
-                        if len(img_tags) > 0:
+                        if len(image_list) > 0:
                            content_type = "图文"
                        else:
                            content_type = "文字"
                    except:
                        content_type = "文字"
+                    picture_url = []
+                    if len(image_list) > 0:
+                        for key, element in enumerate(image_list):
+                            # 下载图片至本地，替换标签中的src
+                            id = str(int(time.time()))
+                            # 下载地址
+                            download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
+                            # 访问地址
+                            access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
+                            # 下载状态
+                            status = download_image(element['src'], download_dir)
+                            if status:
+                                element['src'] = access_address
+                                picture_url.append(access_address)
+                    else:
+                        print("")
                    # ---------------- 判断类型 end ----------
                    # log.debug('开始内容过滤')
                    # ------------------ content 过滤 start--------------
@@ -146,7 +163,8 @@ def reptile(browser=None, search_word=""):
                        "reptileTime": str(int(time.time())),
                        "type": content_type,
                        "author": element_author.text,
-                        "releaseTime": release_time
+                        "releaseTime": release_time,
+                        "picture_url": ",".join(picture_url)
                    }
                    # --------------- 组装数据 end---------------------

@@ -175,7 +193,7 @@ def reptile(browser=None, search_word=""):
    if len(data) > 0:
        # 保存json文件到本地
        # log.debug(os.path.abspath("../"))
-        state_save = save_json(os.path.join(file_dir,str(int(time.time())) + ".json"), data)
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
        if state_save:
            log.debug('save file success')
        else:
@@ -220,7 +238,7 @@ def main():
 # 全局变量
 data = []
 table_name = "pms_ptt"
-file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
 # 是否启用
 status_task = '0'
 # 调用main函数

--- a/pc_twitter.py
+++ b/pc_twitter.py
@@ -19,10 +19,16 @@ from config.settings import get_base_file_url


 def reptile(browser=None, search_word=""):
+    """
+
+    :param browser:
+    :param search_word:
+    """
+    print(f"搜索词:{search_word}")
    base_url = "https://twitter.com/"
    option = ['--headless']
    # ['--headless']
-    browser = browser or create(None, True)
+    browser = browser or create(option, True)
    # print(browser)
    # 打开网页
    browser.get(base_url)
@@ -48,26 +54,29 @@ def reptile(browser=None, search_word=""):
    browser.get(url)
    time.sleep(4)

-
    base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
    # 内容块
-    element_content_list = browser.find_elements('xpath',base_xpath)
+    element_content_list = browser.find_elements('xpath', base_xpath)
    # 作者
-    element_authors_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
+    element_authors_list = browser.find_elements('xpath',
+                                                 f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
    length = len(element_authors_list)
    for index in range(length):
        # print(index)
-        soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"),"html.parser")
+        soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser")
        # 查找time标签
-        time_soup = soup.find('time')
-        timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
-        link_soup = time_soup.parent
-        link_str = base_url+link_soup["href"]
+        try:
+            time_soup = soup.find('time')
+            timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
+            link_soup = time_soup.parent
+            link_str = base_url + link_soup["href"]
+        except:
+            link_str = ""
+            timestamp = time.time()
        author = element_authors_list[index].text
        # 标题取：作者+日期
        title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"

-
        video_list = soup.find_all("video")
        image_list = soup.find_all("img")
        # lth = len(ignore_list)
@@ -111,7 +120,6 @@ def reptile(browser=None, search_word=""):
            print("")
        content = soup.prettify()

-
        # ---------------- 判断类型 start ----------
        # 类型
        content_type = ""
@@ -133,7 +141,8 @@ def reptile(browser=None, search_word=""):
            "reptileTime": str(int(time.time())),
            "type": content_type,
            "author": author,
-            "releaseTime": str(int(timestamp))
+            "releaseTime": str(int(timestamp)),
+            "picture_url": ",".join(picture_url)
        }
        # --------------- 组装数据 end---------------------
        data.append(obj)
@@ -189,7 +198,7 @@ def main():
 # 全局变量
 data = []
 table_name = "pms_twitter"
-file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
 # 是否启用
 status_task = '0'
 # 调用main函数

--- a/pc_youtube.py
+++ b/pc_youtube.py
@@ -21,7 +21,7 @@ def reptile(browser=None, search_word=""):
    :return:
    """
    option = ['--headless']
-    browser = browser or create(['--headless'])
+    browser = browser or create(['--headless'],True)
    # print(browser)
    # 打开网页
    url = f'https://www.youtube.com/results?search_query={search_word}'

--- a/utils/createBrowserDriver.py
+++ b/utils/createBrowserDriver.py
@@ -8,65 +8,99 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 import chromedriver_autoinstaller
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
 # from mozprofile import FirefoxProfile
 '''
 创建浏览器实例
 '''


-def create(option=None, using_user_data=True):
+def create(option=None, using_user_data=True, web_browser="firefox"):
    """

+    :param web_browser:
    :param using_user_data:
    :param option:
    :return:
    """
+
    # 安装或升级 chromedriver
-    chromedriver_autoinstaller.install()
+    # chromedriver_autoinstaller.install()

-    # 获取现有Chrome浏览器用户数据目录
-    # chrome_user_data_dir = ""
-    # if platform.system() == 'Windows':
-    #     chrome_user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
-    #                                         'User Data')
-    # elif platform.system() == 'Linux':
-    #     chrome_user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
-    # elif platform.system() == 'Darwin':
-    #     chrome_user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google','Chrome')
-    # else:
-    #     raise Exception('Unsupported operating system')
+    def get_user_data_dir():
+        """
+
+        :return:
+        """
+        # 获取现有Chrome浏览器用户数据目录
+        user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
+        # if platform.system() == 'Windows':
+        #     if web_browser == "firefox":
+        #         user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Mozilla', 'Firefox',
+        #                                      'Profiles')
+        #     else:
+        #         user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
+        #                                      'User Data')
+        # elif platform.system() == 'Linux':
+        #     if web_browser == "firefox":
+        #         user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'Firefox', 'Profiles','huqg7mpy.default-release')
+        #     else:
+        #         user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
+        # elif platform.system() == 'Darwin':
+        #     if web_browser == "firefox":
+        #         user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Firefox',
+        #                                      'Profiles','huqg7mpy.default-release')
+        #     else:
+        #         user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google',
+        #                                      'Chrome')
+        # else:
+        #     raise Exception('Unsupported operating system')
+        return user_data_dir

-    chrome_options = webdriver.ChromeOptions()
+    options = ""
+    browser = ""
+    if web_browser == "firefox":
+        options = webdriver.FirefoxOptions()
+    else:
+        options = webdriver.ChromeOptions()
    if option is not None:
        for value in option:
-            chrome_options.add_argument(value)
+            options.add_argument(value)

-    # 启用浏览器的持久性会话，可以保存登录状态和Cookie
-    # 使用本地
-    user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
    if using_user_data:
-        # 添加用户数据目录参数
-        chrome_options.add_argument(f'--user-data-dir={user_data_dir}')
+        # 添加用户数据目录参数,启用浏览器的持久性会话，可以保存登录状态和Cookie
+        if web_browser == "firefox":
+            firefox_profile_path = get_user_data_dir()  # 将此处替换为你的Firefox用户数据目录路径
+            profile = FirefoxProfile(profile_directory=firefox_profile_path)
+            options.profile = profile
+        else:
+            options.add_argument(f'--user-data-dir={get_user_data_dir()}')

-    if sys.platform.startswith('linux'):
-        # print("当前系统是 Linux")
-        # linux下运行记得加上这些参数 ----------------------------
-        # chrome_options.add_argument('--headless')
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-gpu')
-        chrome_options.add_argument('--disable-dev-shm-usage')
-        # 加载chromedriver -------------------------------------------------
-        # windows 下的 chromedriver 默认加载路径是当前路径下的 chromedriver.exe
-        # linux 下的 chromedriver 默认加载路径是 /usr/local/bin/chromedriver
-        # 当然也可以通过 executable_path 自定义
-        browser = webdriver.Chrome(options=chrome_options)
-        # -----------------------------------------------------------------
+    # if sys.platform.startswith('linux'):
+    # print("当前系统是 Linux")
+    # linux下运行记得加上这些参数 ----------------------------
+    # chrome_options.add_argument('--headless')
+    options.add_argument("--window-size=1920x1080")  # 设置窗口大小，这是一个常见的完全无头模式的设置
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-gpu')
+    options.add_argument('--disable-dev-shm-usage')
+    # 加载chromedriver -------------------------------------------------
+    # windows 下的 chromedriver 默认加载路径是当前路径下的 chromedriver.exe
+    # linux 下的 chromedriver 默认加载路径是 /usr/local/bin/chromedriver
+    # 当然也可以通过 executable_path 自定义
+    if web_browser == "firefox":
+        browser = webdriver.Firefox(options=options)
    else:
-        # print("当前系统不是 Linux")
-        # linux下运行记得加上这些参数 ----------------------------
-        # chrome_options.add_argument('--headless')  # 启用无头模式
-        chrome_options.add_argument('--no-sandbox')  # 禁用沙盒模式
-        # 创建浏览器驱动对象
-        browser = webdriver.Chrome(options=chrome_options)
+        browser = webdriver.Chrome(options=options)
+    # -----------------------------------------------------------------
+    # else:
+    #     # print("当前系统不是 Linux")
+    #     # linux下运行记得加上这些参数 ----------------------------
+    #     # chrome_options.add_argument('--headless')  # 启用无头模式
+    #     options.add_argument('--no-sandbox')  # 禁用沙盒模式
+    #     options.add_argument('--disable-gpu')
+    #     options.add_argument('--disable-dev-shm-usage')
+    #     # 创建浏览器驱动对象
+    #     browser = webdriver.Chrome(options=options)

    return browser