feat:脚本适配ubuntu

1821fe14 · liyang · d0bd152d · 1821fe14 · 1821fe14 · 1821fe14
Commit 1821fe14 authored Jul 27, 2023 by liyang
8 changed files
--- a/browser/web-driver/firefox/linux/geckodriver
+++ b/browser/web-driver/firefox/linux/geckodriver
--- a/pc_dcard.py
+++ b/pc_dcard.py
@@ -37,9 +37,8 @@ def reptile(browser=None, search_word=""):
    base_url = "https://www.dcard.tw"
    browser = browser or create(no_headless=True, using_user_data=True)
    # 打开网页
-    # browser.get(base_url)
-    # time.sleep(3)
    browser.get(f"{base_url}/search?query={search_word}")
+    time.sleep(6)
    base_xpath = "//div[@role='main']//div[@data-key]//article"
    # 内容块
    element_content_list = browser.find_elements('xpath', base_xpath)

--- a/pc_facebook.py
+++ b/pc_facebook.py
@@ -30,10 +30,12 @@ from selenium.webdriver.support import expected_conditions as EC
 def reptile(browser=None, search_word=""):
    print(f"搜索词:{search_word}")
    url = "https://www.facebook.com/"
-    browser = browser or create(no_headless=True,using_user_data=True)
+    browser = browser or create(no_headless=True, using_user_data=False)
    # 打开网页
    browser.get(url)
+    # time.sleep(3)
    try:
+        # time.sleep(3)
        # 检测是否要登录
        login_input = browser.find_element('xpath', "//input[@name='email']")
        password_input = browser.find_element('xpath', "//input[@name='pass']")

--- a/pc_instagram.py
+++ b/pc_instagram.py
@@ -35,7 +35,7 @@ def reptile(browser=None, search_word=""):
    print(f"搜索词:{search_word}")
    base_url = "https://www.instagram.com/"

-    browser = browser or create(no_headless=True,using_user_data=True)
+    browser = browser or create(no_headless=True, using_user_data=True)
    # print(browser)
    # 打开网页
    browser.get(base_url)
@@ -103,7 +103,7 @@ def reptile(browser=None, search_word=""):
                        title = ""
                img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
                del img_soup["srcset"]
-                img_soup["style"]="width:100%"
+                img_soup["style"] = "width:100%"
                src = item.get_attribute("src")
            else:
                # 有视频，图片链接从列表中提取

--- a/pc_youtube.py
+++ b/pc_youtube.py
 import json
+import platform
 import time
 from bs4 import BeautifulSoup
 from utils.Logger import log
@@ -30,28 +31,47 @@ def reptile(browser=None, search_word=""):
    browser = browser or create(no_headless=True, using_user_data=False)
    # print(browser)
    # 打开网页
+    print(f"搜索词:{search_word}")
    url = f'https://www.youtube.com/results?search_query={search_word}'
    browser.get(url)
+    # print(browser.page_source)
+    if platform.system() == "Linux":
+        time.sleep(3)
+    else:
        wait = WebDriverWait(browser, 10)
-    wait.until(EC.presence_of_element_located((By.XPATH,"//div[@id='contents']")))
+        wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='contents']")))
    log.debug("youtube login complete")
-    classify_video_list = browser.find_elements('xpath',
-                                                "//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a")
-    element_author_list = browser.find_elements('xpath',
-                                                "//div[@id='contents']//ytd-video-renderer//ytd-channel-name//yt-formatted-string/a")
-    element_time_list = browser.find_elements('xpath',
-                                              "//div[@id='contents']//ytd-video-renderer//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
-    length = len(classify_video_list)
+    video_list = browser.find_elements('xpath', "//div[@id='contents']//ytd-video-renderer")
+    # print(video_list[0].get_attribute("outerHTML"))
+    length = len(video_list)
    for index in range(length):
-        title = classify_video_list[index].get_attribute('title')
-        link = classify_video_list[index].get_attribute('href')
+        # 查找标题
+        author_element = video_list[index].find_element("xpath","./div[1]/div/div[2]//ytd-channel-name//yt-formatted-string/a")
+        # print(author_element.get_attribute("outerHTML"))
+
+        title_element = video_list[index].find_element("xpath",".//div[@id='title-wrapper']//a")
+        # print(title_element.get_attribute("outerHTML"))
+
+        time_element = video_list[index].find_element("xpath",".//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
+        # print(time_element.get_attribute("outerHTML"))
+
+        title = title_element.get_attribute('title')
+        link = title_element.get_attribute('href')
        id = link.split("?")[1].split("&")[0].replace("v=", "")
        url = f'https://www.youtube.com/watch?v={id}'
-        if index < 6 and YouTube(url).length // 60 < 60:
+
+        # 时长按照秒计算
+        video_duration = int(YouTube(url).length) // 60
+
+        # 暂时先取6条数据
+        if index < 6 and video_duration < 60:
+            # print(str(id))
+            # print("视频连接：" + str(link))
+            # print("视频时长：" + str(video_duration))
            base_urr = get_base_file_url()
            releaseTime = ""
            try:
-                releaseTime = str(int(convert_string_to_time(element_time_list[index].text)))
+                releaseTime = str(int(convert_string_to_time(time_element.text)))
            except:
                releaseTime = str(int(time.time()))
            video_url = []
@@ -62,7 +82,7 @@ def reptile(browser=None, search_word=""):
            # 下载视频
            state_download = yt_dlp_download(url, 'youtube')
            video_url.append(download_dir)
-
+            # print(str(state_download))
            if state_download:
                # 组装数据
                obj = {
@@ -72,7 +92,7 @@ def reptile(browser=None, search_word=""):
                    "link": link,
                    "reptileTime": str(int(time.time())),
                    "type": '视频',
-                    "author": element_author_list[index].text,
+                    "author": author_element.text,
                    "releaseTime": releaseTime
                }
                data.append(obj)

--- a/test.py
+++ b/test.py
-import os
-import pytesseract
-from PIL import Image
-
-# 指定 Tesseract OCR 的执行路径（可选，如果已经配置环境变量，则无需此步骤）
-cmd_path = "/usr/local/Cellar/tesseract/5.3.2/share/tessdata"
-img_path = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'reptile-data', "instagram","Cr8vg2MyNFz.jpg")
-pytesseract.pytesseract.tesseract_cmd = cmd_path
-
-# 打开图片
-image = Image.open(img_path)
-
-# 进行图片文字识别
-text = pytesseract.image_to_string(image, lang='chi_sim')
-
-# 输出识别的文字
-print(text)
\ No newline at end of file
+# Build Chrome options for a headless run, with the sandbox and /dev/shm usage disabled.
+from selenium import webdriver
+options = webdriver.ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+
+# Start Chrome and load a YouTube search-results URL. NOTE(review): wd.quit() is never called in this script.
+wd = webdriver.Chrome(options=options)
+wd.get("https://www.youtube.com/results?search_query=俄乌战争")
+
+print(wd.page_source)  # dump the page source of the loaded URL to stdout
\ No newline at end of file
--- a/utils/createBrowserDriver.py
+++ b/utils/createBrowserDriver.py
 import os
 import platform
 import sys
-
+from utils.Logger import log
 from selenium import webdriver
 # ---------------   selenium 依赖 start ----------------
 from selenium.webdriver.chrome.service import Service as ChromeService
@@ -29,7 +29,7 @@ from utils.index import get_screen_resolution
 '''


-def create(option=None, no_headless=False, using_user_data=True, web_browser="firefox"):
+def create(option=None, no_headless=False, using_user_data=True, web_browser="chrome"):
    """
    生成selenium实例

@@ -86,20 +86,22 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
    # chrome_options.add_argument('--headless')
    # options.add_argument("--window-size=1920x1080")  # 设置窗口大小，这是一个常见的完全无头模式的设置
    # options.add_argument("--start-maximized")  # 最大化窗口
-    if no_headless == True:
+    options.add_argument("--no-sandbox")
+    if no_headless:
        if platform.system() == "Linux" or platform.system() == "Darwin":
            # 开启无头模式
-            options.add_argument("-headless")
+            options.add_argument("--headless")
        elif platform.system() == "Windows" and web_browser == "firefox":
            # windows系统、火狐浏览器不开启无头模式
-            print("")
+            # print("")
+            error = ""

-    if option != None:
+    if no_headless:
        # 无头模式下禁用gpu加速
        options.add_argument('--disable-gpu')

    # 无头模式-linux 系统
-    if option != None and platform.system() == "Linux":
+    if no_headless and platform.system() == "Linux":
        '''
            --disable-dev-shm-usage 是 Chrome 浏览器在无头模式下运行时的一个常用启动参数。在 Linux 系统下特别常见，通过这个参数，Chrome 浏览器会禁用对 /dev/shm 的使用。
        '''
@@ -119,17 +121,24 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
                                             "geckodriver.exe"))
            browser = webdriver.Firefox(options=options, service=service)
        elif platform.system() == "Linux":
-            service = FirefoxService(executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser',"web-driver","firefox","linux",
-                                           "geckodriver"))
+            path = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser', "web-driver", "firefox",
+                                "linux",
+                                "geckodriver")
+            service = FirefoxService(executable_path=path)
+            log.debug("firefox驱动路径：" + path)
+            # options=options,
            browser = webdriver.Firefox(options=options, service=service)
        elif platform.system() == "Darwin":
-            service = FirefoxService(executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser',"web-driver","firefox","mac",
+            service = FirefoxService(
+                executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser', "web-driver",
+                                             "firefox", "mac",
                                             "geckodriver"))
            browser = webdriver.Firefox(options=options, service=service)
-
    elif web_browser == "chrome":
        # 创建Chrome浏览器对象并传入选项
-        web_browser = webdriver.Chrome(options=options, service=ChromeService(ChromeDriverManager().install()))
+        if platform.system() == "Darwin":
+            options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+        browser = webdriver.Chrome(options=options, service=ChromeService())
    elif web_browser == "chromium":
        binary_location = ""
        webdriver_location = ""
@@ -152,12 +161,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
                                              "chromedriver_mac64",
                                              "chromedriver")
        else:
-            print("")
-        # 指定浏览器路径
-        # print(binary_location)
-        # 指定浏览器路径
-        # options.binary_location = binary_location
-        # options.browser_version = "114"
+            error = ""
        # 设置驱动二进制可执行文件路径
        # service = ChromeService(executable_path=webdriver_location)
        service = ChromeService(executable_path=webdriver_location)
@@ -166,7 +170,15 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
    elif web_browser == "edge":
        browser = webdriver.Edge(options=options, service=EdgeService(EdgeChromiumDriverManager().install()))
    else:
-        print("")
+        error = ""
+
+    # 获取浏览器信息
+    browser_name = browser.capabilities['browserName']
+    browser_version = browser.capabilities['browserVersion']
+
+    # 输出浏览器信息
+    print("浏览器名称:", browser_name)
+    print("浏览器版本:", browser_version)

    if option is None:
        # 获取屏幕分辨率

--- a/utils/index.py
+++ b/utils/index.py
@@ -175,7 +175,7 @@ def yt_dlp_download(url, name):
    download_options = f'-f mp4'
    # 要执行的 shell 命令
    command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
-
+    # print(command)
    # 使用 subprocess 调用 shell 命令
    result = subprocess.run(command, shell=True, capture_output=True, text=True)