feat:ins爬虫数据条件过滤优化

46ec2eee · liyang · 6774ae74 · 46ec2eee · 46ec2eee · 46ec2eee
Commit 46ec2eee authored Jul 25, 2023 by liyang
Show whitespace changes
Inline Side-by-side

Showing with 27 additions and 10 deletions

pc_instagram.py pc_instagram.py +3 -2

createBrowserDriver.py utils/createBrowserDriver.py +15 -3

download_image.py utils/download_image.py +9 -5

No files found.
--- a/pc_instagram.py
+++ b/pc_instagram.py
@@ -36,7 +36,7 @@ def reptile(browser=None, search_word=""):
    base_url = "https://www.instagram.com/"
    option = ['--headless']
    # ['--headless']
-    browser = browser or create(None, True)
+    browser = browser or create(option, True)
    # print(browser)
    # 打开网页
    browser.get(base_url)
@@ -123,7 +123,8 @@ def reptile(browser=None, search_word=""):
                # print(img_soup.prettify())
                soup.append(img_soup)
                picture_url.append(download_dir)
-
+            else:
+                picture_url.append("")
        content = soup.prettify()
        # 类型
        content_type = "图文"

--- a/utils/createBrowserDriver.py
+++ b/utils/createBrowserDriver.py
 import os
 import platform
+import sys

 from selenium import webdriver
 # ---------------   selenium 依赖 start ----------------
@@ -30,6 +31,7 @@ from utils.index import get_screen_resolution

 def create(option=None, using_user_data=True, web_browser="firefox"):
    """
+    生成selenium实例

    :param web_browser:
    :param using_user_data:
@@ -82,9 +84,19 @@ def create(option=None, using_user_data=True, web_browser="firefox"):
    # chrome_options.add_argument('--headless')
    # options.add_argument("--window-size=1920x1080")  # 设置窗口大小，这是一个常见的完全无头模式的设置
    # options.add_argument("--start-maximized")  # 最大化窗口
-    options.add_argument('--no-sandbox')
+
+    if option != None:
+        # 无头模式下禁用gpu加速
        options.add_argument('--disable-gpu')
+
+    # 无头模式-linux 系统
+    if option != None and platform.system() == "Linux":
+        '''
+            --disable-dev-shm-usage 是 Chrome 浏览器在无头模式下运行时的一个常用启动参数。在 Linux 系统下特别常见，通过这个参数，Chrome 浏览器会禁用对 /dev/shm 的使用。
+        '''
        options.add_argument('--disable-dev-shm-usage')
+        # 禁用沙盒模式
+        options.add_argument('--no-sandbox')
    # 加载chromedriver -------------------------------------------------
    # windows 下的 chromedriver 默认加载路径是当前路径下的 chromedriver.exe
    # linux 下的 chromedriver 默认加载路径是 /usr/local/bin/chromedriver

--- a/utils/download_image.py
+++ b/utils/download_image.py
 import requests
-
+import os

 def download_image(url, save_path):
    """
+    下载图片并保存到本地文件

-    :param url:
-    :param save_path:
-    :return:
+    :param url: 图片的 URL 地址
+    :param save_path: 图片保存的文件路径
+    :return: 下载成功返回 True，下载失败返回 False
    """
+    if os.path.exists(save_path):
+        # print(f"图片文件已存在：{save_path}")
+        return True
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
@@ -16,5 +20,5 @@ def download_image(url, save_path):
        # print(f"图片下载成功：{save_path}")
        return True
    else:
-        print(f"图片下载失败：{url}")
+        # print(f"图片下载失败：{url}")
        return False