Commit 1821fe14 authored by liyang

feat:脚本适配ubuntu

parent d0bd152d
...@@ -37,9 +37,8 @@ def reptile(browser=None, search_word=""): ...@@ -37,9 +37,8 @@ def reptile(browser=None, search_word=""):
base_url = "https://www.dcard.tw" base_url = "https://www.dcard.tw"
browser = browser or create(no_headless=True, using_user_data=True) browser = browser or create(no_headless=True, using_user_data=True)
# 打开网页 # 打开网页
# browser.get(base_url)
# time.sleep(3)
browser.get(f"{base_url}/search?query={search_word}") browser.get(f"{base_url}/search?query={search_word}")
time.sleep(6)
base_xpath = "//div[@role='main']//div[@data-key]//article" base_xpath = "//div[@role='main']//div[@data-key]//article"
# 内容块 # 内容块
element_content_list = browser.find_elements('xpath', base_xpath) element_content_list = browser.find_elements('xpath', base_xpath)
......
...@@ -30,10 +30,12 @@ from selenium.webdriver.support import expected_conditions as EC ...@@ -30,10 +30,12 @@ from selenium.webdriver.support import expected_conditions as EC
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}") print(f"搜索词:{search_word}")
url = "https://www.facebook.com/" url = "https://www.facebook.com/"
browser = browser or create(no_headless=True,using_user_data=True) browser = browser or create(no_headless=True, using_user_data=False)
# 打开网页 # 打开网页
browser.get(url) browser.get(url)
# time.sleep(3)
try: try:
# time.sleep(3)
# 检测是否要登录 # 检测是否要登录
login_input = browser.find_element('xpath', "//input[@name='email']") login_input = browser.find_element('xpath', "//input[@name='email']")
password_input = browser.find_element('xpath', "//input[@name='pass']") password_input = browser.find_element('xpath', "//input[@name='pass']")
......
...@@ -35,7 +35,7 @@ def reptile(browser=None, search_word=""): ...@@ -35,7 +35,7 @@ def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}") print(f"搜索词:{search_word}")
base_url = "https://www.instagram.com/" base_url = "https://www.instagram.com/"
browser = browser or create(no_headless=True,using_user_data=True) browser = browser or create(no_headless=True, using_user_data=True)
# print(browser) # print(browser)
# 打开网页 # 打开网页
browser.get(base_url) browser.get(base_url)
...@@ -103,7 +103,7 @@ def reptile(browser=None, search_word=""): ...@@ -103,7 +103,7 @@ def reptile(browser=None, search_word=""):
title = "" title = ""
img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img") img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
del img_soup["srcset"] del img_soup["srcset"]
img_soup["style"]="width:100%" img_soup["style"] = "width:100%"
src = item.get_attribute("src") src = item.get_attribute("src")
else: else:
# 有视频,图片链接从列表中提取 # 有视频,图片链接从列表中提取
......
import json import json
import platform
import time import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from utils.Logger import log from utils.Logger import log
...@@ -30,28 +31,47 @@ def reptile(browser=None, search_word=""): ...@@ -30,28 +31,47 @@ def reptile(browser=None, search_word=""):
browser = browser or create(no_headless=True, using_user_data=False) browser = browser or create(no_headless=True, using_user_data=False)
# print(browser) # print(browser)
# 打开网页 # 打开网页
print(f"搜索词:{search_word}")
url = f'https://www.youtube.com/results?search_query={search_word}' url = f'https://www.youtube.com/results?search_query={search_word}'
browser.get(url) browser.get(url)
wait = WebDriverWait(browser, 10) # print(browser.page_source)
wait.until(EC.presence_of_element_located((By.XPATH,"//div[@id='contents']"))) if platform.system() == "Linux":
time.sleep(3)
else:
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='contents']")))
log.debug("youtube login complete") log.debug("youtube login complete")
classify_video_list = browser.find_elements('xpath', video_list = browser.find_elements('xpath', "//div[@id='contents']//ytd-video-renderer")
"//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a") # print(video_list[0].get_attribute("outerHTML"))
element_author_list = browser.find_elements('xpath', length = len(video_list)
"//div[@id='contents']//ytd-video-renderer//ytd-channel-name//yt-formatted-string/a")
element_time_list = browser.find_elements('xpath',
"//div[@id='contents']//ytd-video-renderer//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
length = len(classify_video_list)
for index in range(length): for index in range(length):
title = classify_video_list[index].get_attribute('title') # 查找标题
link = classify_video_list[index].get_attribute('href') author_element = video_list[index].find_element("xpath","./div[1]/div/div[2]//ytd-channel-name//yt-formatted-string/a")
# print(author_element.get_attribute("outerHTML"))
title_element = video_list[index].find_element("xpath",".//div[@id='title-wrapper']//a")
# print(title_element.get_attribute("outerHTML"))
time_element = video_list[index].find_element("xpath",".//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
# print(time_element.get_attribute("outerHTML"))
title = title_element.get_attribute('title')
link = title_element.get_attribute('href')
id = link.split("?")[1].split("&")[0].replace("v=", "") id = link.split("?")[1].split("&")[0].replace("v=", "")
url = f'https://www.youtube.com/watch?v={id}' url = f'https://www.youtube.com/watch?v={id}'
if index < 6 and YouTube(url).length // 60 < 60:
# 时长按照秒计算
video_duration = int(YouTube(url).length) // 60
# 暂时先取6条数据
if index < 6 and video_duration < 60:
# print(str(id))
# print("视频连接:" + str(link))
# print("视频时长:" + str(video_duration))
base_urr = get_base_file_url() base_urr = get_base_file_url()
releaseTime = "" releaseTime = ""
try: try:
releaseTime = str(int(convert_string_to_time(element_time_list[index].text))) releaseTime = str(int(convert_string_to_time(time_element.text)))
except: except:
releaseTime = str(int(time.time())) releaseTime = str(int(time.time()))
video_url = [] video_url = []
...@@ -62,7 +82,7 @@ def reptile(browser=None, search_word=""): ...@@ -62,7 +82,7 @@ def reptile(browser=None, search_word=""):
# 下载视频 # 下载视频
state_download = yt_dlp_download(url, 'youtube') state_download = yt_dlp_download(url, 'youtube')
video_url.append(download_dir) video_url.append(download_dir)
# print(str(state_download))
if state_download: if state_download:
# 组装数据 # 组装数据
obj = { obj = {
...@@ -72,7 +92,7 @@ def reptile(browser=None, search_word=""): ...@@ -72,7 +92,7 @@ def reptile(browser=None, search_word=""):
"link": link, "link": link,
"reptileTime": str(int(time.time())), "reptileTime": str(int(time.time())),
"type": '视频', "type": '视频',
"author": element_author_list[index].text, "author": author_element.text,
"releaseTime": releaseTime "releaseTime": releaseTime
} }
data.append(obj) data.append(obj)
......
import os # set options to be headless, ..
import pytesseract from selenium import webdriver
from PIL import Image options = webdriver.ChromeOptions()
options.add_argument('--headless')
# 指定 Tesseract OCR 的执行路径(可选,如果已经配置环境变量,则无需此步骤) options.add_argument('--no-sandbox')
cmd_path = "/usr/local/Cellar/tesseract/5.3.2/share/tessdata" options.add_argument('--disable-dev-shm-usage')
img_path = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'reptile-data', "instagram","Cr8vg2MyNFz.jpg")
pytesseract.pytesseract.tesseract_cmd = cmd_path # open it, go to a website, and get results
wd = webdriver.Chrome(options=options)
# 打开图片 wd.get("https://www.youtube.com/results?search_query=俄乌战争")
image = Image.open(img_path)
print(wd.page_source) # results
# 进行图片文字识别 \ No newline at end of file
text = pytesseract.image_to_string(image, lang='chi_sim')
# 输出识别的文字
print(text)
\ No newline at end of file
import os import os
import platform import platform
import sys import sys
from utils.Logger import log
from selenium import webdriver from selenium import webdriver
# --------------- selenium 依赖 start ---------------- # --------------- selenium 依赖 start ----------------
from selenium.webdriver.chrome.service import Service as ChromeService from selenium.webdriver.chrome.service import Service as ChromeService
...@@ -29,7 +29,7 @@ from utils.index import get_screen_resolution ...@@ -29,7 +29,7 @@ from utils.index import get_screen_resolution
''' '''
def create(option=None, no_headless=False, using_user_data=True, web_browser="firefox"): def create(option=None, no_headless=False, using_user_data=True, web_browser="chrome"):
""" """
生成selenium实例 生成selenium实例
...@@ -86,20 +86,22 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi ...@@ -86,20 +86,22 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
# chrome_options.add_argument('--headless') # chrome_options.add_argument('--headless')
# options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置 # options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置
# options.add_argument("--start-maximized") # 最大化窗口 # options.add_argument("--start-maximized") # 最大化窗口
if no_headless == True: options.add_argument("--no-sandbox")
if no_headless:
if platform.system() == "Linux" or platform.system() == "Darwin": if platform.system() == "Linux" or platform.system() == "Darwin":
# 开启无头模式 # 开启无头模式
options.add_argument("-headless") options.add_argument("--headless")
elif platform.system() == "Windows" and web_browser == "firefox": elif platform.system() == "Windows" and web_browser == "firefox":
# windows系统、火狐浏览器不开启无头模式 # windows系统、火狐浏览器不开启无头模式
print("") # print("")
error = ""
if option != None: if no_headless:
# 无头模式下禁用gpu加速 # 无头模式下禁用gpu加速
options.add_argument('--disable-gpu') options.add_argument('--disable-gpu')
# 无头模式-linux 系统 # 无头模式-linux 系统
if option != None and platform.system() == "Linux": if no_headless and platform.system() == "Linux":
''' '''
--disable-dev-shm-usage 是 Chrome 浏览器在无头模式下运行时的一个常用启动参数。在 Linux 系统下特别常见,通过这个参数,Chrome 浏览器会禁用对 /dev/shm 的使用。 --disable-dev-shm-usage 是 Chrome 浏览器在无头模式下运行时的一个常用启动参数。在 Linux 系统下特别常见,通过这个参数,Chrome 浏览器会禁用对 /dev/shm 的使用。
''' '''
...@@ -119,17 +121,24 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi ...@@ -119,17 +121,24 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
"geckodriver.exe")) "geckodriver.exe"))
browser = webdriver.Firefox(options=options, service=service) browser = webdriver.Firefox(options=options, service=service)
elif platform.system() == "Linux": elif platform.system() == "Linux":
service = FirefoxService(executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser',"web-driver","firefox","linux", path = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser', "web-driver", "firefox",
"geckodriver")) "linux",
"geckodriver")
service = FirefoxService(executable_path=path)
log.debug("firefox驱动路径:" + path)
# options=options,
browser = webdriver.Firefox(options=options, service=service) browser = webdriver.Firefox(options=options, service=service)
elif platform.system() == "Darwin": elif platform.system() == "Darwin":
service = FirefoxService(executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser',"web-driver","firefox","mac", service = FirefoxService(
"geckodriver")) executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser', "web-driver",
"firefox", "mac",
"geckodriver"))
browser = webdriver.Firefox(options=options, service=service) browser = webdriver.Firefox(options=options, service=service)
elif web_browser == "chrome": elif web_browser == "chrome":
# 创建Chrome浏览器对象并传入选项 # 创建Chrome浏览器对象并传入选项
web_browser = webdriver.Chrome(options=options, service=ChromeService(ChromeDriverManager().install())) if platform.system() == "Darwin":
options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
browser = webdriver.Chrome(options=options, service=ChromeService())
elif web_browser == "chromium": elif web_browser == "chromium":
binary_location = "" binary_location = ""
webdriver_location = "" webdriver_location = ""
...@@ -152,12 +161,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi ...@@ -152,12 +161,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
"chromedriver_mac64", "chromedriver_mac64",
"chromedriver") "chromedriver")
else: else:
print("") error = ""
# 指定浏览器路径
# print(binary_location)
# 指定浏览器路径
# options.binary_location = binary_location
# options.browser_version = "114"
# 设置驱动二进制可执行文件路径 # 设置驱动二进制可执行文件路径
# service = ChromeService(executable_path=webdriver_location) # service = ChromeService(executable_path=webdriver_location)
service = ChromeService(executable_path=webdriver_location) service = ChromeService(executable_path=webdriver_location)
...@@ -166,7 +170,15 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi ...@@ -166,7 +170,15 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
elif web_browser == "edge": elif web_browser == "edge":
browser = webdriver.Edge(options=options, service=EdgeService(EdgeChromiumDriverManager().install())) browser = webdriver.Edge(options=options, service=EdgeService(EdgeChromiumDriverManager().install()))
else: else:
print("") error = ""
# 获取浏览器信息
browser_name = browser.capabilities['browserName']
browser_version = browser.capabilities['browserVersion']
# 输出浏览器信息
print("浏览器名称:", browser_name)
print("浏览器版本:", browser_version)
if option is None: if option is None:
# 获取屏幕分辨率 # 获取屏幕分辨率
......
...@@ -175,7 +175,7 @@ def yt_dlp_download(url, name): ...@@ -175,7 +175,7 @@ def yt_dlp_download(url, name):
download_options = f'-f mp4' download_options = f'-f mp4'
# 要执行的 shell 命令 # 要执行的 shell 命令
command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}' command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
# print(command)
# 使用 subprocess 调用 shell 命令 # 使用 subprocess 调用 shell 命令
result = subprocess.run(command, shell=True, capture_output=True, text=True) result = subprocess.run(command, shell=True, capture_output=True, text=True)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment