Commit 1821fe14 authored by liyang

feat:脚本适配ubuntu

parent d0bd152d
......@@ -37,9 +37,8 @@ def reptile(browser=None, search_word=""):
base_url = "https://www.dcard.tw"
browser = browser or create(no_headless=True, using_user_data=True)
# 打开网页
# browser.get(base_url)
# time.sleep(3)
browser.get(f"{base_url}/search?query={search_word}")
time.sleep(6)
base_xpath = "//div[@role='main']//div[@data-key]//article"
# 内容块
element_content_list = browser.find_elements('xpath', base_xpath)
......
......@@ -30,10 +30,12 @@ from selenium.webdriver.support import expected_conditions as EC
def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}")
url = "https://www.facebook.com/"
browser = browser or create(no_headless=True,using_user_data=True)
browser = browser or create(no_headless=True, using_user_data=False)
# 打开网页
browser.get(url)
# time.sleep(3)
try:
# time.sleep(3)
# 检测是否要登录
login_input = browser.find_element('xpath', "//input[@name='email']")
password_input = browser.find_element('xpath', "//input[@name='pass']")
......
......@@ -35,7 +35,7 @@ def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}")
base_url = "https://www.instagram.com/"
browser = browser or create(no_headless=True,using_user_data=True)
browser = browser or create(no_headless=True, using_user_data=True)
# print(browser)
# 打开网页
browser.get(base_url)
......@@ -103,7 +103,7 @@ def reptile(browser=None, search_word=""):
title = ""
img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
del img_soup["srcset"]
img_soup["style"]="width:100%"
img_soup["style"] = "width:100%"
src = item.get_attribute("src")
else:
# 有视频,图片链接从列表中提取
......
import json
import platform
import time
from bs4 import BeautifulSoup
from utils.Logger import log
......@@ -30,28 +31,47 @@ def reptile(browser=None, search_word=""):
browser = browser or create(no_headless=True, using_user_data=False)
# print(browser)
# 打开网页
print(f"搜索词:{search_word}")
url = f'https://www.youtube.com/results?search_query={search_word}'
browser.get(url)
# print(browser.page_source)
if platform.system() == "Linux":
time.sleep(3)
else:
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH,"//div[@id='contents']")))
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='contents']")))
log.debug("youtube login complete")
classify_video_list = browser.find_elements('xpath',
"//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a")
element_author_list = browser.find_elements('xpath',
"//div[@id='contents']//ytd-video-renderer//ytd-channel-name//yt-formatted-string/a")
element_time_list = browser.find_elements('xpath',
"//div[@id='contents']//ytd-video-renderer//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
length = len(classify_video_list)
video_list = browser.find_elements('xpath', "//div[@id='contents']//ytd-video-renderer")
# print(video_list[0].get_attribute("outerHTML"))
length = len(video_list)
for index in range(length):
title = classify_video_list[index].get_attribute('title')
link = classify_video_list[index].get_attribute('href')
# 查找标题
author_element = video_list[index].find_element("xpath","./div[1]/div/div[2]//ytd-channel-name//yt-formatted-string/a")
# print(author_element.get_attribute("outerHTML"))
title_element = video_list[index].find_element("xpath",".//div[@id='title-wrapper']//a")
# print(title_element.get_attribute("outerHTML"))
time_element = video_list[index].find_element("xpath",".//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
# print(time_element.get_attribute("outerHTML"))
title = title_element.get_attribute('title')
link = title_element.get_attribute('href')
id = link.split("?")[1].split("&")[0].replace("v=", "")
url = f'https://www.youtube.com/watch?v={id}'
if index < 6 and YouTube(url).length // 60 < 60:
# 时长按照秒计算
video_duration = int(YouTube(url).length) // 60
# 暂时先取6条数据
if index < 6 and video_duration < 60:
# print(str(id))
# print("视频连接:" + str(link))
# print("视频时长:" + str(video_duration))
base_urr = get_base_file_url()
releaseTime = ""
try:
releaseTime = str(int(convert_string_to_time(element_time_list[index].text)))
releaseTime = str(int(convert_string_to_time(time_element.text)))
except:
releaseTime = str(int(time.time()))
video_url = []
......@@ -62,7 +82,7 @@ def reptile(browser=None, search_word=""):
# 下载视频
state_download = yt_dlp_download(url, 'youtube')
video_url.append(download_dir)
# print(str(state_download))
if state_download:
# 组装数据
obj = {
......@@ -72,7 +92,7 @@ def reptile(browser=None, search_word=""):
"link": link,
"reptileTime": str(int(time.time())),
"type": '视频',
"author": element_author_list[index].text,
"author": author_element.text,
"releaseTime": releaseTime
}
data.append(obj)
......
# Ad-hoc OCR check: run Tesseract on one downloaded Instagram image and print
# the recognised text.
import os
import shutil
import pytesseract
from PIL import Image
# Path of the Tesseract OCR executable (optional when tesseract is already on
# PATH).  pytesseract's ``tesseract_cmd`` must name the *executable*; the
# previous value pointed at the ``share/tessdata`` directory, which cannot be
# run.  Prefer whatever ``tesseract`` is found on PATH and fall back to the
# binary of the same Homebrew 5.3.2 install.
cmd_path = shutil.which("tesseract") or "/usr/local/Cellar/tesseract/5.3.2/bin/tesseract"
img_path = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'reptile-data', "instagram", "Cr8vg2MyNFz.jpg")
pytesseract.pytesseract.tesseract_cmd = cmd_path
# Open the image; the context manager closes the file handle once OCR is done.
with Image.open(img_path) as image:
    # Run text recognition with the 'chi_sim' language data.
    text = pytesseract.image_to_string(image, lang='chi_sim')
# Print the recognised text.
print(text)
\ No newline at end of file
# Smoke test: start headless Chrome, load a YouTube search page and dump its
# HTML to stdout.
from selenium import webdriver
# Set options to be headless; --disable-dev-shm-usage stops Chrome from using
# /dev/shm.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Open the browser, go to the website, and print the result.
wd = webdriver.Chrome(options=options)
try:
    wd.get("https://www.youtube.com/results?search_query=俄乌战争")
    print(wd.page_source)  # results
finally:
    # Always end the session, even if the page load fails; otherwise the
    # chromedriver and Chrome processes are left running after the script.
    wd.quit()
\ No newline at end of file
import os
import platform
import sys
from utils.Logger import log
from selenium import webdriver
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.chrome.service import Service as ChromeService
......@@ -29,7 +29,7 @@ from utils.index import get_screen_resolution
'''
def create(option=None, no_headless=False, using_user_data=True, web_browser="firefox"):
def create(option=None, no_headless=False, using_user_data=True, web_browser="chrome"):
"""
生成selenium实例
......@@ -86,20 +86,22 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
# chrome_options.add_argument('--headless')
# options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置
# options.add_argument("--start-maximized") # 最大化窗口
if no_headless == True:
options.add_argument("--no-sandbox")
if no_headless:
if platform.system() == "Linux" or platform.system() == "Darwin":
# 开启无头模式
options.add_argument("-headless")
options.add_argument("--headless")
elif platform.system() == "Windows" and web_browser == "firefox":
# windows系统、火狐浏览器不开启无头模式
print("")
# print("")
error = ""
if option != None:
if no_headless:
# 无头模式下禁用gpu加速
options.add_argument('--disable-gpu')
# 无头模式-linux 系统
if option != None and platform.system() == "Linux":
if no_headless and platform.system() == "Linux":
'''
--disable-dev-shm-usage 是 Chrome 浏览器在无头模式下运行时的一个常用启动参数。在 Linux 系统下特别常见,通过这个参数,Chrome 浏览器会禁用对 /dev/shm 的使用。
'''
......@@ -119,17 +121,24 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
"geckodriver.exe"))
browser = webdriver.Firefox(options=options, service=service)
elif platform.system() == "Linux":
service = FirefoxService(executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser',"web-driver","firefox","linux",
"geckodriver"))
path = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser', "web-driver", "firefox",
"linux",
"geckodriver")
service = FirefoxService(executable_path=path)
log.debug("firefox驱动路径:" + path)
# options=options,
browser = webdriver.Firefox(options=options, service=service)
elif platform.system() == "Darwin":
service = FirefoxService(executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser',"web-driver","firefox","mac",
service = FirefoxService(
executable_path=os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'browser', "web-driver",
"firefox", "mac",
"geckodriver"))
browser = webdriver.Firefox(options=options, service=service)
elif web_browser == "chrome":
# 创建Chrome浏览器对象并传入选项
web_browser = webdriver.Chrome(options=options, service=ChromeService(ChromeDriverManager().install()))
if platform.system() == "Darwin":
options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
browser = webdriver.Chrome(options=options, service=ChromeService())
elif web_browser == "chromium":
binary_location = ""
webdriver_location = ""
......@@ -152,12 +161,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
"chromedriver_mac64",
"chromedriver")
else:
print("")
# 指定浏览器路径
# print(binary_location)
# 指定浏览器路径
# options.binary_location = binary_location
# options.browser_version = "114"
error = ""
# 设置驱动二进制可执行文件路径
# service = ChromeService(executable_path=webdriver_location)
service = ChromeService(executable_path=webdriver_location)
......@@ -166,7 +170,15 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
elif web_browser == "edge":
browser = webdriver.Edge(options=options, service=EdgeService(EdgeChromiumDriverManager().install()))
else:
print("")
error = ""
# 获取浏览器信息
browser_name = browser.capabilities['browserName']
browser_version = browser.capabilities['browserVersion']
# 输出浏览器信息
print("浏览器名称:", browser_name)
print("浏览器版本:", browser_version)
if option is None:
# 获取屏幕分辨率
......
......@@ -175,7 +175,7 @@ def yt_dlp_download(url, name):
download_options = f'-f mp4'
# 要执行的 shell 命令
command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
# print(command)
# 使用 subprocess 调用 shell 命令
result = subprocess.run(command, shell=True, capture_output=True, text=True)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment