Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
1821fe14
Commit
1821fe14
authored
Jul 27, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:脚本适配ubuntu
parent
d0bd152d
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
86 additions
and
58 deletions
+86
-58
geckodriver
browser/web-driver/firefox/linux/geckodriver
+0
-0
pc_dcard.py
pc_dcard.py
+1
-2
pc_facebook.py
pc_facebook.py
+3
-1
pc_instagram.py
pc_instagram.py
+2
-2
pc_youtube.py
pc_youtube.py
+35
-15
test.py
test.py
+12
-17
createBrowserDriver.py
utils/createBrowserDriver.py
+32
-20
index.py
utils/index.py
+1
-1
No files found.
browser/web-driver/firefox/linux/geckodriver
View file @
1821fe14
No preview for this file type
pc_dcard.py
View file @
1821fe14
...
...
@@ -37,9 +37,8 @@ def reptile(browser=None, search_word=""):
base_url
=
"https://www.dcard.tw"
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
# 打开网页
# browser.get(base_url)
# time.sleep(3)
browser
.
get
(
f
"{base_url}/search?query={search_word}"
)
time
.
sleep
(
6
)
base_xpath
=
"//div[@role='main']//div[@data-key]//article"
# 内容块
element_content_list
=
browser
.
find_elements
(
'xpath'
,
base_xpath
)
...
...
pc_facebook.py
View file @
1821fe14
...
...
@@ -30,10 +30,12 @@ from selenium.webdriver.support import expected_conditions as EC
def
reptile
(
browser
=
None
,
search_word
=
""
):
print
(
f
"搜索词:{search_word}"
)
url
=
"https://www.facebook.com/"
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
Tru
e
)
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
Fals
e
)
# 打开网页
browser
.
get
(
url
)
# time.sleep(3)
try
:
# time.sleep(3)
# 检测是否要登录
login_input
=
browser
.
find_element
(
'xpath'
,
"//input[@name='email']"
)
password_input
=
browser
.
find_element
(
'xpath'
,
"//input[@name='pass']"
)
...
...
pc_instagram.py
View file @
1821fe14
...
...
@@ -35,7 +35,7 @@ def reptile(browser=None, search_word=""):
print
(
f
"搜索词:{search_word}"
)
base_url
=
"https://www.instagram.com/"
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
# print(browser)
# 打开网页
browser
.
get
(
base_url
)
...
...
@@ -103,7 +103,7 @@ def reptile(browser=None, search_word=""):
title
=
""
img_soup
=
BeautifulSoup
(
item
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
.
find
(
"img"
)
del
img_soup
[
"srcset"
]
img_soup
[
"style"
]
=
"width:100
%
"
img_soup
[
"style"
]
=
"width:100
%
"
src
=
item
.
get_attribute
(
"src"
)
else
:
# 有视频,图片链接从列表中提取
...
...
pc_youtube.py
View file @
1821fe14
import
json
import
platform
import
time
from
bs4
import
BeautifulSoup
from
utils.Logger
import
log
...
...
@@ -30,28 +31,47 @@ def reptile(browser=None, search_word=""):
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
False
)
# print(browser)
# 打开网页
print
(
f
"搜索词:{search_word}"
)
url
=
f
'https://www.youtube.com/results?search_query={search_word}'
browser
.
get
(
url
)
# print(browser.page_source)
if
platform
.
system
()
==
"Linux"
:
time
.
sleep
(
3
)
else
:
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@id='contents']"
)))
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@id='contents']"
)))
log
.
debug
(
"youtube login complete"
)
classify_video_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a"
)
element_author_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@id='contents']//ytd-video-renderer//ytd-channel-name//yt-formatted-string/a"
)
element_time_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@id='contents']//ytd-video-renderer//ytd-video-meta-block//div[@id='metadata-line']/span[2]"
)
length
=
len
(
classify_video_list
)
video_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@id='contents']//ytd-video-renderer"
)
# print(video_list[0].get_attribute("outerHTML"))
length
=
len
(
video_list
)
for
index
in
range
(
length
):
title
=
classify_video_list
[
index
]
.
get_attribute
(
'title'
)
link
=
classify_video_list
[
index
]
.
get_attribute
(
'href'
)
# 查找标题
author_element
=
video_list
[
index
]
.
find_element
(
"xpath"
,
"./div[1]/div/div[2]//ytd-channel-name//yt-formatted-string/a"
)
# print(author_element.get_attribute("outerHTML"))
title_element
=
video_list
[
index
]
.
find_element
(
"xpath"
,
".//div[@id='title-wrapper']//a"
)
# print(title_element.get_attribute("outerHTML"))
time_element
=
video_list
[
index
]
.
find_element
(
"xpath"
,
".//ytd-video-meta-block//div[@id='metadata-line']/span[2]"
)
# print(time_element.get_attribute("outerHTML"))
title
=
title_element
.
get_attribute
(
'title'
)
link
=
title_element
.
get_attribute
(
'href'
)
id
=
link
.
split
(
"?"
)[
1
]
.
split
(
"&"
)[
0
]
.
replace
(
"v="
,
""
)
url
=
f
'https://www.youtube.com/watch?v={id}'
if
index
<
6
and
YouTube
(
url
)
.
length
//
60
<
60
:
# 时长按照秒计算
video_duration
=
int
(
YouTube
(
url
)
.
length
)
//
60
# 暂时先取6条数据
if
index
<
6
and
video_duration
<
60
:
# print(str(id))
# print("视频连接:" + str(link))
# print("视频时长:" + str(video_duration))
base_urr
=
get_base_file_url
()
releaseTime
=
""
try
:
releaseTime
=
str
(
int
(
convert_string_to_time
(
element_time_list
[
index
]
.
text
)))
releaseTime
=
str
(
int
(
convert_string_to_time
(
time_element
.
text
)))
except
:
releaseTime
=
str
(
int
(
time
.
time
()))
video_url
=
[]
...
...
@@ -62,7 +82,7 @@ def reptile(browser=None, search_word=""):
# 下载视频
state_download
=
yt_dlp_download
(
url
,
'youtube'
)
video_url
.
append
(
download_dir
)
# print(str(state_download))
if
state_download
:
# 组装数据
obj
=
{
...
...
@@ -72,7 +92,7 @@ def reptile(browser=None, search_word=""):
"link"
:
link
,
"reptileTime"
:
str
(
int
(
time
.
time
())),
"type"
:
'视频'
,
"author"
:
element_author_list
[
index
]
.
text
,
"author"
:
author_element
.
text
,
"releaseTime"
:
releaseTime
}
data
.
append
(
obj
)
...
...
test.py
View file @
1821fe14
import
os
import
pytesseract
from
PIL
import
Image
# 指定 Tesseract OCR 的执行路径(可选,如果已经配置环境变量,则无需此步骤)
cmd_path
=
"/usr/local/Cellar/tesseract/5.3.2/share/tessdata"
img_path
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'reptile-data'
,
"instagram"
,
"Cr8vg2MyNFz.jpg"
)
pytesseract
.
pytesseract
.
tesseract_cmd
=
cmd_path
# 打开图片
image
=
Image
.
open
(
img_path
)
# 进行图片文字识别
text
=
pytesseract
.
image_to_string
(
image
,
lang
=
'chi_sim'
)
# 输出识别的文字
print
(
text
)
\ No newline at end of file
# set options to be headless, ..
from
selenium
import
webdriver
options
=
webdriver
.
ChromeOptions
()
options
.
add_argument
(
'--headless'
)
options
.
add_argument
(
'--no-sandbox'
)
options
.
add_argument
(
'--disable-dev-shm-usage'
)
# open it, go to a website, and get results
wd
=
webdriver
.
Chrome
(
options
=
options
)
wd
.
get
(
"https://www.youtube.com/results?search_query=俄乌战争"
)
print
(
wd
.
page_source
)
# results
\ No newline at end of file
utils/createBrowserDriver.py
View file @
1821fe14
import
os
import
platform
import
sys
from
utils.Logger
import
log
from
selenium
import
webdriver
# --------------- selenium 依赖 start ----------------
from
selenium.webdriver.chrome.service
import
Service
as
ChromeService
...
...
@@ -29,7 +29,7 @@ from utils.index import get_screen_resolution
'''
def
create
(
option
=
None
,
no_headless
=
False
,
using_user_data
=
True
,
web_browser
=
"
firefox
"
):
def
create
(
option
=
None
,
no_headless
=
False
,
using_user_data
=
True
,
web_browser
=
"
chrome
"
):
"""
生成selenium实例
...
...
@@ -86,20 +86,22 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
# chrome_options.add_argument('--headless')
# options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置
# options.add_argument("--start-maximized") # 最大化窗口
if
no_headless
==
True
:
options
.
add_argument
(
"--no-sandbox"
)
if
no_headless
:
if
platform
.
system
()
==
"Linux"
or
platform
.
system
()
==
"Darwin"
:
# 开启无头模式
options
.
add_argument
(
"-headless"
)
options
.
add_argument
(
"-
-
headless"
)
elif
platform
.
system
()
==
"Windows"
and
web_browser
==
"firefox"
:
# windows系统、火狐浏览器不开启无头模式
print
(
""
)
# print("")
error
=
""
if
option
!=
None
:
if
no_headless
:
# 无头模式下禁用gpu加速
options
.
add_argument
(
'--disable-gpu'
)
# 无头模式-linux 系统
if
option
!=
None
and
platform
.
system
()
==
"Linux"
:
if
no_headless
and
platform
.
system
()
==
"Linux"
:
'''
--disable-dev-shm-usage 是 Chrome 浏览器在无头模式下运行时的一个常用启动参数。在 Linux 系统下特别常见,通过这个参数,Chrome 浏览器会禁用对 /dev/shm 的使用。
'''
...
...
@@ -119,17 +121,24 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
"geckodriver.exe"
))
browser
=
webdriver
.
Firefox
(
options
=
options
,
service
=
service
)
elif
platform
.
system
()
==
"Linux"
:
service
=
FirefoxService
(
executable_path
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'browser'
,
"web-driver"
,
"firefox"
,
"linux"
,
"geckodriver"
))
path
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'browser'
,
"web-driver"
,
"firefox"
,
"linux"
,
"geckodriver"
)
service
=
FirefoxService
(
executable_path
=
path
)
log
.
debug
(
"firefox驱动路径:"
+
path
)
# options=options,
browser
=
webdriver
.
Firefox
(
options
=
options
,
service
=
service
)
elif
platform
.
system
()
==
"Darwin"
:
service
=
FirefoxService
(
executable_path
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'browser'
,
"web-driver"
,
"firefox"
,
"mac"
,
service
=
FirefoxService
(
executable_path
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'browser'
,
"web-driver"
,
"firefox"
,
"mac"
,
"geckodriver"
))
browser
=
webdriver
.
Firefox
(
options
=
options
,
service
=
service
)
elif
web_browser
==
"chrome"
:
# 创建Chrome浏览器对象并传入选项
web_browser
=
webdriver
.
Chrome
(
options
=
options
,
service
=
ChromeService
(
ChromeDriverManager
()
.
install
()))
if
platform
.
system
()
==
"Darwin"
:
options
.
binary_location
=
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
browser
=
webdriver
.
Chrome
(
options
=
options
,
service
=
ChromeService
())
elif
web_browser
==
"chromium"
:
binary_location
=
""
webdriver_location
=
""
...
...
@@ -152,12 +161,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
"chromedriver_mac64"
,
"chromedriver"
)
else
:
print
(
""
)
# 指定浏览器路径
# print(binary_location)
# 指定浏览器路径
# options.binary_location = binary_location
# options.browser_version = "114"
error
=
""
# 设置驱动二进制可执行文件路径
# service = ChromeService(executable_path=webdriver_location)
service
=
ChromeService
(
executable_path
=
webdriver_location
)
...
...
@@ -166,7 +170,15 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
elif
web_browser
==
"edge"
:
browser
=
webdriver
.
Edge
(
options
=
options
,
service
=
EdgeService
(
EdgeChromiumDriverManager
()
.
install
()))
else
:
print
(
""
)
error
=
""
# 获取浏览器信息
browser_name
=
browser
.
capabilities
[
'browserName'
]
browser_version
=
browser
.
capabilities
[
'browserVersion'
]
# 输出浏览器信息
print
(
"浏览器名称:"
,
browser_name
)
print
(
"浏览器版本:"
,
browser_version
)
if
option
is
None
:
# 获取屏幕分辨率
...
...
utils/index.py
View file @
1821fe14
...
...
@@ -175,7 +175,7 @@ def yt_dlp_download(url, name):
download_options
=
f
'-f mp4'
# 要执行的 shell 命令
command
=
f
'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
# print(command)
# 使用 subprocess 调用 shell 命令
result
=
subprocess
.
run
(
command
,
shell
=
True
,
capture_output
=
True
,
text
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment