Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
c712ff68
Commit
c712ff68
authored
Jul 26, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:time.sleep()更换为 WebDriverWait
parent
37ffd734
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
66 additions
and
48 deletions
+66
-48
pc_dcard.py
pc_dcard.py
+2
-1
pc_facebook.py
pc_facebook.py
+14
-4
pc_instagram.py
pc_instagram.py
+2
-0
pc_ptt.py
pc_ptt.py
+1
-1
pc_twitter.py
pc_twitter.py
+16
-8
pc_youtube.py
pc_youtube.py
+13
-4
test.py
test.py
+13
-21
createBrowserDriver.py
utils/createBrowserDriver.py
+1
-1
index.py
utils/index.py
+4
-8
No files found.
pc_dcard.py
View file @
c712ff68
...
...
@@ -35,9 +35,10 @@ def reptile(browser=None, search_word=""):
"""
print
(
f
"搜索词:{search_word}"
)
base_url
=
"https://www.dcard.tw"
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
browser
=
browser
or
create
(
no_headless
=
False
,
using_user_data
=
True
)
# 打开网页
# browser.get(base_url)
# time.sleep(3)
browser
.
get
(
f
"{base_url}/search?query={search_word}"
)
base_xpath
=
"//div[@role='main']//div[@data-key]//article"
# 内容块
...
...
pc_facebook.py
View file @
c712ff68
...
...
@@ -13,6 +13,12 @@ import os
from
config.settings
import
get_base_file_url
from
config.settings
import
get_account
import
sys
# --------------- selenium 依赖 start ----------------
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
# --------------- selenium 依赖 end ----------------
# 工具函数-下载图片
'''
...
...
@@ -23,7 +29,7 @@ import sys
def
reptile
(
browser
=
None
,
search_word
=
""
):
print
(
f
"搜索词:{search_word}"
)
url
=
"https://www.facebook.com/"
browser
=
browser
or
create
(
no_headless
=
Tru
e
,
using_user_data
=
True
)
browser
=
browser
or
create
(
no_headless
=
Fals
e
,
using_user_data
=
True
)
# 打开网页
browser
.
get
(
url
)
try
:
...
...
@@ -35,15 +41,19 @@ def reptile(browser=None, search_word=""):
# 获取登录按钮
button_login
=
browser
.
find_element
(
'xpath'
,
"//button[@name='login']"
)
button_login
.
click
()
time
.
sleep
(
6
)
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@role='main']"
)))
except
:
print
(
"已登录"
)
log
.
debug
(
"facebook login complete"
)
url
=
f
"https://www.facebook.com/search/top?q={search_word}"
browser
.
get
(
url
)
# 使用 JavaScript 将网页滚动到底部
browser
.
execute_script
(
"window.scrollTo(0, document.body.scrollHeight);"
)
time
.
sleep
(
3
)
# 等待内容出现,设置最长等待时间为10秒
wait
=
WebDriverWait
(
browser
,
10
)
# 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@role='feed']"
)))
# 内容
element_content_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]"
)
...
...
pc_instagram.py
View file @
c712ff68
...
...
@@ -56,6 +56,7 @@ def reptile(browser=None, search_word=""):
except
:
print
(
"------"
)
# print("1111")
log
.
debug
(
"instagram login complete"
)
url
=
f
"{base_url}explore/tags/{search_word}/"
browser
.
get
(
url
)
wait
=
WebDriverWait
(
browser
,
10
)
...
...
@@ -98,6 +99,7 @@ def reptile(browser=None, search_word=""):
if
len
(
title_str_list
)
>=
3
:
title
=
title_str_list
[
1
]
else
:
# 提取图片中的文字
title
=
""
img_soup
=
BeautifulSoup
(
item
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
.
find
(
"img"
)
del
img_soup
[
"srcset"
]
...
...
pc_ptt.py
View file @
c712ff68
...
...
@@ -33,7 +33,7 @@ def reptile(browser=None, search_word=""):
# browser = browser or create()
# 打开网页
browser
.
get
(
url
)
log
.
debug
(
"已打开浏览器"
)
#
log.debug("已打开浏览器")
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
# log.debug(classify_item_list)
length
=
len
(
classify_item_list
)
...
...
pc_twitter.py
View file @
c712ff68
...
...
@@ -14,7 +14,12 @@ from utils.download_image import download_image
from
config.settings
import
get_base_file_url
from
config.settings
import
get_account
# 工具函数-下载图片
# --------------- selenium 依赖 start ----------------
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
# --------------- selenium 依赖 end ----------------
'''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
'''
...
...
@@ -28,32 +33,35 @@ def reptile(browser=None, search_word=""):
"""
print
(
f
"搜索词:{search_word}"
)
base_url
=
"https://twitter.com/"
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
browser
=
browser
or
create
(
no_headless
=
False
,
using_user_data
=
True
)
# print(browser)
# 打开网页
browser
.
get
(
base_url
)
time
.
sleep
(
3
)
time
.
sleep
(
2
)
try
:
# wait = WebDriverWait(browser, 20)
# wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
# 检测是否要登录
login_input
=
browser
.
find_element
(
'xpath'
,
"//input[@autocomplete='username']"
)
login_input
.
send_keys
(
get_account
(
"twitter"
)[
"name"
])
# 获取下一步按钮
buttons
=
browser
.
find_element
(
'xpath'
,
"//div[@role='button'][2]"
)
buttons
.
click
()
time
.
sleep
(
3
)
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//input[@autocomplete='current-password']"
)))
password_input
=
browser
.
find_element
(
'xpath'
,
"//input[@autocomplete='current-password']"
)
password_input
.
send_keys
(
get_account
(
"twitter"
)[
"password"
])
# # 获取登录按钮
button_login
=
browser
.
find_element
(
'xpath'
,
"//div[@data-testid='LoginForm_Login_Button']"
)
button_login
.
click
()
time
.
sleep
(
1
)
except
:
print
(
"------"
)
time
.
sleep
(
2
)
# print("1111")
url
=
'https://twitter.com/search?q='
+
search_word
+
'&src=typed_query'
browser
.
get
(
url
)
time
.
sleep
(
4
)
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
)))
base_xpath
=
"//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
# 内容块
element_content_list
=
browser
.
find_elements
(
'xpath'
,
base_xpath
)
...
...
pc_youtube.py
View file @
c712ff68
...
...
@@ -12,8 +12,14 @@ import os
from
config.settings
import
get_base_file_url
from
selenium.webdriver.common.action_chains
import
ActionChains
import
sys
# --------------- selenium 依赖 start ----------------
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
# --------------- selenium 依赖 end ----------------
def
reptile
(
browser
=
None
,
search_word
=
""
):
"""
...
...
@@ -21,12 +27,14 @@ def reptile(browser=None, search_word=""):
:param search_word:
:return:
"""
browser
=
browser
or
create
(
no_headless
=
True
,
using_user_data
=
True
)
browser
=
browser
or
create
(
no_headless
=
False
,
using_user_data
=
True
)
# print(browser)
# 打开网页
url
=
f
'https://www.youtube.com/results?search_query={search_word}'
browser
.
get
(
url
)
# time.sleep(2)
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@id='contents']"
)))
log
.
debug
(
"youtube login complete"
)
classify_video_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a"
)
element_author_list
=
browser
.
find_elements
(
'xpath'
,
...
...
@@ -54,6 +62,7 @@ def reptile(browser=None, search_word=""):
# 下载视频
state_download
=
yt_dlp_download
(
url
,
'youtube'
)
video_url
.
append
(
download_dir
)
if
state_download
:
# 组装数据
obj
=
{
...
...
@@ -68,8 +77,8 @@ def reptile(browser=None, search_word=""):
}
data
.
append
(
obj
)
else
:
print
(
""
)
#
print("")
error
=
""
if
len
(
data
)
>
0
:
# 保存json文件到本地
# log.debug(os.path.abspath("../"))
...
...
test.py
View file @
c712ff68
This diff is collapsed.
Click to expand it.
utils/createBrowserDriver.py
View file @
c712ff68
...
...
@@ -87,7 +87,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
# options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置
# options.add_argument("--start-maximized") # 最大化窗口
if
no_headless
==
True
:
if
platform
.
system
()
==
"Linux"
and
platform
.
system
()
==
"Darwin"
:
if
platform
.
system
()
==
"Linux"
or
platform
.
system
()
==
"Darwin"
:
# 开启无头模式
options
.
add_argument
(
"-headless"
)
elif
platform
.
system
()
==
"Windows"
and
web_browser
==
"firefox"
:
...
...
utils/index.py
View file @
c712ff68
...
...
@@ -166,19 +166,15 @@ def pytube_download(link, file_dir):
def
yt_dlp_download
(
url
,
name
):
file_dir
=
os
.
path
.
abspath
(
"../"
)
options
=
f
'-v'
network_options
=
f
'-o "{os.path.join(file_dir, "network-assets-reptile", "reptile_data", name, "
%(id)
s.
%(ext)
s")}"'
geo
=
""
# --get-url
video_selection
=
f
''
# 清晰度
definition
=
f
'18'
# 360p
#
definition = f'18' # 360p
# definition = f'18' # 720p
# definition = f'24' # 1080p
download_options
=
f
'-f {definition}
-vU'
other_options
=
f
'--verbose
'
# f'-f 18
-vU'
download_options
=
f
'-f mp4
'
# 要执行的 shell 命令
command
=
f
'yt-dlp
{options} {network_options} {geo} {video_selection} {download_options} {other_options}
-- {url}'
command
=
f
'yt-dlp
-v {download_options} {network_options} --verbose
-- {url}'
# 使用 subprocess 调用 shell 命令
result
=
subprocess
.
run
(
command
,
shell
=
True
,
capture_output
=
True
,
text
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment