liyang / network-assets-reptile · Commits

Commit c712ff68, authored Jul 26, 2023 by liyang
feat: replace time.sleep() with WebDriverWait
parent 37ffd734
Showing 9 changed files with 66 additions and 48 deletions (+66 -48)
pc_dcard.py                    +2   -1
pc_facebook.py                 +14  -4
pc_instagram.py                +2   -0
pc_ptt.py                      +1   -1
pc_twitter.py                  +16  -8
pc_youtube.py                  +13  -4
test.py                        +13  -21
utils/createBrowserDriver.py   +1   -1
utils/index.py                 +4   -8
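The commit swaps fixed time.sleep() pauses for Selenium explicit waits across the crawlers. As a rough standalone sketch of the pattern (not code from this repo; the driver setup and XPath below are placeholders), the idea is:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    browser = webdriver.Firefox()   # the repo builds its driver via utils.createBrowserDriver.create()
    browser.get("https://example.com")

    # Before: time.sleep(3) always blocks for the full 3 seconds.
    # After: block at most 10 seconds, and continue as soon as the element is present.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='main']")))

An explicit wait polls the DOM until the condition holds or the timeout expires, so slow pages still get their full load time while fast pages no longer pay the full sleep.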
pc_dcard.py

@@ -35,9 +35,10 @@ def reptile(browser=None, search_word=""):
     """
     print(f"搜索词:{search_word}")
     base_url = "https://www.dcard.tw"
-    browser = browser or create(no_headless=True, using_user_data=True)
+    browser = browser or create(no_headless=False, using_user_data=True)
     # 打开网页
     # browser.get(base_url)
+    # time.sleep(3)
     browser.get(f"{base_url}/search?query={search_word}")
     base_xpath = "//div[@role='main']//div[@data-key]//article"
     # 内容块
pc_facebook.py

@@ -13,6 +13,12 @@ import os
 from config.settings import get_base_file_url
 from config.settings import get_account
 import sys
+# --------------- selenium 依赖 start ----------------
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+# --------------- selenium 依赖 end ----------------
 # 工具函数-下载图片
 '''
@@ -23,7 +29,7 @@ import sys
 def reptile(browser=None, search_word=""):
     print(f"搜索词:{search_word}")
     url = "https://www.facebook.com/"
-    browser = browser or create(no_headless=True, using_user_data=True)
+    browser = browser or create(no_headless=False, using_user_data=True)
     # 打开网页
     browser.get(url)
     try:
@@ -35,15 +41,19 @@ def reptile(browser=None, search_word=""):
         # 获取登录按钮
         button_login = browser.find_element('xpath', "//button[@name='login']")
         button_login.click()
-        time.sleep(6)
+        wait = WebDriverWait(browser, 10)
+        wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='main']")))
     except:
         print("已登录")
+    log.debug("facebook login complete")
     url = f"https://www.facebook.com/search/top?q={search_word}"
     browser.get(url)
     # 使用 JavaScript 将网页滚动到底部
     browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-    time.sleep(3)
+    # 等待内容出现,设置最长等待时间为10秒
+    wait = WebDriverWait(browser, 10)
+    # 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
+    wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='feed']")))
     # 内容
     element_content_list = browser.find_elements('xpath',
                                                  "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
pc_instagram.py

@@ -56,6 +56,7 @@ def reptile(browser=None, search_word=""):
     except:
         print("------")
         # print("1111")
+    log.debug("instagram login complete")
     url = f"{base_url}explore/tags/{search_word}/"
     browser.get(url)
     wait = WebDriverWait(browser, 10)
@@ -98,6 +99,7 @@ def reptile(browser=None, search_word=""):
         if len(title_str_list) >= 3:
             title = title_str_list[1]
         else:
+            # 提取图片中的文字
             title = ""
         img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
         del img_soup["srcset"]
pc_ptt.py

@@ -33,7 +33,7 @@ def reptile(browser=None, search_word=""):
     # browser = browser or create()
     # 打开网页
     browser.get(url)
-    log.debug("已打开浏览器")
+    # log.debug("已打开浏览器")
     classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
     # log.debug(classify_item_list)
     length = len(classify_item_list)
pc_twitter.py

@@ -14,7 +14,12 @@ from utils.download_image import download_image
 from config.settings import get_base_file_url
 from config.settings import get_account
-# 工具函数-下载图片
+# --------------- selenium 依赖 start ----------------
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+# --------------- selenium 依赖 end ----------------
 '''
 打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
 '''
@@ -28,32 +33,35 @@ def reptile(browser=None, search_word=""):
     """
     print(f"搜索词:{search_word}")
     base_url = "https://twitter.com/"
-    browser = browser or create(no_headless=True, using_user_data=True)
+    browser = browser or create(no_headless=False, using_user_data=True)
     # print(browser)
     # 打开网页
     browser.get(base_url)
-    time.sleep(3)
+    time.sleep(2)
     try:
+        # wait = WebDriverWait(browser, 20)
+        # wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
         # 检测是否要登录
        login_input = browser.find_element('xpath', "//input[@autocomplete='username']")
         login_input.send_keys(get_account("twitter")["name"])
         # 获取下一步按钮
         buttons = browser.find_element('xpath', "//div[@role='button'][2]")
         buttons.click()
-        time.sleep(3)
+        wait = WebDriverWait(browser, 10)
+        wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='current-password']")))
         password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
         password_input.send_keys(get_account("twitter")["password"])
         # # 获取登录按钮
         button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
         button_login.click()
-        time.sleep(1)
     except:
         print("------")
-        time.sleep(2)
-        # print("1111")
     url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
     browser.get(url)
-    time.sleep(4)
+    wait = WebDriverWait(browser, 10)
+    wait.until(EC.presence_of_element_located((By.XPATH, "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")))
     base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
     # 内容块
     element_content_list = browser.find_elements('xpath', base_xpath)
pc_youtube.py

@@ -12,8 +12,14 @@ import os
 from config.settings import get_base_file_url
 from selenium.webdriver.common.action_chains import ActionChains
 import sys
+# --------------- selenium 依赖 start ----------------
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+# --------------- selenium 依赖 end ----------------
 def reptile(browser=None, search_word=""):
     """
@@ -21,12 +27,14 @@ def reptile(browser=None, search_word=""):
     :param search_word:
     :return:
     """
-    browser = browser or create(no_headless=True, using_user_data=True)
+    browser = browser or create(no_headless=False, using_user_data=True)
     # print(browser)
     # 打开网页
     url = f'https://www.youtube.com/results?search_query={search_word}'
     browser.get(url)
-    # time.sleep(2)
+    wait = WebDriverWait(browser, 10)
+    wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='contents']")))
+    log.debug("youtube login complete")
     classify_video_list = browser.find_elements('xpath',
                                                 "//div[@id='contents']//ytd-video-renderer//div[@id='title-wrapper']//a")
     element_author_list = browser.find_elements('xpath',
@@ -54,6 +62,7 @@ def reptile(browser=None, search_word=""):
         # 下载视频
         state_download = yt_dlp_download(url, 'youtube')
         video_url.append(download_dir)
         if state_download:
             # 组装数据
             obj = {
@@ -68,8 +77,8 @@ def reptile(browser=None, search_word=""):
             }
             data.append(obj)
         else:
-            print("")
+            # print("")
+    error = ""
     if len(data) > 0:
         # 保存json文件到本地
         # log.debug(os.path.abspath("../"))
test.py

-# from utils.index import yt_dlp_download
-# status = yt_dlp_download("https://www.facebook.com/e5627ead-8b9a-48fd-820f-ee242cc08bbb", "facebook")
-# print(status)
-# import time
-# from selenium.webdriver import Firefox
-# from selenium import webdriver
-# driver = webdriver.Firefox()
-# driver.get("https://www.toutiao.com/a6969138023774667264/")
-# time.sleep(2)
-# html = driver.page_source
-# print(html)
-# driver.quit()
-# var = {'title': 'Photo by 今周刊 on July 17, 2023. May be an illustration of poster and text.',
-# 'content': '<h1 class="_aacl _aaco _aacu _aacx _aad7 _aade" dir="auto">…',
-# 'link': 'https://www.instagram.com/p/Cu0cmuSssPZ/', 'reptileTime': '1690259090', 'type': '图文', 'author': '',
-# 'releaseTime': '1689613204', 'picture_url': 'http://192.168.0.118:8186/instagram/1690259027.jpg'}
-str = "https://www.instagram.com/p/Cs0YvVcJFF8/"
-list = str.split("/")
-print(list[len(list) - 2])
\ No newline at end of file
+import os
+import pytesseract
+from PIL import Image
+# 指定 Tesseract OCR 的执行路径(可选,如果已经配置环境变量,则无需此步骤)
+cmd_path = "/usr/local/Cellar/tesseract/5.3.2/share/tessdata"
+img_path = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'reptile-data', "instagram", "Cr8vg2MyNFz.jpg")
+pytesseract.pytesseract.tesseract_cmd = cmd_path
+# 打开图片
+image = Image.open(img_path)
+# 进行图片文字识别
+text = pytesseract.image_to_string(image, lang='chi_sim')
+# 输出识别的文字
+print(text)
\ No newline at end of file
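For reference, a minimal standalone version of the same OCR flow (the paths below are placeholders, not the repo's). Note that pytesseract.pytesseract.tesseract_cmd normally points at the tesseract executable, while the tessdata directory is usually supplied via the TESSDATA_PREFIX environment variable:

    import pytesseract
    from PIL import Image

    # Placeholder paths -- adjust to the local tesseract install and image location.
    pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"
    image = Image.open("sample.jpg")
    # lang='chi_sim' requires the chi_sim traineddata to be installed.
    text = pytesseract.image_to_string(image, lang="chi_sim")
    print(text)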
utils/createBrowserDriver.py

@@ -87,7 +87,7 @@ def create(option=None, no_headless=False, using_user_data=True, web_browser="fi
     # options.add_argument("--window-size=1920x1080")  # 设置窗口大小,这是一个常见的完全无头模式的设置
     # options.add_argument("--start-maximized")  # 最大化窗口
     if no_headless == True:
-        if platform.system() == "Linux" and platform.system() == "Darwin":
+        if platform.system() == "Linux" or platform.system() == "Darwin":
             # 开启无头模式
             options.add_argument("-headless")
         elif platform.system() == "Windows" and web_browser == "firefox":
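The one-character change above matters because platform.system() returns a single string, so the old "and" condition could never be true and the headless flag was never added on Linux or macOS. A quick standalone illustration:

    import platform

    system = platform.system()  # exactly one of "Linux", "Darwin", "Windows", ...
    print(system == "Linux" and system == "Darwin")  # always False (old condition)
    print(system == "Linux" or system == "Darwin")   # True on Linux or macOS (new condition)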
utils/index.py

@@ -166,19 +166,15 @@ def pytube_download(link, file_dir):
 def yt_dlp_download(url, name):
     file_dir = os.path.abspath("../")
-    options = f'-v'
     network_options = f'-o "{os.path.join(file_dir, "network-assets-reptile", "reptile_data", name, "%(id)s.%(ext)s")}"'
-    geo = ""
-    # --get-url
-    video_selection = f''
     # 清晰度
-    definition = f'18'  # 360p
+    # definition = f'18' # 360p
     # definition = f'18' # 720p
     # definition = f'24' # 1080p
-    download_options = f'-f {definition} -vU'  # f'-f 18 -vU'
-    other_options = f'--verbose '
+    download_options = f'-f mp4 '
     # 要执行的 shell 命令
-    command = f'yt-dlp {options} {network_options} {geo} {video_selection} {download_options} {other_options} -- {url}'
+    command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
     # 使用 subprocess 调用 shell 命令
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
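With the simplified assembly, the shell command reduces to a plain yt-dlp call that requests an mp4 format (-f mp4), writes to the reptile_data/<name>/ output template (-o), and logs verbosely. A standalone sketch of what command expands to (the URL and base directory are placeholders):

    import os

    url = "https://www.youtube.com/watch?v=XXXXXXXXXXX"   # placeholder video URL
    name = "youtube"
    file_dir = os.path.abspath("../")

    download_options = f'-f mp4 '
    network_options = f'-o "{os.path.join(file_dir, "network-assets-reptile", "reptile_data", name, "%(id)s.%(ext)s")}"'
    command = f'yt-dlp -v {download_options} {network_options} --verbose -- {url}'
    print(command)  # inspect the final command before handing it to subprocess.run(..., shell=True)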