Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
e6a8964a
Commit
e6a8964a
authored
Jul 25, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:处理ins爬虫数据
parent
1526bcd6
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
102 additions
and
150 deletions
+102
-150
pc_dcard.py
pc_dcard.py
+5
-2
pc_facebook.py
pc_facebook.py
+6
-1
pc_instagram.py
pc_instagram.py
+58
-39
pc_nytimes.py
pc_nytimes.py
+0
-94
pc_ptt.py
pc_ptt.py
+4
-1
pc_twitter.py
pc_twitter.py
+6
-0
pc_video.py
pc_video.py
+0
-0
pc_youtube.py
pc_youtube.py
+4
-1
test.py
test.py
+14
-9
createBrowserDriver.py
utils/createBrowserDriver.py
+5
-3
No files found.
pc_dcard.py
View file @
e6a8964a
...
...
@@ -55,7 +55,6 @@ def reptile(browser=None, search_word=""):
# 评论
element_comment_list
=
browser
.
find_elements
(
'xpath'
,
f
"{base_xpath}/div[3]/div[2]/div/span"
)
length
=
len
(
element_content_list
)
for
index
in
range
(
length
):
# 提取时间,并转为时间戳
...
...
@@ -77,7 +76,7 @@ def reptile(browser=None, search_word=""):
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@data-testid='overlay']"
)))
time
.
sleep
(
3
)
click_dom
=
browser
.
find_element
(
"xpath"
,
"//div[@data-testid='overlay']"
)
"//div[@data-testid='overlay']"
)
# 处理弹窗内容加载失败的情况
try
:
browser
.
find_element
(
"xpath"
,
"//div[@data-testid='overlay']//h2[text()='發生錯誤']"
)
...
...
@@ -175,10 +174,14 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
'save file success'
)
else
:
log
.
debug
(
'save file failed'
)
script_close
(
browser
)
else
:
# 爬取数据为空
log
.
info
(
"未爬取到数据"
)
script_close
(
browser
)
def
script_close
(
browser
):
# 关闭浏览器驱动
try
:
browser
.
close
()
...
...
pc_facebook.py
View file @
e6a8964a
...
...
@@ -13,6 +13,7 @@ import os
from
config.settings
import
get_base_file_url
from
config.settings
import
get_account
import
sys
# 工具函数-下载图片
'''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
...
...
@@ -160,19 +161,23 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
'save file success'
)
else
:
log
.
debug
(
'save file failed'
)
script_close
(
browser
)
else
:
# 爬取数据为空
log
.
info
(
"未爬取到数据"
)
script_close
(
browser
)
def
script_close
(
browser
):
# 关闭浏览器驱动
try
:
browser
.
close
()
browser
.
quit
()
except
:
log
.
debug
(
"浏览器驱动关闭失败"
)
sys
.
exit
()
def
main
():
"""
...
...
pc_instagram.py
View file @
e6a8964a
...
...
@@ -40,7 +40,10 @@ def reptile(browser=None, search_word=""):
# print(browser)
# 打开网页
browser
.
get
(
base_url
)
time
.
sleep
(
3
)
# 等待加载完成
time
.
sleep
(
2
)
# wait = WebDriverWait(browser, 10)
# wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='username']")))
try
:
# 检测是否要登录
login_input
=
browser
.
find_element
(
'xpath'
,
"//input[@name='username']"
)
...
...
@@ -50,55 +53,62 @@ def reptile(browser=None, search_word=""):
# 获取登录按钮
button_login
=
browser
.
find_element
(
'xpath'
,
"//button[@type='submit']"
)
button_login
.
click
()
time
.
sleep
(
3
)
time
.
sleep
(
2
)
except
:
print
(
"------"
)
# print("1111")
url
=
f
"{base_url}explore/tags/{search_word}/"
browser
.
get
(
url
)
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//article//a"
)))
# 链接
element_link_list
=
browser
.
find_elements
(
'xpath'
,
"//article//a"
)
element_link_list
=
browser
.
find_elements
(
'xpath'
,
"//article//a"
)
length
=
len
(
element_link_list
)
for
index
in
range
(
length
):
element_link_list
[
index
]
.
click
()
# element_link_list[index].click()
browser
.
execute_script
(
"arguments[0].click();"
,
element_link_list
[
index
])
# 等待弹窗加载完成
wait
=
WebDriverWait
(
browser
,
10
)
wait
=
WebDriverWait
(
browser
,
10
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//div[@role='dialog']/div/div[2]"
)))
# 提取其他
author
=
browser
.
find_element
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a"
)
author
=
browser
.
find_element
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a"
)
content_element
=
browser
.
find_element
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1"
)
time_element
=
browser
.
find_element
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[2]//time"
)
content_element
=
browser
.
find_element
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1"
)
time_element
=
browser
.
find_element
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[2]//time"
)
link_str
=
browser
.
current_url
# 提取时间,并转为时间戳
timestamp
=
datetime
.
fromisoformat
(
time_element
.
get_attribute
(
"datetime"
)[:
-
1
])
.
timestamp
()
#提取图片、视频
picture_url
=
[]
img_list
=
browser
.
find_elements
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[1]/div/div[1]//img"
)
for
key
,
item
in
enumerate
(
img_list
):
if
key
==
0
:
title
=
item
.
get_attribute
(
"alt"
)
# 下载图片至本地,替换标签中的src
id
=
str
(
int
(
time
.
time
()))
# 下载地址
download_dir
=
f
'{os.path.join(file_dir, f"{id}.jpg")}'
# 访问地址
access_address
=
f
'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
# 下载状态
status
=
download_image
(
item
[
'src'
],
download_dir
)
if
status
:
item
[
'src'
]
=
access_address
picture_url
.
append
(
access_address
)
#提取弹窗内容
# 提取弹窗内容
soup
=
BeautifulSoup
(
content_element
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
# 将图片整合到内容中
# 提取图片、视频
picture_url
=
[]
img_list
=
browser
.
find_elements
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[1]/div/div[1]//img"
)
# 过滤视频
video_list
=
browser
.
find_elements
(
"xpath"
,
"//div[@role='dialog']/div//article/div/div[1]/div/div[1]//video"
)
for
key
,
item
in
enumerate
(
img_list
):
img
=
BeautifulSoup
(
item
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
soup
.
append
(
img
)
if
len
(
video_list
)
==
0
:
if
key
==
0
:
title
=
item
.
get_attribute
(
"alt"
)
# 下载图片至本地,替换标签中的src
id
=
str
(
int
(
time
.
time
()))
# 下载地址
download_dir
=
f
'{os.path.join(file_dir, f"{id}.jpg")}'
# 访问地址
access_address
=
f
'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
# 下载状态
status
=
download_image
(
item
.
get_attribute
(
"src"
),
download_dir
)
if
status
:
# 将图片追加到内容中
img_soup
=
BeautifulSoup
(
item
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
img_soup
.
img
[
"src"
]
=
access_address
# print(img_soup.prettify())
soup
.
append
(
img_soup
)
picture_url
.
append
(
access_address
)
content
=
soup
.
prettify
()
# 类型
content_type
=
"图文"
...
...
@@ -116,15 +126,12 @@ def reptile(browser=None, search_word=""):
# --------------- 组装数据 end---------------------
data
.
append
(
obj
)
# 获取下一页按钮
next_buttons
=
browser
.
find_elements
(
"xpath"
,
"//div[@role='dialog']/div/div[1]//button"
)
if
index
<
length
-
1
:
for
key
,
item
in
enumerate
(
next_buttons
):
if
key
+
1
==
len
(
next_buttons
):
next_buttons
=
browser
.
find_elements
(
"xpath"
,
"//div[@role='dialog']/div/div[1]//button"
)
if
index
<
length
-
1
:
for
key
,
item
in
enumerate
(
next_buttons
):
if
key
+
1
==
len
(
next_buttons
):
item
.
click
()
# 发送爬取数据到java服务
# print('----------------------')
# print(data)
if
len
(
data
)
>
0
:
# 保存json文件到本地
# log.debug(os.path.abspath("../"))
...
...
@@ -133,17 +140,29 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
'save file success'
)
else
:
log
.
debug
(
'save file failed'
)
script_close
(
browser
)
else
:
# 爬取数据为空
log
.
info
(
"未爬取到数据"
)
script_close
(
browser
)
def
script_close
(
browser
):
# 关闭浏览器驱动
try
:
browser
.
close
()
browser
.
quit
()
except
:
log
.
debug
(
"浏览器驱动关闭失败"
)
sys
.
exit
()
try
:
# 一些代码...
sys
.
exit
()
except
SystemExit
:
raise
# 重新抛出SystemExit异常,让脚本退出
except
Exception
as
e
:
# 异常处理代码...
print
(
"sys.exit() 执行失败"
)
def
main
():
"""
...
...
pc_nytimes.py
deleted
100644 → 0
View file @
1526bcd6
# # 导入依赖库
import
json
import
time
from
datetime
import
datetime
,
timedelta
from
bs4
import
BeautifulSoup
from
loguru
import
logger
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support
import
expected_conditions
as
EC
from
selenium.webdriver.support.ui
import
WebDriverWait
# 工具函数-下载图片
from
utils.download_image
import
download_image
'''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
'''
# # json 数据
data
=
[];
image_key
=
0
fileDir
=
"./reptile_data/news/nytimes/"
year
=
datetime
(
2021
,
1
,
1
)
startDate
=
datetime
(
2020
,
12
,
31
)
# 初始日期
endDate
=
datetime
(
2020
,
12
,
31
)
# 结束日期
# 创建浏览器驱动对象
browser
=
webdriver
.
Chrome
()
for
i
in
range
(
1
):
endDate
=
startDate
=
startDate
+
timedelta
(
days
=
i
)
# 打开网页
browser
.
get
(
f
'https://www.nytimes.com/search?dropmab=false&endDate={endDate.strftime("
%
Y
%
m
%
d")}&query={year.strftime("
%
Y")}&sort=best&startDate={startDate.strftime("
%
Y
%
m
%
d")}&types=interactivegraphics
%2
Carticle'
)
try
:
accept
=
WebDriverWait
(
browser
,
10
)
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//button[@data-testid='GDPR-accept']"
)))
accept
.
click
()
finally
:
logger
.
debug
(
""
)
# 等待加载更多按钮出现
button
=
WebDriverWait
(
browser
,
10
)
.
until
(
EC
.
element_to_be_clickable
((
By
.
XPATH
,
"//button[@data-testid='search-show-more-button']"
)))
# logger.debug(button)
# 模拟点击按钮多次加载更多数据
while
button
.
is_enabled
():
time
.
sleep
(
2
)
# 等待一段时间,确保页面加载完毕
try
:
button
.
click
()
button
=
WebDriverWait
(
browser
,
5
)
.
until
(
EC
.
element_to_be_clickable
((
By
.
XPATH
,
"//button[@data-testid='search-show-more-button']"
)))
except
:
break
# 获取完整的分页数据
page_content
=
browser
.
page_source
soup
=
BeautifulSoup
(
page_content
,
'html.parser'
)
list_news
=
soup
.
find_all
(
'li'
,
{
"class"
:
"css-1l4w6pd"
})
for
index
,
item
in
enumerate
(
list_news
):
logger
.
debug
(
item
)
# 抓取图片
image_key
=
image_key
+
1
url_element
=
item
.
find
(
'img'
,
{
"class"
:
"css-rq4mmj"
})
image_url
=
url_element
[
'src'
]
if
url_element
else
""
# logger.debug(url)
if
image_url
:
# logger.debug(url)
# # 下载图片
#
filename
=
f
"{image_key}.jpg"
# logger.debug(filename)
# sys.exit()
download_image
(
image_url
,
f
'{fileDir}images/{filename}'
)
# 抓取文字
title_element
=
item
.
find
(
'h4'
,
{
"class"
:
"css-2fgx4k"
})
introduction_element
=
item
.
find
(
'p'
,
{
"class"
:
"css-16nhkrn"
})
title
=
title_element
.
get_text
()
if
title_element
else
""
introduction
=
introduction_element
.
get_text
()
if
introduction_element
else
""
news
=
{
"title"
:
title
,
"introduction"
:
introduction
,
"imageName"
:
filename
}
data
.
append
(
news
)
# logger.debug(data)
# 将数据保存到文件中
with
open
(
f
'{fileDir}data.json'
,
"w"
,
encoding
=
"utf-8"
)
as
file
:
json
.
dump
(
data
,
file
,
indent
=
2
,
ensure_ascii
=
False
)
browser
.
close
()
# 关闭浏览器驱动
browser
.
quit
()
pc_ptt.py
View file @
e6a8964a
...
...
@@ -200,10 +200,14 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
'save file success'
)
else
:
log
.
debug
(
'save file failed'
)
script_close
(
browser
)
else
:
# 爬取数据为空
log
.
info
(
"未爬取到数据"
)
script_close
(
browser
)
def
script_close
(
browser
):
# 关闭浏览器驱动
try
:
browser
.
close
()
...
...
@@ -212,7 +216,6 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
"浏览器驱动关闭失败"
)
sys
.
exit
()
def
main
():
"""
...
...
pc_twitter.py
View file @
e6a8964a
...
...
@@ -13,6 +13,7 @@ from datetime import datetime
from
utils.download_image
import
download_image
from
config.settings
import
get_base_file_url
from
config.settings
import
get_account
# 工具函数-下载图片
'''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
...
...
@@ -159,10 +160,14 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
'save file success'
)
else
:
log
.
debug
(
'save file failed'
)
script_close
(
browser
)
else
:
# 爬取数据为空
log
.
info
(
"未爬取到数据"
)
script_close
(
browser
)
def
script_close
(
browser
):
# 关闭浏览器驱动
try
:
browser
.
close
()
...
...
@@ -171,6 +176,7 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
"浏览器驱动关闭失败"
)
sys
.
exit
()
def
main
():
"""
...
...
pc_video.py
deleted
100644 → 0
View file @
1526bcd6
pc_youtube.py
View file @
e6a8964a
...
...
@@ -75,15 +75,18 @@ def reptile(browser=None, search_word=""):
log
.
debug
(
'save file success'
)
else
:
log
.
debug
(
'save file failed'
)
script_close
(
browser
)
else
:
# 爬取数据为空
log
.
info
(
"未爬取到数据"
)
script_close
(
browser
)
def
script_close
(
browser
):
# 关闭浏览器驱动
try
:
browser
.
close
()
browser
.
quit
()
except
:
log
.
debug
(
"浏览器驱动关闭失败"
)
sys
.
exit
()
...
...
test.py
View file @
e6a8964a
...
...
@@ -5,12 +5,17 @@
import
time
from
selenium.webdriver
import
Firefox
from
selenium
import
webdriver
driver
=
webdriver
.
Firefox
()
driver
.
get
(
"https://www.toutiao.com/a6969138023774667264/"
)
time
.
sleep
(
2
)
html
=
driver
.
page_source
print
(
html
)
driver
.
quit
()
\ No newline at end of file
# import time
# from selenium.webdriver import Firefox
# from selenium import webdriver
# driver = webdriver.Firefox()
# driver.get("https://www.toutiao.com/a6969138023774667264/")
# time.sleep(2)
# html = driver.page_source
# print(html)
# driver.quit()
var
=
{
'title'
:
'Photo by 今周刊 on July 17, 2023. May be an illustration of poster and text.'
,
'content'
:
'<h1 class="_aacl _aaco _aacu _aacx _aad7 _aade" dir="auto">
\n
你有沒有想過,為什麼你的時間總是不夠用?為什麼你買稍微貴點的東西,都要不自覺地把它折算成自己多少天的收入?為什麼你以前看著數字就頭暈腦脹,現在願意對著數字精打細算,還時刻惦記著要怎麼花錢才能熬得到月尾?為什麼你正值青春的花樣年華,卻窮得只剩下理想,忙得沒時間生活?
\n
<br/>
\n
<br/>
\n
實際上,絕大多數人的疲於奔命,不是因為忙,而是心態出了問題,是眼下的生活不能如人所願,是對當前的生活不知所措。
\n
<br/>
\n
<br/>
\n
但是,如果你不能以一種主動的、有規劃的方式去對待生活和工作,那麼你即使什麼都不做,依然會覺得疲憊。
\n
<br/>
\n
<br/>
\n
比如你忙著回覆一封又一封無關緊要的郵件,忙著參加一個又一個無聊的會議,忙著從一個聚會趕到另一個聚會,忙著在節假日跟社交軟體裡每一個熟悉的和不熟悉的人說沒完沒了的、便宜的祝福語……比如你每天兩點一線,在家和公司之間步履匆匆。
\n
<br/>
\n
<br/>
\n
一大早忙著擠上即將關門開走的公車,好不容易來到公司忙著準備資料、製作檔案、接待客戶。終於熬到下班,行屍走肉樣的狀態卻不忘看看社群動態,在手機裡看著大家都在為生計而奔忙。
\n
<br/>
\n
<br/>
\n
可如果誰要是問你,「怎麼你老是這麼忙?都做了些什麼?」你就算皺緊了眉頭,想破了腦袋也只能給出一個這樣的回答:「呃,我也記不住都忙什麼了,反正就是很忙!」
\n
<br/>
\n
\u200b\n
<br/>
\n
你呀,像極了一隻在泳池裡瞎撲騰的旱鴨子,一直抓住一個叫作「工作忙」的游泳圈不肯放手。
\n
<br/>
\n
<br/>
\n
於是,「我好忙」變成了你的海洛因,變成了讓你麻木的精神撫慰品。它讓你忘記為了什麼而出發,忘記了你的最終目的是什麼,就像把你綁在了旋轉的音樂盒上,看起來美妙,聽著也舒服,卻是周而復始的、無意義的瞎打轉。
\n
<br/>
\n
<br/>
\n
嗯,那你就接著懶吧,以後很失敗的時候,還有可以安慰一下自己的理由——萬一努力了還不成功,那不就尷尬了?
\n
<br/>
\n
<br/>
\n
這世上真的沒有什麼搖身一變,更沒有什麼能拯救你的人,有的只是你看不到的低調努力。怕就怕,你只有低調,沒有努力。
\n
<br/>
\n
\u200b\n
<br/>
\n
📚本篇僅為部分節錄,摘自《裝睡的人叫不醒,再不清醒窮死你》
\n
<br/>
\n
\u200b\n
<br/>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/
%
E6
%9
B
%
B8
%
E6
%91%98
/" role="link" tabindex="0">
\n
#書摘
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/
%
E9
%96%
B1
%
E8
%
AE
%80
/" role="link" tabindex="0">
\n
#閱讀
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/reading/" role="link" tabindex="0">
\n
#reading
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/
%
E4
%
BB
%8
A
%
E5
%91%
A8
%
E5
%88%8
A/" role="link" tabindex="0">
\n
#今周刊
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/
%
E6
%96%
B0
%
E8
%81%9
E/" role="link" tabindex="0">
\n
#新聞
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/news/" role="link" tabindex="0">
\n
#news
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/
%
E5
%
AA
%92%
E9
%
AB
%94
/" role="link" tabindex="0">
\n
#媒體
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/
%
E7
%90%86%
E8
%
B2
%
A1/" role="link" tabindex="0">
\n
#理財
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/quotes/" role="link" tabindex="0">
\n
#quotes
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/allianzgitw/" role="link" tabindex="0">
\n
#allianzgitw
\n
</a>
\n
<a class="x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _aa9_ _a6hd" href="/explore/tags/
%
E4
%
B8
%
BB
%
E5
%8
B
%95%
E8
%
AE
%93%
E6
%8
A
%95%
E8
%
B3
%87%
E7
%99%
BC
%
E6
%8
F
%
AE
%
E5
%
BD
%
B1
%
E9
%9
F
%
BF
%
E5
%8
A
%9
B/" role="link" tabindex="0">
\n
#主動讓投資發揮影響力
\n
</a>
\n
</h1>
\n
<img alt="Photo by 今周刊 on July 17, 2023. May be an illustration of poster and text." class="x5yr21d xu96u03 x10l6tqk x13vifvy x87ps6o xh8yej3" crossorigin="anonymous" decoding="auto" sizes="653.5999755859375px" src="https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e15_fr_p1080x1080&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfBdfyF7qUQQaVxi_e9z5aI4P6e6Hy9JIVtTl2YV9gXoJw&oe=64C4688F&_nc_sid=2011ad" srcset="https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e15_fr_p1080x1080&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfBdfyF7qUQQaVxi_e9z5aI4P6e6Hy9JIVtTl2YV9gXoJw&oe=64C4688F&_nc_sid=2011ad 1080w,https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e35_p750x750_sh0.08&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfDH1NfR4Ik2R9DwcoPB5XectpJqfeUOtbvrxtmRHDxOVg&oe=64C4688F&_nc_sid=2011ad 750w,https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e35_p640x640_sh0.08&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfCc0UgUfQGa7N4QJeWgEMdqmiIDKwuO10SH_A5R1-q9EQ&oe=64C4688F&_nc_sid=2011ad 640w,https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e15_p480x480&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfAlY_Yxs-GAahDKxf4ijFWqjTERRFrRitvuXySepJR7hw&oe=64C4688F&_nc_sid=2011ad 480w,https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e15_p320x320&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfBuQrHYUVMkHj6NKLuNmNvSlyupwQFTY6e7lsdxwXmy5Q&oe=64C4688F&_nc_sid=2011ad 320w,https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e15_p240x240&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfD9YWNkm7j6GSh3CufCedWL0LlxHSEdLskO6PVsE4DA8Q&oe=64C4688F&_nc_sid=2011ad 240w,https://scontent-hkg4-2.cdninstagram.com/v/t39.30808-6/361612960_677983984363846_6639522657508930508_n.jpg?stp=dst-jpg_e15_p150x150&_nc_ht=scontent-hkg4-2.cdninstagram.com&_nc_cat=110&_nc_ohc=NFrLOL0j0HoAX-dbhrX&edm=AGyKU4gAAAAA&ccb=7-5&ig_cache_key=MzE0OTI2Nzg0NTc4NzA3NTU0NQ
%3
D
%3
D.2-ccb7-5&oh=00_AfAzad7jssscLrABgHQKNVi0CGOm5H1DZWIpqwwMMx5Kjw&oe=64C4688F&_nc_sid=2011ad 150w" style="object-fit: cover;"/>
\n
'
,
'link'
:
'https://www.instagram.com/p/Cu0cmuSssPZ/'
,
'reptileTime'
:
'1690259090'
,
'type'
:
'图文'
,
'author'
:
''
,
'releaseTime'
:
'1689613204'
,
'picture_url'
:
'http://192.168.0.118:8186/instagram/1690259027.jpg'
}
\ No newline at end of file
utils/createBrowserDriver.py
View file @
e6a8964a
...
...
@@ -28,7 +28,7 @@ from utils.index import get_screen_resolution
'''
def
create
(
option
=
None
,
using_user_data
=
True
,
web_browser
=
"
chromium
"
):
def
create
(
option
=
None
,
using_user_data
=
True
,
web_browser
=
"
firefox
"
):
"""
:param web_browser:
...
...
@@ -60,8 +60,10 @@ def create(option=None, using_user_data=True, web_browser="chromium"):
# 添加用户数据目录参数,启用浏览器的持久性会话,可以保存登录状态和Cookie
if
web_browser
==
"firefox"
:
# 将此处替换为你的Firefox用户数据目录路径
profile
=
FirefoxProfile
(
profile_directory
=
user_data_dir
)
options
.
profile
=
profile
# profile = FirefoxProfile(profile_directory=user_data_dir)
# options.profile = profile
options
.
add_argument
(
"-profile"
)
options
.
add_argument
(
user_data_dir
)
# options.add_argument(f'--user-data-dir={user_data_dir}')
elif
web_browser
==
"chrome"
:
options
.
add_argument
(
f
'--user-data-dir={user_data_dir}'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment