liyang / network-assets-reptile · Commits

Commit e6a8964a
authored Jul 25, 2023 by liyang
feat: process Instagram (ins) crawler data

parent 1526bcd6
Showing 10 changed files with 102 additions and 150 deletions (+102 -150)
pc_dcard.py                     +5   -2
pc_facebook.py                  +6   -1
pc_instagram.py                 +58  -39
pc_nytimes.py                   +0   -94
pc_ptt.py                       +4   -1
pc_twitter.py                   +6   -0
pc_video.py                     +0   -0
pc_youtube.py                   +4   -1
test.py                         +14  -9
utils/createBrowserDriver.py    +5   -3
pc_dcard.py

@@ -55,7 +55,6 @@ def reptile(browser=None, search_word=""):
        # comments
        element_comment_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[2]/div/span")
        length = len(element_content_list)
        for index in range(length):
            # extract the time and convert it to a timestamp
@@ -77,7 +76,7 @@ def reptile(browser=None, search_word=""):
        wait.until(EC.presence_of_element_located((By.XPATH, "//div[@data-testid='overlay']")))
        time.sleep(3)
        click_dom = browser.find_element("xpath", "//div[@data-testid='overlay']")
        # handle the case where the dialog content fails to load
        try:
            browser.find_element("xpath", "//div[@data-testid='overlay']//h2[text()='發生錯誤']")  # "an error occurred"
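A note on the probe above: Selenium's find_element raises NoSuchElementException when nothing matches, so wrapping the lookup in try/except is how the script tests whether the "發生錯誤" ("an error occurred") heading is present. A minimal self-contained sketch of the same pattern, reusing the overlay XPath from this hunk:

    from selenium.common.exceptions import NoSuchElementException

    def overlay_has_error(browser):
        # True when the error heading exists inside the overlay dialog
        try:
            browser.find_element("xpath", "//div[@data-testid='overlay']//h2[text()='發生錯誤']")
            return True
        except NoSuchElementException:
            return False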
@@ -175,10 +174,14 @@ def reptile(browser=None, search_word=""):
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # nothing was crawled
        log.info("未爬取到数据")  # "no data was crawled"
        script_close(browser)


def script_close(browser):
    # close the browser driver
    try:
        browser.close()
pc_facebook.py

@@ -13,6 +13,7 @@ import os
from config.settings import get_base_file_url
from config.settings import get_account
import sys

# utility function - download images
'''
Open the target page, use Selenium to click the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until it is no longer clickable. Finally, grab the fully paginated data and close the browser driver.
'''
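The docstring describes a click-until-exhausted pagination loop. A minimal sketch of that pattern, assuming Selenium 4 and the button test ids named in the docstring (the deleted pc_nytimes.py below used the same approach):

    import time
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    browser = webdriver.Firefox()
    browser.get("https://example.com/search")  # placeholder URL

    # accept the GDPR banner once, if it shows up
    try:
        WebDriverWait(browser, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='GDPR-accept']"))).click()
    except Exception:
        pass

    # keep clicking "show more" until the button disappears or stops being clickable
    while True:
        try:
            button = WebDriverWait(browser, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
            button.click()
            time.sleep(2)  # give the next chunk time to render
        except Exception:
            break

    page_content = browser.page_source  # the fully paginated HTML
    browser.quit()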
@@ -160,19 +161,23 @@ def reptile(browser=None, search_word=""):
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # nothing was crawled
        log.info("未爬取到数据")  # "no data was crawled"
        script_close(browser)


def script_close(browser):
    # close the browser driver
    try:
        browser.close()
        browser.quit()
    except:
        log.debug("浏览器驱动关闭失败")  # "failed to close the browser driver"
    sys.exit()


def main():
    """
pc_instagram.py

@@ -40,7 +40,10 @@ def reptile(browser=None, search_word=""):
    # print(browser)
    # open the page
    browser.get(base_url)
    time.sleep(3)
    # wait for the page to finish loading
    time.sleep(2)
    # wait = WebDriverWait(browser, 10)
    # wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='username']")))
    try:
        # check whether a login is required
        login_input = browser.find_element('xpath', "//input[@name='username']")
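The commented-out WebDriverWait lines hint at a sturdier alternative to the fixed sleeps. A sketch of the explicit-wait version of this login check, under the assumption that the same username-field XPath applies:

    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    browser.get(base_url)
    try:
        # wait up to 10 s for the login form instead of sleeping a fixed 3 s
        login_input = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@name='username']")))
    except TimeoutException:
        login_input = None  # no username field rendered: already logged in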
@@ -50,55 +53,62 @@ def reptile(browser=None, search_word=""):
        # get the login button
        button_login = browser.find_element('xpath', "//button[@type='submit']")
        button_login.click()
        time.sleep(3)
        time.sleep(2)
    except:
        print("------")
    # print("1111")
    url = f"{base_url}explore/tags/{search_word}/"
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.XPATH, "//article//a")))
    # links
    element_link_list = browser.find_elements('xpath', "//article//a")
    length = len(element_link_list)
    for index in range(length):
        # element_link_list[index].click()
        browser.execute_script("arguments[0].click();", element_link_list[index])
        # wait for the dialog to finish loading
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']/div/div[2]")))
        # extract the remaining fields
        author = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a")
        content_element = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
        time_element = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[2]//time")
        link_str = browser.current_url
        # extract the time and convert it to a timestamp
        timestamp = datetime.fromisoformat(time_element.get_attribute("datetime")[:-1]).timestamp()
        # extract the dialog content
        soup = BeautifulSoup(content_element.get_attribute("outerHTML"), "html.parser")
        # extract images and videos
        picture_url = []
        img_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//img")
        # filter out video posts
        video_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//video")
        for key, item in enumerate(img_list):
            if len(video_list) == 0:
                if key == 0:
                    title = item.get_attribute("alt")
                # download the image locally and replace the src in the tag
                id = str(int(time.time()))
                # download path
                download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
                # public access URL
                access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
                # download status
                status = download_image(item.get_attribute("src"), download_dir)
                if status:
                    # append the image to the content
                    img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser")
                    img_soup.img["src"] = access_address
                    # print(img_soup.prettify())
                    soup.append(img_soup)
                    picture_url.append(access_address)
        content = soup.prettify()
        # content type
        content_type = "图文"  # "image and text"
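Two details in this hunk deserve a note. First, browser.execute_script("arguments[0].click();", ...) replaces the native .click() (left commented out) because Instagram's overlapping overlay elements can intercept a native click. Second, the [:-1] slice strips the trailing "Z" from the <time datetime="..."> value, since datetime.fromisoformat on Python versions before 3.11 rejects the "Z" suffix. A minimal sketch of that conversion, with a made-up sample value:

    from datetime import datetime

    iso_value = "2023-07-25T08:30:00.000Z"  # sample <time datetime> attribute
    # Python < 3.11: fromisoformat() cannot parse the trailing "Z", so drop it first.
    # Note: .timestamp() treats the naive result as local time; replacing the "Z"
    # with "+00:00" instead would keep the value in UTC.
    timestamp = datetime.fromisoformat(iso_value[:-1]).timestamp()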
@@ -116,15 +126,12 @@ def reptile(browser=None, search_word=""):
        # --------------- assemble data end ---------------------
        data.append(obj)
        # get the next-page button
        next_buttons = browser.find_elements("xpath", "//div[@role='dialog']/div/div[1]//button")
        if index < length - 1:
            for key, item in enumerate(next_buttons):
                if key + 1 == len(next_buttons):
                    item.click()
    # send the crawled data to the java service
    # print('----------------------')
    # print(data)
    if len(data) > 0:
        # save the json file locally
        # log.debug(os.path.abspath("../"))
@@ -133,17 +140,29 @@ def reptile(browser=None, search_word=""):
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # nothing was crawled
        log.info("未爬取到数据")  # "no data was crawled"
        script_close(browser)


def script_close(browser):
    # close the browser driver
    try:
        browser.close()
        browser.quit()
    except:
        log.debug("浏览器驱动关闭失败")  # "failed to close the browser driver"
    try:
        # some code ...
        sys.exit()
    except SystemExit:
        # re-raise SystemExit so the script actually exits
        raise
    except Exception as e:
        # exception handling code ...
        print("sys.exit() 执行失败")  # "sys.exit() failed"


def main():
    """
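The extra try/except around sys.exit() exists because sys.exit() works by raising SystemExit, which a bare "except:" catches just like any other exception; if it is swallowed, the script keeps running. A minimal demonstration of the trap and the fix:

    import sys

    # the trap: a bare except also catches SystemExit, so the process never exits
    try:
        sys.exit(1)
    except:
        print("exit swallowed, script keeps running")

    # the fix: re-raise SystemExit explicitly and handle everything else
    try:
        sys.exit(1)
    except SystemExit:
        raise  # let the interpreter shut down
    except Exception:
        print("only real errors are handled here")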
pc_nytimes.py
deleted 100644 → 0

# import dependencies
import json
import time
from datetime import datetime, timedelta

from bs4 import BeautifulSoup
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# utility function - download images
from utils.download_image import download_image

'''
Open the target page, use Selenium to click the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until it is no longer clickable. Finally, grab the fully paginated data and close the browser driver.
'''

# json data
data = []
image_key = 0
fileDir = "./reptile_data/news/nytimes/"
year = datetime(2021, 1, 1)
startDate = datetime(2020, 12, 31)  # start date
endDate = datetime(2020, 12, 31)  # end date
# create the browser driver
browser = webdriver.Chrome()
for i in range(1):
    endDate = startDate = startDate + timedelta(days=i)
    # open the page
    browser.get(f'https://www.nytimes.com/search?dropmab=false&endDate={endDate.strftime("%Y%m%d")}&query={year.strftime("%Y")}&sort=best&startDate={startDate.strftime("%Y%m%d")}&types=interactivegraphics%2Carticle')
    try:
        accept = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button[@data-testid='GDPR-accept']")))
        accept.click()
    finally:
        logger.debug("")
    # wait for the "load more" button to appear
    button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
    # logger.debug(button)
    # click the button repeatedly to load more data
    while button.is_enabled():
        time.sleep(2)  # wait a moment so the page can finish loading
        try:
            button.click()
            button = WebDriverWait(browser, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
        except:
            break
    # fetch the complete paginated data
    page_content = browser.page_source
    soup = BeautifulSoup(page_content, 'html.parser')
    list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
    for index, item in enumerate(list_news):
        logger.debug(item)
        # grab the image
        image_key = image_key + 1
        url_element = item.find('img', {"class": "css-rq4mmj"})
        image_url = url_element['src'] if url_element else ""
        # logger.debug(url)
        if image_url:
            # download the image
            filename = f"{image_key}.jpg"
            # logger.debug(filename)
            # sys.exit()
            download_image(image_url, f'{fileDir}images/{filename}')
        # grab the text
        title_element = item.find('h4', {"class": "css-2fgx4k"})
        introduction_element = item.find('p', {"class": "css-16nhkrn"})
        title = title_element.get_text() if title_element else ""
        introduction = introduction_element.get_text() if introduction_element else ""
        news = {"title": title, "introduction": introduction, "imageName": filename}
        data.append(news)
# logger.debug(data)
# save the data to a file
with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
    json.dump(data, file, indent=2, ensure_ascii=False)
browser.close()
# close the browser driver
browser.quit()
pc_ptt.py

@@ -200,10 +200,14 @@ def reptile(browser=None, search_word=""):
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # nothing was crawled
        log.info("未爬取到数据")  # "no data was crawled"
        script_close(browser)


def script_close(browser):
    # close the browser driver
    try:
        browser.close()
@@ -212,7 +216,6 @@ def reptile(browser=None, search_word=""):
        log.debug("浏览器驱动关闭失败")  # "failed to close the browser driver"
    sys.exit()


def main():
    """
pc_twitter.py

@@ -13,6 +13,7 @@ from datetime import datetime
from utils.download_image import download_image
from config.settings import get_base_file_url
from config.settings import get_account

# utility function - download images
'''
Open the target page, use Selenium to click the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until it is no longer clickable. Finally, grab the fully paginated data and close the browser driver.
'''
@@ -159,10 +160,14 @@ def reptile(browser=None, search_word=""):
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # nothing was crawled
        log.info("未爬取到数据")  # "no data was crawled"
        script_close(browser)


def script_close(browser):
    # close the browser driver
    try:
        browser.close()
@@ -171,6 +176,7 @@ def reptile(browser=None, search_word=""):
        log.debug("浏览器驱动关闭失败")  # "failed to close the browser driver"
    sys.exit()


def main():
    """
pc_video.py
deleted 100644 → 0 (empty file)
pc_youtube.py

@@ -75,15 +75,18 @@ def reptile(browser=None, search_word=""):
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # nothing was crawled
        log.info("未爬取到数据")  # "no data was crawled"
        script_close(browser)


def script_close(browser):
    # close the browser driver
    try:
        browser.close()
        browser.quit()
    except:
        log.debug("浏览器驱动关闭失败")  # "failed to close the browser driver"
    sys.exit()
test.py

This diff is collapsed (14 additions, 9 deletions).
utils/createBrowserDriver.py

@@ -28,7 +28,7 @@ from utils.index import get_screen_resolution
'''

-def create(option=None, using_user_data=True, web_browser="chromium"):
+def create(option=None, using_user_data=True, web_browser="firefox"):
    """
    :param web_browser:
@@ -60,8 +60,10 @@ def create(option=None, using_user_data=True, web_browser="chromium"):
    # add the user data directory so the browser keeps a persistent session (login state and cookies)
    if web_browser == "firefox":
        # replace this with your own Firefox user data directory path
-        profile = FirefoxProfile(profile_directory=user_data_dir)
-        options.profile = profile
+        # profile = FirefoxProfile(profile_directory=user_data_dir)
+        # options.profile = profile
+        options.add_argument("-profile")
+        options.add_argument(user_data_dir)
        # options.add_argument(f'--user-data-dir={user_data_dir}')
    elif web_browser == "chrome":
        options.add_argument(f'--user-data-dir={user_data_dir}')
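The switch from FirefoxProfile to the raw "-profile" argument points Firefox at the existing profile directory instead of copying it to a temporary location, which is what actually preserves logins and cookies between runs. A minimal standalone sketch of the same setup, assuming Selenium 4 and a hypothetical profile path:

    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    user_data_dir = "/home/user/.mozilla/firefox/abcd1234.default-release"  # hypothetical path

    options = Options()
    # hand the profile directory straight to the firefox binary;
    # unlike FirefoxProfile, this reuses the directory in place rather than copying it
    options.add_argument("-profile")
    options.add_argument(user_data_dir)

    browser = webdriver.Firefox(options=options)
    browser.get("https://www.instagram.com/")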