liyang / network-assets-reptile / Commits / 1b3d2164

Commit 1b3d2164, authored Jul 19, 2023 by liyang
Parent: 1c268bac

    fix: replace facebook rich-text image src with the local access path

Showing 5 changed files, with 44 additions and 157 deletions.
Changed files:

    pc_facebook.py             +25  -81
    pc_ptt.py                   +3  -18
    pc_twitter.py               +4  -19
    pc_youtube.py               +4  -39
    utils/download_image.py     +8   -0
pc_facebook.py

@@ -8,6 +8,7 @@ from api.index import importJson, getReptileTask, importJsonPath
 from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
 # from pytube import YouTube
 from datetime import datetime
+from utils.download_image import download_image
 import os
 from config.settings import get_base_file_url
@@ -18,6 +19,7 @@ from config.settings import get_base_file_url
 def reptile(browser=None, search_word=""):
+    print(f"搜索词:{search_word}")
     url = "https://www.facebook.com/"
     option = ['--headless']
     browser = browser or create(option)
@@ -76,6 +78,7 @@ def reptile(browser=None, search_word=""):
         # check whether the post contains a video
         # ignore_list = soup.find_all("div", {"data-visualcompletion": "video"})
         video_list = soup.find_all("video")
+        image_list = soup.find_all("img")
         # lth = len(ignore_list)
         if len(video_list) > 0:
             # remove the second child element
@@ -98,6 +101,22 @@ def reptile(browser=None, search_word=""):
         else:
             print("")
+        picture_url = []
+        if len(image_list) > 0:
+            for key, element in enumerate(image_list):
+                # download the image locally and replace the src in the tag
+                id = str(int(time.time()))
+                # download path
+                download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
+                # access URL
+                access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
+                # download status
+                status = download_image(element['src'], download_dir)
+                if status:
+                    element['src'] = access_address
+                    picture_url.append(access_address)
+                else:
+                    print("")
         content = soup.prettify()
         # title: author + date
         title = f"{author}-{datetime.fromtimestamp(release_time_timestamp)}"
@@ -123,101 +142,25 @@ def reptile(browser=None, search_word=""):
             "reptileTime": str(int(time.time())),
             "type": content_type,
             "author": author,
-            "releaseTime": release_time
+            "releaseTime": release_time,
+            "picture_url": ",".join(picture_url)
         }
         # --------------- assemble data: end ---------------------
         data.append(obj)
... (deleted here: roughly sixty lines of commented-out legacy code: debug prints of the login elements; a simulated "load more" click loop via WebDriverWait; BeautifulSoup parsing of li.css-1l4w6pd news items with per-item image download; a local json.dump of the results; and the stub that posted crawled data to the Java service)
     if len(data) > 0:
         # save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "facebook", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # no data was crawled
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     browser.quit()
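save_json is imported from elsewhere in the repo and is not shown in this diff; every call site above treats it as "write JSON, return a success flag". A hypothetical stand-in under that assumption, with the refactored call pattern (file_dir is now a directory, the timestamped filename is composed at save time):

import json
import os
import time

def save_json(path, data):
    """Write data as UTF-8 JSON to path; return True on success, False otherwise."""
    try:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        return True
    except OSError:
        return False

# Mirrors the refactored call site above:
file_dir = os.path.join(os.path.abspath("../"), "network-assets-reptile",
                        "reptile_data", "facebook")
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"),
                       [{"title": "demo"}])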
@@ -233,7 +176,7 @@ def main():
         log.debug("call success")
         search_word = ""
         for item in response['data']['rows']:
-            if item['name'] == 'ptt':
+            if item['name'] == 'facebook':
                 search_word = item['keyword']
                 table_name = item['tableName']
                 status_task = int(item["status"])
@@ -252,5 +195,6 @@ def main():
 # global variables
 data = []
 table_name = "pms_facebook"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
 # call the main function
 main()
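Each crawler's new file_dir global derives its platform folder from its table name; a quick worked check:

table_name = "pms_facebook"
print(table_name.split("_")[1])  # -> facebook, so output lands in reptile_data/facebook/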
pc_ptt.py

@@ -174,31 +174,15 @@ def reptile(browser=None, search_word=""):
     # print(data)
     if len(data) > 0:
         # save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "ptt", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # no data was crawled
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     # close the browser driver
     # time.sleep(3)
@@ -236,6 +220,7 @@ def main():
 # global variables
 data = []
 table_name = "pms_ptt"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
 # enabled flag
 status_task = '0'
 # call the main function
pc_twitter.py

@@ -101,31 +101,15 @@ def reptile(browser=None, search_word=""):
     # print(data)
     if len(data) > 0:
         # save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "twitter", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # no data was crawled
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     # close the browser driver
     # time.sleep(3)
@@ -144,7 +128,7 @@ def main():
         log.debug("call success")
         search_word = ""
         for item in response['data']['rows']:
-            if item['name'] == 'ptt':
+            if item['name'] == 'twitter':
                 search_word = item['keyword']
                 table_name = item['tableName']
                 status_task = int(item["status"])
@@ -163,6 +147,7 @@ def main():
 # global variables
 data = []
 table_name = "pms_twitter"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
 # enabled flag
 status_task = '0'
 # call the main function
pc_youtube.py

@@ -33,31 +33,13 @@ def reptile(browser=None, search_word=""):
         "//div[@id='contents']//ytd-video-renderer//ytd-channel-name//yt-formatted-string/a")
     element_time_list = browser.find_elements('xpath',
         "//div[@id='contents']//ytd-video-renderer//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
-    # collection of video durations
-    # elemnet_logtime_list = browser.find_elements('xpath',"//div[@id='contents']//ytd-video-renderer//ytd-thumbnail//ytd-thumbnail-overlay-time-status-renderer//span")
-    # list all file names in the directory
-    # file_names = os.listdir(os.path.join(os.path.abspath("./"), "reptile_data", "youtube"))
     length = len(classify_video_list)
     for index in range(length):
         title = classify_video_list[index].get_attribute('title')
         link = classify_video_list[index].get_attribute('href')
         id = link.split("?")[1].split("&")[0].replace("v=", "")
         url = f'https://www.youtube.com/watch?v={id}'
-        # is_repeat = False
-        # for item in file_names:
-        #     # print("id——1:"+f'{id}.mp4')
-        #     # print("id——2:" + item)
-        #     if f'{id}.mp4' == item:
-        #         is_repeat = True
-        #     else:
-        #         is_repeat = False
-        # print(is_repeat)
         if index < 6 and YouTube(url).length // 60 < 60:
-            # yt = YouTube(link)
-            # link = "https://www.youtube.com/watch?v=7q88m5MQRhE"
-            # print(link)
-            # author = element_author_list[index].text
-            # file_url = './' + link + '.mp4'
             base_urr = get_base_file_url()
             releaseTime = ""
             try:
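The id extraction kept in the hunk above, link.split("?")[1].split("&")[0].replace("v=", ""), only works when v is the first query parameter. For reference, a more tolerant standard-library variant (an alternative sketch, not what the repo uses):

from urllib.parse import parse_qs, urlparse

def video_id(link):
    """Return the v parameter from any position in the query string."""
    return parse_qs(urlparse(link).query).get("v", [""])[0]

print(video_id("https://www.youtube.com/watch?app=desktop&v=7q88m5MQRhE"))  # 7q88m5MQRhE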
@@ -83,36 +65,18 @@ def reptile(browser=None, search_word=""):
             data.append(obj)
         else:
             print("")
-            # return False
-        # log.debug("")
     if len(data) > 0:
         # save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "youtube", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # no data was crawled
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     browser.close()
     # close the browser driver
     browser.quit()
@@ -130,7 +94,7 @@ def main():
         log.debug("call success")
         search_word = ""
         for item in response['data']['rows']:
-            if item['name'] == 'ptt':
+            if item['name'] == 'youtube':
                 search_word = item['keyword']
                 table_name = item['tableName']
                 status_task = int(item["status"])
@@ -149,6 +113,7 @@ def main():
 # global variables
 data = []
 table_name = "pms_youtube"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
 # enabled flag
 status_task = '0'
 # call the main function
utils/download_image.py

@@ -2,11 +2,19 @@ import requests

 def download_image(url, save_path):
+    """
+    :param url:
+    :param save_path:
+    :return:
+    """
     response = requests.get(url, stream=True)
     if response.status_code == 200:
         with open(save_path, 'wb') as file:
             for chunk in response.iter_content(1024):
                 file.write(chunk)
         # print(f"图片下载成功:{save_path}")
+        return True
     else:
         print(f"图片下载失败:{url}")
+        return False
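The added boolean return is what lets the new facebook loop decide whether to rewrite an img src. A usage sketch; the URL and path here are placeholders, not values from the repo:

from utils.download_image import download_image

ok = download_image("https://example.com/photo.jpg", "/tmp/photo.jpg")
if ok:
    print("saved; safe to point the <img> src at the local copy")
else:
    print("download failed; the original remote src stays in place")

One caveat the diff leaves as-is: requests.get is called without a timeout, so a stalled server can block the crawl indefinitely.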