Commit 1b3d2164 authored by liyang

fix: facebook rich-text image src replaced with a local access path

parent 1c268bac
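In outline, the change walks every <img> tag in the scraped Facebook rich text, downloads each image into the reptile_data directory, and rewrites the tag's src so the stored HTML points at the locally served copy. A minimal sketch of that idea (simplified from the diff below; localize_images is an illustrative helper name, and save_dir/base_url stand in for the file_dir and get_base_file_url() values used in the real code):

from bs4 import BeautifulSoup
from utils.download_image import download_image

def localize_images(html, save_dir, base_url):
    # Parse the scraped rich text and rewrite each <img> src in place.
    soup = BeautifulSoup(html, 'html.parser')
    for index, img in enumerate(soup.find_all("img")):
        local_path = f"{save_dir}/{index}.jpg"              # where the image file is written
        if download_image(img['src'], local_path):           # only rewrite on a successful download
            img['src'] = f"{base_url}facebook/{index}.jpg"   # point at the locally served copy
    return soup.prettify()

The diff applies the same pattern inline inside reptile() and also collects each rewritten URL in picture_url so it can be stored alongside the post.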
@@ -8,6 +8,7 @@ from api.index import importJson, getReptileTask, importJsonPath
 from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
 # from pytube import YouTube
 from datetime import datetime
+from utils.download_image import download_image
 import os
 from config.settings import get_base_file_url
@@ -18,6 +19,7 @@ from config.settings import get_base_file_url
 def reptile(browser=None, search_word=""):
+    print(f"搜索词:{search_word}")
     url = "https://www.facebook.com/"
     option = ['--headless']
     browser = browser or create(option)
@@ -76,6 +78,7 @@ def reptile(browser=None, search_word=""):
     # Check whether the post contains video
     # ignore_list = soup.find_all("div", {"data-visualcompletion": "video"})
     video_list = soup.find_all("video")
+    image_list = soup.find_all("img")
     # lth = len(ignore_list)
     if len(video_list) > 0:
         # Remove the second child element
@@ -98,6 +101,22 @@ def reptile(browser=None, search_word=""):
     else:
         print("")
+    picture_url = []
+    if len(image_list) > 0:
+        for key, element in enumerate(image_list):
+            # Download the image locally and replace the src in the tag
+            id = str(int(time.time()))
+            # Download path
+            download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
+            # Access URL
+            access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
+            # Download status
+            status = download_image(element['src'], download_dir)
+            if status:
+                element['src'] = access_address
+                picture_url.append(access_address)
+            else:
+                print("")
     content = soup.prettify()
     # Title: author + date
     title = f"{author}-{datetime.fromtimestamp(release_time_timestamp)}"
@@ -123,101 +142,25 @@ def reptile(browser=None, search_word=""):
             "reptileTime": str(int(time.time())),
             "type": content_type,
             "author": author,
-            "releaseTime": release_time
+            "releaseTime": release_time,
+            "picture_url": ",".join(picture_url)
         }
         # --------------- assemble data end ---------------------
         data.append(obj)
-    # print(content)
-    # Content filtering
-    # Parse the HTML with BeautifulSoup
-    # soup = BeautifulSoup(content.get_attribute("innerHTML"), 'html.parser')
-    # print(soup)
-    # print("-----")
-    # print("---------------")
-    # print(input_email_element)
-    # print(input_pwd_element)
-    # print(button_login)
-    # logger.debug(button)
-    # Simulate clicking the button repeatedly to load more data
-    # while button.is_enabled():
-    #     time.sleep(2)  # Wait a moment to make sure the page has loaded
-    #     try:
-    #         button.click()
-    #         button = WebDriverWait(browser, 5).until(
-    #             EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
-    #     except:
-    #         break
-    # time.sleep(3)
-    # Get the full paginated data
-    # page_content = browser.page_source
-    # soup = BeautifulSoup(page_content, 'html.parser')
-    # print("----------")
-    # print(soup)
-    # list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
-    # for index, item in enumerate(list_news):
-    #     logger.debug(item)
-    #     # Grab the image
-    #     image_key = image_key + 1
-    #     url_element = item.find('img', {"class": "css-rq4mmj"})
-    #     image_url = url_element['src'] if url_element else ""
-    #     # logger.debug(url)
-    #     if image_url:
-    #         # logger.debug(url)
-    #         # # Download the image
-    #         #
-    #         filename = f"{image_key}.jpg"
-    #         # logger.debug(filename)
-    #         # sys.exit()
-    #         download_image(image_url, f'{fileDir}images/{filename}')
-    #     # Grab the text
-    #     title_element = item.find('h4', {"class": "css-2fgx4k"})
-    #     introduction_element = item.find('p', {"class": "css-16nhkrn"})
-    #     title = title_element.get_text() if title_element else ""
-    #     introduction = introduction_element.get_text() if introduction_element else ""
-    #     news = {
-    #         "title": title,
-    #         "introduction": introduction,
-    #         "imageName": filename
-    #     }
-    #     data.append(news)
-    # logger.debug(data)
-    # Save the data to a file
-    # with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
-    #     json.dump(data, file, indent=2, ensure_ascii=False)
-    # Send the scraped data to the Java service
-    # print('----------------------')
-    # print(data)
     if len(data) > 0:
         # Save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "facebook", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # No data was scraped
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     browser.quit()
@@ -233,7 +176,7 @@ def main():
     log.debug("call success")
     search_word = ""
     for item in response['data']['rows']:
-        if item['name'] == 'ptt':
+        if item['name'] == 'facebook':
             search_word = item['keyword']
             table_name = item['tableName']
             status_task = int(item["status"])
@@ -252,5 +195,6 @@ def main():
 # Global variables
 data = []
 table_name = "pms_facebook"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
 # Call the main function
 main()
@@ -174,31 +174,15 @@ def reptile(browser=None, search_word=""):
     # print(data)
     if len(data) > 0:
         # Save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "ptt", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir,str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # No data was scraped
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     # Close the browser driver
     # time.sleep(3)
@@ -236,6 +220,7 @@ def main():
 # Global variables
 data = []
 table_name = "pms_ptt"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
 # Whether the task is enabled
 status_task = '0'
 # Call the main function
...
@@ -101,31 +101,15 @@ def reptile(browser=None, search_word=""):
     # print(data)
     if len(data) > 0:
         # Save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "twitter", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # No data was scraped
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     # Close the browser driver
     # time.sleep(3)
@@ -144,7 +128,7 @@ def main():
     log.debug("call success")
     search_word = ""
     for item in response['data']['rows']:
-        if item['name'] == 'ptt':
+        if item['name'] == 'twitter':
             search_word = item['keyword']
             table_name = item['tableName']
             status_task = int(item["status"])
@@ -163,6 +147,7 @@ def main():
 # Global variables
 data = []
 table_name = "pms_twitter"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
 # Whether the task is enabled
 status_task = '0'
 # Call the main function
...
@@ -33,31 +33,13 @@ def reptile(browser=None, search_word=""):
                                                 "//div[@id='contents']//ytd-video-renderer//ytd-channel-name//yt-formatted-string/a")
     element_time_list = browser.find_elements('xpath',
                                                 "//div[@id='contents']//ytd-video-renderer//ytd-video-meta-block//div[@id='metadata-line']/span[2]")
-    # Video duration list
-    # elemnet_logtime_list = browser.find_elements('xpath',"//div[@id='contents']//ytd-video-renderer//ytd-thumbnail//ytd-thumbnail-overlay-time-status-renderer//span")
-    # Get all file names in the directory
-    # file_names = os.listdir(os.path.join(os.path.abspath("./"), "reptile_data", "youtube"))
     length = len(classify_video_list)
     for index in range(length):
         title = classify_video_list[index].get_attribute('title')
         link = classify_video_list[index].get_attribute('href')
         id = link.split("?")[1].split("&")[0].replace("v=", "")
         url = f'https://www.youtube.com/watch?v={id}'
-        # is_repeat = False
-        # for item in file_names:
-        #     # print("id——1:"+f'{id}.mp4')
-        #     # print("id——2:" + item)
-        #     if f'{id}.mp4' == item:
-        #         is_repeat = True
-        #     else:
-        #         is_repeat = False
-        # print(is_repeat)
         if index < 6 and YouTube(url).length // 60 < 60:
-            # yt = YouTube(link)
-            # link = "https://www.youtube.com/watch?v=7q88m5MQRhE"
-            # print(link)
-            # author = element_author_list[index].text
-            # file_url = './' + link + '.mp4'
             base_urr = get_base_file_url()
             releaseTime = ""
             try:
@@ -83,36 +65,18 @@ def reptile(browser=None, search_word=""):
             data.append(obj)
         else:
             print("")
-            # return False
-            # log.debug("")
     if len(data) > 0:
         # Save the JSON file locally
-        log.debug(os.path.abspath("../"))
-        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "youtube", str(int(time.time())) + ".json")}'
-        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
-        state_save = save_json(file_dir, data)
-        # log.debug("-----------------------------")
-        # write_to_database(data)
+        # log.debug(os.path.abspath("../"))
+        state_save = save_json(os.path.join(file_dir,str(int(time.time())) + ".json"), data)
         if state_save:
             log.debug('save file success')
-            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
-            # log.debug('file_path:' + file_dir)
-            # form_data = {
-            #     "path": file_dir,
-            #     "tableName": table_name
-            # }
-            # response = importJsonPath(form_data)
         else:
             log.debug('save file failed')
     else:
         # No data was scraped
         log.info("未爬取到数据")
-        # form_data = {
-        #     "path": "",
-        #     "tableName": table_name
-        # }
-        # response = importJsonPath(form_data)
     browser.close()
     # Close the browser driver
     browser.quit()
@@ -130,7 +94,7 @@ def main():
     log.debug("call success")
     search_word = ""
     for item in response['data']['rows']:
-        if item['name'] == 'ptt':
+        if item['name'] == 'youtube':
             search_word = item['keyword']
             table_name = item['tableName']
             status_task = int(item["status"])
@@ -149,6 +113,7 @@ def main():
 # Global variables
 data = []
 table_name = "pms_youtube"
+file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
 # Whether the task is enabled
 status_task = '0'
 # Call the main function
...
@@ -2,11 +2,19 @@ import requests
 def download_image(url, save_path):
+    """
+    :param url: URL of the image to download
+    :param save_path: local file path to write the image to
+    :return: True if the download succeeded, otherwise False
+    """
     response = requests.get(url, stream=True)
     if response.status_code == 200:
         with open(save_path, 'wb') as file:
             for chunk in response.iter_content(1024):
                 file.write(chunk)
         # print(f"图片下载成功:{save_path}")
+        return True
     else:
         print(f"图片下载失败:{url}")
+        return False
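Since download_image now reports success, callers such as the image loop in the Facebook scraper can branch on its return value. A minimal usage sketch (the URL and target path are illustrative only):

from utils.download_image import download_image

ok = download_image("https://example.com/photo.jpg", "./photo.jpg")
if ok:
    print("saved ./photo.jpg")
else:
    print("download failed")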