Commit 794b15d6 authored by liyang

fix: strip video-related HTML tags from Facebook rich text and insert a placeholder video tag

parent 5773068e
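
In outline, the rich-text handling this commit adds works like this: if a post's HTML contains a `<video>` element, the second direct `<div>` child of the outer container (where Facebook nests the video player) is detached, and an empty `<video>` tag is appended in its place. Below is a minimal standalone sketch of that transformation; `replace_video_with_placeholder` is a hypothetical helper name, and the assumption that the second child `<div>` holds the player is taken from the diff that follows:

```python
from bs4 import BeautifulSoup

def replace_video_with_placeholder(html):
    # Hypothetical helper mirroring the change made inside reptile() below.
    soup = BeautifulSoup(html, "html.parser")
    if soup.find("video") is not None:
        parent_div = soup.find("div")
        if parent_div is not None:
            div_children = parent_div.find_all("div", recursive=False)
            if len(div_children) >= 2:
                # Detach the <div> that wraps the video player
                div_children[1].extract()
                # Append an empty <video> tag as a placeholder
                placeholder = soup.new_tag("video")
                placeholder["src"] = ""
                parent_div.append(placeholder)
    return soup.prettify()
```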
@@ -3,7 +3,7 @@ def get_log_path():
     return "../"

 def get_base_url():
-    return "http://192.168.0.127:8081/"
+    return "http://192.168.0.118:8081/"

 def get_base_file_url():
-    return "http://192.168.0.127:8186/"
\ No newline at end of file
+    return "http://192.168.0.118:8186/"
\ No newline at end of file
@@ -5,7 +5,7 @@ from utils.Logger import log
 from utils.createBrowserDriver import create
 from utils.filse import save_json
 from api.index import importJson, getReptileTask, importJsonPath
-from utils.index import convert_to_traditional, yt_dlp_download,convert_string_to_time,parse_time_string
+from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
 # from pytube import YouTube
 from datetime import datetime
 import os
@@ -25,7 +25,7 @@ def reptile(browser=None, search_word=""):
     browser.get(url)
     try:
         # Check whether login is required
-        login_input = browser.find_element('xpath',"//input[@name='email']")
+        login_input = browser.find_element('xpath', "//input[@name='email']")
         password_input = browser.find_element('xpath', "//input[@name='pass']")
         login_input.send_keys("liyang19970814@gmail.com")
         password_input.send_keys("xn89kiPT/^Kaeg#")
@@ -34,37 +34,74 @@ def reptile(browser=None, search_word=""):
         button_login.click()
         time.sleep(3)
     except:
-        print("error")
+        print("Already logged in")
     url = f"https://www.facebook.com/search/top?q={search_word}"
     browser.get(url)
     # Scroll to the bottom of the page via JavaScript
     browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
     time.sleep(3)
-    # Collection of post blocks
-    elements = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]")
+    # Post content
+    element_content_list = browser.find_elements('xpath',
+                                                 "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
     # Authors
-    element_authors_list = browser.find_elements('xpath',
-                                                 "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
+    element_authors_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
     # Release time
     element_release_list = browser.find_elements('xpath',
                                                  "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
     # Find all "展开" (expand) buttons, click each one, then read the content
-    elements_expand_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
-    for item in elements_expand_list:
-        item.click()
-    # Content
-    element_content_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
-    # print(element_content_list)
-    length = len(elements)
+    elements_expand_list = browser.find_elements('xpath',
+                                                 "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
+    for key, element in enumerate(elements_expand_list):
+        try:
+            # Perform the click via JavaScript
+            browser.execute_script("arguments[0].click();", element)
+        except Exception as e:
+            print("Clicking element failed: " + str(e))
+    length = len(element_content_list)
     # print(length)
     for index in range(length):
         author = element_authors_list[index].text
-        release_time = str(int(parse_time_string(element_release_list[index].text)))
-        content = element_content_list[index].get_attribute("outerHTML")
+        # print(element_release_list[index].text)
+        # print(parse_time_string(element_release_list[index].text))
+        release_time_timestamp = int(parse_time_string(element_release_list[index].text))
+        release_time = str(release_time_timestamp)
+        # release_time = ""
+        # content = element_content_list[index].get_attribute("outerHTML")
         # Parse the HTML with BeautifulSoup
-        soup = BeautifulSoup(element_content_list[index].get_attribute('innerHTML'), 'html.parser')
-        text = element_content_list[index].text
+        soup = BeautifulSoup(element_content_list[index].get_attribute('outerHTML'), 'html.parser')
+        soup_str = soup.prettify()
+        # Check whether the post contains a video
+        # ignore_list = soup.find_all("div", {"data-visualcompletion": "video"})
+        video_list = soup.find_all("video")
+        # lth = len(ignore_list)
+        if len(video_list) > 0:
+            # Remove the second child element:
+            # find the parent element that holds the two <div> children
+            parent_div = soup.find('div')
+            # Find all direct <div> children
+            div_elements = parent_div.find_all('div', recursive=False)
+            # div_tags = soup.find_all("div", recursive=False)
+            # Make sure there are at least two <div> children
+            if len(div_elements) >= 2:
+                # Take the second <div> and detach it from its parent
+                div_to_remove = div_elements[1]
+                div_to_remove.extract()
+                # Delete
+                # div.decompose()
+                # Create a placeholder <video> tag
+                custom_video = soup.new_tag("video")
+                custom_video["src"] = ""
+                parent_div.append(custom_video)
+            else:
+                print("")
+        content = soup.prettify()
         # Title: author + date
-        title = f"{author}-{datetime.fromtimestamp(int(parse_time_string(element_release_list[index].text)))}"
+        title = f"{author}-{datetime.fromtimestamp(release_time_timestamp)}"
+        # title = ""
        # ---------------- determine type: start ----------
        # Type
        content_type = ""
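
The expand-button loop above switches from the native `item.click()` to a JavaScript click, which still fires when another element overlays the button and a native click would raise `ElementClickInterceptedException`. A minimal self-contained sketch of the pattern, with a placeholder URL and selector rather than the ones from this repo:

```python
from selenium import webdriver

browser = webdriver.Chrome()
browser.get("https://example.com")  # placeholder URL
# Placeholder selector; the scraper targets Facebook's 展开 (expand) buttons
buttons = browser.find_elements('xpath', "//div[@role='button']")
for button in buttons:
    try:
        # A JavaScript click fires even when an overlay covers the target,
        # where element.click() would fail as "click intercepted"
        browser.execute_script("arguments[0].click();", button)
    except Exception as e:
        print("Clicking element failed: " + str(e))
browser.quit()
```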
@@ -183,6 +220,7 @@ def reptile(browser=None, search_word=""):
     browser.quit()
 def main():
     """
......
@@ -38,9 +38,9 @@ def parse_time_string(time_str):
     else:
         try:
             datetime_str = time_str.replace("月", " ").replace("日", "")
-            month, day, hour, minute = map(int, datetime_str.split())
+            month, day = map(int, datetime_str.split())
             current_year = datetime.datetime.now().year
-            datetime_obj = datetime.datetime(year=current_year, month=month, day=day, hour=hour, minute=minute)
+            datetime_obj = datetime.datetime(year=current_year, month=month, day=day)
             return int(datetime_obj.timestamp())
         except ValueError:
             return None
......
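
With the change above, `parse_time_string` treats strings like "7月14日" (July 14) as date-only: it assumes the current year and midnight rather than unpacking an hour and minute the string does not carry. A minimal sketch of just this branch; `parse_date_only` is a hypothetical name, and the real function also has an earlier branch for other time formats, elided here:

```python
import datetime

def parse_date_only(time_str):
    # "7月14日" -> "7 14" -> timestamp for July 14 of the current year, 00:00
    try:
        datetime_str = time_str.replace("月", " ").replace("日", "")
        month, day = map(int, datetime_str.split())
        current_year = datetime.datetime.now().year
        datetime_obj = datetime.datetime(year=current_year, month=month, day=day)
        return int(datetime_obj.timestamp())
    except ValueError:
        return None

print(parse_date_only("7月14日"))  # timestamp value depends on year and timezone
```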