Commit 794b15d6 authored by liyang

fix:facebook 富文本删除视频相关html标签,增加video标签占位

parent 5773068e
...@@ -3,7 +3,7 @@ def get_log_path(): ...@@ -3,7 +3,7 @@ def get_log_path():
return "../" return "../"
def get_base_url(): def get_base_url():
return "http://192.168.0.127:8081/" return "http://192.168.0.118:8081/"
def get_base_file_url(): def get_base_file_url():
return "http://192.168.0.127:8186/" return "http://192.168.0.118:8186/"
\ No newline at end of file \ No newline at end of file
...@@ -5,7 +5,7 @@ from utils.Logger import log ...@@ -5,7 +5,7 @@ from utils.Logger import log
from utils.createBrowserDriver import create from utils.createBrowserDriver import create
from utils.filse import save_json from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download,convert_string_to_time,parse_time_string from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
# from pytube import YouTube # from pytube import YouTube
from datetime import datetime from datetime import datetime
import os import os
...@@ -25,7 +25,7 @@ def reptile(browser=None, search_word=""): ...@@ -25,7 +25,7 @@ def reptile(browser=None, search_word=""):
browser.get(url) browser.get(url)
try: try:
# 检测是否要登录 # 检测是否要登录
login_input = browser.find_element('xpath',"//input[@name='email']") login_input = browser.find_element('xpath', "//input[@name='email']")
password_input = browser.find_element('xpath', "//input[@name='pass']") password_input = browser.find_element('xpath', "//input[@name='pass']")
login_input.send_keys("liyang19970814@gmail.com") login_input.send_keys("liyang19970814@gmail.com")
password_input.send_keys("xn89kiPT/^Kaeg#") password_input.send_keys("xn89kiPT/^Kaeg#")
...@@ -34,37 +34,74 @@ def reptile(browser=None, search_word=""): ...@@ -34,37 +34,74 @@ def reptile(browser=None, search_word=""):
button_login.click() button_login.click()
time.sleep(3) time.sleep(3)
except: except:
print("error") print("已登录")
url = f"https://www.facebook.com/search/top?q={search_word}" url = f"https://www.facebook.com/search/top?q={search_word}"
browser.get(url) browser.get(url)
# 使用 JavaScript 将网页滚动到底部 # 使用 JavaScript 将网页滚动到底部
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3) time.sleep(3)
# 帖子块集合
elements = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]") # 内容
element_content_list = browser.find_elements('xpath',
"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
# 作者 # 作者
element_authors_list = browser.find_elements('xpath', element_authors_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
"//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
# 发布时间 # 发布时间
element_release_list = browser.find_elements('xpath', element_release_list = browser.find_elements('xpath',
"//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]") "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
# 查找所有 展开 按钮,循环点击后在查找内容 # 查找所有 展开 按钮,循环点击后在查找内容
elements_expand_list = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']") elements_expand_list = browser.find_elements('xpath',
for item in elements_expand_list: "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
item.click() for key, element in enumerate(elements_expand_list):
# 内容 try:
element_content_list = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]") # 使用JavaScript 执行点击操作
# print(element_content_list) browser.execute_script("arguments[0].click();", element)
length = len(elements) except Exception as e:
print("Clicking element failed: " + str(e))
length = len(element_content_list)
# print(length) # print(length)
for index in range(length): for index in range(length):
author = element_authors_list[index].text author = element_authors_list[index].text
release_time = str(int(parse_time_string(element_release_list[index].text))) # print(element_release_list[index].text)
content = element_content_list[index].get_attribute("outerHTML") # print(parse_time_string(element_release_list[index].text))
release_time_timestamp = int(parse_time_string(element_release_list[index].text))
release_time = str(release_time_timestamp)
# release_time = ""
# content = element_content_list[index].get_attribute("outerHTML")
# 使用BeautifulSoup解析HTML # 使用BeautifulSoup解析HTML
soup = BeautifulSoup(element_content_list[index].get_attribute('innerHTML'), 'html.parser') text = element_content_list[index].text
soup = BeautifulSoup(element_content_list[index].get_attribute('outerHTML'), 'html.parser')
soup_str = soup.prettify()
# 查找是否含有视频
# ignore_list = soup.find_all("div", {"data-visualcompletion": "video"})
video_list = soup.find_all("video")
# lth = len(ignore_list)
if len(video_list) > 0:
# 删除第二个子元素
# 找到包含两个 <div> 元素的父级元素
parent_div = soup.find('div')
# 找到所有的 <div> 子元素
div_elements = parent_div.find_all('div', recursive=False)
# div_tags = soup.find_all("div", recursive=False)
# 确保列表中至少有两个 <div> 子元素
if len(div_elements) >= 2:
# 获取第二个 <div> 元素,并将其从父级元素中移除
div_to_remove = div_elements[1]
div_to_remove.extract()
# 删除
# div.decompose()
# 创建video标签占位
custom_video = soup.new_tag("video")
custom_video["src"] = ""
parent_div.append(custom_video)
else:
print("")
content = soup.prettify()
# 标题取:作者+日期 # 标题取:作者+日期
title = f"{author}-{datetime.fromtimestamp(int(parse_time_string(element_release_list[index].text)))}" title = f"{author}-{datetime.fromtimestamp(release_time_timestamp)}"
# title = ""
# ---------------- 判断类型 start ---------- # ---------------- 判断类型 start ----------
# 类型 # 类型
content_type = "" content_type = ""
...@@ -183,6 +220,7 @@ def reptile(browser=None, search_word=""): ...@@ -183,6 +220,7 @@ def reptile(browser=None, search_word=""):
browser.quit() browser.quit()
def main(): def main():
""" """
......
...@@ -38,9 +38,9 @@ def parse_time_string(time_str): ...@@ -38,9 +38,9 @@ def parse_time_string(time_str):
else: else:
try: try:
datetime_str = time_str.replace("月", " ").replace("日", "") datetime_str = time_str.replace("月", " ").replace("日", "")
month, day, hour, minute = map(int, datetime_str.split()) month, day = map(int, datetime_str.split())
current_year = datetime.datetime.now().year current_year = datetime.datetime.now().year
datetime_obj = datetime.datetime(year=current_year, month=month, day=day, hour=hour, minute=minute) datetime_obj = datetime.datetime(year=current_year, month=month, day=day)
return int(datetime_obj.timestamp()) return int(datetime_obj.timestamp())
except ValueError: except ValueError:
return None return None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment