Commit e66c1079 authored by liyang

fix: twitter filtering

parent 0af3679e
@@ -3,6 +3,7 @@
 venv/
 .idea/
 *pycache*/
+*.svg
 user_data/**
 log/**/*.json
 app.log
...
@@ -5,7 +5,7 @@ from utils.Logger import log
 from utils.createBrowserDriver import create
 from utils.filse import save_json
 from api.index import importJson, getReptileTask, importJsonPath
-from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string
+from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string, extract_image_format
 # from pytube import YouTube
 import os
 import sys
@@ -94,14 +94,6 @@ def reptile(browser=None, search_word=""):
         link_str = ""
         timestamp = time.time()
-        # Remove the redundant divs
-        parent_div = soup.find("div")
-        # Find all the <div> child elements
-        div_elements = parent_div.find_all('div', recursive=False)
-        for key, item in enumerate(div_elements):
-            if key == 0 or key == len(div_elements) - 1:
-                item.extract()
         author = element_authors_list[index].text
         # Title: author + date
         title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
@@ -111,11 +103,8 @@ def reptile(browser=None, search_word=""):
         # lth = len(ignore_list)
         if len(video_list) > 0:
             # for key,element in enumerate(video_list):
-            # Delete the second child element
-            # Find the parent element that contains the two <div> elements
-            parent_div = soup.find('div')
-            # Find all the <div> child elements
-            div_elements = parent_div.find_all('div', recursive=False)
+            div_elements = soup.find("div").findChildren("div", recursive=False)
             # div_tags = soup.find_all("div", recursive=False)
             # Make sure the list has at least two <div> child elements
             if len(div_elements) >= 2:
@@ -127,7 +116,7 @@ def reptile(browser=None, search_word=""):
                 # Create a placeholder video tag
                 custom_video = soup.new_tag("video")
                 custom_video["src"] = ""
-                parent_div.append(custom_video)
+                soup.find("div").append(custom_video)
         else:
             # print("")
             error = ""
@@ -136,10 +125,11 @@ def reptile(browser=None, search_word=""):
         for key, element in enumerate(image_list):
             # Download the image locally and replace the src in the tag
             id = str(int(time.time()))
+            image_type = extract_image_format(element['src'])
             # Download path
-            download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
+            download_dir = f'{os.path.join(file_dir, f"{id}.{image_type}")}'
             # Access URL
-            access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
+            access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.{image_type}'
             # Download status
             status = download_image(element['src'], download_dir)
             if status:
@@ -149,8 +139,16 @@ def reptile(browser=None, search_word=""):
             # print("")
             error = ""
-        content = soup.prettify()
+        # Remove the redundant divs
+        # parent_div = soup.find("div")
+        # Find all the <div> child elements
+        div_elements = soup.find("div").findChildren("div", recursive=False)
+        for key, item in enumerate(div_elements):
+            if key == 0 or key == len(div_elements) - 1:
+                item.extract()
+        content = soup.prettify()
+        print("")
         # ---------------- Determine type: start ----------
         # Type
         content_type = ""
@@ -164,6 +162,7 @@ def reptile(browser=None, search_word=""):
         except:
             content_type = "文字"
         # ---------------- Determine type: end ----------
+
         # --------------- Assemble data: start ---------------------
         obj = {
             "title": title,
...
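
Note on the div cleanup above: trimming the first and last direct <div> children of the tweet markup now runs once, after the image and video handling, immediately before soup.prettify(). A minimal sketch of that trimming step in isolation, assuming BeautifulSoup (bs4) is installed; the sample HTML is made up for illustration:

from bs4 import BeautifulSoup

html = "<div><div>header</div><div>tweet body</div><div>footer</div></div>"
soup = BeautifulSoup(html, "html.parser")

# Direct <div> children of the outermost <div>
div_elements = soup.find("div").findChildren("div", recursive=False)
for key, item in enumerate(div_elements):
    # Drop the first and last child; keep the middle content
    if key == 0 or key == len(div_elements) - 1:
        item.extract()

print(soup.prettify())  # only the "tweet body" <div> remains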
@@ -11,6 +11,8 @@ import utils.Logger
 from dateutil import parser
 import base64
 import tkinter as tk
+from urllib.parse import urlparse, parse_qs
+
 # import io
 # from PIL import Image
@@ -19,6 +21,27 @@ import tkinter as tk
 ssl._create_default_https_context = ssl._create_stdlib_context
+
+
+def extract_image_format(url):
+    # Parse the URL
+    parsed_url = urlparse(url)
+    # Get the query-string part
+    query_string = parsed_url.query
+    # Check whether the query string is empty
+    if query_string:
+        return "jpg"
+    else:
+        # Find the index of the last dot (.) in the URL
+        last_dot_index = url.rfind('.')
+        # If no dot is found, or the dot is at the end of the URL, the image format cannot be determined
+        if last_dot_index == -1 or last_dot_index == len(url) - 1:
+            return "jpg"
+        # Take the part after the dot, i.e. the file extension
+        image_format = url[last_dot_index + 1:]
+        return image_format
+
+
 def get_screen_resolution():
     try:
         root = tk.Tk()
@@ -30,6 +53,7 @@ def get_screen_resolution():
         print("无法获取屏幕分辨率:", e)
         return 1920, 1080
+
 def save_base64_image(base64_string, file_path):
     try:
         # Decode the base64 data
...
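
Note on extract_image_format above: a URL with a query string falls back to "jpg" (Twitter image URLs typically carry the format there, e.g. ?format=jpg&name=small); otherwise the text after the last dot is used. A quick usage sketch with made-up URLs, importing the function from utils.index as the crawler does:

from utils.index import extract_image_format

print(extract_image_format("https://pbs.twimg.com/media/abc?format=jpg&name=small"))  # jpg
print(extract_image_format("https://example.com/images/photo.png"))  # png
print(extract_image_format("photo"))  # jpg (no dot anywhere, so the fallback applies)

Caveat: rfind('.') scans the whole URL rather than just the path, so an extension-less URL with a dotted hostname (e.g. https://example.com/photo) returns "com/photo" instead of falling back to "jpg". Note also that parse_qs is imported but not yet used.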