Commit 00b24977 authored by liyang

fix:去除富文本转义符号

parent 22acb3c5
# # 导入依赖库
import json
import time
from telnetlib import EC
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from utils.Logger import log
from utils.createBrowserDriver import create
from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download,convert_string_to_time
# from pytube import YouTube
import os
from config.settings import get_base_file_url
# 工具函数-下载图片
'''
......@@ -14,36 +16,30 @@ from utils.createBrowserDriver import create
'''
def reptile(browser):
# # json 数据
data = []
image_key = 0
fileDir = "./reptile_data/news/nytimes/"
# year = datetime(2021, 1, 1)
# startDate = datetime(2020, 12, 31) # 初始日期
# endDate = datetime(2020, 12, 31) # 结束日期
def reptile(browser=None, search_word=""):
url = "https://twitter.com/"
browser = browser or create(False)
print(browser)
option = ['--headless']
# ['--headless']
browser = browser or create()
# print(browser)
# browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
# endDate = startDate = startDate + timedelta(days=i)
# 打开网页
browser.get(url)
# WebDriverWait(browser,10).
# 打开登录窗口
open_button_login = WebDriverWait(browser, 10).until(
EC.presence_of_element_located((By.XPATH, "//a[@data-testid='login']")))
open_button_login.click()
time.sleep(5)
# open_button_login = WebDriverWait(browser, 10).until(
# EC.presence_of_element_located((By.XPATH, "//a[@data-testid='login']")))
# open_button_login.click()
# time.sleep(5)
# 获取账号密码输入框
input_email_element = WebDriverWait(browser, 10).until(
EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
# 获取下一步按钮
buttons = WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@role='button']")))
for item in buttons:
print(BeautifulSoup(item, 'html.parser'))
# input_email_element = WebDriverWait(browser, 10).until(
# EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
# # 获取下一步按钮
# buttons = WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@role='button']")))
# for item in buttons:
# print(BeautifulSoup(item, 'html.parser'))
# soup = BeautifulSoup(page_content, 'html.parser')
# input_pwd_element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "//input[@name='pass']")))
# # 获取登录按钮
......@@ -68,8 +64,8 @@ def reptile(browser):
# break
# time.sleep(3)
# 获取完整的分页数据
page_content = browser.page_source
soup = BeautifulSoup(page_content, 'html.parser')
# page_content = browser.page_source
# soup = BeautifulSoup(page_content, 'html.parser')
# print("----------")
# print(soup)
# list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
......@@ -105,6 +101,34 @@ def reptile(browser):
# with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
# json.dump(data, file, indent=2, ensure_ascii=False)
browser.close()
# 关闭浏览器驱动
browser.quit()
# browser.close()
# # 关闭浏览器驱动
# browser.quit()
def main():
    """Fetch the crawl task list and run the Twitter crawl once.

    Calls ``getReptileTask()``. When both the outer ``status_code`` and the
    payload ``code`` equal 200, the rows are scanned for entries whose
    ``name`` is ``'twitter'``; the ``keyword`` and ``tableName`` of the last
    such row are kept. The keyword is passed through
    ``convert_to_traditional`` and handed to ``reptile``. If the task request
    did not succeed, ``reptile`` is run with an empty search word instead.
    """
    # Without this declaration the assignment in the loop would only bind an
    # unused local, and the module-level ``table_name`` would never change.
    global table_name

    # Request the search keywords.
    response = getReptileTask()
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'twitter':
                search_word = item['keyword']
                table_name = item['tableName']
        # NOTE(review): search_word is initialised before the loop, so the
        # crawl is started once after it, with "" when no row matched.
        reptile(None, convert_to_traditional(search_word))
    else:
        log.debug("call failed")
        reptile(None, '')
# upload_control()
# Module-level globals
data = []
table_name = "pms_twitter"
# Call main(); this is unguarded, so it also runs when the module is imported
main()
\ No newline at end of file
......@@ -29,7 +29,7 @@ def reptile(browser=None, search_word=""):
# print(classify_item_list)
length = len(classify_video_list)
for index in range(length):
if 0 < index < 2:
if -1 < index < length:
title = classify_video_list[index].get_attribute('title')
link = classify_video_list[index].get_attribute('href')
# yt = YouTube(link)
......@@ -47,7 +47,7 @@ def reptile(browser=None, search_word=""):
# 组装数据
obj = {
"title": title,
"content": f"<video src='{file_http_src}'></video>",
"content": f"<video controls style='width:100%' src='{file_http_src}'></video>",
"videoUrl": file_http_src,
"link": link,
"reptileTime": str(int(time.time())),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment