Commit e0c2ddfc authored by liyang

fix:爬虫优化

parent ea343def
......@@ -5,7 +5,7 @@ from utils.Logger import log
from utils.createBrowserDriver import create
from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string
# from pytube import YouTube
import os
from config.settings import get_base_file_url
......@@ -20,7 +20,7 @@ def reptile(browser=None, search_word=""):
url = "https://twitter.com/"
option = ['--headless']
# ['--headless']
browser = browser or create(option, False)
browser = browser or create(None, False)
# print(browser)
# 打开网页
browser.get(url)
......@@ -41,7 +41,7 @@ def reptile(browser=None, search_word=""):
except:
print("------")
time.sleep(2)
url = 'https://twitter.com/search?q='+search_word+'&src=typed_query'
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
browser.get(url)
time.sleep(3)
# 内容块
......@@ -57,7 +57,10 @@ def reptile(browser=None, search_word=""):
length = len(element_authors_list)
for index in range(length):
author = element_authors_list[index].text
release_time = str(int(parse_twitter_time_string(element_release_list[index].text)))
try:
release_time = str(int(parse_twitter_time_string(element_release_list[index].text)))
except:
release_time = str(int(time.time()))
content = element_content_list[index].get_attribute("innerHTML")
# print(content)
# 内容过滤
......@@ -141,7 +144,7 @@ def main():
table_name = item['tableName']
status_task = int(item["status"])
# 简体转繁体
if status_task == 0 and len(search_word)>0:
if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word))
else:
log.debug("爬取任务未启用")
......
import os.path
import re
import time
import opencc
from pytube import YouTube
import ssl
......@@ -80,8 +82,12 @@ def parse_twitter_time_string(time_str):
:param time_str:
:return:
"""
times = parser.parse(time_str, fuzzy=True)
# a = datetime.datetime.strptime(time,"%Y-%m-%d %H:%M:%S")
b = datetime.datetime.strftime(times,"%Y-%m-%d %H:%M:%S")
c = time.mktime(time.strptime(b,"%Y-%m-%d %H:%M:%S"))
# 解析相对时间字符串
return datetime.timestamp(parser.parse(time_str, fuzzy=True))
return c
def convert_to_traditional(simplified_text):
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment