Commit 3a1860d4 authored by liyang

fix:爬取数据入库

parent 2531d166
......@@ -7,7 +7,7 @@ import loguru
import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
# from requests_toolbelt import *
......@@ -105,6 +105,15 @@ def reptile(browser=None, search_word=""):
# 作者
element_author = browser.find_element('xpath',
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
# 发布时间
element_release = browser.find_element('xpath',
"//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
date_string = element_release.text
date_format = "%a %b %d %H:%M:%S %Y"
# 将日期字符串转换为datetime对象
date_time = datetime.strptime(date_string, date_format)
# 将datetime对象转换为时间戳(以秒为单位)
release_time = int(date_time.timestamp())
log.debug('开始判断类型')
# ---------------- 判断类型 start ----------
# 查找所有img标签
......@@ -133,8 +142,6 @@ def reptile(browser=None, search_word=""):
for span in span_element:
span.extract()
html = soup.prettify().replace('amp;', '')
print(html)
print("aaaaa")
# ------------------ content 过滤 end--------------
# --------------- 组装数据 start---------------------
......@@ -144,7 +151,8 @@ def reptile(browser=None, search_word=""):
"link": browser_current_url,
"reptileTime": str(int(time.time())),
"type": content_type,
"author": element_author.text
"author": element_author.text,
"releaseTime": release_time
}
# --------------- 组装数据 end---------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment