Commit 3a1860d4 authored by liyang

fix:爬取数据入库

parent 2531d166
...@@ -7,7 +7,7 @@ import loguru ...@@ -7,7 +7,7 @@ import loguru
import pymysql.cursors import pymysql.cursors
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from datetime import datetime
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log from utils.Logger import log
# from requests_toolbelt import * # from requests_toolbelt import *
...@@ -105,6 +105,15 @@ def reptile(browser=None, search_word=""): ...@@ -105,6 +105,15 @@ def reptile(browser=None, search_word=""):
# 作者 # 作者
element_author = browser.find_element('xpath', element_author = browser.find_element('xpath',
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]") "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
# 发布时间
element_release = browser.find_element('xpath',
"//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
date_string = element_release.text
date_format = "%a %b %d %H:%M:%S %Y"
# 将日期字符串转换为datetime对象
date_time = datetime.strptime(date_string, date_format)
# 将datetime对象转换为时间戳(以秒为单位)
release_time = int(date_time.timestamp())
log.debug('开始判断类型') log.debug('开始判断类型')
# ---------------- 判断类型 start ---------- # ---------------- 判断类型 start ----------
# 查找所有img标签 # 查找所有img标签
...@@ -133,8 +142,6 @@ def reptile(browser=None, search_word=""): ...@@ -133,8 +142,6 @@ def reptile(browser=None, search_word=""):
for span in span_element: for span in span_element:
span.extract() span.extract()
html = soup.prettify().replace('amp;', '') html = soup.prettify().replace('amp;', '')
print(html)
print("aaaaa")
# ------------------ content 过滤 end-------------- # ------------------ content 过滤 end--------------
# --------------- 组装数据 start--------------------- # --------------- 组装数据 start---------------------
...@@ -144,7 +151,8 @@ def reptile(browser=None, search_word=""): ...@@ -144,7 +151,8 @@ def reptile(browser=None, search_word=""):
"link": browser_current_url, "link": browser_current_url,
"reptileTime": str(int(time.time())), "reptileTime": str(int(time.time())),
"type": content_type, "type": content_type,
"author": element_author.text "author": element_author.text,
"releaseTime": release_time
} }
# --------------- 组装数据 end--------------------- # --------------- 组装数据 end---------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment