fix: 爬取数据入库

3a1860d4 · liyang · 2531d166 · 3a1860d4
Commit 3a1860d4 authored Jul 11, 2023 by liyang
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 4 deletions

pc_ptt.py pc_ptt.py +12 -4

No files found.
--- a/pc_ptt.py
+++ b/pc_ptt.py
@@ -7,7 +7,7 @@ import loguru
 import pymysql.cursors
 import requests
 from bs4 import BeautifulSoup
-
+from datetime import datetime
 from api.index import importJson, getReptileTask, importJsonPath
 from utils.Logger import log
 # from requests_toolbelt import *
@@ -105,6 +105,15 @@ def reptile(browser=None, search_word=""):
                # 作者
                element_author = browser.find_element('xpath',
                                                      "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
+                # 发布时间
+                element_release = browser.find_element('xpath',
+                                                       "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
+                date_string = element_release.text
+                date_format = "%a %b %d %H:%M:%S %Y"
+                # 将日期字符串转换为datetime对象
+                date_time = datetime.strptime(date_string, date_format)
+                # 将datetime对象转换为时间戳（以秒为单位）
+                release_time = int(date_time.timestamp())
                log.debug('开始判断类型')
                # ---------------- 判断类型 start ----------
                # 查找所有img标签
@@ -133,8 +142,6 @@ def reptile(browser=None, search_word=""):
                for span in span_element:
                    span.extract()
                html = soup.prettify().replace('amp;', '')
-                print(html)
-                print("aaaaa")
                # ------------------ content 过滤 end--------------

                # --------------- 组装数据 start---------------------
@@ -144,7 +151,8 @@ def reptile(browser=None, search_word=""):
                    "link": browser_current_url,
                    "reptileTime": str(int(time.time())),
                    "type": content_type,
-                    "author": element_author.text
+                    "author": element_author.text,
+                    "releaseTime": release_time
                }
                # --------------- 组装数据 end---------------------