fix:爬取数据入库

d6d072eb · liyang · 3769935a · d6d072eb · d6d072eb
Commit d6d072eb authored Jul 11, 2023 by liyang
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 168 deletions

app.log app.log +0 -142

pc_ptt.py pc_ptt.py +49 -26

No files found.
--- a/app.log
+++ b/app.log
--- a/pc_ptt.py
+++ b/pc_ptt.py
@@ -61,17 +61,15 @@ def write_to_database(data):
 def reptile(browser=None, search_word=""):
    url = "https://www.ptt.cc/bbs/hotboards.html"
    browser = browser or create(['--headless'])
-    # browser = browser or create()
-    # time.sleep(1)
    # 打开网页
    browser.get(url)
+    log.debug("已打开浏览器")
    classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
    # log.debug(classify_item_list)
    length = len(classify_item_list)
    for index in range(length):
-        if 1 < index < 3:
+        if 0 < index < 2:
            classify_item_list[index].click()
-            # if index==0:
            time.sleep(1)
            element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
            length_two = len(element_list)
@@ -104,35 +102,61 @@ def reptile(browser=None, search_word=""):
                # ------------------------------------
                # 使用BeautifulSoup解析HTML
                soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
+                # 作者
+                element_author = browser.find_element('xpath',
+                                                      "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
+                log.debug('开始判断类型')
+                # ---------------- 判断类型 start ----------
+                # 查找所有img标签
+                img_tags = soup.find_all('img')
+                # 类型
+                content_type = ""
+                if len(img_tags) > 0:
+                    content_type = "图文"
+                else:
+                    content_type = "文字"
+                # ---------------- 判断类型 end ----------
+                log.debug('开始内容过滤')
+                # ------------------ content 过滤 start--------------
                # 查找所有的<a>标签
                a_tags = soup.find_all('a', href=True)
                # Remove every <a> tag that has an href attribute (no check for a nested <img> is performed)
                for tag in a_tags:
                    tag.decompose()
+                # Find every <div> element at any depth (find_all searches recursively, not only the first level)
+                div_elements = soup.find_all('div')
+                # 逐个删除这些元素
+                for div in div_elements:
+                    div.extract()
+                # Remove every <span> element at any depth, not only first-level ones
+                span_element = soup.find_all('span')
+                for span in span_element:
+                    span.extract()
                html = soup.prettify().replace('amp;', '')
-                # log.debug(html)
-                # log.debug('11111')
-                # ------------------------------------
-                # 组装数据
+                print(html)
+                print("aaaaa")
+                # ------------------ content 过滤 end--------------
+
+                # --------------- 组装数据 start---------------------
                obj = {
                    "title": element_title.text,
                    "content": html,
                    "link": browser_current_url,
-                    "reptileTime": str(int(time.time()))
+                    "reptileTime": str(int(time.time())),
+                    "type": content_type,
+                    "author": element_author.text
                }
+                # --------------- 组装数据 end---------------------

-                # ------------------------------------------------------
-                data.append(obj)
-                # # 使用正则表达式进行匹配
-                # matches = re.findall(search_word, element_title.text)
-                # # 打印匹配结果
-                # if matches:
-                #     # log.debug(f"找到了匹配的字符串：{matches}")
-                #     data.append(obj)
-                # else:
-                #     log.debug("未找到匹配的字符串")
-                # ------------------------------------------------------
-
+                # data.append(obj)
+                # 使用正则表达式进行匹配
+                matches = re.findall(search_word, element_title.text)
+                # Keep the record only when the title matches the search word; otherwise just log it
+                if matches:
+                    # log.debug(f"找到了匹配的字符串：{matches}")
+                    data.append(obj)
+                else:
+                    log.debug("未找到匹配的字符串")

                # 浏览器返回上一页
                browser.back()
@@ -185,11 +209,6 @@ def convert_to_traditional(simplified_text):
    return traditional_text


-# 全局变量
-data = []
-table_name = "pms_ptt"
-
-
 def main():
    # 请求关键词
    response = getReptileTask()
@@ -208,4 +227,8 @@ def main():
        # upload_control()


+# 全局变量
+data = []
+table_name = "pms_ptt"
+# 调用main函数
 main()