Commit dae3fe3d authored by liyang's avatar liyang

feat: ptt debug

parent 27018e2d
......@@ -11,3 +11,12 @@
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-21 10:54:17,501 ERROR pc_ptt.py : reptile [line: 73] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:32,527 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:41,957 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:43,433 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:51:10,728 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:52:41,156 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:58:54,782 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:04,220 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:27,844 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 16:00:02,916 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
......@@ -8,6 +8,9 @@ import loguru
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from selenium.common import NoSuchElementException
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
......@@ -42,8 +45,8 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
for index, item_element in enumerate(classify_item_list):
# 暂时先爬取 第2个 分类
if 0 <= index <= 14:
# 只爬取综合分类
if 0 <= index < 1:
type_title = classify_item_list[index].text
# 进入分类页面
classify_item_list[index].click()
......@@ -59,40 +62,46 @@ def reptile(browser=None, search_word=""):
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
page = 1
# element_meta_list = browser.find_elements("xpath", "//div[@class='r-ent']//div[@class='meta']")
def process_data():
# 增加搜索
search_input = browser.find_element("xpath", "//div[@class='search-bar']//input")
if search_word != search_input.get_attribute("value"):
# 输入搜索关键词
search_input.send_keys(search_word)
# 点击输入框
# browser.execute_script("arguments[0].click();", search_input)
# 确认搜索关键词
# search_input.click()
search_input.submit()
# 程序睡眠300ms,等待页面加载完成
time.sleep(0.3)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
nonlocal page
print(f"当前为第:{page} 页,共 {len(element_list)} 条数据")
for index_two, item in enumerate(element_list):
# print(element_list[index_two].text)
try:
re.findall("公告", item.text)
except IndexError:
log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
print("当前连接:" + str(browser.current_url))
print(data[len(data) - 1]["title"])
# 使用正则表达式进行匹配关键词
if re.findall(search_word, item.text):
# log.debug(f"找到了匹配的字符串:{matches}")
error = ""
else:
# 本次迭代帖子标题未匹配关键词,退出本次迭代,进入下一次迭代
continue
# log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
# print("当前连接:" + str(browser.current_url))
# print(data[len(data) - 1]["title"])
# 标题不包含"公告"和"看板"
if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
if re.findall("公告", element_list[index_two].text) or re.findall("看板",
element_list[index_two].text):
a = 1
else:
# 使用正则表达式进行匹配
# matches =
# log.debug(element_list[index_two].text+str(matches))
# 打印匹配结果
# if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
element_list[index_two].click()
time.sleep(0.1)
time.sleep(0.2)
# 原链接
browser_current_url = browser.current_url
# print(browser_current_url)
# log.debug('网页链接' + str(browser_current_url))
try:
# 获取帖子详情
element_title = browser.find_element('xpath',
......@@ -104,7 +113,8 @@ def reptile(browser=None, search_word=""):
log.debug(f'页面链接:{browser_current_url}')
# 浏览器返回上一页
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
element_list = browser.find_elements('xpath',
"//div[@class='r-ent']//div[@class='title']//a")
break
# 内容可能包含图片和视频,需要后处理
element_content = browser.find_element('xpath', "//div[@id='main-content']")
......@@ -183,6 +193,12 @@ def reptile(browser=None, search_word=""):
# ------------------ content 过滤 end--------------
date_string = element_release.text
# date_string = "Wed Aug 9 15:39:26 2023 //update 20934353"
# 提取日期字符串
if "//" in date_string:
date_string = date_string.split("//")[0]
date_string = date_string.strip()
date_format = "%a %b %d %H:%M:%S %Y"
# 将日期字符串转换为datetime对象
date_time = datetime.strptime(date_string, date_format)
......@@ -206,17 +222,28 @@ def reptile(browser=None, search_word=""):
data.append(obj)
# 浏览器返回上一页
browser.back()
time.sleep(0.1)
time.sleep(0.2)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
page = page + 1
# print("111111")
try:
prev_button = browser.find_element('xpath',
"//a[@class='btn wide' and text() = '‹ 上頁']")
prev_button.click()
time.sleep(0.3)
process_data()
except:
error = ""
# print("循环结束")
# 浏览器返回上一页
browser.back()
if index == 0:
browser.back()
time.sleep(0.1)
# 重新获取
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
process_data()
# # print("循环结束")
# # 浏览器返回上一页
# browser.back()
# if index == 0:
# browser.back()
# time.sleep(0.1)
# # 重新获取
# classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务
# print('----------------------')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment