fix: 爬取数据入库

992c7fbb · liyang · 0c402bd4 · 992c7fbb · 992c7fbb · 992c7fbb
Commit 992c7fbb authored Jul 11, 2023 by liyang
Show whitespace changes
Inline Side-by-side

Showing with 62 additions and 29 deletions

app.log app.log +25 -0

pc_ptt.py pc_ptt.py +27 -29

.gitkeep user_data/.gitkeep +0 -0

createBrowserDriver.py utils/createBrowserDriver.py +10 -0

No files found.
--- a/app.log
+++ b/app.log
@@ -282,3 +282,28 @@
 2023-07-10 20:07:10,800  DEBUG  pc_ptt.py : reptile  [line: 157]  -----------------------------
 2023-07-10 20:07:10,800  DEBUG  pc_ptt.py : reptile  [line: 160]  文件保存成功
 2023-07-10 20:07:10,801  DEBUG  pc_ptt.py : reptile  [line: 162]  文件绝对路径：/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688990830.json
+2023-07-10 20:53:30,796  DEBUG  pc_ptt.py : main  [line: 191]  call success
+2023-07-10 20:53:37,765  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688991749.A.198.html
+2023-07-10 20:53:40,542  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688991994.A.D59.html
+2023-07-10 20:53:40,583  DEBUG  pc_ptt.py : reptile  [line: 127]  未找到匹配的字符串
+2023-07-10 20:53:42,508  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.4CE.html
+2023-07-10 20:53:42,546  DEBUG  pc_ptt.py : reptile  [line: 127]  未找到匹配的字符串
+2023-07-10 20:53:44,395  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.797.html
+2023-07-10 20:53:44,433  DEBUG  pc_ptt.py : reptile  [line: 127]  未找到匹配的字符串
+2023-07-10 20:53:45,741  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688992089.A.50C.html
+2023-07-10 20:53:47,085  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688992768.A.CC7.html
+2023-07-10 20:53:48,390  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688993164.A.DB1.html
+2023-07-10 20:53:48,428  DEBUG  pc_ptt.py : reptile  [line: 127]  未找到匹配的字符串
+2023-07-10 20:53:49,690  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688993204.A.E97.html
+2023-07-10 20:53:49,727  DEBUG  pc_ptt.py : reptile  [line: 127]  未找到匹配的字符串
+2023-07-10 20:53:51,003  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688993233.A.0E7.html
+2023-07-10 20:53:51,041  DEBUG  pc_ptt.py : reptile  [line: 127]  未找到匹配的字符串
+2023-07-10 20:53:52,275  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1688993306.A.8D6.html
+2023-07-10 20:53:52,314  DEBUG  pc_ptt.py : reptile  [line: 127]  未找到匹配的字符串
+2023-07-10 20:53:53,630  DEBUG  pc_ptt.py : reptile  [line: 88]  网页链接https://www.ptt.cc/bbs/Baseball/M.1634212242.A.283.html
+2023-07-10 20:53:54,783  DEBUG  pc_ptt.py : reptile  [line: 153]  /Users/macosx/Desktop/项目文档/网络资产管理系统
+2023-07-10 20:53:54,784  DEBUG  filse.py : save_json  [line: 8]  ------save json start--------
+2023-07-10 20:53:54,784  DEBUG  filse.py : save_json  [line: 14]  文件保存路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
+2023-07-10 20:53:54,786  DEBUG  pc_ptt.py : reptile  [line: 157]  -----------------------------
+2023-07-10 20:53:54,786  DEBUG  pc_ptt.py : reptile  [line: 160]  save file success
+2023-07-10 20:53:54,786  DEBUG  pc_ptt.py : reptile  [line: 162]  file_path：/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
--- a/pc_ptt.py
+++ b/pc_ptt.py
@@ -60,7 +60,8 @@ def write_to_database(data):

 def reptile(browser=None, search_word=""):
    url = "https://www.ptt.cc/bbs/hotboards.html"
-    browser = browser or create(['--headless'])
+    # browser = browser or create(['--headless'])
+    browser = browser or create()
    # time.sleep(1)
    # 打开网页
    browser.get(url)
@@ -70,6 +71,7 @@ def reptile(browser=None, search_word=""):
    for index in range(length):
        if 0 < index < 2:
            classify_item_list[index].click()
+            # if index==0:
            time.sleep(1)
            element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
            length_two = len(element_list)
@@ -91,6 +93,7 @@ def reptile(browser=None, search_word=""):
                    element_title = browser.find_element('xpath',
                                                         "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                except:
+                    log.error("xpath 找不到元素：//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                    # 浏览器返回上一页
                    browser.back()
                    element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
@@ -135,23 +138,10 @@ def reptile(browser=None, search_word=""):
            classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")

    # 发送爬取数据到java服务
-    def upload_control():
-        # 定义表单数据
-        form_data = {
-            "tableName": "pms_ptt",
-        }
-        # file = io.BytesIO(json.dumps(data).encode())
-        # response = importJson(file, form_data)
-        # if response['status_code'] == 200 and response['data']['code'] == 200:
-        #     log.debug("调用成功")
-        # else:
-        #     log.debug("调用失败")
-        # upload_control()
-
-    # upload_control()
+    if len(data) > 0:
        # 保存json文件到本地
        log.debug(os.path.abspath("../"))
-    file_dir = f'{os.path.join(os.path.abspath("../"),"network-assets-reptile","reptile_data","ptt",str(int(time.time()))+".json")}'
+        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "ptt", str(int(time.time())) + ".json")}'
        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
        state_save = save_json(file_dir, data)
        log.debug("-----------------------------")
@@ -167,6 +157,14 @@ def reptile(browser=None, search_word=""):
            response = importJsonPath(form_data)
        else:
            log.debug('save file failed')
+    else:
+        # 爬取数据为空
+        form_data = {
+            "path": "",
+            "tableName": table_name
+        }
+        response = importJsonPath(form_data)
+
    # 关闭浏览器驱动
    # time.sleep(3)
    browser.quit()

--- a/user_data/.gitkeep
+++ b/user_data/.gitkeep
--- a/utils/createBrowserDriver.py
+++ b/utils/createBrowserDriver.py
+import os.path
 import sys
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
@@ -16,6 +17,14 @@ def create(option=None):
    if option is not None:
        for value in option:
            chrome_options.add_argument(value)
+
+    # 启用浏览器的持久性会话，可以保存登录状态和Cookie
+    user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
+    script = f'--user-data-dir={user_data_dir}'
+    print(script)
+    log.debug(script)
+    chrome_options.add_argument(script)  # 设置一个自定义的用户配置文件路径
+
    if sys.platform.startswith('linux'):
        # print("当前系统是 Linux")
        # linux下运行记得加上这些参数 ----------------------------
@@ -36,4 +45,5 @@ def create(option=None):
        chrome_options.add_argument('--no-sandbox')  # 禁用沙盒模式
        # 创建浏览器驱动对象
        browser = webdriver.Chrome(options=chrome_options)
+
    return browser