Commit 992c7fbb authored by liyang

fix:爬取数据入库

parent 0c402bd4
...@@ -282,3 +282,28 @@ ...@@ -282,3 +282,28 @@
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 157] ----------------------------- 2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 157] -----------------------------
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 160] 文件保存成功 2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 160] 文件保存成功
2023-07-10 20:07:10,801 DEBUG pc_ptt.py : reptile [line: 162] 文件绝对路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688990830.json 2023-07-10 20:07:10,801 DEBUG pc_ptt.py : reptile [line: 162] 文件绝对路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688990830.json
2023-07-10 20:53:30,796 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-10 20:53:37,765 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991749.A.198.html
2023-07-10 20:53:40,542 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991994.A.D59.html
2023-07-10 20:53:40,583 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:42,508 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.4CE.html
2023-07-10 20:53:42,546 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:44,395 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.797.html
2023-07-10 20:53:44,433 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:45,741 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688992089.A.50C.html
2023-07-10 20:53:47,085 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688992768.A.CC7.html
2023-07-10 20:53:48,390 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993164.A.DB1.html
2023-07-10 20:53:48,428 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:49,690 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993204.A.E97.html
2023-07-10 20:53:49,727 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:51,003 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993233.A.0E7.html
2023-07-10 20:53:51,041 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:52,275 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993306.A.8D6.html
2023-07-10 20:53:52,314 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:53,630 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1634212242.A.283.html
2023-07-10 20:53:54,783 DEBUG pc_ptt.py : reptile [line: 153] /Users/macosx/Desktop/项目文档/网络资产管理系统
2023-07-10 20:53:54,784 DEBUG filse.py : save_json [line: 8] ------save json start--------
2023-07-10 20:53:54,784 DEBUG filse.py : save_json [line: 14] 文件保存路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 157] -----------------------------
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 160] save file success
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 162] file_path:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
...@@ -60,7 +60,8 @@ def write_to_database(data): ...@@ -60,7 +60,8 @@ def write_to_database(data):
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html" url = "https://www.ptt.cc/bbs/hotboards.html"
browser = browser or create(['--headless']) # browser = browser or create(['--headless'])
browser = browser or create()
# time.sleep(1) # time.sleep(1)
# 打开网页 # 打开网页
browser.get(url) browser.get(url)
...@@ -70,6 +71,7 @@ def reptile(browser=None, search_word=""): ...@@ -70,6 +71,7 @@ def reptile(browser=None, search_word=""):
for index in range(length): for index in range(length):
if 0 < index < 2: if 0 < index < 2:
classify_item_list[index].click() classify_item_list[index].click()
# if index==0:
time.sleep(1) time.sleep(1)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a") element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list) length_two = len(element_list)
...@@ -91,6 +93,7 @@ def reptile(browser=None, search_word=""): ...@@ -91,6 +93,7 @@ def reptile(browser=None, search_word=""):
element_title = browser.find_element('xpath', element_title = browser.find_element('xpath',
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']") "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
except: except:
log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
# 浏览器返回上一页 # 浏览器返回上一页
browser.back() browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a") element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
...@@ -135,38 +138,33 @@ def reptile(browser=None, search_word=""): ...@@ -135,38 +138,33 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务 # 发送爬取数据到java服务
def upload_control(): if len(data) > 0:
# 定义表单数据 # 保存json文件到本地
form_data = { log.debug(os.path.abspath("../"))
"tableName": "pms_ptt", file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "ptt", str(int(time.time())) + ".json")}'
} # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
# file = io.BytesIO(json.dumps(data).encode()) state_save = save_json(file_dir, data)
# response = importJson(file, form_data) log.debug("-----------------------------")
# if response['status_code'] == 200 and response['data']['code'] == 200: # write_to_database(data)
# log.debug("调用成功") if state_save:
# else: log.debug('save file success')
# log.debug("调用失败") # path = os.path.abspath(file_dir).join(file_dir).join(".json")
# upload_control() log.debug('file_path:' + file_dir)
form_data = {
# upload_control() "path": file_dir,
# 保存json文件到本地 "tableName": table_name
log.debug(os.path.abspath("../")) }
file_dir = f'{os.path.join(os.path.abspath("../"),"network-assets-reptile","reptile_data","ptt",str(int(time.time()))+".json")}' response = importJsonPath(form_data)
# file_dir = f'./reptile_data/ptt/{int(time.time())}.json' else:
state_save = save_json(file_dir, data) log.debug('save file failed')
log.debug("-----------------------------") else:
# write_to_database(data) # 爬取数据为空
if state_save:
log.debug('save file success')
# path = os.path.abspath(file_dir).join(file_dir).join(".json")
log.debug('file_path:' + file_dir)
form_data = { form_data = {
"path": file_dir, "path": "",
"tableName": table_name "tableName": table_name
} }
response = importJsonPath(form_data) response = importJsonPath(form_data)
else:
log.debug('save file failed')
# 关闭浏览器驱动 # 关闭浏览器驱动
# time.sleep(3) # time.sleep(3)
browser.quit() browser.quit()
......
import os.path
import sys import sys
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
...@@ -16,6 +17,14 @@ def create(option=None): ...@@ -16,6 +17,14 @@ def create(option=None):
if option is not None: if option is not None:
for value in option: for value in option:
chrome_options.add_argument(value) chrome_options.add_argument(value)
# 启用浏览器的持久性会话,可以保存登录状态和Cookie
user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
script = f'--user-data-dir={user_data_dir}'
print(script)
log.debug(script)
chrome_options.add_argument(script) # 设置一个自定义的用户配置文件路径
if sys.platform.startswith('linux'): if sys.platform.startswith('linux'):
# print("当前系统是 Linux") # print("当前系统是 Linux")
# linux下运行记得加上这些参数 ---------------------------- # linux下运行记得加上这些参数 ----------------------------
...@@ -36,4 +45,5 @@ def create(option=None): ...@@ -36,4 +45,5 @@ def create(option=None):
chrome_options.add_argument('--no-sandbox') # 禁用沙盒模式 chrome_options.add_argument('--no-sandbox') # 禁用沙盒模式
# 创建浏览器驱动对象 # 创建浏览器驱动对象
browser = webdriver.Chrome(options=chrome_options) browser = webdriver.Chrome(options=chrome_options)
return browser return browser
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment