Commit 992c7fbb authored by liyang

fix:爬取数据入库

parent 0c402bd4
......@@ -282,3 +282,28 @@
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 157] -----------------------------
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 160] 文件保存成功
2023-07-10 20:07:10,801 DEBUG pc_ptt.py : reptile [line: 162] 文件绝对路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688990830.json
2023-07-10 20:53:30,796 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-10 20:53:37,765 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991749.A.198.html
2023-07-10 20:53:40,542 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991994.A.D59.html
2023-07-10 20:53:40,583 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:42,508 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.4CE.html
2023-07-10 20:53:42,546 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:44,395 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.797.html
2023-07-10 20:53:44,433 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:45,741 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688992089.A.50C.html
2023-07-10 20:53:47,085 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688992768.A.CC7.html
2023-07-10 20:53:48,390 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993164.A.DB1.html
2023-07-10 20:53:48,428 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:49,690 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993204.A.E97.html
2023-07-10 20:53:49,727 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:51,003 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993233.A.0E7.html
2023-07-10 20:53:51,041 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:52,275 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993306.A.8D6.html
2023-07-10 20:53:52,314 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:53,630 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1634212242.A.283.html
2023-07-10 20:53:54,783 DEBUG pc_ptt.py : reptile [line: 153] /Users/macosx/Desktop/项目文档/网络资产管理系统
2023-07-10 20:53:54,784 DEBUG filse.py : save_json [line: 8] ------save json start--------
2023-07-10 20:53:54,784 DEBUG filse.py : save_json [line: 14] 文件保存路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 157] -----------------------------
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 160] save file success
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 162] file_path:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
......@@ -60,7 +60,8 @@ def write_to_database(data):
def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html"
browser = browser or create(['--headless'])
# browser = browser or create(['--headless'])
browser = browser or create()
# time.sleep(1)
# 打开网页
browser.get(url)
......@@ -70,6 +71,7 @@ def reptile(browser=None, search_word=""):
for index in range(length):
if 0 < index < 2:
classify_item_list[index].click()
# if index==0:
time.sleep(1)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list)
......@@ -91,6 +93,7 @@ def reptile(browser=None, search_word=""):
element_title = browser.find_element('xpath',
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
except:
log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
# 浏览器返回上一页
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
......@@ -135,23 +138,10 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务
def upload_control():
# 定义表单数据
form_data = {
"tableName": "pms_ptt",
}
# file = io.BytesIO(json.dumps(data).encode())
# response = importJson(file, form_data)
# if response['status_code'] == 200 and response['data']['code'] == 200:
# log.debug("调用成功")
# else:
# log.debug("调用失败")
# upload_control()
# upload_control()
if len(data) > 0:
# 保存json文件到本地
log.debug(os.path.abspath("../"))
file_dir = f'{os.path.join(os.path.abspath("../"),"network-assets-reptile","reptile_data","ptt",str(int(time.time()))+".json")}'
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "ptt", str(int(time.time())) + ".json")}'
# file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
state_save = save_json(file_dir, data)
log.debug("-----------------------------")
......@@ -167,6 +157,14 @@ def reptile(browser=None, search_word=""):
response = importJsonPath(form_data)
else:
log.debug('save file failed')
else:
# 爬取数据为空
form_data = {
"path": "",
"tableName": table_name
}
response = importJsonPath(form_data)
# 关闭浏览器驱动
# time.sleep(3)
browser.quit()
......
import os.path
import sys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
......@@ -16,6 +17,14 @@ def create(option=None):
if option is not None:
for value in option:
chrome_options.add_argument(value)
# 启用浏览器的持久性会话,可以保存登录状态和Cookie
user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
script = f'--user-data-dir={user_data_dir}'
print(script)
log.debug(script)
chrome_options.add_argument(script) # 设置一个自定义的用户配置文件路径
if sys.platform.startswith('linux'):
# print("当前系统是 Linux")
# linux下运行记得加上这些参数 ----------------------------
......@@ -36,4 +45,5 @@ def create(option=None):
chrome_options.add_argument('--no-sandbox') # 禁用沙盒模式
# 创建浏览器驱动对象
browser = webdriver.Chrome(options=chrome_options)
return browser
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment