Commit 3a027fb6 authored by liyang's avatar liyang

feat:调试数据入库

parent 8708ce28
reptile_data/files reptile_data/files
reptile_data/ptt/*.json
*.zip *.zip
*.json *.json
.DS_Store .DS_Store
venv venv
reptile_data
*.idea *.idea
\ No newline at end of file
...@@ -11,8 +11,23 @@ def importJson(file, form_data): ...@@ -11,8 +11,23 @@ def importJson(file, form_data):
# 将响应内容解析为 JSON 格式 # 将响应内容解析为 JSON 格式
return {"status_code": response.status_code, "data": response.json()} return {"status_code": response.status_code, "data": response.json()}
def getReptileTask(): def getReptileTask():
http_url = baser_url + "crawlerSetting/list" http_url = baser_url + "crawlerSetting/list"
response = requests.get(http_url, headers=headers) response = requests.get(http_url, headers=headers)
# 将响应内容解析为 JSON 格式 # 将响应内容解析为 JSON 格式
return {"status_code": response.status_code, "data": response.json()} return {"status_code": response.status_code, "data": response.json()}
\ No newline at end of file
def importJsonPath(form_data):
    """Tell the backend to import a previously saved JSON file by path.

    Args:
        form_data: JSON-serializable payload for the import endpoint
            (presumably keys like "path" and "tableName" — confirm
            against the backend API).

    Returns:
        dict with "status_code" (HTTP status code) and "data"
        (response body parsed as JSON).
    """
    headers = {"Content-Type": "application/json"}
    http_url = baser_url + "importJson/importJsonPath"
    # BUG FIX: the original passed the dict via `data=`, which requests
    # sends form-urlencoded even though the header claims JSON. `json=`
    # serializes the payload as an actual JSON body.
    response = requests.post(http_url, headers=headers, json=form_data)
    # Parse the response body as JSON.
    return {"status_code": response.status_code, "data": response.json()}
def runingPython(form_data):
    """Ask the backend to start a Python job (`python/startPy`).

    Note: the misspelled name is kept — it is this function's public
    interface and may be referenced by callers elsewhere.

    Args:
        form_data: JSON-serializable payload describing the job to start.

    Returns:
        dict with "status_code" (HTTP status code) and "data"
        (response body parsed as JSON).
    """
    headers = {"Content-Type": "application/json"}
    http_url = baser_url + "python/startPy"
    # BUG FIX: `data=` with a dict sends a form-urlencoded body despite the
    # JSON Content-Type header; `json=` sends a real JSON body.
    response = requests.post(http_url, headers=headers, json=form_data)
    # Parse the response body as JSON.
    return {"status_code": response.status_code, "data": response.json()}
...@@ -13,6 +13,8 @@ from utils.Logger import log ...@@ -13,6 +13,8 @@ from utils.Logger import log
# from requests_toolbelt import * # from requests_toolbelt import *
from utils.createBrowserDriver import create from utils.createBrowserDriver import create
import opencc import opencc
from utils.filse import save_json
import os
''' '''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】 爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
...@@ -20,8 +22,6 @@ import opencc ...@@ -20,8 +22,6 @@ import opencc
爬取流程:创建驱动--》打开浏览器--》打开网页--》爬取分类元素--》循环点击--》爬取热门帖子标题--》循环点击--》爬取帖子详情 爬取流程:创建驱动--》打开浏览器--》打开网页--》爬取分类元素--》循环点击--》爬取热门帖子标题--》循环点击--》爬取帖子详情
''' '''
data = []
def write_to_database(data): def write_to_database(data):
# 连接到数据库 # 连接到数据库
...@@ -74,14 +74,27 @@ def reptile(browser=None, search_word=""): ...@@ -74,14 +74,27 @@ def reptile(browser=None, search_word=""):
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a") element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list) length_two = len(element_list)
for index_two in range(length_two): for index_two in range(length_two):
# 标题不包含"公告"
# 使用正则表达式进行匹配
# matches = re.findall("公告", element_list[index_two].text)
# log.debug(element_list[index_two].text+str(matches))
# 打印匹配结果
# if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
element_list[index_two].click() element_list[index_two].click()
time.sleep(1) time.sleep(1)
# 原链接 # 原链接
browser_current_url = browser.current_url browser_current_url = browser.current_url
log.debug('网页链接' + str(browser_current_url)) log.debug('网页链接' + str(browser_current_url))
# 获取帖子详情 try:
element_title = browser.find_element('xpath', # 获取帖子详情
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']") element_title = browser.find_element('xpath',
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
except:
# 浏览器返回上一页
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
break
# 内容可能包含图片和视频,需要后处理 # 内容可能包含图片和视频,需要后处理
element_content = browser.find_element('xpath', "//div[@id='main-content']") element_content = browser.find_element('xpath', "//div[@id='main-content']")
# 去除herf属性值包含'img'的a标签 # 去除herf属性值包含'img'的a标签
...@@ -122,56 +135,66 @@ def reptile(browser=None, search_word=""): ...@@ -122,56 +135,66 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务 # 发送爬取数据到java服务
# 保存json文件到本地
file_dir = f'./reptile_data/ptt/{int(time.time())}'
# state_save = save_json(file_dir, data)
def upload_control(): def upload_control():
# 定义表单数据 # 定义表单数据
form_data = { form_data = {
"tableName": "pms_ptt", "tableName": "pms_ptt",
} }
file = io.BytesIO(json.dumps(data).encode()) # file = io.BytesIO(json.dumps(data).encode())
response = importJson(file, form_data) # response = importJson(file, form_data)
if response['status_code'] == 200 and response['data']['code'] == 200: # if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("调用成功") # log.debug("调用成功")
else: # else:
log.debug("调用失败") # log.debug("调用失败")
# upload_control() # upload_control()
if len(data) == 0: # upload_control()
log.debug("数据为空") # 保存json文件到本地
else: file_dir = f'./reptile_data/ptt/{int(time.time())}'
upload_control() state_save = save_json(file_dir, data)
# write_to_database(data) # write_to_database(data)
# if state_save: if state_save:
# log.debug('文件保存成功') log.debug('文件保存成功')
# else: # log.debug('文件绝对路径:'+os.path.abspath(file_dir))
# log.debug('文件保存失败') form_data = {
path: os.path.abspath(file_dir) + ".json",
tableName: table_name
}
response = importJsonPath()
else:
log.debug('文件保存失败')
# 关闭浏览器驱动 # 关闭浏览器驱动
# time.sleep(3) # time.sleep(3)
browser.quit() browser.quit()
response = getReptileTask()
def convert_to_traditional(simplified_text): def convert_to_traditional(simplified_text):
converter = opencc.OpenCC('s2t.json') # 创建简体中文到繁体中文的转换器 converter = opencc.OpenCC('s2t.json') # 创建简体中文到繁体中文的转换器
traditional_text = converter.convert(simplified_text) # 进行转换 traditional_text = converter.convert(simplified_text) # 进行转换
return traditional_text return traditional_text
if response['status_code'] == 200 and response['data']['code'] == 200: # 全局变量
log.debug("调用成功") data = []
search_word = "" table_name = ""
for item in response['data']['rows']:
if item['name'] == 'ptt':
search_word = item['keyword'] def main():
# print(convert_to_traditional(search_word)) # 请求关键词
reptile(None, convert_to_traditional(search_word)) response = getReptileTask()
else:
log.debug("调用失败") if response['status_code'] == 200 and response['data']['code'] == 200:
# upload_control() log.debug("调用成功")
search_word = ""
for item in response['data']['rows']:
if item['name'] == 'ptt':
search_word = item['keyword']
table_name = item['tableName']
# print(convert_to_traditional(search_word))
reptile(None, convert_to_traditional(search_word))
else:
log.debug("调用失败")
# upload_control()
main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment