Commit 3a027fb6 authored by liyang's avatar liyang

feat:调试数据入库

parent 8708ce28
reptile_data/files
reptile_data/ptt/*.json
*.zip
*.json
.DS_Store
venv
reptile_data
*.idea
\ No newline at end of file
......@@ -11,8 +11,23 @@ def importJson(file, form_data):
# 将响应内容解析为 JSON 格式
return {"status_code": response.status_code, "data": response.json()}
def getReptileTask():
    """Fetch the crawler task configuration list from the Java backend.

    Returns:
        dict with the HTTP ``status_code`` and the response body parsed
        as JSON under ``data``.
    """
    url = baser_url + "crawlerSetting/list"
    resp = requests.get(url, headers=headers)
    # Hand back both the HTTP status and the decoded JSON payload
    return {"status_code": resp.status_code, "data": resp.json()}
def importJsonPath(form_data):
    """POST a JSON payload telling the backend where a saved JSON file lives.

    Args:
        form_data: dict payload (expected keys such as ``path`` and
            ``tableName`` — TODO confirm against the caller).

    Returns:
        dict with the HTTP ``status_code`` and the response body parsed
        as JSON under ``data``.
    """
    headers = {"Content-Type": "application/json"}
    http_url = baser_url + "importJson/importJsonPath"
    # Bug fix: the original passed a dict via ``data=``, which requests
    # form-encodes — contradicting the declared application/json content
    # type. ``json=`` serializes the body as real JSON.
    response = requests.post(http_url, headers=headers, json=form_data)
    # Parse the response body as JSON
    return {"status_code": response.status_code, "data": response.json()}
def runingPython(form_data):
    """Ask the Java backend to start a Python crawler process.

    Args:
        form_data: dict payload describing which script to start
            (exact schema not visible here — TODO confirm).

    Returns:
        dict with the HTTP ``status_code`` and the response body parsed
        as JSON under ``data``.
    """
    headers = {"Content-Type": "application/json"}
    http_url = baser_url + "python/startPy"
    # Bug fix: a dict given via ``data=`` is form-encoded by requests,
    # which contradicts the application/json header above. ``json=``
    # sends a genuine JSON body.
    response = requests.post(http_url, headers=headers, json=form_data)
    # Parse the response body as JSON
    return {"status_code": response.status_code, "data": response.json()}
......@@ -13,6 +13,8 @@ from utils.Logger import log
# from requests_toolbelt import *
from utils.createBrowserDriver import create
import opencc
from utils.filse import save_json
import os
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
......@@ -20,8 +22,6 @@ import opencc
爬取流程:创建驱动--》打开浏览器--》打开网页--》爬取分类元素--》循环点击--》爬取热门帖子标题--》循环点击--》爬取帖子详情
'''
data = []
def write_to_database(data):
# 连接到数据库
......@@ -74,14 +74,27 @@ def reptile(browser=None, search_word=""):
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list)
for index_two in range(length_two):
# 标题不包含"公告"
# 使用正则表达式进行匹配
# matches = re.findall("公告", element_list[index_two].text)
# log.debug(element_list[index_two].text+str(matches))
# 打印匹配结果
# if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
element_list[index_two].click()
time.sleep(1)
# 原链接
browser_current_url = browser.current_url
log.debug('网页链接' + str(browser_current_url))
try:
# 获取帖子详情
element_title = browser.find_element('xpath',
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
except:
# 浏览器返回上一页
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
break
# 内容可能包含图片和视频,需要后处理
element_content = browser.find_element('xpath', "//div[@id='main-content']")
# 去除herf属性值包含'img'的a标签
......@@ -122,56 +135,66 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务
# 保存json文件到本地
file_dir = f'./reptile_data/ptt/{int(time.time())}'
# state_save = save_json(file_dir, data)
def upload_control():
# 定义表单数据
form_data = {
"tableName": "pms_ptt",
}
file = io.BytesIO(json.dumps(data).encode())
response = importJson(file, form_data)
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("调用成功")
else:
log.debug("调用失败")
# file = io.BytesIO(json.dumps(data).encode())
# response = importJson(file, form_data)
# if response['status_code'] == 200 and response['data']['code'] == 200:
# log.debug("调用成功")
# else:
# log.debug("调用失败")
# upload_control()
if len(data) == 0:
log.debug("数据为空")
else:
upload_control()
# upload_control()
# 保存json文件到本地
file_dir = f'./reptile_data/ptt/{int(time.time())}'
state_save = save_json(file_dir, data)
# write_to_database(data)
# if state_save:
# log.debug('文件保存成功')
# else:
# log.debug('文件保存失败')
if state_save:
log.debug('文件保存成功')
# log.debug('文件绝对路径:'+os.path.abspath(file_dir))
form_data = {
path: os.path.abspath(file_dir) + ".json",
tableName: table_name
}
response = importJsonPath()
else:
log.debug('文件保存失败')
# 关闭浏览器驱动
# time.sleep(3)
browser.quit()
response = getReptileTask()
def convert_to_traditional(simplified_text):
    """Convert simplified Chinese text to traditional Chinese.

    Args:
        simplified_text: input string in simplified Chinese.

    Returns:
        The same text rendered in traditional Chinese.
    """
    # OpenCC 's2t.json' profile performs simplified -> traditional conversion
    s2t = opencc.OpenCC('s2t.json')
    return s2t.convert(simplified_text)
if response['status_code'] == 200 and response['data']['code'] == 200:
# 全局变量
data = []
table_name = ""
def main():
    """Entry point: fetch the crawler task config, then run the PTT crawler.

    Queries the backend for the configured tasks; when the ``ptt`` entry is
    found, records its keyword and target table, then launches the crawl
    with the keyword converted to traditional Chinese.
    """
    # Bug fix: table_name is the module-level global read elsewhere in this
    # file; without the declaration the assignment below created a dead local.
    global table_name
    # 请求关键词
    response = getReptileTask()
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("调用成功")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'ptt':
                search_word = item['keyword']
                table_name = item['tableName']
        # print(convert_to_traditional(search_word))
        reptile(None, convert_to_traditional(search_word))
    # Bug fix: the original had two stacked ``else:`` clauses (a syntax
    # error); a single else covers the failed-call case.
    else:
        log.debug("调用失败")


main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment