Commit af6aac0d authored by liyang

feat: add a total field to task.json

parent 72c7ada4
@@ -183,7 +183,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
...
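In each crawler, the task summary written to task.json now also records how many items the run produced. A minimal usage sketch, assuming a task.json written by save_json above (the path is illustrative; in the crawlers the file lives at os.path.join(file_dir, "task.json")):

import json

# Hypothetical consumer of the crawl summary; "total" is len(data) at save time.
with open("task.json", encoding="utf-8") as f:
    task = json.load(f)

print(f'{task["keyword"]}: {task["total"]} records')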
@@ -186,7 +186,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
...
@@ -173,7 +173,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
...
@@ -156,7 +156,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
...
@@ -43,7 +43,7 @@ def reptile(browser=None, search_word=""):
     for index, item_element in enumerate(classify_item_list):
         # For now, only crawl the 2nd category
-        if 0 <= index:
+        if 0 <= index <= 14:
             type_title = classify_item_list[index].text
             # Enter the category page
             classify_item_list[index].click()
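The tightened guard above caps the crawl at the first 15 categories (indexes 0 through 14). A minimal equivalent sketch, assuming classify_item_list holds the category elements fetched earlier; break replaces the per-iteration test, since enumerate never yields a negative index:

for index, item_element in enumerate(classify_item_list):
    if index > 14:  # same effect as 0 <= index <= 14
        break
    type_title = classify_item_list[index].text
    classify_item_list[index].click()  # enter the category page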
@@ -233,7 +233,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
...
@@ -214,7 +214,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
...
@@ -114,7 +114,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
...
-import io
-import json
-import re
-import sys
-import time
-import loguru
-# import pymysql.cursors
-import requests
-from bs4 import BeautifulSoup
-from datetime import datetime
-from api.index import importJson, getReptileTask, importJsonPath
-from utils.Logger import log
-from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
-# from requests_toolbelt import *
-from utils.createBrowserDriver import create
-import opencc
-from utils.filse import save_json
-import os
-from config.settings import get_base_file_url
-from utils.download_image import download_image
-# --------------- selenium dependencies start ----------------
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-
-# --------------- selenium dependencies end ----------------
-'''
-Crawl hot posts from Taiwan's PTT forum, including each post's title and content (text, images, video).
-
-Crawl flow: create the driver --> open the browser --> open the page --> crawl the category elements --> click through them --> crawl the hot post titles --> click through them --> crawl the post details
-'''
-
-
-def reptile(browser=None, search_word=""):
-    url = "https://skynet.ipplus360.com/q.html"
-    browser = browser or create(no_headless=False, using_user_data=True)
-    # Run in headed mode
-    # browser = browser or create()
-    # Open the page
-    browser.get(url)
-    print("------")
-    print(browser.page_source)
-    # log.debug("Browser opened")
-    # classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
-
-reptile()
\ No newline at end of file
+import mysql.connector
+import json
+
+# Connect to the database
+db_config = {
+    "host": "8.142.151.250",
+    "user": "script",
+    "password": "changfA123$",
+    "database": "network_assets"
+}
+
+connection = mysql.connector.connect(**db_config)
+cursor = connection.cursor()
+
+# Raise group_concat_max_len so the returned string can be up to 1,000,000 characters
+cursor.execute("SET SESSION group_concat_max_len = 1000000")
+
+# Execute the SQL query
+sql_query = """
+SELECT
+    country_code AS countryCode,
+    CONCAT(
+        '{"countryCode": "', REPLACE(country_code, '"', '\\"'), '", ',
+        '"ASInfoList": [',
+        GROUP_CONCAT(
+            CONCAT(
+                '{"topology": false, "ASType": "', REPLACE(type, '"', '\\"'),
+                '", "linkedNumber": ', connect_degree,
+                ', "ASNumber": ', as_number,
+                ', "ASDegrees": ', transmit_degree,
+                ', "countryCode": "', REPLACE(country_code, '"', '\\"'), '"}'
+            )
+            SEPARATOR ', '
+        ),
+        '], ',
+        '"countryName": "', REPLACE(country, '"', '\\"'), '"}'
+    ) AS result
+FROM as_info
+GROUP BY country_code, country
+"""
+
+cursor.execute(sql_query)
+query_result = cursor.fetchall()
+
+# Close the database connection
+cursor.close()
+connection.close()
+
+# Convert the query result into correctly formatted data
+formatted_result = []
+for row in query_result:
+    country_code = row[0]
+    result_data = row[1]
+    # GROUP_CONCAT results may come back as bytes; decode to str before parsing
+    if isinstance(result_data, (bytes, bytearray)):
+        result_data = result_data.decode("utf-8")
+    # # Build the JSON data
+    # json_data = {
+    #     "countryCode": country_code,
+    #     "ASInfoList": result_data
+    # }
+    data = json.loads(result_data)
+    formatted_result.append(data)
+
+# Export the result to a JSON file
+output_file_path = "./output.json"
+with open(output_file_path, "w", encoding="utf-8") as json_file:
+    json.dump(formatted_result, json_file, indent=4, ensure_ascii=False)
+print(f"Query result exported to {output_file_path}")