Commit af6aac0d authored by liyang

feat: add a total field to task.json

parent 72c7ada4
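The commit adds a "total" field, holding the number of scraped records, to the task.json metadata that each of the crawlers below writes. A minimal sketch of how a consumer might read the new field, assuming the layout shown in the hunks that follow (the path here is illustrative, not from the original code):

import json
import os

# Hypothetical location; the real path comes from file_dir in each crawler.
task_path = os.path.join("./output", "task.json")
with open(task_path, "r", encoding="utf-8") as f:
    task = json.load(f)
print(task["keyword"], task["total"])  # search keyword and number of scraped records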
@@ -183,7 +183,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
@@ -186,7 +186,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
@@ -173,7 +173,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
@@ -156,7 +156,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
@@ -43,7 +43,7 @@ def reptile(browser=None, search_word=""):
     for index, item_element in enumerate(classify_item_list):
         # For now, crawl only the 2nd category
-        if 0 <= index:
+        if 0 <= index <= 14:
             type_title = classify_item_list[index].text
             # Enter the category page
             classify_item_list[index].click()
@@ -233,7 +233,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
@@ -214,7 +214,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
@@ -114,7 +114,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
import io
import mysql.connector
import json
import re
import sys
import time
import loguru
# import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
# from requests_toolbelt import *
from utils.createBrowserDriver import create
import opencc
from utils.filse import save_json
import os
from config.settings import get_base_file_url
from utils.download_image import download_image
# --------------- selenium dependencies start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium dependencies end ----------------
'''
Crawl the hot posts of Taiwan's PTT forum, including each post's title and content (text, images, video).
Crawl flow: create the driver -> open the browser -> open the page -> crawl the category elements -> click through them in a loop -> crawl the hot post titles -> click through them in a loop -> crawl each post's details
'''
def reptile(browser=None, search_word=""):
url = "https://skynet.ipplus360.com/q.html"
browser = browser or create(no_headless=False, using_user_data=True)
# Run in headed mode
# browser = browser or create()
# Open the page
browser.get(url)
print("------")
print(browser.page_source)
# log.debug("Browser opened")
# classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
reptile()
\ No newline at end of file
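The docstring above describes a multi-step crawl flow, but this file only opens a single page and prints its source. Below is a minimal, self-contained sketch of the category-iteration step, assuming a PTT-style page where each category is a div with class board-class (the XPath mirrors the commented-out line above; the entry URL and the limit of 15 are illustrative, not from the original code):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("https://www.ptt.cc/bbs/hotboards.html")  # illustrative entry page
# Wait until the category elements are present, then read the first few titles.
wait = WebDriverWait(browser, 10)
classify_item_list = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, "//div[@class='board-class']"))
)
for index, item_element in enumerate(classify_item_list[:15]):
    print(index, item_element.text)  # a real crawler would click into each category here
browser.quit()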
# Connect to the database
db_config = {
"host": "8.142.151.250",
"user": "script",
"password": "changfA123$",
"database": "network_assets"
}
connection = mysql.connector.connect(**db_config)
cursor = connection.cursor()
# Set group_concat_max_len so the returned string can be up to 1,000,000 characters
cursor.execute("SET SESSION group_concat_max_len = 1000000")
# Execute the SQL query
sql_query = """
SELECT
country_code AS countryCode,
CONCAT(
'{"countryCode": "', REPLACE(country_code, '"', '\\"'), '", ',
'"ASInfoList": [',
GROUP_CONCAT(
CONCAT(
'{"topology": false, "ASType": "', REPLACE(type, '"', '\\"'),
'", "linkedNumber": ', connect_degree,
', "ASNumber": ', as_number,
', "ASDegrees": ', transmit_degree,
', "countryCode": "', REPLACE(country_code, '"', '\\"'), '"}'
)
SEPARATOR ', '
),
'], ',
'"countryName": "', REPLACE(country, '"', '\\"'), '"}'
) AS result
FROM as_info
GROUP BY country_code, country
"""
cursor.execute(sql_query)
query_result = cursor.fetchall()
# Close the database connection
cursor.close()
connection.close()
# Convert the query result into correctly formatted data
formatted_result = []
for row in query_result:
country_code = row[0]
result_data = row[1]
# Convert non-string types to strings
if isinstance(result_data, (list, tuple)):
result_data = [str(item) for item in result_data]
# # Build the JSON data
# json_data = {
# "countryCode": country_code,
# "ASInfoList": result_data
# }
data = json.loads(result_data)
formatted_result.append(data)
# Export the results to a JSON file
output_file_path = "./output.json"
with open(output_file_path, "w", encoding="utf-8") as json_file:
json.dump(formatted_result, json_file, indent=4, ensure_ascii=False)
print(f"查询结果已导出到 {output_file_path}")