Commit d0b2cdfa authored by liyang's avatar liyang

feat:任务爬取数据存放在独立目录

parent 602bca1c
...@@ -6,7 +6,7 @@ from utils.createBrowserDriver import create ...@@ -6,7 +6,7 @@ from utils.createBrowserDriver import create
from utils.filse import save_json from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string, \ from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string, \
is_base64_image, save_base64_image, get_screen_resolution is_base64_image, save_base64_image, get_screen_resolution,create_directory_if_not_exists, delete_directory
# from pytube import YouTube # from pytube import YouTube
from selenium.common.exceptions import NoSuchElementException from selenium.common.exceptions import NoSuchElementException
import os import os
...@@ -115,9 +115,9 @@ def reptile(browser=None, search_word=""): ...@@ -115,9 +115,9 @@ def reptile(browser=None, search_word=""):
# 下载图片至本地,替换标签中的src # 下载图片至本地,替换标签中的src
id = str(int(time.time())) id = str(int(time.time()))
# 下载地址 # 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}' download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
# 访问地址 # 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg' access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
if is_base64_image(element['src']) == False: if is_base64_image(element['src']) == False:
log.debug("图片属于 url") log.debug("图片属于 url")
# 下载状态 # 下载状态
...@@ -173,8 +173,16 @@ def reptile(browser=None, search_word=""): ...@@ -173,8 +173,16 @@ def reptile(browser=None, search_word=""):
# print(data) # print(data)
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) json_path = os.path.join(local_path, "data.json")
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data) state_save = save_json(json_path, data)
# 保存task
task = {
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
log.debug('save file success') log.debug('save file success')
else: else:
...@@ -183,6 +191,8 @@ def reptile(browser=None, search_word=""): ...@@ -183,6 +191,8 @@ def reptile(browser=None, search_word=""):
else: else:
# 爬取数据为空 # 爬取数据为空
log.info("未爬取到数据") log.info("未爬取到数据")
# 删除目录
delete_directory(local_path)
script_close(browser) script_close(browser)
...@@ -193,7 +203,15 @@ def script_close(browser): ...@@ -193,7 +203,15 @@ def script_close(browser):
browser.quit() browser.quit()
except: except:
log.debug("浏览器驱动关闭失败") log.debug("浏览器驱动关闭失败")
try:
# 一些代码...
sys.exit() sys.exit()
except SystemExit:
raise # 重新抛出SystemExit异常,让脚本退出
except Exception as e:
# 异常处理代码...
print("sys.exit() 执行失败")
def main(): def main():
...@@ -229,13 +247,22 @@ def main(): ...@@ -229,13 +247,22 @@ def main():
# 全局变量 # 全局变量
data = [] data = []
# 任务详情
task = {}
table_name = "pms_dcard" table_name = "pms_dcard"
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) filter_time_start = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) filter_time_end = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
# 任务目录路径
local_path = f'{os.path.join(file_dir, local_path_name)}'
# 任务目录是否创建
local_path_status = create_directory_if_not_exists(local_path)
# 是否启用 # 是否启用
status_task = '0' status_task = 0
# 调用main函数 # 调用main函数
main() main()
...@@ -6,7 +6,8 @@ from utils.Logger import log ...@@ -6,7 +6,8 @@ from utils.Logger import log
from utils.createBrowserDriver import create from utils.createBrowserDriver import create
from utils.filse import save_json from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, \
parse_time_string,create_directory_if_not_exists, delete_directory
# from pytube import YouTube # from pytube import YouTube
from datetime import datetime from datetime import datetime
from utils.download_image import download_image from utils.download_image import download_image
...@@ -30,7 +31,7 @@ from selenium.webdriver.support import expected_conditions as EC ...@@ -30,7 +31,7 @@ from selenium.webdriver.support import expected_conditions as EC
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}") print(f"搜索词:{search_word}")
url = "https://www.facebook.com/" url = "https://www.facebook.com/"
browser = browser or create(no_headless=False, using_user_data=True) browser = browser or create(no_headless=False, using_user_data=False)
# 打开网页 # 打开网页
browser.get(url) browser.get(url)
time.sleep(2) time.sleep(2)
...@@ -50,8 +51,10 @@ def reptile(browser=None, search_word=""): ...@@ -50,8 +51,10 @@ def reptile(browser=None, search_word=""):
log.debug("facebook login complete") log.debug("facebook login complete")
url = f"https://www.facebook.com/search/top?q={search_word}" url = f"https://www.facebook.com/search/top?q={search_word}"
browser.get(url) browser.get(url)
# time.sleep(1)
# 使用 JavaScript 将网页滚动到底部 # 使用 JavaScript 将网页滚动到底部
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(1)
# 等待内容出现,设置最长等待时间为10秒 # 等待内容出现,设置最长等待时间为10秒
wait = WebDriverWait(browser, 10) wait = WebDriverWait(browser, 10)
# 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例 # 通过 expected_conditions 来定义等待条件,这里以弹窗内容的某个元素为例
...@@ -124,9 +127,9 @@ def reptile(browser=None, search_word=""): ...@@ -124,9 +127,9 @@ def reptile(browser=None, search_word=""):
# 下载图片至本地,替换标签中的src # 下载图片至本地,替换标签中的src
id = str(int(time.time())) id = str(int(time.time()))
# 下载地址 # 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}' download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
# 访问地址 # 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg' access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
# 下载状态 # 下载状态
status = download_image(element['src'], download_dir) status = download_image(element['src'], download_dir)
if status: if status:
...@@ -169,8 +172,16 @@ def reptile(browser=None, search_word=""): ...@@ -169,8 +172,16 @@ def reptile(browser=None, search_word=""):
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) json_path = os.path.join(local_path, "data.json")
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data) state_save = save_json(json_path, data)
# 保存task
task = {
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
log.debug('save file success') log.debug('save file success')
else: else:
...@@ -179,6 +190,8 @@ def reptile(browser=None, search_word=""): ...@@ -179,6 +190,8 @@ def reptile(browser=None, search_word=""):
else: else:
# 爬取数据为空 # 爬取数据为空
log.info("未爬取到数据") log.info("未爬取到数据")
# 删除目录
delete_directory(local_path)
script_close(browser) script_close(browser)
...@@ -189,7 +202,15 @@ def script_close(browser): ...@@ -189,7 +202,15 @@ def script_close(browser):
browser.quit() browser.quit()
except: except:
log.debug("浏览器驱动关闭失败") log.debug("浏览器驱动关闭失败")
try:
# 一些代码...
sys.exit() sys.exit()
except SystemExit:
raise # 重新抛出SystemExit异常,让脚本退出
except Exception as e:
# 异常处理代码...
print("sys.exit() 执行失败")
def main(): def main():
...@@ -225,11 +246,22 @@ def main(): ...@@ -225,11 +246,22 @@ def main():
# 全局变量 # 全局变量
data = [] data = []
# 任务详情
task = {}
table_name = "pms_facebook" table_name = "pms_facebook"
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) filter_time_start = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) filter_time_end = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
# 任务目录路径
local_path = f'{os.path.join(file_dir, local_path_name)}'
# 任务目录是否创建
local_path_status = create_directory_if_not_exists(local_path)
# 是否启用
status_task = 0
# 调用main函数 # 调用main函数
main() main()
...@@ -5,7 +5,8 @@ from utils.Logger import log ...@@ -5,7 +5,8 @@ from utils.Logger import log
from utils.createBrowserDriver import create from utils.createBrowserDriver import create
from utils.filse import save_json from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, \
parse_twitter_time_string,create_directory_if_not_exists, delete_directory
# from pytube import YouTube # from pytube import YouTube
from config.settings import get_account from config.settings import get_account
import os import os
...@@ -120,12 +121,13 @@ def reptile(browser=None, search_word=""): ...@@ -120,12 +121,13 @@ def reptile(browser=None, search_word=""):
img_soup = a_soup.find("img") img_soup = a_soup.find("img")
img_soup["style"] = "width:100%" img_soup["style"] = "width:100%"
src = img_soup["src"] src = img_soup["src"]
str_list = link_str.split("/") str_list = link_str.split("/")
img_id = str_list[len(str_list) - 2] img_id = str_list[len(str_list) - 2]
# 下载地址 # 下载地址
download_dir = f'{os.path.join(file_dir, f"{img_id}.jpg")}' download_dir = f'{os.path.join(local_path, f"{img_id}.jpg")}'
# 访问地址 # 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{img_id}.jpg' access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{img_id}.jpg'
# 下载状态 # 下载状态
status = download_image(src, download_dir) status = download_image(src, download_dir)
if status: if status:
...@@ -161,8 +163,16 @@ def reptile(browser=None, search_word=""): ...@@ -161,8 +163,16 @@ def reptile(browser=None, search_word=""):
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) json_path = os.path.join(local_path, "data.json")
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data) state_save = save_json(json_path, data)
# 保存task
task = {
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
log.debug('save file success') log.debug('save file success')
else: else:
...@@ -171,6 +181,8 @@ def reptile(browser=None, search_word=""): ...@@ -171,6 +181,8 @@ def reptile(browser=None, search_word=""):
else: else:
# 爬取数据为空 # 爬取数据为空
log.info("未爬取到数据") log.info("未爬取到数据")
# 删除目录
delete_directory(local_path)
script_close(browser) script_close(browser)
...@@ -224,13 +236,22 @@ def main(): ...@@ -224,13 +236,22 @@ def main():
# 全局变量 # 全局变量
data = [] data = []
# 任务详情
task = {}
table_name = "pms_instagram" table_name = "pms_instagram"
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) filter_time_start = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) filter_time_end = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
# 任务目录路径
local_path = f'{os.path.join(file_dir, local_path_name)}'
# 任务目录是否创建
local_path_status = create_directory_if_not_exists(local_path)
# 是否启用 # 是否启用
status_task = '0' status_task = 0
# 调用main函数 # 调用main函数
main() main()
...@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup ...@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
from datetime import datetime from datetime import datetime
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log from utils.Logger import log
from utils.index import convert_to_traditional from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
# from requests_toolbelt import * # from requests_toolbelt import *
from utils.createBrowserDriver import create from utils.createBrowserDriver import create
import opencc import opencc
...@@ -155,9 +155,9 @@ def reptile(browser=None, search_word=""): ...@@ -155,9 +155,9 @@ def reptile(browser=None, search_word=""):
# 下载图片至本地,替换标签中的src # 下载图片至本地,替换标签中的src
id = str(int(time.time())) id = str(int(time.time()))
# 下载地址 # 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}' download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
# 访问地址 # 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg' access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
# 下载状态 # 下载状态
status = download_image(element['src'], download_dir) status = download_image(element['src'], download_dir)
if status: if status:
...@@ -223,8 +223,16 @@ def reptile(browser=None, search_word=""): ...@@ -223,8 +223,16 @@ def reptile(browser=None, search_word=""):
# print(data) # print(data)
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) json_path = os.path.join(local_path, "data.json")
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data) state_save = save_json(json_path, data)
# 保存task
task = {
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
log.debug('save file success') log.debug('save file success')
else: else:
...@@ -233,6 +241,8 @@ def reptile(browser=None, search_word=""): ...@@ -233,6 +241,8 @@ def reptile(browser=None, search_word=""):
else: else:
# 爬取数据为空 # 爬取数据为空
log.info("未爬取到数据") log.info("未爬取到数据")
# 删除目录
delete_directory(local_path)
script_close(browser) script_close(browser)
...@@ -243,7 +253,15 @@ def script_close(browser): ...@@ -243,7 +253,15 @@ def script_close(browser):
browser.quit() browser.quit()
except: except:
log.debug("浏览器驱动关闭失败") log.debug("浏览器驱动关闭失败")
try:
# 一些代码...
sys.exit() sys.exit()
except SystemExit:
raise # 重新抛出SystemExit异常,让脚本退出
except Exception as e:
# 异常处理代码...
print("sys.exit() 执行失败")
def main(): def main():
...@@ -279,13 +297,22 @@ def main(): ...@@ -279,13 +297,22 @@ def main():
# 全局变量 # 全局变量
data = [] data = []
# 任务详情
task = {}
table_name = "pms_ptt" table_name = "pms_ptt"
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) filter_time_start = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) filter_time_end = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
# 任务目录路径
local_path = f'{os.path.join(file_dir, local_path_name)}'
# 任务目录是否创建
local_path_status = create_directory_if_not_exists(local_path)
# 是否启用 # 是否启用
status_task = '0' status_task = 0
# 调用main函数 # 调用main函数
main() main()
...@@ -5,7 +5,8 @@ from utils.Logger import log ...@@ -5,7 +5,8 @@ from utils.Logger import log
from utils.createBrowserDriver import create from utils.createBrowserDriver import create
from utils.filse import save_json from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string,extract_image_format from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string,\
extract_image_format,create_directory_if_not_exists, delete_directory
# from pytube import YouTube # from pytube import YouTube
import os import os
import sys import sys
...@@ -140,9 +141,9 @@ def reptile(browser=None, search_word=""): ...@@ -140,9 +141,9 @@ def reptile(browser=None, search_word=""):
id = str(int(time.time())) id = str(int(time.time()))
image_type = extract_image_format(element['src']) image_type = extract_image_format(element['src'])
# 下载地址 # 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.{image_type}")}' download_dir = f'{os.path.join(local_path, f"{id}.{image_type}")}'
# 访问地址 # 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.{image_type}' access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.{image_type}'
# 下载状态 # 下载状态
status = download_image(element['src'], download_dir) status = download_image(element['src'], download_dir)
if status: if status:
...@@ -192,8 +193,16 @@ def reptile(browser=None, search_word=""): ...@@ -192,8 +193,16 @@ def reptile(browser=None, search_word=""):
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) json_path = os.path.join(local_path, "data.json")
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data) state_save = save_json(json_path, data)
# 保存task
task = {
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
log.debug('save file success') log.debug('save file success')
else: else:
...@@ -202,6 +211,8 @@ def reptile(browser=None, search_word=""): ...@@ -202,6 +211,8 @@ def reptile(browser=None, search_word=""):
else: else:
# 爬取数据为空 # 爬取数据为空
log.info("未爬取到数据") log.info("未爬取到数据")
# 删除目录
delete_directory(local_path)
script_close(browser) script_close(browser)
...@@ -212,7 +223,15 @@ def script_close(browser): ...@@ -212,7 +223,15 @@ def script_close(browser):
browser.quit() browser.quit()
except: except:
log.debug("浏览器驱动关闭失败") log.debug("浏览器驱动关闭失败")
try:
# 一些代码...
sys.exit() sys.exit()
except SystemExit:
raise # 重新抛出SystemExit异常,让脚本退出
except Exception as e:
# 异常处理代码...
print("sys.exit() 执行失败")
def main(): def main():
...@@ -248,13 +267,21 @@ def main(): ...@@ -248,13 +267,21 @@ def main():
# 全局变量 # 全局变量
data = [] data = []
# 任务详情
task = {}
table_name = "pms_twitter" table_name = "pms_twitter"
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) filter_time_start = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) filter_time_end = int(123)
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
# 任务目录路径
local_path = f'{os.path.join(file_dir, local_path_name)}'
# 任务目录是否创建
local_path_status = create_directory_if_not_exists(local_path)
# 是否启用 # 是否启用
status_task = '0' status_task = 0
# 调用main函数 # 调用main函数
main() main()
...@@ -6,7 +6,7 @@ from utils.Logger import log ...@@ -6,7 +6,7 @@ from utils.Logger import log
from utils.createBrowserDriver import create from utils.createBrowserDriver import create
from utils.filse import save_json from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time,create_directory_if_not_exists, delete_directory
from pytube import YouTube from pytube import YouTube
from datetime import datetime from datetime import datetime
import os import os
...@@ -78,9 +78,9 @@ def reptile(browser=None, search_word=""): ...@@ -78,9 +78,9 @@ def reptile(browser=None, search_word=""):
video_url = [] video_url = []
# 下载地址 # 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.mp4")}' download_dir = f'{os.path.join(local_path, f"{id}.mp4")}'
# 访问地址 # 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.mp4' access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.mp4'
# 下载视频 # 下载视频
state_download = yt_dlp_download(url, 'youtube') state_download = yt_dlp_download(url, 'youtube')
video_url.append(download_dir) video_url.append(download_dir)
...@@ -103,8 +103,16 @@ def reptile(browser=None, search_word=""): ...@@ -103,8 +103,16 @@ def reptile(browser=None, search_word=""):
error = "" error = ""
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) json_path = os.path.join(local_path, "data.json")
state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data) state_save = save_json(json_path, data)
# 保存task
task = {
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
log.debug('save file success') log.debug('save file success')
else: else:
...@@ -113,6 +121,8 @@ def reptile(browser=None, search_word=""): ...@@ -113,6 +121,8 @@ def reptile(browser=None, search_word=""):
else: else:
# 爬取数据为空 # 爬取数据为空
log.info("未爬取到数据") log.info("未爬取到数据")
# 删除目录
delete_directory(local_path)
script_close(browser) script_close(browser)
...@@ -123,7 +133,15 @@ def script_close(browser): ...@@ -123,7 +133,15 @@ def script_close(browser):
browser.quit() browser.quit()
except: except:
log.debug("浏览器驱动关闭失败") log.debug("浏览器驱动关闭失败")
try:
# 一些代码...
sys.exit() sys.exit()
except SystemExit:
raise # 重新抛出SystemExit异常,让脚本退出
except Exception as e:
# 异常处理代码...
print("sys.exit() 执行失败")
def main(): def main():
...@@ -160,13 +178,21 @@ def main(): ...@@ -160,13 +178,21 @@ def main():
# 全局变量 # 全局变量
data = [] data = []
# 任务详情
task = {}
table_name = "pms_youtube" table_name = "pms_youtube"
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) filter_time_start = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) filter_time_end = int(123)
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
# 任务目录路径
local_path = f'{os.path.join(file_dir, local_path_name)}'
# 任务目录是否创建
local_path_status = create_directory_if_not_exists(local_path)
# 是否启用 # 是否启用
status_task = '0' status_task = 0
# 调用main函数 # 调用main函数
main() main()
import os.path import os.path
import re import re
import shutil
import time import time
from hanziconv import HanziConv from hanziconv import HanziConv
...@@ -77,6 +78,7 @@ def is_base64_image(url): ...@@ -77,6 +78,7 @@ def is_base64_image(url):
# 转换 facebook 的时间 # 转换 facebook 的时间
def parse_time_string(time_str): def parse_time_string(time_str):
""" """
转换 facebook 的时间
:param time_str: :param time_str:
:return: :return:
...@@ -108,6 +110,7 @@ def parse_time_string(time_str): ...@@ -108,6 +110,7 @@ def parse_time_string(time_str):
# 转换 youtube 的时间 # 转换 youtube 的时间
def convert_string_to_time(string): def convert_string_to_time(string):
""" """
转换 youtube 的时间
:param string: :param string:
:return: :return:
...@@ -137,6 +140,7 @@ def convert_string_to_time(string): ...@@ -137,6 +140,7 @@ def convert_string_to_time(string):
# 转换 twitter 的时间 # 转换 twitter 的时间
def parse_twitter_time_string(time_str): def parse_twitter_time_string(time_str):
""" """
转换 twitter 的时间
:param time_str: :param time_str:
:return: :return:
...@@ -212,3 +216,36 @@ def yt_dlp_download(url, name): ...@@ -212,3 +216,36 @@ def yt_dlp_download(url, name):
# 命令执行失败,输出错误信息 # 命令执行失败,输出错误信息
# log.debug(str(result.stderr)) # log.debug(str(result.stderr))
return False return False
def create_directory_if_not_exists(directory_path):
    """Ensure that *directory_path* exists as a directory.

    Creates the directory (including any missing parent directories) if it
    does not already exist. Uses ``os.makedirs(..., exist_ok=True)`` instead
    of a check-then-create sequence, which avoids the TOCTOU race where the
    directory could appear or disappear between the check and the creation.

    :param directory_path: path of the directory to create
    :return: True if the directory exists (or was created), False on failure
             (e.g. permission denied, or the path exists but is a file)
    """
    try:
        os.makedirs(directory_path, exist_ok=True)
        return True
    except OSError:
        # Covers permission errors and FileExistsError raised when the
        # path already exists as a non-directory.
        return False
def delete_directory(directory_path):
    """Recursively delete *directory_path* and all of its contents.

    :param directory_path: path of the directory tree to remove
    :return: True if the tree was removed, False if deletion failed
             (e.g. the path does not exist or permissions are insufficient)
    """
    try:
        shutil.rmtree(directory_path)
    except OSError:
        return False
    return True
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment