Commit 653b114a authored by liyang

feat:youtube debug

parent c4a794b8
...@@ -60,7 +60,7 @@ def reptile(browser=None, search_word=""): ...@@ -60,7 +60,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间 # 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数 # # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(timestamp) new_releaseTime = int(timestamp)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end: if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目 # 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue continue
...@@ -180,7 +180,10 @@ def reptile(browser=None, search_word=""): ...@@ -180,7 +180,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间 # 爬取时间
"reptileTime": data[0]["reptileTime"], "reptileTime": data[0]["reptileTime"],
# 本地路径 # 本地路径
"localPath": local_path "localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
} }
state_save = save_json(os.path.join(file_dir, "task.json"), task) state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
...@@ -221,8 +224,9 @@ def main(): ...@@ -221,8 +224,9 @@ def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
global status_task global status_task
global filter_time_start global beginFiltrationTime
global filter_time_end global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200: if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success") log.debug("call success")
search_word = "" search_word = ""
...@@ -231,8 +235,9 @@ def main(): ...@@ -231,8 +235,9 @@ def main():
search_word = item['keyword'] search_word = item['keyword']
table_name = item['tableName'] table_name = item['tableName']
status_task = int(item["status"]) status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"]) keyword = str(item["keyword"])
filter_time_end = int(item["endFiltrationTime"]) beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体 # 简体转繁体
if status_task == 0 and len(search_word) > 0: if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word)) reptile(None, convert_to_traditional(search_word))
...@@ -250,10 +255,14 @@ data = [] ...@@ -250,10 +255,14 @@ data = []
# 任务详情 # 任务详情
task = {} task = {}
table_name = "pms_dcard" table_name = "pms_dcard"
# 全局字段
keyword = ""
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) beginFiltrationTime = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) endFiltrationTime = int(123)
# 文件根目录 # 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称 # 任务目录名称
......
...@@ -88,7 +88,7 @@ def reptile(browser=None, search_word=""): ...@@ -88,7 +88,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间 # 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数 # # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(release_time) new_releaseTime = int(release_time)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end: if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目 # 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue continue
...@@ -179,7 +179,10 @@ def reptile(browser=None, search_word=""): ...@@ -179,7 +179,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间 # 爬取时间
"reptileTime": data[0]["reptileTime"], "reptileTime": data[0]["reptileTime"],
# 本地路径 # 本地路径
"localPath": local_path "localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
} }
state_save = save_json(os.path.join(file_dir, "task.json"), task) state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
...@@ -220,8 +223,9 @@ def main(): ...@@ -220,8 +223,9 @@ def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
global status_task global status_task
global filter_time_start global beginFiltrationTime
global filter_time_end global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200: if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success") log.debug("call success")
search_word = "" search_word = ""
...@@ -230,8 +234,9 @@ def main(): ...@@ -230,8 +234,9 @@ def main():
search_word = item['keyword'] search_word = item['keyword']
table_name = item['tableName'] table_name = item['tableName']
status_task = int(item["status"]) status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"]) keyword = str(item["keyword"])
filter_time_end = int(item["endFiltrationTime"]) beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体 # 简体转繁体
if status_task == 0 and len(search_word) > 0: if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word)) reptile(None, convert_to_traditional(search_word))
...@@ -249,10 +254,14 @@ data = [] ...@@ -249,10 +254,14 @@ data = []
# 任务详情 # 任务详情
task = {} task = {}
table_name = "pms_facebook" table_name = "pms_facebook"
# 全局字段
keyword = ""
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) beginFiltrationTime = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) endFiltrationTime = int(123)
# 文件根目录 # 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称 # 任务目录名称
......
...@@ -87,7 +87,7 @@ def reptile(browser=None, search_word=""): ...@@ -87,7 +87,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间 # 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数 # # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(timestamp) new_releaseTime = int(timestamp)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end: if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目 # 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue continue
...@@ -170,7 +170,10 @@ def reptile(browser=None, search_word=""): ...@@ -170,7 +170,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间 # 爬取时间
"reptileTime": data[0]["reptileTime"], "reptileTime": data[0]["reptileTime"],
# 本地路径 # 本地路径
"localPath": local_path "localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
} }
state_save = save_json(os.path.join(file_dir, "task.json"), task) state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
...@@ -210,8 +213,9 @@ def main(): ...@@ -210,8 +213,9 @@ def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
global status_task global status_task
global filter_time_start global beginFiltrationTime
global filter_time_end global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200: if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success") log.debug("call success")
search_word = "" search_word = ""
...@@ -220,8 +224,9 @@ def main(): ...@@ -220,8 +224,9 @@ def main():
search_word = item['keyword'] search_word = item['keyword']
table_name = item['tableName'] table_name = item['tableName']
status_task = int(item["status"]) status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"]) keyword = str(item["keyword"])
filter_time_end = int(item["endFiltrationTime"]) beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体 # 简体转繁体
if status_task == 0 and len(search_word) > 0: if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word)) reptile(None, convert_to_traditional(search_word))
...@@ -239,10 +244,14 @@ data = [] ...@@ -239,10 +244,14 @@ data = []
# 任务详情 # 任务详情
task = {} task = {}
table_name = "pms_instagram" table_name = "pms_instagram"
# 全局字段
keyword = ""
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) beginFiltrationTime = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) endFiltrationTime = int(123)
# 文件根目录 # 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称 # 任务目录名称
......
...@@ -190,7 +190,7 @@ def reptile(browser=None, search_word=""): ...@@ -190,7 +190,7 @@ def reptile(browser=None, search_word=""):
release_time = int(date_time.timestamp()) release_time = int(date_time.timestamp())
# 过滤时间 # 过滤时间
if filter_time_start <= release_time <= filter_time_end: if beginFiltrationTime <= release_time <= endFiltrationTime:
# --------------- 组装数据 start--------------------- # --------------- 组装数据 start---------------------
obj = { obj = {
"title": element_title.text, "title": element_title.text,
...@@ -230,7 +230,10 @@ def reptile(browser=None, search_word=""): ...@@ -230,7 +230,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间 # 爬取时间
"reptileTime": data[0]["reptileTime"], "reptileTime": data[0]["reptileTime"],
# 本地路径 # 本地路径
"localPath": local_path "localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
} }
state_save = save_json(os.path.join(file_dir, "task.json"), task) state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
...@@ -263,7 +266,6 @@ def script_close(browser): ...@@ -263,7 +266,6 @@ def script_close(browser):
print("sys.exit() 执行失败") print("sys.exit() 执行失败")
def main(): def main():
""" """
...@@ -271,8 +273,9 @@ def main(): ...@@ -271,8 +273,9 @@ def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
global status_task global status_task
global filter_time_start global beginFiltrationTime
global filter_time_end global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200: if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success") log.debug("call success")
search_word = "" search_word = ""
...@@ -281,8 +284,9 @@ def main(): ...@@ -281,8 +284,9 @@ def main():
search_word = item['keyword'] search_word = item['keyword']
table_name = item['tableName'] table_name = item['tableName']
status_task = int(item["status"]) status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"]) keyword = str(item["keyword"])
filter_time_end = int(item["endFiltrationTime"]) beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体 # 简体转繁体
if status_task == 0 and len(search_word) > 0: if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word)) reptile(None, convert_to_traditional(search_word))
...@@ -300,10 +304,14 @@ data = [] ...@@ -300,10 +304,14 @@ data = []
# 任务详情 # 任务详情
task = {} task = {}
table_name = "pms_ptt" table_name = "pms_ptt"
# 全局字段
keyword = ""
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) beginFiltrationTime = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) endFiltrationTime = int(123)
# 文件根目录 # 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称 # 任务目录名称
......
...@@ -95,7 +95,7 @@ def reptile(browser=None, search_word=""): ...@@ -95,7 +95,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间 # 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数 # # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(timestamp) new_releaseTime = int(timestamp)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end: if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目 # 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue continue
...@@ -200,7 +200,10 @@ def reptile(browser=None, search_word=""): ...@@ -200,7 +200,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间 # 爬取时间
"reptileTime": data[0]["reptileTime"], "reptileTime": data[0]["reptileTime"],
# 本地路径 # 本地路径
"localPath": local_path "localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
} }
state_save = save_json(os.path.join(file_dir, "task.json"), task) state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
...@@ -241,8 +244,9 @@ def main(): ...@@ -241,8 +244,9 @@ def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
global status_task global status_task
global filter_time_start global beginFiltrationTime
global filter_time_end global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200: if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success") log.debug("call success")
search_word = "" search_word = ""
...@@ -250,9 +254,10 @@ def main(): ...@@ -250,9 +254,10 @@ def main():
if item['name'] == 'twitter': if item['name'] == 'twitter':
search_word = item['keyword'] search_word = item['keyword']
table_name = item['tableName'] table_name = item['tableName']
keyword = str(item["keyword"])
status_task = int(item["status"]) status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"]) beginFiltrationTime = int(item["beginFiltrationTime"])
filter_time_end = int(item["endFiltrationTime"]) endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体 # 简体转繁体
if status_task == 0 and len(search_word) > 0: if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word)) reptile(None, convert_to_traditional(search_word))
...@@ -270,10 +275,14 @@ data = [] ...@@ -270,10 +275,14 @@ data = []
# 任务详情 # 任务详情
task = {} task = {}
table_name = "pms_twitter" table_name = "pms_twitter"
# 全局字段
keyword = ""
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) beginFiltrationTime = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) endFiltrationTime = int(123)
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称 # 任务目录名称
local_path_name = str(int(time.time())) local_path_name = str(int(time.time()))
......
...@@ -73,7 +73,7 @@ def reptile(browser=None, search_word=""): ...@@ -73,7 +73,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间 # 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数 # # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(releaseTime) new_releaseTime = int(releaseTime)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end: if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目 # 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue continue
...@@ -111,7 +111,10 @@ def reptile(browser=None, search_word=""): ...@@ -111,7 +111,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间 # 爬取时间
"reptileTime": data[0]["reptileTime"], "reptileTime": data[0]["reptileTime"],
# 本地路径 # 本地路径
"localPath": local_path "localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
} }
state_save = save_json(os.path.join(file_dir, "task.json"), task) state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save: if state_save:
...@@ -151,8 +154,9 @@ def main(): ...@@ -151,8 +154,9 @@ def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
global status_task global status_task
global filter_time_start global beginFiltrationTime
global filter_time_end global endFiltrationTime
global keyword
# print(response) # print(response)
if response['status_code'] == 200 and response['data']['code'] == 200: if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success") log.debug("call success")
...@@ -162,8 +166,9 @@ def main(): ...@@ -162,8 +166,9 @@ def main():
search_word = item['keyword'] search_word = item['keyword']
table_name = item['tableName'] table_name = item['tableName']
status_task = int(item["status"]) status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"]) keyword = str(item["keyword"])
filter_time_end = int(item["endFiltrationTime"]) beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体 # 简体转繁体
if status_task == 0 and len(search_word) > 0: if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word)) reptile(None, convert_to_traditional(search_word))
...@@ -181,10 +186,14 @@ data = [] ...@@ -181,10 +186,14 @@ data = []
# 任务详情 # 任务详情
task = {} task = {}
table_name = "pms_youtube" table_name = "pms_youtube"
# 全局字段
keyword = ""
# 过滤时间开始 # 过滤时间开始
filter_time_start = int(123) beginFiltrationTime = int(123)
# 过滤时间结束 # 过滤时间结束
filter_time_end = int(123) endFiltrationTime = int(123)
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}' file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称 # 任务目录名称
local_path_name = str(int(time.time())) local_path_name = str(int(time.time()))
......
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") import io
# log.debug(classify_item_list) import json
length = len(classify_item_list) import re
for index in range(length): import sys
# 暂时先爬取 第2个 分类 import time
if 0 <= index < 4: import loguru
type_title = classify_item_list[index].text # import pymysql.cursors
classify_item_list[index].click() import requests
time.sleep(0.1) from bs4 import BeautifulSoup
for index_two in range(length_two): from datetime import datetime
print(element_list[index_two].text) from api.index import importJson, getReptileTask, importJsonPath
# 浏览器返回上一页 from utils.Logger import log
browser.back() from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
if index == 0: # from requests_toolbelt import *
browser.back() from utils.createBrowserDriver import create
time.sleep(0.1) import opencc
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") from utils.filse import save_json
\ No newline at end of file import os
from config.settings import get_base_file_url
from utils.download_image import download_image
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
爬取流程:创建驱动--》打开浏览器--》打开网页--》爬取分类元素--》循环点击--》爬取热门帖子标题--》循环点击--》爬取帖子详情
'''
def reptile(browser=None, search_word=""):
    """Open the skynet.ipplus360.com query page and print its page source.

    Debug helper: loads a fixed URL in a browser, prints a ``------``
    separator line and then ``browser.page_source`` to stdout.

    Args:
        browser: An already-created browser driver to reuse. When falsy, a
            new one is created via ``create(no_headless=False,
            using_user_data=True)`` and released again before returning.
        search_word: Unused here; kept so the signature matches the other
            crawler scripts' ``reptile`` entry points.

    Returns:
        None.
    """
    url = "https://skynet.ipplus360.com/q.html"
    # Remember whether we created the driver: a caller-supplied browser
    # belongs to the caller and must not be closed behind its back.
    owns_browser = not browser
    browser = browser or create(no_headless=False, using_user_data=True)
    try:
        # Open the page
        browser.get(url)
        print("------")
        print(browser.page_source)
    finally:
        if owns_browser:
            # NOTE(review): assumes create() returns a Selenium WebDriver
            # (exposing quit()) -- confirm against utils.createBrowserDriver.
            browser.quit()
reptile()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment