Commit 653b114a authored by liyang

feat:youtube debug

parent c4a794b8
......@@ -60,7 +60,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(timestamp)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end:
if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
......@@ -180,7 +180,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
"localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save:
......@@ -221,8 +224,9 @@ def main():
# 请求关键词
response = getReptileTask()
global status_task
global filter_time_start
global filter_time_end
global beginFiltrationTime
global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success")
search_word = ""
......@@ -231,8 +235,9 @@ def main():
search_word = item['keyword']
table_name = item['tableName']
status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"])
filter_time_end = int(item["endFiltrationTime"])
keyword = str(item["keyword"])
beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体
if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word))
......@@ -250,10 +255,14 @@ data = []
# 任务详情
task = {}
table_name = "pms_dcard"
# 全局字段
keyword = ""
# 过滤时间开始
filter_time_start = int(123)
beginFiltrationTime = int(123)
# 过滤时间结束
filter_time_end = int(123)
endFiltrationTime = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
......
......@@ -88,7 +88,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(release_time)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end:
if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
......@@ -179,7 +179,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
"localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save:
......@@ -220,8 +223,9 @@ def main():
# 请求关键词
response = getReptileTask()
global status_task
global filter_time_start
global filter_time_end
global beginFiltrationTime
global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success")
search_word = ""
......@@ -230,8 +234,9 @@ def main():
search_word = item['keyword']
table_name = item['tableName']
status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"])
filter_time_end = int(item["endFiltrationTime"])
keyword = str(item["keyword"])
beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体
if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word))
......@@ -249,10 +254,14 @@ data = []
# 任务详情
task = {}
table_name = "pms_facebook"
# 全局字段
keyword = ""
# 过滤时间开始
filter_time_start = int(123)
beginFiltrationTime = int(123)
# 过滤时间结束
filter_time_end = int(123)
endFiltrationTime = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
......
......@@ -87,7 +87,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(timestamp)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end:
if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
......@@ -170,7 +170,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
"localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save:
......@@ -210,8 +213,9 @@ def main():
# 请求关键词
response = getReptileTask()
global status_task
global filter_time_start
global filter_time_end
global beginFiltrationTime
global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success")
search_word = ""
......@@ -220,8 +224,9 @@ def main():
search_word = item['keyword']
table_name = item['tableName']
status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"])
filter_time_end = int(item["endFiltrationTime"])
keyword = str(item["keyword"])
beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体
if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word))
......@@ -239,10 +244,14 @@ data = []
# 任务详情
task = {}
table_name = "pms_instagram"
# 全局字段
keyword = ""
# 过滤时间开始
filter_time_start = int(123)
beginFiltrationTime = int(123)
# 过滤时间结束
filter_time_end = int(123)
endFiltrationTime = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
......
......@@ -190,7 +190,7 @@ def reptile(browser=None, search_word=""):
release_time = int(date_time.timestamp())
# 过滤时间
if filter_time_start <= release_time <= filter_time_end:
if beginFiltrationTime <= release_time <= endFiltrationTime:
# --------------- 组装数据 start---------------------
obj = {
"title": element_title.text,
......@@ -230,7 +230,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
"localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save:
......@@ -263,7 +266,6 @@ def script_close(browser):
print("sys.exit() 执行失败")
def main():
"""
......@@ -271,8 +273,9 @@ def main():
# 请求关键词
response = getReptileTask()
global status_task
global filter_time_start
global filter_time_end
global beginFiltrationTime
global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success")
search_word = ""
......@@ -281,8 +284,9 @@ def main():
search_word = item['keyword']
table_name = item['tableName']
status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"])
filter_time_end = int(item["endFiltrationTime"])
keyword = str(item["keyword"])
beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体
if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word))
......@@ -300,10 +304,14 @@ data = []
# 任务详情
task = {}
table_name = "pms_ptt"
# 全局字段
keyword = ""
# 过滤时间开始
filter_time_start = int(123)
beginFiltrationTime = int(123)
# 过滤时间结束
filter_time_end = int(123)
endFiltrationTime = int(123)
# 文件根目录
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
......
......@@ -95,7 +95,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(timestamp)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end:
if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
......@@ -200,7 +200,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
"localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save:
......@@ -241,8 +244,9 @@ def main():
# 请求关键词
response = getReptileTask()
global status_task
global filter_time_start
global filter_time_end
global beginFiltrationTime
global endFiltrationTime
global keyword
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success")
search_word = ""
......@@ -250,9 +254,10 @@ def main():
if item['name'] == 'twitter':
search_word = item['keyword']
table_name = item['tableName']
keyword = str(item["keyword"])
status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"])
filter_time_end = int(item["endFiltrationTime"])
beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体
if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word))
......@@ -270,10 +275,14 @@ data = []
# 任务详情
task = {}
table_name = "pms_twitter"
# 全局字段
keyword = ""
# 过滤时间开始
filter_time_start = int(123)
beginFiltrationTime = int(123)
# 过滤时间结束
filter_time_end = int(123)
endFiltrationTime = int(123)
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
......
......@@ -73,7 +73,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime = int(releaseTime)
if new_releaseTime < filter_time_start or new_releaseTime > filter_time_end:
if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
......@@ -111,7 +111,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime": data[0]["reptileTime"],
# 本地路径
"localPath": local_path
"localPath": local_path,
"beginFiltrationTime": beginFiltrationTime,
"endFiltrationTime": endFiltrationTime,
"keyword": keyword
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save:
......@@ -151,8 +154,9 @@ def main():
# 请求关键词
response = getReptileTask()
global status_task
global filter_time_start
global filter_time_end
global beginFiltrationTime
global endFiltrationTime
global keyword
# print(response)
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success")
......@@ -162,8 +166,9 @@ def main():
search_word = item['keyword']
table_name = item['tableName']
status_task = int(item["status"])
filter_time_start = int(item["beginFiltrationTime"])
filter_time_end = int(item["endFiltrationTime"])
keyword = str(item["keyword"])
beginFiltrationTime = int(item["beginFiltrationTime"])
endFiltrationTime = int(item["endFiltrationTime"])
# 简体转繁体
if status_task == 0 and len(search_word) > 0:
reptile(None, convert_to_traditional(search_word))
......@@ -181,10 +186,14 @@ data = []
# 任务详情
task = {}
table_name = "pms_youtube"
# 全局字段
keyword = ""
# 过滤时间开始
filter_time_start = int(123)
beginFiltrationTime = int(123)
# 过滤时间结束
filter_time_end = int(123)
endFiltrationTime = int(123)
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name = str(int(time.time()))
......
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# log.debug(classify_item_list)
length = len(classify_item_list)
for index in range(length):
# 暂时先爬取 第2个 分类
if 0 <= index < 4:
type_title = classify_item_list[index].text
classify_item_list[index].click()
time.sleep(0.1)
for index_two in range(length_two):
print(element_list[index_two].text)
# 浏览器返回上一页
browser.back()
if index == 0:
browser.back()
time.sleep(0.1)
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
\ No newline at end of file
import io
import json
import re
import sys
import time
import loguru
# import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
# from requests_toolbelt import *
from utils.createBrowserDriver import create
import opencc
from utils.filse import save_json
import os
from config.settings import get_base_file_url
from utils.download_image import download_image
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
爬取流程:创建驱动--》打开浏览器--》打开网页--》爬取分类元素--》循环点击--》爬取热门帖子标题--》循环点击--》爬取帖子详情
'''
def reptile(browser=None, search_word=""):
    """Open the skynet.ipplus360.com query page and print its HTML to stdout.

    Args:
        browser: An existing browser driver to reuse. When falsy, a new one is
            built with ``create(no_headless=False, using_user_data=True)``.
        search_word: Accepted to keep the same signature as the other
            ``reptile`` scrapers; this function does not read it.

    Returns:
        None. The page source is printed, not returned.
    """
    # Fall back to a freshly created driver when the caller supplied none.
    if not browser:
        browser = create(no_headless=False, using_user_data=True)

    target_url = "https://skynet.ipplus360.com/q.html"

    # Navigate to the page.
    browser.get(target_url)

    # Emit a separator line followed by the raw page HTML for inspection.
    print("------")
    print(browser.page_source)
# Module-level entry point: runs the scraper with its default arguments as soon
# as this file is executed. There is no ``__main__`` guard, so importing the
# module triggers the same call.
reptile()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment