import json
import os
import platform
import sys
import time
from datetime import datetime
from urllib.parse import parse_qs, quote_plus, urlparse

from bs4 import BeautifulSoup
from pytube import YouTube
# ---------------   selenium dependencies start ----------------
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# ---------------   selenium dependencies end ----------------

from utils.Logger import log
from utils.createBrowserDriver import create
from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time
from config.settings import get_base_file_url


# Only the first few search results are processed for now.
_MAX_RESULTS = 6
# Videos whose length is this many whole minutes or more are skipped.
_MAX_DURATION_MINUTES = 60


def _extract_video_id(link):
    """Return the ``v`` query parameter of a watch link, or ``None`` if absent."""
    if not link:
        return None
    values = parse_qs(urlparse(link).query).get("v")
    return values[0] if values else None


def _scrape_video(renderer):
    """Scrape and download one search result; return its record or ``None``.

    ``None`` means the result was skipped: no video id in its link, the video
    is too long, or the download reported failure.
    """
    author_element = renderer.find_element(
        "xpath", "./div[1]/div/div[2]//ytd-channel-name//yt-formatted-string/a")
    title_element = renderer.find_element(
        "xpath", ".//div[@id='title-wrapper']//a")
    time_element = renderer.find_element(
        "xpath", ".//ytd-video-meta-block//div[@id='metadata-line']/span[2]")

    title = title_element.get_attribute('title')
    link = title_element.get_attribute('href')
    video_id = _extract_video_id(link)
    if video_id is None:
        # Links without a "v" parameter cannot be turned into a watch URL.
        log.debug(f"skip result without video id: {link}")
        return None
    url = f'https://www.youtube.com/watch?v={video_id}'

    # pytube reports the length in seconds; compare in whole minutes.
    video_duration = int(YouTube(url).length) // 60
    if video_duration >= _MAX_DURATION_MINUTES:
        return None

    try:
        release_time = str(int(convert_string_to_time(time_element.text)))
    except Exception:
        # Fall back to "now" when the displayed publish time cannot be parsed.
        release_time = str(int(time.time()))

    # Local path recorded for the downloaded file.
    download_path = os.path.join(file_dir, f"{video_id}.mp4")
    # Public address the stored file is served from.
    access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{video_id}.mp4'

    if not yt_dlp_download(url, 'youtube'):
        return None

    return {
        "title": title,
        "content": f"<video controls style='width:100%' src='{access_address}'></video>",
        "videoUrl": download_path,
        "link": link,
        "reptileTime": str(int(time.time())),
        "type": '视频',
        "author": author_element.text,
        "releaseTime": release_time,
    }


def reptile(browser=None, search_word=""):
    """Search YouTube for *search_word*, download matching videos and save them.

    Records for downloaded videos are appended to the module-level ``data``
    list and, if any were collected, written to a timestamped JSON file under
    ``file_dir``. The function always finishes by calling ``script_close``,
    which closes the browser and exits the process.

    :param browser: an existing WebDriver; a new one is created when falsy.
    :param search_word: the search keyword; it is URL-encoded before use.
    :return: does not return normally (``script_close`` calls ``sys.exit``).
    """
    browser = browser or create(no_headless=True, using_user_data=False)
    print(f"搜索词:{search_word}")
    # Encode the keyword so spaces, '&', '#' and non-ASCII text stay inside
    # the search_query parameter instead of corrupting the URL.
    url = f'https://www.youtube.com/results?search_query={quote_plus(str(search_word))}'
    browser.get(url)

    if platform.system() == "Linux":
        # Fixed delay on Linux instead of an explicit wait.
        time.sleep(3)
    else:
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='contents']")))
    log.debug("youtube login complete")

    video_list = browser.find_elements('xpath', "//div[@id='contents']//ytd-video-renderer")
    # Later results were never used, so do not spend network calls on them.
    for renderer in video_list[:_MAX_RESULTS]:
        try:
            record = _scrape_video(renderer)
        except Exception as exc:
            # One broken result must not abort the crawl and leak the browser.
            log.debug(f"skip result after error: {exc}")
            continue
        if record is not None:
            data.append(record)

    if data:
        # Save the collected records to a local JSON file.
        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
        if state_save:
            log.debug('save file success')
        else:
            log.debug('save file failed')
    else:
        # Nothing was crawled.
        log.info("未爬取到数据")
    script_close(browser)
def script_close(browser):
    """Shut down the browser driver and terminate the process.

    ``browser.quit()`` is attempted even when ``browser.close()`` raises, so
    the driver session is still ended. Shutdown errors are logged rather than
    propagated.

    :param browser: the WebDriver instance to shut down.
    :raises SystemExit: always, via ``sys.exit()``.
    """
    try:
        try:
            browser.close()
        finally:
            # quit() must run even if closing the window failed.
            browser.quit()
    except Exception:
        log.debug("浏览器驱动关闭失败")
    sys.exit()
def main():
    """Fetch the crawl task list and run the YouTube crawl when it is enabled.

    The task named ``'youtube'`` supplies the keyword and the status that is
    stored in the module-level ``status_task``. The crawl runs when that
    status is ``0`` and the keyword is non-empty. If the task request fails,
    a crawl with a default keyword is run instead.
    """
    global status_task

    # Request the task keywords.
    response = getReptileTask()
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'youtube':
                search_word = item['keyword']
                status_task = int(item["status"])
                # NOTE(review): the original also assigned item['tableName']
                # to a local named table_name, which never reached the
                # module-level table_name and was otherwise unused. It is
                # dropped here; confirm whether the global (and file_dir,
                # which is derived from it) should follow the task instead.
        # The keyword is converted from Simplified to Traditional Chinese.
        # Truthiness also covers a missing (None) keyword.
        if status_task == 0 and search_word:
            reptile(None, convert_to_traditional(search_word))
        else:
            log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        # The request failed (e.g. timed out): crawl with a default keyword.
        reptile(None, convert_to_traditional("新闻"))


# Module-level state shared by reptile() and main().

# Records collected during this run.
data = []

# Task table name; the segment after the first underscore names the
# sub-directory used for stored files.
table_name = "pms_youtube"

# Directory the JSON output is written to.
file_dir = os.path.join(
    os.path.abspath("../"),
    "network-assets-reptile",
    "reptile_data",
    table_name.split("_")[1],
)

# Task status flag; main() replaces it with the integer status of the task.
status_task = '0'

# Run the crawler.
main()