# pc_ltn.py
import io
import json
import re
import sys
import time
import loguru
# import pymysql.cursors
import requests
from bs4 import BeautifulSoup
import datetime
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory, parse_ltn_time_string
# from requests_toolbelt import *
from utils.createBrowserDriver import create
import opencc
from utils.filse import save_json
import os
from config.settings import get_base_file_url
from utils.download_image import download_image
# ---------------   selenium dependencies: start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ---------------   selenium dependencies: end ----------------
'''
Crawl LTN (Liberty Times Net, 自由时报) search results for a keyword, including each
article's title and content (text, images, video).

Flow: create the driver --> open the browser --> load the search page --> iterate
result pages --> parse each result item (title, summary, images, release time) -->
assemble records --> save them as JSON.
'''
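# Usage note: running this file directly executes main() at the bottom, which fetches
# the task configuration via getReptileTask() and then calls reptile(). A direct run
# would look like this (assuming the project root is on PYTHONPATH):
#   python pc_ltn.py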


def reptile(browser=None, search_word=""):
    # Example search URL:
    # url = "https://search.ltn.com.tw/list?keyword=新闻&start_time=20230730&end_time=20230801&type=all&sort=date"
    # Convert the filter timestamps into datetime objects
    begin_dt_object = datetime.datetime.fromtimestamp(beginFiltrationTime)
    end_dt_object = datetime.datetime.fromtimestamp(endFiltrationTime)

    # Format the datetime objects as "YYYYMMDD" strings, e.g. "20230730"
    filter_start_date = begin_dt_object.strftime("%Y%m%d")
    filter_end_date = end_dt_object.strftime("%Y%m%d")

    # Base search URL (all types, sorted by date)
    url = f"https://search.ltn.com.tw/list?keyword={search_word}&start_time={filter_start_date}&end_time={filter_end_date}&type=all&sort=date"
    browser = browser or create(no_headless=False, using_user_data=True)
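    # create() is a project helper assumed to return a Selenium WebDriver; judging by the
    # parameter names, no_headless=False runs headless and using_user_data=True reuses a
    # persistent browser profile (both are assumptions, not confirmed by this file).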
    # Headed mode (for debugging):
    # browser = browser or create()
    # Open the first page of results
    browser.get(url + "&page=1")
    time.sleep(2)
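    # A more robust alternative to the fixed sleep would be an explicit wait on the result
    # list (a sketch; it assumes the result items match the XPath used further below):
    # WebDriverWait(browser, 10).until(
    #     EC.presence_of_element_located((By.XPATH, "//div[@class='page-name']//ul/li"))
    # )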

    # Pagination: check whether a "next page" link exists
    page_next = True
    # page_next = browser.find_elements("xpath", "//div[@data-desc='分頁']/a[@class='p_next']")
    page_index = 1
    # Loop over result pages
    while page_next:
        if page_index > 1:
            browser.get(f"{url}&page={page_index}")
            time.sleep(0.5)
        # Re-check whether a "next page" link exists
        try:
            page_next = browser.find_elements("xpath", "//div[@data-desc='分頁']/a[@class='p_next']")
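            # find_elements returns an empty list when there is no "next page" link; an
            # empty list is falsy, so the while loop ends after this iteration.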
        except Exception:
            # Check whether there is only a single page of results
            # tag_list = browser.find_elements('xpath', "//div[@class='page-name']//ul/li")
            # if len(tag_list) <= 0:
            page_next = False
        # Scroll to the bottom of the page (likely to trigger lazy-loaded content before parsing)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # log.debug("browser opened")
        classify_item_list = browser.find_elements('xpath', "//div[@class='page-name']//ul/li")

        for index, item_element in enumerate(classify_item_list):
            # Parse this result item's HTML with BeautifulSoup
            soup = BeautifulSoup(item_element.get_attribute('outerHTML'), 'html.parser')
            # Release-time element
            element_release = item_element.find_element("xpath", "./div/span")
            # Find all img tags
            image_list = soup.find_all('img')
            picture_url = []
            img_tag = soup.new_tag("img")
            if len(image_list) > 0:
                for key, element in enumerate(image_list):
                    # Download the image locally and point the tag's src at the served copy.
                    # Use the timestamp plus the image index so images fetched within the
                    # same second do not overwrite each other
                    image_id = f"{int(time.time())}_{key}"
                    # Local download path
                    download_dir = os.path.join(local_path, f"{image_id}.jpg")
                    # Public access URL for the downloaded file
                    access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{image_id}.jpg'
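                    # e.g. <base_file_url>ltn/<local_path_name>/<image_id>.jpg
                    # ("ltn" comes from table_name "pms_ltn"; the base URL comes from get_base_file_url())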
                    # Download status
                    if "default" in element['src']:
                        status = False
                    else:
                        status = download_image(element['src'], download_dir)
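                    # download_image is a project helper, assumed to return a truthy value on
                    # success; the img src is only rewritten when the download worked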

                    if status:
                        # element['src'] = access_address
                        img_tag["src"] = access_address
                        # Record the local path of the downloaded image
                        picture_url.append(download_dir)
            else:
                # No images in this result item; the img tag stays empty
                pass

            p = soup.new_tag("p")
            p.string = item_element.find_element("xpath", "./div/p").text
            div = soup.new_tag("div")
            div.append(img_tag)
            div.append(p)
            html = div.prettify()
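            # The assembled content fragment looks roughly like:
            #   <div><img src="<access_address>"/><p>result summary text</p></div>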

            date_string = element_release.text
            date_format = "%a %b %d %H:%M:%S %Y"  # unused; kept from the earlier strptime approach
            # Parse the date string (parse_ltn_time_string is assumed to return a value
            # that int() can convert to a Unix timestamp)
            date_time = parse_ltn_time_string(date_string)
            # print(date_time)
            # date_time = datetime.datetime.strptime(date_string, date_format)
            # Convert to a timestamp in seconds; fall back to "now" if parsing fails
            try:
                release_time = int(date_time)
            except Exception:
                release_time = int(time.time())

            # Keep only articles whose release time falls inside the filter window
            if beginFiltrationTime <= release_time <= endFiltrationTime:
                # --------------- assemble record start ---------------------
                obj = {
                    "title": item_element.find_element("xpath", "./div/a[1]").text,
                    "content": html,
                    "link": item_element.find_element("xpath", "./div/a[1]").get_attribute("href"),
                    "reptileTime": str(int(time.time())),
                    "type": "图文",
                    "author": "自由时报",
                    "releaseTime": str(release_time),
                    "picture_url": ",".join(picture_url)
                }
                # --------------- assemble record end ---------------------
                data.append(obj)
        page_index = page_index + 1
        time.sleep(0.1)

        # Cap the crawl at a maximum of 20 pages
        if page_index >= 21:
            page_next = False
            # Exit the pagination loop
            break

    if len(data) > 0:
        # Save the scraped records to a local JSON file
        json_path = os.path.join(local_path, "data.json")
        state_save = save_json(json_path, data)
        # Save the task metadata
        task = {
            # Crawl time
            "reptileTime": data[0]["reptileTime"],
            # Local output path
            "localPath": local_path,
            "beginFiltrationTime": beginFiltrationTime,
            "endFiltrationTime": endFiltrationTime,
            "keyword": keyword,
            "total": len(data)
        }
        state_save = save_json(os.path.join(file_dir, "task.json"), task)
        if state_save:
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # No data was scraped
        log.info("no reptile data")
        # Remove the empty task directory
        delete_directory(local_path)
        script_close(browser)


def script_close(browser):
    # Close the browser and quit the driver
    try:
        browser.close()
        browser.quit()
    except Exception:
        log.debug("failed to close the browser driver")
    try:
        # Exit the script
        sys.exit()
    except SystemExit:
        raise  # Re-raise SystemExit so the script actually exits
    except Exception:
        # sys.exit() always raises SystemExit, so this branch is effectively unreachable
        print("sys.exit() failed")


def main():
    """

    """
    # Request the crawl task (keyword, time filter, status)
    response = getReptileTask()
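    # Expected response shape (inferred from the field accesses below):
    #   {"status_code": 200,
    #    "data": {"code": 200,
    #             "rows": [{"name": ..., "keyword": ..., "tableName": ..., "status": ...,
    #                       "beginFiltrationTime": ..., "endFiltrationTime": ...}]}}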
    global status_task
    global beginFiltrationTime
    global endFiltrationTime
    global keyword
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'ltn-自由时报':
                search_word = item['keyword']
                table_name = item['tableName']
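                # Note: this only creates a local variable; reptile() still uses the
                # module-level table_name ("pms_ltn") because table_name is not declared
                # global in this function.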
                status_task = int(item["status"])
                keyword = str(item["keyword"])
                beginFiltrationTime = int(item["beginFiltrationTime"])
                endFiltrationTime = int(item["endFiltrationTime"])
        # Convert the simplified-Chinese keyword to traditional Chinese before searching
        if status_task == 0 and len(search_word) > 0:
            reptile(None, convert_to_traditional(search_word))
        else:
            log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        # The request failed or timed out; fall back to the default keyword
        reptile(None, convert_to_traditional("新闻"))
        # upload_control()


# Global variables
data = []
# Task details
task = {}
table_name = "pms_ltn"

# Global fields
keyword = ""
# Filter window start (Unix timestamp; placeholder until main() sets it)
beginFiltrationTime = int(123)
# Filter window end (Unix timestamp; placeholder until main() sets it)
endFiltrationTime = int(123)

# Root directory for scraped files
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
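# e.g. <parent of the working directory>/network-assets-reptile/reptile_data/ltn
# ("ltn" is taken from table_name "pms_ltn")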
# Task directory name (current timestamp)
local_path_name = str(int(time.time()))
# Task directory path
local_path = f'{os.path.join(file_dir, local_path_name)}'
# Whether the task directory was created
local_path_status = create_directory_if_not_exists(local_path)
# Whether the crawl task is enabled
status_task = 0
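# Note: main() treats status == 0 as "enabled"; any other value skips the crawl.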
# Run main
main()