# pc_ptt.py
import io
import json
import re
import sys
import time
import loguru
# import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
# from requests_toolbelt import *
from utils.createBrowserDriver import create
import opencc
from utils.filse import save_json
import os
from config.settings import get_base_file_url
from utils.download_image import download_image
# ---------------   selenium dependencies: start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ---------------   selenium dependencies: end ----------------
'''
Scrape hot posts from Taiwan's PTT forum, including each post's title and
content (text, images, video).

Flow: create the driver -> open the browser -> load the page -> collect the
board categories -> click into each one -> collect the hot post titles ->
click into each post -> scrape the post details.
'''
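
# Illustrative only -- the shape of one record that reptile() below appends to
# `data` (field values are hypothetical, not real PTT content):
#   {
#       "title": "[新聞] ...",
#       "content": "<div id=\"main-content\"> ... </div>",
#       "link": "https://www.ptt.cc/bbs/SomeBoard/M.1691652600.A.123.html",
#       "reptileTime": "1691652600",   # scrape time, Unix seconds
#       "type": "图文",                 # "图文" = image+text, "文字" = text only
#       "author": "someuser (nickname)",
#       "releaseTime": "1691650000",   # publish time, Unix seconds
#       "picture_url": "/path/to/1691652600_0.jpg"
#   }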


def reptile(browser=None, search_word=""):
    url = "https://www.ptt.cc/bbs/hotboards.html"
    browser = browser or create(no_headless=False, using_user_data=True)
    # Run in headed mode:
    # browser = browser or create()
    # Open the page
    browser.get(url)
    # log.debug("Browser opened")
    classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")

    for index, item_element in enumerate(classify_item_list):
        # For now, scrape only the first 15 categories
        if 0 <= index <= 14:
            type_title = classify_item_list[index].text
            # Enter the category page
            classify_item_list[index].click()
            time.sleep(0.1)

            # The general category (index 0) asks visitors to confirm they are 18+
            if index == 0:
                try:
                    button = browser.find_element("xpath", "//form/div[1]//button")
                    button.click()
                except Exception:
                    # No age-gate button present; continue
                    pass

            # Wait for the post list to load
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
            element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")

            for index_two, item in enumerate(element_list):
                # print(element_list[index_two].text)
                log.debug(f"Scraping category {type_title}, post {index_two + 1}")
                # Match the search keyword against the post title
                if not re.findall(search_word, item.text):
                    # Title does not match the keyword; skip to the next post
                    continue

                # Skip announcements ("公告") and board-listing posts ("看板")
                if re.findall("公告", item.text) or re.findall("看板", item.text):
                    continue

                element_list[index_two].click()
                time.sleep(0.1)
                # Original post link
                browser_current_url = browser.current_url
                # print(browser_current_url)
                # log.debug('Post link: ' + str(browser_current_url))
                try:
                    # Post title
                    element_title = browser.find_element('xpath',
                                                         "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                except Exception:
                    log.error(
                        "xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                    log.debug(f'Page link: {browser_current_url}')
                    # Go back to the post list
                    browser.back()
                    element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
                    break
                # The content may contain images and videos; post-process it below
                element_content = browser.find_element('xpath', "//div[@id='main-content']")
                # Parse the HTML with BeautifulSoup; <a> tags (including the ones
                # wrapping images) are stripped further down
                soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
                # Author
                element_author = browser.find_element('xpath',
                                                      "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
                # Publish time
                element_release = browser.find_element('xpath',
                                                       "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")

                # log.debug('Start detecting the content type')
                try:
                    # Find all <div> elements and drop every one after the first:
                    # the nested divs hold push/comment lines and metadata,
                    # not the post body
                    div_elements = soup.find_all('div')
                    for key, div in enumerate(div_elements):
                        if key > 0:
                            div.extract()
                    # Drop the remaining top-level <span> elements (article meta lines)
                    span_element = soup.find_all('span')
                    for span in span_element:
                        span.extract()
                except Exception:
                    # Stripping failed; keep the soup as-is
                    pass
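
                # Illustrative effect on simplified, hypothetical PTT markup:
                #   <div id="main-content">post text
                #     <div class="push">...</div><span class="f2">...</span>
                #   </div>
                #   -> <div id="main-content">post text</div>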
                # ---------------- type detection: start ----------
                # Any <img> tag makes the post "图文" (image+text);
                # otherwise it is "文字" (text only)
                image_list = soup.find_all('img')
                content_type = "图文" if len(image_list) > 0 else "文字"
                picture_url = []
                for key, element in enumerate(image_list):
                    # Download the image locally, then point the tag's src at the
                    # served copy. Include the loop index in the name so images
                    # scraped within the same second don't overwrite each other.
                    image_id = f"{int(time.time())}_{key}"
                    # Local download path
                    download_dir = os.path.join(local_path, f"{image_id}.jpg")
                    # Public access URL
                    access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{image_id}.jpg'
                    # Download status
                    status = download_image(element['src'], download_dir)
                    if status:
                        element['src'] = access_address
                        picture_url.append(download_dir)
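
                # Illustrative rewrite (hypothetical URL; <...> are placeholders):
                #   <img src="https://i.imgur.com/abc.jpg">
                #   -> saved to   <local_path>/1691652600_0.jpg
                #   -> src set to <base_file_url>ptt/<local_path_name>/1691652600_0.jpg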
                # ---------------- type detection: end ----------
                # ------------------ content filtering: start --------------
                try:
                    # Find all <a> tags and remove them (they wrap images,
                    # signatures, and external links)
                    a_tags = soup.find_all('a', href=True)
                    for tag in a_tags:
                        tag.decompose()
                except Exception:
                    # No <a> tags to remove
                    pass
                # Collapse '&amp;' entities left by prettify() back to '&'
                html = soup.prettify().replace('amp;', '')
                # ------------------ content filtering: end --------------

                # PTT publish times look like "Thu Aug 10 15:30:00 2023"
                date_string = element_release.text
                date_format = "%a %b %d %H:%M:%S %Y"
                # Parse the date string into a datetime object
                date_time = datetime.strptime(date_string, date_format)
                # Convert the datetime to a Unix timestamp in seconds
                release_time = int(date_time.timestamp())
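                # Worked example (illustrative; the epoch value depends on the
                # machine's local timezone -- here UTC+8):
                #   "Thu Aug 10 15:30:00 2023" -> datetime(2023, 8, 10, 15, 30)
                #   -> 1691652600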

                # Apply the time-window filter
                if beginFiltrationTime <= release_time <= endFiltrationTime:
                    # --------------- assemble the record: start ---------------------
                    obj = {
                        "title": element_title.text,
                        "content": html,
                        "link": browser_current_url,
                        "reptileTime": str(int(time.time())),
                        "type": content_type,
                        "author": element_author.text,
                        "releaseTime": str(release_time),
                        "picture_url": ",".join(picture_url)
                    }
                    # --------------- assemble the record: end ---------------------
                    data.append(obj)

                # Return to the post list
                browser.back()
                time.sleep(0.1)
                # Re-fetch the list: going back rebuilds the DOM, so the old
                # element references are stale
                element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")

            # print("inner loop finished")
            # Return to the category (board) list
            browser.back()
            if index == 0:
                # The age-gate confirmation added an extra page to the history
                browser.back()
            time.sleep(0.1)
            # Re-fetch the category elements (stale after navigation)
            classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")

    # Send the scraped data to the Java service
    # print('----------------------')
    # print(data)
    if len(data) > 0:
        # Save the scraped data as a local JSON file
        json_path = os.path.join(local_path, "data.json")
        state_save = save_json(json_path, data)
        # Save the task descriptor
        task = {
            # Scrape time
            "reptileTime": data[0]["reptileTime"],
            # Local path
            "localPath": local_path,
            "beginFiltrationTime": beginFiltrationTime,
            "endFiltrationTime": endFiltrationTime,
            "keyword": keyword,
            "total": len(data)
        }
        state_save = save_json(os.path.join(file_dir, "task.json"), task)
        if state_save:
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # Nothing was scraped
        log.info("No data scraped")
        # Remove the empty task directory
        delete_directory(local_path)
        script_close(browser)


def script_close(browser):
    # Shut down the browser driver
    try:
        browser.close()
        browser.quit()
    except Exception:
        log.debug("Failed to close the browser driver")
    try:
        sys.exit()
    except SystemExit:
        # Re-raise SystemExit so the script actually exits
        raise
    except Exception:
        print("sys.exit() failed")


def main():
    """
    Fetch the scrape task from the backend and run the PTT scraper with the
    task's keyword and time window.
    """
    # Request the task (keyword, time window, status)
    response = getReptileTask()
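    # Illustrative response shape that the code below assumes (all values
    # hypothetical):
    #   {
    #       "status_code": 200,
    #       "data": {
    #           "code": 200,
    #           "rows": [{"name": "ptt", "keyword": "新闻", "tableName": "pms_ptt",
    #                     "status": "0", "beginFiltrationTime": 1690000000,
    #                     "endFiltrationTime": 1700000000}]
    #       }
    #   }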
    global status_task
    global beginFiltrationTime
    global endFiltrationTime
    global keyword
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'ptt':
                search_word = item['keyword']
                # Note: this is a local variable; the module-level table_name
                # is left unchanged
                table_name = item['tableName']
                status_task = int(item["status"])
                keyword = str(item["keyword"])
                beginFiltrationTime = int(item["beginFiltrationTime"])
                endFiltrationTime = int(item["endFiltrationTime"])
        # Convert the keyword from Simplified to Traditional Chinese
        if status_task == 0 and len(search_word) > 0:
            reptile(None, convert_to_traditional(search_word))
        else:
            log.debug("Scrape task is not enabled")
    else:
        log.debug("call failed")
        # Request failed or timed out; fall back to a default keyword
        reptile(None, convert_to_traditional("新闻"))
        # upload_control()


# Global state
data = []
# Task descriptor
task = {}
table_name = "pms_ptt"

# Global fields
keyword = ""
# Filter window start (Unix seconds; placeholder, overwritten by main())
beginFiltrationTime = 123
# Filter window end (Unix seconds; placeholder, overwritten by main())
endFiltrationTime = 123

# File root directory
file_dir = os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])
# Task directory name
local_path_name = str(int(time.time()))
# Task directory path
local_path = os.path.join(file_dir, local_path_name)
# Whether the task directory was created
local_path_status = create_directory_if_not_exists(local_path)
# Task status flag (0 = enabled)
status_task = 0
# Entry point
main()