# pc_ptt.py
import io
import json
import re
import sys
import time
import loguru
# import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional
# from requests_toolbelt import *
from utils.createBrowserDriver import create
import opencc
from utils.filse import save_json
import os
from config.settings import get_base_file_url
from utils.download_image import download_image
# ---------------   selenium dependencies start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ---------------   selenium dependencies end ----------------
'''
Crawl hot posts from Taiwan's PTT forum, including each post's title and content (text, images, video).

Crawl flow: create the driver -> open the browser -> open the page -> collect the board categories -> click through each board -> collect the hot post titles -> click through each post -> collect the post details
'''
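# A minimal sketch of the flow above, assuming create() returns a Selenium WebDriver
# (the selectors are the same ones used in reptile() below):
#
#   browser = create(no_headless=False, using_user_data=True)
#   browser.get("https://www.ptt.cc/bbs/hotboards.html")
#   boards = browser.find_elements('xpath', "//div[@class='board-class']")
#   boards[0].click()  # enter a board, then read the post list and each post's detail page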


def reptile(browser=None, search_word=""):
    url = "https://www.ptt.cc/bbs/hotboards.html"
    browser = browser or create(no_headless=False, using_user_data=True)
    # Run in headed mode:
    # browser = browser or create()
liyang's avatar
liyang committed
39 40
    # Open the page
    browser.get(url)
    # log.debug("Browser opened")
    classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")

    for index, item_element in enumerate(classify_item_list):
        # For now only crawl the first four board categories
        if 0 <= index < 4:
            type_title = classify_item_list[index].text
            # Enter the board page
            classify_item_list[index].click()
            time.sleep(0.1)

            # The first (综合) board category requires confirming the visitor is at least 18
            if index == 0:
                try:
                    button = browser.find_element("xpath", "//form/div[1]//button")
                    button.click()
                except Exception:
                    # The confirmation button is not present; nothing to do
                    pass

            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
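            # The wait above blocks (up to 10 s) until at least one post row ('r-ent') is present;
            # the title links are then collected from the listed posts.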
            element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")

            for index_two, item in enumerate(element_list):
                # Log crawl progress for this board
                log.debug(f"Crawling board: {type_title} - post {index_two + 1}")
                # Match the search keyword against the post title
                if not re.findall(search_word, item.text):
                    # The title does not match the keyword; skip to the next post
                    continue

                # Skip posts whose titles contain "公告" (announcement) or "看板" (board)
                if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
                    pass
                else:
                    # Open the post detail page
                    element_list[index_two].click()
                    time.sleep(0.1)
                    # Original post link
                    browser_current_url = browser.current_url
                    # print(browser_current_url)
                    # log.debug('Page link: ' + str(browser_current_url))
                    try:
                        # Get the post title element
                        element_title = browser.find_element('xpath',
                                                             "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                    except Exception:
                        log.error(
                            "XPath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                        log.debug(f'Page link: {browser_current_url}')
                        # Go back to the previous page
                        browser.back()
                        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
                        break
                    # The content may contain images and videos and needs further processing
                    element_content = browser.find_element('xpath', "//div[@id='main-content']")
                    # Remove <a> tags whose href value contains 'img'
                    # ------------------------------------
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
                    # Author
                    element_author = browser.find_element('xpath',
                                                          "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
                    # Publication time
                    element_release = browser.find_element('xpath',
                                                           "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
                    date_string = element_release.text
                    date_format = "%a %b %d %H:%M:%S %Y"
                    # Convert the date string to a datetime object
                    date_time = datetime.strptime(date_string, date_format)
                    # Convert the datetime object to a Unix timestamp (in seconds)
                    release_time = int(date_time.timestamp())
                    # log.debug('Start determining the content type')
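                    # Example (assuming the PTT meta format "Wed Aug  2 10:21:03 2023"):
                    # datetime.strptime("Wed Aug  2 10:21:03 2023", date_format) -> datetime(2023, 8, 2, 10, 21, 3),
                    # and int(...timestamp()) yields the epoch seconds stored in release_time.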
                    try:
                        # Find all div elements inside the content (the first one is #main-content itself)
                        div_elements = soup.find_all('div')
                        # log.debug("Number of divs: " + str(len(div_elements)))
                        # Remove every div except the first
                        for key, div in enumerate(div_elements):
                            if key > 0:
                                div.extract()
                        # Remove all span elements
                        span_element = soup.find_all('span')
                        # log.debug("Number of spans: " + str(len(span_element)))
                        for span in span_element:
                            span.extract()

                    except Exception:
                        # log.debug("Failed to strip the div/span elements")
                        pass
                    # ---------------- Determine the content type start ----------
                    # Content type
                    content_type = ""
                    # Find all img tags
                    image_list = soup.find_all('img')
                    try:
                        if len(image_list) > 0:
                            content_type = "图文"
                        else:
                            content_type = "文字"
                    except:
                        content_type = "文字"
                    picture_url = []
                    if len(image_list) > 0:
                        for key, element in enumerate(image_list):
                            # Download the image locally and replace the src in the markup
                            id = str(int(time.time()))
                            # Local download path
                            download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
                            # Public access URL
                            access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
                            # Download status
                            status = download_image(element['src'], download_dir)
                            if status:
                                element['src'] = access_address
                                picture_url.append(download_dir)
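                            # Note: the file name is the current epoch second, so several images
                            # downloaded within the same second would overwrite each other; a
                            # per-image suffix (e.g. f"{id}_{key}.jpg") would avoid the collision.
                            # download_image() is assumed to return a truthy value on success.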
                    else:
                        # No images in the post; nothing to download
                        pass
                    # ---------------- Determine the content type end ----------
                    # log.debug('Start filtering the content')
                    # ------------------ content filtering start --------------
                    try:
                        # Find all <a> tags that have an href attribute
                        a_tags = soup.find_all('a', href=True)
                        # log.debug("Number of a tags: " + str(len(a_tags)))
                        # Remove every <a> tag from the content
                        for tag in a_tags:
                            tag.decompose()
                    except Exception:
                        # log.debug("Failed to remove the <a> tags")
                        pass
                    html = soup.prettify().replace('amp;', '')
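                    # prettify() re-serializes the cleaned markup; stripping "amp;" is a crude way to
                    # turn escaped ampersands (e.g. "&amp;") back into "&" in the stored HTML.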
                    # ------------------ content filtering end --------------

                    # --------------- Assemble the record start ---------------------
                    obj = {
                        "title": element_title.text,
                        "content": html,
                        "link": browser_current_url,
                        "reptileTime": str(int(time.time())),
                        "type": content_type,
                        "author": element_author.text,
                        "releaseTime": str(release_time),
                        "picture_url": ",".join(picture_url)
                    }
                    # --------------- Assemble the record end ---------------------
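                    # Illustrative record shape (hypothetical values):
                    # {"title": "[問題] ...", "content": "<div id=\"main-content\">...</div>",
                    #  "link": "https://www.ptt.cc/bbs/...", "reptileTime": "1690000300", "type": "图文",
                    #  "author": "someone (暱稱)", "releaseTime": "1690000000", "picture_url": "..."}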
                    data.append(obj)
                    # Go back to the post list
                    browser.back()
                    time.sleep(0.1)
                    element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")

            print("循环结束")
            # Go back to the board list
            browser.back()
            if index == 0:
                browser.back()
            time.sleep(0.1)
            # Re-fetch the board category elements (the old references go stale after navigation)
            classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")

    # Send the crawled data to the Java service
    # print('----------------------')
    # print(data)
    if len(data) > 0:
        # Save the JSON file locally
        # log.debug(os.path.abspath("../"))
        state_save = save_json(os.path.join(file_dir, str(int(time.time())) + ".json"), data)
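        # e.g. <project root>/network-assets-reptile/reptile_data/ptt/1690000000.json
        # (the file name is the epoch second at save time; the directory comes from file_dir below)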
        if state_save:
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # No data was crawled
        log.info("No data was crawled")
        script_close(browser)


def script_close(browser):
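    """Close the browser driver and exit the script."""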
    # Close the browser driver
    try:
        browser.close()
        browser.quit()
    except Exception:
        log.debug("Failed to close the browser driver")
    sys.exit()


def main():
    """Fetch the crawl task configuration (keyword, table, status) from the backend and start the PTT crawl."""
    # Request the crawl task (search keyword) from the backend
    response = getReptileTask()
    global status_task
    # print(response)
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'ptt':
                search_word = item['keyword']
                table_name = item['tableName']
                status_task = int(item["status"])
        # Convert the simplified-Chinese keyword to traditional Chinese
        if status_task == 0 and len(search_word) > 0:
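            # e.g. convert_to_traditional("新闻") is expected to return "新聞"
            # (simplified -> traditional conversion, presumably via opencc)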
            reptile(None, convert_to_traditional(search_word))
        else:
            log.debug("The crawl task is not enabled")
    else:
        log.debug("call failed")
        # The request failed or timed out; fall back to the default keyword
        reptile(None, convert_to_traditional("新闻"))
        # upload_control()


# Global variables
data = []
table_name = "pms_ptt"
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# Whether the crawl task is enabled (0 = enabled)
status_task = '0'
# Call the main function
main()