import json
import platform
import time
from bs4 import BeautifulSoup
from utils.Logger import log
from utils.createBrowserDriver import create
from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, \
    parse_time_string, create_directory_if_not_exists, delete_directory
# from pytube import YouTube
from datetime import datetime
from utils.download_image import download_image
import os
from config.settings import get_base_file_url
from config.settings import get_account
import sys
# ---------------   selenium dependencies start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ---------------   selenium dependencies end ----------------

'''
Open Facebook, log in with the configured account if a login form is shown, search for the
given keyword, scroll to load the result feed, expand truncated posts, then extract each
post's author, release time, content, images and video placeholders. The results are saved
as JSON and the browser driver is closed.
'''


def reptile(browser=None, search_word=""):
    print(f"Search term: {search_word}")
    url = "https://www.facebook.com/"
    browser = browser or create(no_headless=False, using_user_data=True)
    # Open the page
    browser.get(url)
    time.sleep(2)
    try:
        # time.sleep(3)
        # Check whether the login form is shown
        login_input = browser.find_element('xpath', "//input[@name='email']")
        password_input = browser.find_element('xpath', "//input[@name='pass']")
        login_input.send_keys(get_account("facebook")["name"])
        password_input.send_keys(get_account("facebook")["password"])
        # Locate the login button
        button_login = browser.find_element('xpath', "//button[@name='login']")
        button_login.click()
        time.sleep(3)
    except Exception:
        print("Already logged in")
    log.debug("facebook login complete")
    url = f"https://www.facebook.com/search/top?q={search_word}"
    browser.get(url)
    time.sleep(2)
    # Scroll to the bottom of the page with JavaScript
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(6)
    # Wait for the content to appear, up to 10 seconds
    wait = WebDriverWait(browser, 10)
    # Define the wait condition via expected_conditions: presence of the search-results feed container
    wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='feed']")))
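    # (assumption) The XPaths below are tied to Facebook's current search-results markup; they
    # locate each post's content, author and timestamp nodes and will need updating if the DOM changes.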

    # Post content elements
    element_content_list = browser.find_elements('xpath',
                                                 "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
    # Author elements
    element_authors_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
    # Release-time elements
    element_release_list = browser.find_elements('xpath',
                                                 "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
    # Find all "展开" (expand) buttons and click each one so the full text is loaded before extraction
    elements_expand_list = browser.find_elements('xpath',
                                                 "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
    for key, element in enumerate(elements_expand_list):
        try:
            # Perform the click via JavaScript
            browser.execute_script("arguments[0].click();", element)
        except Exception as e:
            print("Clicking element failed: " + str(e))

    length = len(element_content_list)
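    # The content, author and release-time lists are assumed to be index-aligned (one node of
    # each per post); iteration is bounded by the content list.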
    for index in range(length):
        author_soup = BeautifulSoup(element_authors_list[index].get_attribute("outerHTML"), "html.parser")
        time_soup = BeautifulSoup(element_release_list[index].get_attribute("outerHTML"), "html.parser")
        # author = element_authors_list[index].text
        author = author_soup.find_all("a")[0].text
        time_text = time_soup.find_all("a")[0].text
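        # (assumption) parse_time_string is a project helper that converts Facebook's relative
        # time label (e.g. "3小时") into a Unix timestamp in seconds.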
        release_time_timestamp = int(parse_time_string(time_text))
        release_time = str(release_time_timestamp)

        # Time filtering
        # Make sure 'releaseTime' is an integer before comparing
        new_releaseTime = int(release_time)
        if new_releaseTime < beginFiltrationTime or new_releaseTime > endFiltrationTime:
            # Skip this item if its release time falls outside the requested range
            continue

        text = element_content_list[index].text
        soup = BeautifulSoup(element_content_list[index].get_attribute('outerHTML'), 'html.parser')
        soup_str = soup.prettify()
        # Check whether the post contains a video
        video_list = soup.find_all("video")
        image_list = soup.find_all("img")
        # lth = len(ignore_list)
        if len(video_list) > 0:
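            # The video file itself is not downloaded; the preview markup is stripped below and an
            # empty <video> tag is left as a placeholder in the saved content.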
            # for key,element in enumerate(video_list):
            # Remove the second child element
            # Find the parent element that contains the two <div> children
            parent_div = soup.find('div')
            # Find all direct <div> children
            div_elements = parent_div.find_all('div', recursive=False)
            # div_tags = soup.find_all("div", recursive=False)
            # Make sure there are at least two <div> children
            if len(div_elements) >= 2:
                # Take the second <div> and detach it from its parent
                div_to_remove = div_elements[1]
                div_to_remove.extract()
                # Alternative: div.decompose()
                # Create an empty <video> tag as a placeholder
                custom_video = soup.new_tag("video")
                custom_video["src"] = ""
                parent_div.append(custom_video)
        else:
            pass
        picture_url = []
        if len(image_list) > 0:
            for key, element in enumerate(image_list):
                # Download the image locally and rewrite the tag's src
                # Include the loop index so multiple images saved within the same second do not overwrite each other
                id = f"{int(time.time())}_{key}"
                # Local download path
                download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
                # Publicly accessible URL
                access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
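                # (assumption) get_base_file_url() returns the public base URL under which the backend
                # serves reptile_data, so the rewritten src mirrors <platform>/<task directory>/<id>.jpg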
                # Download status
                status = download_image(element['src'], download_dir)
                if status:
                    element['src'] = access_address
                    picture_url.append(download_dir)
        else:
            pass
        content = soup.prettify()
        # Title: author + date
        title = f"{author}-{datetime.fromtimestamp(release_time_timestamp)}"
        # title = ""
        # ---------------- determine content type start ----------
        # Content type
        content_type = ""
        try:
            # Find all img tags
            img_tags = soup.find_all('img')
            if len(img_tags) > 0:
                content_type = "图文"
            else:
                content_type = "文字"
        except Exception:
            content_type = "文字"
        # ---------------- determine content type end ----------
        # --------------- assemble data start ---------------------
        obj = {
            "title": title,
            "content": content,
            "link": element_release_list[index].get_attribute("href"),
            "reptileTime": str(int(time.time())),
            "type": content_type,
            "author": author,
            "releaseTime": release_time,
            "picture_url": ",".join(picture_url)
        }

        # --------------- assemble data end ---------------------

        data.append(obj)

    if len(data) > 0:
        # Save the JSON data file locally
        json_path = os.path.join(local_path, "data.json")
        state_save = save_json(json_path, data)
        # Save the task metadata
        task = {
            # Crawl time
            "reptileTime": data[0]["reptileTime"],
            # Local path
            "localPath": local_path,
            "beginFiltrationTime": beginFiltrationTime,
            "endFiltrationTime": endFiltrationTime,
            "keyword": keyword,
            "total": len(data)
        }
        state_save = save_json(os.path.join(file_dir, "task.json"), task)
        if state_save:
            log.debug('save file success')
        else:
            log.debug('save file failed')
        script_close(browser)
    else:
        # No data was crawled
        log.info("No data crawled")
        # Remove the task directory
        delete_directory(local_path)
        script_close(browser)


def script_close(browser):
    # Close the browser driver
    try:
        browser.close()
        browser.quit()
    except Exception:
        log.debug("failed to close the browser driver")
    try:
        # Exit the script
        sys.exit()
    except SystemExit:
        raise  # Re-raise SystemExit so the script actually exits
    except Exception as e:
        # Exception handling
        print("sys.exit() failed: " + str(e))


def main():
    """
    Fetch the crawl task configuration and start the Facebook reptile.
    """
    # Request the task keywords
    response = getReptileTask()
    global status_task
    global beginFiltrationTime
    global endFiltrationTime
    global keyword
    # table_name is read at module level by reptile(), so assigning it below needs a global declaration
    global table_name
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'facebook':
                search_word = item['keyword']
                table_name = item['tableName']
                status_task = int(item["status"])
                keyword = str(item["keyword"])
                beginFiltrationTime = int(item["beginFiltrationTime"])
                endFiltrationTime = int(item["endFiltrationTime"])
        # Convert Simplified Chinese to Traditional before searching
        if status_task == 0 and len(search_word) > 0:
            reptile(None, convert_to_traditional(search_word))
        else:
            log.debug("crawl task not enabled")
    else:
        log.debug("call failed")
        # Request failed or timed out: fall back to a default keyword
        reptile(None, convert_to_traditional("新闻"))
        # upload_control()


# Global variables
data = []
# Task details
task = {}
table_name = "pms_facebook"

# Global fields
keyword = ""
# Filter start time
beginFiltrationTime = 123
# Filter end time
endFiltrationTime = 123

# File root directory
file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
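# (resolves to something like ../network-assets-reptile/reptile_data/facebook relative to the working directory)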
# Task directory name
local_path_name = str(int(time.time()))
# Task directory path
local_path = f'{os.path.join(file_dir, local_path_name)}'
# Whether the task directory was created
local_path_status = create_directory_if_not_exists(local_path)
# Whether the task is enabled (0 = enabled)
status_task = 0
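# Note: keyword, beginFiltrationTime, endFiltrationTime and status_task above are placeholder
# defaults; main() normally overwrites them with values from the task returned by getReptileTask().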
# Run main
main()