import io
import json
import os
import re
import sys
import time
from datetime import datetime

import loguru
# import pymysql.cursors
import opencc
import requests
from bs4 import BeautifulSoup
# from requests_toolbelt import *
# --------------- selenium dependencies: start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --------------- selenium dependencies: end ----------------

from api.index import importJson, getReptileTask, importJsonPath
from config.settings import get_base_file_url
from utils.Logger import log
from utils.createBrowserDriver import create
from utils.download_image import download_image
from utils.filse import save_json
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory

# Original file description (translated):
#   Crawl the hot posts of Taiwan's PTT forum, including each post's title and
#   content (text, images, video).
#   Flow: create driver -> open browser -> open page -> scrape category
#   elements -> click each -> scrape hot post titles -> click each -> scrape
#   post details.
# NOTE(review): the code below only loads a single skynet.ipplus360.com page
# and prints its source; the description above looks inherited from a PTT
# crawler -- confirm and update.


def reptile(browser=None, search_word=""):
    """Load the skynet.ipplus360.com query page and print its page source.

    Args:
        browser: Driver object exposing ``get``, ``page_source`` and ``quit``
            (presumably the Selenium WebDriver returned by
            ``utils.createBrowserDriver.create`` -- confirm). When falsy, a
            new one is created with ``no_headless=False`` and
            ``using_user_data=True``.
        search_word: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        None. The page source is written to stdout.
    """
    url = "https://skynet.ipplus360.com/q.html"

    # Only a browser created here is ours to shut down; a caller-supplied one
    # stays open so the caller can keep using it.
    owns_browser = not browser
    if owns_browser:
        # Original note: this configuration runs in headed (visible window)
        # mode; plain ``create()`` was the alternative.
        browser = create(no_headless=False, using_user_data=True)

    try:
        # Open the web page.
        browser.get(url)
        print("------")
        print(browser.page_source)
    finally:
        if owns_browser:
            # Release the driver/browser process we started; otherwise it is
            # leaked (and presumably keeps the user-data profile locked for
            # the next run -- confirm against ``create``).
            browser.quit()


# NOTE(review): this runs at import time as well as when executed as a script;
# consider an ``if __name__ == "__main__":`` guard if this module is imported.
reptile()