Commit cc26717b authored by liyang

feat:ptt 后处理优化

parent c712ff68
......@@ -18,7 +18,12 @@ from utils.filse import save_json
import os
from config.settings import get_base_file_url
from utils.download_image import download_image
# --------------- selenium 依赖 start ----------------
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
......@@ -28,7 +33,7 @@ from utils.download_image import download_image
def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html"
browser = browser or create(no_headless=True,using_user_data=True)
browser = browser or create(no_headless=True, using_user_data=True)
# 有头模式执行
# browser = browser or create()
# 打开网页
......@@ -39,10 +44,18 @@ def reptile(browser=None, search_word=""):
length = len(classify_item_list)
for index in range(length):
# 暂时先爬取 第2个 分类
if 0 < index < 4:
if 0 <= index < 4:
type_title = classify_item_list[index].text
classify_item_list[index].click()
time.sleep(0.1)
if index == 0:
try:
button = browser.find_element("xpath", "//form/div[1]//button")
button.click()
except:
error = ""
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list)
for index_two in range(length_two):
......@@ -183,6 +196,8 @@ def reptile(browser=None, search_word=""):
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
# 浏览器返回上一页
browser.back()
if index == 0:
browser.back()
time.sleep(0.1)
# 重新获取
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
......@@ -214,6 +229,7 @@ def script_close(browser):
log.debug("浏览器驱动关闭失败")
sys.exit()
def main():
"""
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment