Commit dae3fe3d authored by liyang's avatar liyang

feat: ptt debug

parent 27018e2d
......@@ -11,3 +11,12 @@
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-21 10:54:17,501 ERROR pc_ptt.py : reptile [line: 73] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:32,527 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:41,957 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:43,433 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:51:10,728 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:52:41,156 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:58:54,782 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:04,220 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:27,844 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 16:00:02,916 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
......@@ -8,6 +8,9 @@ import loguru
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from selenium.common import NoSuchElementException
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
......@@ -42,8 +45,8 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
for index, item_element in enumerate(classify_item_list):
# 暂时先爬取 第2个 分类
if 0 <= index <= 14:
# 只爬取综合分类
if 0 <= index < 1:
type_title = classify_item_list[index].text
# 进入分类页面
classify_item_list[index].click()
......@@ -59,40 +62,46 @@ def reptile(browser=None, search_word=""):
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
page = 1
# element_meta_list = browser.find_elements("xpath", "//div[@class='r-ent']//div[@class='meta']")
def process_data():
# 增加搜索
search_input = browser.find_element("xpath", "//div[@class='search-bar']//input")
if search_word != search_input.get_attribute("value"):
# 输入搜索关键词
search_input.send_keys(search_word)
# 点击输入框
# browser.execute_script("arguments[0].click();", search_input)
# 确认搜索关键词
# search_input.click()
search_input.submit()
# 程序睡眠300ms,等待页面加载完成
time.sleep(0.3)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
nonlocal page
print(f"当前为第:{page} 页,共 {len(element_list)} 条数据")
for index_two, item in enumerate(element_list):
# print(element_list[index_two].text)
try:
re.findall("公告", item.text)
except IndexError:
log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
print("当前连接:" + str(browser.current_url))
print(data[len(data) - 1]["title"])
# 使用正则表达式进行匹配关键词
if re.findall(search_word, item.text):
# log.debug(f"找到了匹配的字符串:{matches}")
error = ""
else:
# 本次迭代帖子标题未匹配关键词,退出本次迭代,进入下一次迭代
continue
# log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
# print("当前连接:" + str(browser.current_url))
# print(data[len(data) - 1]["title"])
# 标题不包含"公告"和"看板"
if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
if re.findall("公告", element_list[index_two].text) or re.findall("看板",
element_list[index_two].text):
a = 1
else:
# 使用正则表达式进行匹配
# matches =
# log.debug(element_list[index_two].text+str(matches))
# 打印匹配结果
# if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
element_list[index_two].click()
time.sleep(0.1)
time.sleep(0.2)
# 原链接
browser_current_url = browser.current_url
# print(browser_current_url)
# log.debug('网页链接' + str(browser_current_url))
try:
# 获取帖子详情
element_title = browser.find_element('xpath',
......@@ -104,7 +113,8 @@ def reptile(browser=None, search_word=""):
log.debug(f'页面链接:{browser_current_url}')
# 浏览器返回上一页
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
element_list = browser.find_elements('xpath',
"//div[@class='r-ent']//div[@class='title']//a")
break
# 内容可能包含图片和视频,需要后处理
element_content = browser.find_element('xpath', "//div[@id='main-content']")
......@@ -183,6 +193,12 @@ def reptile(browser=None, search_word=""):
# ------------------ content 过滤 end--------------
date_string = element_release.text
# date_string = "Wed Aug 9 15:39:26 2023 //update 20934353"
# 提取日期字符串
if "//" in date_string:
date_string = date_string.split("//")[0]
date_string = date_string.strip()
date_format = "%a %b %d %H:%M:%S %Y"
# 将日期字符串转换为datetime对象
date_time = datetime.strptime(date_string, date_format)
......@@ -206,17 +222,28 @@ def reptile(browser=None, search_word=""):
data.append(obj)
# 浏览器返回上一页
browser.back()
time.sleep(0.1)
time.sleep(0.2)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
page = page + 1
# print("111111")
try:
prev_button = browser.find_element('xpath',
"//a[@class='btn wide' and text() = '‹ 上頁']")
prev_button.click()
time.sleep(0.3)
process_data()
except:
error = ""
# print("循环结束")
# 浏览器返回上一页
browser.back()
if index == 0:
browser.back()
time.sleep(0.1)
# 重新获取
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
process_data()
# # print("循环结束")
# # 浏览器返回上一页
# browser.back()
# if index == 0:
# browser.back()
# time.sleep(0.1)
# # 重新获取
# classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务
# print('----------------------')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment