Commit dae3fe3d authored by liyang's avatar liyang

feat:ptt debug

parent 27018e2d
...@@ -11,3 +11,12 @@ ...@@ -11,3 +11,12 @@
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value'] 2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value'] 2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-21 10:54:17,501 ERROR pc_ptt.py : reptile [line: 73] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value'] 2023-07-21 10:54:17,501 ERROR pc_ptt.py : reptile [line: 73] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:32,527 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:41,957 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:43,433 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:51:10,728 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:52:41,156 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:58:54,782 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:04,220 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:27,844 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 16:00:02,916 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
...@@ -8,6 +8,9 @@ import loguru ...@@ -8,6 +8,9 @@ import loguru
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from datetime import datetime from datetime import datetime
from selenium.common import NoSuchElementException
from api.index import importJson, getReptileTask, importJsonPath from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
...@@ -42,8 +45,8 @@ def reptile(browser=None, search_word=""): ...@@ -42,8 +45,8 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
for index, item_element in enumerate(classify_item_list): for index, item_element in enumerate(classify_item_list):
# 暂时先爬取 第2个 分类 # 只爬取综合分类
if 0 <= index <= 14: if 0 <= index < 1:
type_title = classify_item_list[index].text type_title = classify_item_list[index].text
# 进入分类页面 # 进入分类页面
classify_item_list[index].click() classify_item_list[index].click()
...@@ -59,40 +62,46 @@ def reptile(browser=None, search_word=""): ...@@ -59,40 +62,46 @@ def reptile(browser=None, search_word=""):
wait = WebDriverWait(browser, 10) wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']"))) wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a") page = 1
# element_meta_list = browser.find_elements("xpath", "//div[@class='r-ent']//div[@class='meta']")
def process_data():
# 增加搜索
search_input = browser.find_element("xpath", "//div[@class='search-bar']//input")
if search_word != search_input.get_attribute("value"):
# 输入搜索关键词
search_input.send_keys(search_word)
# 点击输入框
# browser.execute_script("arguments[0].click();", search_input)
# 确认搜索关键词
# search_input.click()
search_input.submit()
# 程序睡眠300ms,等待页面加载完成
time.sleep(0.3)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
nonlocal page
print(f"当前为第:{page} 页,共 {len(element_list)} 条数据")
for index_two, item in enumerate(element_list): for index_two, item in enumerate(element_list):
# print(element_list[index_two].text) # print(element_list[index_two].text)
try: try:
re.findall("公告", item.text) re.findall("公告", item.text)
except IndexError: except IndexError:
log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
print("当前连接:" + str(browser.current_url))
print(data[len(data) - 1]["title"])
# 使用正则表达式进行匹配关键词
if re.findall(search_word, item.text):
# log.debug(f"找到了匹配的字符串:{matches}")
error = "" error = ""
else: # log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
# 本次迭代帖子标题未匹配关键词,退出本次迭代,进入下一次迭代 # print("当前连接:" + str(browser.current_url))
continue # print(data[len(data) - 1]["title"])
# 标题不包含"公告"和"看板" # 标题不包含"公告"和"看板"
if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text): if re.findall("公告", element_list[index_two].text) or re.findall("看板",
element_list[index_two].text):
a = 1 a = 1
else: else:
# 使用正则表达式进行匹配
# matches =
# log.debug(element_list[index_two].text+str(matches))
# 打印匹配结果
# if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
element_list[index_two].click() element_list[index_two].click()
time.sleep(0.1) time.sleep(0.2)
# 原链接 # 原链接
browser_current_url = browser.current_url browser_current_url = browser.current_url
# print(browser_current_url)
# log.debug('网页链接' + str(browser_current_url))
try: try:
# 获取帖子详情 # 获取帖子详情
element_title = browser.find_element('xpath', element_title = browser.find_element('xpath',
...@@ -104,7 +113,8 @@ def reptile(browser=None, search_word=""): ...@@ -104,7 +113,8 @@ def reptile(browser=None, search_word=""):
log.debug(f'页面链接:{browser_current_url}') log.debug(f'页面链接:{browser_current_url}')
# 浏览器返回上一页 # 浏览器返回上一页
browser.back() browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a") element_list = browser.find_elements('xpath',
"//div[@class='r-ent']//div[@class='title']//a")
break break
# 内容可能包含图片和视频,需要后处理 # 内容可能包含图片和视频,需要后处理
element_content = browser.find_element('xpath', "//div[@id='main-content']") element_content = browser.find_element('xpath', "//div[@id='main-content']")
...@@ -183,6 +193,12 @@ def reptile(browser=None, search_word=""): ...@@ -183,6 +193,12 @@ def reptile(browser=None, search_word=""):
# ------------------ content 过滤 end-------------- # ------------------ content 过滤 end--------------
date_string = element_release.text date_string = element_release.text
# date_string = "Wed Aug 9 15:39:26 2023 //update 20934353"
# 提取日期字符串
if "//" in date_string:
date_string = date_string.split("//")[0]
date_string = date_string.strip()
date_format = "%a %b %d %H:%M:%S %Y" date_format = "%a %b %d %H:%M:%S %Y"
# 将日期字符串转换为datetime对象 # 将日期字符串转换为datetime对象
date_time = datetime.strptime(date_string, date_format) date_time = datetime.strptime(date_string, date_format)
...@@ -206,17 +222,28 @@ def reptile(browser=None, search_word=""): ...@@ -206,17 +222,28 @@ def reptile(browser=None, search_word=""):
data.append(obj) data.append(obj)
# 浏览器返回上一页 # 浏览器返回上一页
browser.back() browser.back()
time.sleep(0.1) time.sleep(0.2)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a") element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
page = page + 1
# print("111111")
try:
prev_button = browser.find_element('xpath',
"//a[@class='btn wide' and text() = '‹ 上頁']")
prev_button.click()
time.sleep(0.3)
process_data()
except:
error = ""
# print("循环结束") process_data()
# 浏览器返回上一页 # # print("循环结束")
browser.back() # # 浏览器返回上一页
if index == 0: # browser.back()
browser.back() # if index == 0:
time.sleep(0.1) # browser.back()
# 重新获取 # time.sleep(0.1)
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") # # 重新获取
# classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务 # 发送爬取数据到java服务
# print('----------------------') # print('----------------------')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment