Commit dae3fe3d authored by liyang

feat:ptt debug

parent 27018e2d
@@ -11,3 +11,12 @@
 2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
 2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
 2023-07-21 10:54:17,501 ERROR pc_ptt.py : reptile [line: 73] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:41:32,527 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:41:41,957 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:41:43,433 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:51:10,728 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:52:41,156 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:58:54,782 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:59:04,220 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 15:59:27,844 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
+2023-08-09 16:00:02,916 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
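The nine new log entries all show the same lookup failing at pc_ptt.py line 120: the `article-meta-value` xpath apparently matches nothing on some posts (those with no metadata block). Since this is a debug commit, dumping the offending page for offline inspection may help pin it down; a minimal sketch assuming the same Selenium `browser` object and `log` wrapper used in pc_ptt.py (`dump_debug_page` and `debug_dir` are hypothetical, not part of the commit):

```python
import os
import time


def dump_debug_page(browser, log, debug_dir="debug_pages"):
    """Save the failing page's URL and HTML so the xpath can be inspected offline."""
    os.makedirs(debug_dir, exist_ok=True)  # hypothetical dump location
    dump_path = os.path.join(debug_dir, f"{int(time.time() * 1000)}.html")
    with open(dump_path, "w", encoding="utf-8") as f:
        f.write(browser.page_source)  # Selenium exposes the rendered HTML here
    log.debug(f"dumped {browser.current_url} -> {dump_path}")
```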
@@ -8,6 +8,9 @@ import loguru
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
+from selenium.common import NoSuchElementException
+
 from api.index import importJson, getReptileTask, importJsonPath
 from utils.Logger import log
 from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
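`NoSuchElementException` is imported here, but the rewritten loop further down still wraps the failing `find_element` call in a bare `except:`. A narrower sketch of the same guard (the `find_meta_value` helper is hypothetical; the xpath and message are the ones already in the code):

```python
from selenium.common import NoSuchElementException

META_XPATH = "//div[@id='main-content']/div[3]//span[@class='article-meta-value']"


def find_meta_value(browser, log):
    """Return the article meta element, or None if the post has no metadata block."""
    try:
        return browser.find_element("xpath", META_XPATH)
    except NoSuchElementException:
        # Only swallow the missing-element case; other WebDriver errors still surface.
        log.error(f"xpath 找不到元素:{META_XPATH}")
        return None
```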
@@ -42,8 +45,8 @@ def reptile(browser=None, search_word=""):
     classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
     for index, item_element in enumerate(classify_item_list):
-        # For now, only crawl the 2nd category
-        if 0 <= index <= 14:
+        # Only crawl the general category
+        if 0 <= index < 1:
             type_title = classify_item_list[index].text
             # Enter the category page
             classify_item_list[index].click()
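The new guard `0 <= index < 1` only ever admits index 0, so it reduces to `index == 0`; an equivalent rewrite (not part of the commit) would be:

```python
for index, item_element in enumerate(classify_item_list):
    if index != 0:
        continue  # only the first board class (the general category) is crawled
    type_title = item_element.text
    item_element.click()
```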
@@ -59,164 +62,188 @@ def reptile(browser=None, search_word=""):
             wait = WebDriverWait(browser, 10)
             wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
-            element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
-            for index_two, item in enumerate(element_list):
-                # print(element_list[index_two].text)
-                try:
-                    re.findall("公告", item.text)
-                except IndexError:
-                    log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
-                print("当前连接:" + str(browser.current_url))
-                print(data[len(data) - 1]["title"])
-                # Match the keyword with a regular expression
-                if re.findall(search_word, item.text):
-                    # log.debug(f"找到了匹配的字符串:{matches}")
-                    error = ""
-                else:
-                    # This post's title did not match the keyword; skip to the next iteration
-                    continue
-                # The title must not contain "公告" or "看板"
-                if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
-                    a = 1
-                else:
-                    # Match with a regular expression
-                    # matches =
-                    # log.debug(element_list[index_two].text+str(matches))
-                    # Print the match result
-                    # if matches:
-                    #     log.debug(f"找到了匹配的字符串:{matches}")
-                    element_list[index_two].click()
-                    time.sleep(0.1)
-                    # Original link
-                    browser_current_url = browser.current_url
-                    # print(browser_current_url)
-                    # log.debug('网页链接' + str(browser_current_url))
-                    try:
-                        # Get the post details
-                        element_title = browser.find_element('xpath',
-                                                             "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
-                    except:
-                        log.error(
-                            "xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
-                        log.debug(f'页面链接:{browser_current_url}')
-                        # Browser goes back one page
-                        browser.back()
-                        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
-                        break
-                    # The content may contain images and videos; post-process it
-                    element_content = browser.find_element('xpath', "//div[@id='main-content']")
-                    # Remove <a> tags whose href value contains 'img'
-                    # ------------------------------------
-                    # Parse the HTML with BeautifulSoup
-                    soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
-                    # Author
-                    element_author = browser.find_element('xpath',
-                                                          "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
-                    # Release time
-                    element_release = browser.find_element('xpath',
-                                                           "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
-                    # log.debug('开始判断类型')
-                    try:
-                        # Find all top-level `div` elements
-                        div_elements = soup.find_all('div')
-                        # log.debug("一级div数量:" + str(len(div_elements)))
-                        # Remove them one by one
-                        for key, div in enumerate(div_elements):
-                            if key > 0:
-                                div.extract()
-                        # Remove the top-level spans
-                        span_element = soup.find_all('span')
-                        # log.debug("一级span数量:" + str(len(span_element)))
-                        for span in span_element:
-                            span.extract()
-                    except:
-                        # log.debug("删除第一级div失败")
-                        a = 2
-                    # ---------------- type detection start ----------
-                    # Type
-                    content_type = ""
-                    # Find all img tags
-                    image_list = soup.find_all('img')
-                    try:
-                        if len(image_list) > 0:
-                            content_type = "图文"
-                        else:
-                            content_type = "文字"
-                    except:
-                        content_type = "文字"
-                    picture_url = []
-                    if len(image_list) > 0:
-                        for key, element in enumerate(image_list):
-                            # Download the image locally and replace the src in the tag
-                            id = str(int(time.time()))
-                            # Download path
-                            download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
-                            # Access URL
-                            access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
-                            # Download status
-                            status = download_image(element['src'], download_dir)
-                            if status:
-                                element['src'] = access_address
-                                picture_url.append(download_dir)
-                    else:
-                        # print("")
-                        error = ""
-                    # ---------------- type detection end ----------
-                    # log.debug('开始内容过滤')
-                    # ------------------ content filtering start --------------
-                    try:
-                        # Find all <a> tags
-                        a_tags = soup.find_all('a', href=True)
-                        # log.debug("a标签数量:" + str(len(a_tags)))
-                        # Iterate over the <a> tags and remove any that contain an <img> element
-                        for tag in a_tags:
-                            tag.decompose()
-                    except:
-                        # log.debug("查找所有的<a>标签失败")
-                        a = 1
-                    html = soup.prettify().replace('amp;', '')
-                    # ------------------ content filtering end --------------
-                    date_string = element_release.text
-                    date_format = "%a %b %d %H:%M:%S %Y"
-                    # Convert the date string to a datetime object
-                    date_time = datetime.strptime(date_string, date_format)
-                    # Convert the datetime object to a timestamp (in seconds)
-                    release_time = int(date_time.timestamp())
-                    # Time filter
-                    if beginFiltrationTime <= release_time <= endFiltrationTime:
-                        # --------------- assemble data start ---------------------
-                        obj = {
-                            "title": element_title.text,
-                            "content": html,
-                            "link": browser_current_url,
-                            "reptileTime": str(int(time.time())),
-                            "type": content_type,
-                            "author": element_author.text,
-                            "releaseTime": str(release_time),
-                            "picture_url": ",".join(picture_url)
-                        }
-                        # --------------- assemble data end ---------------------
-                        data.append(obj)
-                    # Browser goes back one page
-                    browser.back()
-                    time.sleep(0.1)
-                    element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
-            # print("循环结束")
-            # Browser goes back one page
-            browser.back()
-            if index == 0:
-                browser.back()
-            time.sleep(0.1)
-            # Re-fetch
-            classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
+            page = 1
+            # element_meta_list = browser.find_elements("xpath", "//div[@class='r-ent']//div[@class='meta']")
+
+            def process_data():
+                # Add search
+                search_input = browser.find_element("xpath", "//div[@class='search-bar']//input")
+                if search_word != search_input.get_attribute("value"):
+                    # Type the search keyword
+                    search_input.send_keys(search_word)
+                    # Click the input box
+                    # browser.execute_script("arguments[0].click();", search_input)
+                    # Confirm the search keyword
+                    # search_input.click()
+                    search_input.submit()
+                    # Sleep 300 ms so the page can finish loading
+                    time.sleep(0.3)
+
+                element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
+                nonlocal page
+                print(f"当前为第:{page} 页,共 {len(element_list)} 条数据")
+                for index_two, item in enumerate(element_list):
+                    # print(element_list[index_two].text)
+                    try:
+                        re.findall("公告", item.text)
+                    except IndexError:
+                        error = ""
+                    # log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
+                    # print("当前连接:" + str(browser.current_url))
+                    # print(data[len(data) - 1]["title"])
+                    # The title must not contain "公告" or "看板"
+                    if re.findall("公告", element_list[index_two].text) or re.findall("看板",
+                                                                                      element_list[index_two].text):
+                        a = 1
+                    else:
+                        element_list[index_two].click()
+                        time.sleep(0.2)
+                        # Original link
+                        browser_current_url = browser.current_url
+                        try:
+                            # Get the post details
+                            element_title = browser.find_element('xpath',
+                                                                 "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+                        except:
+                            log.error(
+                                "xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+                            log.debug(f'页面链接:{browser_current_url}')
+                            # Browser goes back one page
+                            browser.back()
+                            element_list = browser.find_elements('xpath',
+                                                                 "//div[@class='r-ent']//div[@class='title']//a")
+                            break
+                        # The content may contain images and videos; post-process it
+                        element_content = browser.find_element('xpath', "//div[@id='main-content']")
+                        # Remove <a> tags whose href value contains 'img'
+                        # ------------------------------------
+                        # Parse the HTML with BeautifulSoup
+                        soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
+                        # Author
+                        element_author = browser.find_element('xpath',
+                                                              "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
+                        # Release time
+                        element_release = browser.find_element('xpath',
+                                                               "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
+                        # log.debug('开始判断类型')
+                        try:
+                            # Find all top-level `div` elements
+                            div_elements = soup.find_all('div')
+                            # log.debug("一级div数量:" + str(len(div_elements)))
+                            # Remove them one by one
+                            for key, div in enumerate(div_elements):
+                                if key > 0:
+                                    div.extract()
+                            # Remove the top-level spans
+                            span_element = soup.find_all('span')
+                            # log.debug("一级span数量:" + str(len(span_element)))
+                            for span in span_element:
+                                span.extract()
+                        except:
+                            # log.debug("删除第一级div失败")
+                            a = 2
+                        # ---------------- type detection start ----------
+                        # Type
+                        content_type = ""
+                        # Find all img tags
+                        image_list = soup.find_all('img')
+                        try:
+                            if len(image_list) > 0:
+                                content_type = "图文"
+                            else:
+                                content_type = "文字"
+                        except:
+                            content_type = "文字"
+                        picture_url = []
+                        if len(image_list) > 0:
+                            for key, element in enumerate(image_list):
+                                # Download the image locally and replace the src in the tag
+                                id = str(int(time.time()))
+                                # Download path
+                                download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
+                                # Access URL
+                                access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
+                                # Download status
+                                status = download_image(element['src'], download_dir)
+                                if status:
+                                    element['src'] = access_address
+                                    picture_url.append(download_dir)
+                        else:
+                            # print("")
+                            error = ""
+                        # ---------------- type detection end ----------
+                        # log.debug('开始内容过滤')
+                        # ------------------ content filtering start --------------
+                        try:
+                            # Find all <a> tags
+                            a_tags = soup.find_all('a', href=True)
+                            # log.debug("a标签数量:" + str(len(a_tags)))
+                            # Iterate over the <a> tags and remove any that contain an <img> element
+                            for tag in a_tags:
+                                tag.decompose()
+                        except:
+                            # log.debug("查找所有的<a>标签失败")
+                            a = 1
+                        html = soup.prettify().replace('amp;', '')
+                        # ------------------ content filtering end --------------
+                        date_string = element_release.text
+                        # date_string = "Wed Aug 9 15:39:26 2023 //update 20934353"
+                        # Extract the date string
+                        if "//" in date_string:
+                            date_string = date_string.split("//")[0]
+                        date_string = date_string.strip()
+                        date_format = "%a %b %d %H:%M:%S %Y"
+                        # Convert the date string to a datetime object
+                        date_time = datetime.strptime(date_string, date_format)
+                        # Convert the datetime object to a timestamp (in seconds)
+                        release_time = int(date_time.timestamp())
+
+                        # Time filter
+                        if beginFiltrationTime <= release_time <= endFiltrationTime:
+                            # --------------- assemble data start ---------------------
+                            obj = {
+                                "title": element_title.text,
+                                "content": html,
+                                "link": browser_current_url,
+                                "reptileTime": str(int(time.time())),
+                                "type": content_type,
+                                "author": element_author.text,
+                                "releaseTime": str(release_time),
+                                "picture_url": ",".join(picture_url)
+                            }
+                            # --------------- assemble data end ---------------------
+                            data.append(obj)
+                        # Browser goes back one page
+                        browser.back()
+                        time.sleep(0.2)
+                        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
+                page = page + 1
+                # print("111111")
+                try:
+                    prev_button = browser.find_element('xpath',
+                                                       "//a[@class='btn wide' and text() = '‹ 上頁']")
+                    prev_button.click()
+                    time.sleep(0.3)
+                    process_data()
+                except:
+                    error = ""
+
+            process_data()
+            # # print("循环结束")
+            # # Browser goes back one page
+            # browser.back()
+            # if index == 0:
+            #     browser.back()
+            # time.sleep(0.1)
+            # # Re-fetch
+            # classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
     # Send the crawled data to the Java service
     # print('----------------------')
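The date fix in this hunk handles PTT timestamps that carry an edit marker after the date, e.g. "Wed Aug 9 15:39:26 2023 //update 20934353", which previously made `strptime` raise ValueError on the unconverted tail. The parsing step in isolation, as a stdlib-only sketch (`parse_release_time` is a hypothetical name):

```python
from datetime import datetime


def parse_release_time(date_string):
    """Parse a PTT article timestamp, dropping any trailing '//...' edit marker."""
    if "//" in date_string:
        date_string = date_string.split("//")[0]
    date_string = date_string.strip()
    date_time = datetime.strptime(date_string, "%a %b %d %H:%M:%S %Y")
    return int(date_time.timestamp())


print(parse_release_time("Wed Aug 9 15:39:26 2023 //update 20934353"))
```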
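`process_data()` follows the '‹ 上頁' button by calling itself, so every listing page adds a Python stack frame, and a long board could hit the default recursion limit (about 1000 frames). The same traversal written as a loop, sketched against the same Selenium calls (`crawl_all_pages` and `handle_page` are hypothetical, not part of the commit):

```python
import time

from selenium.common import NoSuchElementException


def crawl_all_pages(browser, handle_page, pause=0.3):
    """Invoke handle_page(page) on each listing page, following '‹ 上頁' until it disappears."""
    page = 1
    while True:
        handle_page(page)
        try:
            prev_button = browser.find_element(
                "xpath", "//a[@class='btn wide' and text() = '‹ 上頁']")
        except NoSuchElementException:
            break  # no previous-page button left; oldest page reached
        prev_button.click()
        time.sleep(pause)  # crude wait, mirroring the commit's time.sleep(0.3)
        page += 1
```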