Commit ab99c057 authored by liyang's avatar liyang

fix: ptt 爬取加速

parent 51625cb9
2023-07-11 20:04:32,430 ERROR pc_ptt.py : reptile [line: 94] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:47,713 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:51,168 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:24:12,330 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:27:18,984 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:28:31,234 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:30:08,742 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:32:45,950 ERROR pc_ptt.py : reptile [line: 65] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:35:08,341 ERROR pc_ptt.py : reptile [line: 65] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:39:23,710 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
......@@ -38,30 +38,37 @@ def reptile(browser=None, search_word=""):
length = len(classify_item_list)
for index in range(length):
# 暂时先爬取 第2个 分类
if 1 < index < 3:
if 0 < index < 4:
type_title = classify_item_list[index].text
classify_item_list[index].click()
# time.sleep(0.1)
time.sleep(0.1)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list)
for index_two in range(length_two):
# 标题不包含"公告"
# 标题不包含"公告"和"看板"
if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
a=1
else:
log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
# 使用正则表达式进行匹配
# matches = re.findall("公告", element_list[index_two].text)
# matches =
# log.debug(element_list[index_two].text+str(matches))
# 打印匹配结果
# if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
element_list[index_two].click()
# time.sleep(0.1)
time.sleep(0.1)
# 原链接
browser_current_url = browser.current_url
log.debug('网页链接' + str(browser_current_url))
# log.debug('网页链接' + str(browser_current_url))
try:
# 获取帖子详情
element_title = browser.find_element('xpath',
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
except:
log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
log.debug(f'页面链接:{browser_current_url}')
# 浏览器返回上一页
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
......@@ -84,7 +91,8 @@ def reptile(browser=None, search_word=""):
date_time = datetime.strptime(date_string, date_format)
# 将datetime对象转换为时间戳(以秒为单位)
release_time = int(date_time.timestamp())
log.debug('开始判断类型')
# log.debug('开始判断类型')
# ---------------- 判断类型 start ----------
# 类型
content_type = ""
......@@ -98,33 +106,34 @@ def reptile(browser=None, search_word=""):
except:
content_type = "文字"
# ---------------- 判断类型 end ----------
log.debug('开始内容过滤')
# log.debug('开始内容过滤')
# ------------------ content 过滤 start--------------
try:
# 查找所有的<a>标签
a_tags = soup.find_all('a', href=True)
log.debug("a标签数量:" + str(len(a_tags)))
# log.debug("a标签数量:" + str(len(a_tags)))
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for tag in a_tags:
tag.decompose()
except:
log.debug("查找所有的<a>标签失败")
# log.debug("查找所有的<a>标签失败")
a=1
try:
# 找到所有第一级标签为 `div` 的元素
div_elements = soup.find_all('div')
log.debug("一级div数量:" + str(len(div_elements)))
# log.debug("一级div数量:" + str(len(div_elements)))
# 逐个删除这些元素
for div in div_elements:
div.extract()
# 删除第一级span
span_element = soup.find_all('span')
log.debug("一级span数量:" + str(len(span_element)))
# log.debug("一级span数量:" + str(len(span_element)))
for span in span_element:
span.extract()
except:
log.debug("删除第一级div失败")
# log.debug("删除第一级div失败")
a=2
html = soup.prettify().replace('amp;', '')
# ------------------ content 过滤 end--------------
......@@ -140,24 +149,25 @@ def reptile(browser=None, search_word=""):
}
# --------------- 组装数据 end---------------------
if search_word == "":
if search_word is None or search_word==str(search_word):
data.append(obj)
else:
# 使用正则表达式进行匹配
# log.debug(f"关键词:{search_word}-{element_title.text}")
matches = re.findall(search_word, element_title.text)
# 打印匹配结果
if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
data.append(obj)
else:
log.debug("未找到匹配的字符串")
# log.debug("未找到匹配的字符串")
a=3
# 浏览器返回上一页
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
# 浏览器返回上一页
browser.back()
# time.sleep(1)
time.sleep(0.1)
# 重新获取
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment