Commit ab99c057 authored by liyang

fix: speed up the ptt crawl

parent 51625cb9
2023-07-11 20:04:32,430 ERROR pc_ptt.py : reptile [line: 94] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:47,713 ERROR pc_ptt.py : reptile [line: 64] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:51,168 ERROR pc_ptt.py : reptile [line: 64] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:24:12,330 ERROR pc_ptt.py : reptile [line: 64] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:27:18,984 ERROR pc_ptt.py : reptile [line: 64] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:28:31,234 ERROR pc_ptt.py : reptile [line: 64] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:30:08,742 ERROR pc_ptt.py : reptile [line: 64] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:32:45,950 ERROR pc_ptt.py : reptile [line: 65] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:35:08,341 ERROR pc_ptt.py : reptile [line: 65] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:39:23,710 ERROR pc_ptt.py : reptile [line: 66] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']
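The errors above all come from the same lookup: a clicked post sometimes has no `article-meta-value` header (deleted posts and announcement pages), so `find_element` raises and the crawler logs an error and backs out. A minimal sketch of a more tolerant lookup, assuming Selenium 4; `find_meta_value` is a hypothetical helper, not part of this commit:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

META_XPATH = "//div[@id='main-content']/div[3]//span[@class='article-meta-value']"

def find_meta_value(browser, timeout=2):
    """Return the first article-meta-value element, or None if it never appears."""
    try:
        return WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.XPATH, META_XPATH))
        )
    except TimeoutException:
        return None  # caller can skip the post instead of logging an error
```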
@@ -38,30 +38,37 @@ def reptile(browser=None, search_word=""):
     length = len(classify_item_list)
     for index in range(length):
         # For now, only crawl the 2nd category
-        if 1 < index < 3:
+        if 0 < index < 4:
+            type_title = classify_item_list[index].text
             classify_item_list[index].click()
-            # time.sleep(0.1)
+            time.sleep(0.1)
             element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
             length_two = len(element_list)
             for index_two in range(length_two):
-                # The title must not contain "公告" (announcement)
+                # The title must not contain "公告" (announcement) or "看板" (board)
+                if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
+                    a=1
+                else:
+                    log.debug(f"Crawling category: {type_title} - item {index_two + 1}")
                 # Match with a regular expression
-                # matches = re.findall("公告", element_list[index_two].text)
+                # matches =
                 # log.debug(element_list[index_two].text+str(matches))
                 # Print the match result
                 # if matches:
                 #     log.debug(f"Found a matching string: {matches}")
                 element_list[index_two].click()
-                # time.sleep(0.1)
+                time.sleep(0.1)
                 # Original link
                 browser_current_url = browser.current_url
-                log.debug('Page link: ' + str(browser_current_url))
+                # log.debug('Page link: ' + str(browser_current_url))
                 try:
                     # Fetch the post details
                     element_title = browser.find_element('xpath',
                                                          "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                 except:
                     log.error("xpath element not found: //div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+                    log.debug(f'Page link: {browser_current_url}')
                     # Go back one page in the browser
                     browser.back()
                     element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
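A note on the click-and-back pattern in this hunk: `browser.back()` invalidates every previously located element, which is why `element_list` has to be re-queried after each return. A sketch of an alternative that collects the hrefs up front and navigates directly; this is an assumption about how the loop could be restructured, not what the commit does:

```python
# Hypothetical restructure: read the hrefs before leaving the listing page,
# then navigate directly. No back() is needed, so the located elements never
# go stale and element_list needs no re-query.
links = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
post_urls = [a.get_attribute('href') for a in links]

for url in post_urls:
    browser.get(url)                 # direct navigation instead of click() + back()
    meta = find_meta_value(browser)  # tolerant lookup sketched above
    if meta is not None:
        log.debug(f"post title: {meta.text}")
```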
@@ -84,7 +91,8 @@ def reptile(browser=None, search_word=""):
                 date_time = datetime.strptime(date_string, date_format)
                 # Convert the datetime object into a timestamp (in seconds)
                 release_time = int(date_time.timestamp())
-                log.debug('Start determining the type')
+                # log.debug('Start determining the type')
                 # ---------------- determine type start ----------
                 # Type
                 content_type = ""
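For reference, the date conversion in this hunk in isolation. `date_format` is defined outside the diff, so the ctime-style pattern below is an assumption based on how PTT renders post dates:

```python
from datetime import datetime

# Hypothetical inputs: PTT shows dates in ctime style, e.g. "Tue Jul 11 20:04:32 2023".
date_string = "Tue Jul 11 20:04:32 2023"
date_format = "%a %b %d %H:%M:%S %Y"

date_time = datetime.strptime(date_string, date_format)
# timestamp() interprets a naive datetime as local time
release_time = int(date_time.timestamp())
print(release_time)  # 1689077072 when the local zone is UTC+8
```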
@@ -98,33 +106,34 @@ def reptile(browser=None, search_word=""):
                 except:
                     content_type = "文字"  # i.e. "text"
                 # ---------------- determine type end ----------
-                log.debug('Start content filtering')
+                # log.debug('Start content filtering')
                 # ------------------ content filtering start --------------
                 try:
                     # Find all <a> tags
                     a_tags = soup.find_all('a', href=True)
-                    log.debug("Number of <a> tags: " + str(len(a_tags)))
+                    # log.debug("Number of <a> tags: " + str(len(a_tags)))
                     # Loop over the <a> tags; if an <a> tag contains an <img> element, delete it
                     for tag in a_tags:
                         tag.decompose()
                 except:
-                    log.debug("Failed to find the <a> tags")
+                    # log.debug("Failed to find the <a> tags")
+                    a=1
                 try:
                     # Find all first-level `div` elements
                     div_elements = soup.find_all('div')
-                    log.debug("Number of first-level divs: " + str(len(div_elements)))
+                    # log.debug("Number of first-level divs: " + str(len(div_elements)))
                     # Remove these elements one by one
                     for div in div_elements:
                         div.extract()
                     # Remove first-level spans
                     span_element = soup.find_all('span')
-                    log.debug("Number of first-level spans: " + str(len(span_element)))
+                    # log.debug("Number of first-level spans: " + str(len(span_element)))
                     for span in span_element:
                         span.extract()
                 except:
-                    log.debug("Failed to remove the first-level divs")
+                    # log.debug("Failed to remove the first-level divs")
+                    a=2
                 html = soup.prettify().replace('amp;', '')
                 # ------------------ content filtering end --------------
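Two things worth noting about this filter: the code removes every `<a>` tag, not only those containing an `<img>` as the comment says, and `soup.find_all('div')` matches divs at any depth, not only the first level (`recursive=False` would be needed for that). A self-contained sketch with made-up input HTML:

```python
from bs4 import BeautifulSoup

# Made-up stand-in for a PTT post body: text plus link/reply/metadata markup.
html_doc = ("Post body text\n"
            "<a href='https://example.com/?a=1&amp;b=2'>a link</a>\n"
            "<div class='push'>a reply</div>\n"
            "<span class='f2'>metadata</span>\n")
soup = BeautifulSoup(html_doc, 'html.parser')

for tag in soup.find_all('a', href=True):
    tag.decompose()                 # remove the tag together with its contents
for div in soup.find_all('div'):    # matches at any depth, not just level 1
    div.extract()                   # detach the tag (it could still be reused)
for span in soup.find_all('span'):
    span.extract()

html = soup.prettify().replace('amp;', '')  # crude clean-up of '&amp;' leftovers
print(html)  # only "Post body text" remains
```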
@@ -140,24 +149,25 @@ def reptile(browser=None, search_word=""):
                 }
                 # --------------- assemble data end ---------------------
-                if search_word == "":
+                if search_word is None or search_word == str(search_word):
                     data.append(obj)
                 else:
                     # Match with a regular expression
+                    # log.debug(f"Keyword: {search_word}-{element_title.text}")
                     matches = re.findall(search_word, element_title.text)
                     # Print the match result
                     if matches:
                         # log.debug(f"Found a matching string: {matches}")
                         data.append(obj)
                     else:
-                        log.debug("No matching string found")
+                        # log.debug("No matching string found")
+                        a=3
                 # Go back one page in the browser
                 browser.back()
                 element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
         # Go back one page in the browser
         browser.back()
-        # time.sleep(1)
+        time.sleep(0.1)
         # Re-fetch the category list
         classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
......
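On the keyword branch in the last hunk: the new guard `search_word is None or search_word == str(search_word)` is true for any string, so every post appears to be appended and the regex branch is effectively skipped; also, `re.findall(search_word, ...)` treats the keyword as a regex pattern, so metacharacters in user input can distort the match. A sketch of an explicit literal-keyword filter; `keep_post` is a hypothetical helper name:

```python
import re

def keep_post(title, search_word=""):
    """Keep every post when no keyword is given, else require a literal match."""
    if not search_word:  # None or "" means no filtering
        return True
    # re.escape() makes the keyword literal, so '.' or '[' in it cannot act as regex syntax
    return re.search(re.escape(search_word), title) is not None

assert keep_post("[新聞] 測試標題")              # no keyword: kept
assert keep_post("[新聞] 測試標題", "測試")       # keyword present: kept
assert not keep_post("[新聞] 測試標題", "公告")   # keyword absent: dropped
```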