Commit 115c9564 authored by liyang

fix:爬取数据入库

parent 4b5870d4
...@@ -61,9 +61,9 @@ def write_to_database(data): ...@@ -61,9 +61,9 @@ def write_to_database(data):
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html" url = "https://www.ptt.cc/bbs/hotboards.html"
# 无头模式执行 # 无头模式执行
# browser = browser or create(['--headless']) browser = browser or create(['--headless'])
# 有头模式执行 # 有头模式执行
browser = browser or create() # browser = browser or create()
# 打开网页 # 打开网页
browser.get(url) browser.get(url)
log.debug("已打开浏览器") log.debug("已打开浏览器")
...@@ -130,20 +130,27 @@ def reptile(browser=None, search_word=""): ...@@ -130,20 +130,27 @@ def reptile(browser=None, search_word=""):
# ---------------- 判断类型 end ---------- # ---------------- 判断类型 end ----------
log.debug('开始内容过滤') log.debug('开始内容过滤')
# ------------------ content 过滤 start-------------- # ------------------ content 过滤 start--------------
# 查找所有的<a>标签 try:
a_tags = soup.find_all('a', href=True) # 查找所有的<a>标签
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签 a_tags = soup.find_all('a', href=True)
for tag in a_tags: # 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
tag.decompose() for tag in a_tags:
# 找到所有第一级标签为 `div` 的元素 tag.decompose()
div_elements = soup.find_all('div') except:
# 逐个删除这些元素 log.debug("查找所有的<a>标签失败")
for div in div_elements:
div.extract() try:
# 删除第一级span # 找到所有第一级标签为 `div` 的元素
span_element = soup.find_all('span') div_elements = soup.find_all('div')
for span in span_element: # 逐个删除这些元素
span.extract() for div in div_elements:
div.extract()
# 删除第一级span
span_element = soup.find_all('span')
for span in span_element:
span.extract()
except:
log.debug("删除第一级div失败")
html = soup.prettify().replace('amp;', '') html = soup.prettify().replace('amp;', '')
# ------------------ content 过滤 end-------------- # ------------------ content 过滤 end--------------
...@@ -162,7 +169,7 @@ def reptile(browser=None, search_word=""): ...@@ -162,7 +169,7 @@ def reptile(browser=None, search_word=""):
data.append(obj) data.append(obj)
# 使用正则表达式进行匹配 # 使用正则表达式进行匹配
# matches = re.findall(search_word, element_title.text) # matches = re.findall(search_word, element_title.text)
# # 打印匹配结果 # 打印匹配结果
# if matches: # if matches:
# # log.debug(f"找到了匹配的字符串:{matches}") # # log.debug(f"找到了匹配的字符串:{matches}")
# data.append(obj) # data.append(obj)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment