Commit 115c9564 authored by liyang

fix:爬取数据入库

parent 4b5870d4
......@@ -61,9 +61,9 @@ def write_to_database(data):
def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html"
# 无头模式执行
# browser = browser or create(['--headless'])
browser = browser or create(['--headless'])
# 有头模式执行
browser = browser or create()
# browser = browser or create()
# 打开网页
browser.get(url)
log.debug("已打开浏览器")
......@@ -130,20 +130,27 @@ def reptile(browser=None, search_word=""):
# ---------------- 判断类型 end ----------
log.debug('开始内容过滤')
# ------------------ content 过滤 start--------------
# 查找所有的<a>标签
a_tags = soup.find_all('a', href=True)
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for tag in a_tags:
tag.decompose()
# 找到所有第一级标签为 `div` 的元素
div_elements = soup.find_all('div')
# 逐个删除这些元素
for div in div_elements:
div.extract()
# 删除第一级span
span_element = soup.find_all('span')
for span in span_element:
span.extract()
try:
# 查找所有的<a>标签
a_tags = soup.find_all('a', href=True)
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for tag in a_tags:
tag.decompose()
except:
log.debug("查找所有的<a>标签失败")
try:
# 找到所有第一级标签为 `div` 的元素
div_elements = soup.find_all('div')
# 逐个删除这些元素
for div in div_elements:
div.extract()
# 删除第一级span
span_element = soup.find_all('span')
for span in span_element:
span.extract()
except:
log.debug("删除第一级div失败")
html = soup.prettify().replace('amp;', '')
# ------------------ content 过滤 end--------------
......@@ -162,7 +169,7 @@ def reptile(browser=None, search_word=""):
data.append(obj)
# 使用正则表达式进行匹配
# matches = re.findall(search_word, element_title.text)
# # 打印匹配结果
# 打印匹配结果
# if matches:
# # log.debug(f"找到了匹配的字符串:{matches}")
# data.append(obj)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment