Commit 8bbe2730 authored by liyang

fix:爬取数据入库

parent 4f920e0f
...@@ -133,6 +133,7 @@ def reptile(browser=None, search_word=""): ...@@ -133,6 +133,7 @@ def reptile(browser=None, search_word=""):
try: try:
# 查找所有的<a>标签 # 查找所有的<a>标签
a_tags = soup.find_all('a', href=True) a_tags = soup.find_all('a', href=True)
log.debug("a标签数量:" + str(len(a_tags)))
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签 # 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for tag in a_tags: for tag in a_tags:
tag.decompose() tag.decompose()
...@@ -142,11 +143,13 @@ def reptile(browser=None, search_word=""): ...@@ -142,11 +143,13 @@ def reptile(browser=None, search_word=""):
try: try:
# 找到所有第一级标签为 `div` 的元素 # 找到所有第一级标签为 `div` 的元素
div_elements = soup.find_all('div') div_elements = soup.find_all('div')
log.debug("一级div数量:" + str(len(div_elements)))
# 逐个删除这些元素 # 逐个删除这些元素
for div in div_elements: for div in div_elements:
div.extract() div.extract()
# 删除第一级span # 删除第一级span
span_element = soup.find_all('span') span_element = soup.find_all('span')
log.debug("一级span数量:" + str(len(span_element)))
for span in span_element: for span in span_element:
span.extract() span.extract()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment