Commit d6d072eb authored by liyang

fix: write scraped data to the database

parent 3769935a
@@ -61,17 +61,15 @@ def write_to_database(data):
def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html"
browser = browser or create(['--headless'])
# browser = browser or create()
# time.sleep(1)
# Open the page
browser.get(url)
log.debug("Browser opened")
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# log.debug(classify_item_list)
length = len(classify_item_list)
for index in range(length):
if 1 < index < 3:
if 0 < index < 2:
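# 0 < index < 2 only holds for index == 1, so only the second board category is visited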
classify_item_list[index].click()
# if index==0:
time.sleep(1)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list)
@@ -104,35 +102,61 @@ def reptile(browser=None, search_word=""):
# ------------------------------------
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
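# soup is built from this element's innerHTML only, so the filtering below never touches the live page in the browser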
# Author
element_author = browser.find_element('xpath',
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
log.debug('Start detecting the content type')
# ---------------- content type detection: start ----------
# Find all img tags
img_tags = soup.find_all('img')
# Content type
content_type = ""
if len(img_tags) > 0:
content_type = "图文"  # image + text
else:
content_type = "文字"  # text only
# ---------------- content type detection: end ----------
log.debug('Start content filtering')
# ------------------ content filtering: start --------------
# Find all <a> tags
a_tags = soup.find_all('a', href=True)
# Remove every <a> tag found; decompose() drops the tag together with its contents
for tag in a_tags:
tag.decompose()
# Find all <div> elements (find_all searches the whole tree, not only the first level)
div_elements = soup.find_all('div')
# Remove them one by one
for div in div_elements:
div.extract()
# Remove all <span> elements
span_element = soup.find_all('span')
for span in span_element:
span.extract()
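# prettify() re-serializes what is left after the filtering above; the replace() strips stray 'amp;' fragments, presumably left over from double-escaped entities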
html = soup.prettify().replace('amp;', '')
# log.debug(html)
# log.debug('11111')
# ------------------------------------
# Assemble the data
print(html)
print("aaaaa")
# ------------------ content filtering: end --------------
# --------------- assemble data: start ---------------------
obj = {
"title": element_title.text,
"content": html,
"link": browser_current_url,
"reptileTime": str(int(time.time()))
"reptileTime": str(int(time.time())),
"type": content_type,
"author": element_author.text
}
# --------------- assemble data: end ---------------------
# ------------------------------------------------------
data.append(obj)
# # Match with a regular expression
# matches = re.findall(search_word, element_title.text)
# # Check the match result
# if matches:
# # log.debug(f"Matched string: {matches}")
# data.append(obj)
# else:
# log.debug("No matching string found")
# ------------------------------------------------------
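# Note: re.findall treats search_word as a regular expression; an empty search_word returns a non-empty list and therefore matches every title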
# Match the title against the search word with a regular expression
matches = re.findall(search_word, element_title.text)
# Check the match result
if matches:
# log.debug(f"Matched string: {matches}")
data.append(obj)
else:
log.debug("No matching string found")
# Go back to the previous page
browser.back()
@@ -185,11 +209,6 @@ def convert_to_traditional(simplified_text):
return traditional_text
# Global variables
data = []
table_name = "pms_ptt"
def main():
# Request the search keywords
response = getReptileTask()
@@ -208,4 +227,8 @@ def main():
# upload_control()
# Global variables
data = []
table_name = "pms_ptt"
# Call the main function
main()