Commit 2732252c authored by liyang

fix:ptt执行效率

parent f71f84a0
......@@ -33,17 +33,15 @@ from selenium.webdriver.support import expected_conditions as EC
def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html"
browser = browser or create(no_headless=True, using_user_data=True)
browser = browser or create(no_headless=False, using_user_data=True)
# 有头模式执行
# browser = browser or create()
# 打开网页
browser.get(url)
# log.debug("已打开浏览器")
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# log.debug(classify_item_list)
# classify_item_list = item_list.copy()
length = len(classify_item_list)
for index in range(length):
for index,item_element in enumerate(classify_item_list):
# 暂时先爬取 第2个 分类
if 0 <= index < 4:
type_title = classify_item_list[index].text
......@@ -62,23 +60,23 @@ def reptile(browser=None, search_word=""):
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list)
for index_two in range(length_two):
for index_two, item in enumerate(element_list):
# print(element_list[index_two].text)
try:
re.findall("公告", element_list[index_two].text)
re.findall("公告", item.text)
except IndexError:
log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
print("当前连接:"+str(browser.current_url))
print(data[len(data)-1]["title"])
print("当前连接:" + str(browser.current_url))
print(data[len(data) - 1]["title"])
# 使用正则表达式进行匹配关键词
if re.findall(search_word, element_list[index_two].text):
if re.findall(search_word, item.text):
# log.debug(f"找到了匹配的字符串:{matches}")
error = ""
else:
# log.debug("未找到匹配的字符串")
# 退出本次迭代,进入下一次迭代
# 本次迭代帖子标题未匹配关键词,退出本次迭代,进入下一次迭代
continue
# 标题不包含"公告"和"看板"
if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
a = 1
......@@ -201,10 +199,13 @@ def reptile(browser=None, search_word=""):
"picture_url": ",".join(picture_url)
}
# --------------- 组装数据 end---------------------
data.append(obj)
# 浏览器返回上一页
browser.back()
time.sleep(0.1)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
print("循环结束")
# 浏览器返回上一页
browser.back()
if index == 0:
......@@ -224,11 +225,11 @@ def reptile(browser=None, search_word=""):
log.debug('save file success')
else:
log.debug('save file failed')
# script_close(browser)
script_close(browser)
else:
# 爬取数据为空
log.info("未爬取到数据")
# script_close(browser)
script_close(browser)
def script_close(browser):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment