Commit 0af3679e authored by liyang's avatar liyang

fix:ptt执行效率

parent 2732252c
......@@ -22,6 +22,7 @@ from selenium.webdriver.support import expected_conditions as EC
# --------------- selenium 依赖 end ----------------
import platform
'''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
'''
......@@ -35,15 +36,19 @@ def reptile(browser=None, search_word=""):
"""
print(f"搜索词:{search_word}")
base_url = "https://twitter.com/"
if platform.system() == "Windows":
browser = browser or create(no_headless=True, using_user_data=True)
else:
browser = browser or create(no_headless=True, using_user_data=True)
browser = browser or create(no_headless=False, using_user_data=True)
# print(browser)
# 打开网页
browser.get(base_url)
time.sleep(2)
try:
try:
login_button = browser.find_element('xpath', "//a[@href='/login']")
login_button.click()
time.sleep(2)
except:
error = ""
# wait = WebDriverWait(browser, 20)
# wait.until(EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
# 检测是否要登录
......@@ -59,9 +64,11 @@ def reptile(browser=None, search_word=""):
# # 获取登录按钮
button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
button_login.click()
time.sleep(1)
time.sleep(2)
except:
print("------")
# print("------")
error = ""
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
browser.get(url)
wait = WebDriverWait(browser, 10)
......@@ -86,6 +93,15 @@ def reptile(browser=None, search_word=""):
except:
link_str = ""
timestamp = time.time()
# 删除多余div
parent_div = soup.find("div")
# 找到所有的 <div> 子元素
div_elements = parent_div.find_all('div', recursive=False)
for key, item in enumerate(div_elements):
if key == 0 or key == len(div_elements)-1:
item.extract()
author = element_authors_list[index].text
# 标题取:作者+日期
title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
......@@ -113,8 +129,8 @@ def reptile(browser=None, search_word=""):
custom_video["src"] = ""
parent_div.append(custom_video)
else:
print("")
# print("")
error = ""
picture_url = []
if len(image_list) > 0:
for key, element in enumerate(image_list):
......@@ -130,7 +146,9 @@ def reptile(browser=None, search_word=""):
element['src'] = access_address
picture_url.append(download_dir)
else:
print("")
# print("")
error = ""
content = soup.prettify()
# ---------------- 判断类型 start ----------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment