Commit ab634c7f authored by liyang

fix: scrape Twitter

parent ab99c057
@@ -47,7 +47,7 @@ def reptile(browser=None, search_word=""):
    for index_two in range(length_two):
        # Skip titles containing "公告" or "看板"
        if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
-            a=1
+            a = 1
        else:
            log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
            # Match with a regular expression
@@ -67,7 +67,8 @@ def reptile(browser=None, search_word=""):
                "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
        except:
-            log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+            log.error(
+                "xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
            log.debug(f'页面链接:{browser_current_url}')
            # Navigate the browser back to the previous page
            browser.back()
@@ -117,7 +118,7 @@ def reptile(browser=None, search_word=""):
                tag.decompose()
        except:
            # log.debug("查找所有的<a>标签失败")
-            a=1
+            a = 1
        try:
            # Find all first-level `div` elements
            div_elements = soup.find_all('div')
@@ -133,7 +134,7 @@ def reptile(browser=None, search_word=""):
        except:
            # log.debug("删除第一级div失败")
-            a=2
+            a = 2
        html = soup.prettify().replace('amp;', '')
        # ------------------ content filtering end --------------
@@ -149,7 +150,7 @@ def reptile(browser=None, search_word=""):
        }
        # --------------- data assembly end ---------------------
-        if search_word is None or search_word==str(search_word):
+        if search_word is None or search_word == str(search_word):
            data.append(obj)
        else:
            # Match with a regular expression
@@ -161,7 +162,7 @@ def reptile(browser=None, search_word=""):
                data.append(obj)
            else:
                # log.debug("未找到匹配的字符串")
-                a=3
+                a = 3
        # Navigate the browser back to the previous page
        browser.back()
        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
@@ -206,9 +207,14 @@ def reptile(browser=None, search_word=""):
    # time.sleep(3)
    browser.quit()

def main():
+    """
+    """
    # Request the keyword(s)
    response = getReptileTask()
+    global status_task
    # print(response)
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
@@ -217,8 +223,12 @@ def main():
            if item['name'] == 'ptt':
                search_word = item['keyword']
                table_name = item['tableName']
+                status_task = item["status"]
                # Convert Simplified to Traditional Chinese
-                reptile(None, convert_to_traditional(search_word))
+                if status_task == 0:
+                    reptile(None, convert_to_traditional(search_word))
+                else:
+                    log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        reptile(None, '')
@@ -228,5 +238,7 @@ def main():
# Global variables
data = []
table_name = "pms_ptt"
+# Whether the task is enabled
+status_task = '0'
# Call the main() function
main()
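The change above gates the PTT crawl on a new `status_task` flag taken from the task payload. Below is a minimal, self-contained sketch of that gating pattern; the task dict and the `reptile` / `convert_to_traditional` stubs are illustrative stand-ins, not the project's real helpers.

# Sketch of the status gating added in main(); stubs stand in for the real helpers.
def convert_to_traditional(text):
    return text  # stub: the real helper converts Simplified to Traditional Chinese

def reptile(browser=None, search_word=""):
    print(f"crawling for: {search_word}")  # stub for the real crawler

def run_task(item):
    # status == 0 means the task is enabled, mirroring the diff above
    if item.get("status") == 0:
        reptile(None, convert_to_traditional(item["keyword"]))
    else:
        print("crawl task not enabled")

run_task({"name": "ptt", "keyword": "测试", "tableName": "pms_ptt", "status": 1})

Note that the module-level default `status_task = '0'` is a string while the new check compares against the integer 0, so the crawl only runs when the API actually supplies a numeric status.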
@@ -22,85 +22,51 @@ def reptile(browser=None, search_word=""):
    # ['--headless']
    browser = browser or create()
    # print(browser)
-    # browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
-    # endDate = startDate = startDate + timedelta(days=i)
    # Open the page
    browser.get(url)
-    # WebDriverWait(browser,10).
-    # Open the login window
-    # open_button_login = WebDriverWait(browser, 10).until(
-    #     EC.presence_of_element_located((By.XPATH, "//a[@data-testid='login']")))
-    # open_button_login.click()
-    # time.sleep(5)
-    # Get the username/password input fields
-    # input_email_element = WebDriverWait(browser, 10).until(
-    #     EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
-    # # Get the "Next" button
-    # buttons = WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@role='button']")))
-    # for item in buttons:
-    #     print(BeautifulSoup(item, 'html.parser'))
-    # soup = BeautifulSoup(page_content, 'html.parser')
-    # input_pwd_element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "//input[@name='pass']")))
-    # # Get the login button
-    # button_login = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@name='login']")))
-    #
-    # input_email_element.send_keys("liyang19970814@gmail.com")
-    # input_pwd_element.send_keys("xn89kiPT/^Kaeg#")
-    # button_login.click()
-    # print("---------------")
-    # print(input_email_element)
-    # print(input_pwd_element)
-    # print(button_login)
-    # logger.debug(button)
-    # Simulate clicking the button repeatedly to load more data
-    # while button.is_enabled():
-    #     time.sleep(2)  # wait a moment to make sure the page has finished loading
-    #     try:
-    #         button.click()
-    #         button = WebDriverWait(browser, 5).until(
-    #             EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
-    #     except:
-    #         break
+    time.sleep(3)
+    try:
+        # Check whether login is required
+        login_input = browser.find_element('xpath', "//input[@autocomplete='username']")
+        login_input.send_keys("liyang1851603")
+        # Get the "Next" button
+        buttons = browser.find_element('xpath', "//div[@role='button'][2]")
+        buttons.click()
+        time.sleep(3)
+        password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
+        password_input.send_keys("liyang19970814")
+        # Get the login button
+        button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
+        button_login.click()
+    except:
+        print("------")
+    # print(333333)
    # time.sleep(3)
-    # Get the full paginated data
-    # page_content = browser.page_source
-    # soup = BeautifulSoup(page_content, 'html.parser')
-    # print("----------")
-    # print(soup)
-    # list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
-    # for index, item in enumerate(list_news):
-    #     logger.debug(item)
-    #     # Grab the image
-    #     image_key = image_key + 1
-    #     url_element = item.find('img', {"class": "css-rq4mmj"})
-    #     image_url = url_element['src'] if url_element else ""
-    #     # logger.debug(url)
-    #     if image_url:
-    #         # logger.debug(url)
-    #         # # Download the image
-    #         #
-    #         filename = f"{image_key}.jpg"
-    #         # logger.debug(filename)
-    #         # sys.exit()
-    #         download_image(image_url, f'{fileDir}images/{filename}')
-    #     # Grab the text
-    #     title_element = item.find('h4', {"class": "css-2fgx4k"})
-    #     introduction_element = item.find('p', {"class": "css-16nhkrn"})
-    #     title = title_element.get_text() if title_element else ""
-    #     introduction = introduction_element.get_text() if introduction_element else ""
-    #     news = {
-    #         "title": title,
-    #         "introduction": introduction,
-    #         "imageName": filename
-    #     }
-    #     data.append(news)
-    #     logger.debug(data)
-    # Save the data to a file
-    # with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
-    #     json.dump(data, file, indent=2, ensure_ascii=False)
+    # Authors
+    element_authors_list = browser.find_elements('xpath',
+        "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//a[@role='link']//div[@dir='ltr']")
+    print(element_authors_list)
+    print("2222")
+    # Publication time
+    element_release_list = browser.find_elements('xpath',
+        "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//div[2]//time[@datetime]")
+    # Title
+    # element_title_list = browser.find_element('xpath',)
+    # Content
+    element_content_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")
+    # print(element_content_list)
+    length = len(element_authors_list)
+    print(length)
+    for index in range(length):
+        author = element_authors_list[index].text
+        release_time = element_release_list[index].get_attribute("datetime")
+        content = element_content_list[index]
+        print(content)
+        # Content filtering
+        # Parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(content.get_attribute("innerHTML"), 'html.parser')
+        print(soup)
+        print("-----")
    # browser.close()
    # # Close the browser driver
    # browser.quit()
...
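The new login flow above relies on fixed time.sleep() calls and bare find_element lookups. An equivalent variant using Selenium explicit waits is sketched below under the assumption that the same XPaths apply; the credentials are placeholders and `browser` is an already-created WebDriver, not the project's `create()` helper.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login_twitter(browser, username, password, timeout=10):
    # Wait for each element instead of sleeping a fixed number of seconds
    wait = WebDriverWait(browser, timeout)
    login_input = wait.until(EC.presence_of_element_located(
        (By.XPATH, "//input[@autocomplete='username']")))
    login_input.send_keys(username)
    # "Next" button, same XPath as in the diff above
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//div[@role='button'][2]"))).click()
    password_input = wait.until(EC.presence_of_element_located(
        (By.XPATH, "//input[@autocomplete='current-password']")))
    password_input.send_keys(password)
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//div[@data-testid='LoginForm_Login_Button']"))).click()

Wrapping the whole login in try/except, as the diff does, lets the crawler continue when the session is already authenticated and the username field never appears.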
@@ -125,6 +125,7 @@ def main():
    # Request the keyword(s)
    response = getReptileTask()
    # print(response)
+    global status_task
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
@@ -132,7 +133,11 @@ def main():
            if item['name'] == 'youtube':
                search_word = item['keyword']
                table_name = item['tableName']
-                reptile(None, convert_to_traditional(search_word))
+                status_task = item["status"]
+                if status_task == 0:
+                    reptile(None, convert_to_traditional(search_word))
+                else:
+                    log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        reptile(None, '')
@@ -142,5 +147,7 @@ def main():
# Global variables
data = []
table_name = "pms_youtube"
+# Whether the task is enabled
+status_task = '0'
# Call the main() function
main()
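Both main() functions pass the keyword through convert_to_traditional() before crawling. That helper is not part of this commit; if it wraps OpenCC (an assumption, shown here only as a hypothetical sketch), it could look like this:

# Hypothetical convert_to_traditional(); assumes the OpenCC package is installed
# (e.g. pip install opencc-python-reimplemented).
from opencc import OpenCC

_s2t = OpenCC('s2t')  # Simplified-to-Traditional converter

def convert_to_traditional(text):
    return _s2t.convert(text or "")

# Example: convert_to_traditional("体育") returns "體育"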