Commit 5773068e authored by liyang

fix: targeted data scraping to work around Twitter's anti-crawler measures

parent e0c2ddfc
@@ -7,6 +7,7 @@ from utils.filse import save_json
 from api.index import importJson, getReptileTask, importJsonPath
 from utils.index import convert_to_traditional, yt_dlp_download,convert_string_to_time,parse_time_string
 # from pytube import YouTube
+from datetime import datetime
 import os
 from config.settings import get_base_file_url
@@ -19,16 +20,9 @@ from config.settings import get_base_file_url
 def reptile(browser=None, search_word=""):
     url = "https://www.facebook.com/"
     option = ['--headless']
-    # ['--headless']
     browser = browser or create(option)
-    # year = datetime(2021, 1, 1)
-    # startDate = datetime(2020, 12, 31)  # start date
-    # endDate = datetime(2020, 12, 31)  # end date
-    # print(browser)
     # Open the page
     browser.get(url)
-    # print("00000000000000000")
-    # time.sleep(3)
     try:
         # Check whether login is required
         login_input = browser.find_element('xpath',"//input[@name='email']")
@@ -40,25 +34,17 @@ def reptile(browser=None, search_word=""):
         button_login.click()
         time.sleep(3)
     except:
-        # print("------")
-        a=1
-    # time.sleep(3)
+        print("error")
     url = f"https://www.facebook.com/search/top?q={search_word}"
     browser.get(url)
     # Scroll the page to the bottom with JavaScript
     browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
     time.sleep(3)
     # Collection of post blocks
     elements = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]")
-    # print(333333)
-    # time.sleep(3)
     # Authors
     element_authors_list = browser.find_elements('xpath',
                                                  "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
-    # print(element_authors_list)
-    # print("2222")
     # Release times
     element_release_list = browser.find_elements('xpath',
                                                  "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
@@ -66,24 +52,19 @@ def reptile(browser=None, search_word=""):
     elements_expand_list = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
     for item in elements_expand_list:
         item.click()
-    # time.sleep(2)
     # Content
     element_content_list = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
     # print(element_content_list)
     length = len(elements)
     # print(length)
     for index in range(length):
         author = element_authors_list[index].text
-        # el = element_release_list[index]
-        # # datetime_el = el.get_attribute("datetime")
-        # html = el.text
-        # Strip the html tags embedded in the time string
-        # BeautifulSoup(element_release_list[index].get_attribute("innerHTML"),"html.parser").get_text()
         release_time = str(int(parse_time_string(element_release_list[index].text)))
-        content = element_content_list[index].get_attribute("innerHTML")
+        content = element_content_list[index].get_attribute("outerHTML")
         # Parse the HTML with BeautifulSoup
         soup = BeautifulSoup(element_content_list[index].get_attribute('innerHTML'), 'html.parser')
+        # Title: author + release date
+        title = f"{author}-{datetime.fromtimestamp(int(parse_time_string(element_release_list[index].text)))}"
         # ---------------- determine type: start ----------
         # Type
         content_type = ""
@@ -99,7 +80,7 @@ def reptile(browser=None, search_word=""):
         # ---------------- determine type: end ----------
         # --------------- assemble data: start ---------------------
         obj = {
-            "title": "",
+            "title": title,
             "content": content,
             "link": element_release_list[index].get_attribute("href"),
             "reptileTime": str(int(time.time())),
......
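
The main functional change in this file is that each Facebook post now carries a title built from its author plus the human-readable release date. A minimal sketch of that construction, assuming parse_time_string returns a Unix timestamp in seconds (build_title is a hypothetical name used only for illustration):

    from datetime import datetime

    def build_title(author: str, release_ts: int) -> str:
        # Same format as the diff produces: "<author>-<YYYY-MM-DD HH:MM:SS>"
        return f"{author}-{datetime.fromtimestamp(release_ts)}"

    # build_title("Some Page", 1689638400)
    # -> "Some Page-2023-07-18 00:00:00" (rendered in local time; the exact
    #    date/hour depends on the machine's timezone)
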
@@ -79,7 +79,7 @@ def reptile(browser=None, search_word=""):
     # Remove a tags whose href attribute contains 'img'
     # ------------------------------------
     # Parse the HTML with BeautifulSoup
-    soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
+    soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
     # Author
     element_author = browser.find_element('xpath',
                                           "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
......
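
Both this hunk and the Facebook one swap innerHTML for outerHTML, so the parsed soup includes the container element itself rather than only its children. A self-contained illustration of the difference using BeautifulSoup equivalents:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div class='post'><p>hello</p></div>", "html.parser")
    el = soup.div

    inner = el.decode_contents()  # innerHTML equivalent: "<p>hello</p>"
    outer = str(el)               # outerHTML equivalent: "<div class='post'><p>hello</p></div>"

Keeping the outer tag means the stored content preserves the container's own tag and attributes, not just its children.
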
@@ -8,6 +8,7 @@ from api.index import importJson, getReptileTask, importJsonPath
 from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string
 # from pytube import YouTube
 import os
+from datetime import datetime
 from config.settings import get_base_file_url
 # Utility function - download images
@@ -17,13 +18,13 @@ from config.settings import get_base_file_url
 def reptile(browser=None, search_word=""):
-    url = "https://twitter.com/"
+    base_url = "https://twitter.com/"
     option = ['--headless']
     # ['--headless']
     browser = browser or create(None, False)
     # print(browser)
     # Open the page
-    browser.get(url)
+    browser.get(base_url)
     time.sleep(3)
     try:
         # Check whether login is required
@@ -43,29 +44,32 @@ def reptile(browser=None, search_word=""):
     time.sleep(2)
     url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
     browser.get(url)
-    time.sleep(3)
+    time.sleep(4)
+    base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
     # Content blocks
-    element_content_list = browser.find_elements('xpath',
-                                                 "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")
+    element_content_list = browser.find_elements('xpath', base_xpath)
     # Authors
-    element_authors_list = browser.find_elements('xpath',
-                                                 "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']/div[1]//a[@role='link']")
+    element_authors_list = browser.find_elements('xpath', f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
+    # time.sleep(2)
     # Release time
-    element_release_list = browser.find_elements('xpath',
-                                                 "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//div[2]//time[@datetime]")
+    # element_release_list = browser.find_elements('xpath', f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time")
+    # time_a_list = browser.find_elements('xpath', f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time/..")
     # print(element_content_list)
     length = len(element_authors_list)
     for index in range(length):
+        # print(index)
+        content = element_content_list[index].get_attribute("outerHTML")
+        soup = BeautifulSoup(content, "html.parser")
+        # Find the time tag
+        time_soup = soup.find('time')
+        timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
+        link_soup = time_soup.parent
+        link_str = base_url + link_soup["href"]
         author = element_authors_list[index].text
-        try:
-            release_time = str(int(parse_twitter_time_string(element_release_list[index].text)))
-        except:
-            release_time = str(int(time.time()))
-        content = element_content_list[index].get_attribute("innerHTML")
-        # print(content)
-        # Content filtering
-        # Parse the HTML with BeautifulSoup
-        soup = BeautifulSoup(content, 'html.parser')
+        # Title: author + date
+        title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
         # ---------------- determine type: start ----------
         # Type
         content_type = ""
@@ -81,15 +85,16 @@ def reptile(browser=None, search_word=""):
         # ---------------- determine type: end ----------
         # --------------- assemble data: start ---------------------
         obj = {
-            "title": "",
+            "title": title,
             "content": content,
-            "link": "",
+            "link": link_str,
             "reptileTime": str(int(time.time())),
             "type": content_type,
             "author": author,
-            "releaseTime": release_time
+            "releaseTime": str(int(timestamp))
         }
         # --------------- assemble data: end ---------------------
+        data.append(obj)
     # Send the scraped data to the java service
     # print('----------------------')
......
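
This hunk is the heart of the anti-crawler fix: rather than issuing separate XPath queries for the timestamp and permalink (which can return lists that no longer line up once Twitter re-renders feed nodes), each tweet's outerHTML is snapshotted once and the time element is read out of that static copy. A minimal sketch of the parsing step; the sample HTML fragment is illustrative, not real Twitter markup:

    from datetime import datetime
    from bs4 import BeautifulSoup

    base_url = "https://twitter.com/"
    html = '<a href="/someuser/status/1"><time datetime="2023-07-18T08:30:00.000Z">Jul 18</time></a>'

    soup = BeautifulSoup(html, "html.parser")
    time_tag = soup.find("time")
    # fromisoformat() rejects a trailing "Z" before Python 3.11, hence the replace()
    timestamp = datetime.fromisoformat(time_tag["datetime"].replace("Z", "+00:00")).timestamp()
    # The permalink is the href of the time tag's parent <a>; lstrip("/") avoids
    # the double slash that base_url + href would otherwise produce
    link = base_url + time_tag.parent["href"].lstrip("/")

Note that the committed code concatenates base_url and the href directly, which yields "https://twitter.com//...": most servers tolerate the double slash, but stripping the leading "/" as above is safer.
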
@@ -22,7 +22,7 @@ def parse_time_string(time_str):
     :param time_str:
     :return:
     """
-    log.debug(f'Converting facebook release time: {time_str}')
+    # log.debug(f'Converting facebook release time: {time_str}')
     if "天" in time_str:
         number = int(time_str.split("天")[0])
         time_delta = datetime.timedelta(days=number)
......
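
For context, parse_time_string converts Facebook's localized relative timestamps ("3天" means "3 days ago"; "天" is "day") into absolute Unix timestamps by subtracting a timedelta from the current time. A minimal self-contained sketch of the idea, under the hypothetical name parse_relative_days (the real helper in utils presumably handles hour/minute units the same way):

    import datetime

    def parse_relative_days(time_str: str) -> float:
        # "3天" -> Unix timestamp for three days before now
        if "天" in time_str:
            days = int(time_str.split("天")[0])
            return (datetime.datetime.now() - datetime.timedelta(days=days)).timestamp()
        raise ValueError(f"unsupported time string: {time_str!r}")
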