Commit 5773068e authored by liyang

fix: targeted data scraping to work around Twitter's anti-crawler measures

parent e0c2ddfc
......@@ -7,6 +7,7 @@ from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
# from pytube import YouTube
from datetime import datetime
import os
from config.settings import get_base_file_url
......@@ -19,16 +20,9 @@ from config.settings import get_base_file_url
def reptile(browser=None, search_word=""):
url = "https://www.facebook.com/"
option = ['--headless']
# ['--headless']
browser = browser or create(option)
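# Reuse the caller's browser when one is passed in; otherwise spin up a new instance via the repo's create() helper (assumed to wrap Selenium WebDriver).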
# year = datetime(2021, 1, 1)
# startDate = datetime(2020, 12, 31) # start date
# endDate = datetime(2020, 12, 31) # end date
# print(browser)
# Open the page
browser.get(url)
# print("00000000000000000")
# time.sleep(3)
try:
# Check whether login is required
login_input = browser.find_element('xpath',"//input[@name='email']")
......@@ -40,25 +34,17 @@ def reptile(browser=None, search_word=""):
button_login.click()
time.sleep(3)
except:
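# Login form not found (already logged in, or the markup changed): skip the login step and continue.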
# print("------")
a=1
# time.sleep(3)
print("error")
url = f"https://www.facebook.com/search/top?q={search_word}"
browser.get(url)
# Use JavaScript to scroll the page to the bottom
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
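# The search feed is lazy-loaded: scrolling to the bottom pulls in results and the fixed sleep gives them time to render.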
# Collection of post blocks
elements = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]")
# print(333333)
# time.sleep(3)
# Author
element_authors_list = browser.find_elements('xpath',
"//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
# print(element_authors_list)
# print("2222")
# Release time
element_release_list = browser.find_elements('xpath',
"//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
......@@ -66,24 +52,19 @@ def reptile(browser=None, search_word=""):
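# '展开' ('See more') buttons: click each one so truncated posts expose their full text before the content is read.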
elements_expand_list = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
for item in elements_expand_list:
item.click()
# time.sleep(2)
# Content
element_content_list = browser.find_elements('xpath',"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
# print(element_content_list)
length = len(elements)
# print(length)
for index in range(length):
author = element_authors_list[index].text
# el = element_release_list[index]
# # datetime_el = el.get_attribute("datetime")
# html = el.text
# Strip the HTML tags contained in the time string
# BeautifulSoup(element_release_list[index].get_attribute("innerHTML"),"html.parser").get_text()
release_time = str(int(parse_time_string(element_release_list[index].text)))
content = element_content_list[index].get_attribute("innerHTML")
content = element_content_list[index].get_attribute("outerHTML")
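# outerHTML keeps the post's wrapper element itself; the old innerHTML call on the line above captured only its children.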
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(element_content_list[index].get_attribute('innerHTML'), 'html.parser')
# Title = author + date
title = f"{author}-{datetime.fromtimestamp(int(parse_time_string(element_release_list[index].text)))}"
# ---------------- determine content type: start ----------
# Type
content_type = ""
......@@ -99,7 +80,7 @@ def reptile(browser=None, search_word=""):
# ---------------- determine content type: end ----------
# --------------- assemble data: start ---------------------
obj = {
"title": "",
"title": title,
"content": content,
"link": element_release_list[index].get_attribute("href"),
"reptileTime": str(int(time.time())),
......
......@@ -79,7 +79,7 @@ def reptile(browser=None, search_word=""):
# Remove <a> tags whose href attribute value contains 'img'
# ------------------------------------
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
# Author
element_author = browser.find_element('xpath',
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
......
......@@ -8,6 +8,7 @@ from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string
# from pytube import YouTube
import os
from datetime import datetime
from config.settings import get_base_file_url
# Utility function: download images
......@@ -17,13 +18,13 @@ from config.settings import get_base_file_url
def reptile(browser=None, search_word=""):
url = "https://twitter.com/"
base_url = "https://twitter.com/"
option = ['--headless']
# ['--headless']
browser = browser or create(None, False)
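# Headless mode is deliberately not used here (create(None, False)); running with a visible window is presumably part of the anti-crawler workaround named in the commit message.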
# print(browser)
# Open the page
browser.get(url)
browser.get(base_url)
time.sleep(3)
try:
# Check whether login is required
......@@ -43,29 +44,32 @@ def reptile(browser=None, search_word=""):
time.sleep(2)
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
browser.get(url)
time.sleep(3)
time.sleep(4)
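# Shared XPath prefix for one tweet cell; the content, author and time lookups below are all rooted at it.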
base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
# Content block
element_content_list = browser.find_elements('xpath',
"//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")
element_content_list = browser.find_elements('xpath',base_xpath)
# Author
element_authors_list = browser.find_elements('xpath',
"//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']/div[1]//a[@role='link']")
element_authors_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
# time.sleep(2)
# Release time
element_release_list = browser.find_elements('xpath',
"//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//div[2]//time[@datetime]")
# element_release_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time")
# time_a_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time/..")
# print(element_content_list)
length = len(element_authors_list)
for index in range(length):
# print(index)
content = element_content_list[index].get_attribute("outerHTML")
soup = BeautifulSoup(content,"html.parser")
# Find the <time> tag
time_soup = soup.find('time')
timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
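# fromisoformat() on Python < 3.11 rejects the trailing 'Z' in Twitter's ISO-8601 datetimes (e.g. '2023-07-21T08:30:00.000Z'), so it is swapped for '+00:00' before taking the Unix timestamp.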
link_soup = time_soup.parent
link_str = base_url+link_soup["href"]
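# The <time> tag's parent <a> holds the tweet's relative permalink; prefixing base_url turns it into an absolute URL.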
author = element_authors_list[index].text
try:
release_time = str(int(parse_twitter_time_string(element_release_list[index].text)))
except:
release_time = str(int(time.time()))
content = element_content_list[index].get_attribute("innerHTML")
# print(content)
# Content filtering
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')
# Title = author + date
title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
# ---------------- determine content type: start ----------
# Type
content_type = ""
......@@ -81,15 +85,16 @@ def reptile(browser=None, search_word=""):
# ---------------- determine content type: end ----------
# --------------- assemble data: start ---------------------
obj = {
"title": "",
"title": title,
"content": content,
"link": "",
"link": link_str,
"reptileTime": str(int(time.time())),
"type": content_type,
"author": author,
"releaseTime": release_time
"releaseTime": str(int(timestamp))
}
# --------------- assemble data: end ---------------------
data.append(obj)
# Send the scraped data to the Java service
# print('----------------------')
......
......@@ -22,7 +22,7 @@ def parse_time_string(time_str):
:param time_str:
:return:
"""
log.debug(f'Converting facebook release time: {time_str}')
# log.debug(f'Converting facebook release time: {time_str}')
if "天" in time_str:
number = int(time_str.split("天")[0])
time_delta = datetime.timedelta(days=number)
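# e.g. '3天' ('3 days') yields timedelta(days=3); the subtraction from the current time presumably happens in the collapsed lines below.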
......