Commit e6a8964a authored by liyang

feat: process Instagram crawler data

parent 1526bcd6
......@@ -55,7 +55,6 @@ def reptile(browser=None, search_word=""):
# Comments
element_comment_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[2]/div/span")
length = len(element_content_list)
for index in range(length):
# Extract the time and convert it to a timestamp
......@@ -175,10 +174,14 @@ def reptile(browser=None, search_word=""):
log.debug('save file success')
else:
log.debug('save file failed')
script_close(browser)
else:
# No data was crawled
log.info("No data crawled")
script_close(browser)
def script_close(browser):
# Close the browser driver
try:
browser.close()
......
......@@ -13,6 +13,7 @@ import os
from config.settings import get_base_file_url
from config.settings import get_account
import sys
# Utility function - download images
'''
Open the specified page, use Selenium to simulate clicking the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until the button is no longer clickable. Finally, fetch the complete paginated data and close the browser driver.
......@@ -160,19 +161,23 @@ def reptile(browser=None, search_word=""):
log.debug('save file success')
else:
log.debug('save file failed')
script_close(browser)
else:
# No data was crawled
log.info("No data crawled")
script_close(browser)
def script_close(browser):
# Close the browser driver
try:
browser.close()
browser.quit()
except Exception:
log.debug("Failed to close the browser driver")
sys.exit()
def main():
"""
......
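Each crawler in this commit gains the same duplicated script_close helper. A minimal sketch of an equivalent shared version, assuming log is the project's loguru logger and that callers want the process to terminate after cleanup:

import sys
from loguru import logger as log

def script_close(browser):
    try:
        # close() shuts the current window; quit() ends the whole driver session
        browser.close()
        browser.quit()
    except Exception:
        log.debug("Failed to close the browser driver")
    # SystemExit propagates out of the helper unless a caller catches it
    sys.exit()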
......@@ -40,7 +40,10 @@ def reptile(browser=None, search_word=""):
# print(browser)
# Open the page
browser.get(base_url)
time.sleep(3)
# Wait for the page to finish loading
time.sleep(2)
# wait = WebDriverWait(browser, 10)
# wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='username']")))
try:
# Check whether login is required
login_input = browser.find_element('xpath', "//input[@name='username']")
......@@ -50,35 +53,44 @@ def reptile(browser=None, search_word=""):
# Get the login button
button_login = browser.find_element('xpath', "//button[@type='submit']")
button_login.click()
time.sleep(3)
time.sleep(2)
except:
print("------")
# print("1111")
url = f"{base_url}explore/tags/{search_word}/"
browser.get(url)
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//article//a")))
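# The explicit wait replaces a fixed sleep: continue as soon as the first post links under <article> have rendered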
# Links
element_link_list = browser.find_elements('xpath',"//article//a")
element_link_list = browser.find_elements('xpath', "//article//a")
length = len(element_link_list)
for index in range(length):
element_link_list[index].click()
# element_link_list[index].click()
browser.execute_script("arguments[0].click();", element_link_list[index])
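# Clicking through JavaScript avoids ElementClickInterceptedException when an overlay covers the element (a common reason for this swap)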
# Wait for the dialog to finish loading
wait = WebDriverWait(browser,10)
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']/div/div[2]")))
# Extract other fields
author = browser.find_element("xpath","//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a")
author = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a")
content_element = browser.find_element("xpath","//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
time_element = browser.find_element("xpath","//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[2]//time")
content_element = browser.find_element("xpath",
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
time_element = browser.find_element("xpath",
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[2]//time")
link_str = browser.current_url
# Extract the time and convert it to a timestamp
timestamp = datetime.fromisoformat(time_element.get_attribute("datetime")[:-1]).timestamp()
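# Note: [:-1] drops the trailing "Z" because datetime.fromisoformat cannot parse it before Python 3.11; the resulting naive datetime is then interpreted as local time by .timestamp()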
# Extract images and videos
# Extract the dialog content
soup = BeautifulSoup(content_element.get_attribute("outerHTML"), "html.parser")
# Extract images and videos
picture_url = []
img_list = browser.find_elements("xpath","//div[@role='dialog']/div//article/div/div[1]/div/div[1]//img")
for key,item in enumerate(img_list):
img_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//img")
# Filter out videos
video_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//video")
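# Images are only collected when the dialog contains no <video> element; video posts are skipped in the loop below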
for key, item in enumerate(img_list):
if len(video_list) == 0:
if key == 0:
title = item.get_attribute("alt")
# Download the image locally and replace the src in the tag
......@@ -88,17 +100,15 @@ def reptile(browser=None, search_word=""):
# Access URL
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
# Download status
status = download_image(item['src'], download_dir)
status = download_image(item.get_attribute("src"), download_dir)
if status:
item['src'] = access_address
# Append the image to the content
img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser")
img_soup.img["src"] = access_address
# print(img_soup.prettify())
soup.append(img_soup)
picture_url.append(access_address)
# Extract the dialog content
soup = BeautifulSoup(content_element.get_attribute("outerHTML"), "html.parser")
# Merge the images into the content
for key, item in enumerate(img_list):
img = BeautifulSoup(item.get_attribute("outerHTML"),"html.parser")
soup.append(img)
content = soup.prettify()
# Content type ("图文" means image-and-text)
content_type = "图文"
......@@ -116,15 +126,12 @@ def reptile(browser=None, search_word=""):
# --------------- assemble data: end ---------------------
data.append(obj)
# Get the next-page button
next_buttons = browser.find_elements("xpath","//div[@role='dialog']/div/div[1]//button")
if index < length-1:
for key,item in enumerate(next_buttons):
if key+1 == len(next_buttons):
next_buttons = browser.find_elements("xpath", "//div[@role='dialog']/div/div[1]//button")
if index < length - 1:
for key, item in enumerate(next_buttons):
if key + 1 == len(next_buttons):
item.click()
# Send the crawled data to the Java service
# print('----------------------')
# print(data)
if len(data) > 0:
# Save the JSON file locally
# log.debug(os.path.abspath("../"))
......@@ -133,17 +140,29 @@ def reptile(browser=None, search_word=""):
log.debug('save file success')
else:
log.debug('save file failed')
script_close(browser)
else:
# No data was crawled
log.info("No data crawled")
script_close(browser)
def script_close(browser):
# Close the browser driver
try:
browser.close()
browser.quit()
except Exception:
log.debug("Failed to close the browser driver")
try:
# some code...
sys.exit()
except SystemExit:
raise # Re-raise SystemExit so the script actually exits
except Exception as e:
# exception handling code...
print("sys.exit() failed to execute")
def main():
"""
......
# Import dependencies
import json
import time
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Utility function - download images
from utils.download_image import download_image
'''
Open the specified page, use Selenium to simulate clicking the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until the button is no longer clickable. Finally, fetch the complete paginated data and close the browser driver.
'''
# JSON data
data = []
image_key = 0
fileDir = "./reptile_data/news/nytimes/"
year = datetime(2021, 1, 1)
startDate = datetime(2020, 12, 31)  # start date
endDate = datetime(2020, 12, 31)  # end date
# Create the browser driver object
browser = webdriver.Chrome()
for i in range(1):
endDate = startDate = startDate + timedelta(days=i)
# Open the page
browser.get(
f'https://www.nytimes.com/search?dropmab=false&endDate={endDate.strftime("%Y%m%d")}&query={year.strftime("%Y")}&sort=best&startDate={startDate.strftime("%Y%m%d")}&types=interactivegraphics%2Carticle')
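# Each pass searches a single day (startDate == endDate), using the year string as the query and restricting results to articles and interactive graphics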
try:
accept = WebDriverWait(browser, 10).until(
EC.presence_of_element_located((By.XPATH, "//button[@data-testid='GDPR-accept']")))
accept.click()
finally:
logger.debug("")
# Wait for the "load more" button to appear
button = WebDriverWait(browser, 10).until(
EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
# logger.debug(button)
# Click the button repeatedly to load more data
while button.is_enabled():
time.sleep(2) # wait a moment to make sure the page has finished loading
try:
button.click()
button = WebDriverWait(browser, 5).until(
EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
except:
break
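# A timeout here means the button is gone, i.e. all results have been loaded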
# Get the complete paginated data
page_content = browser.page_source
soup = BeautifulSoup(page_content, 'html.parser')
list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
for index, item in enumerate(list_news):
logger.debug(item)
# Grab the image
image_key = image_key + 1
url_element = item.find('img', {"class": "css-rq4mmj"})
image_url = url_element['src'] if url_element else ""
filename = ""  # default so the news dict below never hits a NameError when there is no image
# logger.debug(url)
if image_url:
# logger.debug(url)
# Download the image
#
filename = f"{image_key}.jpg"
# logger.debug(filename)
# sys.exit()
download_image(image_url, f'{fileDir}images/{filename}')
# Grab the text
title_element = item.find('h4', {"class": "css-2fgx4k"})
introduction_element = item.find('p', {"class": "css-16nhkrn"})
title = title_element.get_text() if title_element else ""
introduction = introduction_element.get_text() if introduction_element else ""
news = {
"title": title,
"introduction": introduction,
"imageName": filename
}
data.append(news)
# logger.debug(data)
# Save the data to a file
with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
json.dump(data, file, indent=2, ensure_ascii=False)
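# ensure_ascii=False keeps non-ASCII text readable in data.json instead of escaping it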
browser.close()
# Close the browser driver
browser.quit()
......@@ -200,10 +200,14 @@ def reptile(browser=None, search_word=""):
log.debug('save file success')
else:
log.debug('save file failed')
script_close(browser)
else:
# No data was crawled
log.info("No data crawled")
script_close(browser)
def script_close(browser):
# Close the browser driver
try:
browser.close()
......@@ -212,7 +216,6 @@ def reptile(browser=None, search_word=""):
log.debug("浏览器驱动关闭失败")
sys.exit()
def main():
"""
......
......@@ -13,6 +13,7 @@ from datetime import datetime
from utils.download_image import download_image
from config.settings import get_base_file_url
from config.settings import get_account
# Utility function - download images
'''
Open the specified page, use Selenium to simulate clicking the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until the button is no longer clickable. Finally, fetch the complete paginated data and close the browser driver.
......@@ -159,10 +160,14 @@ def reptile(browser=None, search_word=""):
log.debug('save file success')
else:
log.debug('save file failed')
script_close(browser)
else:
# No data was crawled
log.info("No data crawled")
script_close(browser)
def script_close(browser):
# Close the browser driver
try:
browser.close()
......@@ -171,6 +176,7 @@ def reptile(browser=None, search_word=""):
log.debug("浏览器驱动关闭失败")
sys.exit()
def main():
"""
......
......@@ -75,15 +75,18 @@ def reptile(browser=None, search_word=""):
log.debug('save file success')
else:
log.debug('save file failed')
script_close(browser)
else:
# No data was crawled
log.info("No data crawled")
script_close(browser)
def script_close(browser):
# Close the browser driver
try:
browser.close()
browser.quit()
except Exception:
log.debug("Failed to close the browser driver")
sys.exit()
......
......@@ -28,7 +28,7 @@ from utils.index import get_screen_resolution
'''
def create(option=None, using_user_data=True, web_browser="chromium"):
def create(option=None, using_user_data=True, web_browser="firefox"):
"""
:param web_browser:
......@@ -60,8 +60,10 @@ def create(option=None, using_user_data=True, web_browser="chromium"):
# Add the user data directory argument to enable a persistent browser session that keeps login state and cookies
if web_browser == "firefox":
# Replace this with your Firefox user data directory path
profile = FirefoxProfile(profile_directory=user_data_dir)
options.profile = profile
# profile = FirefoxProfile(profile_directory=user_data_dir)
# options.profile = profile
options.add_argument("-profile")
options.add_argument(user_data_dir)
# options.add_argument(f'--user-data-dir={user_data_dir}')
elif web_browser == "chrome":
options.add_argument(f'--user-data-dir={user_data_dir}')
......
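The browser_driver hunk above switches Firefox from the FirefoxProfile API to passing the profile directory via the -profile argument, which is what keeps logins and cookies across runs. A minimal sketch of that approach with Selenium 4, assuming user_data_dir points at an existing Firefox profile directory:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

def create_firefox(user_data_dir):
    options = Options()
    # Reuse an existing profile so session state (logins, cookies) persists
    options.add_argument("-profile")
    options.add_argument(user_data_dir)
    return webdriver.Firefox(options=options)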