Commit e6a8964a authored by liyang

feat: process Instagram (ins) crawler data

parent 1526bcd6
@@ -55,7 +55,6 @@ def reptile(browser=None, search_word=""):
         # comments
         element_comment_list = browser.find_elements('xpath', f"{base_xpath}/div[3]/div[2]/div/span")
         length = len(element_content_list)
         for index in range(length):
             # extract the time and convert it to a timestamp
@@ -175,10 +174,14 @@ def reptile(browser=None, search_word=""):
             log.debug('save file success')
         else:
             log.debug('save file failed')
+        script_close(browser)
     else:
         # no data was crawled
         log.info("no data crawled")
+        script_close(browser)
+
+def script_close(browser):
     # close the browser driver
     try:
         browser.close()
......
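This same script_close extraction is applied to every crawler script in the commit. A minimal standalone sketch of the helper, assuming log is the project's loguru-style logger, would be:

import sys

from loguru import logger as log

def script_close(browser):
    # close the window and end the WebDriver session; either call can
    # raise if the session is already gone, so failures are only logged
    try:
        browser.close()
        browser.quit()
    except Exception:
        log.debug("failed to close the browser driver")
    # sys.exit() raises SystemExit, so the script stops here
    sys.exit()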
@@ -13,6 +13,7 @@ import os
 from config.settings import get_base_file_url
 from config.settings import get_account
 import sys
 # utility function - download images
 '''
 Open the given page, use Selenium to click the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until it is no longer clickable. Finally, grab the full paginated data and close the browser driver.
 '''
@@ -160,19 +161,23 @@ def reptile(browser=None, search_word=""):
             log.debug('save file success')
         else:
             log.debug('save file failed')
+        script_close(browser)
     else:
         # no data was crawled
         log.info("no data crawled")
+        script_close(browser)
+
+def script_close(browser):
     # close the browser driver
     try:
         browser.close()
         browser.quit()
     except:
         log.debug("failed to close the browser driver")
     sys.exit()

 def main():
     """
......
@@ -40,7 +40,10 @@ def reptile(browser=None, search_word=""):
     # print(browser)
     # open the page
     browser.get(base_url)
-    time.sleep(3)
+    # wait for the page to finish loading
+    time.sleep(2)
+    # wait = WebDriverWait(browser, 10)
+    # wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='username']")))
     try:
         # check whether login is required
         login_input = browser.find_element('xpath', "//input[@name='username']")
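The commented-out WebDriverWait lines point at the eventual replacement for the fixed sleep. A sketch of that explicit-wait pattern, reusing the username locator from the diff:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# block until the login field is present (at most 10 seconds) instead of
# sleeping for a fixed interval; raises TimeoutException if it never appears
wait = WebDriverWait(browser, 10)
login_input = wait.until(
    EC.presence_of_element_located((By.XPATH, "//input[@name='username']")))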
@@ -50,35 +53,44 @@ def reptile(browser=None, search_word=""):
         # get the login button
         button_login = browser.find_element('xpath', "//button[@type='submit']")
         button_login.click()
-        time.sleep(3)
+        time.sleep(2)
     except:
         print("------")
         # print("1111")
     url = f"{base_url}explore/tags/{search_word}/"
     browser.get(url)
+    wait = WebDriverWait(browser, 10)
+    wait.until(EC.presence_of_element_located((By.XPATH, "//article//a")))
     # links
-    element_link_list = browser.find_elements('xpath',"//article//a")
+    element_link_list = browser.find_elements('xpath', "//article//a")
     length = len(element_link_list)
     for index in range(length):
-        element_link_list[index].click()
+        # element_link_list[index].click()
+        browser.execute_script("arguments[0].click();", element_link_list[index])
         # wait for the dialog to finish loading
-        wait = WebDriverWait(browser,10)
+        wait = WebDriverWait(browser, 10)
         wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']/div/div[2]")))
         # extract the other fields
-        author = browser.find_element("xpath","//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a")
-        content_element = browser.find_element("xpath","//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
-        time_element = browser.find_element("xpath","//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[2]//time")
+        author = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a")
+        content_element = browser.find_element("xpath",
+                                               "//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
+        time_element = browser.find_element("xpath",
+                                            "//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[2]//time")
         link_str = browser.current_url
         # extract the time and convert it to a timestamp
         timestamp = datetime.fromisoformat(time_element.get_attribute("datetime")[:-1]).timestamp()
-        # extract images and videos
+        # extract the dialog content
+        soup = BeautifulSoup(content_element.get_attribute("outerHTML"), "html.parser")
+        # extract images and videos
         picture_url = []
-        img_list = browser.find_elements("xpath","//div[@role='dialog']/div//article/div/div[1]/div/div[1]//img")
-        for key,item in enumerate(img_list):
+        img_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//img")
+        # filter out videos
+        video_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//video")
+        for key, item in enumerate(img_list):
+            if len(video_list) == 0:
                 if key == 0:
                     title = item.get_attribute("alt")
                 # download the image locally and replace the src in the tag
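The switch from element.click() to browser.execute_script("arguments[0].click();", ...) sidesteps Selenium's visibility and overlay checks, which matters here because Instagram re-renders the post grid and a native click can raise ElementClickInterceptedException. A sketch of the fallback idea (safe_click is a hypothetical helper, not part of the diff):

from selenium.common.exceptions import ElementClickInterceptedException

def safe_click(browser, element):
    # prefer the native click; fall back to a JavaScript click when an
    # overlay or a re-render intercepts it
    try:
        element.click()
    except ElementClickInterceptedException:
        browser.execute_script("arguments[0].click();", element)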
@@ -88,17 +100,15 @@ def reptile(browser=None, search_word=""):
                     # access address
                     access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
                     # download status
-                    status = download_image(item['src'], download_dir)
+                    status = download_image(item.get_attribute("src"), download_dir)
                     if status:
-                        item['src'] = access_address
+                        # append the image to the content
+                        img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser")
+                        img_soup.img["src"] = access_address
+                        # print(img_soup.prettify())
+                        soup.append(img_soup)
                         picture_url.append(access_address)
-        # extract the dialog content
-        soup = BeautifulSoup(content_element.get_attribute("outerHTML"), "html.parser")
-        # merge the images into the content
-        for key, item in enumerate(img_list):
-            img = BeautifulSoup(item.get_attribute("outerHTML"),"html.parser")
-            soup.append(img)
         content = soup.prettify()
         # type
         content_type = "图文"
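The reworked block also stops mutating the WebElement: a Selenium WebElement is not subscriptable, so the old item['src'] access never worked. The new code parses the tag's outerHTML with BeautifulSoup, rewrites src to the locally served copy, then appends it to the caption soup. A self-contained sketch of that rewrite, with hypothetical URLs:

from bs4 import BeautifulSoup

caption_html = "<h1>example caption</h1>"  # hypothetical caption markup
img_html = '<img src="https://cdn.example.com/a.jpg" alt="post">'

soup = BeautifulSoup(caption_html, "html.parser")
img_soup = BeautifulSoup(img_html, "html.parser")
# point the tag at the downloaded copy, then merge it into the content
img_soup.img["src"] = "https://files.example.com/instagram/1.jpg"
soup.append(img_soup)
content = soup.prettify()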
@@ -116,15 +126,12 @@ def reptile(browser=None, search_word=""):
         # --------------- assemble data: end ---------------------
         data.append(obj)
         # get the next-page button
-        next_buttons = browser.find_elements("xpath","//div[@role='dialog']/div/div[1]//button")
-        if index < length-1:
-            for key,item in enumerate(next_buttons):
-                if key+1 == len(next_buttons):
+        next_buttons = browser.find_elements("xpath", "//div[@role='dialog']/div/div[1]//button")
+        if index < length - 1:
+            for key, item in enumerate(next_buttons):
+                if key + 1 == len(next_buttons):
                     item.click()
-    # send the crawled data to the Java service
-    # print('----------------------')
-    # print(data)
     if len(data) > 0:
         # save the json file locally
         # log.debug(os.path.abspath("../"))
@@ -133,17 +140,29 @@ def reptile(browser=None, search_word=""):
             log.debug('save file success')
         else:
             log.debug('save file failed')
+        script_close(browser)
     else:
         # no data was crawled
         log.info("no data crawled")
+        script_close(browser)
+
+
+def script_close(browser):
     # close the browser driver
     try:
         browser.close()
         browser.quit()
     except:
         log.debug("failed to close the browser driver")
+    try:
+        # some code ...
         sys.exit()
+    except SystemExit:
+        raise  # re-raise SystemExit so the script exits
+    except Exception as e:
+        # exception handling ...
+        print("sys.exit() failed")

 def main():
     """
......
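The new guard around sys.exit() is worth a note: sys.exit() works by raising SystemExit, which subclasses BaseException rather than Exception, so the except Exception branch can never actually swallow the exit; the explicit except SystemExit: raise mainly documents the intent. In miniature:

import sys

try:
    sys.exit()      # raises SystemExit
except SystemExit:
    raise           # re-raise so the interpreter terminates
except Exception:
    # unreachable for SystemExit: it is a BaseException, not an Exception
    print("sys.exit() failed")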
# import dependencies
import json
import time
from datetime import datetime, timedelta

from bs4 import BeautifulSoup
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# utility function - download images
from utils.download_image import download_image

'''
Open the given page, use Selenium to click the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until it is no longer clickable. Finally, grab the full paginated data and close the browser driver.
'''
# json data
data = []
image_key = 0
fileDir = "./reptile_data/news/nytimes/"
year = datetime(2021, 1, 1)
startDate = datetime(2020, 12, 31)  # start date
endDate = datetime(2020, 12, 31)  # end date
# create the browser driver object
browser = webdriver.Chrome()
for i in range(1):
    endDate = startDate = startDate + timedelta(days=i)
    # open the page
    browser.get(
        f'https://www.nytimes.com/search?dropmab=false&endDate={endDate.strftime("%Y%m%d")}&query={year.strftime("%Y")}&sort=best&startDate={startDate.strftime("%Y%m%d")}&types=interactivegraphics%2Carticle')
    try:
        accept = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button[@data-testid='GDPR-accept']")))
        accept.click()
    except Exception:
        # no GDPR banner on this page; continue without clicking
        logger.debug("GDPR-accept button not found")
    # wait for the load-more button to appear
    button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
    # logger.debug(button)
    # click the button repeatedly to load more data
    while button.is_enabled():
        time.sleep(2)  # wait a moment to make sure the page has loaded
        try:
            button.click()
            button = WebDriverWait(browser, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
        except:
            break
    # grab the full paginated data
    page_content = browser.page_source
    soup = BeautifulSoup(page_content, 'html.parser')
    list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
    for index, item in enumerate(list_news):
        logger.debug(item)
        # grab the image
        image_key = image_key + 1
        url_element = item.find('img', {"class": "css-rq4mmj"})
        image_url = url_element['src'] if url_element else ""
        filename = ""  # default so the record is valid even without an image
        if image_url:
            # download the image
            filename = f"{image_key}.jpg"
            download_image(image_url, f'{fileDir}images/{filename}')
        # grab the text
        title_element = item.find('h4', {"class": "css-2fgx4k"})
        introduction_element = item.find('p', {"class": "css-16nhkrn"})
        title = title_element.get_text() if title_element else ""
        introduction = introduction_element.get_text() if introduction_element else ""
        news = {
            "title": title,
            "introduction": introduction,
            "imageName": filename
        }
        data.append(news)
# logger.debug(data)
# save the data to a file
with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
    json.dump(data, file, indent=2, ensure_ascii=False)
# close the browser driver
browser.close()
browser.quit()
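download_image itself is not shown in this commit. Given the (url, file_path) call shape used above and the truthy return value the Instagram script checks, a plausible minimal implementation (an assumption, not the project's actual helper) could look like:

import requests

def download_image(url, file_path):
    # fetch the image and write it to disk; return True on success so
    # callers can rewrite the <img> src afterwards
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(response.content)
        return True
    except Exception:
        return False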
@@ -200,10 +200,14 @@ def reptile(browser=None, search_word=""):
             log.debug('save file success')
         else:
             log.debug('save file failed')
+        script_close(browser)
     else:
         # no data was crawled
         log.info("no data crawled")
+        script_close(browser)
+
+def script_close(browser):
     # close the browser driver
     try:
         browser.close()
@@ -212,7 +216,6 @@ def reptile(browser=None, search_word=""):
         log.debug("failed to close the browser driver")
     sys.exit()

 def main():
     """
......
@@ -13,6 +13,7 @@ from datetime import datetime
 from utils.download_image import download_image
 from config.settings import get_base_file_url
 from config.settings import get_account
 # utility function - download images
 '''
 Open the given page, use Selenium to click the "GDPR-accept" button, then click the "search-show-more-button" button in a loop to load more data until it is no longer clickable. Finally, grab the full paginated data and close the browser driver.
 '''
@@ -159,10 +160,14 @@ def reptile(browser=None, search_word=""):
             log.debug('save file success')
         else:
             log.debug('save file failed')
+        script_close(browser)
     else:
         # no data was crawled
         log.info("no data crawled")
+        script_close(browser)
+
+def script_close(browser):
     # close the browser driver
     try:
         browser.close()
@@ -171,6 +176,7 @@ def reptile(browser=None, search_word=""):
         log.debug("failed to close the browser driver")
     sys.exit()

 def main():
     """
......
@@ -75,15 +75,18 @@ def reptile(browser=None, search_word=""):
             log.debug('save file success')
         else:
             log.debug('save file failed')
+        script_close(browser)
     else:
         # no data was crawled
         log.info("no data crawled")
+        script_close(browser)
+
+def script_close(browser):
     # close the browser driver
     try:
         browser.close()
         browser.quit()
     except:
         log.debug("failed to close the browser driver")
     sys.exit()
......
This diff is collapsed.
@@ -28,7 +28,7 @@ from utils.index import get_screen_resolution
 '''
-def create(option=None, using_user_data=True, web_browser="chromium"):
+def create(option=None, using_user_data=True, web_browser="firefox"):
     """
     :param web_browser:
@@ -60,8 +60,10 @@ def create(option=None, using_user_data=True, web_browser="chromium"):
     # add the user-data-dir argument to enable a persistent browser session that keeps login state and cookies
     if web_browser == "firefox":
         # replace this with your Firefox user data directory path
-        profile = FirefoxProfile(profile_directory=user_data_dir)
-        options.profile = profile
+        # profile = FirefoxProfile(profile_directory=user_data_dir)
+        # options.profile = profile
+        options.add_argument("-profile")
+        options.add_argument(user_data_dir)
         # options.add_argument(f'--user-data-dir={user_data_dir}')
     elif web_browser == "chrome":
         options.add_argument(f'--user-data-dir={user_data_dir}')
......
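FirefoxProfile is deprecated in Selenium 4, and passing the profile directory to the browser binary via the -profile command-line argument is the supported replacement. A minimal sketch, assuming user_data_dir points at an existing Firefox profile directory:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

user_data_dir = "/path/to/firefox/profile"  # assumption: an existing profile

options = Options()
# hand the profile directory to the firefox binary itself instead of
# copying it through a FirefoxProfile object
options.add_argument("-profile")
options.add_argument(user_data_dir)

browser = webdriver.Firefox(options=options)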