# Import dependencies
import json
import time
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
# Utility function: download an image
from utils.download_image import download_image
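# download_image is assumed to take (url, dest_path) and write the fetched
# bytes to dest_path. A minimal sketch of such a helper (an assumption, not
# the project's actual implementation):
#   import requests
#   def download_image(url, dest_path):
#       resp = requests.get(url, timeout=10)
#       resp.raise_for_status()
#       with open(dest_path, "wb") as f:
#           f.write(resp.content)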
'''
Open the target search page, use Selenium to click the "GDPR-accept" button,
then repeatedly click the "search-show-more-button" button to load more
results until it is no longer clickable. Finally, parse the fully paginated
data and close the browser driver.
'''
# Scraped JSON data
data = []
image_key = 0
fileDir = "./reptile_data/news/nytimes/"
year = datetime(2021, 1, 1)
startDate = datetime(2020, 12, 31)  # start date
endDate = datetime(2020, 12, 31)  # end date
# Create the browser driver
browser = webdriver.Chrome()
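# Optional: run Chrome headless (a sketch, assuming a recent Selenium/Chrome;
# adjust the flag to your versions):
#   options = webdriver.ChromeOptions()
#   options.add_argument("--headless=new")
#   browser = webdriver.Chrome(options=options)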
for i in range(1):
    endDate = startDate = startDate + timedelta(days=i)
    # Open the search page for this date range
    browser.get(
        f'https://www.nytimes.com/search?dropmab=false&endDate={endDate.strftime("%Y%m%d")}&query={year.strftime("%Y")}&sort=best&startDate={startDate.strftime("%Y%m%d")}&types=interactivegraphics%2Carticle')
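    # URL parameters, as built above: startDate/endDate bound the search to a
    # single day in YYYYMMDD form, query is the year string, sort=best ranks
    # by relevance, and types limits results to articles and interactive
    # graphics.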
    try:
        # Dismiss the GDPR consent banner if it appears
        accept = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button[@data-testid='GDPR-accept']")))
        accept.click()
    except TimeoutException:
        logger.debug("No GDPR banner appeared; continuing")
    # Wait for the "show more" button to appear
    button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
    # Click the button repeatedly to load each additional page of results
    while button.is_enabled():
        time.sleep(2)  # give the new results time to render
        try:
            button.click()
            button = WebDriverWait(browser, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
        except Exception:
            # The button is gone or no longer clickable: all results loaded
            break
    # Parse the fully loaded result list
    page_content = browser.page_source
    soup = BeautifulSoup(page_content, 'html.parser')
    list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
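    # NOTE: the css-* class names used here are NYT's build-generated styles;
    # they change when the site is redeployed, so these selectors are brittle.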
    for item in list_news:
        logger.debug(item)
        # Grab the article image, if any
        image_key = image_key + 1
        url_element = item.find('img', {"class": "css-rq4mmj"})
        image_url = url_element['src'] if url_element else ""
        filename = ""
        if image_url:
            # Download the image to the local images directory
            filename = f"{image_key}.jpg"
            download_image(image_url, f'{fileDir}images/{filename}')
        # Grab the headline and summary text
        title_element = item.find('h4', {"class": "css-2fgx4k"})
        introduction_element = item.find('p', {"class": "css-16nhkrn"})
        title = title_element.get_text() if title_element else ""
        introduction = introduction_element.get_text() if introduction_element else ""
        news = {
            "title": title,
            "introduction": introduction,
            "imageName": filename
        }
        data.append(news)
# Save the scraped data to a JSON file
with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
    json.dump(data, file, indent=2, ensure_ascii=False)
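# Illustrative shape of data.json (keys mirror the dict built above):
#   [{"title": "...", "introduction": "...", "imageName": "1.jpg"}, ...]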
# Close the browser driver (quit() closes all windows and ends the session,
# so a separate close() call is unnecessary)
browser.quit()