# pc_bbc.py
# Import dependencies
import json

import requests
from bs4 import BeautifulSoup

from utils.download_image import download_image
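# Note: download_image is a project-local helper that is not shown here; it is
# assumed to take (image_url, destination_path) and write the fetched bytes to
# that path (e.g. via requests.get(image_url).content).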

# Collected records, serialized to JSON at the end
data = []

# Base search URL to scrape; the page number is appended per request
url = "https://www.bbc.co.uk/search?q=2022&d=news_ps&page="
fileDir = "./news/bbc/"
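# The query searches the BBC News index (d=news_ps) for "2022"; the 1-based
# page number is appended to `page=` inside the loop below. Downloaded images
# and the JSON output both land under ./news/bbc/.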

image_key = 280
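# The counter above starts at 280, which is assumed to continue the file
# numbering from an earlier run (1.jpg .. 280.jpg); the first image saved by
# this script is therefore 281.jpg.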
# Walk through 28 pages of search results (1-based page index)
for a in range(28):
    print(f'Current progress: page {a + 1}')
    _index = a + 1
    response = requests.get(url + str(_index))
    soup = BeautifulSoup(response.content, 'html.parser')
    list_news = soup.find_all('div', {"class": "ssrcss-53phst-Promo ett16tt0"})
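    # The ssrcss-* class names are generated by the BBC's styling pipeline and
    # tend to change when the site is rebuilt; if list_news comes back empty,
    # these selectors are the first thing to re-check.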
    for index, item in enumerate(list_news):
        # Grab the image URL from the promo's <img> tag
        image_key = image_key + 1
        url_element = item.find('img', {"class": "ssrcss-evoj7m-Image ee0ct7c0"})
        image_url = url_element['src'] if url_element else ""
        if image_url:
            # Download the image into ./news/bbc/images/
            filename = f"{image_key}.jpg"
            download_image(image_url, f'{fileDir}images/{filename}')
            # Grab the headline and introduction text, guarding against
            # promos that lack the expected text block
            text_stack = item.find('div', {"class": "ssrcss-1f3bvyz-Stack e1y4nx260"})
            title_element = None
            introduction_element = None
            if text_stack:
                headline_p = text_stack.find('p', {'class': 'ssrcss-6arcww-PromoHeadline e1f5wbog5'})
                title_element = headline_p.find('span') if headline_p else None
                introduction_element = text_stack.find('p', {'class': 'ssrcss-1q0x1qg-Paragraph eq5iqo00'})
            title = title_element.get_text() if title_element else ""
            introduction = introduction_element.get_text() if introduction_element else ""
            news = {
                "title": title,
                "introduction": introduction,
                "imageName": filename
            }
            data.append(news)
# Save the collected records to a JSON file
with open(f'{fileDir}data1.json', "w", encoding="utf-8") as file:
    json.dump(data, file, indent=2, ensure_ascii=False)
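
# For reference, data1.json ends up shaped like:
# [
#   {
#     "title": "...",
#     "introduction": "...",
#     "imageName": "281.jpg"
#   },
#   ...
# ]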