NASA每日一图的python爬取介绍

如果你安装了python的语言环境，并且希望通过自己的程序来获取每天NASA新发布的每日一图的话，我这里提供了我最近编写的一个python文件。可以供大家使用。
使用前需要先安装两个第三方库：requests库和bs4库（对应的安装包名为beautifulsoup4）。代码中用到的os、urllib（包括urllib.parse和urllib.request）以及datetime都属于Python标准库，无需另外安装。如果没有安装上述第三方库，运行下述文件时会报错。
那么如何安装上述库呢？

pip install ~(这里放库的名称)
# 如
pip install requests
pip install beautifulsoup4

安装好上述库后，运行包含下述代码的py文件，就可以获得想要的图片了。

例如：

执行结果如下：

全部代码如下：

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlretrieve
import datetime

def download_all_images(url, save_dir):
    """Download every image referenced by an ``<img>`` tag on a web page.

    The page at *url* is fetched and parsed with BeautifulSoup, and each
    image is saved into *save_dir* under the last path component of its URL
    (query string and fragment removed). Images that cannot be downloaded
    are reported and skipped so that one bad link does not stop the run.

    Args:
        url: Address of the page to scan. Relative image links are resolved
            against it.
        save_dir: Directory the images are written to; created if missing.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(save_dir, exist_ok=True)
    # A timeout keeps an unresponsive server from blocking the program forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise if the request failed
    soup = BeautifulSoup(response.text, 'html.parser')
    image_links = soup.find_all('img')
    print(image_links)
    for img in image_links:
        img_src = img.get('src')
        if not img_src:
            # An <img> without a src attribute has nothing to download.
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page URL, so no explicit scheme check is needed.
        img_url = urljoin(url, img_src)
        # Build the file name from the URL path only: a query string or
        # fragment would end up in the name, and '?' is not valid in Windows
        # file names.
        path_part = img_url.split('?', 1)[0].split('#', 1)[0]
        file_name = path_part.rstrip('/').rsplit('/', 1)[-1]
        if not file_name:
            print(f"Skipped (no file name in URL): {img_url}")
            continue
        img_name = os.path.join(save_dir, file_name)
        try:
            urlretrieve(img_url, img_name)
        except (OSError, ValueError) as err:
            # Network and file-system failures are OSError subclasses;
            # ValueError covers URLs that urllib refuses to open.
            print(f"Skipped {img_url}: {err}")
            continue
        print(f"Downloaded: {img_name}")

def download_newest_images(save_dir):
    """Download the current NASA "Image of the Day" and print its details.

    Fetches https://www.nasa.gov/, locates the image-of-the-day module with
    CSS selectors, saves the image into *save_dir* as
    ``<today's date><extension>``, then prints the image's title and its
    descriptive text.

    Args:
        save_dir: Directory the image is written to; created if missing.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        IndexError: If an expected element is not found on the page.
        ValueError: If the image tag has no ``src`` attribute.
    """
    os.makedirs(save_dir, exist_ok=True)
    page_url = 'https://www.nasa.gov/'
    cookies = {
        '_ga_T0PVFC1H36': 'GS1.1.1677914868.1.1.1677915891.0.0.0',
        '_parsely_visitor': '{%22id%22:%22pid=13da8acd-ccec-4dd7-9905-a4d79580f4b4%22%2C%22session_count%22:1%2C%22last_session_ts%22:1699592782857}',
        '_ga_QKPHYHJWM4': 'GS1.2.1705546764.2.0.1705546764.0.0.0',
        '_ga_3MLXXCVWWY': 'GS1.1.1708602030.2.1.1708602522.0.0.0',
        '_gid': 'GA1.2.368413650.1711521957',
        '_parsely_session': '{%22sid%22:2%2C%22surl%22:%22https://www.nasa.gov/image-of-the-day/%22%2C%22sref%22:%22https://www.nasa.gov/%22%2C%22sts%22:1711528245023%2C%22slts%22:1699592782857}',
        '_ga_CSLL4ZEK4L': 'GS1.1.1711528863.22.1.1711528875.0.0.0',
        '_ga': 'GA1.1.1165779847.1670123892',
    }
    headers = {
        'authority': 'www.nasa.gov',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,es-ES;q=0.5,es;q=0.4',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Microsoft Edge";v="122"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    }
    # A timeout keeps an unresponsive server from blocking the program forever.
    response = requests.get(page_url, cookies=cookies, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The three selectors share one prefix. Raw strings are required wherever
    # a class name contains an escaped colon: "\:" is not a valid Python
    # escape sequence and triggers a warning in an ordinary string literal.
    module_sel = (
        "#post-128943 > div > "
        "div.hds-image-of-the-day.color-mode-light.hds-color-mode-light"
        ".hds-module.hds-module-full.wp-block-nasa-blocks-image-of-the-day"
        " > div > div"
    )
    image_sel = module_sel + (
        r" > div.grid-col-12.desktop\:grid-col-8.desktop\:padding-left-5"
        " > div.hds-image-download-wrapper.width-full.display-block.cursor-pointer"
        " > div > figure > img"
    )
    text_column_sel = module_sel + (
        r" > div.grid-col-12.desktop\:grid-col-4.desktop\:padding-right-5"
        r".margin-bottom-6.desktop\:margin-bottom-0"
    )
    title_sel = text_column_sel + " > p.heading-22.margin-bottom-2"
    body_sel = text_column_sel + " > p.p-md"

    # Part 1: the image itself.
    img = _select_first(soup, image_sel, 'image')
    img_src = img.get('src')
    if not img_src:
        raise ValueError('image-of-the-day <img> tag has no src attribute')
    # Resolve a relative src against the page, then drop any query string.
    img_url = urljoin(page_url, img_src).split('?', 1)[0]
    # Take the extension from the file name only, so a URL whose last path
    # component has no dot cannot leak host or path text into the name.
    extension = os.path.splitext(img_url.rsplit('/', 1)[-1])[1]
    img_name = os.path.join(save_dir, str(datetime.date.today()) + extension)
    print(img_url)
    urlretrieve(img_url, img_name)
    print(f"Downloaded: {img_name}")

    # Part 2: the title.
    img_title = _select_first(soup, title_sel, 'title')
    print(f'这张图的标题是：{img_title.text}')

    # Part 3: the descriptive text.
    img_text = _select_first(soup, body_sel, 'description')
    print(f'这张图的具体内容是：{img_text.text}')
    print('执行成功')


def _select_first(soup, selector, what):
    """Return the first element matching *selector*, or raise IndexError.

    IndexError is kept as the failure type because that is what indexing an
    empty ``soup.select()`` result raised before; the message now names the
    element that could not be found.
    """
    matches = soup.select(selector)
    if not matches:
        raise IndexError(
            f'image-of-the-day {what} not found; the page layout may have changed'
        )
    return matches[0]

def test():
    """Placeholder menu action: print the success message and return."""
    message = '执行成功'
    print(message)

url = 'https://www.nasa.gov/image-of-the-day/'  # Replace with the URL of the page you want to scrape
save_dir = 'NASA_daily_pictures'    # Directory where downloaded images are saved

def menu():
    """Print the task-selection menu to standard output.

    The menu lists four choices: ``0`` exit, ``1`` download all images,
    ``2`` download the newest image, and ``01`` test.
    """
    # The banner title names this tool; it previously carried the title of an
    # unrelated UVB/UVC data-analysis program.
    print('【任务选择】\n'
          '+—————————————————NASA 每日一图爬取工具—————————————————+\n'
          '|0、退出。|\n'
          '|1、NASA每日一图全部图片爬取                            |\n'
          '|2、NASA每日一图最新图片爬取                            |\n'
          '|01、测试。|\n'
          '+———————————————————————————————————————————————————————+')

# Task-selection module
def task():
    """Run the interactive menu loop until the user chooses to exit.

    Each round prints the menu, reads a choice, and runs the matching
    action. Entering ``0`` prints a farewell message and ends the loop; any
    other input is followed by a pause that waits for Enter before the menu
    is shown again.
    """
    # Lambdas defer the name lookups until an option is actually chosen,
    # exactly as the branches of an if/elif chain would.
    actions = {
        '1': lambda: download_all_images(url, save_dir),
        '2': lambda: download_newest_images(save_dir),
        '01': lambda: test(),
    }
    while True:
        menu()  # Show the main menu
        choice = input('请输入任务选项：')
        if choice == '0':
            print('程序结束!')
            return
        handler = actions.get(choice)
        if handler is None:
            print('输入选项有误')
        else:
            handler()
        input("回车显示主菜单")

# Script entry point
if __name__ == '__main__':
    task()  # Start the interactive task-selection loop

天地日志·科普文章·NASA每日一图的python爬取介绍

目录

NASA每日一图的python爬取介绍