NASA每日一图的python爬取介绍
如果你安装了python的语言环境,并且希望通过自己的程序来获取每天NASA新发布的每日一图的话,我这里提供了我最近编写的一个python文件。可以供大家使用。
使用前需要先安装两个第三方库:requests库和bs4库(bs4对应的安装包名为beautifulsoup4)。代码中还用到了urllib.parse库以及datetime库,它们属于python标准库,无需另外安装。如果缺少上述第三方库,运行下述文件时会报错。
那么如何安装上述库呢?
pip install ~(这里放库的名称)
# 如
pip install requests
安装好该库后,运行拥有下述代码的py文件,就可以获得想要的图片了。
例如:
执行结果如下:
全部代码如下:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlretrieve
import datetime
def download_all_images(url, save_dir):
    """Download every ``<img>`` referenced by the page at *url* into *save_dir*.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Each image is saved under the last path segment of its URL, with any
    query string or fragment removed. Downloading is best-effort: an image
    that cannot be fetched or written is reported and skipped.

    Args:
        url: Address of the page to scan; also the base for relative ``src``
            values.
        save_dir: Directory the images are written to; created if missing.

    Raises:
        requests.HTTPError: If the page request returns an error status
            (raised by ``raise_for_status``).
    """
    # exist_ok avoids the separate exists() check and its race.
    os.makedirs(save_dir, exist_ok=True)

    # A timeout keeps a stalled server from hanging the program forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    image_links = soup.find_all('img')
    print(f"Found {len(image_links)} <img> tags")

    for img in image_links:
        img_url = img.get('src')
        if not img_url:
            # <img> without a usable src (e.g. lazy-loaded): nothing to fetch.
            continue

        # urljoin leaves absolute URLs untouched and resolves relative ones.
        img_url = urljoin(url, img_url)

        # Drop query string / fragment so they do not end up in the file name.
        file_name = img_url.split('?')[0].split('#')[0].split('/')[-1]
        if not file_name:
            continue
        img_name = os.path.join(save_dir, file_name)

        try:
            urlretrieve(img_url, img_name)
        except OSError as exc:
            # Network and file-system errors are both OSError subclasses;
            # report and move on instead of failing silently.
            print(f"Failed: {img_url} ({exc})")
            continue
        print(f"Downloaded: {img_name}")
def download_newest_images(save_dir):
    """Download NASA's current "Image of the Day" and print its title and text.

    The NASA home page is fetched with browser-like headers and cookies, the
    image-of-the-day module is located with CSS selectors, and the image is
    saved in *save_dir* as ``<today's date>.<original extension>``.

    NOTE(review): the selectors are pinned to the element id ``post-128943``
    and to the page's exact class names, so they will stop matching when the
    page layout changes -- confirm against the live page.

    Args:
        save_dir: Directory the image is written to; created if missing.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        IndexError: If the image element (or its ``src``) is not found.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Cookies captured from a browser session (each key listed once).
    cookies = {
        '_ga_T0PVFC1H36': 'GS1.1.1677914868.1.1.1677915891.0.0.0',
        '_parsely_visitor': '{%22id%22:%22pid=13da8acd-ccec-4dd7-9905-a4d79580f4b4%22%2C%22session_count%22:1%2C%22last_session_ts%22:1699592782857}',
        '_ga_QKPHYHJWM4': 'GS1.2.1705546764.2.0.1705546764.0.0.0',
        '_ga_3MLXXCVWWY': 'GS1.1.1708602030.2.1.1708602522.0.0.0',
        '_gid': 'GA1.2.368413650.1711521957',
        '_parsely_session': '{%22sid%22:2%2C%22surl%22:%22https://www.nasa.gov/image-of-the-day/%22%2C%22sref%22:%22https://www.nasa.gov/%22%2C%22sts%22:1711528245023%2C%22slts%22:1699592782857}',
        '_ga_CSLL4ZEK4L': 'GS1.1.1711528863.22.1.1711528875.0.0.0',
        '_ga': 'GA1.1.1165779847.1670123892',
    }
    # Browser-like request headers; cookies are passed separately via cookies=.
    headers = {
        'authority': 'www.nasa.gov',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,es-ES;q=0.5,es;q=0.4',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Microsoft Edge";v="122"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
    }

    # The timeout keeps a stalled connection from hanging the program.
    response = requests.get(
        'https://www.nasa.gov/', cookies=cookies, headers=headers, timeout=30
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Raw strings: "\:" is a CSS escape for the colon in the class names,
    # not a Python escape sequence. The three selectors share one prefix.
    module = (
        r"#post-128943 > div > div.hds-image-of-the-day.color-mode-light"
        r".hds-color-mode-light.hds-module.hds-module-full"
        r".wp-block-nasa-blocks-image-of-the-day > div > div"
    )
    text_column = (
        module
        + r" > div.grid-col-12.desktop\:grid-col-4.desktop\:padding-right-5"
        r".margin-bottom-6.desktop\:margin-bottom-0"
    )
    content_img = (
        module
        + r" > div.grid-col-12.desktop\:grid-col-8.desktop\:padding-left-5"
        r" > div.hds-image-download-wrapper.width-full.display-block"
        r".cursor-pointer > div > figure > img"
    )
    content_img_title = text_column + r" > p.heading-22.margin-bottom-2"
    content_img_text = text_column + r" > p.p-md"

    # Part 1: the image itself.
    matches = soup.select(content_img)
    img_url = matches[0].get('src') if matches else None
    if not img_url:
        # Same exception type an unguarded [0] would raise, with a usable message.
        raise IndexError('未找到每日一图的图片元素,页面结构可能已发生变化')
    # Drop any query string so it does not leak into the file extension.
    img_url = img_url.split('?')[0]
    img_name = os.path.join(
        save_dir, str(datetime.date.today()) + '.' + img_url.split('.')[-1]
    )
    print(img_url)
    urlretrieve(img_url, img_name)
    print(f"Downloaded: {img_name}")

    # Part 2: the title. Missing text is reported, not fatal: the image is
    # already saved at this point.
    img_title = soup.select(content_img_title)
    if img_title:
        print(f'这张图的标题是:{img_title[0].text}')
    else:
        print('未找到这张图的标题')

    # Part 3: the description.
    img_text = soup.select(content_img_text)
    if img_text:
        print(f'这张图的具体内容是:{img_text[0].text}')
    else:
        print('未找到这张图的具体内容')

    print('执行成功')
def test():
    """Smoke-test menu entry: print the success message and do nothing else."""
    print('执行成功')
url = 'https://www.nasa.gov/image-of-the-day/' # URL of the page to crawl; replace with the page you want
save_dir = 'NASA_daily_pictures' # directory the downloaded images are saved to
def menu():
    """Print the main menu listing the available task options."""
    # The banner names this tool; the previous title belonged to an
    # unrelated UVB/UVC analysis program.
    print('【任务选择】\n'
          '+——————————————————NASA每日一图爬取工具——————————————————+\n'
          '|0、退出。|\n'
          '|1、NASA每日一图全部图片爬取 |\n'
          '|2、NASA每日一图最新图片爬取 |\n'
          '|01、测试。|\n'
          '+———————————————————————————————————————————————————————+')
# Task-selection module
def task():
    """Run the interactive menu loop until the user enters ``0``.

    Each round prints the menu, reads an option, and runs the matching task.
    A task that raises is reported and the loop continues, so one failed
    download does not terminate the program.
    """
    # Option -> action. Lambdas look up url/save_dir at call time.
    actions = {
        '1': lambda: download_all_images(url, save_dir),
        '2': lambda: download_newest_images(save_dir),
        '01': test,
    }
    while True:
        menu()  # print the main menu
        # Strip so stray spaces around the option are not rejected.
        num = input('请输入任务选项:').strip()
        if num == '0':
            print('程序结束!')
            break

        action = actions.get(num)
        if action is None:
            print('输入选项有误')
        else:
            try:
                action()
            except Exception as exc:
                # Top-level boundary: report the failure, keep the menu alive.
                print(f'任务执行失败:{exc}')
        input("回车显示主菜单")
# Entry point: start the menu only when the file is run as a script.
if __name__ == '__main__':
    task()  # run the task-selection loop