Download LAADS data from https://ladsweb.modaps.eosdis.nasa.gov
#!/usr/bin/env python3
#
# Download LAADS data from https://ladsweb.modaps.eosdis.nasa.gov
#
import argparse
import concurrent.futures
import datetime
import shutil
import sys
import threading
from pathlib import Path

import requests
from urllib3.util import Retry

USERAGENT = "tis/download.py_1.0--" + sys.version.replace("\n", "").replace("\r", "")
def buildSession(token):
    session = requests.Session()
    # Authorize against API
    session.headers = {"Authorization": "Bearer " + token, "User-Agent": USERAGENT}
    # Mount a retry adapter to account for server errors
    # total:
    #     Total number of retries to allow. Takes precedence over other counts.
    #
    # backoff_factor:
    #     A backoff factor to apply between attempts after the second try
    #     (most errors are resolved immediately by a second try without a delay).
    #     urllib3 will sleep for:
    #         {backoff factor} * (2 ** ({number of total retries} - 1)) seconds.
    #     If the backoff_factor is 0.1, then .sleep will sleep for [0.0s, 0.2s, 0.4s, ...]
    #     between retries. It will never be longer than Retry.BACKOFF_MAX.
    #
    # status_forcelist:
    #     A set of integer HTTP status codes that we should force a retry on.
    retry_adapter = requests.adapters.HTTPAdapter(
        max_retries=Retry(total=5, backoff_factor=0.5, status_forcelist=[502, 503, 504])
    )
    session.mount("http://", retry_adapter)
    session.mount("https://", retry_adapter)
    return session
def downloadFile(
    session, dayBaseUrl, file, dstDir, deleteIncomplete=False, verbose=False
):
    fileUrl = f"{dayBaseUrl}/{file['name']}"
    # Create destination directory (if not exists)
    dstDir.mkdir(parents=True, exist_ok=True)
    dstFile = dstDir / file["name"]
    # Check if file exists and has the expected size
    # Default mode to open destination file (truncate)
    dstFileMode = "wb"
    extraHeaders = {}
    currentFileSize = 0
    if dstFile.exists():
        currentFileSize = dstFile.stat().st_size
        if currentFileSize == file["size"]:
            if verbose:
                print(f"File exists: {file['name']}")
            return True
        elif currentFileSize < file["size"]:
            # Try to resume the download.
            extraHeaders["Range"] = f"bytes={currentFileSize}-"
            # Open destination file in append mode
            dstFileMode = "ab"
        else:
            # Current file is bigger than the source file, start over
            dstFile.unlink()
            currentFileSize = 0
    with session.get(fileUrl, stream=True, headers=extraHeaders) as r:
        r.raise_for_status()
        what = "Downloading"
        if currentFileSize > 0:
            if r.status_code == 206:
                what = f"Resuming (from {currentFileSize})"
            elif r.status_code == 200:
                # Tried to resume the download, but the server did not respond
                # with partial content. So truncate the file instead of appending.
                dstFileMode = "wb"
        if verbose:
            print(f"{what} file: {file['name']} md5: {file.get('md5sum', 'N/A')}")
        with dstFile.open(dstFileMode) as f:
            shutil.copyfileobj(r.raw, f)
    if dstFile.stat().st_size != file["size"]:
        if verbose:
            print(f"File size mismatch: {file['name']}")
        if deleteIncomplete:
            dstFile.unlink()
        return False
    return True
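# Note: the file listing may include an "md5sum" per file; downloadFile above only
# prints it. The helper below is a sketch added for illustration (not part of the
# original gist) that verifies a finished download against that checksum.
import hashlib


def verifyMD5(dstFile, expectedMD5, chunkSize=1024 * 1024):
    # Stream the file through hashlib.md5 in chunks and compare hex digests.
    md5 = hashlib.md5()
    with dstFile.open("rb") as f:
        for chunk in iter(lambda: f.read(chunkSize), b""):
            md5.update(chunk)
    return md5.hexdigest() == expectedMD5.lower()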
def downloadFileWorker(
    thread_sessions,
    token,
    dayBaseUrl,
    file,
    dstDir,
    deleteIncomplete=False,
    verbose=False,
):
    # Sessions are not thread safe so ensure one per thread
    threadID = threading.get_ident()
    session = thread_sessions.get(threadID)
    if session is None:
        session = buildSession(token)
        thread_sessions[threadID] = session
    return downloadFile(session, dayBaseUrl, file, dstDir, deleteIncomplete, verbose)
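# Design note (not from the original gist): the thread_sessions dict keyed by
# threading.get_ident() works because ThreadPoolExecutor reuses its worker threads.
# An equivalent sketch using threading.local(), which makes the per-thread lookup
# implicit, could look like:
#
#     _local = threading.local()
#
#     def getThreadSession(token):
#         if getattr(_local, "session", None) is None:
#             _local.session = buildSession(token)
#         return _local.session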
def productDayListingUrl(baseUrl, product, currentDay):
    return f"{baseUrl}/{product}/{currentDay.year}/{currentDay.strftime('%j')}"


def getProductDayJSON(session, dayBaseUrl):
    r = session.get(dayBaseUrl + ".json")
    r.raise_for_status()
    j = r.json()
    # There are two different JSON formats in existence
    # (or LAADS switches between them...). Account for the list
    # of files inside a "content" key as well as at the JSON root here.
    if "content" in j:
        return filter(lambda e: e["kind"] == "FILE", j["content"])
    else:
        return j
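# For illustration (values made up, only the keys this script uses are shown), the
# two listing shapes handled above look roughly like:
#
#     {"content": [{"kind": "FILE", "name": "<granule>", "size": 12345, "md5sum": "<hex>"}, ...]}
#
# or, with the file entries directly at the JSON root:
#
#     [{"name": "<granule>", "size": 12345, "md5sum": "<hex>"}, ...]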
def downloadAll(
    token,
    start,
    end,
    baseUrl,
    products,
    dst,
    threads=5,
    deleteIncomplete=False,
    verbose=False,
    filterFunc=None,
):
    # The session used to fetch the file listings.
    # Each download thread creates its own session (as sessions are not thread safe).
    session = buildSession(token)
    currentDay = start
    thread_sessions = {}
    url_futures = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while currentDay <= end:
            for product in products:
                dayBaseUrl = productDayListingUrl(baseUrl, product, currentDay)
                fileListing = getProductDayJSON(session, dayBaseUrl)
                for file in fileListing:
                    # Filter
                    if filterFunc is not None:
                        if not filterFunc(product, file):
                            continue
                    dstDir = (
                        dst / product / str(currentDay.year) / currentDay.strftime("%j")
                    )
                    f = executor.submit(
                        downloadFileWorker,
                        thread_sessions,
                        token,
                        dayBaseUrl,
                        file,
                        dstDir,
                        deleteIncomplete,
                        verbose,
                    )
                    url_futures[f] = file
            currentDay += datetime.timedelta(days=1)
        for future in concurrent.futures.as_completed(url_futures):
            file = url_futures[future]
            try:
                result = future.result()
            except Exception as exc:
                print("%r generated an exception: %s" % (file["name"], exc))
            else:
                if not result:
                    print("%s failed" % file["name"])
def filterByFileName(product, file):
    if product.endswith("EFR") or product.endswith("RBT"):
        for s in ("_1080_", "_1260_", "_1440_", "_1620_", "_1800_"):
            if s in file["name"]:
                return True
        return False
    return True
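# A filterFunc receives (product, file) and keeps the file when it returns True.
# Hypothetical alternative shown for illustration only (the ".nc" suffix is an
# assumption, not something the original script relies on): keep only files with
# a given extension.
def filterBySuffix(product, file, suffix=".nc"):
    return file["name"].endswith(suffix)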
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download LAADS data")
    parser.add_argument("--token", type=str)
    parser.add_argument("--start", type=datetime.date.fromisoformat)
    parser.add_argument("--end", type=datetime.date.fromisoformat)
    parser.add_argument("--products", type=str, nargs="+")
    parser.add_argument(
        "--src",
        type=str,
        default="https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/450/",
    )
    parser.add_argument("--dst", type=Path)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("--delete-incomplete", action="store_true")
    parser.add_argument("--threads", type=int, default=5)
    args = parser.parse_args()

    downloadAll(
        args.token,
        args.start,
        args.end,
        args.src,
        args.products,
        args.dst,
        args.threads,
        args.delete_incomplete,
        args.verbose,
        filterByFileName,
    )
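Usage: the script expects a LAADS application key via --token (sent as a Bearer token), an ISO start and end date, one or more --products as they appear in the archive listing, and a --dst directory. The same download can be driven from Python by calling downloadAll directly; the sketch below uses placeholder values (token, dates, product name) that are not taken from the gist:

import datetime
from pathlib import Path

downloadAll(
    token="YOUR_LAADS_APP_KEY",       # placeholder LAADS app key (Bearer token)
    start=datetime.date(2021, 1, 1),  # placeholder date range
    end=datetime.date(2021, 1, 7),
    baseUrl="https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/450/",
    products=["<PRODUCT_NAME>"],      # placeholder; directory name under the archive
    dst=Path("./laads"),
    threads=5,
    deleteIncomplete=True,
    verbose=True,
    filterFunc=filterByFileName,
)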