# -*- coding: utf-8 -*-
"""
NAME
chirps_dekad_update.py
Update the global CHIRPS dekad collection in a folder: download, gunzip, rename and compress
DESCRIPTION
This script will:
1. Check the existing files in the Dekad folder, matching the date `..._YYYY.MM.D.tif`
with the collection files at BASE_URL.
2. If a file already exists in the Dekad folder, skip it and proceed to the next date.
3. If a file at the URL is not available in the Dekad folder, download, gunzip, and
rename it following the naming standard, and apply GeoTIFF compression.
EXAMPLES
python chirps_dekad_update.py
NOTES
This script is designed to work with the global CHIRPS dekad data.
If using other data, some adjustments are required: filename parsing, directory, threshold.
All CHIRPS data and products are available at s3://wbgdecinternal-ntl/climate/
CONTACT
Benny Istanto
Climate Geographer
GOST/DECAT/DEC Data Group, The World Bank
LICENSE
This script is in the public domain, free from copyrights or restrictions.
VERSION
$Id$
TODO
xx
"""
import os
import gzip
import shutil
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from osgeo import gdal

BASE_URL = "https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_dekad/tifs/"
OUTPUT_FOLDER = "X:\\Temp\\CHIRPS\\Dekad"
# Create the output folder if it does not exist yet
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

def download_file(url, output_folder):
    """Stream a file from `url` into `output_folder` and return its local path."""
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Fail loudly instead of saving an HTML error page
    filename = url.split("/")[-1]
    output_path = os.path.join(output_folder, filename)
    with open(output_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return output_path

def gunzip_file(input_file, output_file):
    """Decompress a .gz archive to `output_file`."""
    with gzip.open(input_file, "rb") as f_in:
        with open(output_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

def rename_and_move_file(input_file, output_folder):
    """Rename a CHIRPS tif to the wld_cli_* convention and move it into `output_folder`."""
    # Split on the basename so dots in the directory path cannot shift the indices;
    # for "compressed_chirps-v2.0.YYYY.MM.D.tif", parts[2:5] are year, month, dekad.
    parts = os.path.basename(input_file).split(".")
    year, month, dekad = parts[2], parts[3], parts[4]
    new_name = f"wld_cli_chirps_precip_dekad_{year}.{month}.{dekad}.tif"
    output_path = os.path.join(output_folder, new_name)
    shutil.move(input_file, output_path)
    return output_path

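# Illustrative example (hypothetical filename): passing
# "compressed_chirps-v2.0.2023.01.1.tif" through rename_and_move_file()
# yields ".../wld_cli_chirps_precip_dekad_2023.01.1.tif".
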
def extract_file_urls(html_content):
    """Collect all .gz hrefs from the CHC directory listing page."""
    soup = BeautifulSoup(html_content, "html.parser")
    file_urls = []
    for link in soup.find_all("a", href=True):
        file_url = link["href"]
        if file_url.endswith(".gz"):
            file_urls.append(file_url)
    return file_urls

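# Note: on the CHC directory listing, hrefs are plain filenames such as
# "chirps-v2.0.YYYY.MM.D.tif.gz", so they are resolved against BASE_URL
# with urljoin() before downloading (see the main loop below).
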
def extract_file_dates(file_paths):
    """Return the set of dates (as "YYYY-MM-D" strings) already present locally."""
    file_dates = set()
    date_pattern = re.compile(r"wld_cli_chirps_precip_dekad_(\d{4})\.(\d{2})\.(\d)\.tif")
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        match = date_pattern.match(file_name)
        if match:
            year, month, dekad = match.groups()
            file_dates.add(f"{year}-{month}-{dekad}")
    return file_dates

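# Example: "wld_cli_chirps_precip_dekad_2023.01.1.tif" produces the key
# "2023-01-1", matching the key format built for remote files in the main
# section below, so set-membership checks compare like with like.
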
def compress_geotiff(input_file, output_file):
    """Rewrite a GeoTIFF with LZW compression via gdal.Translate."""
    input_dataset = gdal.Open(input_file)
    creation_options = [
        "COMPRESS=LZW",
        "PREDICTOR=1",
        #"ZLEVEL=9",
        #"TILED=YES",
        #"BLOCKXSIZE=256",
        #"BLOCKYSIZE=256",
    ]
    # Creation options must be passed via creationOptions; passing them through
    # `options` would be parsed as gdal_translate command-line switches and fail.
    gdal.Translate(output_file, input_dataset, creationOptions=creation_options)
    input_dataset = None  # Close the input dataset

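# Note on the settings above: PREDICTOR=1 disables prediction. If the CHIRPS
# rasters are Float32 (an assumption), PREDICTOR=3 (floating-point predictor)
# may compress better with LZW; PREDICTOR=1 is kept here to match the
# original configuration.
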
# Main part of the script
response = requests.get(BASE_URL)
response.raise_for_status()
file_urls = extract_file_urls(response.content)

# Get the list of existing files and their dates
existing_files = [os.path.join(OUTPUT_FOLDER, f) for f in os.listdir(OUTPUT_FOLDER) if f.endswith(".tif")]
existing_dates = extract_file_dates(existing_files)

# Get the list of dates that need to be downloaded
missing_dates = set()
date_pattern = re.compile(r"chirps-v2\.0\.(\d{4})\.(\d{2})\.(\d)\.tif\.gz")  # Match the date components in the file_url
for file_url in file_urls:
    match = date_pattern.match(file_url)
    if match:
        year, month, dekad = match.groups()
        file_date = f"{year}-{month}-{dekad}"
        if file_date not in existing_dates:
            missing_dates.add((file_date, file_url))  # Keep the URL so it can be downloaded later
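# Each missing_dates entry pairs a date key with its remote filename,
# e.g. (hypothetical): ("2023-01-1", "chirps-v2.0.2023.01.1.tif.gz")
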
# Download and process the missing dates
for missing_date, file_url in missing_dates:
    year, month, dekad = missing_date.split("-")
    tif_filename = file_url.replace(".tif.gz", ".tif")
    renamed_tif = os.path.join(OUTPUT_FOLDER, f"wld_cli_chirps_precip_dekad_{year}.{month}.{dekad}.tif")
    if os.path.exists(renamed_tif):
        print(f"File {renamed_tif} already exists.")
    else:
        print(f"Downloading file {file_url}")
        gz_file_path = download_file(urljoin(BASE_URL, file_url), OUTPUT_FOLDER)
        tif_file_path = os.path.join(OUTPUT_FOLDER, tif_filename)
        gunzip_file(gz_file_path, tif_file_path)
        os.remove(gz_file_path)  # Remove the .gz file after decompression

        # Compress the GeoTIFF, remove the uncompressed TIFF, then rename the result
        compressed_tif_path = os.path.join(OUTPUT_FOLDER, f"compressed_{tif_filename}")
        compress_geotiff(tif_file_path, compressed_tif_path)
        os.remove(tif_file_path)
        renamed_file_path = rename_and_move_file(compressed_tif_path, OUTPUT_FOLDER)
        print(f"Processed: {renamed_file_path}")