# -*- coding: utf-8 -*-
"""
NAME
    chirps_dekad_update.py
    Update the global CHIRPS dekad collection in a folder: download, gunzip, rename and compress
DESCRIPTION
    This script will:
    1. Check the existing files in the Dekad folder, matching the date `..._YYYY.MM.D.tif`
       against the collection files listed at BASE_URL (see the naming example in the comments below).
    2. If a file already exists in the Dekad folder, it is skipped and the script proceeds
       to the next date.
    3. If a file listed at the URL is not yet available in the Dekad folder, it is downloaded,
       gunzipped, renamed following the standard convention, and rewritten with GeoTIFF compression.
EXAMPLES
    python chirps_dekad_update.py
NOTES
    This script is designed to work with global CHIRPS dekad data.
    If using other data, some adjustments are required: parsing filename, directory, threshold.
    All CHIRPS data and products are available at s3://wbgdecinternal-ntl/climate/
CONTACT
    Benny Istanto
    Climate Geographer
    GOST/DECAT/DEC Data Group, The World Bank
LICENSE
    This script is in the public domain, free from copyrights or restrictions.
VERSION
    $Id$
TODO
    xx
"""
import os
import gzip
import shutil
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from osgeo import gdal  # used for the GeoTIFF compression step
BASE_URL = "https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_dekad/tifs/"
OUTPUT_FOLDER = "X:\\Temp\\CHIRPS\\Dekad"

if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
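# Naming convention handled by this script (the date below is hypothetical, for illustration only):
#   source on the CHC server : chirps-v2.0.2023.03.1.tif.gz   (YYYY.MM.D, D = dekad 1, 2 or 3)
#   renamed local output     : wld_cli_chirps_precip_dekad_2023.03.1.tif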
def download_file(url, output_folder):
    response = requests.get(url, stream=True)
    filename = url.split("/")[-1]
    output_path = os.path.join(output_folder, filename)
    with open(output_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return output_path


def gunzip_file(input_file, output_file):
    with gzip.open(input_file, "rb") as f_in, open(output_file, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
def rename_and_move_file(input_file, output_folder):
    # Split only the file name (not the full path) so extra dots in the folder path cannot
    # shift the year/month/dekad positions.
    parts = os.path.basename(input_file).split(".")
    year, month, dekad = parts[2], parts[3], parts[4]
    new_name = f"wld_cli_chirps_precip_dekad_{year}.{month}.{dekad}.tif"
    output_path = os.path.join(output_folder, new_name)
    shutil.move(input_file, output_path)
    return output_path
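# For reference, with a hypothetical input "compressed_chirps-v2.0.2023.03.1.tif",
# splitting on "." gives ["compressed_chirps-v2", "0", "2023", "03", "1", "tif"],
# so parts[2], parts[3] and parts[4] hold the year, month and dekad.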
def extract_file_urls(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    file_urls = []
    for link in soup.find_all("a", href=True):
        file_url = link["href"]
        if file_url.endswith(".gz"):
            file_urls.append(file_url)
    return file_urls
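# Note: the regular expressions used below assume the directory listing exposes relative hrefs
# that are just the file names (e.g. "chirps-v2.0.YYYY.MM.D.tif.gz"); that is why each href is
# matched directly and later joined onto BASE_URL with urljoin().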
def extract_file_dates(file_paths):
    file_dates = set()
    date_pattern = re.compile(r"wld_cli_chirps_precip_dekad_(\d{4})\.(\d{2})\.(\d)\.tif")
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        match = date_pattern.match(file_name)
        if match:
            year, month, dekad = match.groups()
            date_str = f"{year}-{month}-{dekad}"  # the dekad is a single digit (1-3)
            file_dates.add(date_str)
    return file_dates
def compress_geotiff(input_file, output_file):
    input_dataset = gdal.Open(input_file)
    options = [
        "COMPRESS=LZW",
        "PREDICTOR=1",
        #"ZLEVEL=9",
        #"TILED=YES",
        #"BLOCKXSIZE=256",
        #"BLOCKYSIZE=256",
    ]
    # Pass the list as GeoTIFF creation options (the `-co` flags of gdal_translate)
    gdal.Translate(output_file, input_dataset, creationOptions=options)
    input_dataset = None  # Close the input dataset
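# For reference only: the compression above is roughly equivalent to the following command-line
# call, assuming the GDAL binaries are installed and on PATH (file names are placeholders):
#   gdal_translate -co COMPRESS=LZW -co PREDICTOR=1 input.tif output.tif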
# Main part of the script
response = requests.get(BASE_URL)
file_urls = extract_file_urls(response.content)  # Parse the .gz links out of the directory listing

# Get the list of existing files and their dates
existing_files = [os.path.join(OUTPUT_FOLDER, f) for f in os.listdir(OUTPUT_FOLDER) if f.endswith(".tif")]
existing_dates = extract_file_dates(existing_files)

# Get the list of dates that need to be downloaded
missing_dates = set()
date_pattern = re.compile(r"chirps-v2\.0\.(\d{4})\.(\d{2})\.(\d)\.tif\.gz")  # Match the date components in the file_url
for file_url in file_urls:
    match = date_pattern.match(file_url)
    if match:
        year, month, dekad = match.groups()
        file_date = f"{year}-{month}-{dekad}"
        if file_date not in existing_dates:
            missing_dates.add((file_date, file_url))  # Keep the file URL as well, so it can be downloaded later
# Download and process the missing dates
for missing_date, file_url in missing_dates:
    year, month, dekad = missing_date.split("-")
    tif_filename = file_url.replace(".tif.gz", ".tif")  # Replace .tif.gz with .tif
    renamed_tif = os.path.join(OUTPUT_FOLDER, f"wld_cli_chirps_precip_dekad_{year}.{month}.{dekad}.tif")

    if os.path.exists(renamed_tif):
        print(f"File {renamed_tif} already exists.")
    else:
        print(f"Downloading file {file_url}")
        gz_file_path = download_file(urljoin(BASE_URL, file_url), OUTPUT_FOLDER)
        tif_file_path = os.path.join(OUTPUT_FOLDER, tif_filename)
        gunzip_file(gz_file_path, tif_file_path)
        os.remove(gz_file_path)  # Remove the .gz file after decompression

        # Compress the GeoTIFF file and remove the original TIFF
        compressed_tif_path = os.path.join(OUTPUT_FOLDER, f"compressed_{tif_filename}")
        compress_geotiff(tif_file_path, compressed_tif_path)
        os.remove(tif_file_path)

        renamed_file_path = rename_and_move_file(compressed_tif_path, OUTPUT_FOLDER)
        print(f"Processed: {renamed_file_path}")