# -*- coding: utf-8 -*-
"""
NAME
    chirps_dekad_update.py
    Update the global CHIRPS dekad collection in a folder: download, gunzip, rename and compress
DESCRIPTION
    This script will:
    1. Check the existing files in the Dekad folder, matching the date `..._YYYY.MM.D.tif`
       against the collection files listed at BASE_URL (see the naming example in the comments below).
    2. If a file already exists in the Dekad folder, it is skipped and the script proceeds
       to the next date.
    3. If a file listed at the URL is not yet available in the Dekad folder, it is downloaded,
       gunzipped, renamed following the standard convention, and rewritten with GeoTIFF compression.
EXAMPLES
    python chirps_dekad_update.py
NOTES
    This script is designed to work with global CHIRPS dekad data.
    If using other data, some adjustments are required: parsing filename, directory, threshold.
    All CHIRPS data and products are available at s3://wbgdecinternal-ntl/climate/
CONTACT
    Benny Istanto
    Climate Geographer
    GOST/DECAT/DEC Data Group, The World Bank
LICENSE
    This script is in the public domain, free from copyrights or restrictions.
VERSION
    $Id$
TODO
    xx
"""
import os
import gzip
import shutil
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from osgeo import gdal  # used for the GeoTIFF compression step
BASE_URL = "https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_dekad/tifs/"
OUTPUT_FOLDER = "X:\\Temp\\CHIRPS\\Dekad"

if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
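# Naming convention handled by this script (the date below is hypothetical, for illustration only):
#   source on the CHC server : chirps-v2.0.2023.03.1.tif.gz   (YYYY.MM.D, D = dekad 1, 2 or 3)
#   renamed local output     : wld_cli_chirps_precip_dekad_2023.03.1.tif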
def download_file(url, output_folder):
    response = requests.get(url, stream=True)
    filename = url.split("/")[-1]
    output_path = os.path.join(output_folder, filename)
    with open(output_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return output_path


def gunzip_file(input_file, output_file):
    with gzip.open(input_file, "rb") as f_in, open(output_file, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
def rename_and_move_file(input_file, output_folder):
    # Split only the file name (not the full path) so extra dots in the folder path cannot
    # shift the year/month/dekad positions.
    parts = os.path.basename(input_file).split(".")
    year, month, dekad = parts[2], parts[3], parts[4]
    new_name = f"wld_cli_chirps_precip_dekad_{year}.{month}.{dekad}.tif"
    output_path = os.path.join(output_folder, new_name)
    shutil.move(input_file, output_path)
    return output_path
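# For reference, with a hypothetical input "compressed_chirps-v2.0.2023.03.1.tif",
# splitting on "." gives ["compressed_chirps-v2", "0", "2023", "03", "1", "tif"],
# so parts[2], parts[3] and parts[4] hold the year, month and dekad.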
def extract_file_urls(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    file_urls = []
    for link in soup.find_all("a", href=True):
        file_url = link["href"]
        if file_url.endswith(".gz"):
            file_urls.append(file_url)
    return file_urls
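# Note: the regular expressions used below assume the directory listing exposes relative hrefs
# that are just the file names (e.g. "chirps-v2.0.YYYY.MM.D.tif.gz"); that is why each href is
# matched directly and later joined onto BASE_URL with urljoin().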
def extract_file_dates(file_paths):
    file_dates = set()
    date_pattern = re.compile(r"wld_cli_chirps_precip_dekad_(\d{4})\.(\d{2})\.(\d)\.tif")
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        match = date_pattern.match(file_name)
        if match:
            year, month, dekad = match.groups()
            date_str = f"{year}-{month}-{dekad}"  # the dekad is a single digit (1-3)
            file_dates.add(date_str)
    return file_dates
def compress_geotiff(input_file, output_file):
    input_dataset = gdal.Open(input_file)
    options = [
        "COMPRESS=LZW",
        "PREDICTOR=1",
        #"ZLEVEL=9",
        #"TILED=YES",
        #"BLOCKXSIZE=256",
        #"BLOCKYSIZE=256",
    ]
    # Pass the list as GeoTIFF creation options (the `-co` flags of gdal_translate)
    gdal.Translate(output_file, input_dataset, creationOptions=options)
    input_dataset = None  # Close the input dataset
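# For reference only: the compression above is roughly equivalent to the following command-line
# call, assuming the GDAL binaries are installed and on PATH (file names are placeholders):
#   gdal_translate -co COMPRESS=LZW -co PREDICTOR=1 input.tif output.tif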
# Main part of the script
response = requests.get(BASE_URL)
file_urls = extract_file_urls(response.content)  # Parse the .gz links out of the directory listing

# Get the list of existing files and their dates
existing_files = [os.path.join(OUTPUT_FOLDER, f) for f in os.listdir(OUTPUT_FOLDER) if f.endswith(".tif")]
existing_dates = extract_file_dates(existing_files)

# Get the list of dates that need to be downloaded
missing_dates = set()
date_pattern = re.compile(r"chirps-v2\.0\.(\d{4})\.(\d{2})\.(\d)\.tif\.gz")  # Match the date components in the file_url
for file_url in file_urls:
    match = date_pattern.match(file_url)
    if match:
        year, month, dekad = match.groups()
        file_date = f"{year}-{month}-{dekad}"
        if file_date not in existing_dates:
            missing_dates.add((file_date, file_url))  # Keep the file URL as well, so it can be downloaded later
# Download and process the missing dates
for missing_date, file_url in missing_dates:
    year, month, dekad = missing_date.split("-")
    tif_filename = file_url.replace(".tif.gz", ".tif")  # Replace .tif.gz with .tif
    renamed_tif = os.path.join(OUTPUT_FOLDER, f"wld_cli_chirps_precip_dekad_{year}.{month}.{dekad}.tif")

    if os.path.exists(renamed_tif):
        print(f"File {renamed_tif} already exists.")
    else:
        print(f"Downloading file {file_url}")
        gz_file_path = download_file(urljoin(BASE_URL, file_url), OUTPUT_FOLDER)
        tif_file_path = os.path.join(OUTPUT_FOLDER, tif_filename)
        gunzip_file(gz_file_path, tif_file_path)
        os.remove(gz_file_path)  # Remove the .gz file after decompression

        # Compress the GeoTIFF file and remove the original TIFF
        compressed_tif_path = os.path.join(OUTPUT_FOLDER, f"compressed_{tif_filename}")
        compress_geotiff(tif_file_path, compressed_tif_path)
        os.remove(tif_file_path)

        renamed_file_path = rename_and_move_file(compressed_tif_path, OUTPUT_FOLDER)
        print(f"Processed: {renamed_file_path}")