bennyistanto · June 8, 2024 12:48
diff --git a/imerg_dailymaxbydekad.py b/imerg_dailymaxbydekad.py
 import xarray as xr
 import numpy as np
 import os
 import glob
 import pandas as pd
 from tqdm import tqdm

 # Define the input and output directories
 max_calculations = [
    {"days": 1, "input_dir": "/mnt/d/temp/imerg/data/fdd_1day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_1day"},
    {"days": 2, "input_dir": "/mnt/d/temp/imerg/data/fdd_2day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_2day"},
    {"days": 3, "input_dir": "/mnt/d/temp/imerg/data/fdd_3day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_3day"},
    {"days": 4, "input_dir": "/mnt/d/temp/imerg/data/fdd_4day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_4day"},
    {"days": 5, "input_dir": "/mnt/d/temp/imerg/data/fdd_5day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_5day"},
 ]

 # Directory to store combined yearly datasets
 combined_year_dir = "/mnt/d/temp/imerg/data/combined_year_ds"

 # Global variable to store user's choice
 user_choice = None

 def set_user_decision():
    """Prompt user for decision on existing files and store it globally."""
    global user_choice
    if user_choice is None:
        decision = input("An output file already exists. Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        while decision not in ['R', 'S', 'A']:
            print("Invalid choice. Please choose again.")
            decision = input("Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        user_choice = decision

 def create_combined_yearly_ds(input_dir, days):
    """Create combined yearly datasets and save them to disk."""
    # Check if input folder exists
    if not os.path.exists(input_dir):
        print(f"Input folder does not exist: {input_dir}")
        exit(1)

    os.makedirs(combined_year_dir, exist_ok=True)

    # Create a list of files
    file_list = sorted(glob.glob(os.path.join(input_dir, f"wld_cli_imerg_fdd_{days}day_*.nc4")))

    # Group files by year
    files_by_year = {}
    for file_path in file_list:
        date_str = file_path.split('_')[-1].split('.')[0]
        year = date_str[:4]
        if year not in files_by_year:
            files_by_year[year] = []
        files_by_year[year].append(file_path)

    # Process each year separately and create combined dataset for each year
    with tqdm(total=len(files_by_year), desc=f"Processing {days}-day data by year") as year_pbar:
        for year, year_files in files_by_year.items():
            combined_ds_path = os.path.join(combined_year_dir, f"combined_{days}day_{year}.nc4")
            if not os.path.exists(combined_ds_path):
                with tqdm(total=len(year_files), desc=f"Combining datasets for {year}", leave=False) as combine_pbar:
                    datasets = [xr.open_dataset(f) for f in year_files]
                    combined_year_ds = xr.concat(datasets, dim='time')
                    combine_pbar.update(len(year_files))
                    
                    combined_year_ds.to_netcdf(combined_ds_path)
                    combined_year_ds.close()
                    for ds in datasets:
                        ds.close()
                print(f"Combined dataset saved to {combined_ds_path}")
            year_pbar.update(1)

 def calculate_dekad_max(combined_ds):
    """Calculate maximum precipitation for each dekad (10-day period) within each month."""
    dekad_max_list = []

    for year in np.unique(combined_ds.time.dt.year.values):
        for month in np.unique(combined_ds.time.dt.month.values):
            month_data = combined_ds.sel(time=str(year) + '-' + str(month).zfill(2))
            if month_data.time.size > 0:
                # Calculate number of days in the month, considering leap years
                days_in_month = pd.Timestamp(f"{year}-{month:02d}-01").days_in_month
                dekads = [(1, 10), (11, 20), (21, days_in_month)]
                for start_day, end_day in dekads:
                    dekad_data = month_data.sel(time=slice(f"{year}-{month:02d}-{start_day:02d}",
                                                           f"{year}-{month:02d}-{end_day:02d}"))
                    if dekad_data.time.size > 0:
                        dekad_max = dekad_data.max(dim='time')
                        dekad_start_date = f"{year}-{month:02d}-{start_day:02d}"
                        dekad_max['time'] = pd.date_range(start=dekad_start_date, periods=1)
                        dekad_max_list.append(dekad_max)

    return xr.concat(dekad_max_list, dim='time')

 for calc in max_calculations:
    days = calc["days"]
    input_dir = calc["input_dir"]
    output_dir = calc["output_dir"]

    os.makedirs(output_dir, exist_ok=True)

    # Create combined yearly datasets if they don't already exist
    create_combined_yearly_ds(input_dir, days)

    # Load and process each combined dataset by year
    with tqdm(total=len(os.listdir(combined_year_dir)), desc=f"Calculating dekad max for {days}-day data") as year_pbar:
        for combined_ds_file in os.listdir(combined_year_dir):
            if f"combined_{days}day_" not in combined_ds_file:
                continue
            combined_ds_path = os.path.join(combined_year_dir, combined_ds_file)
            combined_ds = xr.open_dataset(combined_ds_path)

            # Calculate dekad max
            dekad_max = calculate_dekad_max(combined_ds)

            # Save each dekad's maximum as a separate file
            for date in tqdm(dekad_max.time.values, desc=f"Saving {days}-day dekad max files", leave=False):
                dekad_start_date = pd.Timestamp(date).strftime('%Y%m%d')
                output_filename = f"wld_cli_imerg_fddmax_{days}day_{dekad_start_date}.nc4"
                output_filepath = os.path.join(output_dir, output_filename)

                if os.path.exists(output_filepath):
                    if user_choice is None:
                        set_user_decision()
                    if user_choice == 'S':
                        print(f"Skipping existing file: {output_filepath}")
                        continue  # Skip to the next file
                    elif user_choice == 'A':
                        print("Aborting process.")
                        exit(0)  # Exit the script
                    elif user_choice == 'R':
                        pass  # Replace the file

                try:
                    ds = dekad_max.sel(time=date)
                    ds = ds.expand_dims(time=[date])  # Add the time dimension with the correct date

                    # Ensure CF conventions and compression
                    encoding = {var: {'zlib': True, 'complevel': 5} for var in ds.data_vars}
                    ds.attrs['Conventions'] = 'CF-1.8'
                    ds.to_netcdf(output_filepath, encoding=encoding)
                    print(f"Saved: {output_filepath}")
                except Exception as e:
                    print(f"Error saving file {output_filepath}: {e}")

            print(f"{days}-day dekad max processing complete for {combined_ds_file}.")
            year_pbar.update(1)

 print("All dekad max calculations processed.")
	import xarray as xr
	import numpy as np
	import os
	import glob
	import pandas as pd
	from tqdm import tqdm

	# Define the input and output directories
	max_calculations = [
	{"days": 1, "input_dir": "/mnt/d/temp/imerg/data/fdd_1day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_1day"},
	{"days": 2, "input_dir": "/mnt/d/temp/imerg/data/fdd_2day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_2day"},
	{"days": 3, "input_dir": "/mnt/d/temp/imerg/data/fdd_3day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_3day"},
	{"days": 4, "input_dir": "/mnt/d/temp/imerg/data/fdd_4day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_4day"},
	{"days": 5, "input_dir": "/mnt/d/temp/imerg/data/fdd_5day", "output_dir": "/mnt/d/temp/imerg/data/fddmaxd_5day"},
	]

	# Directory to store combined yearly datasets
	combined_year_dir = "/mnt/d/temp/imerg/data/combined_year_ds"

	# Global variable to store user's choice
	user_choice = None

	def set_user_decision():
	"""Prompt user for decision on existing files and store it globally."""
	global user_choice
	if user_choice is None:
	decision = input("An output file already exists. Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
	while decision not in ['R', 'S', 'A']:
	print("Invalid choice. Please choose again.")
	decision = input("Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
	user_choice = decision

	def create_combined_yearly_ds(input_dir, days):
	"""Create combined yearly datasets and save them to disk."""
	# Check if input folder exists
	if not os.path.exists(input_dir):
	print(f"Input folder does not exist: {input_dir}")
	exit(1)

	os.makedirs(combined_year_dir, exist_ok=True)

	# Create a list of files
	file_list = sorted(glob.glob(os.path.join(input_dir, f"wld_cli_imerg_fdd_{days}day_*.nc4")))

	# Group files by year
	files_by_year = {}
	for file_path in file_list:
	date_str = file_path.split('_')[-1].split('.')[0]
	year = date_str[:4]
	if year not in files_by_year:
	files_by_year[year] = []
	files_by_year[year].append(file_path)

	# Process each year separately and create combined dataset for each year
	with tqdm(total=len(files_by_year), desc=f"Processing {days}-day data by year") as year_pbar:
	for year, year_files in files_by_year.items():
	combined_ds_path = os.path.join(combined_year_dir, f"combined_{days}day_{year}.nc4")
	if not os.path.exists(combined_ds_path):
	with tqdm(total=len(year_files), desc=f"Combining datasets for {year}", leave=False) as combine_pbar:
	datasets = [xr.open_dataset(f) for f in year_files]
	combined_year_ds = xr.concat(datasets, dim='time')
	combine_pbar.update(len(year_files))

	combined_year_ds.to_netcdf(combined_ds_path)
	combined_year_ds.close()
	for ds in datasets:
	ds.close()
	print(f"Combined dataset saved to {combined_ds_path}")
	year_pbar.update(1)

	def calculate_dekad_max(combined_ds):
	"""Calculate maximum precipitation for each dekad (10-day period) within each month."""
	dekad_max_list = []

	for year in np.unique(combined_ds.time.dt.year.values):
	for month in np.unique(combined_ds.time.dt.month.values):
	month_data = combined_ds.sel(time=str(year) + '-' + str(month).zfill(2))
	if month_data.time.size > 0:
	# Calculate number of days in the month, considering leap years
	days_in_month = pd.Timestamp(f"{year}-{month:02d}-01").days_in_month
	dekads = [(1, 10), (11, 20), (21, days_in_month)]
	for start_day, end_day in dekads:
	dekad_data = month_data.sel(time=slice(f"{year}-{month:02d}-{start_day:02d}",
	f"{year}-{month:02d}-{end_day:02d}"))
	if dekad_data.time.size > 0:
	dekad_max = dekad_data.max(dim='time')
	dekad_start_date = f"{year}-{month:02d}-{start_day:02d}"
	dekad_max['time'] = pd.date_range(start=dekad_start_date, periods=1)
	dekad_max_list.append(dekad_max)

	return xr.concat(dekad_max_list, dim='time')

	for calc in max_calculations:
	days = calc["days"]
	input_dir = calc["input_dir"]
	output_dir = calc["output_dir"]

	os.makedirs(output_dir, exist_ok=True)

	# Create combined yearly datasets if they don't already exist
	create_combined_yearly_ds(input_dir, days)

	# Load and process each combined dataset by year
	with tqdm(total=len(os.listdir(combined_year_dir)), desc=f"Calculating dekad max for {days}-day data") as year_pbar:
	for combined_ds_file in os.listdir(combined_year_dir):
	if f"combined_{days}day_" not in combined_ds_file:
	continue
	combined_ds_path = os.path.join(combined_year_dir, combined_ds_file)
	combined_ds = xr.open_dataset(combined_ds_path)

	# Calculate dekad max
	dekad_max = calculate_dekad_max(combined_ds)

	# Save each dekad's maximum as a separate file
	for date in tqdm(dekad_max.time.values, desc=f"Saving {days}-day dekad max files", leave=False):
	dekad_start_date = pd.Timestamp(date).strftime('%Y%m%d')
	output_filename = f"wld_cli_imerg_fddmax_{days}day_{dekad_start_date}.nc4"
	output_filepath = os.path.join(output_dir, output_filename)

	if os.path.exists(output_filepath):
	if user_choice is None:
	set_user_decision()
	if user_choice == 'S':
	print(f"Skipping existing file: {output_filepath}")
	continue # Skip to the next file
	elif user_choice == 'A':
	print("Aborting process.")
	exit(0) # Exit the script
	elif user_choice == 'R':
	pass # Replace the file

	try:
	ds = dekad_max.sel(time=date)
	ds = ds.expand_dims(time=[date]) # Add the time dimension with the correct date

	# Ensure CF conventions and compression
	encoding = {var: {'zlib': True, 'complevel': 5} for var in ds.data_vars}
	ds.attrs['Conventions'] = 'CF-1.8'
	ds.to_netcdf(output_filepath, encoding=encoding)
	print(f"Saved: {output_filepath}")
	except Exception as e:
	print(f"Error saving file {output_filepath}: {e}")

	print(f"{days}-day dekad max processing complete for {combined_ds_file}.")
	year_pbar.update(1)

	print("All dekad max calculations processed.")