
LSEQM Bias Correction

Bias correction of satellite-based precipitation estimates is crucial in climate studies to enhance their accuracy and reliability. This exercise explores a method that combines Linear Scaling (LS) and Empirical Quantile Mapping (EQM) with tail adjustment using the Generalized Pareto Distribution (GPD). This hybrid approach aims to correct both the mean bias and the distributional discrepancies between satellite estimates and ground-based observations.

(Figure: lseqm_xkcd_style_b)

The following sections will describe the methodology, the role of different functions, their strengths, and weaknesses in achieving accurate bias correction.

1 Linear Scaling Approach

The Linear Scaling (LS) method is applied first to correct the mean bias in the IMERG precipitation data relative to the CPC precipitation data. The underlying theory for LS involves the calculation of a scale factor as the ratio of the observed mean to the modeled mean:

$$\text{Scale Factor} = \frac{\mu_{\text{obs}}}{\mu_{\text{mod}}}$$

where $\mu_{\text{obs}}$ is the mean precipitation from the CPC observations and $\mu_{\text{mod}}$ is the mean precipitation from the IMERG satellite data. This scale factor is then used to adjust the IMERG data:

$$P_{\text{corrected}} = P_{\text{IMERG}} \times \text{Scale Factor}$$

The strength of this approach lies in its simplicity and effectiveness in correcting systematic biases. However, it does not address distributional differences, particularly in the tails of the distribution where extreme values occur.
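As a minimal sketch of this step (not the moving-window implementation used in the scripts below), the scale factor and correction can be written in a few lines of NumPy; the function and variable names are illustrative:

```python
import numpy as np

def linear_scaling(imerg, cpc):
    """Scale IMERG values so that their mean matches the CPC mean."""
    mu_mod = np.nanmean(imerg)   # mean of the satellite estimates
    mu_obs = np.nanmean(cpc)     # mean of the ground observations
    scale_factor = mu_obs / mu_mod if mu_mod != 0 else 1.0
    return imerg * scale_factor
```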

2 Empirical Quantile Mapping (EQM)

Empirical Quantile Mapping (EQM) is a comprehensive method used to align the distributions of satellite-based precipitation estimates with ground-based observations. By matching the empirical cumulative distribution functions (CDFs) of the two datasets, EQM corrects distributional biases across the entire range of precipitation values. This section explores the various components of EQM, including the application of gamma distribution-based quantile mapping and tail adjustment using the Generalized Pareto Distribution (GPD), to enhance the accuracy of bias correction.

2.1 Basic Empirical Quantile Mapping (EQM)

Empirical Quantile Mapping (EQM) works by transforming the precipitation values such that their quantiles match:

$$Q_{\text{corrected}} = F_{\text{obs}}^{-1}\left(F_{\text{mod}}(P_{\text{IMERG}})\right)$$

where $F_{\text{obs}}$ and $F_{\text{mod}}$ are the CDFs of the observed and modeled precipitation, respectively, and $Q_{\text{corrected}}$ represents the bias-corrected quantiles. The strength of EQM lies in its ability to correct distributional biases across the entire range of precipitation values. However, it might miss extreme values if they are not well represented in the observational data.
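A minimal, purely empirical sketch of this mapping using raw sample quantiles (the scripts further below use fitted gamma distributions instead; names are illustrative):

```python
import numpy as np

def empirical_quantile_mapping(imerg, cpc):
    """Map IMERG values onto the CPC distribution via empirical CDFs."""
    imerg_sorted = np.sort(imerg)
    # Empirical CDF value of each IMERG observation
    probs = np.searchsorted(imerg_sorted, imerg, side='right') / imerg_sorted.size
    # Inverse empirical CDF of the CPC observations at those probabilities
    return np.quantile(np.sort(cpc), np.clip(probs, 0.0, 1.0))
```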

2.2 Gamma Distribution-Based Quantile Mapping

To further refine the bias correction, gamma distribution-based quantile mapping is applied. This involves fitting gamma distributions to both IMERG and CPC data using the method of moments, ensuring accurate representation of the distribution shapes. The gamma distribution is defined by its shape ($k$), location ($\theta$), and scale ($\beta$) parameters:

$$f(x; k, \theta, \beta) = \frac{(x-\theta)^{k-1} e^{-(x-\theta)/\beta}}{\beta^k \Gamma(k)}$$

Fitting these parameters involves solving moment equations that relate the moments of the data to the parameters of the gamma distribution. The strength of this method is its mathematical rigor in fitting the entire distribution. However, it requires careful handling of parameter bounds to avoid infeasibility issues during optimization.
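A compact sketch of the gamma-based mapping using SciPy's built-in maximum-likelihood fit; the scripts below instead estimate the parameters from moment equations with explicit bounds, so treat this only as an illustration of the idea:

```python
import numpy as np
from scipy.stats import gamma

def gamma_quantile_map(imerg, cpc):
    """Map IMERG values to the CPC distribution via fitted gamma distributions."""
    # Fit gamma distributions to the wet values (location fixed at zero for simplicity)
    k_mod, loc_mod, beta_mod = gamma.fit(imerg[imerg > 0], floc=0)
    k_obs, loc_obs, beta_obs = gamma.fit(cpc[cpc > 0], floc=0)
    # CDF of the modelled values, clipped away from 0 and 1 to avoid infinities
    probs = np.clip(gamma.cdf(imerg, k_mod, loc=loc_mod, scale=beta_mod), 1e-6, 1 - 1e-6)
    # Inverse CDF of the observed fit gives the corrected values
    return gamma.ppf(probs, k_obs, loc=loc_obs, scale=beta_obs)
```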

2.3 Tail Adjustment with Generalized Pareto Distribution (GPD)

One critical enhancement in the approach is the adjustment for extreme values using GPD. This involves fitting a GPD to the excesses above a high threshold, defined as the 95th percentile in this study. The GPD is defined by its shape ($\xi$), location ($\mu$), and scale ($\sigma$) parameters:

$$f(x; \xi, \mu, \sigma) = \frac{1}{\sigma} \left(1 + \xi \frac{x-\mu}{\sigma}\right)^{-(1/\xi + 1)}$$

This step is crucial for accurately capturing extreme precipitation events, which are often underrepresented in observational datasets. The main strength of this approach is its focus on tail behavior, improving the representation of extremes. However, the requirement for sufficient data points above the threshold can be a limitation in sparse datasets.
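A short sketch of the tail fit with scipy.stats.genpareto; the 95th-percentile threshold and the 10-point minimum mirror the choices in the scripts below, while the helper name is illustrative:

```python
import numpy as np
from scipy.stats import genpareto

def fit_gpd_tail(values, q=95):
    """Fit a GPD to the excesses above the q-th percentile threshold."""
    threshold = np.nanpercentile(values, q)
    excesses = values[values > threshold] - threshold
    if excesses.size < 10:           # too few exceedances for a stable fit
        return threshold, None
    shape, loc, scale = genpareto.fit(excesses)
    return threshold, (shape, loc, scale)
```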

3 Moving Window Approach

The moving window approach is used to balance the advantages of capturing both spatial and temporal variations in precipitation patterns. By considering both spatial and temporal windows, the bias correction method can address local variations and seasonal changes more effectively.

3.1 Temporal Moving Windows

(Figure: temporal_moving_window)

Advantages:

  1. Seasonal and Short-term Variability:

    • Captures seasonal variations and short-term changes in precipitation patterns, which is essential for accurate bias correction over time.
    • Can adapt to the changing nature of weather patterns, providing more responsive adjustments.
  2. Stability:

    • Temporal aggregation can help in smoothing out short-term fluctuations and noise in the data.
    • This can be particularly useful if there are periods with sparse station data or missing values.
  3. Trend and Cycle Detection:

    • Helps in identifying trends and cycles in precipitation patterns that can be important for long-term studies and threshold definitions.

Disadvantages:

  1. Spatial Variability Ignored:
    • May not adequately account for spatial variations in precipitation, especially in regions with complex terrain or diverse weather patterns.

3.2 Spatial Moving Windows

(Figure: spatial_moving_window)

Advantages:

  1. Spatial Consistency:

    • Ensures spatial coherence and consistency in the data, accounting for local variations in precipitation patterns.
    • Provides localized corrections that adapt to spatial variability, leading to more accurate adjustments at each grid cell.
  2. Alignment with Resolution:

    • Aligns well with the spatial resolution differences between IMERG and CPC-UNI, ensuring appropriate scaling to match the higher resolution data.
  3. Improved Spatial Representations:

    • Helps maintain the spatial structure and patterns of precipitation, which can be critical for defining accurate thresholds for extreme rainfall events.

Disadvantages:

  1. Short-term Variability Ignored:
    • May not capture short-term temporal variations as effectively as a temporal moving window.
    • Can miss transient weather phenomena that occur over short periods.

4 Handling Extreme Values

The method incorporates both Empirical Quantile Mapping (EQM) and Generalized Pareto Distribution (GPD) fitting to address extreme values in precipitation data. Empirical Quantile Mapping is adept at matching the distribution of satellite-derived IMERG precipitation values to ground-based CPC observations by mapping the quantiles of the IMERG data to those of the CPC data. However, EQM can sometimes fail to capture extreme values accurately, especially if these extremes are not well-represented in the CPC data. To mitigate this, the method applies Generalized Pareto Distribution (GPD) fitting to the tails of the distribution. This involves identifying a threshold (typically set at the 95th percentile) and fitting a GPD to the excesses above this threshold. This approach enhances the ability to model and correct for extreme precipitation events, ensuring that the corrected dataset retains the significant variability and magnitude of these extremes.
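A condensed sketch of how the two pieces fit together, reusing the hypothetical `fit_gpd_tail` helper sketched above; the function name and arguments are illustrative, not the exact implementation used in the scripts below:

```python
import numpy as np
from scipy.stats import genpareto

def eqm_with_gpd_tail(imerg, cpc, probs, mapped, q=95):
    """Replace mapped values above the CPC threshold with GPD-based quantiles.

    probs  : CDF values of the IMERG data under its fitted distribution.
    mapped : values already mapped onto the CPC distribution (EQM output).
    """
    threshold, params = fit_gpd_tail(cpc, q)
    if params is None:                  # not enough exceedances, keep the EQM output
        return mapped
    extreme = imerg > threshold
    out = mapped.copy()
    # Quantiles of the fitted GPD, shifted back above the threshold
    out[extreme] = genpareto.ppf(probs[extreme], *params) + threshold
    return out
```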

4.1 Strengths:

  • Improved Extreme Value Representation: By using GPD fitting for the tails of the distribution, the method ensures that extreme precipitation values are better captured and represented.
  • Distribution Matching: EQM effectively matches the overall distribution of satellite and ground-based data, improving the reliability of the corrected dataset.
  • Dynamic Adjustment: The method dynamically adjusts the tails of the distribution, which is particularly useful for regions with sparse observational data.

4.2 Weaknesses:

  • Complexity: The combined approach of EQM and GPD fitting adds complexity to the bias correction process, requiring careful implementation and validation.
  • Dependence on Threshold Selection: The accuracy of the GPD fitting is sensitive to the chosen threshold, which may require empirical tuning for different datasets or regions.
  • Computational Overhead: The process of fitting GPD and performing quantile mapping can be computationally intensive, especially for large datasets or high-resolution models.

5 Combined Approach and Implementation

Given the trade-offs and specific goals, a hybrid approach combining both spatial and temporal windows can leverage the advantages of both methods. This hybrid approach balances spatial consistency with temporal responsiveness, addressing both local variations and short-term changes.

5.1 Suggested Hybrid Approach:

  1. Spatial Window: Use a 5x5 spatial window to ensure spatial coherence and consistency, adjusting for local variations.
  2. Temporal Window: Apply a shorter temporal window (e.g., 5 days) within the spatial window to capture short-term temporal variations and seasonal patterns.

5.2 Implementation:

  1. Spatial Window Calculation:

    • For each grid cell, consider a 5x5 spatial window around the target cell.
    • Aggregate the precipitation values within this window.
  2. Temporal Window Calculation:

    • Within each spatial window, consider a temporal window (e.g., 5 days).
    • Aggregate the precipitation values over this period.
  3. Bias Correction:

    • Calculate the scale factor and apply linear scaling based on the combined spatial and temporal window values.
    • Perform empirical quantile mapping using the combined values, as sketched below.
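The following sketch shows one way the combined 5x5 spatial / 5-day temporal sample could be gathered for a single grid cell and time step before computing the scale factor and quantile mapping; the scripts further below implement the spatial and temporal variants separately, so this hybrid helper is an assumption for illustration only:

```python
import numpy as np

def hybrid_window_values(precip, t, i, j, spatial=5, temporal=5):
    """Collect values from a spatial x temporal window centred on (t, i, j).

    precip is a 3-D array with dimensions (time, lat, lon); edges are clipped.
    """
    hs, ht = spatial // 2, temporal // 2
    t0, t1 = max(t - ht, 0), min(t + ht + 1, precip.shape[0])
    r0, r1 = max(i - hs, 0), min(i + hs + 1, precip.shape[1])
    c0, c1 = max(j - hs, 0), min(j + hs + 1, precip.shape[2])
    return precip[t0:t1, r0:r1, c0:c1].ravel()

# The windowed IMERG and CPC samples would then feed the LS and EQM steps, e.g.
#   scale = np.nanmean(cpc_win) / np.nanmean(imerg_win)
#   corrected = quantile_map(imerg_win * scale, cpc_win)
```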

5.3 Strengths of the Combined Approach

  • Comprehensive Correction: By combining different methods, the approach corrects both mean and distributional biases.
  • Extreme Value Preservation: Tail adjustment with GPD ensures that extreme values, critical for understanding climate impacts, are preserved.
  • Mathematical Rigor: The use of gamma distribution fitting and GPD adds mathematical robustness to the correction process.
  • Improved Extreme Value Representation: By using GPD fitting for the tails of the distribution, the method ensures that extreme precipitation values are better captured and represented.

  • Distribution Matching: EQM effectively matches the overall distribution of satellite and ground-based data, improving the reliability of the corrected dataset.
  • Dynamic Adjustment: The method dynamically adjusts the tails of the distribution, which is particularly useful for regions with sparse observational data.

5.4 Weaknesses and Challenges

  • Data Requirements: The need for sufficient data points above the threshold for GPD fitting can be challenging in regions with sparse observations.
  • Computational Complexity: The combined approach is computationally intensive, requiring significant processing time and resources.
  • Parameter Sensitivity: The fitting processes for gamma and GPD distributions are sensitive to parameter bounds and initial guesses, requiring careful tuning to avoid convergence issues.

This hybrid approach leverages the strengths of both spatial and temporal windows to provide a more robust and accurate bias correction. It ensures that both moderate and extreme precipitation values are corrected, preserving the physical characteristics of the precipitation data and providing a sound basis for defining precise thresholds for extreme rainfall events.


This exercise demonstrates a robust method for bias correction of satellite-based precipitation estimates, combining Linear Scaling, Empirical Quantile Mapping, and Generalized Pareto Distribution. While the approach is computationally intensive and data-sensitive, it offers a comprehensive solution for correcting both moderate and extreme precipitation values. Future work can focus on optimizing computational efficiency and exploring alternative methods for regions with limited observational data.

Notes

Work in progress

Contact:

Benny Istanto, GISP [1] [2]

Footnotes

  1. Applied Climatology Study Program, Faculty of Mathematics and Natural Sciences, Bogor Agricultural University

  2. Geospatial Operation Support Team, Development Economic Data Group, The World Bank

# Import the library
import os
import numpy as np
import xarray as xr
import pandas as pd
import calendar
from scipy import optimize
from scipy.stats import gamma, genpareto
from sklearn.model_selection import KFold
from multiprocessing import Pool
import time

# Main directory on Google Drive
dir = f'/mnt/d/temp/gfm1609'

# Define the appropriate input and output directory paths
input_dir = f'{dir}/data/bc/input'
output_dir = f'{dir}/data/bc/output'
imerg_path = f'{input_dir}/imergl'
cpc_path = f'{input_dir}/cpcuni'
mask_path = f'{dir}/data/subset/iso3'
corrected_precip_path = f'{output_dir}/spatialwindow'

# List of output directories
output_directories = [output_dir, corrected_precip_path]

# Create the output directories if they don't already exist
for directory in output_directories:
    os.makedirs(directory, exist_ok=True)

# Global variable to store user's choice
user_choice = None

def set_user_decision():
    """Prompt user for decision on existing files and store it globally."""
    global user_choice
    if user_choice is None:
        decision = input("An output file already exists. Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        while decision not in ['R', 'S', 'A']:
            print("Invalid choice. Please choose again.")
            decision = input("Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        user_choice = decision

# To address the issue of capturing extreme values in satellite data while performing EQM, tail adjustment
# is used to improve the fit for extreme values. Tail adjustment with the Generalized Pareto Distribution (GPD)
# better captures the extreme values by specifically modeling the tails of the distribution, which is
# crucial for accurately representing extreme precipitation events.
def gamma_quantile_mapping(imerg_values, cpc_values):
    """
    Apply gamma distribution-based quantile mapping with improved fitting and tail adjustment
    to correct the distribution of precipitation data.

    This function fits gamma distributions to the IMERG and CPC precipitation values using
    regularization and stricter constraints to improve the fitting accuracy. It then computes
    the cumulative distribution function (CDF) of the IMERG values and applies the inverse CDF
    of the CPC values to obtain the corrected precipitation values. Additionally, it adjusts
    the tails using the Generalized Pareto Distribution (GPD) to better capture extreme values.

    Parameters:
    imerg_values (numpy.ndarray): Array of IMERG precipitation values.
    cpc_values (numpy.ndarray): Array of CPC precipitation values.

    Returns:
    numpy.ndarray: Corrected precipitation values after gamma quantile mapping with improved
    fitting and tail adjustment.
    """
    def moment_equations(data, params):
        """
        Define moment equations for fitting gamma distribution using moments.

        Parameters:
        data (numpy.ndarray): Array of data values.
        params (tuple): Parameters (shape, loc, scale) for the gamma distribution.

        Returns:
        tuple: Differences between the estimated and actual moments.
        """
        shape, loc, scale = params
        if abs(shape) < 1e-8:
            # Special case when shape is close to zero
            return (loc + scale * np.euler_gamma - np.mean(data),
                    scale**2 * np.pi**2 / 6 - np.var(data),
                    0)
        else:
            return (loc + scale * (1 - np.euler_gamma * shape) / shape - np.mean(data),
                    scale**2 * (1 - 2 * shape * np.euler_gamma + shape**2 * np.pi**2 / 6) / shape**2 - np.var(data),
                    np.sign(shape) * (3 * np.euler_gamma * shape - 1 - shape**3 * np.pi**2 / 6) / (1 - 2 * shape) - np.sign(np.mean((data - np.mean(data))**3)))

    def fit_gamma_with_moment_equations(data):
        """
        Fit a gamma distribution to the data using moment equations.

        Parameters:
        data (numpy.ndarray): Array of data values.

        Returns:
        tuple: Fitted parameters (shape, loc, scale) of the gamma distribution.
        """
        data = data[~np.isnan(data)]  # Remove NaN values from data
        if len(data) == 0:  # If no data left after removing NaNs, return default values
            return 1, 0, 1
        initial_guess = [0.1, np.mean(data), np.std(data)]
        lower_bounds = [0.001, np.min(data), 0.001]
        upper_bounds = [10, np.max(data), np.std(data) * 10]
        # Ensure that each lower bound is strictly less than the corresponding upper bound
        for i in range(3):
            if lower_bounds[i] >= upper_bounds[i]:
                upper_bounds[i] = lower_bounds[i] + 1e-6
        # Ensure initial guess is within bounds
        initial_guess = np.clip(initial_guess, lower_bounds, upper_bounds)
        bounds = (lower_bounds, upper_bounds)

        def objective(params):
            return moment_equations(data, params)

        try:
            result = optimize.least_squares(objective, initial_guess, bounds=bounds, max_nfev=10000)
            shape, loc, scale = result.x
        except ValueError as e:
            print(f"ValueError: {e}")
            print(f"Initial guess: {initial_guess}")
            print(f"Bounds: {bounds}")
            raise
        # Regularization and constraints
        shape = max(shape, 0.001)  # Ensure shape is positive
        scale = max(scale, 0.001)  # Ensure scale is positive
        return shape, loc, scale

    def fit_generalized_pareto_distribution(data, threshold):
        """
        Fit a Generalized Pareto Distribution (GPD) to the excesses above the threshold.

        Parameters:
        data (numpy.ndarray): Array of data values.
        threshold (float): Threshold value for defining the excesses.

        Returns:
        tuple: Fitted parameters of the GPD.
        """
        excesses = data[data > threshold] - threshold
        if len(excesses) < 10:  # Arbitrary minimum number of points for GPD fitting
            #print("Not enough excesses for GPD fitting. Using default parameters.")
            return (0, 0, 1)  # Return a default GPD with zero shape and unit scale
        params = genpareto.fit(excesses)
        return params

    def cross_validate_gpd(data, threshold, n_splits=5):
        """
        Cross-validate GPD fitting by splitting data into folds.

        Parameters:
        data (numpy.ndarray): Array of data values.
        threshold (float): Threshold value for defining the excesses.
        n_splits (int, optional): Number of cross-validation splits. Default is 5.

        Returns:
        tuple: Averaged parameters of the GPD from cross-validation.
        """
        excesses = data[data > threshold] - threshold
        if len(excesses) < n_splits:
            #print("Not enough excesses for cross-validation. Fitting GPD directly.")
            return fit_generalized_pareto_distribution(data, threshold)  # Fit GPD directly if not enough samples for cross-validation
        kf = KFold(n_splits=n_splits)
        params_list = []
        for train_index, test_index in kf.split(excesses):
            train_data, test_data = excesses[train_index], excesses[test_index]
            params = genpareto.fit(train_data)
            params_list.append(params)
        # Average the parameters from cross-validation
        shape_avg = np.mean([params[0] for params in params_list])
        loc_avg = np.mean([params[1] for params in params_list])
        scale_avg = np.mean([params[2] for params in params_list])
        return shape_avg, loc_avg, scale_avg

    # Combine the values within the spatial window
    imerg_values_flat = imerg_values.flatten()
    cpc_values_flat = cpc_values.flatten()
    if imerg_values_flat.size == 0 or cpc_values_flat.size == 0:
        return np.full(imerg_values.shape, np.nan)

    #print("Fitting gamma distributions to IMERG values...")
    shape1, loc1, scale1 = fit_gamma_with_moment_equations(imerg_values_flat)
    #print(f"IMERG Gamma Params: shape={shape1}, loc={loc1}, scale={scale1}")
    y = gamma.cdf(imerg_values_flat, shape1, loc=loc1, scale=scale1)
    #print(f"Gamma CDF of IMERG values: min={np.nanmin(y)}, max={np.nanmax(y)}")

    #print("Fitting gamma distributions to CPC values...")
    shape2, loc2, scale2 = fit_gamma_with_moment_equations(cpc_values_flat)
    #print(f"CPC Gamma Params: shape={shape2}, loc={loc2}, scale={scale2}")
    cpc_quantiles_flat = gamma.ppf(y, shape2, loc=loc2, scale=scale2)
    #print(f"Gamma PPF of CPC values: min={np.nanmin(cpc_quantiles_flat)}, max={np.nanmax(cpc_quantiles_flat)}")

    # Ensure CPC quantiles are within realistic bounds
    cpc_quantiles_flat = np.maximum(cpc_quantiles_flat, 0)
    #print(f"Adjusted CPC quantiles: min={np.nanmin(cpc_quantiles_flat)}, max={np.nanmax(cpc_quantiles_flat)}")

    #print("Fitting Generalized Pareto Distribution (GPD) to CPC values...")
    try:
        threshold = np.percentile(cpc_values_flat, 95)  # Set threshold at 95th percentile
    except IndexError as e:
        print(f"IndexError: {e}")
        threshold = np.nan
    if np.isnan(threshold):
        print("Threshold is NaN, skipping GPD fitting.")
    else:
        #print(f"GPD threshold: {threshold}")
        cpc_gpd_params = cross_validate_gpd(cpc_values_flat, threshold)
        #print(f"GPD Params: {cpc_gpd_params}")
        # Adjust the tails using GPD if there are enough data points
        extreme_mask = imerg_values_flat > threshold
        if np.any(extreme_mask):
            cpc_quantiles_flat[extreme_mask] = genpareto.ppf(y[extreme_mask], *cpc_gpd_params) + threshold
            #print(f"Adjusted CPC quantiles after GPD: min={np.nanmin(cpc_quantiles_flat)}, max={np.nanmax(cpc_quantiles_flat)}")

    #print("Applying dynamic cap to CPC quantiles...")
    try:
        dynamic_cap = np.percentile(cpc_values_flat, 99.9)  # Cap at the 99.9th percentile
    except IndexError as e:
        #print(f"IndexError: {e}")
        dynamic_cap = np.nan
    print(f"Dynamic cap: {dynamic_cap}")
    if np.isnan(dynamic_cap):
        #print("Dynamic cap is NaN, skipping dynamic capping.")
        return cpc_quantiles_flat.reshape(imerg_values.shape)
    cpc_quantiles_flat = np.minimum(cpc_quantiles_flat, dynamic_cap)

    # Ensure non-negative corrected values
    cpc_quantiles_flat = np.maximum(cpc_quantiles_flat, 0)

    #print("Reshaping CPC quantiles to original shape...")
    return cpc_quantiles_flat.reshape(imerg_values.shape)

def calculate_scale_factor(imerg_precip, cpc_precip):
    """
    Calculate the scale factor using a 5x5 spatial window.

    Parameters:
    imerg_precip (numpy.ndarray): IMERG precipitation values.
    cpc_precip (numpy.ndarray): CPC precipitation values.

    Returns:
    numpy.ndarray: Scale factors for each grid point.
    """
    scale_factors = np.full_like(cpc_precip, np.nan)
    time_steps, rows, cols = cpc_precip.shape
    for t in range(time_steps):
        for i in range(rows):
            for j in range(cols):
                # Define the window boundaries
                row_start = max(i - 2, 0)
                row_end = min(i + 3, rows)
                col_start = max(j - 2, 0)
                col_end = min(j + 3, cols)
                imerg_window = imerg_precip[t, row_start:row_end, col_start:col_end]
                cpc_window = cpc_precip[t, row_start:row_end, col_start:col_end]
                if not np.isnan(cpc_precip[t, i, j]) and np.count_nonzero(~np.isnan(imerg_window)) > 0:
                    regional_mean_imerg = np.nanmean(imerg_window)
                    if regional_mean_imerg != 0:
                        scale_factors[t, i, j] = cpc_precip[t, i, j] / regional_mean_imerg
                    else:
                        scale_factors[t, i, j] = 1.0
                else:
                    scale_factors[t, i, j] = np.nan
    return scale_factors

def apply_linear_scaling(imerg_precip, scale_factors):
    """
    Apply the scale factors to IMERG precipitation data.

    Parameters:
    imerg_precip (numpy.ndarray): IMERG precipitation values.
    scale_factors (numpy.ndarray): Scale factors for each grid point.

    Returns:
    numpy.ndarray: Spatially calibrated IMERG precipitation values.
    """
    return imerg_precip * scale_factors

def lseqm_spatial(imerg_ds, cpc_ds):
    """
    Apply the Linear Scaling and Empirical Quantile Mapping (LSEQM) method with a spatial moving window approach.

    This function performs bias correction on precipitation data by combining Linear Scaling (LS)
    and Empirical Quantile Mapping (EQM). The spatial moving window approach is used to smooth local
    variability, mitigate random errors, and capture larger-scale patterns in the data.

    Parameters:
    imerg_ds (xarray.Dataset): IMERG precipitation dataset with dimensions ('time', 'lat', 'lon').
    cpc_ds (xarray.Dataset): CPC precipitation dataset with dimensions ('time', 'lat', 'lon').

    Returns:
    xarray.Dataset: The bias-corrected precipitation dataset with the same dimensions as the input datasets.
    """
    # Sort the input datasets by the time dimension
    imerg_ds = imerg_ds.sortby('time')
    cpc_ds = cpc_ds.sortby('time')

    # Get the precipitation data from the input datasets
    imerg_precip = imerg_ds['precipitation']
    cpc_precip = cpc_ds['precip']
    print(f"IMERG precip shape: {imerg_precip.shape}")
    print(f"CPC precip shape: {cpc_precip.shape}")

    # Calculate the scale factors using a 5x5 spatial moving window
    scale_factors = calculate_scale_factor(imerg_precip.values, cpc_precip.values)
    #print("Scale factors calculated. Checking for NaN values...")
    if np.all(np.isnan(scale_factors)):
        #print("All scale factors are NaN. Something went wrong in calculate_scale_factor.")
        return xr.Dataset(data_vars={'precip': (('time', 'lat', 'lon'), np.full(imerg_precip.shape, np.nan))},
                          coords={'time': imerg_ds['time'], 'lat': imerg_ds['lat'], 'lon': imerg_ds['lon']})

    # Apply Linear Scaling (LS) to correct the mean bias
    print("Performing Linear Scaling (LS) to correct the mean bias...")
    ls_corrected_precip = apply_linear_scaling(imerg_precip.values, scale_factors)
    #print("Linear Scaling completed. Checking for NaN values in LS corrected precip...")
    if np.all(np.isnan(ls_corrected_precip)):
        #print("All LS corrected precip values are NaN. Something went wrong in apply_linear_scaling.")
        return xr.Dataset(data_vars={'precip': (('time', 'lat', 'lon'), np.full(imerg_precip.shape, np.nan))},
                          coords={'time': imerg_ds['time'], 'lat': imerg_ds['lat'], 'lon': imerg_ds['lon']})

    # Apply gamma distribution-based quantile mapping to correct the distribution
    print("Applying gamma distribution-based quantile mapping to correct the distribution...")
    corrected_precip = xr.apply_ufunc(gamma_quantile_mapping, ls_corrected_precip, cpc_precip.values,
                                      input_core_dims=[['time', 'lat', 'lon'], ['time', 'lat', 'lon']],
                                      output_core_dims=[['time', 'lat', 'lon']],
                                      output_dtypes=[imerg_precip.dtype],
                                      vectorize=True).data
    #print("Quantile mapping completed. Checking for NaN values in corrected precip...")
    if np.all(np.isnan(corrected_precip)):
        #print("All corrected precip values are NaN. Something went wrong in gamma_quantile_mapping.")
        return xr.Dataset(data_vars={'precip': (('time', 'lat', 'lon'), np.full(imerg_precip.shape, np.nan))},
                          coords={'time': imerg_ds['time'], 'lat': imerg_ds['lat'], 'lon': imerg_ds['lon']})

    # Ensure non-negative precipitation values
    corrected_precip = np.maximum(corrected_precip, 0)

    # Create a new xarray.Dataset with the bias-corrected data
    corrected_ds = xr.Dataset(data_vars={'precipitation': (('time', 'lat', 'lon'), corrected_precip)},
                              coords={'time': imerg_ds['time'],
                                      'lat': imerg_ds['lat'],
                                      'lon': imerg_ds['lon']},
                              attrs={'cdm_data_type': 'GRID',
                                     'title': 'Bias Corrected IMERG Late Precipitation L3 1 day 0.1 degree x 0.1 degree',
                                     'summary': 'Precipitation data corrected using Linear Scaling and Empirical Quantile Mapping',
                                     'source': 'IMERG and CPC-UNI',
                                     'history': f'Created on {pd.Timestamp.now()}',
                                     'DOI': '10.5067/GPM/IMERGDF/DAY/07',
                                     'creator_name': 'Benny Istanto',
                                     'creator_role': 'Climate Geographer',
                                     'creator_email': '[email protected]',
                                     'comment': 'This dataset has been bias corrected using a spatial moving window approach'})
    corrected_ds['precipitation'].attrs.update({
        'units': 'mm',
        'long_name': 'Corrected daily mean precipitation rate estimate',
        'standard_name': 'corrected_precipitation'})
    corrected_ds['lat'].attrs.update({
        'units': 'degrees_north',
        'long_name': 'Latitude'
    })
    corrected_ds['lon'].attrs.update({
        'units': 'degrees_east',
        'long_name': 'Longitude'
    })
    print("Bias correction completed for the entire dataset.")
    return corrected_ds

def process_dekad(imerg_ds, cpc_ds, dekad_start, dekad_end, land_sea_mask):
    """
    Process bias correction for a single dekad period.

    Parameters:
    imerg_ds (xarray.Dataset): IMERG precipitation dataset.
    cpc_ds (xarray.Dataset): CPC precipitation dataset.
    dekad_start (pd.Timestamp): Start date of the dekad.
    dekad_end (pd.Timestamp): End date of the dekad.
    land_sea_mask (xarray.DataArray): Land-sea mask.

    Returns:
    str: Path to the saved corrected precipitation file.
    """
    print(f"Processing dekad from {dekad_start} to {dekad_end}...")
    imerg_window = imerg_ds.sel(time=slice(dekad_start, dekad_end))
    cpc_window = cpc_ds.sel(time=slice(dekad_start, dekad_end))
    if not imerg_window.time.size or not cpc_window.time.size:
        print(f"No data available for dekad {dekad_start} to {dekad_end}. Skipping...")
        return None
    cpc_window = cpc_window.reindex_like(imerg_window, method='nearest')

    print("Starting bias correction for dekad...")
    corrected_ds = lseqm_spatial(imerg_window, cpc_window)
    print("Bias correction completed for dekad...")

    corrected_dekad = corrected_ds.sel(time=slice(dekad_start, dekad_end))
    land_sea_mask_interp = land_sea_mask.interp(lat=corrected_dekad.lat, lon=corrected_dekad.lon, method="nearest")
    corrected_dekad_masked = corrected_dekad.where(land_sea_mask_interp == 1, drop=True)

    corrected_output_file = f'{corrected_precip_path}/idn_corrected_sw_imergl_{dekad_start.strftime("%Y%m%d")}.nc'
    if os.path.exists(corrected_output_file):
        set_user_decision()
        if user_choice == 'S':
            print(f"Skipping file {corrected_output_file}")
            return None
        elif user_choice == 'A':
            print("Aborting process.")
            return

    print(f"Saving corrected precipitation for {dekad_start} to {dekad_end} to {corrected_output_file}...")
    cf18 = {'precipitation': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan}}
    corrected_dekad_masked.to_netcdf(corrected_output_file, encoding=cf18, engine='netcdf4')
    print(f"Saved corrected precipitation for {dekad_start} to {dekad_end}.")
    return corrected_output_file

def process_year(year):
    """
    Process bias correction for a single year.

    Parameters:
    year (int): Year to process.
    """
    try:
        imerg_ds = xr.open_dataset(f'{imerg_path}/idn_imergl_{year}.nc4', decode_times=True)
        cpc_ds = xr.open_dataset(f'{cpc_path}/idn_cpc_{year}.nc4', decode_times=True)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return
    mask_ds = xr.open_dataset(f'{mask_path}/idn_subset.nc')
    land_sea_mask = mask_ds['land']
    for month in range(1, 13):
        _, days_in_month = calendar.monthrange(year, month)
        dekad_dates = [pd.Timestamp(year, month, 1), pd.Timestamp(year, month, 11), pd.Timestamp(year, month, 21)]
        days_in_dekads = [10, 10, days_in_month - 20]
        if month == 2 and calendar.isleap(year):
            days_in_dekads[2] = 9
        for i, dekad_start in enumerate(dekad_dates):
            dekad_end = dekad_start + pd.Timedelta(days=days_in_dekads[i] - 1)
            process_dekad(imerg_ds, cpc_ds, dekad_start, dekad_end, land_sea_mask)

def main(start_year, end_year):
    """
    The main process to calculate:
    - bias correction
    - save corrected data to netcdf

    Returns:
    - Output available in folder output/corrected_precip
    """
    start_time = time.time()
    years = list(range(start_year, end_year + 1))
    with Pool() as pool:
        pool.map(process_year, years)
    print(f"Total processing time: {time.time() - start_time:.2f} seconds.")

if __name__ == '__main__':
    start_year = int(input("Enter the start year: "))
    end_year = int(input("Enter the end year: "))
    main(start_year, end_year)

# Import the library
import os
import numpy as np
import xarray as xr
import pandas as pd
import calendar
from scipy import optimize
from scipy.stats import gamma, genpareto
from sklearn.model_selection import KFold
import time

# Main directory on Google Drive
dir = f'/mnt/d/temp/gfm1609'

# Define the appropriate input and output directory paths
input_dir = f'{dir}/data/bc/input'
output_dir = f'{dir}/data/bc/output'
imerg_path = f'{input_dir}/imergl'
cpc_path = f'{input_dir}/cpcuni'
mask_path = f'{dir}/data/subset/iso3'
corrected_precip_path = f'{output_dir}/temporalwindow'

# List of output directories
output_directories = [output_dir, corrected_precip_path]

# Create the output directories if they don't already exist
for directory in output_directories:
    os.makedirs(directory, exist_ok=True)

# Global variable to store user's choice
user_choice = None

def set_user_decision():
    """Prompt user for decision on existing files and store it globally."""
    global user_choice
    if user_choice is None:
        decision = input("An output file already exists. Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        while decision not in ['R', 'S', 'A']:
            print("Invalid choice. Please choose again.")
            decision = input("Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        user_choice = decision

# To address the issue of capturing extreme values in satellite data while performing EQM, tail adjustment
# is used to improve the fit for extreme values. Tail adjustment with the Generalized Pareto Distribution (GPD)
# better captures the extreme values by specifically modeling the tails of the distribution, which is
# crucial for accurately representing extreme precipitation events.
def gamma_quantile_mapping(imerg_values, cpc_values):
    """
    Apply gamma distribution-based quantile mapping with improved fitting and tail adjustment to correct the distribution of precipitation data.

    This function fits gamma distributions to the IMERG and CPC precipitation values using regularization and stricter constraints
    to improve the fitting accuracy. It then computes the cumulative distribution function (CDF) of the IMERG values and applies
    the inverse CDF of the CPC values to obtain the corrected precipitation values. Additionally, it adjusts the tails using the
    Generalized Pareto Distribution (GPD) to better capture extreme values.

    Parameters:
    imerg_values (numpy.ndarray): Array of IMERG precipitation values.
    cpc_values (numpy.ndarray): Array of CPC precipitation values.

    Returns:
    numpy.ndarray: Corrected precipitation values after gamma quantile mapping with improved fitting and tail adjustment.
    """
    def moment_equations(data, params):
        """
        Define moment equations for fitting gamma distribution using moments.

        Parameters:
        data (numpy.ndarray): Array of data values.
        params (tuple): Parameters (shape, loc, scale) for the gamma distribution.

        Returns:
        tuple: Differences between the estimated and actual moments.
        """
        shape, loc, scale = params
        if abs(shape) < 1e-8:
            # Special case when shape is close to zero
            return (loc + scale * np.euler_gamma - np.mean(data),
                    scale**2 * np.pi**2 / 6 - np.var(data),
                    0)
        else:
            return (loc + scale * (1 - np.euler_gamma * shape) / shape - np.mean(data),
                    scale**2 * (1 - 2 * shape * np.euler_gamma + shape**2 * np.pi**2 / 6) / shape**2 - np.var(data),
                    np.sign(shape) * (3 * np.euler_gamma * shape - 1 - shape**3 * np.pi**2 / 6) / (1 - 2 * shape) - np.sign(np.mean((data - np.mean(data))**3)))

    def fit_gamma_with_moment_equations(data):
        """
        Fit a gamma distribution to the data using moment equations.

        Parameters:
        data (numpy.ndarray): Array of data values.

        Returns:
        tuple: Fitted parameters (shape, loc, scale) of the gamma distribution.
        """
        data = data[~np.isnan(data)]  # Remove NaN values from data
        if len(data) == 0:  # If no data left after removing NaNs, return default values
            return 1, 0, 1
        initial_guess = [0.1, np.mean(data), np.std(data)]
        lower_bounds = [0.001, np.min(data), 0.001]
        upper_bounds = [10, np.max(data), np.std(data) * 10]
        # Ensure that each lower bound is strictly less than the corresponding upper bound
        for i in range(3):
            if lower_bounds[i] >= upper_bounds[i]:
                upper_bounds[i] = lower_bounds[i] + 1e-6
        # Ensure initial guess is within bounds
        initial_guess = np.clip(initial_guess, lower_bounds, upper_bounds)
        bounds = (lower_bounds, upper_bounds)

        def objective(params):
            return moment_equations(data, params)

        try:
            result = optimize.least_squares(objective, initial_guess, bounds=bounds, max_nfev=10000)
            shape, loc, scale = result.x
        except ValueError as e:
            print(f"ValueError: {e}")
            print(f"Initial guess: {initial_guess}")
            print(f"Bounds: {bounds}")
            raise
        # Regularization and constraints
        shape = max(shape, 0.001)  # Ensure shape is positive
        scale = max(scale, 0.001)  # Ensure scale is positive
        return shape, loc, scale

    def fit_generalized_pareto_distribution(data, threshold):
        """
        Fit a Generalized Pareto Distribution (GPD) to the excesses above the threshold.

        Parameters:
        data (numpy.ndarray): Array of data values.
        threshold (float): Threshold value for defining the excesses.

        Returns:
        tuple: Fitted parameters of the GPD.
        """
        excesses = data[data > threshold] - threshold
        if len(excesses) < 10:  # Arbitrary minimum number of points for GPD fitting
            #print("Not enough excesses for GPD fitting. Using default parameters.")
            return (0, 0, 1)  # Return a default GPD with zero shape and unit scale
        params = genpareto.fit(excesses)
        return params

    def cross_validate_gpd(data, threshold, n_splits=5):
        """
        Cross-validate GPD fitting by splitting data into folds.

        Parameters:
        data (numpy.ndarray): Array of data values.
        threshold (float): Threshold value for defining the excesses.
        n_splits (int, optional): Number of cross-validation splits. Default is 5.

        Returns:
        tuple: Averaged parameters of the GPD from cross-validation.
        """
        excesses = data[data > threshold] - threshold
        if len(excesses) < n_splits:
            #print("Not enough excesses for cross-validation. Fitting GPD directly.")
            return fit_generalized_pareto_distribution(data, threshold)  # Fit GPD directly if not enough samples for cross-validation
        kf = KFold(n_splits=n_splits)
        params_list = []
        for train_index, test_index in kf.split(excesses):
            train_data, test_data = excesses[train_index], excesses[test_index]
            params = genpareto.fit(train_data)
            params_list.append(params)
        # Average the parameters from cross-validation
        shape_avg = np.mean([params[0] for params in params_list])
        loc_avg = np.mean([params[1] for params in params_list])
        scale_avg = np.mean([params[2] for params in params_list])
        return shape_avg, loc_avg, scale_avg

    # Remove NaN values from the input arrays, remembering where the valid
    # IMERG values sit so the corrected values can be placed back later
    original_shape = imerg_values.shape
    valid_mask = ~np.isnan(imerg_values)
    imerg_values = imerg_values[valid_mask]
    cpc_values = cpc_values[~np.isnan(cpc_values)]
    if imerg_values.size == 0 or cpc_values.size == 0:
        return np.full(original_shape, np.nan)

    def restore_shape(values):
        # Scatter corrected values back to their original positions; cells
        # that were NaN on input stay NaN in the output
        result = np.full(original_shape, np.nan).ravel()
        result[valid_mask.ravel()] = values
        return result.reshape(original_shape)

    # Fit gamma distributions to the IMERG and CPC values using moment equations
    #print("Fitting gamma distributions to IMERG values...")
    shape1, loc1, scale1 = fit_gamma_with_moment_equations(imerg_values)
    #print(f"IMERG Gamma Params: shape={shape1}, loc={loc1}, scale={scale1}")
    y = gamma.cdf(imerg_values, shape1, loc=loc1, scale=scale1)
    #print(f"Gamma CDF of IMERG values: min={np.nanmin(y)}, max={np.nanmax(y)}")

    #print("Fitting gamma distributions to CPC values...")
    shape2, loc2, scale2 = fit_gamma_with_moment_equations(cpc_values)
    #print(f"CPC Gamma Params: shape={shape2}, loc={loc2}, scale={scale2}")
    cpc_quantiles = gamma.ppf(y, shape2, loc=loc2, scale=scale2)
    #print(f"Gamma PPF of CPC values: min={np.nanmin(cpc_quantiles)}, max={np.nanmax(cpc_quantiles)}")

    # Ensure CPC quantiles are within realistic bounds
    cpc_quantiles = np.maximum(cpc_quantiles, 0)
    #print(f"Adjusted CPC quantiles: min={np.nanmin(cpc_quantiles)}, max={np.nanmax(cpc_quantiles)}")

    # Fit GPD to the tails of the CPC values with cross-validation
    #print("Fitting Generalized Pareto Distribution (GPD) to CPC values...")
    threshold = np.percentile(cpc_values, 95)  # Set threshold at 95th percentile
    if np.isnan(threshold):
        print("Threshold is NaN, skipping GPD fitting.")
    else:
        #print(f"GPD threshold: {threshold}")
        cpc_gpd_params = cross_validate_gpd(cpc_values, threshold)
        #print(f"GPD Params: {cpc_gpd_params}")
        # Adjust the tails using GPD if there are enough data points
        extreme_mask = imerg_values > threshold
        if np.any(extreme_mask):
            cpc_quantiles[extreme_mask] = genpareto.ppf(y[extreme_mask], *cpc_gpd_params) + threshold
            #print(f"Adjusted CPC quantiles after GPD: min={np.nanmin(cpc_quantiles)}, max={np.nanmax(cpc_quantiles)}")

    # Dynamically determine an upper cap based on the characteristics of the data
    #print("Applying dynamic cap to CPC quantiles...")
    dynamic_cap = np.percentile(cpc_values, 99.9)  # Cap at the 99.9th percentile
    #print(f"Dynamic cap: {dynamic_cap}")
    if np.isnan(dynamic_cap):
        #print("Dynamic cap is NaN, skipping dynamic capping.")
        return restore_shape(cpc_quantiles)
    cpc_quantiles = np.minimum(cpc_quantiles, dynamic_cap)

    # Ensure non-negative corrected values
    cpc_quantiles = np.maximum(cpc_quantiles, 0)

    #print("Reshaping CPC quantiles to original shape...")
    return restore_shape(cpc_quantiles)

def moving_window_mean(arr, window_size):
    """
    Calculate a moving window mean for a given array.

    The moving window approach is used to smooth local variability in precipitation data,
    mitigate random errors, and provide a robust basis for bias correction. Even when
    the IMERG and CPC datasets have different (or the same) spatial resolutions (0.1° or 0.5°),
    the moving window helps to capture larger-scale patterns and reduce noise.

    Parameters:
    arr (numpy.ndarray): The input array to be smoothed, with dimensions (time, lat, lon).
    window_size (int): The size of the moving window.

    Returns:
    numpy.ndarray: The smoothed array after applying the moving window mean.
    """
    padding = (window_size - 1) // 2
    padded_arr = np.pad(arr, ((0, 0), (padding, padding), (padding, padding)), mode='reflect')
    result = np.empty_like(arr)
    for i in range(arr.shape[0]):
        for j in range(arr.shape[1]):
            for k in range(arr.shape[2]):
                # In the padded array, the window centred on original cell (j, k)
                # spans padded indices [j, j + 2 * padding] along each spatial axis
                window = padded_arr[i, j:j + 2 * padding + 1, k:k + 2 * padding + 1]
                if np.all(np.isnan(window)):
                    result[i, j, k] = np.nan
                else:
                    result[i, j, k] = np.nanmean(window)
    return result

def lseqm_moving_window(imerg_ds, cpc_ds, window_size=5):
    """
    Apply the Linear Scaling and Empirical Quantile Mapping (LSEQM) method with a moving window approach.

    This function performs bias correction on precipitation data by combining Linear Scaling (LS)
    and Empirical Quantile Mapping (EQM). The moving window approach is used to smooth local
    variability, mitigate random errors, and capture larger-scale patterns in the data.

    Parameters:
    imerg_ds (xarray.Dataset): IMERG precipitation dataset with dimensions ('time', 'lat', 'lon').
    cpc_ds (xarray.Dataset): CPC precipitation dataset with dimensions ('time', 'lat', 'lon').
    window_size (int, optional): The size of the moving window used for smoothing. Default is 5.

    Returns:
    xarray.Dataset: The bias-corrected precipitation dataset with the same dimensions as the input datasets.

    Notes:
    - The IMERG and CPC datasets are first sorted by time.
    - The Linear Scaling (LS) method corrects the mean bias by applying a scale factor calculated
      from the ratio of moving window means of CPC and IMERG precipitation.
    - The Empirical Quantile Mapping (EQM) method is applied using gamma distribution-based quantile mapping
      to correct the distribution of the precipitation data.
    - The corrected precipitation data is stored in a new xarray.Dataset.

    Example:
    corrected_ds = lseqm_moving_window(imerg_ds, cpc_ds, window_size=5)
    """
    # Sort the input datasets by the time dimension
    imerg_ds = imerg_ds.sortby('time')
    cpc_ds = cpc_ds.sortby('time')

    # Get the precipitation data from the input datasets
    imerg_precip = imerg_ds['precipitation']
    cpc_precip = cpc_ds['precip']
    print("IMERG precip shape:", imerg_precip.shape)
    print("CPC precip shape:", cpc_precip.shape)

    # Calculate the moving window mean for each dataset
    imerg_moving_mean = moving_window_mean(imerg_precip.values, window_size)
    cpc_moving_mean = moving_window_mean(cpc_precip.values, window_size)

    # Perform Linear Scaling (LS) to correct the mean bias
    print("Perform Linear Scaling (LS) to correct the mean bias...")
    ls_scale_factor = np.divide(cpc_moving_mean, imerg_moving_mean, out=np.ones_like(cpc_moving_mean), where=imerg_moving_mean != 0)
    ls_corrected_precip = imerg_precip * ls_scale_factor

    print("Apply gamma distribution-based quantile mapping to correct the distribution...")
    corrected_precip = xr.apply_ufunc(gamma_quantile_mapping, ls_corrected_precip, cpc_precip,
                                      input_core_dims=[['time'], ['time']],
                                      output_core_dims=[['time']],
                                      output_dtypes=[imerg_precip.dtype],
                                      vectorize=True).data
    corrected_precip = np.transpose(corrected_precip, (2, 0, 1))

    # Ensure non-negative precipitation values
    corrected_precip = np.maximum(corrected_precip, 0)
    print("Corrected precip shape:", corrected_precip.shape)

    # Create a new xarray.Dataset with the bias-corrected data
    corrected_ds = xr.Dataset(data_vars={'precipitation': (('time', 'lat', 'lon'), corrected_precip)},
                              coords={'time': imerg_ds['time'],
                                      'lat': imerg_ds['lat'],
                                      'lon': imerg_ds['lon']},
                              attrs={'cdm_data_type': 'GRID',
                                     'title': 'Bias Corrected IMERG Late Precipitation L3 1 day 0.1 degree x 0.1 degree',
                                     'summary': 'Precipitation data corrected using Linear Scaling and Empirical Quantile Mapping',
                                     'source': 'IMERG and CPC-UNI',
                                     'history': f'Created on {pd.Timestamp.now()}',
                                     'DOI': '10.5067/GPM/IMERGDF/DAY/07',
                                     'creator_name': 'Benny Istanto',
                                     'creator_role': 'Climate Geographer',
                                     'creator_email': '[email protected]',
                                     'comment': 'This dataset has been bias corrected using a temporal moving window approach'})
    corrected_ds['precipitation'].attrs.update({
        'units': 'mm',
        'long_name': 'Corrected daily mean precipitation rate estimate',
        'standard_name': 'corrected_precipitation'})
    corrected_ds['lat'].attrs.update({
        'units': 'degrees_north',
        'long_name': 'Latitude'
    })
    corrected_ds['lon'].attrs.update({
        'units': 'degrees_east',
        'long_name': 'Longitude'
    })
    return corrected_ds

def main(start_year, end_year):
    """
    The main process to calculate:
    - bias correction
    - save corrected data to netcdf

    Returns:
    - Output available in folder output/corrected_precip
    """
    global user_choice

    # Start timing the entire process
    start_time = time.time()

    # Load the land-sea mask from the external NetCDF file
    mask_ds = xr.open_dataset(f'{mask_path}/idn_subset.nc')
    # Create a boolean mask from the 'land' variable
    land_sea_mask = mask_ds['land']

    # Encoding for CF 1.8 (adjust based on your precision and range requirements)
    cf18 = {'precipitation': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan}}
    # or if using int16 with scaling
    # cf18 = {'precipitation': {'dtype': 'int16', 'zlib': True, '_FillValue': -9999, 'scale_factor': 0.1}}

    # Get the year range
    for year in range(start_year, end_year + 1):
        print(f"Processing year {year}...")
        year_start_time = time.time()
        try:
            # Open the input data
            imerg_ds = xr.open_dataset(f'{imerg_path}/idn_imergl_{year}.nc4', decode_times=True)
            cpc_ds = xr.open_dataset(f'{cpc_path}/idn_cpc_{year}.nc4', decode_times=True)
        except FileNotFoundError as e:
            print(f"Error: {e}")
            continue

        # Get the month information
        for month in range(1, 13):
            _, days_in_month = calendar.monthrange(year, month)
            # Generate the start dates of each dekad for the current month
            dekad_dates = [pd.Timestamp(year, month, 1), pd.Timestamp(year, month, 11), pd.Timestamp(year, month, 21)]
            days_in_dekads = [10, 10, days_in_month - 20]
            if month == 2 and calendar.isleap(year):
                days_in_dekads[2] = 9

            # Loop through the dekads
            for i, dekad_start in enumerate(dekad_dates):
                dekad_str = dekad_start.strftime("%Y%m%d")
                dekad_end = dekad_start + pd.Timedelta(days=days_in_dekads[i] - 1)

                # Calculate the start and end dates for the moving window
                window_start = dekad_start - pd.Timedelta(days=2)
                window_end = dekad_start + pd.Timedelta(days=days_in_dekads[i] + 1)

                # Slice the data for the moving window
                imerg_window = imerg_ds.sel(time=slice(window_start, window_end))
                cpc_window = cpc_ds.sel(time=slice(window_start, window_end))
                if not imerg_window.time.size or not cpc_window.time.size:
                    print(f"Skipping dekad {dekad_str} due to empty time slice.")
                    continue

                # Reindex CPC dataset to match IMERG dataset
                cpc_window = cpc_window.reindex_like(imerg_window, method='nearest')

                # Time the entire dekad processing
                dekad_start_time = time.time()

                # Perform the bias correction using the moving window
                print("Computing bias correction...")
                corrected_ds = lseqm_moving_window(imerg_window, cpc_window)
                print("Bias correction computation completed...")

                # Slice the corrected data for the current dekad
                corrected_dekad = corrected_ds.sel(time=slice(dekad_start, dekad_end))

                # Apply the land-sea mask to the corrected dataset before saving
                land_sea_mask_interp = land_sea_mask.interp(lat=corrected_dekad.lat, lon=corrected_dekad.lon, method="nearest")
                corrected_dekad_masked = corrected_dekad.where(land_sea_mask_interp == 1, drop=True)

                # Save the corrected dekad data to a NetCDF file
                print(f'Saving corrected precipitation for {dekad_str} to a netCDF...')
                corrected_output_file = f'{corrected_precip_path}/idn_corrected_tw_imergl_{dekad_str}.nc'

                # Check if the file already exists
                if os.path.exists(corrected_output_file):
                    set_user_decision()
                    if user_choice == 'S':
                        print(f"Skipping file {corrected_output_file}")
                        continue
                    elif user_choice == 'A':
                        print("Aborting process.")
                        return

                corrected_dekad_masked.to_netcdf(corrected_output_file, encoding=cf18, engine='netcdf4')
                dekad_processing_time = time.time() - dekad_start_time
                print(f"Processed and saved corrected precipitation for {dekad_str} in {dekad_processing_time:.2f} seconds.")

        print(f"Year {year} processing completed in {time.time() - year_start_time:.2f} seconds.")
    print(f"Total processing time: {time.time() - start_time:.2f} seconds.")

if __name__ == '__main__':
    start_year = int(input("Enter the start year: "))
    end_year = int(input("Enter the end year: "))
    main(start_year, end_year)

To do list

  1. Find out why the temporal moving window takes around 10 minutes per iteration and produces outputs of around 800 KB, while the spatial moving window takes 20 minutes for the whole 22 years of data (792 iterations) and produces outputs of around 400 KB.
  2. Measure performance both for the whole coverage as one and on a grid-cell basis.
  3. Create visual output: Taylor diagram, map.
  4. Bootstrapping for Uncertainty Estimates: Bootstrapping is a statistical technique that involves repeatedly sampling from a dataset with replacement to estimate the distribution of a statistic. It can provide confidence intervals and other measures of uncertainty for your bias-corrected values.

Function for calculate_metrics

def calculate_metrics(ref_ds, corrected_ds, imerg_ds, dekad_start, dekad_end):
    """
    Calculate the following metrics for the reference (CPC), original (IMERG), and corrected (IMERG) data:
    - relative bias
    - Pearson correlation coefficient
    - root mean squared error
    - mean absolute error
    - probability of detection
    - false alarm ratio
    - critical success index

    Parameters:
    - ref_ds (xarray.Dataset): the reference dataset (CPC).
    - corrected_ds (xarray.Dataset): the corrected dataset (IMERG).
    - imerg_ds (xarray.Dataset): the original dataset (IMERG).

    Returns:
    - metrics (pandas.DataFrame): a dataframe containing the metric values.
    """
    # Add the original dataset (IMERG) to the datasets dictionary
    datasets = {'ref': ref_ds['precip'], 'corrected': corrected_ds['precip'], 'original': imerg_ds['precipitationCal']}
    results = []

    for name, data in datasets.items():
        # Calculate the relative bias
        relative_bias = (data.sum(dim='time') / ref_ds['precip'].sum(dim='time')).mean(dim=('lat', 'lon'))

        # Find common time points between the two datasets
        common_time = data.time.to_index().intersection(ref_ds['precip'].time.to_index())
        data_matched = data.sel(time=common_time)
        ref_matched = ref_ds['precip'].sel(time=common_time)

        # Flatten the data
        data_flat = data_matched.values.flatten()
        ref_flat = ref_matched.values.flatten()

        # Remove NaN values from the data
        mask = ~np.isnan(data_flat) & ~np.isnan(ref_flat)
        data_flat = data_flat[mask]
        ref_flat = ref_flat[mask]

        # Calculate the Pearson correlation coefficient
        pearson = np.corrcoef(data_flat, ref_flat)[0, 1]

        # Calculate the root mean squared error
        rmse = (((data - ref_ds['precip'])**2).mean(dim='time')**0.5).mean(dim=('lat', 'lon'))

        # Calculate the mean absolute error
        mae = (np.abs(data - ref_ds['precip'])).mean(dim='time').mean(dim=('lat', 'lon'))

        # Calculate the probability of detection
        pod = ((data > 0) & (ref_ds['precip'] > 0)).sum(dim='time') / (ref_ds['precip'] > 0).sum(dim='time')
        pod_mean = pod.mean(dim=('lat', 'lon'))

        # Calculate the false alarm ratio
        far = ((data > 0) & (ref_ds['precip'] == 0)).sum(dim='time') / (data > 0).sum(dim='time')
        far_mean = far.mean(dim=('lat', 'lon'))

        # Calculate the critical success index
        csi = pod / (pod + far)
        csi_mean = csi.mean(dim=('lat', 'lon'))

        # Calculate the standard deviation for each dataset
        std_dev = data.std(dim='time').mean(dim=('lat', 'lon'))

        # Create a dataframe to store the metric values
        metrics = pd.DataFrame({'dataset': [name],
                                'relative_bias': [relative_bias.values.item()],
                                'pearson': [pearson],
                                'rmse': [rmse.values.item()],
                                'mae': [mae.values.item()],
                                'pod': [pod_mean.values.item()],
                                'far': [far_mean.values.item()],
                                'csi': [csi_mean.values.item()],
                                'std_dev': [std_dev.values.item()]})
        results.append(metrics)

    return pd.concat(results, ignore_index=True)

Function for calculate_grid_based_metrics

def calculate_grid_based_metrics(ref_ds, corrected_ds, imerg_ds, dekad_start, dekad_end):
    """
    Calculate the following metrics for the reference (CPC), original (IMERG), and corrected (IMERG) data:
    - relative bias
    - Pearson correlation coefficient
    - root mean squared error
    - mean absolute error
    - probability of detection
    - false alarm ratio
    - critical success index

    Parameters:
    - ref_ds (xarray.Dataset): the reference dataset (CPC).
    - corrected_ds (xarray.Dataset): the corrected dataset (IMERG).
    - imerg_ds (xarray.Dataset): the original dataset (IMERG).
    - dekad_start (str): the start date of the dekad.
    - dekad_end (str): the end date of the dekad.
    Returns:
    - grid_metrics (xarray.Dataset): a dataset containing the metric values for each grid cell.
    """
    # Define a function that computes the metrics for a single time step
    def compute_metrics_for_time_step(ref_t, corrected_t):
        # Calculate the relative bias
        relative_bias = corrected_t - ref_t

        # Calculate the Pearson correlation coefficient
        pearson_r = xr.corr(corrected_t, ref_t)

        # Calculate the root mean squared error
        rmse = np.sqrt(((corrected_t - ref_t) ** 2))

        # Calculate the mean absolute error
        mae = np.abs(corrected_t - ref_t)

        # Calculate the probability of detection
        pod = ((corrected_t > 0) & (ref_t > 0)).astype(int) / ((ref_t > 0).astype(int))

        # Calculate the false alarm ratio
        far = ((corrected_t > 0) & (ref_t == 0)).astype(int) / ((corrected_t > 0).astype(int))

        # Calculate the critical success index
        csi = pod / (pod + far)

        return xr.Dataset(
            {
                'relative_bias': relative_bias,
                'pearson_r': pearson_r,
                'rmse': rmse,
                'mae': mae,
                'pod': pod,
                'far': far,
                'csi': csi,
            }
        )

    # Compute the metrics for each time step
    metrics_list = []
    days_in_dekad = len(pd.date_range(dekad_start, dekad_end))  # Calculate the number of days in the dekad
    for t in pd.date_range(dekad_start, dekad_end):
        ref_t = ref_ds['precip'].sel(time=t)
        corrected_t = corrected_ds['precip'].sel(time=t)

        metrics_t = compute_metrics_for_time_step(ref_t, corrected_t)
        metrics_list.append(metrics_t)

    # Concatenate the metrics along the time dimension
    grid_metrics = xr.concat(metrics_list, dim=pd.DatetimeIndex(pd.date_range(dekad_start, dekad_end), name='time'))

    return grid_metrics

Function for bootstrap

def bootstrap_bias_correction(imerg_values, cpc_values, n_iterations=1000):
    """
    Perform bootstrapping for uncertainty estimates in the bias correction process.

    Parameters:
    imerg_values (numpy.ndarray): Array of IMERG precipitation values.
    cpc_values (numpy.ndarray): Array of CPC precipitation values.
    n_iterations (int): Number of bootstrap iterations.

    Returns:
    tuple: Lower and upper bounds of the 95% confidence interval for the corrected values.
    """
    bootstrap_means = []
    for _ in range(n_iterations):
        imerg_resample = np.random.choice(imerg_values, size=len(imerg_values), replace=True)
        cpc_resample = np.random.choice(cpc_values, size=len(cpc_values), replace=True)
        corrected_values = gamma_quantile_mapping(imerg_resample, cpc_resample)
        bootstrap_means.append(np.mean(corrected_values))
    lower_ci = np.percentile(bootstrap_means, 2.5)
    upper_ci = np.percentile(bootstrap_means, 97.5)
    return lower_ci, upper_ci

Function for bootstrap and other metrics

import numpy as np
import xarray as xr
from sklearn.metrics import mean_absolute_error, r2_score, precision_score, recall_score

def bootstrap_bias_correction(imerg_values, cpc_values, n_iterations=1000):
    """
    Perform bootstrapping for uncertainty estimates in the bias correction process.

    Parameters:
    imerg_values (numpy.ndarray): Array of IMERG precipitation values.
    cpc_values (numpy.ndarray): Array of CPC precipitation values.
    n_iterations (int): Number of bootstrap iterations.

    Returns:
    tuple: Lower and upper bounds of the 95% confidence interval for the corrected values.
    """
    bootstrap_means = np.empty((n_iterations, imerg_values.shape[0], imerg_values.shape[1]))
    for i in range(n_iterations):
        resample_indices = np.random.choice(len(imerg_values.flatten()), size=len(imerg_values.flatten()), replace=True)
        imerg_resample = imerg_values.flatten()[resample_indices].reshape(imerg_values.shape)
        cpc_resample = cpc_values.flatten()[resample_indices].reshape(cpc_values.shape)
        corrected_values = gamma_quantile_mapping(imerg_resample, cpc_resample)
        bootstrap_means[i, :, :] = corrected_values

    lower_ci = np.percentile(bootstrap_means, 2.5, axis=0)
    upper_ci = np.percentile(bootstrap_means, 97.5, axis=0)
    return lower_ci, upper_ci

def calculate_metrics(imerg_values, cpc_values, corrected_values):
    """
    Calculate grid metrics for the bias-corrected data.

    Parameters:
    imerg_values (numpy.ndarray): Array of IMERG precipitation values.
    cpc_values (numpy.ndarray): Array of CPC precipitation values.
    corrected_values (numpy.ndarray): Array of bias-corrected precipitation values.

    Returns:
    dict: Dictionary containing calculated metrics.
    """
    # Remove NaN cells (e.g. ocean) before computing the scores
    mask = ~np.isnan(cpc_values.flatten()) & ~np.isnan(corrected_values.flatten())
    cpc_flat = cpc_values.flatten()[mask]
    corrected_flat = corrected_values.flatten()[mask]

    mae = mean_absolute_error(cpc_flat, corrected_flat)
    r2 = r2_score(cpc_flat, corrected_flat)
    # Precision and recall are classification metrics, so the continuous
    # precipitation values are binarized into rain / no-rain classes first
    precision = precision_score(cpc_flat > 0, corrected_flat > 0)
    recall = recall_score(cpc_flat > 0, corrected_flat > 0)
    
    metrics = {
        'MAE': mae,
        'R2': r2,
        'Precision': precision,
        'Recall': recall
    }
    return metrics

def process_bootstrap_and_metrics(corrected_file, cpc_file, output_file):
    """
    Process bootstrapping for uncertainty estimates and calculate grid metrics.

    Parameters:
    corrected_file (str): Path to the corrected precipitation file.
    cpc_file (str): Path to the CPC precipitation file.
    output_file (str): Path to the output file for saving bootstrapping results and metrics.
    """
    corrected_ds = xr.open_dataset(corrected_file)
    cpc_ds = xr.open_dataset(cpc_file)

    imerg_values = corrected_ds['precip'].values
    cpc_values = cpc_ds['precip'].values

    lower_ci, upper_ci = bootstrap_bias_correction(imerg_values, cpc_values)

    metrics = calculate_metrics(imerg_values, cpc_values, corrected_ds['precip'].values)

    # Save lower and upper CI to a new NetCDF file
    ds_ci = xr.Dataset(
        {
            'lower_ci': (('lat', 'lon'), lower_ci),
            'upper_ci': (('lat', 'lon'), upper_ci),
            'MAE': ((), metrics['MAE']),
            'R2': ((), metrics['R2']),
            'Precision': ((), metrics['Precision']),
            'Recall': ((), metrics['Recall'])
        },
        coords={'lat': corrected_ds['lat'], 'lon': corrected_ds['lon']}
    )
    ds_ci.to_netcdf(output_file, encoding={'lower_ci': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan},
                                           'upper_ci': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan},
                                           'MAE': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan},
                                           'R2': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan},
                                           'Precision': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan},
                                           'Recall': {'dtype': 'float32', 'zlib': True, '_FillValue': np.nan}})
    print(f'Saved uncertainty estimates and metrics to {output_file}')

# Example usage
dekad_start = pd.Timestamp(2020, 1, 1)
corrected_file = f'/path/to/corrected_precip_lseqm_{dekad_start.strftime("%Y%m%d")}.nc'
cpc_file = f'/path/to/idn_cpc_{dekad_start.year}.nc4'
output_file = f'/path/to/uncertainty_metrics_{dekad_start.strftime("%Y%m%d")}.nc'

process_bootstrap_and_metrics(corrected_file, cpc_file, output_file)