bennyistanto · January 6, 2024 06:23
diff --git a/geotiffs_to_netcdf.ipynb b/geotiffs_to_netcdf.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5044b463",
   "metadata": {},
   "source": [
    "# GeoTIFFs to single NetCDF"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e045a097",
   "metadata": {},
   "source": [
    "Convert raster GeoTIFF in a folder to single NetCDF file with time dimension enabled that is CF-Compliant\n",
    "http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html\n",
    " \n",
    "Based on Rich Signell's answer on StackExchange: https://gis.stackexchange.com/a/70487\n",
    "\n",
    "This script was tested using ERA5-Land daily data. Adjustment is needed if using other timesteps data. </br> \n",
    "NCO (http://nco.sourceforge.net), GDAL, NetCDF4, numpy must be installed before using this script.\n",
    " \n",
    "Modified by\n",
    "\n",
    "**Benny Istanto**</br>\n",
    "* UN World Food Programme, [email protected] [1]</br>\n",
    "* GOST/DECAT/DECDG, The World Bank, [email protected] [2]</br>\n",
    "\n",
    "Log:</br>\n",
    "* [1] 2020-07-05 First draft</br>\n",
    "* [2] 2023-07-15 Prevent the output shifted half-pixel</br>\n",
    "* [2] 2024-01-04 Handle daily data from multi year period, grouped by year, process each year</br>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "540dfd4c",
   "metadata": {},
   "source": [
    "### Import Libraries\n",
    "\n",
    "Place all the required imports in the first cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05fa7fbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import datetime as dt\n",
    "import os\n",
    "from osgeo import gdal\n",
    "gdal.UseExceptions()\n",
    "import netCDF4\n",
    "import re\n",
    "from collections import defaultdict\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43784cc0",
   "metadata": {},
   "source": [
    "### Define the Function\n",
    "\n",
    "Define the `create_netcdf_for_year` function in the second cell. This function will be responsible for creating the NetCDF file for each year."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7acb303",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_netcdf_for_year(year, file_list):\n",
    "    # Use the first file to setup dimensions\n",
    "    first_file_full_path = os.path.join('X:\\\\Temp\\\\era5land\\\\tmax_daily\\\\geotiff', file_list[0])\n",
    "    ds = gdal.Open(first_file_full_path)\n",
    "    a = ds.ReadAsArray()\n",
    "    nlat, nlon = np.shape(a)\n",
    "\n",
    "    # creating arrays of the longitude (lon) and latitude (lat) coordinates for each pixel in the dataset. \n",
    "    b = ds.GetGeoTransform() # bbox, interval\n",
    "    lon = np.arange(nlon) * b[1] + b[0] + (b[1] / 2) # add half the x pixel size to the lon\n",
    "    lat = np.arange(nlat) * b[5] + b[3] + (b[5] / 2) # add half the y pixel size to the lat\n",
    "    lat = np.flipud(lat) # flip the latitudes\n",
    "\n",
    "    basedate = dt.datetime(1981, 1, 1, 0, 0, 0)\n",
    "\n",
    "    # Updated file path for saving output\n",
    "    output_folder = 'X:\\\\Temp\\\\era5land\\\\tmax_daily\\\\nc' # Linux: /mnt/x/Temp/era5land/tmax_daily/nc\n",
    "    if not os.path.exists(output_folder):\n",
    "        os.makedirs(output_folder)\n",
    "    nco = netCDF4.Dataset(os.path.join(output_folder, f'wld_cli_era5land_daily_tmax_{year}.nc'), 'w', clobber=True)\n",
    "    \n",
    "    # chunking is optional, but can improve access a lot: \n",
    "    # (see: http://www.unidata.ucar.edu/blogs/developer/entry/chunking_data_choosing_shapes)\n",
    "    chunk_lon=10\n",
    "    chunk_lat=10\n",
    "    chunk_time=12\n",
    "\n",
    "    # create dimensions, variables and attributes:\n",
    "    nco.createDimension('lon',nlon)\n",
    "    nco.createDimension('lat',nlat)\n",
    "    nco.createDimension('time',None)\n",
    "\n",
    "    timeo = nco.createVariable('time','f4',('time'))\n",
    "    timeo.units = 'days since 1981-1-1 00:00:00'\n",
    "    timeo.standard_name = 'time'\n",
    "    timeo.calendar = 'gregorian'\n",
    "    timeo.axis = 'T'\n",
    "\n",
    "    lono = nco.createVariable('lon','f4',('lon'))\n",
    "    lono.units = 'degrees_east'\n",
    "    lono.standard_name = 'longitude'\n",
    "    lono.long_name = 'longitude'\n",
    "    lono.axis = 'X'\n",
    "\n",
    "    lato = nco.createVariable('lat','f4',('lat'))\n",
    "    lato.units = 'degrees_north'\n",
    "    lato.standard_name = 'latitude'\n",
    "    lato.long_name = 'latitude'\n",
    "    lato.axis = 'Y'\n",
    "\n",
    "    # create container variable for CRS: lon/lat WGS84 datum\n",
    "    crso = nco.createVariable('crs','i4')\n",
    "    crso.long_name = 'Lon/Lat Coords in WGS84'\n",
    "    crso.grid_mapping_name='latitude_longitude'\n",
    "    crso.longitude_of_prime_meridian = 0.0\n",
    "    crso.semi_major_axis = 6378137.0\n",
    "    crso.inverse_flattening = 298.257223563\n",
    "\n",
    "    # create short integer variable for Temperature data, with chunking\n",
    "    t2mo = nco.createVariable('tmax', 'f4',  ('time', 'lat', 'lon'),\n",
    "        zlib=True,chunksizes=[chunk_time,chunk_lat,chunk_lon],fill_value=-9999)\n",
    "    t2mo.units = 'degree_Celsius'\n",
    "    t2mo.description = 'Temperature of air at 2m above the surface of land, sea or in-land waters.'\n",
    "    t2mo.standard_name = 'maximum_air_temperature'\n",
    "    t2mo.long_name = 'maximum air temperature'\n",
    "    t2mo.time_step = 'daily'\n",
    "    t2mo.missing_value = -9999\n",
    "    t2mo.geospatial_lat_min = -90.0\n",
    "    t2mo.geospatial_lat_max = 90.0\n",
    "    t2mo.geospatial_lon_min = -180.0\n",
    "    t2mo.geospatial_lon_max = 180.0\n",
    "    t2mo.grid_mapping = 'crs'\n",
    "    t2mo.set_auto_maskandscale(False)\n",
    "\n",
    "    nco.Conventions='CF-1.6'\n",
    "    nco.cdm_data_type = \"GRID\"\n",
    "    nco.title = \"Global daily maximum air temperature from ERA5-Land\"\n",
    "    nco.summary = \"This data originally derived from ERA5-Land Hourly data, extracted into daily aggregate using Google Earth Engine then exported as GeoTIFF file\"\n",
    "    nco.references = \"Muñoz Sabater, J. (2019): ERA5-Land hourly data from 1950 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS). DOI: https://doi.org/10.24381/cds.e2161bac\"\n",
    "    nco.history = \"created by GOST - The World Bank\"\n",
    "    nco.version = \"Version 2023\"\n",
    "    nco.comments = \"time variable denotes the first day of the given day\"\n",
    "    nco.website = \"https://worldbank.org\"\n",
    "    nco.date_created = \"2023-02-26\"\n",
    "    nco.creator_name = \"Benny Istanto\"\n",
    "    nco.creator_role = \"Climate Geographer\"\n",
    "    nco.creator_email = \"[email protected]\"\n",
    "    nco.institution = \"The World Bank\"\n",
    "    nco.note = \"The data is developed to support The World Bank to assess extreme temperature, hot days and heat as part Global Monitoring\"\n",
    "\n",
    "    #write lon,lat\n",
    "    lono[:]=lon\n",
    "    lato[:]=lat\n",
    "\n",
    "    pat = re.compile(r'wld_cli_era5land_tmax_([0-9]{8})\\.tif$')  # raw string for regular expression\n",
    "\n",
    "    itime = 0\n",
    "\n",
    "    for f in file_list:\n",
    "        match = re.match(pat, f)\n",
    "        if match:\n",
    "            # Construct the full path for each file\n",
    "            full_file_path = os.path.join(r'X:\\Temp\\era5land\\tmax_daily\\geotiff', f)\n",
    "            \n",
    "            date_str = match.group(1)\n",
    "            date = dt.datetime.strptime(date_str, '%Y%m%d') # Convert to datetime\n",
    "            print(date)\n",
    "            dtime = (date - basedate).total_seconds() / 86400.\n",
    "            timeo[itime] = dtime\n",
    "            \n",
    "            t2m = gdal.Open(full_file_path)\n",
    "            a = t2m.ReadAsArray()\n",
    "            t2mo[itime, :, :] = np.flipud(a) # flip the data in y-direction\n",
    "            itime += 1\n",
    "\n",
    "    nco.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3bab239d",
   "metadata": {},
   "source": [
    "### Group Files by Year and Process Each Year\n",
    "\n",
    "This cell will group the files by year and then call the function defined above for each group."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "014e683a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Group files by year\n",
    "files_by_year = defaultdict(list)\n",
    "for root, dirs, files in os.walk('X:\\\\Temp\\\\era5land\\\\tmax_daily\\\\geotiff'): # Linux: /mnt/x/Temp/era5land/tmax_daily/geotiff/\n",
    "    for f in files:\n",
    "        if f.endswith('.tif'):\n",
    "            match = re.match(r'wld_cli_era5land_tmax_([0-9]{4})[0-9]{4}\\.tif', f)\n",
    "            if match:\n",
    "                year = match.group(1)\n",
    "                files_by_year[year].append(f)\n",
    "\n",
    "# Process each year\n",
    "for year, file_list in files_by_year.items():\n",
    "    file_list.sort()  # Sort files before processing\n",
    "    create_netcdf_for_year(year, file_list)\n",
    "    print(f'Completed year {year}!')\n",
    "\n",
    "print('All years completed!')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6b20e99",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "5044b463",
	"metadata": {},
	"source": [
	"# GeoTIFFs to single NetCDF"
	]
	},
	{
	"cell_type": "markdown",
	"id": "e045a097",
	"metadata": {},
	"source": [
	"Convert raster GeoTIFF in a folder to single NetCDF file with time dimension enabled that is CF-Compliant\n",
	"http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html\n",
	" \n",
	"Based on Rich Signell's answer on StackExchange: https://gis.stackexchange.com/a/70487\n",
	"\n",
	"This script was tested using ERA5-Land daily data. Adjustment is needed if using other timesteps data. </br> \n",
	"NCO (http://nco.sourceforge.net), GDAL, NetCDF4, numpy must be installed before using this script.\n",
	" \n",
	"Modified by\n",
	"\n",
	"Benny Istanto</br>\n",
	"* UN World Food Programme, [email protected] [1]</br>\n",
	"* GOST/DECAT/DECDG, The World Bank, [email protected] [2]</br>\n",
	"\n",
	"Log:</br>\n",
	"* [1] 2020-07-05 First draft</br>\n",
	"* [2] 2023-07-15 Prevent the output shifted half-pixel</br>\n",
	"* [2] 2024-01-04 Handle daily data from multi year period, grouped by year, process each year</br>"
	]
	},
	{
	"cell_type": "markdown",
	"id": "540dfd4c",
	"metadata": {},
	"source": [
	"### Import Libraries\n",
	"\n",
	"Place all the required imports in the first cell."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "05fa7fbc",
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import datetime as dt\n",
	"import os\n",
	"from osgeo import gdal\n",
	"gdal.UseExceptions()\n",
	"import netCDF4\n",
	"import re\n",
	"from collections import defaultdict\n"
	]
	},
	{
	"cell_type": "markdown",
	"id": "43784cc0",
	"metadata": {},
	"source": [
	"### Define the Function\n",
	"\n",
	"Define the `create_netcdf_for_year` function in the second cell. This function will be responsible for creating the NetCDF file for each year."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d7acb303",
	"metadata": {},
	"outputs": [],
	"source": [
	"def create_netcdf_for_year(year, file_list):\n",
	" # Use the first file to setup dimensions\n",
	" first_file_full_path = os.path.join('X:\\\\Temp\\\\era5land\\\\tmax_daily\\\\geotiff', file_list[0])\n",
	" ds = gdal.Open(first_file_full_path)\n",
	" a = ds.ReadAsArray()\n",
	" nlat, nlon = np.shape(a)\n",
	"\n",
	" # creating arrays of the longitude (lon) and latitude (lat) coordinates for each pixel in the dataset. \n",
	" b = ds.GetGeoTransform() # bbox, interval\n",
	" lon = np.arange(nlon) * b[1] + b[0] + (b[1] / 2) # add half the x pixel size to the lon\n",
	" lat = np.arange(nlat) * b[5] + b[3] + (b[5] / 2) # add half the y pixel size to the lat\n",
	" lat = np.flipud(lat) # flip the latitudes\n",
	"\n",
	" basedate = dt.datetime(1981, 1, 1, 0, 0, 0)\n",
	"\n",
	" # Updated file path for saving output\n",
	" output_folder = 'X:\\\\Temp\\\\era5land\\\\tmax_daily\\\\nc' # Linux: /mnt/x/Temp/era5land/tmax_daily/nc\n",
	" if not os.path.exists(output_folder):\n",
	" os.makedirs(output_folder)\n",
	" nco = netCDF4.Dataset(os.path.join(output_folder, f'wld_cli_era5land_daily_tmax_{year}.nc'), 'w', clobber=True)\n",
	" \n",
	" # chunking is optional, but can improve access a lot: \n",
	" # (see: http://www.unidata.ucar.edu/blogs/developer/entry/chunking_data_choosing_shapes)\n",
	" chunk_lon=10\n",
	" chunk_lat=10\n",
	" chunk_time=12\n",
	"\n",
	" # create dimensions, variables and attributes:\n",
	" nco.createDimension('lon',nlon)\n",
	" nco.createDimension('lat',nlat)\n",
	" nco.createDimension('time',None)\n",
	"\n",
	" timeo = nco.createVariable('time','f4',('time'))\n",
	" timeo.units = 'days since 1981-1-1 00:00:00'\n",
	" timeo.standard_name = 'time'\n",
	" timeo.calendar = 'gregorian'\n",
	" timeo.axis = 'T'\n",
	"\n",
	" lono = nco.createVariable('lon','f4',('lon'))\n",
	" lono.units = 'degrees_east'\n",
	" lono.standard_name = 'longitude'\n",
	" lono.long_name = 'longitude'\n",
	" lono.axis = 'X'\n",
	"\n",
	" lato = nco.createVariable('lat','f4',('lat'))\n",
	" lato.units = 'degrees_north'\n",
	" lato.standard_name = 'latitude'\n",
	" lato.long_name = 'latitude'\n",
	" lato.axis = 'Y'\n",
	"\n",
	" # create container variable for CRS: lon/lat WGS84 datum\n",
	" crso = nco.createVariable('crs','i4')\n",
	" crso.long_name = 'Lon/Lat Coords in WGS84'\n",
	" crso.grid_mapping_name='latitude_longitude'\n",
	" crso.longitude_of_prime_meridian = 0.0\n",
	" crso.semi_major_axis = 6378137.0\n",
	" crso.inverse_flattening = 298.257223563\n",
	"\n",
	" # create short integer variable for Temperature data, with chunking\n",
	" t2mo = nco.createVariable('tmax', 'f4', ('time', 'lat', 'lon'),\n",
	" zlib=True,chunksizes=[chunk_time,chunk_lat,chunk_lon],fill_value=-9999)\n",
	" t2mo.units = 'degree_Celsius'\n",
	" t2mo.description = 'Temperature of air at 2m above the surface of land, sea or in-land waters.'\n",
	" t2mo.standard_name = 'maximum_air_temperature'\n",
	" t2mo.long_name = 'maximum air temperature'\n",
	" t2mo.time_step = 'daily'\n",
	" t2mo.missing_value = -9999\n",
	" t2mo.geospatial_lat_min = -90.0\n",
	" t2mo.geospatial_lat_max = 90.0\n",
	" t2mo.geospatial_lon_min = -180.0\n",
	" t2mo.geospatial_lon_max = 180.0\n",
	" t2mo.grid_mapping = 'crs'\n",
	" t2mo.set_auto_maskandscale(False)\n",
	"\n",
	" nco.Conventions='CF-1.6'\n",
	" nco.cdm_data_type = \"GRID\"\n",
	" nco.title = \"Global daily maximum air temperature from ERA5-Land\"\n",
	" nco.summary = \"This data originally derived from ERA5-Land Hourly data, extracted into daily aggregate using Google Earth Engine then exported as GeoTIFF file\"\n",
	" nco.references = \"Muñoz Sabater, J. (2019): ERA5-Land hourly data from 1950 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS). DOI: https://doi.org/10.24381/cds.e2161bac\"\n",
	" nco.history = \"created by GOST - The World Bank\"\n",
	" nco.version = \"Version 2023\"\n",
	" nco.comments = \"time variable denotes the first day of the given day\"\n",
	" nco.website = \"https://worldbank.org\"\n",
	" nco.date_created = \"2023-02-26\"\n",
	" nco.creator_name = \"Benny Istanto\"\n",
	" nco.creator_role = \"Climate Geographer\"\n",
	" nco.creator_email = \"[email protected]\"\n",
	" nco.institution = \"The World Bank\"\n",
	" nco.note = \"The data is developed to support The World Bank to assess extreme temperature, hot days and heat as part Global Monitoring\"\n",
	"\n",
	" #write lon,lat\n",
	" lono[:]=lon\n",
	" lato[:]=lat\n",
	"\n",
	" pat = re.compile(r'wld_cli_era5land_tmax_([0-9]{8})\\.tif$') # raw string for regular expression\n",
	"\n",
	" itime = 0\n",
	"\n",
	" for f in file_list:\n",
	" match = re.match(pat, f)\n",
	" if match:\n",
	" # Construct the full path for each file\n",
	" full_file_path = os.path.join(r'X:\\Temp\\era5land\\tmax_daily\\geotiff', f)\n",
	" \n",
	" date_str = match.group(1)\n",
	" date = dt.datetime.strptime(date_str, '%Y%m%d') # Convert to datetime\n",
	" print(date)\n",
	" dtime = (date - basedate).total_seconds() / 86400.\n",
	" timeo[itime] = dtime\n",
	" \n",
	" t2m = gdal.Open(full_file_path)\n",
	" a = t2m.ReadAsArray()\n",
	" t2mo[itime, :, :] = np.flipud(a) # flip the data in y-direction\n",
	" itime += 1\n",
	"\n",
	" nco.close()\n"
	]
	},
	{
	"cell_type": "markdown",
	"id": "3bab239d",
	"metadata": {},
	"source": [
	"### Group Files by Year and Process Each Year\n",
	"\n",
	"This cell will group the files by year and then call the function defined above for each group."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "014e683a",
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"# Group files by year\n",
	"files_by_year = defaultdict(list)\n",
	"for root, dirs, files in os.walk('X:\\\\Temp\\\\era5land\\\\tmax_daily\\\\geotiff'): # Linux: /mnt/x/Temp/era5land/tmax_daily/geotiff/\n",
	" for f in files:\n",
	" if f.endswith('.tif'):\n",
	" match = re.match(r'wld_cli_era5land_tmax_([0-9]{4})[0-9]{4}\\.tif', f)\n",
	" if match:\n",
	" year = match.group(1)\n",
	" files_by_year[year].append(f)\n",
	"\n",
	"# Process each year\n",
	"for year, file_list in files_by_year.items():\n",
	" file_list.sort() # Sort files before processing\n",
	" create_netcdf_for_year(year, file_list)\n",
	" print(f'Completed year {year}!')\n",
	"\n",
	"print('All years completed!')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b6b20e99",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}