Join data attributes, based on a CSV's key field, with the DBF output from ArcGIS Zonal Statistics
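The notebook below reads each DBF table produced by the zonal-statistics run with simpledbf, renames the chosen statistic column (MIN, MAX, or MEAN) to the date parsed from the filename, and joins everything to the admin-boundary CSV on its key field. A minimal sketch of that core join, assuming a hypothetical DBF/CSV pair keyed on ADM3_PCODE (file names here are placeholders, not the actual data):

import pandas as pd
from simpledbf import Dbf5

# Hypothetical inputs; paths are placeholders
base_df = pd.read_csv('adm3_join.csv', sep=';')            # boundary attributes keyed by ADM3_PCODE
zonal_df = Dbf5('evi_zonal_20230101.dbf').to_dataframe()   # one zonal-statistics table

# Keep the key field plus one statistic, renamed to its acquisition date
zonal_df = zonal_df[['ADM3_PCODE', 'MEAN']].rename(columns={'MEAN': '20230101'})

# Left join so every admin unit in the base CSV is kept
joined = pd.merge(base_df, zonal_df, on='ADM3_PCODE', how='left')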
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "cd0c6a56",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing files for MIN: 100%|████████████████████████████████████| 1000/1000 [00:49<00:00, 20.06it/s]\n",
      "Processing files for MAX: 100%|████████████████████████████████████| 1000/1000 [00:46<00:00, 21.45it/s]\n",
      "Processing files for MEAN: 100%|███████████████████████████████████| 1000/1000 [00:43<00:00, 23.23it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from simpledbf import Dbf5\n",
    "from tqdm import tqdm\n",
    "\n",
    "# ISO3 country code\n",
    "iso3 = \"syr\"  # syr, tur, lbn, mmr\n",
    "\n",
    "# Directory containing the DBF files from the zonal-statistics run\n",
    "dbf_dir = f'/mnt/x/Temp/modis/{iso3}/gee/13_tables/zonal/evi_all/orig'\n",
    "\n",
    "# Path to the base CSV file with the admin-3 attributes\n",
    "csv_file = f'/mnt/x/Temp/modis/{iso3}/bnd/{iso3}_adm3_join.csv'\n",
    "\n",
    "# Load the base CSV file into a DataFrame\n",
    "base_csv_df = pd.read_csv(csv_file, sep=';')\n",
    "\n",
    "# Get a list of all DBF files in the directory\n",
    "dbf_files = [f for f in os.listdir(dbf_dir) if f.endswith('.dbf')]\n",
    "\n",
    "def process_files(column_name):\n",
    "    # Create an empty DataFrame to store the merged data\n",
    "    merged_df = pd.DataFrame()\n",
    "\n",
    "    # Iterate over each DBF file in the directory\n",
    "    for filename in tqdm(dbf_files, desc=f\"Processing files for {column_name}\"):\n",
    "        # Extract the date from the filename\n",
    "        date = filename.rsplit('_', 1)[-1].rsplit('.', 1)[0]\n",
    "\n",
    "        # Load the DBF file into a DataFrame\n",
    "        dbf = Dbf5(os.path.join(dbf_dir, filename))\n",
    "        df = dbf.to_dataframe()\n",
    "\n",
    "        # Rename the statistic column to the extracted date\n",
    "        df = df.rename(columns={column_name: date})\n",
    "\n",
    "        # Keep only the key field and the renamed date column\n",
    "        df = df[['ADM3_PCODE', date]]\n",
    "\n",
    "        # If the merged DataFrame is empty, copy the current DataFrame to it\n",
    "        if merged_df.empty:\n",
    "            merged_df = df.copy()\n",
    "        else:\n",
    "            # Otherwise, merge the current DataFrame with the merged DataFrame\n",
    "            merged_df = pd.merge(merged_df, df, on='ADM3_PCODE', how='outer')\n",
    "\n",
    "    # Join the merged DataFrame with the base CSV on the 'ADM3_PCODE' key field;\n",
    "    # a left join keeps every record from the base CSV\n",
    "    final_df = pd.merge(base_csv_df, merged_df, on='ADM3_PCODE', how='left')\n",
    "\n",
    "    # Replace NaN values with an empty string\n",
    "    final_df.fillna('', inplace=True)\n",
    "\n",
    "    # Save the final DataFrame to a CSV file\n",
    "    final_df.to_csv(f'/mnt/x/Temp/modis/{iso3}/gee/13_tables/zonal/csv/8day/{iso3}_phy_evi_all_{column_name}_mxd13q1_adm3.csv', index=False)\n",
    "\n",
    "# Call the function for each of the three statistics\n",
    "process_files('MIN')\n",
    "process_files('MAX')\n",
    "process_files('MEAN')\n",
    "\n",
    "print('Completed')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "8dca9846",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing files for MIN: 100%|████████████████████████████████████| 1000/1000 [00:46<00:00, 21.72it/s]\n",
      "Processing files for MAX: 100%|████████████████████████████████████| 1000/1000 [00:47<00:00, 20.89it/s]\n",
      "Processing files for MEAN: 100%|███████████████████████████████████| 1000/1000 [00:51<00:00, 19.47it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from simpledbf import Dbf5\n",
    "from tqdm import tqdm\n",
    "\n",
    "# ISO3 country code\n",
    "iso3 = \"syr\"  # syr, tur, lbn, mmr\n",
    "\n",
    "# Directory containing the DBF files from the zonal-statistics run\n",
    "dbf_dir = f'/mnt/x/Temp/modis/{iso3}/gee/13_tables/zonal/evi_cropland/orig'\n",
    "\n",
    "# Path to the base CSV file with the admin-3 attributes\n",
    "csv_file = f'/mnt/x/Temp/modis/{iso3}/bnd/{iso3}_adm3_join.csv'\n",
    "\n",
    "# Load the base CSV file into a DataFrame\n",
    "base_csv_df = pd.read_csv(csv_file, sep=';')\n",
    "\n",
    "# Get a list of all DBF files in the directory\n",
    "dbf_files = [f for f in os.listdir(dbf_dir) if f.endswith('.dbf')]\n",
    "\n",
    "def process_files(column_name):\n",
    "    # Create an empty DataFrame to store the merged data\n",
    "    merged_df = pd.DataFrame()\n",
    "\n",
    "    # Iterate over each DBF file in the directory\n",
    "    for filename in tqdm(dbf_files, desc=f\"Processing files for {column_name}\"):\n",
    "        # Extract the date from the filename\n",
    "        date = filename.rsplit('_', 1)[-1].rsplit('.', 1)[0]\n",
    "\n",
    "        # Load the DBF file into a DataFrame\n",
    "        dbf = Dbf5(os.path.join(dbf_dir, filename))\n",
    "        df = dbf.to_dataframe()\n",
    "\n",
    "        # Rename the statistic column to the extracted date\n",
    "        df = df.rename(columns={column_name: date})\n",
    "\n",
    "        # Keep only the key field and the renamed date column\n",
    "        df = df[['ADM3_PCODE', date]]\n",
    "\n",
    "        # If the merged DataFrame is empty, copy the current DataFrame to it\n",
    "        if merged_df.empty:\n",
    "            merged_df = df.copy()\n",
    "        else:\n",
    "            # Otherwise, merge the current DataFrame with the merged DataFrame\n",
    "            merged_df = pd.merge(merged_df, df, on='ADM3_PCODE', how='outer')\n",
    "\n",
    "    # Join the merged DataFrame with the base CSV on the 'ADM3_PCODE' key field;\n",
    "    # a left join keeps every record from the base CSV\n",
    "    final_df = pd.merge(base_csv_df, merged_df, on='ADM3_PCODE', how='left')\n",
    "\n",
    "    # Replace NaN values with an empty string\n",
    "    final_df.fillna('', inplace=True)\n",
    "\n",
    "    # Save the final DataFrame to a CSV file\n",
    "    final_df.to_csv(f'/mnt/x/Temp/modis/{iso3}/gee/13_tables/zonal/csv/8day/{iso3}_phy_evi_cropland_{column_name}_mxd13q1_adm3.csv', index=False)\n",
    "\n",
    "# Call the function for each of the three statistics\n",
    "process_files('MIN')\n",
    "process_files('MAX')\n",
    "process_files('MEAN')\n",
    "\n",
    "print('Completed')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1106579",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
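Each exported CSV is wide: the attribute columns from the base CSV followed by one column per 8-day date. If a long (tidy) layout is easier to work with downstream, a melt over the date columns is one option; the file name and the id_cols list below are assumptions and should be adjusted to the actual output:

import pandas as pd

# Hypothetical example: one of the wide tables written by the notebook above (path is a placeholder)
wide = pd.read_csv('syr_phy_evi_all_MEAN_mxd13q1_adm3.csv')

# Columns copied from the base CSV identify each admin unit; everything else is one value per date.
id_cols = ['ADM3_PCODE']  # extend with the other attribute columns from the base CSV
long = wide.melt(id_vars=id_cols,
                 value_vars=[c for c in wide.columns if c not in id_cols],
                 var_name='date', value_name='evi_mean')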