Skip to content

Instantly share code, notes, and snippets.

@grujdin
Last active May 5, 2025 21:12
Show Gist options
  • Save grujdin/95b1e10d852db6064f9b1c556bc2588d to your computer and use it in GitHub Desktop.
Save grujdin/95b1e10d852db6064f9b1c556bc2588d to your computer and use it in GitHub Desktop.
#--------------------- Align EM-DAT database format to GDIS database ----------------------
#
# This script processes EM-DAT disaster event data to align its format with the GDIS database.
# Key tasks include:
# - Splitting multi-location events into separate observations for each named location.
# - Normalizing and cleaning location names for consistency.
# - Expanding province-prefecture mappings when applicable.
# - Matching locations to administrative units using fuzzy matching.
# - Assigning unique serial codes to processed events.
# - Saving the cleaned and structured data into a new Excel file.
#
# ----------------------------------------------------------------------------------------
import pandas as pd # Import pandas for data handling
import unicodedata # For Unicode normalization
import re # For regular expressions
import json # For handling JSON data
from fuzzywuzzy import fuzz # For fuzzy string matching
# ----------------------- Helper Functions -----------------------
# Normalize Unicode characters in a string to their closest ASCII equivalents
def normalize_string(s):
return unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore').decode('ASCII')
# Remove predefined keywords (e.g., "province", "city") and normalize the location string
def clean_location(location):
ignore_keywords = [
'province', 'provinces', 'distrito capital', 'city', 'cities', 'district', 'districts',
'county', 'counties', 'municipalities', 'municipality', 'region', 'regions',
'department', 'state', 'village', 'airport', 'borough'
]
location = normalize_string(location.lower()) # Convert to lowercase and normalize Unicode
words = location.split() # Split into individual words
cleaned_words = [word for word in words if word not in ignore_keywords] # Remove ignored keywords
return ' '.join(cleaned_words) # Reassemble the cleaned location string
# Split a location string at commas that are not enclosed in parentheses
def parse_locations(location_string):
matches = re.finditer(r',\s*(?![^\(\)]*\))', location_string) # Match commas outside parentheses
last_pos = 0
parts = []
for match in matches:
parts.append(location_string[last_pos:match.start()]) # Extract each part before a comma
last_pos = match.end()
parts.append(location_string[last_pos:]) # Add the last part after the final comma
return parts
# Expand province-prefecture mappings where multiple provinces share a common prefecture
def expand_province_prefecture_mapping(location_entries):
expanded_locations = []
pattern = re.compile(r'(.+)\s+\((.+)\)') # Match "Province (Prefecture)" format
for entry in location_entries:
match = pattern.match(entry)
if match:
provinces = match.group(1).split(', ') # Split multiple province names
prefecture = match.group(2).strip()
expanded_locations.extend([f"{province} ({prefecture})" for province in provinces]) # Expand mapping
else:
expanded_locations.append(entry) # Keep the original if no mapping exists
return expanded_locations
# Match a given location to the best-fitting administrative unit using fuzzy string matching
def find_best_matching_admin_units_v3(location, admin_units_string):
try:
admin_units = json.loads(admin_units_string) if admin_units_string else [] # Load admin unit data from JSON
except json.JSONDecodeError:
return None # Return None if JSON decoding fails
location_cleaned = clean_location(location) # Normalize and clean location name
location_parts = location_cleaned.split() # Split into individual words
best_match = None
max_score = 0 # Track the highest similarity score
for unit in admin_units:
names_to_compare = [] # List of potential matching names
if 'adm1_name' in unit:
names_to_compare.append(unit['adm1_name']) # Add administrative level 1 name
if 'adm2_name' in unit:
names_to_compare.append(unit['adm2_name']) # Add administrative level 2 name
for name in names_to_compare:
score = fuzz.ratio(name, location_cleaned) # Compute similarity score
if score > max_score: # Update best match if a higher score is found
max_score = score
best_match = unit
return best_match # Return the best-matching administrative unit
# ----------------------- Data Processing -----------------------
# Load the Excel file containing EM-DAT disaster data
excel_file = 'D:/ProjDB/EMDAT/public_emdat_reduced.xlsx'
df = pd.read_excel(excel_file) # Read data into a Pandas DataFrame
# Prepare a list to store processed event records
rows_list = []
serial_number = 1 # Initialize unique event serial number
# Iterate through each row in the dataset
for index, row in df.iterrows():
admin_units_string = row['Admin Units'] if isinstance(row['Admin Units'], str) else "" # Extract admin units
location_entries = parse_locations(row['Location']) if pd.notna(row['Location']) else [] # Split multi-location entries
# Expand province-prefecture mappings where necessary
expanded_location_entries = expand_province_prefecture_mapping(location_entries)
# Process each parsed location separately
for loc_entry in expanded_location_entries:
match = find_best_matching_admin_units_v3(loc_entry, admin_units_string) # Find the best admin unit match
new_row = row.to_dict() # Convert row to dictionary
new_row['Location'] = loc_entry # Update location field
new_row['Admin Units'] = json.dumps([match]) if match else "[]" # Store matched admin unit(s) as JSON
new_row['Unique Code'] = f"MMR-{serial_number}" # Assign a unique serial code
serial_number += 1 # Increment serial number
rows_list.append(new_row) # Append the processed row to the list
# Convert the processed data into a new DataFrame
reshaped_data = pd.DataFrame(rows_list)
# ----------------------- Save to Excel -----------------------
# Define the path for the output Excel file
new_excel_path = 'D:/ProjDB/EMDAT/public_emdat_GDIS_aligned.xlsx'
# Create an Excel writer object using the 'xlsxwriter' engine
writer = pd.ExcelWriter(new_excel_path, engine='xlsxwriter')
reshaped_data.to_excel(writer, index=False, sheet_name='Sheet1') # Write data to the first sheet
# Apply formatting to highlight rows where admin units were not matched
workbook = writer.book
worksheet = writer.sheets['Sheet1']
red_font_format = workbook.add_format({'font_color': 'red'}) # Define red font format
# Iterate over rows to apply conditional formatting
for idx, row in reshaped_data.iterrows():
if "[]" in row['Admin Units']: # Check if no admin unit was matched
worksheet.set_row(idx + 1, None, red_font_format) # Highlight row in red
writer.close() # Finalize and save the Excel file
# Print confirmation message
print(f"Reshaped data saved to {new_excel_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment