Skip to content

Instantly share code, notes, and snippets.

@mikkohei13
Created January 15, 2025 14:46
Show Gist options
  • Save mikkohei13/178edef9e7e9fa9829db2998f4f49159 to your computer and use it in GitHub Desktop.
Save mikkohei13/178edef9e7e9fa9829db2998f4f49159 to your computer and use it in GitHub Desktop.
# Join en and sv names from Syke's file to original Lely habitat file
# Habitat file, tab separated values
habitat_file = 'habitat_classification_v1.0.tsv'
"""
Example data of habitat_classification_v1.0.tsv:
Enum Vihkon elinympäristö 1.taso 2. taso 3. taso Swedish English
MY.habitatEnumValue1 Metsät Metsä Skog Forest
MY.habitatEnumValue2 Kangasmetsät Kangasmetsä Moskog Heath forest
MY.habitatEnumValue4 Lehtomainen kangas Lehtomainen kangas Lundartad mo Herb-rich heath forest
"""
# Translation file, semicoloneparated values
translation_file = 'translations.csv'
"""
Example data of translations.csv:
Enum;fi;sv;en;;;
MY.habitatEnumValue1;Metsät;Skog;Forest;;;
MY.habitatEnumValue2;Kangasmetsät;Moskog;Heath forest;;;
MY.habitatEnumValue3;Tuore kangas;Frisk mo;Mesic heath forest;;;
"""
# Enum file, tab separated values
enum_file = 'enums.tsv'
"""
Example data from schema.laji.fi; enum.tsv:
Enum Finnish
MY.habitatEnumValue1 Metsät
MY.habitatEnumValue2 Kangasmetsät
MY.habitatEnumValue3 Tuore kangas
"""
import pandas
# Load habitat file
habitat_df = pandas.read_csv(habitat_file, sep='\t', encoding='utf-8')
# Load translation file
translation_df = pandas.read_csv(translation_file, sep=';', encoding='utf-8')
# Load enum file
enum_df = pandas.read_csv(enum_file, sep='\t', encoding='utf-8')
# Generate a new column "fi" to habitat_df by picking value from column 1. taso, 2. taso, or 3. taso - whichever is filled in.
habitat_df['fi'] = habitat_df[['1. taso', '2. taso', '3. taso']].apply(lambda row: row.dropna().values[0], axis=1)
# Remove leading and trailing spaces from fi column
habitat_df['fi'] = habitat_df['fi'].str.strip()
# Add new columns sv and en from translation_df, using column "fi" as the key
habitat_df = habitat_df.merge(translation_df, on='fi', how='left')
# Rename columns
habitat_df.rename(columns={'fi': 'Finnish'}, inplace=True)
habitat_df.rename(columns={'Enum_x': 'Enum'}, inplace=True)
habitat_df.rename(columns={'sv': 'Swedish (NEW)'}, inplace=True)
habitat_df.rename(columns={'en': 'English (NEW)'}, inplace=True)
# For each row where Enum is empty, get Enum value from enum_df, using Finnish column as the key
for index, row in habitat_df.iterrows():
if pandas.isna(row['Enum']):
matching_enum = enum_df[enum_df['Finnish'] == row['Finnish']]['Enum'].values
if len(matching_enum) > 0:
habitat_df.at[index, 'Enum'] = matching_enum[0]
else:
print(f"Warning: No matching Enum found for Finnish value: {row['Finnish']}")
# Remove columns Enum_y Unnamed: 4 Unnamed: 5 Unnamed: 6
habitat_df.drop(columns=['Enum_y', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'], inplace=True)
# Capitalize first letter in columns Finnish, Swedish (NEW), and English (NEW)
habitat_df['Finnish'] = habitat_df['Finnish'].str.capitalize()
habitat_df['Swedish (NEW)'] = habitat_df['Swedish (NEW)'].str.capitalize()
habitat_df['English (NEW)'] = habitat_df['English (NEW)'].str.capitalize()
# Save joined_df to a new tsv file
habitat_df.to_csv('joined_file.tsv', sep='\t', encoding='utf-8', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment