Created
January 15, 2025 14:46
-
-
Save mikkohei13/178edef9e7e9fa9829db2998f4f49159 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Join en and sv names from Syke's file to original Lely habitat file | |
# Habitat file, tab separated values | |
habitat_file = 'habitat_classification_v1.0.tsv' | |
""" | |
Example data of habitat_classification_v1.0.tsv: | |
Enum Vihkon elinympäristö 1.taso 2. taso 3. taso Swedish English | |
MY.habitatEnumValue1 Metsät Metsä Skog Forest | |
MY.habitatEnumValue2 Kangasmetsät Kangasmetsä Moskog Heath forest | |
MY.habitatEnumValue4 Lehtomainen kangas Lehtomainen kangas Lundartad mo Herb-rich heath forest | |
""" | |
# Translation file, semicoloneparated values | |
translation_file = 'translations.csv' | |
""" | |
Example data of translations.csv: | |
Enum;fi;sv;en;;; | |
MY.habitatEnumValue1;Metsät;Skog;Forest;;; | |
MY.habitatEnumValue2;Kangasmetsät;Moskog;Heath forest;;; | |
MY.habitatEnumValue3;Tuore kangas;Frisk mo;Mesic heath forest;;; | |
""" | |
# Enum file, tab separated values | |
enum_file = 'enums.tsv' | |
""" | |
Example data from schema.laji.fi; enum.tsv: | |
Enum Finnish | |
MY.habitatEnumValue1 Metsät | |
MY.habitatEnumValue2 Kangasmetsät | |
MY.habitatEnumValue3 Tuore kangas | |
""" | |
import pandas | |
# Load habitat file | |
habitat_df = pandas.read_csv(habitat_file, sep='\t', encoding='utf-8') | |
# Load translation file | |
translation_df = pandas.read_csv(translation_file, sep=';', encoding='utf-8') | |
# Load enum file | |
enum_df = pandas.read_csv(enum_file, sep='\t', encoding='utf-8') | |
# Generate a new column "fi" to habitat_df by picking value from column 1. taso, 2. taso, or 3. taso - whichever is filled in. | |
habitat_df['fi'] = habitat_df[['1. taso', '2. taso', '3. taso']].apply(lambda row: row.dropna().values[0], axis=1) | |
# Remove leading and trailing spaces from fi column | |
habitat_df['fi'] = habitat_df['fi'].str.strip() | |
# Add new columns sv and en from translation_df, using column "fi" as the key | |
habitat_df = habitat_df.merge(translation_df, on='fi', how='left') | |
# Rename columns | |
habitat_df.rename(columns={'fi': 'Finnish'}, inplace=True) | |
habitat_df.rename(columns={'Enum_x': 'Enum'}, inplace=True) | |
habitat_df.rename(columns={'sv': 'Swedish (NEW)'}, inplace=True) | |
habitat_df.rename(columns={'en': 'English (NEW)'}, inplace=True) | |
# For each row where Enum is empty, get Enum value from enum_df, using Finnish column as the key | |
for index, row in habitat_df.iterrows(): | |
if pandas.isna(row['Enum']): | |
matching_enum = enum_df[enum_df['Finnish'] == row['Finnish']]['Enum'].values | |
if len(matching_enum) > 0: | |
habitat_df.at[index, 'Enum'] = matching_enum[0] | |
else: | |
print(f"Warning: No matching Enum found for Finnish value: {row['Finnish']}") | |
# Remove columns Enum_y Unnamed: 4 Unnamed: 5 Unnamed: 6 | |
habitat_df.drop(columns=['Enum_y', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'], inplace=True) | |
# Capitalize first letter in columns Finnish, Swedish (NEW), and English (NEW) | |
habitat_df['Finnish'] = habitat_df['Finnish'].str.capitalize() | |
habitat_df['Swedish (NEW)'] = habitat_df['Swedish (NEW)'].str.capitalize() | |
habitat_df['English (NEW)'] = habitat_df['English (NEW)'].str.capitalize() | |
# Save joined_df to a new tsv file | |
habitat_df.to_csv('joined_file.tsv', sep='\t', encoding='utf-8', index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment