Last active
January 7, 2025 13:55
-
-
Save mikkohei13/bad3c66f1cb86613a49bce6c3ad0c032 to your computer and use it in GitHub Desktop.
Convert Luke atlas dataset 2024 to FinBIF Data Bank format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Converts "Luken aineistokooste 4. lintuatlakseen" tsv file into FinBIF Data Bank secondary data format. | |
Mikko Heikkinen 2023-12-29, updated 2025-01-07 | |
''' | |
import pandas as pd | |
# Save file from Excel as UTF-8 CSV | |
# Load the file into a Pandas dataframe using tab as the delimiter. Keep "NA" as a value. | |
file_path = 'Luke_lintuatlasdata_2023-2024.csv' | |
df = pd.read_csv(file_path, delimiter=';', na_values=[], keep_default_na=False) | |
row_count = len(df) | |
print(f"Data loaded: {row_count} rows") | |
# List columns | |
print(df.columns) | |
# ADD IDENTIFIER | |
# NOTE: this format must not be changed, otherwise the ID will not be persistent if dataset is updated. | |
# Fix incorrect delimiters | |
df['Ruutu'] = df['Ruutu'].str.replace('.', ':') | |
# Fix incorrect species codes, BEFORE id has been generated | |
df['Laji'] = df['Laji'].str.replace('Trihyp', 'Acthyp') | |
# Add a new "ID" column by concatenating "Ruutu", "Laji" and "vuosi" with a dash as a separator. This should be persistent if dataset is updated. | |
df['ID'] = df['Ruutu'].astype(str) + '-' + df['Laji'] + '-' + df['vuosi'].astype(str) | |
df['ID'] = df['ID'].str.replace(':', '-') # Replaces coordinate serparator | |
df['ID'] = df['ID'].str.lower() # Species codes to lowercase | |
# Fix ambiguous species codes, AFTER id has been generated | |
df['Laji'] = df['Laji'].str.replace('TRIGLA', 'liro') | |
df['Laji'] = df['Laji'].str.replace('Phylus', 'pajulintu') | |
df['Laji'] = df['Laji'].str.replace('Tetrix', 'teeri') | |
df['Laji'] = df['Laji'].str.replace('TETRIX', 'teeri') | |
# DATE CONVERSIONS | |
# Duplicate the 'vuosi' column, since all data are from one year only | |
df['Loppu@yyyy - Yleinen keruutapahtuma'] = df['vuosi'] | |
# Fill in begin and end dates | |
df['Alku@dd - Yleinen keruutapahtuma'] = 1 | |
df['Alku@mm - Yleinen keruutapahtuma'] = 1 | |
df['Loppu@dd - Yleinen keruutapahtuma'] = 31 # Date cannot be in the future, so use today's date if uploading on the same year. | |
df['Loppu@mm - Yleinen keruutapahtuma'] = 12 | |
# OBSERVER CONVERSIONS | |
# Replace "ym." with empty string | |
df['Havainnoijat'] = df['Havainnoijat'].str.replace(' ym.', '') | |
# Use semicolon as a separator in "Havainnoijat" column | |
df['Havainnoijat'] = df['Havainnoijat'].str.replace(',', ';') | |
# Replace "NA" with "Anonyymi" as observer name | |
df['Havainnoijat'] = df['Havainnoijat'].replace('NA', 'Anonyymi') | |
# Make observer names hidden | |
df['Havainnoijien nimet ovat julkisia - Yleinen keruutapahtuma'] = 'Ei' | |
# COORDINATE CONVERSIONS | |
# Add a new coodinate system column with a constant value 'ykj' | |
df['Koordinaatit@System - Keruutapahtuma'] = 'ykj' | |
# Split the 'Ruutu' column into N and E columns | |
df[['Koordinaatit@N - Keruutapahtuma', 'Koordinaatit@E - Keruutapahtuma']] = df['Ruutu'].str.split(':', expand=True) | |
# CLEANUP | |
# Rename columns to match Data Bank format | |
df.rename(columns={ | |
'vuosi': 'Alku@yyyy - Yleinen keruutapahtuma', | |
'Laji': 'Laji - Määritys', | |
'Pesimävarmuusindeksi': 'Pesimävarmuusindeksi - Havainto', | |
'Aineisto': 'Tietojen lähde - Havaintoerä', | |
'Havainnoijat': 'Havainnoijat - Yleinen keruutapahtuma' | |
}, inplace=True) | |
# Remove unneeded columns | |
df.drop(columns=['Lähde'], inplace=True) | |
df.drop(columns=['Ruutu'], inplace=True) | |
# Randomize order for test upload | |
#df = df.sample(frac=1).reset_index(drop=True) | |
# Sort | |
df.sort_values(by='Havainnoijat - Yleinen keruutapahtuma', inplace=True) | |
# Save the dataframe to a CSV file with UTF-8 encoding with BOM | |
csv_file_path = 'output-bom.csv' | |
df.to_csv(csv_file_path, index=False, encoding='utf-8-sig') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment