Skip to content

Instantly share code, notes, and snippets.

@mikkohei13
Last active January 7, 2025 13:55
Show Gist options
  • Save mikkohei13/bad3c66f1cb86613a49bce6c3ad0c032 to your computer and use it in GitHub Desktop.
Save mikkohei13/bad3c66f1cb86613a49bce6c3ad0c032 to your computer and use it in GitHub Desktop.
Convert Luke atlas dataset 2024 to FinBIF Data Bank format
'''
Converts "Luken aineistokooste 4. lintuatlakseen" tsv file into FinBIF Data Bank secondary data format.
Mikko Heikkinen 2023-12-29, updated 2025-01-07
'''
import pandas as pd
# Save file from Excel as UTF-8 CSV
# Load the file into a Pandas dataframe using tab as the delimiter. Keep "NA" as a value.
file_path = 'Luke_lintuatlasdata_2023-2024.csv'
df = pd.read_csv(file_path, delimiter=';', na_values=[], keep_default_na=False)
row_count = len(df)
print(f"Data loaded: {row_count} rows")
# List columns
print(df.columns)
# ADD IDENTIFIER
# NOTE: this format must not be changed, otherwise the ID will not be persistent if dataset is updated.
# Fix incorrect delimiters
df['Ruutu'] = df['Ruutu'].str.replace('.', ':')
# Fix incorrect species codes, BEFORE id has been generated
df['Laji'] = df['Laji'].str.replace('Trihyp', 'Acthyp')
# Add a new "ID" column by concatenating "Ruutu", "Laji" and "vuosi" with a dash as a separator. This should be persistent if dataset is updated.
df['ID'] = df['Ruutu'].astype(str) + '-' + df['Laji'] + '-' + df['vuosi'].astype(str)
df['ID'] = df['ID'].str.replace(':', '-') # Replaces coordinate serparator
df['ID'] = df['ID'].str.lower() # Species codes to lowercase
# Fix ambiguous species codes, AFTER id has been generated
df['Laji'] = df['Laji'].str.replace('TRIGLA', 'liro')
df['Laji'] = df['Laji'].str.replace('Phylus', 'pajulintu')
df['Laji'] = df['Laji'].str.replace('Tetrix', 'teeri')
df['Laji'] = df['Laji'].str.replace('TETRIX', 'teeri')
# DATE CONVERSIONS
# Duplicate the 'vuosi' column, since all data are from one year only
df['Loppu@yyyy - Yleinen keruutapahtuma'] = df['vuosi']
# Fill in begin and end dates
df['Alku@dd - Yleinen keruutapahtuma'] = 1
df['Alku@mm - Yleinen keruutapahtuma'] = 1
df['Loppu@dd - Yleinen keruutapahtuma'] = 31 # Date cannot be in the future, so use today's date if uploading on the same year.
df['Loppu@mm - Yleinen keruutapahtuma'] = 12
# OBSERVER CONVERSIONS
# Replace "ym." with empty string
df['Havainnoijat'] = df['Havainnoijat'].str.replace(' ym.', '')
# Use semicolon as a separator in "Havainnoijat" column
df['Havainnoijat'] = df['Havainnoijat'].str.replace(',', ';')
# Replace "NA" with "Anonyymi" as observer name
df['Havainnoijat'] = df['Havainnoijat'].replace('NA', 'Anonyymi')
# Make observer names hidden
df['Havainnoijien nimet ovat julkisia - Yleinen keruutapahtuma'] = 'Ei'
# COORDINATE CONVERSIONS
# Add a new coodinate system column with a constant value 'ykj'
df['Koordinaatit@System - Keruutapahtuma'] = 'ykj'
# Split the 'Ruutu' column into N and E columns
df[['Koordinaatit@N - Keruutapahtuma', 'Koordinaatit@E - Keruutapahtuma']] = df['Ruutu'].str.split(':', expand=True)
# CLEANUP
# Rename columns to match Data Bank format
df.rename(columns={
'vuosi': 'Alku@yyyy - Yleinen keruutapahtuma',
'Laji': 'Laji - Määritys',
'Pesimävarmuusindeksi': 'Pesimävarmuusindeksi - Havainto',
'Aineisto': 'Tietojen lähde - Havaintoerä',
'Havainnoijat': 'Havainnoijat - Yleinen keruutapahtuma'
}, inplace=True)
# Remove unneeded columns
df.drop(columns=['Lähde'], inplace=True)
df.drop(columns=['Ruutu'], inplace=True)
# Randomize order for test upload
#df = df.sample(frac=1).reset_index(drop=True)
# Sort
df.sort_values(by='Havainnoijat - Yleinen keruutapahtuma', inplace=True)
# Save the dataframe to a CSV file with UTF-8 encoding with BOM
csv_file_path = 'output-bom.csv'
df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment