Last active
January 13, 2025 21:16
-
-
Save mikkohei13/7c8c6297d163c0ba9c8f6f98eb669dc2 to your computer and use it in GitHub Desktop.
Script to convert CSV tabular file to FinBIF data bank format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script to make conversions to a CSV tabular file containing bird species observations (occurrence records) | |
Updated for 2024 file format | |
Data format sample: | |
id;pvm;aika;laji;lkm;ykj_p;ykj_i;PV-indeksi | |
1593921;01.02.2024;07:16;teeri;1;6890000;3410000;2 | |
1593922;01.02.2024;08:04;metso;1;7070000;3380000;2 | |
1593925;01.02.2024;08:17;metso;1;6980000;3230000;2 | |
1593941;01.02.2024;10:39;teeri;80;7090000;3470000;2 | |
1594064;01.02.2024;11:00;peltopyy;9;6970000;3280000;2 | |
The final file should contain the following columns: | |
ID - Havaintoerä | |
Havainnoijat - Yleinen keruutapahtuma | |
Alku - Yleinen keruutapahtuma | |
Koordinaatit@N - Keruutapahtuma | |
Koordinaatit@E - Keruutapahtuma | |
Koordinaatit@System - Keruutapahtuma | |
Laji - Määritys | |
Määrä - Havainto | |
Pesimävarmuusindeksi - Havainto | |
""" | |
import pandas | |
# Function to load csv file using semicolons as separators into Pandas DataFrame | |
def load_csv(filename): | |
df = pandas.read_csv(filename, sep=';', encoding='utf-8') | |
return df | |
# Process ykj_p and ykj_i columns - only truncate if ending in "0000" | |
def process_coordinate(x, column_name): | |
str_x = str(x) | |
if str_x.endswith('0000'): | |
return str_x[:3] | |
else: | |
print(f"Warning: Unexpected coordinate format in {column_name}: {str_x}") | |
return str_x | |
expected_year = "2024" | |
filename = 'sample.csv' | |
filename = 'OR_havainnot_lintuatlakseen_2022_tarkastettu.csv' | |
filename = 'OR_havainnot_lintuatlakseen_2023_tarkastettu.csv' | |
filename = 'Oma_riista_havainnot_lintuatlas_2024_valmis.csv' | |
df = load_csv(filename) | |
# Convert date (pvm) column to datetime format YYYY-MM-DD | |
df['pvm'] = pandas.to_datetime(df['pvm'], format='%d.%m.%Y').dt.strftime('%Y-%m-%d') | |
# Check that year is expected_year on all rows | |
#if not df['pvm'].str.contains(expected_year).all(): | |
# print(f"Warning: Year is not {expected_year} on all rows") | |
# exit(1) | |
# Combine date (pvm) and time (aika) columns to a single column "Alku - Yleinen keruutapahtuma" datetime format YYYY-MM-DD'T'HH:MM:SS | |
df['Alku - Yleinen keruutapahtuma'] = df['pvm'] + 'T' + df['aika'] + ':00' | |
# If aika field is missing leading hour digit, add it (e.g. 9:50 -> 09:50) | |
df['aika'] = df['aika'].apply(lambda x: '0' + x if len(x) == 4 else x) | |
# Remove columns pvm and aika | |
df.drop(['pvm', 'aika'], axis=1, inplace=True) | |
# On lkm field, replace NA values with empty string | |
df['lkm'] = df['lkm'].fillna('') | |
# On lkm field, convert decimal values to integer. Ignore empty strings. | |
df['lkm'] = df['lkm'].apply(lambda x: int(float(x)) if x != '' else x) | |
# On pesimävarmuusindeksi field, replace NA values with empty string | |
df['PV-indeksi'] = df['PV-indeksi'].fillna('') | |
# On pesimävarmuusindeksi field, convert decimal values to integer. Ignore empty strings. | |
df['PV-indeksi'] = df['PV-indeksi'].apply(lambda x: int(float(x)) if x != '' else x) | |
df['ykj_p'] = df['ykj_p'].apply(lambda x: process_coordinate(x, 'ykj_p')) | |
df['ykj_i'] = df['ykj_i'].apply(lambda x: process_coordinate(x, 'ykj_i')) | |
# Add column "Koordinaatit@System - Keruutapahtuma" with value "ykj" | |
df['Koordinaatit@System - Keruutapahtuma'] = 'ykj' | |
# Add column "Havainnoijat - Yleinen keruutapahtuma" with value "Anonyymi" | |
df['Havainnoijat - Yleinen keruutapahtuma'] = 'Anonyymi' | |
# Remove rows that have NaN values on id column | |
df.dropna(subset=['id'], inplace=True) | |
# Prepend id with "{expectedyear}_" | |
df['id'] = df['id'].apply(lambda x: f"{expected_year}_{x}") | |
# Final step: rename columns | |
# Rename laji to "Laji - Määritys" | |
df.rename(columns={'laji': 'Laji - Määritys'}, inplace=True) | |
# Rename pesimävarmuusindeksi to "Pesimävarmuusindeksi - Havainto" | |
df.rename(columns={'PV-indeksi': 'Pesimävarmuusindeksi - Havainto'}, inplace=True) | |
# Rename ID to "ID - Havaintoerä" | |
df.rename(columns={'id': 'ID - Havaintoerä'}, inplace=True) | |
# Rename lkm to "Lukumäärä - Yleinen keruutapahtuma" | |
df.rename(columns={'lkm': 'Määrä - Havainto'}, inplace=True) | |
# Rename ykj_p and ykj_i to "Koordinaatit@N - Keruutapahtuma" and "Koordinaatit@E - Keruutapahtuma" | |
df.rename(columns={'ykj_p': 'Koordinaatit@N - Keruutapahtuma', 'ykj_i': 'Koordinaatit@E - Keruutapahtuma'}, inplace=True) | |
print(df) | |
# Save the file as csv file UTF-8 with BOM, using filename with prefix "converted_" | |
df.to_csv('converted_' + filename, sep=';', encoding='utf-8-sig', index=False) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment