mikkohei13 · January 7, 2025 13:55
diff --git a/convert.py b/convert.py
 '''
 Converts "Luken aineistokooste 4. lintuatlakseen" tsv file into FinBIF Data Bank secondary data format.
 Mikko Heikkinen 2023-12-29, updated 2025-01-07
 '''

 import pandas as pd

 # Save file from Excel as UTF-8 CSV
 # Load the file into a Pandas dataframe using tab as the delimiter. Keep "NA" as a value.
 file_path = 'Luke_lintuatlasdata_2023-2024.csv'
 df = pd.read_csv(file_path, delimiter=';', na_values=[], keep_default_na=False)

 row_count = len(df)
 print(f"Data loaded: {row_count} rows")

 # List columns
 print(df.columns)

 # ADD IDENTIFIER
 # NOTE: this format must not be changed, otherwise the ID will not be persistent if dataset is updated.

 # Fix incorrect delimiters
 df['Ruutu'] = df['Ruutu'].str.replace('.', ':')

 # Fix incorrect species codes, BEFORE id has been generated
 df['Laji'] = df['Laji'].str.replace('Trihyp', 'Acthyp')

 # Add a new "ID" column by concatenating "Ruutu", "Laji" and "vuosi" with a dash as a separator. This should be persistent if dataset is updated.
 df['ID'] = df['Ruutu'].astype(str) + '-' + df['Laji'] + '-' + df['vuosi'].astype(str)
 df['ID'] = df['ID'].str.replace(':', '-') # Replaces coordinate serparator
 df['ID'] = df['ID'].str.lower() # Species codes to lowercase

 # Fix ambiguous species codes, AFTER id has been generated
 df['Laji'] = df['Laji'].str.replace('TRIGLA', 'liro')
 df['Laji'] = df['Laji'].str.replace('Phylus', 'pajulintu')
 df['Laji'] = df['Laji'].str.replace('Tetrix', 'teeri')
 df['Laji'] = df['Laji'].str.replace('TETRIX', 'teeri')

 # DATE CONVERSIONS
 # Duplicate the 'vuosi' column, since all data are from one year only
 df['Loppu@yyyy - Yleinen keruutapahtuma'] = df['vuosi']

 # Fill in begin and end dates
 df['Alku@dd - Yleinen keruutapahtuma'] = 1
 df['Alku@mm - Yleinen keruutapahtuma'] = 1
 df['Loppu@dd - Yleinen keruutapahtuma'] = 31 # Date cannot be in the future, so use today's date if uploading on the same year.
 df['Loppu@mm - Yleinen keruutapahtuma'] = 12

 # OBSERVER CONVERSIONS
 # Replace "ym." with empty string
 df['Havainnoijat'] = df['Havainnoijat'].str.replace(' ym.', '')

 # Use semicolon as a separator in "Havainnoijat" column
 df['Havainnoijat'] = df['Havainnoijat'].str.replace(',', ';')

 # Replace "NA" with "Anonyymi" as observer name
 df['Havainnoijat'] = df['Havainnoijat'].replace('NA', 'Anonyymi')

 # Make observer names hidden
 df['Havainnoijien nimet ovat julkisia - Yleinen keruutapahtuma'] = 'Ei'

 # COORDINATE CONVERSIONS
 # Add a new coodinate system column with a constant value 'ykj'
 df['Koordinaatit@System - Keruutapahtuma'] = 'ykj'

 # Split the 'Ruutu' column into N and E columns
 df[['Koordinaatit@N - Keruutapahtuma', 'Koordinaatit@E - Keruutapahtuma']] = df['Ruutu'].str.split(':', expand=True)

 # CLEANUP
 # Rename columns to match Data Bank format
 df.rename(columns={
    'vuosi': 'Alku@yyyy - Yleinen keruutapahtuma',
    'Laji': 'Laji - Määritys',
    'Pesimävarmuusindeksi': 'Pesimävarmuusindeksi - Havainto',
    'Aineisto': 'Tietojen lähde - Havaintoerä',
    'Havainnoijat': 'Havainnoijat - Yleinen keruutapahtuma'
 }, inplace=True)

 # Remove unneeded columns
 df.drop(columns=['Lähde'], inplace=True)
 df.drop(columns=['Ruutu'], inplace=True)

 # Randomize order for test upload
 #df = df.sample(frac=1).reset_index(drop=True)

 # Sort
 df.sort_values(by='Havainnoijat - Yleinen keruutapahtuma', inplace=True)

 # Save the dataframe to a CSV file with UTF-8 encoding with BOM
 csv_file_path = 'output-bom.csv'
 df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
	'''
	Converts "Luken aineistokooste 4. lintuatlakseen" tsv file into FinBIF Data Bank secondary data format.
	Mikko Heikkinen 2023-12-29, updated 2025-01-07
	'''

	import pandas as pd

	# Save file from Excel as UTF-8 CSV
	# Load the file into a Pandas dataframe using tab as the delimiter. Keep "NA" as a value.
	file_path = 'Luke_lintuatlasdata_2023-2024.csv'
	df = pd.read_csv(file_path, delimiter=';', na_values=[], keep_default_na=False)

	row_count = len(df)
	print(f"Data loaded: {row_count} rows")

	# List columns
	print(df.columns)

	# ADD IDENTIFIER
	# NOTE: this format must not be changed, otherwise the ID will not be persistent if dataset is updated.

	# Fix incorrect delimiters
	df['Ruutu'] = df['Ruutu'].str.replace('.', ':')

	# Fix incorrect species codes, BEFORE id has been generated
	df['Laji'] = df['Laji'].str.replace('Trihyp', 'Acthyp')

	# Add a new "ID" column by concatenating "Ruutu", "Laji" and "vuosi" with a dash as a separator. This should be persistent if dataset is updated.
	df['ID'] = df['Ruutu'].astype(str) + '-' + df['Laji'] + '-' + df['vuosi'].astype(str)
	df['ID'] = df['ID'].str.replace(':', '-') # Replaces coordinate serparator
	df['ID'] = df['ID'].str.lower() # Species codes to lowercase

	# Fix ambiguous species codes, AFTER id has been generated
	df['Laji'] = df['Laji'].str.replace('TRIGLA', 'liro')
	df['Laji'] = df['Laji'].str.replace('Phylus', 'pajulintu')
	df['Laji'] = df['Laji'].str.replace('Tetrix', 'teeri')
	df['Laji'] = df['Laji'].str.replace('TETRIX', 'teeri')

	# DATE CONVERSIONS
	# Duplicate the 'vuosi' column, since all data are from one year only
	df['Loppu@yyyy - Yleinen keruutapahtuma'] = df['vuosi']

	# Fill in begin and end dates
	df['Alku@dd - Yleinen keruutapahtuma'] = 1
	df['Alku@mm - Yleinen keruutapahtuma'] = 1
	df['Loppu@dd - Yleinen keruutapahtuma'] = 31 # Date cannot be in the future, so use today's date if uploading on the same year.
	df['Loppu@mm - Yleinen keruutapahtuma'] = 12

	# OBSERVER CONVERSIONS
	# Replace "ym." with empty string
	df['Havainnoijat'] = df['Havainnoijat'].str.replace(' ym.', '')

	# Use semicolon as a separator in "Havainnoijat" column
	df['Havainnoijat'] = df['Havainnoijat'].str.replace(',', ';')

	# Replace "NA" with "Anonyymi" as observer name
	df['Havainnoijat'] = df['Havainnoijat'].replace('NA', 'Anonyymi')

	# Make observer names hidden
	df['Havainnoijien nimet ovat julkisia - Yleinen keruutapahtuma'] = 'Ei'

	# COORDINATE CONVERSIONS
	# Add a new coodinate system column with a constant value 'ykj'
	df['Koordinaatit@System - Keruutapahtuma'] = 'ykj'

	# Split the 'Ruutu' column into N and E columns
	df[['Koordinaatit@N - Keruutapahtuma', 'Koordinaatit@E - Keruutapahtuma']] = df['Ruutu'].str.split(':', expand=True)

	# CLEANUP
	# Rename columns to match Data Bank format
	df.rename(columns={
	'vuosi': 'Alku@yyyy - Yleinen keruutapahtuma',
	'Laji': 'Laji - Määritys',
	'Pesimävarmuusindeksi': 'Pesimävarmuusindeksi - Havainto',
	'Aineisto': 'Tietojen lähde - Havaintoerä',
	'Havainnoijat': 'Havainnoijat - Yleinen keruutapahtuma'
	}, inplace=True)

	# Remove unneeded columns
	df.drop(columns=['Lähde'], inplace=True)
	df.drop(columns=['Ruutu'], inplace=True)

	# Randomize order for test upload
	#df = df.sample(frac=1).reset_index(drop=True)

	# Sort
	df.sort_values(by='Havainnoijat - Yleinen keruutapahtuma', inplace=True)

	# Save the dataframe to a CSV file with UTF-8 encoding with BOM
	csv_file_path = 'output-bom.csv'
	df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')