Created
June 1, 2023 08:34
-
-
Save mikkohei13/52b6635b8089db367b869b07627c977a to your computer and use it in GitHub Desktop.
Script reads laji.fi citatable data file, and creates Excel file and a line chart of weekly species distribution.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import matplotlib.pyplot as plt | |
# Load the csv file | |
df = pd.read_csv('rows_HBF.75704.tsv', sep='\t') | |
# Remove those without a day of year | |
df = df.dropna(subset=['Gathering.Conversions.DayOfYearBegin']) | |
# Set day of years as integers | |
df['Gathering.Conversions.DayOfYearBegin'] = df['Gathering.Conversions.DayOfYearBegin'].astype(int) | |
df['Gathering.Conversions.DayOfYearEnd'] = df['Gathering.Conversions.DayOfYearEnd'].astype(int) | |
# Filter out observations that have a long timespan | |
filtered_df = df[df['Gathering.Conversions.DayOfYearEnd'] - df['Gathering.Conversions.DayOfYearBegin'] <= 5] | |
# Convert day of the year to week of the year | |
df['week_of_year'] = df['Gathering.Conversions.DayOfYearBegin'].apply(lambda x: (x-1)//7+1) | |
# Combine common & scientific name | |
df['name'] = df['Taxon.FinnishName'].astype(str).str.capitalize() + " (" + df['Taxon.ScientificName'].astype(str) + ")" | |
# Calculate weekly counts | |
weekly_counts = df.groupby(['name', 'week_of_year']).size().reset_index(name='weekly_counts') | |
# Calculate total counts | |
total_counts = df.groupby('name').size().reset_index(name='total_counts') | |
# Merge weekly and total counts | |
df_counts = pd.merge(weekly_counts, total_counts, on='name') | |
# Calculate the percentage | |
df_counts['percentage'] = (df_counts['weekly_counts'] / df_counts['total_counts']) * 100 | |
# Filter out weeks with percentage | |
result = df_counts[df_counts['percentage'] > 10] | |
print(result) | |
result.to_excel('weeks.xlsx', index=False) | |
# Count the unique species in each week | |
unique_species_per_week = result.groupby('week_of_year')['name'].nunique() | |
# Create a line chart | |
plt.figure(figsize=(10,6)) | |
plt.plot(unique_species_per_week.index, unique_species_per_week.values) | |
plt.xlabel('Viikko') | |
plt.ylabel('Lajimäärä') | |
plt.title('Lajit, joiden havainnoista 10 % ko. viikolta') | |
plt.grid(True) | |
plt.savefig('line_chart.png') # save the plot to a file |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment