Last active
May 14, 2021 10:06
-
-
Save barnabywalters/e260406c6242618adc2c8aa7bab8c584 to your computer and use it in GitHub Desktop.
Find which species in an iNaturalist export only have observations from a single observer.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import argparse | |
import pandas as pd | |
""" | |
Find which species in an iNaturalist export only have observations from a single observer. | |
Get an export from here: https://www.inaturalist.org/observations/export with a query such | |
as quality_grade=research&identifications=any&rank=species&projects[]=92926 and at least the | |
following columns: taxon_id, scientific_name, common_name, user_login | |
Download it, extract the CSV, then run this script with the file name as its argument. It will | |
output basic stats formatted as HTML. | |
The only external module required is pandas. | |
Example usage: | |
py uniquely_observed_species.py wien_cnc_2021.csv > wien_cnc_2021_results.html | |
If you provide the --project-id (-p) argument, the taxa links in the output list will link to
a list of observations of that taxon within that project. Otherwise, they default to linking
to the taxon page.
If a quality_grade column is included, non-research-grade observations will be included in the | |
analysis. Uniquely observed species with no research-grade observations will be marked. Species | |
which were observed by multiple people, only one of which has research-grade observation(s) will | |
also be marked. | |
By Barnaby Walters waterpigs.co.uk | |
""" | |
# Columns the analysis cannot run without (quality_grade is optional).
REQUIRED_COLUMNS = ('taxon_id', 'scientific_name', 'common_name', 'user_login')


def prepare_observations(df):
    """Return the observations that take part in the analysis.

    If the export has no ``quality_grade`` column, every observation is
    assumed to be research grade. Casual observations are filtered out.
    The input frame is not modified.
    """
    if 'quality_grade' not in df.columns:
        df = df.assign(quality_grade='research')
    return df[df['quality_grade'] != 'casual']


def build_species_table(df):
    """Build a per-taxon name lookup indexed by ``taxon_id``.

    Exactly one row is produced per taxon, even when the export spells the
    names inconsistently (e.g. ``common_name`` missing on some rows): for
    each column the first non-missing value seen for that taxon is used.
    """
    return df.groupby('taxon_id', sort=False)[['scientific_name', 'common_name']].first()


def find_uniquely_observed_species(df):
    """Map each observer to the taxa that only they observed.

    A taxon is credited to an observer when either
    * they are the only person with any observation of it, or
    * several people observed it but they are the only one with
      research-grade observation(s).

    Returns a dict ``{user_login: [entry, ...]}`` where each entry is a dict
    with the keys ``id`` (the taxon id), ``has_research_grade`` and
    ``num_other_observers``. Observers and taxa appear in order of first
    occurrence in ``df``.
    """
    unique_by_observer = {}
    # One pass over the grouped frame instead of re-scanning it per taxon.
    for tid, rows in df.groupby('taxon_id', sort=False):
        observers = rows['user_login'].drop_duplicates()
        rg_observers = rows.loc[rows['quality_grade'] == 'research', 'user_login'].drop_duplicates()

        if len(observers) == 1:
            # Only one person made any observations of this species.
            owner = observers.iloc[0]
            entry = {
                'id': tid,
                'has_research_grade': not rg_observers.empty,
                'num_other_observers': 0,
            }
        elif len(rg_observers) == 1:
            # Multiple people observed the species, but only one person has
            # research-grade observation(s).
            owner = rg_observers.iloc[0]
            entry = {
                'id': tid,
                'has_research_grade': True,
                'num_other_observers': len(observers) - 1,
            }
        else:
            continue
        unique_by_observer.setdefault(owner, []).append(entry)
    return unique_by_observer


def render_report(uniquely_observed_species, species, project_id=None):
    """Render the analysis result as an HTML fragment and return it as a string.

    ``uniquely_observed_species`` is the mapping produced by
    ``find_uniquely_observed_species``; ``species`` is the lookup produced by
    ``build_species_table``. When ``project_id`` is given, taxon links point
    at that project's observations of the taxon instead of the taxon page.
    All values taken from the export are HTML-escaped.
    """
    # Imported locally so this block does not depend on edits to the file header.
    from html import escape
    from urllib.parse import quote

    # Sort observers by number of unique species (stable, so ties keep input order).
    ranked = sorted(uniquely_observed_species.items(), key=lambda item: len(item[1]), reverse=True)
    total_taxa = sum(len(taxa) for _, taxa in ranked)
    mentions = ''.join(f"@{escape(str(observer), quote=False)} " for observer, _ in ranked)

    lines = [
        f"<p>{total_taxa} taxa uniquely observed by {len(ranked)} observers.</p>",
        '<p>',
        mentions + '</p>',
        '<p><b>bold</b> species are ones for which the given observer has one or more research-grade observations.</p>',
        '<p>If only one person has RG observations of a species, but other people have observations which need ID, the number of needs-ID observers are indicated in parentheses.</p>',
    ]

    for observer, taxa in ranked:
        login = str(observer)
        profile_url = escape(f"https://www.inaturalist.org/people/{quote(login)}")
        lines.append(
            f"""\n\n<p><a href="{profile_url}">@{escape(login, quote=False)}</a> ({len(taxa)} taxa):</p><ul>"""
        )

        for tobv in sorted(taxa, key=lambda t: species.at[t['id'], 'scientific_name']):
            tid = tobv['id']
            if project_id:
                taxa_url = f"https://www.inaturalist.org/observations?taxon_id={tid}&project_id={quote(str(project_id))}"
            else:
                taxa_url = f'https://www.inaturalist.org/taxa/{tid}'
            taxa_url = escape(taxa_url)

            rgb, rge = ('<b>', '</b>') if tobv.get('has_research_grade') else ('', '')
            num_others = tobv.get('num_other_observers', 0)
            others = f" ({num_others})" if num_others > 0 else ''

            label = f"<i>{escape(str(species.at[tid, 'scientific_name']), quote=False)}</i>"
            common_name = species.at[tid, 'common_name']
            if not pd.isnull(common_name):
                label += f" ({escape(str(common_name), quote=False)})"

            lines.append(f"""<li><a href="{taxa_url}">{rgb}{label}{rge}{others}</a></li>""")

        lines.append("</ul>")

    return '\n'.join(lines)


def main(argv=None):
    """Command-line entry point: read the export and print the HTML report."""
    parser = argparse.ArgumentParser(description='Given an iNaturalist observation export, find species which were only observed by a single person.')
    parser.add_argument('export_file')
    parser.add_argument('-p', '--project-id', dest='project_id', default=None)
    args = parser.parse_args(argv)

    df = pd.read_csv(args.export_file)

    # Fail with a readable message rather than a KeyError deep inside pandas.
    missing = [column for column in REQUIRED_COLUMNS if column not in df.columns]
    if missing:
        parser.error(f"{args.export_file} is missing required column(s): {', '.join(missing)}")

    df = prepare_observations(df)
    print(render_report(
        find_uniquely_observed_species(df),
        build_species_table(df),
        project_id=args.project_id,
    ))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment