Skip to content

Instantly share code, notes, and snippets.

@mikkohei13
Created November 14, 2024 14:52
Show Gist options
  • Save mikkohei13/fc7ccb467b8088a5ad3ec8310cff3b1a to your computer and use it in GitHub Desktop.
Save mikkohei13/fc7ccb467b8088a5ad3ec8310cff3b1a to your computer and use it in GitHub Desktop.
Compares IDs from two files and outputs the IDs that are in the first file but not in the second file
# Compares IDs from two files and outputs the IDs that are in the first file but not in the second file
import pandas as pd
def read_column_to_list(filename, column_name="id", separator=','):
    r"""
    Read a single column from a CSV or TSV file into a list.

    Only the requested column is loaded (via ``usecols``), so the other
    columns of a wide file are not kept in memory.

    This is a best-effort reader: any failure is reported on stdout and an
    empty list is returned instead of raising.

    :param filename: The path to the CSV or TSV file (read as UTF-8).
    :param column_name: The name of the column to read (default is "id").
    :param separator: The delimiter used in the file, default is ',' for CSV.
        Use '\t' for TSV files.
    :return: A list containing the values from the specified column, or an
        empty list if the file could not be read or the column is missing.
    """
    try:
        # Parse only the requested column; pandas raises ValueError when the
        # column named in `usecols` is not present in the header row.
        column_data = pd.read_csv(
            filename,
            usecols=[column_name],
            sep=separator,
            skipinitialspace=True,  # ignore spaces that follow a delimiter
            encoding="utf-8",
        )
        return column_data[column_name].tolist()
    except ValueError as e:
        print(f"Error: {e}. Column '{column_name}' may not exist, or there may be formatting issues.")
        return []
    except Exception as e:
        # Deliberate catch-all (e.g. missing file): report and fall back to
        # an empty result so the calling script can continue.
        print(f"An error occurred: {e}")
        return []
# Script body: load the document IDs exported from Laji.fi and the observation
# IDs exported from iNaturalist, report how they overlap, and write the IDs
# found only in the Laji.fi file to 'missing_ids.txt'.

# Prefix that Laji.fi puts in front of the numeric part of each document ID.
LAJI_ID_PREFIX = 'http://tun.fi/HR.3211/'

# First file: Laji.fi data
# filename = 'rows-head.tsv'  # debug data
filename = 'rows_HBF.96887.tsv'
laji_ids = read_column_to_list(filename, "Document.DocumentID", separator='\t')
# Strip the URI prefix from each ID and convert the remainder to int so the
# values are comparable with the IDs read from the iNaturalist file.
laji_ids = [int(document_uri.replace(LAJI_ID_PREFIX, '')) for document_uri in laji_ids]
print("Laji.fi file ready")

# Second file: iNaturalist data
# filename = 'inat-head.csv'  # debug data
filename = 'inaturalist-suomi-20-observations.csv'
inat_ids = read_column_to_list(filename, "id", separator=',')
print("iNat file ready")

# Convert lists to sets for fast membership tests and set arithmetic
set_laji = set(laji_ids)
set_inat = set(inat_ids)

# Print a 10-element sample of both sets as a sanity check.
# NOTE: sets are unordered, so these are arbitrary elements, not a ranking.
print("Top 10 of Laji.fi")
print(list(set_laji)[:10])
print("Top 10 of iNat")
print(list(set_inat)[:10])

common_elements = set_laji.intersection(set_inat)
print(f"Number of common elements: {len(common_elements)}")

# Find IDs present in the Laji.fi file but absent from the iNaturalist file
missing_ids = set_laji - set_inat

# Row counts come from the lists (duplicates included); the missing count
# comes from the set difference (unique IDs only).
print(f"Total rows in iNat: {len(inat_ids)}")
print(f"Total rows in Laji.fi: {len(laji_ids)}")
print(f"Missing id's: {len(missing_ids)}")

# Export missing IDs to a file, one per line. They are sorted so that the
# output is identical between runs instead of following set iteration order.
with open('missing_ids.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{missing_id}\n" for missing_id in sorted(missing_ids))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment