Created
November 14, 2024 14:52
-
-
Save mikkohei13/fc7ccb467b8088a5ad3ec8310cff3b1a to your computer and use it in GitHub Desktop.
Compares IDs from two files and outputs the IDs that are in the first file but not in the second file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compares IDs from two files and outputs the IDs that are in the first file but not in the second file
import pandas as pd | |
def read_column_to_list(filename, column_name="id", separator=','):
    r"""
    Read a single column from a delimited text file (CSV or TSV) into a list.

    Only the requested column is parsed (via ``usecols``), so the other
    columns of a large file are not loaded into the resulting DataFrame.

    :param filename: The path to the CSV or TSV file.
    :param column_name: The name of the column to read (default is "id").
    :param separator: The delimiter used in the file, default is ',' for CSV.
        Use '\t' for TSV files.
    :return: A list containing the values from the specified column. An empty
        list is returned (and a message is printed) when the column cannot be
        found or the file cannot be read or parsed.
    """
    try:
        # usecols restricts parsing to the one column; pandas raises ValueError
        # if that column is not in the header. skipinitialspace drops spaces
        # that follow a delimiter.
        column_data = pd.read_csv(
            filename,
            usecols=[column_name],
            sep=separator,
            skipinitialspace=True,
            encoding="utf-8",
        )
        return column_data[column_name].tolist()
    except ValueError as e:
        print(f"Error: {e}. Column '{column_name}' may not exist, or there may be formatting issues.")
        return []
    except Exception as e:
        # Best-effort by design: any other failure (e.g. a missing file) is
        # reported and turned into an empty result instead of propagating.
        print(f"An error occurred: {e}")
        return []
# First file: Laji.fi data
#filename = 'rows-head.tsv' # debug data
filename = 'rows_HBF.96887.tsv'
laji_ids = read_column_to_list(filename, "Document.DocumentID", separator='\t')

# Strip the URI prefix from each document id and convert the remainder to int,
# so the values can be compared with the ids read from the second file.
laji_ids = [int(doc_id.replace('http://tun.fi/HR.3211/', '')) for doc_id in laji_ids]
print("Laji.fi file ready")

# Second file: iNaturalist data
#filename = 'inat-head.csv' # debug data
filename = 'inaturalist-suomi-20-observations.csv'
inat_ids = read_column_to_list(filename, "id", separator=',')
print("iNat file ready")

# Convert lists to sets for fast membership tests and set arithmetic
set_laji = set(laji_ids)
set_inat = set(inat_ids)

# Print 10 ids from each set as a sanity check. Sets are unordered, so this is
# an arbitrary sample rather than a ranked "top 10".
print("Top 10 of Laji.fi")
print(list(set_laji)[:10])
print("Top 10 of iNat")
print(list(set_inat)[:10])

common_elements = set_laji.intersection(set_inat)
print(f"Number of common elements: {len(common_elements)}")

# Find ids that are in the Laji.fi file but not in the iNaturalist file
missing_ids = set_laji - set_inat

# Count missing id's (the totals are list lengths, so duplicates are included)
print(f"Total rows in iNat: {len(inat_ids)}")
print(f"Total rows in Laji.fi: {len(laji_ids)}")
print(f"Missing id's: {len(missing_ids)}")

# Export missing id's to a file, one per line. Sorted so that the output is
# identical between runs; explicit encoding so it does not depend on the locale.
with open('missing_ids.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{missing_id}\n" for missing_id in sorted(missing_ids))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment