mikkohei13 · November 14, 2024 14:52
diff --git a/compare.py b/compare.py
 # Compares IDs from two files and outputs the IDs that are in the first file but not in the second file.

 import pandas as pd

 def read_column_to_list(filename, column_name="id", separator=','):
     r"""
     Read a single column from a CSV or TSV file into a list.

     Only the requested column is kept when parsing (``usecols``), so the
     other columns of the file are never loaded into the result.

     :param filename: The path to the CSV or TSV file.
     :param column_name: The name of the column to read (default is "id").
     :param separator: The delimiter used in the file, default is ',' for CSV.
                       Use '\t' for TSV files.
     :return: A list containing the values from the specified column. If the
              file cannot be read or parsed, the error is printed and an
              empty list is returned instead of raising.
     """
     try:
         # Parse the whole file but keep only the requested column.
         # skipinitialspace drops blanks that follow a delimiter.
         column_data = pd.read_csv(
             filename,
             usecols=[column_name],
             sep=separator,
             skipinitialspace=True,
             encoding="utf-8",
         )
         return column_data[column_name].tolist()
     except ValueError as e:
         # pandas reports a missing `usecols` column (and malformed input)
         # as ValueError, hence the more specific hint in this message.
         print(f"Error: {e}. Column '{column_name}' may not exist, or there may be formatting issues.")
         return []
     except Exception as e:
         # Best-effort contract: any other failure (e.g. the file does not
         # exist) is reported and turned into an empty result.
         print(f"An error occurred: {e}")
         return []



 # First file: Laji.fi data
 #filename = 'rows-head.tsv' # debug data
 filename = 'rows_HBF.96887.tsv'
 laji_ids = read_column_to_list(filename, "Document.DocumentID", separator='\t')

 # Strip the URI prefix from each document id and convert the remainder to
 # int, so it can be compared against the `id` column of the iNaturalist
 # file (assumed to be numeric - confirm against the export).
 # The loop variable is not called `id`, to avoid shadowing the builtin.
 laji_ids = [int(doc_uri.replace('http://tun.fi/HR.3211/', '')) for doc_uri in laji_ids]

 print("Laji.fi file ready")


 # Second file: iNaturalist data
 #filename = 'inat-head.csv' # debug data
 filename = 'inaturalist-suomi-20-observations.csv'
 inat_ids = read_column_to_list(filename, "id", separator=',')

 print("iNat file ready")


 # Convert lists to sets for fast membership tests and set arithmetic
 set_laji = set(laji_ids)
 set_inat = set(inat_ids)

 # Print a sample of up to 10 ids from each set (set order is arbitrary)
 print("Top 10 of Laji.fi")
 print(list(set_laji)[:10])
 print("Top 10 of iNat")
 print(list(set_inat)[:10])

 common_elements = set_laji.intersection(set_inat)
 print(f"Number of common elements: {len(common_elements)}")

 # Find ids that are in the Laji.fi file but not in the iNaturalist file
 missing_ids = set_laji - set_inat

 # Report row counts (lists, so duplicates are included) and the number of missing ids
 print(f"Total rows in iNat: {len(inat_ids)}")
 print(f"Total rows in Laji.fi: {len(laji_ids)}")

 print(f"Missing id's: {len(missing_ids)}")

 # Export missing ids to a file, one per line, sorted so that the output is
 # reproducible between runs
 with open('missing_ids.txt', 'w', encoding='utf-8') as f:
     f.writelines(f"{missing_id}\n" for missing_id in sorted(missing_ids))
	# Compares IDs from two files and outputs the IDs that are in the first file but not in the second file.

	import pandas as pd

	def read_column_to_list(filename, column_name="id", separator=','):
	    r"""
	    Read a single column from a CSV or TSV file into a list.

	    Only the requested column is kept when parsing (``usecols``), so the
	    other columns of the file are never loaded into the result.

	    :param filename: The path to the CSV or TSV file.
	    :param column_name: The name of the column to read (default is "id").
	    :param separator: The delimiter used in the file, default is ',' for CSV.
	                      Use '\t' for TSV files.
	    :return: A list containing the values from the specified column. If the
	             file cannot be read or parsed, the error is printed and an
	             empty list is returned instead of raising.
	    """
	    try:
	        # Parse the whole file but keep only the requested column.
	        # skipinitialspace drops blanks that follow a delimiter.
	        column_data = pd.read_csv(
	            filename,
	            usecols=[column_name],
	            sep=separator,
	            skipinitialspace=True,
	            encoding="utf-8",
	        )
	        return column_data[column_name].tolist()
	    except ValueError as e:
	        # pandas reports a missing `usecols` column (and malformed input)
	        # as ValueError, hence the more specific hint in this message.
	        print(f"Error: {e}. Column '{column_name}' may not exist, or there may be formatting issues.")
	        return []
	    except Exception as e:
	        # Best-effort contract: any other failure (e.g. the file does not
	        # exist) is reported and turned into an empty result.
	        print(f"An error occurred: {e}")
	        return []



	# First file: Laji.fi data
	#filename = 'rows-head.tsv' # debug data
	filename = 'rows_HBF.96887.tsv'
	laji_ids = read_column_to_list(filename, "Document.DocumentID", separator='\t')

	# Strip the URI prefix from each document id and convert the remainder to
	# int, so it can be compared against the `id` column of the iNaturalist
	# file (assumed to be numeric - confirm against the export).
	# The loop variable is not called `id`, to avoid shadowing the builtin.
	laji_ids = [int(doc_uri.replace('http://tun.fi/HR.3211/', '')) for doc_uri in laji_ids]

	print("Laji.fi file ready")


	# Second file: iNaturalist data
	#filename = 'inat-head.csv' # debug data
	filename = 'inaturalist-suomi-20-observations.csv'
	inat_ids = read_column_to_list(filename, "id", separator=',')

	print("iNat file ready")


	# Convert lists to sets for fast membership tests and set arithmetic
	set_laji = set(laji_ids)
	set_inat = set(inat_ids)

	# Print a sample of up to 10 ids from each set (set order is arbitrary)
	print("Top 10 of Laji.fi")
	print(list(set_laji)[:10])
	print("Top 10 of iNat")
	print(list(set_inat)[:10])

	common_elements = set_laji.intersection(set_inat)
	print(f"Number of common elements: {len(common_elements)}")

	# Find ids that are in the Laji.fi file but not in the iNaturalist file
	missing_ids = set_laji - set_inat

	# Report row counts (lists, so duplicates are included) and the number of missing ids
	print(f"Total rows in iNat: {len(inat_ids)}")
	print(f"Total rows in Laji.fi: {len(laji_ids)}")

	print(f"Missing id's: {len(missing_ids)}")

	# Export missing ids to a file, one per line, sorted so that the output is
	# reproducible between runs
	with open('missing_ids.txt', 'w', encoding='utf-8') as f:
	    f.writelines(f"{missing_id}\n" for missing_id in sorted(missing_ids))