harshvardhaniimi · April 19, 2023 19:54
diff --git a/compare_dataframes.py b/compare_dataframes.py
 import pandas as pd
 import hashlib

 def hash_dataframe(df):
    """
    Return a SHA-256 hex digest that fingerprints a DataFrame.

    Every row is first reduced to one hash value with pandas'
    `hash_pandas_object` (the index is included in that hash); the resulting
    array of row hashes is then passed to `hashlib.sha256`.

    Args:
        df (pd.DataFrame): The DataFrame to fingerprint.

    Returns:
        str: Hexadecimal SHA-256 digest of the array of row hashes.
    """
    # index=True makes the index contribute to each row's hash
    per_row = pd.util.hash_pandas_object(df, index=True)

    # sha256 reads the underlying buffer of the row-hash array directly
    digest = hashlib.sha256(per_row.values)
    return digest.hexdigest()

 def are_dataframes_equal(df1, df2):
    """
    Decide whether two DataFrames are equal by comparing their digests.

    Shape, column labels and dtypes are checked first; any mismatch returns
    False without hashing. Only when all three agree are the `hash_dataframe`
    digests of both frames compared. Because the last step compares digests
    rather than the values themselves, a hash collision could in principle
    report two different frames as equal.

    Args:
        df1 (pd.DataFrame): The first DataFrame to be compared.
        df2 (pd.DataFrame): The second DataFrame to be compared.

    Returns:
        bool: True if the structural checks pass and the digests match,
        False otherwise.
    """
    # Structural checks, cheapest first; each one short-circuits the rest
    if df1.shape != df2.shape:
        return False
    if (df1.columns != df2.columns).any():
        return False
    if (df1.dtypes != df2.dtypes).any():
        return False

    # Same structure: fall back to comparing the content digests
    digest_1 = hash_dataframe(df1)
    digest_2 = hash_dataframe(df2)
    return digest_1 == digest_2

 # Read the two CSV files that are going to be compared
 df1, df2 = (pd.read_csv(csv_path) for csv_path in ('file1.csv', 'file2.csv'))

 # Report whether both files produced equal DataFrames
 result = are_dataframes_equal(df1, df2)
 print("DataFrames are equal:", result)
	import pandas as pd
	import hashlib

	def hash_dataframe(df):
	    """
	    Return a SHA-256 hex digest that fingerprints a DataFrame.

	    Every row is first reduced to one hash value with pandas'
	    `hash_pandas_object` (the index is included in that hash); the resulting
	    array of row hashes is then passed to `hashlib.sha256`.

	    Args:
	        df (pd.DataFrame): The DataFrame to fingerprint.

	    Returns:
	        str: Hexadecimal SHA-256 digest of the array of row hashes.
	    """
	    # index=True makes the index contribute to each row's hash
	    row_hashes = pd.util.hash_pandas_object(df, index=True).values

	    # sha256 reads the underlying buffer of the row-hash array directly
	    return hashlib.sha256(row_hashes).hexdigest()

	def are_dataframes_equal(df1, df2):
	    """
	    Decide whether two DataFrames are equal by comparing their digests.

	    Shape, column labels and dtypes are checked first; any mismatch returns
	    False without hashing. Only when all three agree are the `hash_dataframe`
	    digests of both frames compared. Because the last step compares digests
	    rather than the values themselves, a hash collision could in principle
	    report two different frames as equal.

	    Args:
	        df1 (pd.DataFrame): The first DataFrame to be compared.
	        df2 (pd.DataFrame): The second DataFrame to be compared.

	    Returns:
	        bool: True if the structural checks pass and the digests match,
	        False otherwise.
	    """
	    # Check shape, columns, and dtypes; `or` short-circuits, so the
	    # element-wise comparisons only run on frames of matching shape
	    if df1.shape != df2.shape or (df1.columns != df2.columns).any() or (df1.dtypes != df2.dtypes).any():
	        return False

	    # Same structure: compare the content digests
	    return hash_dataframe(df1) == hash_dataframe(df2)

	# Read the two CSV files that are going to be compared
	df1, df2 = (pd.read_csv(csv_path) for csv_path in ('file1.csv', 'file2.csv'))

	# Report whether both files produced equal DataFrames
	result = are_dataframes_equal(df1, df2)
	print("DataFrames are equal:", result)