Created
          April 19, 2023 19:54 
        
      - 
      
 - 
        
Save harshvardhaniimi/89441c296316837444b606d13c843a95 to your computer and use it in GitHub Desktop.  
    A function to compare large data frames by comparing their hashes instead of values for efficiency
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import pandas as pd | |
| import hashlib | |
def hash_dataframe(df):
    """
    Return a SHA-256 fingerprint of a DataFrame's contents.

    pandas' `hash_pandas_object` first reduces every row (index included)
    to a single hash value; the array of those per-row hashes is then fed
    to `hashlib.sha256` to obtain one digest for the whole frame.

    Args:
        df (pd.DataFrame): The DataFrame to fingerprint.

    Returns:
        str: Hexadecimal SHA-256 digest of the per-row hashes.
    """
    digest = hashlib.sha256()
    # One hash per row; index=True makes the index labels part of each row hash
    per_row = pd.util.hash_pandas_object(df, index=True)
    # The underlying array exposes the buffer protocol, so it can be hashed directly
    digest.update(per_row.values)
    return digest.hexdigest()
def are_dataframes_equal(df1, df2):
    """
    Compare two DataFrames for equality using their SHA-256 hashes.

    The cheap structural properties (shape, column labels, dtypes) are
    checked first; only when all of them match are the SHA-256 hashes of
    the two frames computed and compared.

    Note on collisions: the SHA-256 digest is computed over the per-row
    hashes produced by pandas' `hash_pandas_object`, which are 64-bit
    values. The chance that two different rows collide is therefore on the
    order of 1 in 2^64 (not 1 in 2^256), which is still negligible for
    ordinary data comparison but is not a cryptographic guarantee.

    Args:
        df1 (pd.DataFrame): The first DataFrame to be compared.
        df2 (pd.DataFrame): The second DataFrame to be compared.

    Returns:
        bool: True if the DataFrames are equal, False otherwise.
    """
    # Different dimensions can never be equal; this also guarantees the
    # column and dtype comparisons below operate on same-length objects.
    if df1.shape != df2.shape:
        return False

    # Index.equals compares labels and their order, and treats NaN labels in
    # the same position as equal. An element-wise `!=` would report a frame
    # with a NaN column label as different from its own copy.
    if not df1.columns.equals(df2.columns):
        return False

    # Same labels are guaranteed here, so the dtype Series align one-to-one.
    if (df1.dtypes != df2.dtypes).any():
        return False

    # Structure matches: fall back to comparing content fingerprints.
    return hash_dataframe(df1) == hash_dataframe(df2)
# Read the two CSV files to compare (file1.csv first, then file2.csv)
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('file1.csv', 'file2.csv'))

# Run the hash-based comparison and report the outcome
result = are_dataframes_equal(df1, df2)
print("DataFrames are equal:", result)
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment