miguelvb · January 8, 2025 15:44
diff --git a/claude script companies merge.md b/claude script companies merge.md
diff --git a/script.py b/script.py
 import pandas as pd
 import numpy as np

 def merge_company_data(csv_file):
    """
    Read CSV file, detect duplicates based on name, country, and city,
    and merge them while preserving maximum information.
    
    Parameters:
    csv_file (str): Path to the CSV file
    
    Returns:
    pandas.DataFrame: Merged and deduplicated dataset
    """
    # Read CSV file
    df = pd.read_csv(csv_file)
    
    # Function to select the non-empty value with maximum length
    def select_max_info(series):
        # Remove NaN and empty strings
        valid_values = [str(x) for x in series if pd.notna(x) and str(x).strip()]
        if not valid_values:
            return np.nan
        # Return the longest string
        return max(valid_values, key=len)
    
    # Create a duplicate identifier
    df['duplicate_group'] = df.apply(
        lambda x: f"{x['name']}_{x['country']}_{x['city']}".lower(), 
        axis=1
    )
    
    # Dictionary to store aggregation rules
    agg_rules = {
        'id': 'first',  # Keep the first ID
        'name': 'first',  # Name will be the same for duplicates
        'country': 'first',  # Country will be the same for duplicates
        'city': 'first',  # City will be the same for duplicates
        'data_source': lambda x: ', '.join(x.unique()),  # Combine data sources
        'player_address': select_max_info,
        'player_url': select_max_info
    }
    
    # Group by duplicate identifier and aggregate
    merged_df = df.groupby('duplicate_group', as_index=False).agg(agg_rules)
    
    # Remove the duplicate_group column
    merged_df = merged_df.drop('duplicate_group', axis=1)
    
    return merged_df

 def analyze_duplicates(original_df, merged_df):
    """
    Analyze the results of the merge operation.
    
    Parameters:
    original_df (pandas.DataFrame): Original dataset
    merged_df (pandas.DataFrame): Merged dataset
    """
    print("\nDuplicate Analysis:")
    print("-" * 50)
    print(f"Original number of entries: {len(original_df)}")
    print(f"Number of entries after merging: {len(merged_df)}")
    print(f"Number of duplicates removed: {len(original_df) - len(merged_df)}")

 # Example usage
 if __name__ == "__main__":
    # Sample data
    sample_data = '''id,name,country,city,data_source,player_address,player_url
 1A1B1E3I2L2T2Y_USA_EQZSO2668H,&yet,USA,Richland,crunchbase,140 Gage Blvd Suite 100,http://andyet.com
 1E1T1Y_United States_VXYGE2509U,&yet,USA,Richland,dealroom,,https://andyet.com
 2A1B1E1G1I1L1O2R1S1T1V1Y1Z_USA_PUAQJ6015A,0GLabs,USA,San Francisco,crunchbase,,https://0g.ai
 2A1B1E1G1I1L0001S1T1V1Y1Z_USA_PUouj6015A,0GLabs,USA,San Francisco,dealroom,,https://00g.ai'''
    
    # Save sample data to a temporary CSV file
    with open('temp_companies.csv', 'w') as f:
        f.write(sample_data)
    
    # Read and process the data
    try:
        # Read original data
        original_df = pd.read_csv('temp_companies.csv')
        
        # Process and merge duplicates
        merged_df = merge_company_data('temp_companies.csv')
        
        # Analyze results
        analyze_duplicates(original_df, merged_df)
        
        # Display merged results
        print("\nMerged Data:")
        print("-" * 50)
        print(merged_df.to_string())
        
        # Save merged results
        merged_df.to_csv('merged_companies.csv', index=False)
        print("\nMerged data has been saved to 'merged_companies.csv'")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
	import pandas as pd
	import numpy as np

	def merge_company_data(csv_file):
	"""
	Read CSV file, detect duplicates based on name, country, and city,
	and merge them while preserving maximum information.

	Parameters:
	csv_file (str): Path to the CSV file

	Returns:
	pandas.DataFrame: Merged and deduplicated dataset
	"""
	# Read CSV file
	df = pd.read_csv(csv_file)

	# Function to select the non-empty value with maximum length
	def select_max_info(series):
	# Remove NaN and empty strings
	valid_values = [str(x) for x in series if pd.notna(x) and str(x).strip()]
	if not valid_values:
	return np.nan
	# Return the longest string
	return max(valid_values, key=len)

	# Create a duplicate identifier
	df['duplicate_group'] = df.apply(
	lambda x: f"{x['name']}_{x['country']}_{x['city']}".lower(),
	axis=1
	)

	# Dictionary to store aggregation rules
	agg_rules = {
	'id': 'first', # Keep the first ID
	'name': 'first', # Name will be the same for duplicates
	'country': 'first', # Country will be the same for duplicates
	'city': 'first', # City will be the same for duplicates
	'data_source': lambda x: ', '.join(x.unique()), # Combine data sources
	'player_address': select_max_info,
	'player_url': select_max_info
	}

	# Group by duplicate identifier and aggregate
	merged_df = df.groupby('duplicate_group', as_index=False).agg(agg_rules)

	# Remove the duplicate_group column
	merged_df = merged_df.drop('duplicate_group', axis=1)

	return merged_df

	def analyze_duplicates(original_df, merged_df):
	"""
	Analyze the results of the merge operation.

	Parameters:
	original_df (pandas.DataFrame): Original dataset
	merged_df (pandas.DataFrame): Merged dataset
	"""
	print("\nDuplicate Analysis:")
	print("-" * 50)
	print(f"Original number of entries: {len(original_df)}")
	print(f"Number of entries after merging: {len(merged_df)}")
	print(f"Number of duplicates removed: {len(original_df) - len(merged_df)}")

	# Example usage
	if __name__ == "__main__":
	# Sample data
	sample_data = '''id,name,country,city,data_source,player_address,player_url
	1A1B1E3I2L2T2Y_USA_EQZSO2668H,&yet,USA,Richland,crunchbase,140 Gage Blvd Suite 100,http://andyet.com
	1E1T1Y_United States_VXYGE2509U,&yet,USA,Richland,dealroom,,https://andyet.com
	2A1B1E1G1I1L1O2R1S1T1V1Y1Z_USA_PUAQJ6015A,0GLabs,USA,San Francisco,crunchbase,,https://0g.ai
	2A1B1E1G1I1L0001S1T1V1Y1Z_USA_PUouj6015A,0GLabs,USA,San Francisco,dealroom,,https://00g.ai'''

	# Save sample data to a temporary CSV file
	with open('temp_companies.csv', 'w') as f:
	f.write(sample_data)

	# Read and process the data
	try:
	# Read original data
	original_df = pd.read_csv('temp_companies.csv')

	# Process and merge duplicates
	merged_df = merge_company_data('temp_companies.csv')

	# Analyze results
	analyze_duplicates(original_df, merged_df)

	# Display merged results
	print("\nMerged Data:")
	print("-" * 50)
	print(merged_df.to_string())

	# Save merged results
	merged_df.to_csv('merged_companies.csv', index=False)
	print("\nMerged data has been saved to 'merged_companies.csv'")

	except Exception as e:
	print(f"An error occurred: {str(e)}")