Skip to content

Instantly share code, notes, and snippets.

@miguelvb
Created January 8, 2025 15:44
Show Gist options
  • Save miguelvb/e6479c0fb07740cd8117bcdda5feec2e to your computer and use it in GitHub Desktop.
Save miguelvb/e6479c0fb07740cd8117bcdda5feec2e to your computer and use it in GitHub Desktop.
claude script to merge companies

Question

write a python script using pandas dataframes that reads a csv file that contains information on a set of companies and detects duplicates. Once the duplicates are detected you must merge the information of the duplicates in a way that you retain the maximum amount of data. Duplicates are defined by having the same name, country and city fields. For example if two companies are called 'yey' and are in city 'paris' and country 'france' they are considered duplicates and have to be merged. If company 1 has data on its 'url' field and the other does not, we will use for the merge the value of the 'url' field corresponding to the company 1. If both companies have a field that is not empty, choose the field which has more characters in it. Example of csv content: ''' id,name,country,city,data_source,player_address,player_url 1A1B1E3I2L2T2Y_USA_EQZSO2668H,&yet,USA,Richland,crunchbase,140 Gage Blvd Suite 100,http://andyet.com 1E1T1Y_United States_VXYGE2509U,&yet,USA,Richland,dealroom,https://andyet.com 2A1B1E1G1I1L1O2R1S1T1V1Y1Z_USA_PUAQJ6015A,0GLabs,USA,San Francisco,crunchbase,https://0g.ai 2A1B1E1G1I1L0001S1T1V1Y1Z_USA_PUouj6015A,0GLabs,USA,San Francisco,dealroom,https://00g.ai '''

Answer:

see script.py Works great.

import pandas as pd
import numpy as np
def merge_company_data(csv_file):
"""
Read CSV file, detect duplicates based on name, country, and city,
and merge them while preserving maximum information.
Parameters:
csv_file (str): Path to the CSV file
Returns:
pandas.DataFrame: Merged and deduplicated dataset
"""
# Read CSV file
df = pd.read_csv(csv_file)
# Function to select the non-empty value with maximum length
def select_max_info(series):
# Remove NaN and empty strings
valid_values = [str(x) for x in series if pd.notna(x) and str(x).strip()]
if not valid_values:
return np.nan
# Return the longest string
return max(valid_values, key=len)
# Create a duplicate identifier
df['duplicate_group'] = df.apply(
lambda x: f"{x['name']}_{x['country']}_{x['city']}".lower(),
axis=1
)
# Dictionary to store aggregation rules
agg_rules = {
'id': 'first', # Keep the first ID
'name': 'first', # Name will be the same for duplicates
'country': 'first', # Country will be the same for duplicates
'city': 'first', # City will be the same for duplicates
'data_source': lambda x: ', '.join(x.unique()), # Combine data sources
'player_address': select_max_info,
'player_url': select_max_info
}
# Group by duplicate identifier and aggregate
merged_df = df.groupby('duplicate_group', as_index=False).agg(agg_rules)
# Remove the duplicate_group column
merged_df = merged_df.drop('duplicate_group', axis=1)
return merged_df
def analyze_duplicates(original_df, merged_df):
"""
Analyze the results of the merge operation.
Parameters:
original_df (pandas.DataFrame): Original dataset
merged_df (pandas.DataFrame): Merged dataset
"""
print("\nDuplicate Analysis:")
print("-" * 50)
print(f"Original number of entries: {len(original_df)}")
print(f"Number of entries after merging: {len(merged_df)}")
print(f"Number of duplicates removed: {len(original_df) - len(merged_df)}")
# Example usage
if __name__ == "__main__":
# Sample data
sample_data = '''id,name,country,city,data_source,player_address,player_url
1A1B1E3I2L2T2Y_USA_EQZSO2668H,&yet,USA,Richland,crunchbase,140 Gage Blvd Suite 100,http://andyet.com
1E1T1Y_United States_VXYGE2509U,&yet,USA,Richland,dealroom,,https://andyet.com
2A1B1E1G1I1L1O2R1S1T1V1Y1Z_USA_PUAQJ6015A,0GLabs,USA,San Francisco,crunchbase,,https://0g.ai
2A1B1E1G1I1L0001S1T1V1Y1Z_USA_PUouj6015A,0GLabs,USA,San Francisco,dealroom,,https://00g.ai'''
# Save sample data to a temporary CSV file
with open('temp_companies.csv', 'w') as f:
f.write(sample_data)
# Read and process the data
try:
# Read original data
original_df = pd.read_csv('temp_companies.csv')
# Process and merge duplicates
merged_df = merge_company_data('temp_companies.csv')
# Analyze results
analyze_duplicates(original_df, merged_df)
# Display merged results
print("\nMerged Data:")
print("-" * 50)
print(merged_df.to_string())
# Save merged results
merged_df.to_csv('merged_companies.csv', index=False)
print("\nMerged data has been saved to 'merged_companies.csv'")
except Exception as e:
print(f"An error occurred: {str(e)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment