|
import pandas as pd |
|
import numpy as np |
|
|
|
def merge_company_data(csv_file): |
|
""" |
|
Read CSV file, detect duplicates based on name, country, and city, |
|
and merge them while preserving maximum information. |
|
|
|
Parameters: |
|
csv_file (str): Path to the CSV file |
|
|
|
Returns: |
|
pandas.DataFrame: Merged and deduplicated dataset |
|
""" |
|
# Read CSV file |
|
df = pd.read_csv(csv_file) |
|
|
|
# Function to select the non-empty value with maximum length |
|
def select_max_info(series): |
|
# Remove NaN and empty strings |
|
valid_values = [str(x) for x in series if pd.notna(x) and str(x).strip()] |
|
if not valid_values: |
|
return np.nan |
|
# Return the longest string |
|
return max(valid_values, key=len) |
|
|
|
# Create a duplicate identifier |
|
df['duplicate_group'] = df.apply( |
|
lambda x: f"{x['name']}_{x['country']}_{x['city']}".lower(), |
|
axis=1 |
|
) |
|
|
|
# Dictionary to store aggregation rules |
|
agg_rules = { |
|
'id': 'first', # Keep the first ID |
|
'name': 'first', # Name will be the same for duplicates |
|
'country': 'first', # Country will be the same for duplicates |
|
'city': 'first', # City will be the same for duplicates |
|
'data_source': lambda x: ', '.join(x.unique()), # Combine data sources |
|
'player_address': select_max_info, |
|
'player_url': select_max_info |
|
} |
|
|
|
# Group by duplicate identifier and aggregate |
|
merged_df = df.groupby('duplicate_group', as_index=False).agg(agg_rules) |
|
|
|
# Remove the duplicate_group column |
|
merged_df = merged_df.drop('duplicate_group', axis=1) |
|
|
|
return merged_df |
|
|
|
def analyze_duplicates(original_df, merged_df): |
|
""" |
|
Analyze the results of the merge operation. |
|
|
|
Parameters: |
|
original_df (pandas.DataFrame): Original dataset |
|
merged_df (pandas.DataFrame): Merged dataset |
|
""" |
|
print("\nDuplicate Analysis:") |
|
print("-" * 50) |
|
print(f"Original number of entries: {len(original_df)}") |
|
print(f"Number of entries after merging: {len(merged_df)}") |
|
print(f"Number of duplicates removed: {len(original_df) - len(merged_df)}") |
|
|
|
# Example usage |
|
if __name__ == "__main__": |
|
# Sample data |
|
sample_data = '''id,name,country,city,data_source,player_address,player_url |
|
1A1B1E3I2L2T2Y_USA_EQZSO2668H,&yet,USA,Richland,crunchbase,140 Gage Blvd Suite 100,http://andyet.com |
|
1E1T1Y_United States_VXYGE2509U,&yet,USA,Richland,dealroom,,https://andyet.com |
|
2A1B1E1G1I1L1O2R1S1T1V1Y1Z_USA_PUAQJ6015A,0GLabs,USA,San Francisco,crunchbase,,https://0g.ai |
|
2A1B1E1G1I1L0001S1T1V1Y1Z_USA_PUouj6015A,0GLabs,USA,San Francisco,dealroom,,https://00g.ai''' |
|
|
|
# Save sample data to a temporary CSV file |
|
with open('temp_companies.csv', 'w') as f: |
|
f.write(sample_data) |
|
|
|
# Read and process the data |
|
try: |
|
# Read original data |
|
original_df = pd.read_csv('temp_companies.csv') |
|
|
|
# Process and merge duplicates |
|
merged_df = merge_company_data('temp_companies.csv') |
|
|
|
# Analyze results |
|
analyze_duplicates(original_df, merged_df) |
|
|
|
# Display merged results |
|
print("\nMerged Data:") |
|
print("-" * 50) |
|
print(merged_df.to_string()) |
|
|
|
# Save merged results |
|
merged_df.to_csv('merged_companies.csv', index=False) |
|
print("\nMerged data has been saved to 'merged_companies.csv'") |
|
|
|
except Exception as e: |
|
print(f"An error occurred: {str(e)}") |