finloop · August 13, 2021 13:39
diff --git a/analyze_clusters_by_frequency.py b/analyze_clusters_by_frequency.py
 # Draw one pie chart per cluster showing how the rows of that cluster are
 # distributed over the values of `group_by`. Only the `n_first` most frequent
 # values get their own slice; all remaining values are merged into "other".
 #
 # Expects `df` (used here through the pandas DataFrame API) to already exist
 # in the enclosing scope. It is only read, never modified.
 import matplotlib.pyplot as plt
 from math import ceil, sqrt


 # Parameters for data. I assume data is in df.
 cluster_col = "cluster"  # Column with clusters
 group_by = "customer_city"  # Metric to group by "city" etc.
 n_first = 4  # Number of most popular entries of `group_by` in cluster to include


 clusters = df[cluster_col].unique()
 nclusters = len(clusters)

 # Plot: smallest square grid with room for one pie per cluster. `max(1, ...)`
 # keeps the grid valid when there are no clusters, and `squeeze=False` makes
 # `subplots` return a 2-D array even for a 1x1 grid, so `flatten()` always works.
 grid_side = max(1, ceil(sqrt(nclusters)))
 fig1, axs = plt.subplots(grid_side, grid_side, figsize=(40, 30), squeeze=False)
 axs = axs.flatten()

 for ax, cluster in zip(axs, clusters):
     # Rows of this cluster; only the grouping column is needed for counting.
     data = df.loc[df[cluster_col] == cluster, [group_by]].copy()

     # Rank the values by number of rows and relabel everything past the
     # n_first most frequent ones as "other". size() counts rows, so the
     # ranking does not depend on which column happens to come first in df
     # or on missing values in it.
     ranked = data.groupby(group_by).size().sort_values(ascending=False)
     other = ranked.index[n_first:]
     data.loc[data[group_by].isin(other), group_by] = "other"

     # Extract data and labels
     counts = data.groupby(group_by).size()
     labels = counts.index.to_numpy()

     ax.pie(
         counts.to_numpy(), labels=labels, autopct='%1.1f%%',
         shadow=True, startangle=90, textprops={'fontsize': 20},
     )
     ax.set_title(str(cluster), fontsize=25)

 # The square grid can have more cells than clusters; hide the empty ones.
 for ax in axs[nclusters:]:
     ax.set_visible(False)

 plt.show()
	# Draw one pie chart per cluster showing how the rows of that cluster are
	# distributed over the values of `group_by`. Only the `n_first` most frequent
	# values get their own slice; all remaining values are merged into "other".
	#
	# Expects `df` (used here through the pandas DataFrame API) to already exist
	# in the enclosing scope. It is only read, never modified.
	import matplotlib.pyplot as plt
	from math import ceil, sqrt


	# Parameters for data. I assume data is in df.
	cluster_col = "cluster"  # Column with clusters
	group_by = "customer_city"  # Metric to group by "city" etc.
	n_first = 4  # Number of most popular entries of `group_by` in cluster to include


	clusters = df[cluster_col].unique()
	nclusters = len(clusters)

	# Plot: smallest square grid with room for one pie per cluster. `max(1, ...)`
	# keeps the grid valid when there are no clusters, and `squeeze=False` makes
	# `subplots` return a 2-D array even for a 1x1 grid, so `flatten()` always works.
	grid_side = max(1, ceil(sqrt(nclusters)))
	fig1, axs = plt.subplots(grid_side, grid_side, figsize=(40, 30), squeeze=False)
	axs = axs.flatten()

	for ax, cluster in zip(axs, clusters):
		# Rows of this cluster; only the grouping column is needed for counting.
		data = df.loc[df[cluster_col] == cluster, [group_by]].copy()

		# Rank the values by number of rows and relabel everything past the
		# n_first most frequent ones as "other". size() counts rows, so the
		# ranking does not depend on which column happens to come first in df
		# or on missing values in it.
		ranked = data.groupby(group_by).size().sort_values(ascending=False)
		other = ranked.index[n_first:]
		data.loc[data[group_by].isin(other), group_by] = "other"

		# Extract data and labels
		counts = data.groupby(group_by).size()
		labels = counts.index.to_numpy()

		ax.pie(
			counts.to_numpy(), labels=labels, autopct='%1.1f%%',
			shadow=True, startangle=90, textprops={'fontsize': 20},
		)
		ax.set_title(str(cluster), fontsize=25)

	# The square grid can have more cells than clusters; hide the empty ones.
	for ax in axs[nclusters:]:
		ax.set_visible(False)

	plt.show()