Skip to content

Instantly share code, notes, and snippets.

@hdary85
Last active April 7, 2025 18:05
Show Gist options
  • Save hdary85/ec31af24482b6edd4afadb2a016538cc to your computer and use it in GitHub Desktop.
Save hdary85/ec31af24482b6edd4afadb2a016538cc to your computer and use it in GitHub Desktop.
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Flag the rows of df_merged that belong to the top 10
#    (highlight_top10 is 1 for those rows, 0 for every other row).
top10_labels = df_final_top10.index
df_merged['highlight_top10'] = 0
df_merged.loc[top10_labels, 'highlight_top10'] = 1

# 2. Build the three-way grouping used to colour the PCA scatter plot:
#    - 'Normal'   -> default for every row (blue)
#    - 'Anomalie' -> rows with dbscan_anomaly == 1 (red)
#    - 'Top 10'   -> rows flagged above (gold)
#    The 'Top 10' assignment runs last so it overrides 'Anomalie' for rows
#    that satisfy both conditions.
is_dbscan_anomaly = df_merged['dbscan_anomaly'] == 1
is_top10 = df_merged['highlight_top10'] == 1
df_merged['plot_group'] = 'Normal'
df_merged.loc[is_dbscan_anomaly, 'plot_group'] = 'Anomalie'
df_merged.loc[is_top10, 'plot_group'] = 'Top 10'

# 3. Scatter plot on the two PCA coordinates; the group drives both the
#    colour (hue) and the marker shape (style).
group_colors = {'Normal': 'blue', 'Anomalie': 'red', 'Top 10': 'gold'}
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df_merged,
    x='pca_1',
    y='pca_2',
    hue='plot_group',
    style='plot_group',
    palette=group_colors,
    s=100,
    alpha=0.7,
)
plt.title("📊 Anomalies DBSCAN avec mise en évidence du Top 10 (PCA)")
plt.xlabel("Composante principale 1")
plt.ylabel("Composante principale 2")
plt.legend(title="Groupe")
plt.grid(True)
plt.show()
# Step 1: PCA loadings, i.e. the weight of each original variable on each
# PCA axis. Rows are the columns of X; columns are the two components.
loadings = pd.DataFrame(
    pca.components_.T,
    index=X.columns,
    columns=['PC1', 'PC2']
)

# Step 2: for each top-10 client, find the two variables that weigh most on
# its position in the PCA plane.
# Rows are selected with df_final_top10.index (the same labels used to flag
# the top 10 in df_merged) so that the results below come out in the same
# row order as df_final_top10 and can be written back positionally.
top10_pca = df_merged.loc[df_final_top10.index, ['pca_1', 'pca_2']]

# One entry per top-10 client: the two variable names with the largest
# absolute contribution.
top_pca_vars = []
for idx, row in top10_pca.iterrows():
    # Combine the client's (pca_1, pca_2) coordinates with the loadings to
    # get one signed score per variable.
    contribs = row['pca_1'] * loadings['PC1'] + row['pca_2'] * loadings['PC2']
    # Rank by magnitude: the sign only says in which direction the variable pulls.
    top2 = contribs.abs().sort_values(ascending=False).head(2).index
    top_pca_vars.append(top2)

# Step 3: store the two variable names next to each top-10 client.
df_final_top10['pca_var_1'] = [names[0] for names in top_pca_vars]
df_final_top10['pca_var_2'] = [names[1] for names in top_pca_vars]

# Display
print("🔎 Top 10 avec variables responsables de leur isolement (selon PCA) :")
print(df_final_top10[['PARTY_KEY', 'suspicion_score', 'pca_var_1', 'pca_var_2']])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment