Last active
April 7, 2025 18:05
-
-
Save hdary85/ec31af24482b6edd4afadb2a016538cc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# 1. Flag the top-10 rows of df_merged in a dedicated 0/1 column.
#    df_final_top10.index is used as row labels of df_merged.
top10_labels = df_final_top10.index
df_merged['highlight_top10'] = 0
df_merged.loc[top10_labels, 'highlight_top10'] = 1

# 2. Derive a three-level label used for colouring the PCA scatter plot:
#    - 'Normal'   : default for every row (blue)
#    - 'Anomalie' : rows where dbscan_anomaly == 1 (red)
#    - 'Top 10'   : rows flagged above; applied last so it overrides
#                   'Anomalie' for rows that match both (gold)
is_anomaly = df_merged['dbscan_anomaly'] == 1
is_top10 = df_merged['highlight_top10'] == 1
df_merged['plot_group'] = 'Normal'
df_merged.loc[is_anomaly, 'plot_group'] = 'Anomalie'
df_merged.loc[is_top10, 'plot_group'] = 'Top 10'

# 3. Scatter plot on the first two PCA coordinates, one colour and one
#    marker style per group.
group_palette = {'Normal': 'blue', 'Anomalie': 'red', 'Top 10': 'gold'}
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_merged,
    x='pca_1',
    y='pca_2',
    hue='plot_group',
    palette=group_palette,
    style='plot_group',
    s=100,
    alpha=0.7,
    ax=ax,
)
ax.set_title("📊 Anomalies DBSCAN avec mise en évidence du Top 10 (PCA)")
ax.set_xlabel("Composante principale 1")
ax.set_ylabel("Composante principale 2")
ax.legend(title="Groupe")
ax.grid(True)
plt.show()
# Step 1: PCA loadings = weight of each input variable on each PCA axis.
# Only the first two components are kept because only pca_1 / pca_2 are
# used below; slicing also keeps the two column names valid when the PCA
# was fitted with more than two components.
# NOTE(review): assumes X.columns are the features `pca` was fitted on, in
# the same order, and that pca_1 / pca_2 are its first two components.
loadings = pd.DataFrame(
    pca.components_[:2].T,
    index=X.columns,
    columns=['PC1', 'PC2'],
)

# Step 2: for every top-10 client, find the two variables that contribute
# most to its position in the PCA plane.
# Rows are selected with df_final_top10.index (not a separately maintained
# list of labels) so that their order is exactly the row order of
# df_final_top10, which the positional assignment in step 3 relies on.
top10_pca = df_merged.loc[df_final_top10.index, ['pca_1', 'pca_2']]

# Each entry holds the names of the 2 variables that best explain one
# client's PCA position.
top_pca_vars = []
for pc1, pc2 in top10_pca.itertuples(index=False, name=None):
    # Dot product between the (pca_1, pca_2) position and the loadings:
    # relative importance of every feature for this client.
    contribs = pc1 * loadings['PC1'] + pc2 * loadings['PC2']
    top2 = contribs.abs().sort_values(ascending=False).head(2).index
    top_pca_vars.append(top2)

# Step 3: store the two variable names next to each top-10 client.
df_final_top10['pca_var_1'] = [names[0] for names in top_pca_vars]
df_final_top10['pca_var_2'] = [names[1] for names in top_pca_vars]

# Display
print("🔎 Top 10 avec variables responsables de leur isolement (selon PCA) :")
print(df_final_top10[['PARTY_KEY', 'suspicion_score', 'pca_var_1', 'pca_var_2']])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment