# Imports assumed by this script (they were not present in the original snippet);
# the Keras objects are taken from tensorflow.keras here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping

# --- 1. Fill missing values in col1 from col2 ---
df['col1'] = df['col1'].fillna(df['col2'])
# --- 2. Preprocess the TV2 DataFrame ---
# TV2 is expected to contain the following UPPERCASE columns:
# PARTY_KEY, ROLE, PARTY_TYPE_CODE, RISQUE_SECTEUR, BASE_CURR_AMOUNT, etc.
TV2['PARTY_TYPE_CODE'] = TV2['PARTY_TYPE_CODE'].astype(int)

# General aggregation of the financial indicators per PARTY_KEY
tv2_agg = TV2.groupby('PARTY_KEY').agg(
    TV_COUNT=('PARTY_KEY', 'count'),
    TV_TOTAL_AMOUNT=('BASE_CURR_AMOUNT', 'sum'),
    TV_AVG_AMOUNT=('BASE_CURR_AMOUNT', 'mean'),
    # RISQUE_SECTEUR is a client-level attribute, so the first value seen is kept
    TV_RISQUE_SECTEUR=('RISQUE_SECTEUR', 'first'),
    PARTY_TYPE_CODE=('PARTY_TYPE_CODE', 'first')
).reset_index()

# --- 3. Aggregate by role (incoming and outgoing) with counts and sums ---
# Compute both the transaction count and the total amount per role
tv2_roles = TV2.groupby(['PARTY_KEY', 'ROLE'])['BASE_CURR_AMOUNT'].agg(['count', 'sum']).unstack(fill_value=0)
# Flatten the columns into explicit names, e.g. TV_ORIGINATOR_COUNT, TV_ORIGINATOR_SUM, etc.
tv2_roles.columns = [f"TV_{role.upper()}_{agg.upper()}" for agg, role in tv2_roles.columns.to_flat_index()]
tv2_roles.reset_index(inplace=True)

# Merge the TV2 aggregations
tv2_agg = pd.merge(tv2_agg, tv2_roles, on='PARTY_KEY', how='left')
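# Guard (a sketch, not in the original): if a role never occurs in TV2, its unstacked
# columns are missing and the later feature selection would raise KeyError; create any
# absent expected role columns with a value of 0.
for col in ['TV_ORIGINATOR_COUNT', 'TV_ORIGINATOR_SUM',
            'TV_BENEFICIARY_COUNT', 'TV_BENEFICIARY_SUM']:
    if col not in tv2_agg.columns:
        tv2_agg[col] = 0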
# --- 4. Preprocess the Especes2 DataFrame ---
# Especes2 must contain: PARTY_KEY, NBR_TRX, TOTAL
Especes2['NBR_TRX'] = pd.to_numeric(Especes2['NBR_TRX'], errors='coerce')
Especes2['TOTAL'] = pd.to_numeric(Especes2['TOTAL'], errors='coerce')
# Rename for clarity
Especes2.rename(columns={'NBR_TRX': 'ESPECES_NBR_TRX', 'TOTAL': 'ESPECES_TOTAL'}, inplace=True)

# --- 5. Merge the TV2 and Especes2 aggregations ---
df_merged = pd.merge(tv2_agg, Especes2, on='PARTY_KEY', how='outer')

# Replace missing values with 0 for the numeric indicators
cols_to_fill = ['TV_COUNT', 'TV_TOTAL_AMOUNT', 'TV_AVG_AMOUNT', 'TV_RISQUE_SECTEUR',
                'TV_ORIGINATOR_COUNT', 'TV_BENEFICIARY_COUNT', 'TV_ORIGINATOR_SUM', 'TV_BENEFICIARY_SUM',
                'ESPECES_NBR_TRX', 'ESPECES_TOTAL']
for col in cols_to_fill:
    if col in df_merged.columns:
        df_merged[col] = df_merged[col].fillna(0)

# Make sure PARTY_TYPE_CODE is populated
df_merged['PARTY_TYPE_CODE'] = df_merged['PARTY_TYPE_CODE'].fillna(0).astype(int)

# --- 6. Encode the client type via one-hot encoding ---
# Convert PARTY_TYPE_CODE into indicator variables (e.g. PTYPE_1 and PTYPE_2)
df_merged = pd.get_dummies(df_merged, columns=['PARTY_TYPE_CODE'], prefix='PTYPE', drop_first=False)

# --- 7. Create an additional feature ---
# Ratio of total cash deposits to total wire-transfer volume
df_merged['RATIO_ESPECES_TO_TV'] = df_merged.apply(
    lambda row: row['ESPECES_TOTAL'] / row['TV_TOTAL_AMOUNT'] if row['TV_TOTAL_AMOUNT'] > 0 else 0,
    axis=1
)
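# Equivalent vectorised form (a sketch; same semantics as the apply above, faster on
# large frames). Division by zero yields inf, which `where` then replaces with 0:
# df_merged['RATIO_ESPECES_TO_TV'] = (
#     df_merged['ESPECES_TOTAL']
#     .div(df_merged['TV_TOTAL_AMOUNT'])
#     .where(df_merged['TV_TOTAL_AMOUNT'] > 0, 0)
# )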
# --- 8. Build the feature set ---
features = df_merged[['TV_COUNT', 'TV_TOTAL_AMOUNT', 'TV_AVG_AMOUNT', 'TV_RISQUE_SECTEUR',
                      'TV_ORIGINATOR_COUNT', 'TV_BENEFICIARY_COUNT', 'TV_ORIGINATOR_SUM', 'TV_BENEFICIARY_SUM',
                      'ESPECES_NBR_TRX', 'ESPECES_TOTAL', 'RATIO_ESPECES_TO_TV']].copy()
# Add the indicator variables for the client type
one_hot_cols = [col for col in df_merged.columns if col.startswith('PTYPE_')]
features = pd.concat([features, df_merged[one_hot_cols]], axis=1)

# Scale the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
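# Note: the scaler also standardises the PTYPE_ dummy columns. That is harmless for
# the tree-based IsolationForest, and it keeps every input on a comparable scale for
# the distance-based DBSCAN and the autoencoder below.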
# --- 9. Anomaly detection with IsolationForest ---
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df_merged['iforest_label'] = iso_forest.fit_predict(features_scaled)
df_merged['iforest_score'] = iso_forest.decision_function(features_scaled)
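# Reminder of the scikit-learn conventions: fit_predict returns -1 for outliers and
# 1 for inliers; decision_function is lower (more negative) for more anomalous points.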
# --- 10. Anomaly detection with an autoencoder ---
X_train, X_val = train_test_split(features_scaled, test_size=0.2, random_state=42)

input_dim = features_scaled.shape[1]
encoding_dim = max(int(input_dim / 2), 1)

# Symmetric encoder/decoder with a bottleneck of half, then a quarter, of the input width
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
encoded = Dense(max(int(encoding_dim / 2), 1), activation='relu')(encoded)
decoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
autoencoder.fit(X_train, X_train,
                epochs=100,
                batch_size=32,
                shuffle=True,
                validation_data=(X_val, X_val),
                callbacks=[early_stop],
                verbose=0)

# Score every client by its reconstruction error; flag the worst 5% as anomalies
reconstructions = autoencoder.predict(features_scaled)
reconstruction_errors = np.mean(np.square(features_scaled - reconstructions), axis=1)
df_merged['ae_reconstruction_error'] = reconstruction_errors
ae_threshold = np.percentile(reconstruction_errors, 95)
df_merged['ae_label'] = (df_merged['ae_reconstruction_error'] > ae_threshold).astype(int)
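# A common refinement (a sketch, not in the original): derive the threshold from the
# validation split only, so the cutoff is not computed on the same rows it labels:
# val_errors = np.mean(np.square(X_val - autoencoder.predict(X_val)), axis=1)
# ae_threshold = np.percentile(val_errors, 95)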
# --- 11. Anomaly detection with DBSCAN ---
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(features_scaled)
df_merged['dbscan_label'] = dbscan_labels
# DBSCAN marks noise points with the label -1
df_merged['dbscan_anomaly'] = (df_merged['dbscan_label'] == -1).astype(int)
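# Optional eps tuning (a sketch, not in the original script): eps=0.5 is a generic
# default. A k-distance plot is a common heuristic for choosing eps: sort each point's
# distance to its k-th neighbour and look for the "elbow".
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=5).fit(features_scaled)
distances, _ = nn.kneighbors(features_scaled)
plt.figure(figsize=(8, 4))
plt.plot(np.sort(distances[:, -1]))
plt.ylabel('Distance to 5th nearest neighbour')
plt.title('k-distance plot (elbow suggests a value for eps)')
plt.show()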
# --- 12. Combine the detections and create the "alert_reason" field ---
df_merged['combined_anomaly'] = (
    (df_merged['iforest_label'] == -1) |
    (df_merged['ae_label'] == 1) |
    (df_merged['dbscan_anomaly'] == 1)
).astype(int)

def determine_alert_reason(row):
    reasons = []
    if row['iforest_label'] == -1:
        reasons.append("IsolationForest")
    if row['ae_label'] == 1:
        reasons.append("Autoencoder")
    if row['dbscan_anomaly'] == 1:
        reasons.append("DBSCAN")
    return ("Anomaly detected by " + ", ".join(reasons)) if reasons else ""

df_merged['alert_reason'] = df_merged.apply(determine_alert_reason, axis=1)

# --- 13. Display and save the results ---
print("Total number of clients analysed:", df_merged.shape[0])
print("Suspicious clients (IsolationForest):", (df_merged['iforest_label'] == -1).sum())
print("Suspicious clients (autoencoder):", df_merged['ae_label'].sum())
print("Suspicious clients (DBSCAN):", df_merged['dbscan_anomaly'].sum())
print("Suspicious clients (combined):", df_merged['combined_anomaly'].sum())

suspicious_clients = df_merged[df_merged['combined_anomaly'] == 1]
print("\nSample of suspicious clients:")
print(suspicious_clients[['PARTY_KEY', 'TV_COUNT', 'TV_TOTAL_AMOUNT', 'ESPECES_NBR_TRX', 'ESPECES_TOTAL',
                          'iforest_score', 'ae_reconstruction_error', 'alert_reason']].head(10))

plt.figure(figsize=(8, 5))
plt.hist(reconstruction_errors, bins=50, color='gray', edgecolor='black')
plt.axvline(ae_threshold, color='red', linestyle='--', label=f'AE threshold (95th percentile: {ae_threshold:.3f})')
plt.title("Distribution of reconstruction errors (autoencoder)")
plt.xlabel("Reconstruction error")
plt.ylabel("Frequency")
plt.legend()
plt.show()

df_merged.to_csv('anomaly_detection_final.csv', index=False)
# --- Compute a combined score for ranking ---
# The combined score is defined as (reconstruction error) - (IsolationForest score):
# a high value indicates more anomalous behaviour (a large reconstruction error
# combined with a strongly negative IsolationForest score).
df_merged['combined_score'] = df_merged['ae_reconstruction_error'] - df_merged['iforest_score']
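# Caveat (a sketch, not in the original): the two scores live on different scales, so
# a normalised combination often ranks more robustly, e.g. with min-max scaling:
# from sklearn.preprocessing import MinMaxScaler
# norm = MinMaxScaler().fit_transform(df_merged[['ae_reconstruction_error', 'iforest_score']])
# df_merged['combined_score'] = norm[:, 0] - norm[:, 1]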
# --- Keep only the detected anomalies and retain the top 5 ---
suspicious_clients = df_merged[df_merged['combined_anomaly'] == 1]
top5 = suspicious_clients.sort_values('combined_score', ascending=False).head(5)
print("\nTop 5 suspicious clients:")
print(top5[['PARTY_KEY', 'combined_score', 'alert_reason']])

# --- Bar plot of the top 5 ---
plt.figure(figsize=(10, 6))
# Cast keys to str so the bars are placed categorically, not by numeric key value
plt.bar(top5['PARTY_KEY'].astype(str), top5['combined_score'], color='red')
plt.xlabel('PARTY_KEY')
plt.ylabel('Combined anomaly score')
plt.title('Top 5 suspicious clients')
plt.xticks(rotation=45)
# Annotate each bar with its alert_reason
for idx, row in top5.iterrows():
    plt.text(str(row['PARTY_KEY']), row['combined_score'], row['alert_reason'],
             ha='center', va='bottom', fontsize=8)
plt.tight_layout()
plt.show()