Skip to content

Instantly share code, notes, and snippets.

View etechoptimist's full-sized avatar

Eduardo Toledo etechoptimist

View GitHub Profile
@etechoptimist
etechoptimist / dvc_feature_engineering_production
Created December 13, 2025 00:50
dvc_feature_engineering_production
# Define the DVC stage that builds the October 2025 engineered features.
#   deps: the raw analysis CSV and the feature-engineering script
#   outs: the engineered-features parquet file
# Everything after the options is the command the stage runs.
dvc stage add --name feature_engineering_analysis_oct_2025 \
    --deps data/raw/analysis_oct_2025/analysis.csv \
    --deps src/feature_engineering.py \
    --outs data/features/engineered_features_analysis_oct_2025.parquet \
    python src/feature_engineering.py \
    --input data/raw/analysis_oct_2025/analysis.csv \
    --output data/features/engineered_features_analysis_oct_2025.parquet
@etechoptimist
etechoptimist / dvc_tracking_octoberdataset
Created December 13, 2025 00:34
dvc_tracking_octoberdataset
# Version the October 2025 monitoring dataset together with the pipeline
# files that consume it, then tag the snapshot and upload the data.

# Track the raw dataset with DVC; this writes analysis.csv.dvc next to the
# file and adds the file itself to the directory's .gitignore.
dvc add data/raw/analysis_oct_2025/analysis.csv

# Record the current state of DVC-tracked data *before* staging dvc.lock,
# so Git captures the up-to-date lock file.
dvc commit

# Stage everything first: `git commit` only records what was already added.
# Paths use forward slashes so they work in POSIX shells as well as on Windows.
git add dvc.yaml dvc.lock src/feature_engineering.py data/features/.
git add data/raw/analysis_oct_2025/analysis.csv.dvc data/raw/analysis_oct_2025/.gitignore
git commit -m "October 2025 monitoring dataset"
git tag -a v-oct2025-monitoring -m "Monitoring dataset for October 2025"

# Upload the DVC-tracked data to the configured remote storage.
dvc push
@etechoptimist
etechoptimist / dvc_monitoring_nannyml
Created December 13, 2025 00:25
dvc_monitoring_nannyml
# Define the DVC stage that produces the October 2025 monitoring report.
#   deps: engineered October features, the training filtered features,
#         the ensemble model, and the monitoring script
#   outs: the reports/monitoring/oct_2025 directory
# Each line ends with a backslash so the shell reads this as ONE command;
# without them every line would be executed as a separate command.
dvc stage add -n monitoring_oct_2025 \
    -d data/features/engineered_features_analysis_oct_2025.parquet \
    -d data/features/filtered_features.parquet \
    -d models/ensemble_model.joblib \
    -d src/monitoring_nannyml.py \
    -o reports/monitoring/oct_2025 \
    python src/monitoring_nannyml.py \
    --input data/features/engineered_features_analysis_oct_2025.parquet \
    --output reports/monitoring/oct_2025/performance_estimation.html
@etechoptimist
etechoptimist / run_univariate_drift
Created December 12, 2025 23:15
run_univariate_drift
# ----------------------------------------------------------------------------
# UNIVARIATE DRIFT + RCA
# ----------------------------------------------------------------------------
def run_univariate_drift(reference_df, analysis_df, selected_features, perf_results, output_dir: Path):
logger.info("Running univariate drift + RCA...")
uv = nml.UnivariateDriftCalculator(
column_names=selected_features,
continuous_methods=["kolmogorov_smirnov"], # Best for continuous: detects distribution shape and location changes
categorical_methods=["jensen_shannon"], # Best for categorical: information-theoretic, symmetric distance
@etechoptimist
etechoptimist / run_multivariate_drift
Created December 12, 2025 23:08
run_multivariate_drift
# ----------------------------------------------------------------------------
# MULTIVARIATE DRIFT (PCA)
# ----------------------------------------------------------------------------
def run_multivariate_drift(reference_df, analysis_df, selected_features, output_dir: Path):
logger.info("Running PCA multivariate drift...")
@etechoptimist
etechoptimist / performance_estimation
Created December 12, 2025 22:18
performance_estimation
# ----------------------------------------------------------------------------
# PERFORMANCE ESTIMATION (CBPE ONLY ON ANALYSIS DATA)
# ----------------------------------------------------------------------------
def run_performance_estimation(reference_df, analysis_df, output_dir: Path):
import time
logger.info("Running CBPE performance estimation...")
logger.info(f"Analysis dataset: {len(analysis_df)} records")
print(reference_df["y_true"].nunique())
@etechoptimist
etechoptimist / monitoring_score
Created December 12, 2025 22:14
monitoring_score
# ----------------------------------------------------------------------------
# MODEL SCORING
# ----------------------------------------------------------------------------
def score_model(df: pd.DataFrame, selected_features, model_path: str, is_reference: bool = False):
logger.info(f"Scoring model: {model_path}")
model = load(model_path)
X = df[selected_features].fillna(0)
df["y_pred_proba"] = model.predict_proba(X)[:, 1]
@etechoptimist
etechoptimist / load_and_filter_engineered_monitoring
Created December 12, 2025 22:02
load_and_filter_engineered_monitoring
# ----------------------------------------------------------------------------
# LOAD ENGINEERED MONITORING DATA AND FILTER BY selected_features
# ----------------------------------------------------------------------------
def load_and_filter_engineered_monitoring(engineered_path: str, selected_features):
logger.info(f"Loading engineered monitoring dataset: {engineered_path}")
df = pd.read_parquet(engineered_path)
# Ensure timestamp exists
if "timestamp" in df.columns:
df["timestamp"] = pd.to_datetime(df["timestamp"])
@etechoptimist
etechoptimist / monitoring_load_selected_features
Created December 12, 2025 21:52
monitoring_load_selected_features
# ----------------------------------------------------------------------------
# LOAD TRAINING FILTERED FEATURES → DETERMINE SELECTED FEATURE LIST
# ----------------------------------------------------------------------------
def load_selected_features(filtered_training_path: str):
df = pd.read_parquet(filtered_training_path)
# all model features EXCEPT identifiers, typology, and transaction_type
selected = [
col for col in df.columns
if col not in ["session_id", "table_id", "patron_id", "timestamp", "typology", "transaction_type"]