import os
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker
from faker.providers import BaseProvider
cost_variability = {
    "hourly_rate_range": (35, 65),   # Agent experience levels
    "peak_hour_multiplier": {
        "morning": 1.25,   # 8-11am
        "afternoon": 1.0,  # 11am-3pm
        "evening": 1.15,   # 3-7pm
        "night": 1.4       # 7pm-8am
    },
    "complexity_factors": {
        "Hardware": (0.8, 2.0),   # (min, max) multiplier
        "Software": (0.9, 1.5),
        "Service": (0.7, 1.8),
        "Training": (0.4, 1.3),
        "Other": (0.2, 1.2),
    },
    "overtime_penalty": 1.5,    # 50% extra after 8h
    "weekend_surcharge": 1.3    # 30% weekend premium
}
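# Illustrative sanity check (assumption: a worst-case combination, not part of the
# generator itself): the most expensive agent handling a maximally complex Hardware
# ticket at night on a weekend combines to roughly 65 * 1.4 * 2.0 * 1.3 ≈ 236.6 per
# hour of resolution time, before any overtime penalty.
_example_worst_case_rate = (
    cost_variability["hourly_rate_range"][1]
    * cost_variability["peak_hour_multiplier"]["night"]
    * cost_variability["complexity_factors"]["Hardware"][1]
    * cost_variability["weekend_surcharge"]
)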
# Progressive AI performance degradation configuration (billing cases only)
progressive_ai_degradation = {
    "april_2025": {
        "resolution_time_multiplier": 1.80,  # 80% increase in resolution time
        "ai_acceptance_drop": 0.15,          # 15% drop in acceptance
        "first_resolution_drop": 0.20        # 20% drop in FTR
    },
    "may_2025": {
        "resolution_time_multiplier": 3.5,   # 250% increase in resolution time
        "ai_acceptance_drop": 0.25,          # 25% drop in acceptance
        "first_resolution_drop": 0.35        # 35% drop in FTR
    }
}
# Product-level financial impact configuration
product_impacts = {
    "Hardware": {
        "sla_penalty_multiplier": 1.5,   # 50% higher penalties
        "upsell_probability": 0.15,
        "clv_base": 7000,
        "clv_std": 2000,
        "cost_multiplier": 1.2           # 20% higher resolution costs
    },
    "Software": {
        "sla_penalty_multiplier": 1.0,
        "upsell_probability": 0.25,
        "clv_base": 5000,
        "clv_std": 1500,
        "cost_multiplier": 1.0
    },
    "Service": {
        "sla_penalty_multiplier": 0.8,   # 20% lower penalties
        "upsell_probability": 0.35,
        "clv_base": 10000,
        "clv_std": 3000,
        "cost_multiplier": 0.9
    },
    "Training": {
        "sla_penalty_multiplier": 0.7,   # 30% lower penalties
        "upsell_probability": 0.45,
        "clv_base": 10000,
        "clv_std": 3000,
        "cost_multiplier": 0.9
    },
    "Other": {
        "sla_penalty_multiplier": 1,     # no penalty adjustment
        "upsell_probability": 0.5,
        "clv_base": 10000,
        "clv_std": 3000,
        "cost_multiplier": 1
    }
}
# Countries (mostly European, plus a few other major markets) with approximate populations in millions
european_countries = [
    ("Albania", 2.8),
    ("Andorra", 0.077),
    ("Armenia", 3.0),
    ("Austria", 8.9),
    ("Azerbaijan", 10.0),
    ("Belarus", 9.4),
    ("Belgium", 11.5),
    ("Bosnia and Herzegovina", 3.3),
    ("Bulgaria", 6.9),
    ("Croatia", 4.0),
    ("Cyprus", 1.2),
    ("Czech Republic", 10.7),
    ("Denmark", 5.8),
    ("Estonia", 1.3),
    ("Finland", 5.5),
    ("France", 67.4),
    ("Australia", 26.4),
    ("Canada", 40.1),
    ("Japan", 124.1),
    ("United States", 340),
    ("Georgia", 3.7),
    ("Germany", 83.2),
    ("Greece", 10.4),
    ("Hungary", 9.7),
    ("Iceland", 0.36),
    ("Ireland", 4.9),
    ("Italy", 59.3),
    ("Kazakhstan", 18.8),
    ("Kosovo", 1.8),
    ("Latvia", 1.9),
    ("Liechtenstein", 0.038),
    ("Lithuania", 2.8),
    ("Luxembourg", 0.63),
    ("Malta", 0.51),
    ("Moldova", 2.6),
    ("Monaco", 0.039),
    ("Montenegro", 0.62),
    ("Netherlands", 17.5),
    ("North Macedonia", 2.1),
    ("Norway", 5.4),
    ("Poland", 37.9),
    ("Portugal", 10.3),
    ("Romania", 19.2),
    # ("Russia", 144.1),
    ("San Marino", 0.034),
    ("Serbia", 6.7),
    ("Slovakia", 5.4),
    ("Slovenia", 2.1),
    ("Spain", 47.3),
    ("Sweden", 10.3),
    ("Switzerland", 8.6),
    ("Turkey", 84.3),
    ("Ukraine", 41.4),
    ("United Kingdom", 67.2),
    ("Vatican City", 0.0008)
]
# Extract countries and populations
countries, populations = zip(*european_countries)

# Custom provider for generating countries based on population distribution
class EuropeanCountryProvider(BaseProvider):
    def european_country(self):
        return random.choices(countries, weights=populations, k=1)[0]

# Initialize Faker
fake = Faker()
# Add the custom provider to Faker
fake.add_provider(EuropeanCountryProvider)
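# Quick illustration (assumption: for inspection only, not used downstream): the
# provider draws country names weighted by population, so large markets such as
# "United States" or "Germany" dominate while microstates like "Vatican City" are rare.
_sample_countries = [fake.european_country() for _ in range(3)]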
# Seed the random number generators for reproducible output
np.random.seed(42)
random.seed(42)
Faker.seed(42)
# --- Configuration ---
n_tickets = 24472
ai_ratio = 0.7              # 70% AI-assisted tickets
n_agents = 16               # 8 AI, 8 human
premium_ratio = 0.3         # 30% Premium support tickets
base_hourly_cost = 45       # Average agent hourly rate
ai_cost_multiplier = 0.4    # AI is 60% cheaper to operate
premium_upsell_rate = 0.25  # 25% upsell chance on premium tickets

# NEW: AI performance degradation configuration
ai_performance_degradation = {
    "ai_acceptance_drop": 0.15,        # 15% drop in AI suggestion acceptance
    "billing_csat_impact": -0.8,       # 0.8 point drop for billing cases
    "time_resolution_increase": 1.25,  # 25% increase in resolution time
    "first_resolution_drop": 0.20      # 20% drop in first-time resolution rate
}
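# Note: of the keys above, only "billing_csat_impact" is read later (in
# calculate_csat_score); the resolution-time and acceptance drops are driven by the
# month-specific progressive_ai_degradation settings instead.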
# --- Create Agents ---
ai_agents = [fake.unique.name() + " (AI)" for _ in range(n_agents // 2)]
human_agents = [fake.unique.name() for _ in range(n_agents // 2)]
all_agents = ai_agents + human_agents

# --- Generate Unified Dataset ---
data = {
    "status": [],
    "ticket_id": [],
    "priority": [],
    "source": [],
    "topic": [],
    "agent_group": [],
    "agent_name": [],
    "created_time": [],
    "expected_sla_to_resolve": [],
    "expected_sla_to_first_response": [],
    "first_response_time": [],
    "sla_for_first_response": [],
    "resolution_time": [],
    "sla_for_resolution": [],
    "close_time": [],
    "agent_interactions": [],   # Interaction quality scores (1-5)
    "interaction_notes": [],    # New: textual notes
    "survey_results": [],
    "product_group": [],
    "support_level": [],
    "country": [],
    "latitude": [],
    "longitude": [],
    "call_transcript": [],
    "compliance_greeting": [],
    "compliance_farewell": [],
    "compliance_data_leak": [],
    "call_sentiment_score": [],
    "compliance_score": [],
    # NEW FIELDS
    "ai_suggestion_accepted": [],    # Only for AI-assisted agents
    "first_time_resolution": [],     # Boolean: resolved on first contact
    "csat_score": [],                # Customer satisfaction score (1-10)
    "resolution_attempt_number": []  # How many attempts to resolve (1 = first time)
}
# Quality note templates
good_notes = [
    "Used AI suggestions to quickly identify root cause",
    "Automated troubleshooting steps applied",
    "Knowledge base article referenced for standard solution",
    "Escalated to specialist with detailed context",
    "Proactive follow-up scheduled"
]
bad_notes = [
    "Customer required multiple explanations",
    "Had to consult supervisor for guidance",
    "Solution required manual research",
    "Back-and-forth communication needed",
    "Documentation was unclear"
]
# Compliance phrase lists, substring-matched against the generated call transcripts
compliance_rules = {
    "greeting": ["Hello", "Good morning", "Thank you for calling"],
    "farewell": ["Have a wonderful day", "We appreciate your business", "Thank you for your time"],
    "data_protection_phrases": ["confirm the last 4 digits", "for verification"],
    "forbidden_phrases": ["I'm angry", "That's stupid", "I don't care"]
}
def generate_transcript(is_ai):
    """Generate a synthetic call transcript with compliance markers"""
    transcript = []

    # --- Greeting ---
    if is_ai or np.random.rand() < 0.8:  # AI agents always greet correctly; humans ~80% of the time
        transcript.append(np.random.choice([
            "Hello! Thank you for calling [Insurance Co]. How can I help you today?",
            "Good morning! You've reached [Insurance Co]. May I have your policy number?"
        ]))
    else:
        if np.random.rand() < 0.2:  # 20% blunt, non-compliant greeting
            transcript.append("Yeah, what do you need?")
        else:
            transcript.append("Hi, [Insurance Co], policy number?")

    # --- Main Conversation ---
    transcript.append(f"Customer: {fake.sentence()}")

    # AI agents reference the knowledge base
    if is_ai:
        transcript.append("System: Suggested response - 'Let me verify that in our system'")
        transcript.append("Agent: Let me verify that in our system...")
    else:
        transcript.append("Agent: Umm, let me check...")

    # --- Data Protection ---
    if is_ai or np.random.rand() < 0.9:
        transcript.append("Agent: Could you confirm the last 4 digits of your SSN for verification?")
    else:
        if np.random.rand() < 0.1:  # 10% data leak risk
            transcript.append("Agent: So your full SSN is 123-45-6789 right?")

    # --- Sentiment Control ---
    if not is_ai and np.random.rand() < 0.15:  # 15% frustration risk
        transcript.append("Agent: *sigh* I already explained this...")

    # --- Farewell ---
    if is_ai or np.random.rand() < 0.85:
        transcript.append(np.random.choice([
            "Thank you for calling [Insurance Co]. Have a wonderful day!",
            "We appreciate your business. Goodbye!"
        ]))
    else:
        transcript.append("Bye")

    return "\n".join(transcript)
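# The returned transcript is a newline-joined sequence of turns: a greeting, a
# "Customer: ..." line, one or two agent lines (AI-assisted agents echo a system
# suggestion), an optional identity-verification prompt, and a farewell. The compliance
# checks inside the generation loop below simply look for the compliance_rules phrases
# within this string.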
def calculate_financials(row):
    product = row['product_group']
    params = product_impacts[product]

    # Simple operational cost estimate (kept for reference; superseded by the dynamic cost model below)
    base_cost = base_hourly_cost * row['resolution_time']  # resolution_time is in hours
    if row['agent_group'] == "AI-Assisted":
        cost_multiplier = ai_cost_multiplier
    else:
        cost_multiplier = 1.0
    cost_per_ticket = base_cost * cost_multiplier * params['cost_multiplier']

    # SLA Penalties
    if row['sla_for_resolution'] == "Breached SLA":
        base_penalty = 150 if row['priority'] == "High" else 75
        sla_penalty = base_penalty * params['sla_penalty_multiplier']
    else:
        sla_penalty = 0

    # Upsell Revenue
    if row['support_level'] == "Premium" and np.random.rand() < params['upsell_probability']:
        upsell_revenue = np.random.choice([99, 199, 299]) * (1.5 if product == "Service" else 1.0)
    else:
        upsell_revenue = 0

    # CLV Calculations
    clv_base = np.random.normal(params['clv_base'], params['clv_std'])
    clv_risk = clv_base * (0.02 if row['survey_results'] < 3 else 0) * (1.2 if product == "Service" else 1.0)

    # --- Dynamic Hourly Rate ---
    agent_exp = hash(row['agent_name']) % 10  # 0-9 experience level (varies across runs unless PYTHONHASHSEED is fixed)
    base_rate = np.linspace(*cost_variability["hourly_rate_range"], 10)[agent_exp]

    # --- Time-of-Day Premium ---
    hour = row['created_time'].hour
    if 8 <= hour < 11:
        time_mult = cost_variability["peak_hour_multiplier"]["morning"]
    elif 11 <= hour < 15:
        time_mult = 1.0
    elif 15 <= hour < 19:
        time_mult = cost_variability["peak_hour_multiplier"]["evening"]
    else:
        time_mult = cost_variability["peak_hour_multiplier"]["night"]

    # --- Product Complexity ---
    complexity = np.random.uniform(*cost_variability["complexity_factors"][product])

    # --- Weekend/Overtime Costs ---
    is_weekend = row['created_time'].weekday() >= 5
    overtime = 1.0 + max(0, row['resolution_time'] - 8) / 24 * cost_variability["overtime_penalty"]

    # --- AI Cost Logic ---
    if row['agent_group'] == "AI-Assisted":
        cost_mult = ai_cost_multiplier * (0.9 + np.random.normal(0, 0.1))  # 10% AI variance
    else:
        cost_mult = 1.0 * (0.8 + np.random.normal(0, 0.2))  # 20% human variance

    # --- Final Calculation ---
    raw_cost = base_rate * row['resolution_time']  # resolution_time is in hours
    final_cost = raw_cost * time_mult * complexity * overtime * cost_mult
    if is_weekend:
        final_cost *= cost_variability["weekend_surcharge"]

    return pd.Series([
        final_cost,
        sla_penalty,
        upsell_revenue,
        clv_base,
        clv_risk
    ])
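# The returned Series is positional: it is later unpacked (via DataFrame.apply) into the
# columns operational_cost, sla_penalty_cost, upsell_revenue, customer_clv and clv_risk,
# so the order of the five values must stay in sync with that column list.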
# NEW: Function to determine if an issue is resolved on the first attempt
def calculate_first_time_resolution(topic, agent_group, ai_accepted=None, created_time=None):
    """Calculate if a ticket is resolved on first contact"""
    base_ftr_rate = 0.75  # 75% baseline first-time resolution

    # Topic impact on FTR
    topic_multipliers = {
        "Feature request": 0.9,
        "Bug report": 0.8,
        "Other": 0.85,
        "Billing Cases": 0.8,  # Normal performance before April 2025
        "Pricing and licensing": 0.7
    }
    ftr_prob = base_ftr_rate * topic_multipliers.get(topic, 0.8)

    # AI assistance impact
    if agent_group == "AI-Assisted":
        if ai_accepted:
            ftr_prob *= 1.1  # 10% boost when the AI suggestion is accepted
        else:
            ftr_prob *= 0.8  # 20% penalty when the AI suggestion is rejected

    # Apply progressive performance degradation ONLY to billing cases
    if topic == "Billing Cases" and created_time and agent_group == "AI-Assisted":
        if created_time >= datetime(2025, 5, 1):    # May 2025
            ftr_prob *= (1 - progressive_ai_degradation["may_2025"]["first_resolution_drop"])
        elif created_time >= datetime(2025, 4, 1):  # April 2025
            ftr_prob *= (1 - progressive_ai_degradation["april_2025"]["first_resolution_drop"])

    return np.random.rand() < ftr_prob
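# Worked example (illustrative, ignoring the random draw): a May 2025 AI-assisted
# "Billing Cases" ticket where the suggestion was accepted resolves on first contact
# with probability 0.75 * 0.8 * 1.1 * (1 - 0.35) ≈ 0.43, versus ≈ 0.66 before April 2025.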
# NEW: Function to calculate CSAT score
def calculate_csat_score(topic, first_time_resolution, agent_group, survey_results, created_time=None):
    """Calculate Customer Satisfaction (CSAT) score, 1-10"""
    base_csat = 7.2  # Average CSAT score

    # Topic-specific impact (billing cases are hit hard ONLY from April 2025)
    if topic == "Billing Cases" and created_time and created_time >= datetime(2025, 4, 1):
        base_csat += ai_performance_degradation["billing_csat_impact"]

    # First-time resolution bonus
    if first_time_resolution:
        base_csat += 0.8
    else:
        base_csat -= 0.5

    # Agent type impact
    if agent_group == "AI-Assisted":
        base_csat += 0.3  # Slight AI advantage

    # Correlation with survey results
    csat = base_csat + (survey_results - 3) * 0.4

    # Add some randomness and constrain to 1-10
    csat += np.random.normal(0, 0.5)
    return np.clip(csat, 1, 10)
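# Worked example (illustrative, before the ±N(0, 0.5) noise and clipping): an April 2025
# billing case resolved on first contact by an AI-assisted agent with a survey score of 4
# lands at 7.2 - 0.8 + 0.8 + 0.3 + (4 - 3) * 0.4 = 7.9.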
for i in range(n_tickets):
    # --- Common Fields ---
    ticket_id = f"{1000 + i:04d}"
    created_time = fake.date_time_between(start_date=datetime(2024, 6, 1), end_date=datetime(2025, 5, 30))
    is_premium = np.random.rand() < premium_ratio

    # Topic selection (keeping original distribution)
    topic = np.random.choice(["Feature request", "Bug report", "Other", "Billing Cases", "Pricing and licensing"])

    # Assign agent
    if np.random.rand() < ai_ratio:
        agent_name = np.random.choice(ai_agents)
        agent_group = "AI-Assisted"
    else:
        agent_name = np.random.choice(human_agents)
        agent_group = "Human Only"
    is_ai = agent_group == "AI-Assisted"

    # NEW: AI suggestion acceptance (only for AI-assisted agents)
    if is_ai:
        base_acceptance_rate = 0.85  # 85% baseline acceptance
        # Apply progressive degradation ONLY to billing cases
        if topic == "Billing Cases":
            if created_time >= datetime(2025, 5, 1):    # May 2025
                degraded_acceptance_rate = base_acceptance_rate * (1 - progressive_ai_degradation["may_2025"]["ai_acceptance_drop"])
            elif created_time >= datetime(2025, 4, 1):  # April 2025
                degraded_acceptance_rate = base_acceptance_rate * (1 - progressive_ai_degradation["april_2025"]["ai_acceptance_drop"])
            else:
                degraded_acceptance_rate = base_acceptance_rate  # Normal performance before April 2025
        else:
            degraded_acceptance_rate = base_acceptance_rate  # Normal performance for non-billing cases
        ai_suggestion_accepted = np.random.rand() < degraded_acceptance_rate
    else:
        ai_suggestion_accepted = None  # N/A for human-only agents

    # NEW: Calculate first-time resolution
    first_time_resolution = calculate_first_time_resolution(topic, agent_group, ai_suggestion_accepted, created_time)
    resolution_attempts = 1 if first_time_resolution else np.random.choice([2, 3, 4], p=[0.6, 0.3, 0.1])

    # --- Performance Logic (UPDATED with time-based degradation) ---
    # Resolution times - apply the increase ONLY from April 2025
    if is_ai:
        base_res_time = max(2, np.random.normal(3.5, 0.5))
        # Apply progressive performance degradation ONLY to billing cases
        if topic == "Billing Cases":
            if created_time >= datetime(2025, 5, 1):    # May 2025
                base_res_time *= progressive_ai_degradation["may_2025"]["resolution_time_multiplier"]
            elif created_time >= datetime(2025, 4, 1):  # April 2025
                base_res_time *= progressive_ai_degradation["april_2025"]["resolution_time_multiplier"]
            # Before April 2025: no degradation
        # Factor in AI acceptance
        if ai_suggestion_accepted:
            base_res_time *= 0.9  # 10% faster when the AI suggestion is accepted
        else:
            base_res_time *= 1.3  # 30% slower when the AI suggestion is rejected
    else:
        base_res_time = max(2, np.random.normal(6, 1))

    # Multiple attempts increase resolution time
    if not first_time_resolution:
        base_res_time *= resolution_attempts * 0.8

    if is_premium:
        res_time = base_res_time * 0.6
        support_level = "Premium"
    else:
        res_time = base_res_time
        support_level = "Basic"

    # First response time
    frt = max(0.1, np.random.normal(0.5, 0.2)) if is_ai else max(0.5, np.random.normal(2, 0.5))

    # --- Interaction Quality ---
    if is_ai and is_premium:
        # Best notes (AI + Premium)
        note_quality = np.random.choice([4, 5])  # 4-5/5
        notes = np.random.choice(good_notes, size=2, replace=False)
    elif is_ai:
        # Good notes (AI only) - but factor in acceptance
        if ai_suggestion_accepted:
            note_quality = np.random.choice([3, 4])  # 3-4/5
            notes = [np.random.choice(good_notes)]
        else:
            note_quality = np.random.choice([2, 3])  # 2-3/5 when the AI suggestion is rejected
            notes = [np.random.choice(bad_notes)]
    elif is_premium:
        # Medium notes (Premium only)
        note_quality = np.random.choice([2, 3])  # 2-3/5
        notes = [np.random.choice(good_notes + bad_notes)]
    else:
        # Poor notes (Human + Basic)
        note_quality = np.random.choice([1, 2])  # 1-2/5
        notes = np.random.choice(bad_notes, size=2, replace=False)

    # Format notes
    formatted_notes = " | ".join(notes)

    # Other metrics
    survey = np.random.randint(3, 6) if is_ai else np.random.randint(1, 5)

    # NEW: Calculate CSAT score
    csat_score = calculate_csat_score(topic, first_time_resolution, agent_group, survey, created_time)

    # --- Populate Data ---
    data["ticket_id"].append(ticket_id)
    data["status"].append(np.random.choice(["Resolved", "Closed", "In progress", "Open"], p=[0.3, 0.2, 0.4, 0.1]))
    data["priority"].append(np.random.choice(["Low", "Medium", "High"], p=[0.5, 0.3, 0.2]))
    data["source"].append(np.random.choice(["Phone", "Email", "Chat"], p=[0.15, 0.55, 0.3]))
    data["topic"].append(topic)
    data["agent_group"].append(agent_group)
    data["agent_name"].append(agent_name)
    data["created_time"].append(created_time)
    data["expected_sla_to_resolve"].append(np.random.randint(4, 24))
    data["expected_sla_to_first_response"].append(np.random.randint(1, 4))
    data["first_response_time"].append(frt)
    data["sla_for_first_response"].append("Within SLA" if frt <= data["expected_sla_to_first_response"][-1] else "Breached SLA")
    data["resolution_time"].append(res_time)
    data["sla_for_resolution"].append("Within SLA" if res_time <= data["expected_sla_to_resolve"][-1] else "Breached SLA")
    data["close_time"].append(created_time + timedelta(hours=res_time))
    data["agent_interactions"].append(note_quality)    # Quality score 1-5
    data["interaction_notes"].append(formatted_notes)  # Text notes
    data["survey_results"].append(survey)
    data["product_group"].append(np.random.choice(["Hardware", "Software", "Training", "Service", "Other"], p=[0.1, 0.15, 0.45, 0.25, 0.05]))
    data["support_level"].append(support_level)
    data["country"].append(fake.european_country())
    data["latitude"].append(fake.latitude())
    data["longitude"].append(fake.longitude())

    # NEW FIELDS
    data["ai_suggestion_accepted"].append(ai_suggestion_accepted)
    data["first_time_resolution"].append(first_time_resolution)
    data["csat_score"].append(round(csat_score, 1))
    data["resolution_attempt_number"].append(resolution_attempts)

    transcript = generate_transcript(is_ai)

    # AI Analysis Results
    compliance_greeting = any(phrase in transcript for phrase in compliance_rules["greeting"])
    compliance_farewell = any(phrase in transcript for phrase in compliance_rules["farewell"])
    data_leak_risk = any(term in transcript for term in ["SSN", "social security"]) and not any(
        phrase in transcript for phrase in compliance_rules["data_protection_phrases"])
    sentiment_score = np.clip(np.random.normal(0.8 if is_ai else 0.4, 0.2), 0, 1)  # 0-1 scale

    # Add to data dictionary
    data["call_transcript"].append(transcript)
    data["compliance_greeting"].append(compliance_greeting)
    data["compliance_farewell"].append(compliance_farewell)
    data["compliance_data_leak"].append(data_leak_risk)
    data["call_sentiment_score"].append(sentiment_score)
    data["compliance_score"].append(
        0.3 * compliance_greeting +
        0.2 * compliance_farewell +
        0.4 * (not data_leak_risk) +
        0.1 * sentiment_score
    )
customer_review = pd.DataFrame(data)
customer_review[[
    'operational_cost',
    'sla_penalty_cost',
    'upsell_revenue',
    'customer_clv',
    'clv_risk'
]] = customer_review.apply(calculate_financials, axis=1)
# --- Add Churn Risk ---
n_rows = len(customer_review)
customer_review['churn_risk'] = np.where(
    (customer_review['survey_results'] < 3) &
    (customer_review['sla_for_resolution'] == "Breached SLA"),
    np.random.uniform(0.15, 0.4, size=n_rows),  # 15-40% churn risk per ticket
    np.random.uniform(0.01, 0.1, size=n_rows)   # 1-10% baseline per ticket
)
# --- Update SLA Table ---
sla = customer_review[[
    "ticket_id", "expected_sla_to_resolve", "expected_sla_to_first_response",
    "first_response_time", "sla_for_first_response", "resolution_time",
    "sla_for_resolution", "survey_results", "sla_penalty_cost", "clv_risk",
    "interaction_notes", "first_time_resolution", "csat_score", "resolution_attempt_number"
]].copy()

# --- Split into Tables ---
tickets = customer_review[[
    "ticket_id", "status", "priority", "source", "topic",
    "created_time", "close_time", "product_group",
    "support_level", "country", "latitude", "longitude", "operational_cost",
    "call_transcript", "compliance_greeting", "compliance_data_leak",
    "call_sentiment_score", "compliance_score", "csat_score", "first_time_resolution"
]].copy()

agents = customer_review[[
    "ticket_id", "agent_group", "agent_name", "agent_interactions", "ai_suggestion_accepted"
]].copy()
tickets["created_time"] = tickets["created_time"].astype(str) | |
tickets["close_time"] = tickets["close_time"].astype(str) | |
# --- Save to Parquet --- | |
customer_review.to_parquet("./datasets/customer_review_axr.parquet", engine="pyarrow") | |
tickets.to_parquet("./datasets/tickets_axr.parquet", engine="pyarrow") | |
sla.to_parquet("./datasets/sla_axr.parquet", engine="pyarrow") | |
agents.to_parquet("./datasets/agents_axr.parquet", engine="pyarrow") | |
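# To inspect a saved table later (assumes pyarrow is installed):
#   df = pd.read_parquet("./datasets/tickets_axr.parquet")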
# --- Verification ---
print("\nPerformance Metrics Summary:")
print("=" * 50)
# Split metrics by time period for comparison
pre_april_2025 = customer_review[customer_review['created_time'] < datetime(2025, 4, 1)]
post_april_2025 = customer_review[customer_review['created_time'] >= datetime(2025, 4, 1)]

print("\nAI Suggestion Acceptance Rate:")
if len(pre_april_2025[pre_april_2025['agent_group'] == 'AI-Assisted']) > 0:
    pre_acceptance = pre_april_2025[pre_april_2025['agent_group'] == 'AI-Assisted']['ai_suggestion_accepted'].mean()
    print(f"Before April 2025: {pre_acceptance:.2%}")
if len(post_april_2025[post_april_2025['agent_group'] == 'AI-Assisted']) > 0:
    post_acceptance = post_april_2025[post_april_2025['agent_group'] == 'AI-Assisted']['ai_suggestion_accepted'].mean()
    print(f"April 2025 onwards: {post_acceptance:.2%}")

print("\nFirst-Time Resolution Rate:")
print("Before April 2025:")
if len(pre_april_2025) > 0:
    print(pre_april_2025.groupby('agent_group')['first_time_resolution'].mean())
print("April 2025 onwards:")
if len(post_april_2025) > 0:
    print(post_april_2025.groupby('agent_group')['first_time_resolution'].mean())

print("\nAverage Resolution Time by Agent Group:")
print("Before April 2025:")
if len(pre_april_2025) > 0:
    print(pre_april_2025.groupby('agent_group')['resolution_time'].mean())
print("April 2025 onwards:")
if len(post_april_2025) > 0:
    print(post_april_2025.groupby('agent_group')['resolution_time'].mean())

print("\nCSAT Score for Billing Cases:")
pre_billing = pre_april_2025[pre_april_2025['topic'] == 'Billing Cases']
post_billing = post_april_2025[post_april_2025['topic'] == 'Billing Cases']
if len(pre_billing) > 0:
    print(f"Before April 2025: {pre_billing['csat_score'].mean():.1f}")
if len(post_billing) > 0:
    print(f"April 2025 onwards: {post_billing['csat_score'].mean():.1f}")

print("\nOverall CSAT Score by Agent Group:")
print("Before April 2025:")
if len(pre_april_2025) > 0:
    print(pre_april_2025.groupby('agent_group')['csat_score'].mean())
print("April 2025 onwards:")
if len(post_april_2025) > 0:
    print(post_april_2025.groupby('agent_group')['csat_score'].mean())

print("\nInteraction Quality by Agent Type and Support Level:")
print(agents.merge(tickets[["ticket_id", "support_level"]], on="ticket_id")
      .groupby(["agent_group", "support_level"])["agent_interactions"].mean())