bogged-broker · December 30, 2025 04:56
diff --git a/audio_pattern_learner.py b/audio_pattern_learner.py
 """
 audio_pattern_learner.py

 Analyzes audio performance records to identify statistically significant patterns
 that correlate with viral audio success. Provides actionable recommendations for
 TTS and voice-sync engines.

 Version 1: Heuristic/statistical analysis (no deep learning)
 Future: Can be upgraded to RL-based continuous learning system
 """

 import json
 from typing import Dict, List, Optional, Tuple
 from dataclasses import dataclass, asdict
 from collections import defaultdict
 import statistics
 from pathlib import Path


 @dataclass
 class AudioMetrics:
    """
    Raw audio feature measurements.

    Apart from ``avg_pitch_hz``, the field names match the keys that
    ``AudioPatternLearner.analyze_niche_platform`` reads from each record's
    ``audio_features`` dict. The class is not instantiated in the visible
    part of this file.
    """
    pace_wpm: float  # Speaking pace (reported as "WPM" elsewhere in this file)
    avg_pitch_hz: float  # Average pitch; not read by the analysis code in this file
    pitch_variance: float
    pitch_jump_count: int  # Number of significant pitch changes
    pause_density: float  # Pauses per minute
    avg_pause_duration_ms: float
    beat_alignment_score: float  # 0-1, how well synced to music
    emphasis_peak_count: int  # Number of vocal emphasis points
    
    
 @dataclass
 class PerformanceMetrics:
    """
    Video performance outcomes.

    Of these fields, only ``viral_score`` is read by the analysis code in
    this file (as ``record["performance"]["viral_score"]``), where it decides
    the winner/loser split.
    """
    completion_rate: float  # 0-1
    retention_curve: List[float]  # Retention at 10%, 20%, ..., 100%
    engagement_score: float  # Composite: likes, comments, shares
    viral_score: float  # Composite virality metric
    

 @dataclass
 class AudioProfile:
    """
    Recommended audio configuration for a niche/platform.

    Instances are built by ``AudioPatternLearner._build_audio_profile`` from
    the "winner" group's audio features, converted with
    ``dataclasses.asdict`` for the JSON cache, and rebuilt from that cache
    with ``AudioProfile(**profile_dict)``.
    """
    niche: str
    platform: str

    # Core recommendations
    target_pace_wpm: float  # Mean pace of the winner group
    pace_tolerance: float  # +/- range (winner pace std-dev, capped at 20)

    pitch_baseline_hz: float  # Currently a fixed 150.0 placeholder
    pitch_variance_target: float
    pitch_jump_frequency: str  # "low", "medium", "high"

    pause_density_target: float  # Pauses per minute
    pause_duration_ms: Tuple[float, float]  # (min, max)

    beat_alignment_importance: str  # "critical", "important", "optional"
    beat_alignment_threshold: float

    emphasis_strategy: str  # "sparse", "moderate", "frequent"
    emphasis_points_per_min: float  # Filled with the winners' mean emphasis_peak_count

    # Supporting data
    confidence_score: float  # 0-1, based on sample size
    sample_size: int
    top_performers_analyzed: int

    # Explainability
    key_differentiators: List[str]  # What separates winners from losers
    anti_patterns: List[str]  # What to avoid
    

 class AudioPatternLearner:
    """
    Analyzes audio performance data to extract viral patterns.
    
    Architecture:
    1. Data ingestion from audio_performance_store
    2. Statistical analysis: winners vs losers
    3. Pattern clustering by niche/platform
    4. Profile generation with explainability
    
    Future RL upgrade path:
    - Replace statistical thresholds with learned reward functions
    - Implement multi-armed bandit for A/B testing recommendations
    - Add temporal decay for concept drift (trends change over time)
    - Continuous learning loop with performance feedback
    """
    
    def __init__(self, data_dir: str = "./audio_performance_data"):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(exist_ok=True)
        
        # Thresholds for winner/loser classification
        self.VIRAL_THRESHOLD = 0.75  # Top 25% are "winners"
        self.MIN_SAMPLE_SIZE = 10  # Minimum records for reliable analysis
        
        # Statistical significance thresholds
        self.SIGNIFICANCE_DELTA = 0.15  # 15% difference = significant
        
    def load_performance_records(self) -> List[Dict]:
        """Load all performance records from storage"""
        records = []
        record_file = self.data_dir / "performance_records.jsonl"
        
        if record_file.exists():
            with open(record_file, 'r') as f:
                for line in f:
                    records.append(json.loads(line))
        
        return records
    
    def classify_performance(self, viral_score: float, all_scores: List[float]) -> str:
        """Classify a record as winner/loser based on percentile"""
        if not all_scores:
            return "unknown"
        
        sorted_scores = sorted(all_scores)
        threshold_idx = int(len(sorted_scores) * self.VIRAL_THRESHOLD)
        threshold = sorted_scores[threshold_idx] if threshold_idx < len(sorted_scores) else sorted_scores[-1]
        
        return "winner" if viral_score >= threshold else "loser"
    
    def calculate_delta(self, winners: List[float], losers: List[float]) -> Dict:
        """
        Calculate statistical difference between winner and loser groups.
        
        Returns delta metrics and significance assessment.
        """
        if not winners or not losers:
            return {"significant": False, "delta": 0, "winner_avg": 0, "loser_avg": 0}
        
        winner_avg = statistics.mean(winners)
        loser_avg = statistics.mean(losers)
        
        # Avoid division by zero
        baseline = max(abs(loser_avg), 0.01)
        delta_pct = (winner_avg - loser_avg) / baseline
        
        return {
            "significant": abs(delta_pct) >= self.SIGNIFICANCE_DELTA,
            "delta": delta_pct,
            "winner_avg": winner_avg,
            "loser_avg": loser_avg,
            "winner_std": statistics.stdev(winners) if len(winners) > 1 else 0,
            "loser_std": statistics.stdev(losers) if len(losers) > 1 else 0,
        }
    
    def analyze_niche_platform(self, records: List[Dict], niche: str, platform: str) -> Optional[AudioProfile]:
        """
        Analyze records for specific niche/platform combination.
        
        Core algorithm:
        1. Filter records by niche/platform
        2. Classify into winners/losers by viral_score percentile
        3. Calculate deltas for each audio feature
        4. Identify significant differentiators
        5. Generate recommended profile
        """
        # Filter relevant records
        filtered = [r for r in records 
                   if r.get("niche") == niche and r.get("platform") == platform]
        
        if len(filtered) < self.MIN_SAMPLE_SIZE:
            return None  # Insufficient data
        
        # Extract viral scores for classification
        viral_scores = [r["performance"]["viral_score"] for r in filtered]
        
        # Separate winners and losers
        winners = []
        losers = []
        
        for record in filtered:
            classification = self.classify_performance(
                record["performance"]["viral_score"],
                viral_scores
            )
            
            if classification == "winner":
                winners.append(record)
            else:
                losers.append(record)
        
        if not winners or not losers:
            return None  # Need both groups for comparison
        
        # Extract audio metrics for each group
        winner_metrics = defaultdict(list)
        loser_metrics = defaultdict(list)
        
        for w in winners:
            audio = w["audio_features"]
            winner_metrics["pace"].append(audio["pace_wpm"])
            winner_metrics["pitch_variance"].append(audio["pitch_variance"])
            winner_metrics["pitch_jumps"].append(audio["pitch_jump_count"])
            winner_metrics["pause_density"].append(audio["pause_density"])
            winner_metrics["pause_duration"].append(audio["avg_pause_duration_ms"])
            winner_metrics["beat_alignment"].append(audio["beat_alignment_score"])
            winner_metrics["emphasis"].append(audio["emphasis_peak_count"])
        
        for l in losers:
            audio = l["audio_features"]
            loser_metrics["pace"].append(audio["pace_wpm"])
            loser_metrics["pitch_variance"].append(audio["pitch_variance"])
            loser_metrics["pitch_jumps"].append(audio["pitch_jump_count"])
            loser_metrics["pause_density"].append(audio["pause_density"])
            loser_metrics["pause_duration"].append(audio["avg_pause_duration_ms"])
            loser_metrics["beat_alignment"].append(audio["beat_alignment_score"])
            loser_metrics["emphasis"].append(audio["emphasis_peak_count"])
        
        # Calculate deltas for each feature
        deltas = {}
        for feature in winner_metrics.keys():
            deltas[feature] = self.calculate_delta(
                winner_metrics[feature],
                loser_metrics[feature]
            )
        
        # Identify key differentiators (features with significant deltas)
        key_differentiators = []
        anti_patterns = []
        
        for feature, delta_info in deltas.items():
            if delta_info["significant"]:
                direction = "higher" if delta_info["delta"] > 0 else "lower"
                key_differentiators.append(
                    f"{feature}: {direction} by {abs(delta_info['delta'])*100:.1f}% "
                    f"({delta_info['winner_avg']:.2f} vs {delta_info['loser_avg']:.2f})"
                )
                
                # Anti-patterns are opposite of winning patterns
                opposite_direction = "lower" if delta_info["delta"] > 0 else "higher"
                anti_patterns.append(f"Avoid {opposite_direction} {feature}")
        
        # Generate recommended profile
        profile = self._build_audio_profile(
            niche=niche,
            platform=platform,
            winner_metrics=winner_metrics,
            deltas=deltas,
            key_differentiators=key_differentiators,
            anti_patterns=anti_patterns,
            sample_size=len(filtered),
            top_performers=len(winners)
        )
        
        return profile
    
    def _build_audio_profile(self, niche: str, platform: str, 
                            winner_metrics: Dict, deltas: Dict,
                            key_differentiators: List[str],
                            anti_patterns: List[str],
                            sample_size: int,
                            top_performers: int) -> AudioProfile:
        """
        Assemble the recommended profile from the winner group's statistics.

        Args:
            niche: Niche the profile is for.
            platform: Platform the profile is for.
            winner_metrics: Feature name -> list of winner values. Reads the
                keys "pace", "pitch_variance", "pitch_jumps", "pause_density",
                "pause_duration", "beat_alignment" and "emphasis".
            deltas: Feature name -> delta dict; only "beat_alignment" is read.
            key_differentiators: Passed through to the profile unchanged.
            anti_patterns: Passed through to the profile unchanged.
            sample_size: Number of records analyzed; drives the confidence.
            top_performers: Number of records in the winner group.

        Returns:
            The populated AudioProfile.
        """
        import math

        def bucket(value, first_cut, second_cut, labels):
            """Pick labels[0], labels[1] or labels[2] by the two cut points."""
            if value < first_cut:
                return labels[0]
            if value < second_cut:
                return labels[1]
            return labels[2]

        # Pace: winner mean, tolerance = winner std-dev capped at 20 WPM
        # (falls back to 10 when there is only a single winner).
        pace_values = winner_metrics["pace"]
        target_pace = statistics.mean(pace_values)
        if len(pace_values) > 1:
            pace_std = statistics.stdev(pace_values)
        else:
            pace_std = 10
        pace_tolerance = min(pace_std, 20)

        # Pitch: the baseline is a fixed placeholder, not derived from data.
        pitch_baseline = 150.0
        pitch_variance_target = statistics.mean(winner_metrics["pitch_variance"])
        pitch_jump_freq = bucket(
            statistics.mean(winner_metrics["pitch_jumps"]),
            5, 15, ("low", "medium", "high"),
        )

        # Pauses: mean density, and the min/max of the winners' durations.
        pause_density_target = statistics.mean(winner_metrics["pause_density"])
        pause_samples = winner_metrics["pause_duration"]
        pause_duration_range = (min(pause_samples), max(pause_samples))

        # Beat alignment: "critical" only when winners beat losers by a
        # significant margin above 20%; otherwise graded by the winner mean.
        beat_alignment_avg = statistics.mean(winner_metrics["beat_alignment"])
        beat_delta = deltas["beat_alignment"]
        if beat_delta["significant"] and beat_delta["delta"] > 0.2:
            beat_importance, beat_threshold = "critical", 0.8
        elif beat_alignment_avg > 0.6:
            beat_importance, beat_threshold = "important", 0.6
        else:
            beat_importance, beat_threshold = "optional", 0.4

        # Emphasis: graded by the winners' mean emphasis count.
        emphasis_avg = statistics.mean(winner_metrics["emphasis"])
        emphasis_strategy = bucket(
            emphasis_avg, 3, 8, ("sparse", "moderate", "frequent")
        )

        # Confidence grows logarithmically with sample size, reaching 1.0 at
        # 100 samples.
        confidence = min(math.log10(sample_size) / math.log10(100), 1.0)

        return AudioProfile(
            niche=niche,
            platform=platform,
            target_pace_wpm=target_pace,
            pace_tolerance=pace_tolerance,
            pitch_baseline_hz=pitch_baseline,
            pitch_variance_target=pitch_variance_target,
            pitch_jump_frequency=pitch_jump_freq,
            pause_density_target=pause_density_target,
            pause_duration_ms=pause_duration_range,
            beat_alignment_importance=beat_importance,
            beat_alignment_threshold=beat_threshold,
            emphasis_strategy=emphasis_strategy,
            emphasis_points_per_min=emphasis_avg,
            confidence_score=confidence,
            sample_size=sample_size,
            top_performers_analyzed=top_performers,
            key_differentiators=key_differentiators,
            anti_patterns=anti_patterns,
        )
    
    def learn_all_patterns(self) -> Dict[str, AudioProfile]:
        """
        Analyze all available data and generate profiles for each niche/platform.
        
        Returns dict mapping "niche:platform" -> AudioProfile
        """
        records = self.load_performance_records()
        
        if not records:
            return {}
        
        # Identify unique niche/platform combinations
        combinations = set()
        for record in records:
            niche = record.get("niche", "unknown")
            platform = record.get("platform", "unknown")
            combinations.add((niche, platform))
        
        # Analyze each combination
        profiles = {}
        for niche, platform in combinations:
            profile = self.analyze_niche_platform(records, niche, platform)
            if profile:
                key = f"{niche}:{platform}"
                profiles[key] = profile
        
        # Cache profiles to disk
        self._save_profiles(profiles)
        
        return profiles
    
    def _save_profiles(self, profiles: Dict[str, AudioProfile]):
        """Persist learned profiles to disk"""
        profile_file = self.data_dir / "audio_profiles.json"
        
        serializable = {
            key: asdict(profile) 
            for key, profile in profiles.items()
        }
        
        with open(profile_file, 'w') as f:
            json.dump(serializable, f, indent=2)
    
    def _load_profiles(self) -> Dict[str, AudioProfile]:
        """Load cached profiles from disk"""
        profile_file = self.data_dir / "audio_profiles.json"
        
        if not profile_file.exists():
            return {}
        
        with open(profile_file, 'r') as f:
            data = json.load(f)
        
        profiles = {}
        for key, profile_dict in data.items():
            profiles[key] = AudioProfile(**profile_dict)
        
        return profiles
    
    def get_recommended_audio_profile(self, niche: str, platform: str) -> Optional[AudioProfile]:
        """
        API: Get recommended audio profile for specific niche/platform.
        
        Returns cached profile if available, otherwise triggers learning.
        Falls back to generic profile if specific combination lacks data.
        """
        key = f"{niche}:{platform}"
        
        # Try loading cached profiles
        profiles = self._load_profiles()
        
        if key in profiles:
            return profiles[key]
        
        # Not cached, trigger learning
        all_profiles = self.learn_all_patterns()
        
        if key in all_profiles:
            return all_profiles[key]
        
        # Fallback: try platform-generic profile
        platform_profiles = [p for k, p in all_profiles.items() if k.endswith(f":{platform}")]
        if platform_profiles:
            # Return highest confidence profile for this platform
            return max(platform_profiles, key=lambda p: p.confidence_score)
        
        # Fallback: try niche-generic profile
        niche_profiles = [p for k, p in all_profiles.items() if k.startswith(f"{niche}:")]
        if niche_profiles:
            return max(niche_profiles, key=lambda p: p.confidence_score)
        
        return None  # No data available
    
    def explain_profile(self, profile: AudioProfile) -> str:
        """
        Generate human-readable explanation of why this profile works.
        
        Useful for debugging and building intuition.
        """
        explanation = f"""
 Audio Profile for {profile.niche} on {profile.platform}
 {'='*60}

 Sample Size: {profile.sample_size} videos analyzed
 Top Performers: {profile.top_performers_analyzed}
 Confidence: {profile.confidence_score*100:.1f}%

 CORE RECOMMENDATIONS:
 --------------------
 Pace: {profile.target_pace_wpm:.1f} WPM (±{profile.pace_tolerance:.1f})
 Pitch Variation: {profile.pitch_variance_target:.2f} Hz variance
 Pitch Jumps: {profile.pitch_jump_frequency} frequency
 Pauses: {profile.pause_density_target:.1f} per minute, {profile.pause_duration_ms[0]:.0f}-{profile.pause_duration_ms[1]:.0f}ms each
 Beat Alignment: {profile.beat_alignment_importance} (threshold: {profile.beat_alignment_threshold:.2f})
 Emphasis: {profile.emphasis_strategy} strategy, ~{profile.emphasis_points_per_min:.1f} per minute

 KEY DIFFERENTIATORS (what makes winners win):
 ----------------------------------------------
 """
        for diff in profile.key_differentiators:
            explanation += f"• {diff}\n"
        
        explanation += "\nANTI-PATTERNS (what to avoid):\n"
        explanation += "-------------------------------\n"
        for anti in profile.anti_patterns:
            explanation += f"• {anti}\n"
        
        return explanation


 # =============================================================================
 # FUTURE RL UPGRADE PATHS
 # =============================================================================
 """
 Reinforcement Learning Integration (Future Version 2):

 1. REWARD FUNCTION:
   - Replace fixed viral_score threshold with learned value function
   - Multi-objective: completion_rate, engagement, virality
   - Temporal credit assignment: which audio features at which timestamps drove retention

 2. EXPLORATION/EXPLOITATION:
   - Multi-armed bandit for A/B testing profiles
   - Upper Confidence Bound (UCB) for balancing tried-and-true vs experimental
   - Thompson sampling for Bayesian optimization

 3. CONTINUOUS LEARNING:
   - Online learning: update profiles after each video performance
   - Concept drift detection: identify when trends shift
   - Temporal decay: older data weighted less (trends change)
   - Catastrophic forgetting prevention: maintain ensemble of time-windowed models

 4. CONTEXT-AWARE BANDITS:
   - Contextual features: time of day, season, current events
   - Personalization: audience demographics, viewing history
   - Transfer learning: leverage patterns across similar niches

 5. POLICY GRADIENT METHODS:
   - Direct optimization of audio parameters
   - Differentiable audio synthesis pipeline
   - Actor-critic for continuous action spaces (pace, pitch, etc.)

 6. ARCHITECTURE:
   - Replace AudioPatternLearner with RLAudioOptimizer
   - Add replay buffer for experience replay
   - Add policy network (actor) and value network (critic)
   - Add exploration noise (epsilon-greedy or entropy bonus)

 Example RL pseudocode:

 class RLAudioOptimizer:
    def __init__(self):
        self.policy_net = AudioPolicyNetwork()  # Maps (niche, platform) -> audio params
        self.value_net = AudioValueNetwork()    # Estimates expected virality
        self.replay_buffer = ReplayBuffer()
        
    def get_audio_profile(self, niche, platform, explore=True):
        state = encode_state(niche, platform)
        
        if explore and random() < epsilon:
            return sample_random_profile()  # Exploration
        
        return self.policy_net(state)  # Exploitation
    
    def update(self, video_id, audio_profile, performance_metrics):
        # Store experience
        self.replay_buffer.add(state, audio_profile, reward, next_state)
        
        # Sample batch and update networks
        batch = self.replay_buffer.sample()
        policy_loss = compute_policy_gradient(batch)
        value_loss = compute_td_error(batch)
        
        optimize(policy_loss + value_loss)

 This enables true adaptive learning that improves over time rather than
 fixed statistical analysis.
 """


 # =============================================================================
 # CLI FOR TESTING
 # =============================================================================
 if __name__ == "__main__":
    # Manual smoke test: learn from whatever is stored in the default data
    # directory, list the resulting profiles, then explain one recommendation.
    pattern_learner = AudioPatternLearner()

    print("Learning patterns from performance data...")
    learned_profiles = pattern_learner.learn_all_patterns()

    print(f"\nLearned {len(learned_profiles)} audio profiles:")
    for combo_key, combo_profile in learned_profiles.items():
        print(f"\n{combo_key}:")
        print(f"  Pace: {combo_profile.target_pace_wpm:.1f} WPM")
        print(f"  Beat alignment: {combo_profile.beat_alignment_importance}")
        print(f"  Confidence: {combo_profile.confidence_score*100:.1f}%")

    # Example lookup for one specific niche/platform pair.
    print("\n" + "="*60)
    print("Example: Getting recommendation for 'tech_tips' on 'tiktok'")
    recommendation = pattern_learner.get_recommended_audio_profile("tech_tips", "tiktok")

    if not recommendation:
        print("No profile available for this niche/platform combination.")
    else:
        print(pattern_learner.explain_profile(recommendation))
	"""
	audio_pattern_learner.py

	Analyzes audio performance records to identify statistically significant patterns
	that correlate with viral audio success. Provides actionable recommendations for
	TTS and voice-sync engines.

	Version 1: Heuristic/statistical analysis (no deep learning)
	Future: Can be upgraded to RL-based continuous learning system
	"""

	import json
	from typing import Dict, List, Optional, Tuple
	from dataclasses import dataclass, asdict
	from collections import defaultdict
	import statistics
	from pathlib import Path


	@dataclass
	class AudioMetrics:
	"""Raw audio feature measurements.

	Apart from avg_pitch_hz, the field names match the keys that
	AudioPatternLearner.analyze_niche_platform reads from each record's
	"audio_features" dict.
	"""
	pace_wpm: float # Speaking pace (reported as "WPM" elsewhere in this file)
	avg_pitch_hz: float # Average pitch; not read by the analysis code in this file
	pitch_variance: float
	pitch_jump_count: int # Number of significant pitch changes
	pause_density: float # Pauses per minute
	avg_pause_duration_ms: float
	beat_alignment_score: float # 0-1, how well synced to music
	emphasis_peak_count: int # Number of vocal emphasis points


	@dataclass
	class PerformanceMetrics:
	"""Video performance outcomes.

	Only viral_score is read by the analysis code in this file (as
	record["performance"]["viral_score"]) to split winners from losers.
	"""
	completion_rate: float # 0-1
	retention_curve: List[float] # Retention at 10%, 20%, ..., 100%
	engagement_score: float # Composite: likes, comments, shares
	viral_score: float # Composite virality metric


	@dataclass
	class AudioProfile:
	"""Recommended audio configuration for a niche/platform.

	Built by AudioPatternLearner._build_audio_profile from the winner group's
	audio features and converted with dataclasses.asdict for the JSON cache.
	"""
	niche: str
	platform: str

	# Core recommendations
	target_pace_wpm: float # Mean pace of the winner group
	pace_tolerance: float # +/- range (winner pace std-dev, capped at 20)

	pitch_baseline_hz: float # Currently a fixed 150.0 placeholder
	pitch_variance_target: float
	pitch_jump_frequency: str # "low", "medium", "high"

	pause_density_target: float # Pauses per minute
	pause_duration_ms: Tuple[float, float] # (min, max)

	beat_alignment_importance: str # "critical", "important", "optional"
	beat_alignment_threshold: float

	emphasis_strategy: str # "sparse", "moderate", "frequent"
	emphasis_points_per_min: float # Filled with the winners' mean emphasis_peak_count

	# Supporting data
	confidence_score: float # 0-1, based on sample size
	sample_size: int
	top_performers_analyzed: int

	# Explainability
	key_differentiators: List[str] # What separates winners from losers
	anti_patterns: List[str] # What to avoid


	class AudioPatternLearner:
	"""
	Analyzes audio performance data to extract viral patterns.

	Architecture:
	1. Data ingestion from audio_performance_store
	2. Statistical analysis: winners vs losers
	3. Pattern clustering by niche/platform
	4. Profile generation with explainability

	Future RL upgrade path:
	- Replace statistical thresholds with learned reward functions
	- Implement multi-armed bandit for A/B testing recommendations
	- Add temporal decay for concept drift (trends change over time)
	- Continuous learning loop with performance feedback
	"""

	def __init__(self, data_dir: str = "./audio_performance_data"):
	self.data_dir = Path(data_dir)
	self.data_dir.mkdir(exist_ok=True)

	# Thresholds for winner/loser classification
	self.VIRAL_THRESHOLD = 0.75 # Top 25% are "winners"
	self.MIN_SAMPLE_SIZE = 10 # Minimum records for reliable analysis

	# Statistical significance thresholds
	self.SIGNIFICANCE_DELTA = 0.15 # 15% difference = significant

	def load_performance_records(self) -> List[Dict]:
	"""Load all performance records from storage"""
	records = []
	record_file = self.data_dir / "performance_records.jsonl"

	if record_file.exists():
	with open(record_file, 'r') as f:
	for line in f:
	records.append(json.loads(line))

	return records

	def classify_performance(self, viral_score: float, all_scores: List[float]) -> str:
	"""Classify a record as winner/loser based on percentile"""
	if not all_scores:
	return "unknown"

	sorted_scores = sorted(all_scores)
	threshold_idx = int(len(sorted_scores) * self.VIRAL_THRESHOLD)
	threshold = sorted_scores[threshold_idx] if threshold_idx < len(sorted_scores) else sorted_scores[-1]

	return "winner" if viral_score >= threshold else "loser"

	def calculate_delta(self, winners: List[float], losers: List[float]) -> Dict:
	"""
	Calculate statistical difference between winner and loser groups.

	Returns delta metrics and significance assessment.
	"""
	if not winners or not losers:
	return {"significant": False, "delta": 0, "winner_avg": 0, "loser_avg": 0}

	winner_avg = statistics.mean(winners)
	loser_avg = statistics.mean(losers)

	# Avoid division by zero
	baseline = max(abs(loser_avg), 0.01)
	delta_pct = (winner_avg - loser_avg) / baseline

	return {
	"significant": abs(delta_pct) >= self.SIGNIFICANCE_DELTA,
	"delta": delta_pct,
	"winner_avg": winner_avg,
	"loser_avg": loser_avg,
	"winner_std": statistics.stdev(winners) if len(winners) > 1 else 0,
	"loser_std": statistics.stdev(losers) if len(losers) > 1 else 0,
	}

	def analyze_niche_platform(self, records: List[Dict], niche: str, platform: str) -> Optional[AudioProfile]:
	"""
	Analyze records for specific niche/platform combination.

	Core algorithm:
	1. Filter records by niche/platform
	2. Classify into winners/losers by viral_score percentile
	3. Calculate deltas for each audio feature
	4. Identify significant differentiators
	5. Generate recommended profile
	"""
	# Filter relevant records
	filtered = [r for r in records
	if r.get("niche") == niche and r.get("platform") == platform]

	if len(filtered) < self.MIN_SAMPLE_SIZE:
	return None # Insufficient data

	# Extract viral scores for classification
	viral_scores = [r["performance"]["viral_score"] for r in filtered]

	# Separate winners and losers
	winners = []
	losers = []

	for record in filtered:
	classification = self.classify_performance(
	record["performance"]["viral_score"],
	viral_scores
	)

	if classification == "winner":
	winners.append(record)
	else:
	losers.append(record)

	if not winners or not losers:
	return None # Need both groups for comparison

	# Extract audio metrics for each group
	winner_metrics = defaultdict(list)
	loser_metrics = defaultdict(list)

	for w in winners:
	audio = w["audio_features"]
	winner_metrics["pace"].append(audio["pace_wpm"])
	winner_metrics["pitch_variance"].append(audio["pitch_variance"])
	winner_metrics["pitch_jumps"].append(audio["pitch_jump_count"])
	winner_metrics["pause_density"].append(audio["pause_density"])
	winner_metrics["pause_duration"].append(audio["avg_pause_duration_ms"])
	winner_metrics["beat_alignment"].append(audio["beat_alignment_score"])
	winner_metrics["emphasis"].append(audio["emphasis_peak_count"])

	for l in losers:
	audio = l["audio_features"]
	loser_metrics["pace"].append(audio["pace_wpm"])
	loser_metrics["pitch_variance"].append(audio["pitch_variance"])
	loser_metrics["pitch_jumps"].append(audio["pitch_jump_count"])
	loser_metrics["pause_density"].append(audio["pause_density"])
	loser_metrics["pause_duration"].append(audio["avg_pause_duration_ms"])
	loser_metrics["beat_alignment"].append(audio["beat_alignment_score"])
	loser_metrics["emphasis"].append(audio["emphasis_peak_count"])

	# Calculate deltas for each feature
	deltas = {}
	for feature in winner_metrics.keys():
	deltas[feature] = self.calculate_delta(
	winner_metrics[feature],
	loser_metrics[feature]
	)

	# Identify key differentiators (features with significant deltas)
	key_differentiators = []
	anti_patterns = []

	for feature, delta_info in deltas.items():
	if delta_info["significant"]:
	direction = "higher" if delta_info["delta"] > 0 else "lower"
	key_differentiators.append(
	f"{feature}: {direction} by {abs(delta_info['delta'])*100:.1f}% "
	f"({delta_info['winner_avg']:.2f} vs {delta_info['loser_avg']:.2f})"
	)

	# Anti-patterns are opposite of winning patterns
	opposite_direction = "lower" if delta_info["delta"] > 0 else "higher"
	anti_patterns.append(f"Avoid {opposite_direction} {feature}")

	# Generate recommended profile
	profile = self._build_audio_profile(
	niche=niche,
	platform=platform,
	winner_metrics=winner_metrics,
	deltas=deltas,
	key_differentiators=key_differentiators,
	anti_patterns=anti_patterns,
	sample_size=len(filtered),
	top_performers=len(winners)
	)

	return profile

	def _build_audio_profile(self, niche: str, platform: str,
	                         winner_metrics: Dict, deltas: Dict,
	                         key_differentiators: List[str],
	                         anti_patterns: List[str],
	                         sample_size: int,
	                         top_performers: int) -> AudioProfile:
	    """
	    Assemble the recommended profile from the winner group's statistics.

	    Args:
	        niche: Niche the profile is for.
	        platform: Platform the profile is for.
	        winner_metrics: Feature name -> list of winner values. Reads the
	            keys "pace", "pitch_variance", "pitch_jumps", "pause_density",
	            "pause_duration", "beat_alignment" and "emphasis".
	        deltas: Feature name -> delta dict; only "beat_alignment" is read.
	        key_differentiators: Passed through to the profile unchanged.
	        anti_patterns: Passed through to the profile unchanged.
	        sample_size: Number of records analyzed; drives the confidence.
	        top_performers: Number of records in the winner group.

	    Returns:
	        The populated AudioProfile.
	    """
	    import math

	    def bucket(value, first_cut, second_cut, labels):
	        """Pick labels[0], labels[1] or labels[2] by the two cut points."""
	        if value < first_cut:
	            return labels[0]
	        if value < second_cut:
	            return labels[1]
	        return labels[2]

	    # Pace: winner mean, tolerance = winner std-dev capped at 20 WPM
	    # (falls back to 10 when there is only a single winner).
	    pace_values = winner_metrics["pace"]
	    target_pace = statistics.mean(pace_values)
	    if len(pace_values) > 1:
	        pace_std = statistics.stdev(pace_values)
	    else:
	        pace_std = 10
	    pace_tolerance = min(pace_std, 20)

	    # Pitch: the baseline is a fixed placeholder, not derived from data.
	    pitch_baseline = 150.0
	    pitch_variance_target = statistics.mean(winner_metrics["pitch_variance"])
	    pitch_jump_freq = bucket(
	        statistics.mean(winner_metrics["pitch_jumps"]),
	        5, 15, ("low", "medium", "high"),
	    )

	    # Pauses: mean density, and the min/max of the winners' durations.
	    pause_density_target = statistics.mean(winner_metrics["pause_density"])
	    pause_samples = winner_metrics["pause_duration"]
	    pause_duration_range = (min(pause_samples), max(pause_samples))

	    # Beat alignment: "critical" only when winners beat losers by a
	    # significant margin above 20%; otherwise graded by the winner mean.
	    beat_alignment_avg = statistics.mean(winner_metrics["beat_alignment"])
	    beat_delta = deltas["beat_alignment"]
	    if beat_delta["significant"] and beat_delta["delta"] > 0.2:
	        beat_importance, beat_threshold = "critical", 0.8
	    elif beat_alignment_avg > 0.6:
	        beat_importance, beat_threshold = "important", 0.6
	    else:
	        beat_importance, beat_threshold = "optional", 0.4

	    # Emphasis: graded by the winners' mean emphasis count.
	    emphasis_avg = statistics.mean(winner_metrics["emphasis"])
	    emphasis_strategy = bucket(
	        emphasis_avg, 3, 8, ("sparse", "moderate", "frequent")
	    )

	    # Confidence grows logarithmically with sample size, reaching 1.0 at
	    # 100 samples.
	    confidence = min(math.log10(sample_size) / math.log10(100), 1.0)

	    return AudioProfile(
	        niche=niche,
	        platform=platform,
	        target_pace_wpm=target_pace,
	        pace_tolerance=pace_tolerance,
	        pitch_baseline_hz=pitch_baseline,
	        pitch_variance_target=pitch_variance_target,
	        pitch_jump_frequency=pitch_jump_freq,
	        pause_density_target=pause_density_target,
	        pause_duration_ms=pause_duration_range,
	        beat_alignment_importance=beat_importance,
	        beat_alignment_threshold=beat_threshold,
	        emphasis_strategy=emphasis_strategy,
	        emphasis_points_per_min=emphasis_avg,
	        confidence_score=confidence,
	        sample_size=sample_size,
	        top_performers_analyzed=top_performers,
	        key_differentiators=key_differentiators,
	        anti_patterns=anti_patterns,
	    )

	def learn_all_patterns(self) -> Dict[str, AudioProfile]:
	    """
	    Analyze all available data and generate profiles for each niche/platform.

	    Loads the performance records, collects every distinct (niche, platform)
	    pair (a record missing either key is grouped under "unknown"), runs
	    ``analyze_niche_platform`` for each pair and keeps the truthy results.
	    The resulting mapping is handed to ``_save_profiles`` before it is
	    returned. When no records are loaded, nothing is analyzed or saved.

	    Pairs are processed in the order they first appear in the records, so
	    the returned dict (and anything written from it) has a reproducible
	    ordering for a given input.

	    Returns:
	        Dict mapping "niche:platform" -> AudioProfile; empty when there
	        are no records.
	    """
	    records = self.load_performance_records()
	    if not records:
	        return {}

	    # dict.fromkeys de-duplicates like a set but keeps first-seen order;
	    # a set of strings iterates in an order that changes between runs.
	    combinations = dict.fromkeys(
	        (record.get("niche", "unknown"), record.get("platform", "unknown"))
	        for record in records
	    )

	    # Analyze each combination
	    profiles = {}
	    for niche, platform in combinations:
	        profile = self.analyze_niche_platform(records, niche, platform)
	        if profile:
	            profiles[f"{niche}:{platform}"] = profile

	    # Cache profiles to disk
	    self._save_profiles(profiles)

	    return profiles

	def _save_profiles(self, profiles: Dict[str, AudioProfile]):
	"""Persist learned profiles to disk"""
	profile_file = self.data_dir / "audio_profiles.json"

	serializable = {
	key: asdict(profile)
	for key, profile in profiles.items()
	}

	with open(profile_file, 'w') as f:
	json.dump(serializable, f, indent=2)

	def _load_profiles(self) -> Dict[str, AudioProfile]:
	    """
	    Load cached profiles from disk.

	    Reads ``audio_profiles.json`` from ``self.data_dir`` and rebuilds one
	    AudioProfile per entry by passing the stored fields as keyword
	    arguments.

	    Returns:
	        Dict mapping each stored key -> AudioProfile; empty when the
	        cache file does not exist.
	    """
	    profile_file = self.data_dir / "audio_profiles.json"

	    if not profile_file.exists():
	        return {}

	    with open(profile_file, 'r', encoding="utf-8") as f:
	        data = json.load(f)

	    profiles = {}
	    for key, profile_dict in data.items():
	        # JSON has no tuple type, so the (min, max) pause range is read
	        # back as a list. Restore the tuple declared on AudioProfile so
	        # cached profiles match freshly learned ones.
	        pause_range = profile_dict.get("pause_duration_ms")
	        if isinstance(pause_range, list):
	            profile_dict = {**profile_dict, "pause_duration_ms": tuple(pause_range)}
	        profiles[key] = AudioProfile(**profile_dict)

	    return profiles

	def get_recommended_audio_profile(self, niche: str, platform: str) -> Optional[AudioProfile]:
	    """
	    API: Look up the recommended audio profile for a niche/platform pair.

	    Resolution order:
	      1. the cached profile stored under "niche:platform";
	      2. the same key in a freshly learned set of profiles;
	      3. the highest-confidence learned profile whose key ends with
	         ":platform";
	      4. the highest-confidence learned profile whose key starts with
	         "niche:".

	    Returns None when none of these yields a profile.
	    """
	    lookup_key = f"{niche}:{platform}"

	    # A cache hit avoids re-running the learning pass.
	    cached = self._load_profiles()
	    if lookup_key in cached:
	        return cached[lookup_key]

	    learned = self.learn_all_patterns()
	    if lookup_key in learned:
	        return learned[lookup_key]

	    # Fallback tiers, tried in order: same platform first, then same niche.
	    fallback_tiers = (
	        lambda profile_key: profile_key.endswith(f":{platform}"),
	        lambda profile_key: profile_key.startswith(f"{niche}:"),
	    )
	    for key_matches in fallback_tiers:
	        candidates = [
	            candidate
	            for profile_key, candidate in learned.items()
	            if key_matches(profile_key)
	        ]
	        if candidates:
	            return max(candidates, key=lambda candidate: candidate.confidence_score)

	    return None  # No data available

	def explain_profile(self, profile: AudioProfile) -> str:
	    """
	    Render a profile as a human-readable, multi-line report.

	    The report gives the sample statistics and core recommendations,
	    followed by one bullet per key differentiator and one bullet per
	    anti-pattern. Intended for debugging and building intuition.
	    """
	    report_head = f"""
	Audio Profile for {profile.niche} on {profile.platform}
	{'='*60}

	Sample Size: {profile.sample_size} videos analyzed
	Top Performers: {profile.top_performers_analyzed}
	Confidence: {profile.confidence_score*100:.1f}%

	CORE RECOMMENDATIONS:
	--------------------
	Pace: {profile.target_pace_wpm:.1f} WPM (±{profile.pace_tolerance:.1f})
	Pitch Variation: {profile.pitch_variance_target:.2f} Hz variance
	Pitch Jumps: {profile.pitch_jump_frequency} frequency
	Pauses: {profile.pause_density_target:.1f} per minute, {profile.pause_duration_ms[0]:.0f}-{profile.pause_duration_ms[1]:.0f}ms each
	Beat Alignment: {profile.beat_alignment_importance} (threshold: {profile.beat_alignment_threshold:.2f})
	Emphasis: {profile.emphasis_strategy} strategy, ~{profile.emphasis_points_per_min:.1f} per minute

	KEY DIFFERENTIATORS (what makes winners win):
	----------------------------------------------
	"""
	    # Assemble the bullet sections as pieces and join them once.
	    differentiator_bullets = [f"• {item}\n" for item in profile.key_differentiators]
	    anti_pattern_bullets = [f"• {item}\n" for item in profile.anti_patterns]

	    return "".join([
	        report_head,
	        *differentiator_bullets,
	        "\nANTI-PATTERNS (what to avoid):\n",
	        "-------------------------------\n",
	        *anti_pattern_bullets,
	    ])


	# =============================================================================
	# FUTURE RL UPGRADE PATHS
	# =============================================================================
	"""
	Reinforcement Learning Integration (Future Version 2):

	1. REWARD FUNCTION:
	- Replace fixed viral_score threshold with learned value function
	- Multi-objective: completion_rate, engagement, virality
	- Temporal credit assignment: which audio features at which timestamps drove retention

	2. EXPLORATION/EXPLOITATION:
	- Multi-armed bandit for A/B testing profiles
	- Upper Confidence Bound (UCB) for balancing tried-and-true vs experimental
	- Thompson sampling for Bayesian optimization

	3. CONTINUOUS LEARNING:
	- Online learning: update profiles after each video performance
	- Concept drift detection: identify when trends shift
	- Temporal decay: older data weighted less (trends change)
	- Catastrophic forgetting prevention: maintain ensemble of time-windowed models

	4. CONTEXT-AWARE BANDITS:
	- Contextual features: time of day, season, current events
	- Personalization: audience demographics, viewing history
	- Transfer learning: leverage patterns across similar niches

	5. POLICY GRADIENT METHODS:
	- Direct optimization of audio parameters
	- Differentiable audio synthesis pipeline
	- Actor-critic for continuous action spaces (pace, pitch, etc.)

	6. ARCHITECTURE:
	- Replace AudioPatternLearner with RLAudioOptimizer
	- Add replay buffer for experience replay
	- Add policy network (actor) and value network (critic)
	- Add exploration noise (epsilon-greedy or entropy bonus)

	Example RL pseudocode:

	class RLAudioOptimizer:
	def __init__(self):
	self.policy_net = AudioPolicyNetwork() # Maps (niche, platform) -> audio params
	self.value_net = AudioValueNetwork() # Estimates expected virality
	self.replay_buffer = ReplayBuffer()

	def get_audio_profile(self, niche, platform, explore=True):
	state = encode_state(niche, platform)

	if explore and random() < epsilon:
	return sample_random_profile() # Exploration

	return self.policy_net(state) # Exploitation

	def update(self, video_id, audio_profile, performance_metrics):
	# Store experience
	self.replay_buffer.add(state, audio_profile, reward, next_state)

	# Sample batch and update networks
	batch = self.replay_buffer.sample()
	policy_loss = compute_policy_gradient(batch)
	value_loss = compute_td_error(batch)

	optimize(policy_loss + value_loss)

	This enables true adaptive learning that improves over time rather than
	fixed statistical analysis.
	"""


	# =============================================================================
	# CLI FOR TESTING
	# =============================================================================
	if __name__ == "__main__":
	    # Manual smoke test: learn every profile, print a short summary of
	    # each, then show the full explanation for one example combination.
	    pattern_learner = AudioPatternLearner()

	    print("Learning patterns from performance data...")
	    learned_profiles = pattern_learner.learn_all_patterns()

	    print(f"\nLearned {len(learned_profiles)} audio profiles:")
	    for combo_key, learned in learned_profiles.items():
	        print(f"\n{combo_key}:")
	        print(f" Pace: {learned.target_pace_wpm:.1f} WPM")
	        print(f" Beat alignment: {learned.beat_alignment_importance}")
	        print(f" Confidence: {learned.confidence_score*100:.1f}%")

	    # Example: get specific recommendation
	    print(f"\n{'=' * 60}")
	    print("Example: Getting recommendation for 'tech_tips' on 'tiktok'")
	    recommendation = pattern_learner.get_recommended_audio_profile("tech_tips", "tiktok")

	    print(
	        pattern_learner.explain_profile(recommendation)
	        if recommendation
	        else "No profile available for this niche/platform combination."
	    )
No results found