PPO Mjölnir Orbit Training Script (V33) — Tight Orbit Entry, Reward Visualization, and Reinforced Orbital Stability. Custom Gymnasium environment + PPO training loop simulating orbital mechanics around a black hole.

🪐 PPO Mjölnir Orbit Environment (V33)

Tight-entry orbital reinforcement learning environment built with Gymnasium + Stable Baselines3 (PPO).

This script defines MjollnirEnvV33, a custom 2D physics-based orbital environment in which an RL agent attempts to sling a "hammer" into a stable orbit around a black hole.
Version 33 introduces:

  • Tight orbit-entry requirements (must hit the exact white ring radius)
  • Explicit reward visualization in the render panel
  • Improved gravitational model
  • Quadratic orbit-error penalties
  • Orbit completion detection (360° sweep)
  • Hard truncation boundary
  • Higher gravity and calibrated circular-velocity target

The PPO training loop uses 10 parallel environments and records a video demonstrating the trained agent.


🚀 Features

✔ Physics-Based Orbital Simulation

  • Central black hole gravity
  • Radial & tangential thrust controls
  • Correct circular-velocity target (V_circ) calculation (worked out just below this list)
  • Velocity decomposition (V_r, V_t)
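
As a sanity check on the calibrated target, the circular-orbit speed follows directly from the environment's gravity constant and target radius. A minimal sketch using the constants defined in MjollnirEnvV33:

import numpy as np

# v_circ = sqrt(G_eff / r): gravitational acceleration G_eff / r^2 balances centripetal v^2 / r
BH_GRAVITY_CONSTANT = 6000.0  # effective G*M used by the environment
ORBIT_RADIUS = 75.0           # white-ring target radius
v_circ = np.sqrt(BH_GRAVITY_CONSTANT / ORBIT_RADIUS)
print(f"V_circ = {v_circ:.2f} m/s")  # ~8.94 m/s, matching TARGET_ORBITAL_SPEED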

✔ RL-Friendly Observation Space

The agent receives a 9-dimensional observation (layout sketched after this list):

  • Normalized positions
  • Delta-to-black-hole vector
  • Radial/tangential velocity
  • Current phase (approach or orbit)
  • Target orbital radius
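
For reference, a minimal sketch of how these nine components are assembled, mirroring _get_obs but with made-up example values:

import numpy as np

# Example values, purely illustrative
size, MAX_VELOCITY_NORM, ORBIT_RADIUS = 1000.0, 20.0, 75.0
hammer_pos = np.array([150.0, 180.0])
bh_delta_pos = np.array([350.0, 320.0])
distance_bh = float(np.linalg.norm(bh_delta_pos))
V_R, V_T, current_phase = -2.0, 6.5, 0

obs = np.concatenate([
    hammer_pos / size,          # 2: normalized x, y position
    bh_delta_pos / size,        # 2: normalized vector toward the black hole
    [distance_bh / size],       # 1: normalized distance to the black hole
    [V_R / MAX_VELOCITY_NORM],  # 1: radial velocity (positive = away from BH)
    [V_T / MAX_VELOCITY_NORM],  # 1: tangential velocity (positive = counter-clockwise)
    [current_phase],            # 1: 0 = approach, 1 = orbit
    [ORBIT_RADIUS / size],      # 1: normalized target orbital radius
], dtype=np.float32)
print(obs.shape)  # (9,)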

✔ Reward System (V33)

  • Distance-to-orbit minimization
  • Tangential-speed matching
  • Radial-velocity suppression
  • Orbit stability bonus
  • Progressive angle rewards
  • Full-orbit completion reward (+15,000); the coefficients are summarized below
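
For quick reference, the main coefficients behind these terms as they appear in MjollnirEnvV33.step() (a summary dictionary, not code used by the environment):

REWARD_COEFFS = {
    "action_penalty": -0.01,             # per squared action component, every step
    "approach_distance_gain": 500.0,     # * reduction in |r - R_orbit| / size
    "stationarity_penalty": -5.0,        # approach speed below 5 m/s
    "entry_speed_penalty": -500.0,       # * ((V_T - V_circ) / V_circ)**2 at orbit entry
    "entry_radial_penalty": -1000.0,     # * |V_R| / V_circ at orbit entry
    "orbit_entry_bonus": 1000.0,         # first time r <= R_orbit (white ring)
    "forward_sweep_gain": 50.0,          # * positive angle swept per step (radians)
    "backward_sweep_penalty": 100.0,     # * negative angle swept per step (radians)
    "in_tolerance_bonus": 15.0,          # |r - R_orbit| < 15 m while orbiting
    "orbit_error_penalty": -200.0,       # * (orbit_error / R_orbit)**2 outside tolerance
    "radial_velocity_penalty": -50.0,    # * |V_R| / V_circ while orbiting
    "full_orbit_bonus": 15000.0,         # 360-degree sweep completed
    "collision_penalty": -15000.0,       # hitting the black hole
    "hard_truncation_penalty": -7500.0,  # drifting past the red ring in orbit phase
}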

✔ Render Overlay (HUD)

Real-time display of:

  • Phase
  • Step counter
  • Orbit error
  • V_r / V_t
  • Total swept angle
  • Step reward & cumulative reward

🧠 Training

The script trains with:

INIT_LR = 0.0001
N_STEPS = 2048
BATCH_SIZE = 64
N_EPOCHS = 10
GAMMA = 0.99
ENT_COEF = 0.01
net_arch = dict(pi=[64, 64], vf=[64, 64])
# ====================================================================================
# PPO MJÖLNIR ORBIT TRAINING SCRIPT (V33 - TIGHT ENTRY & REWARD DISPLAY)
# ====================================================================================
# ------------------------------------------------------------------------------------
# --- IMPORTS & SETUP ⚙️ ---
# ------------------------------------------------------------------------------------
# !pip install stable_baselines3[extra] gymnasium[classic_control] -q
# !apt-get install -y swig
# !pip install 'gymnasium[box2d]' -q
# !pip install pyvirtualdisplay -q
# !apt-get install -y xvfb xserver-xephyr vnc4server
# !pip install moviepy -q
# (setup commands for running in Google Colab)
import os
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
from IPython.display import HTML
import base64
import time
import math
# Define Paths and Budget
TOTAL_TIMESTEPS = 500_000
LOG_DIR = "./ppo_mjollnir_logs/"
VIDEO_DIR = f"{LOG_DIR}/videos/"
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(VIDEO_DIR, exist_ok=True)
# ------------------------------------------------------------------------------------
## 1. Custom Environment: MjollnirEnvV33 (Tight Entry, Stabilized Control)
# ------------------------------------------------------------------------------------
class MjollnirEnvV33(gym.Env):
    """
    V33 enforces a tighter orbit entry condition (must reach R_orbit)
    and adds step reward display to the render frame.
    """
    metadata = {"render_modes": ["rgb_array", "human"], "render_fps": 15}
    PHASE_APPROACH = 0
    PHASE_ORBIT = 1

    def __init__(self, render_mode=None, size=1000, dt=1/30):
        super().__init__()
        self.size = size
        self.dt = dt
        # --- CORE DYNAMICS & ENVIRONMENT CONSTANTS ---
        self.max_force = 240.0
        self.hammer_mass = 5.0
        self.max_steps = 3000
        self.MAX_VELOCITY_NORM = 20.0
        # --- V33 Orbital Constants (50% REDUCTION) ---
        # ORBIT_RADIUS (White Ring) and ORBIT_ENTRY_DISTANCE (Blue Ring)
        self.ORBIT_RADIUS = 75.0
        self.ORBIT_TOLERANCE = 15.0
        self.ORBIT_ENTRY_DISTANCE = 90.0  # Blue Ring
        self.ORBIT_HARD_TRUNCATION = 175.0  # Red Ring
        # --- Black Hole Constants (INCREASED GRAVITY) ---
        self.BH_POS = np.array([self.size / 2, self.size / 2], dtype=np.float32)
        self.BH_RADIUS = 35.0
        self.BH_GRAVITY_CONSTANT = 6000.0
        self.BH_COLLISION_PENALTY = -15000.0
        # --- V33 PHYSICS FIX: Calculate the ideal circular velocity (V_circ) ---
        self.TARGET_ORBITAL_SPEED = np.sqrt(self.BH_GRAVITY_CONSTANT / self.ORBIT_RADIUS)
        # --- V33 State Variables ---
        self._current_phase = self.PHASE_APPROACH
        self._bh_last_angle = 0.0
        self._bh_total_angle_swept = 0.0
        self._bh_last_distance = 0.0
        self._last_step_reward = 0.0  # New variable to track reward for display
        self._total_reward_accumulated = 0.0  # Total reward tracker
        # Action Space is Radial/Tangential force
        self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
        # Observation Space reflects the critical variables (normalized)
        OBS_SIZE = 9
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(OBS_SIZE,), dtype=np.float32)
        self.render_mode = render_mode
        self.window = None
        self.clock = None
    def _get_info(self):
        bh_delta_pos = self.BH_POS - self._hammer_pos
        distance_bh = np.linalg.norm(bh_delta_pos)
        bh_to_hammer_vec = self._hammer_pos - self.BH_POS
        bh_to_hammer_norm = bh_to_hammer_vec / distance_bh if distance_bh > 0 else np.zeros(2)
        # Radial vector (normalized, points away from BH)
        radial_vec = bh_to_hammer_norm
        # Tangential vector (normalized, points counter-clockwise, 90 deg from radial)
        tangential_vec = np.array([-radial_vec[1], radial_vec[0]])
        # Velocity Components: V_R is along radial, V_T is along tangential
        V_R = np.dot(self._hammer_vel, radial_vec)
        V_T = np.dot(self._hammer_vel, tangential_vec)
        # Angular movement calculation (for reward and angle_swept)
        bh_angle = np.arctan2(bh_delta_pos[1], bh_delta_pos[0])
        angle_moved = bh_angle - self._bh_last_angle
        angle_moved = (angle_moved + np.pi) % (2 * np.pi) - np.pi
        self._bh_last_angle = bh_angle
        return {
            "angle_moved": angle_moved,
            "distance_bh": distance_bh,
            "delta_pos_bh": bh_delta_pos,
            "bh_total_angle_swept": self._bh_total_angle_swept,
            "radial_velocity": V_R,
            "tangential_velocity": V_T,
            "hammer_speed": np.linalg.norm(self._hammer_vel),
            "radial_vector": radial_vec,
            "tangential_vector": tangential_vec,
        }
    def _get_obs(self):
        info = self._get_info()
        bh_delta_pos_norm = info['delta_pos_bh'] / self.size
        # Normalize Velocities for stable training
        V_R_norm = info['radial_velocity'] / self.MAX_VELOCITY_NORM
        V_T_norm = info['tangential_velocity'] / self.MAX_VELOCITY_NORM
        return np.concatenate([
            self._hammer_pos / self.size,       # 2 (Normalized Pos)
            bh_delta_pos_norm,                  # 2 (Normalized Delta Pos)
            [info['distance_bh'] / self.size],  # 1 (Normalized Distance)
            [V_R_norm],                         # 1 (Normalized V_R)
            [V_T_norm],                         # 1 (Normalized V_T)
            [self._current_phase],              # 1
            [self.ORBIT_RADIUS / self.size]     # 1 (Target R_orbit)
        ], dtype=np.float32)
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self._current_phase = self.PHASE_APPROACH
        self._bh_total_angle_swept = 0.0
        self._hammer_vel = np.zeros(2, dtype=np.float32)
        self._last_step_reward = 0.0
        self._total_reward_accumulated = 0.0
        # Initial position far away but within the new screen boundaries
        self._hammer_pos = np.array([
            self.np_random.uniform(self.size * 0.1, self.size * 0.2),
            self.np_random.uniform(self.size * 0.1, self.size * 0.2),
        ], dtype=np.float32)
        bh_delta_pos = self.BH_POS - self._hammer_pos
        self._bh_last_angle = np.arctan2(bh_delta_pos[1], bh_delta_pos[0])
        self._bh_last_distance = np.linalg.norm(bh_delta_pos)
        obs = self._get_obs()
        info = self._get_info()
        return obs, info
    def step(self, action):
        self.current_step += 1
        info_before = self._get_info()
        bh_dist_before = info_before['distance_bh']
        # --- 1. Forces and Dynamics (V33 Steering Control) ---
        action_tangential = action[0]
        action_radial = action[1]
        radial_vec = info_before['radial_vector']
        tangential_vec = info_before['tangential_vector']
        # Engine Force Calculation
        tangential_force = tangential_vec * action_tangential * self.max_force
        radial_force = radial_vec * action_radial * self.max_force * 0.5
        total_engine_force = tangential_force + radial_force
        # Gravity Force
        bh_dist = bh_dist_before
        bh_delta_pos = info_before['delta_pos_bh']
        bh_dist_clamped = max(bh_dist, 10.0)
        bh_gravity_magnitude = (self.BH_GRAVITY_CONSTANT * self.hammer_mass) / (bh_dist_clamped**2)
        bh_gravity_force = bh_delta_pos / bh_dist_clamped * bh_gravity_magnitude
        # Integration (semi-implicit Euler: velocity first, then position)
        total_force = total_engine_force + bh_gravity_force
        hammer_accel = total_force / self.hammer_mass
        self._hammer_vel += hammer_accel * self.dt
        self._hammer_pos += self._hammer_vel * self.dt
        # --- 2. Reward Calculation (V33) ---
        reward = 0.0
        info_after = self._get_info()
        bh_dist_after = info_after['distance_bh']
        V_R_after = info_after['radial_velocity']
        V_T_after = info_after['tangential_velocity']
        current_speed = info_after['hammer_speed']
        terminated = False
        truncated = False
        # R_ACTION_PENALTY
        reward -= 0.01 * (action_tangential**2 + action_radial**2)
        # --- PHASE 0: APPROACH ---
        if self._current_phase == self.PHASE_APPROACH:
            # R_GOAL_SEEKING_REWARD: Incentivize reaching the new target radius (75 m)
            dist_to_orbit_before = np.abs(bh_dist_before - self.ORBIT_RADIUS)
            dist_to_orbit_after = np.abs(bh_dist_after - self.ORBIT_RADIUS)
            distance_reduction = dist_to_orbit_before - dist_to_orbit_after
            reward += 500.0 * distance_reduction / self.size
            # R_STATIONARITY_PENALTY
            MIN_SPEED_THRESHOLD = 5.0  # m/s
            if current_speed < MIN_SPEED_THRESHOLD:
                reward -= 5.0
            # V33 FIX: Transition Condition: Must reach ORBIT_RADIUS (75.0 m, white ring)
            if bh_dist_after <= self.ORBIT_RADIUS:
                # R_SPEED_PENALTY_ON_ENTRY: Targets the actual V_circ (8.94 m/s)
                speed_error = np.abs(V_T_after - self.TARGET_ORBITAL_SPEED)
                reward_speed_penalty = -500.0 * (speed_error / self.TARGET_ORBITAL_SPEED)**2
                # R_RADIAL_VELOCITY_PENALTY_ON_ENTRY
                radial_penalty = -1000.0 * np.abs(V_R_after) / self.TARGET_ORBITAL_SPEED
                reward += reward_speed_penalty
                reward += radial_penalty
                reward += 1000.0  # Entry bonus
                self._current_phase = self.PHASE_ORBIT
                print(f"Transitioned to ORBIT Phase! Tangential Speed Penalty: {reward_speed_penalty:.1f}, Radial Penalty: {radial_penalty:.1f}")
        # --- PHASE 1: ORBIT (V33 Stability Enforcement) ---
        elif self._current_phase == self.PHASE_ORBIT:
            angle_moved = info_after['angle_moved']
            # R_PROGRESSIVE_COMPLETION (Meme Reward)
            if angle_moved > 0:
                reward += 50.0 * angle_moved
                self._bh_total_angle_swept += angle_moved
            else:
                # Penalty for sweeping backward (angle_moved is negative here, weighted 2x the forward bonus)
                reward += 100.0 * angle_moved
            # R_Orbit_Stability (QUADRATIC PENALTY)
            orbit_error = np.abs(bh_dist_after - self.ORBIT_RADIUS)
            if orbit_error < self.ORBIT_TOLERANCE:
                reward += 15.0  # Constant bonus for staying in tolerance
            else:
                scaled_error = orbit_error / self.ORBIT_RADIUS
                reward -= 200.0 * (scaled_error)**2
            # R_Radial_Velocity_Penalty (Punish movement away/toward BH)
            reward -= 50.0 * np.abs(V_R_after) / self.TARGET_ORBITAL_SPEED
            # R_360_Completion (Terminal Reward)
            if self._bh_total_angle_swept >= 2 * np.pi:
                reward += 15000.0
                terminated = True
                print(f"--- SUCCESS: Full 360-degree orbit completed in {self.current_step} steps! ---")
        # --- 3. Termination/Truncation (V33 Limits) ---
        # R_COLLISION_PENALTY
        collision_distance_bh = self.BH_RADIUS + 10.0
        if bh_dist_after <= collision_distance_bh:
            reward += self.BH_COLLISION_PENALTY
            terminated = True
        # HARD TRUNCATION APPLIED ONLY IN ORBIT PHASE
        if self._current_phase == self.PHASE_ORBIT:
            if bh_dist_after > self.ORBIT_HARD_TRUNCATION:
                reward -= 7500.0
                truncated = True
                print(f"--- TRUNCATED: Agent drifted outside hard limit ({bh_dist_after:.1f} m) ---")
        # R_TIMEOUT
        truncated = truncated or (self.current_step >= self.max_steps)
        self._bh_last_distance = bh_dist_after
        self._last_step_reward = reward
        self._total_reward_accumulated += reward
        return self._get_obs(), reward, terminated, truncated, info_after
    def render(self):
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _render_frame(self):
        import pygame
        if self.window is None:
            pygame.init()
            self.window = pygame.display.set_mode((self.size, self.size))
            pygame.display.set_caption("Mjölnir Orbit-v33 (TIGHT, REWARD FEEDBACK)")
        if self.clock is None:
            self.clock = pygame.time.Clock()
        canvas = pygame.Surface((self.size, self.size))
        canvas.fill((20, 20, 40))
        info = self._get_info()
        bh_dist = info['distance_bh']
        angle_swept = self._bh_total_angle_swept
        font = pygame.font.Font(None, 30)  # Smaller font for more text
        # --- Draw Static Elements ---
        bh_pos_int = self.BH_POS.astype(int)
        pygame.draw.circle(canvas, (0, 0, 0), bh_pos_int, int(self.BH_RADIUS))
        pygame.draw.circle(canvas, (255, 60, 0), bh_pos_int, int(self.BH_RADIUS * 1.5), 5)  # Orange Ring
        pygame.draw.circle(canvas, (255, 255, 255), bh_pos_int, int(self.ORBIT_RADIUS), 1)  # White Ring (Target Orbit)
        # Blue Ring (Entry/Approach Guidance)
        if self._current_phase == self.PHASE_APPROACH:
            pygame.draw.circle(canvas, (0, 150, 255), bh_pos_int, int(self.ORBIT_ENTRY_DISTANCE), 1)
        pygame.draw.circle(canvas, (255, 0, 0), bh_pos_int, int(self.ORBIT_HARD_TRUNCATION), 1)  # Red Ring (Hard Truncation)
        # 3. Draw Agent (Mjölnir Hammer)
        hammer_pos = self._hammer_pos
        head_width = 30
        head_height = 20
        handle_width = 5
        handle_length = 25
        head_points = np.array([
            [-head_width / 2, -head_height / 2],
            [head_width / 2, -head_height / 2],
            [head_width / 2, head_height / 2],
            [-head_width / 2, head_height / 2]
        ])
        handle_points = np.array([
            [-handle_width / 2, head_height / 2],
            [handle_width / 2, head_height / 2],
            [handle_width / 2, head_height / 2 + handle_length],
            [-handle_width / 2, head_height / 2 + handle_length]
        ])
        # Rotate the hammer sprite to face its direction of travel
        hammer_angle = 0.0
        if np.linalg.norm(self._hammer_vel) > 1.0:
            vel_x, vel_y = self._hammer_vel
            hammer_angle = np.arctan2(vel_y, vel_x)
        rot_matrix = np.array([
            [np.cos(hammer_angle), -np.sin(hammer_angle)],
            [np.sin(hammer_angle), np.cos(hammer_angle)]
        ])
        transformed_head_points = np.dot(head_points, rot_matrix.T) + hammer_pos
        transformed_handle_points = np.dot(handle_points, rot_matrix.T) + hammer_pos
        head_draw_points = [tuple(p.astype(int)) for p in transformed_head_points]
        handle_draw_points = [tuple(p.astype(int)) for p in transformed_handle_points]
        pygame.draw.polygon(canvas, (200, 200, 200), head_draw_points)
        pygame.draw.polygon(canvas, (139, 69, 19), handle_draw_points)
        # 4. Draw Info text (V33 - Includes Reward)
        text_y_start = 10
        y_offset = 30
        PHASE_NAMES = ["APPROACH", "ORBIT"]
        PHASE_COLORS = [(255, 255, 0), (0, 255, 0)]
        current_phase_name = PHASE_NAMES[self._current_phase]
        # Line 1: Phase and Step Count
        phase_text = f"PHASE: {current_phase_name} | STEP: {self.current_step}/{self.max_steps}"
        text_surface_phase = font.render(phase_text, True, PHASE_COLORS[self._current_phase])
        canvas.blit(text_surface_phase, (10, text_y_start))
        # Line 2: Distance Error
        orbit_error = bh_dist - self.ORBIT_RADIUS
        bh_text = f"BH DIST: {bh_dist:.1f} m (Error: {orbit_error:.1f})"
        text_color_bh = (255, 100, 100) if np.abs(orbit_error) > self.ORBIT_TOLERANCE else (100, 200, 255)
        text_surface_bh = font.render(bh_text, True, text_color_bh)
        canvas.blit(text_surface_bh, (10, text_y_start + y_offset))
        # Line 3: Velocities
        v_t_text = f"V_T: {info['tangential_velocity']:.1f} m/s | V_R: {info['radial_velocity']:.1f} m/s (Target: {self.TARGET_ORBITAL_SPEED:.2f})"
        text_surface_v = font.render(v_t_text, True, (255, 255, 255))
        canvas.blit(text_surface_v, (10, text_y_start + 2*y_offset))
        # Line 4: Progressive Completion
        circum_text = f"TOTAL ORBIT: {np.rad2deg(angle_swept):.1f}° (Goal: 360.0°)"
        text_surface_circum = font.render(circum_text, True, (0, 255, 0) if angle_swept > np.pi else (150, 150, 150))
        canvas.blit(text_surface_circum, (10, text_y_start + 3*y_offset))
        # Line 5 (NEW): Step Reward and Total Reward
        # Green for a non-negative step reward, red otherwise; cumulative total shown alongside
        reward_color = (0, 255, 0) if self._last_step_reward >= 0 else (255, 100, 100)
        reward_text = f"STEP REWARD: {self._last_step_reward:.2f} | TOTAL REWARD: {self._total_reward_accumulated:.1f}"
        text_surface_reward = font.render(reward_text, True, reward_color)
        canvas.blit(text_surface_reward, (10, text_y_start + 4*y_offset))
        if self.render_mode == "rgb_array":
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
            )

    def close(self):
        if self.window is not None:
            import pygame
            pygame.quit()
            self.window = None
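
# (Optional) Quick smoke test of the raw environment with random actions.
# A minimal sketch for sanity-checking the class above; it is not part of the
# training run, so it is left commented out.
# env = MjollnirEnvV33(render_mode="rgb_array")
# obs, info = env.reset(seed=0)
# for _ in range(300):
#     obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
#     if terminated or truncated:
#         obs, info = env.reset()
# env.close()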
# ------------------------------------------------------------------------------------
## 2. PPO Training and Evaluation (V33 ENVIRONMENT) 🚀
# ------------------------------------------------------------------------------------
# --- Helper Functions (Same) ---
def linear_schedule(initial_value: float):
    """Linearly decay the learning rate from initial_value to 0 over training."""
    def func(progress_remaining: float) -> float:
        return initial_value * progress_remaining
    return func

def show_video(video_dir):
    mp4list = [f for f in os.listdir(video_dir) if f.endswith('.mp4')]
    if len(mp4list) > 0:
        mp4list.sort(key=lambda x: os.path.getmtime(os.path.join(video_dir, x)), reverse=True)
        file = mp4list[0]
        video = open(os.path.join(video_dir, file), 'rb').read()
        b64 = base64.b64encode(video).decode()
        return HTML(f'<video width="100%" controls><source src="data:video/mp4;base64,{b64}" type="video/mp4"></video>')
    else:
        print("No video file found.")
        return HTML('<p>No video file found.</p>')

# --- Hyperparameters (Same) ---
INIT_LR = 0.0001
N_STEPS = 2048
BATCH_SIZE = 64
N_EPOCHS = 10
GAMMA = 0.99
ENT_COEF = 0.01
# Separate 64x64 policy and value networks (current SB3 net_arch format: a dict, not a list of dicts)
POLICY_KWARGS = dict(net_arch=dict(pi=[64, 64], vf=[64, 64]))
# --- Model Initialization and Training ---
env_id = "MjollnirOrbit-v33"
# Register the new V33 environment
gym.envs.registration.register(id=env_id, entry_point=MjollnirEnvV33, max_episode_steps=3000)
print(f"Starting Mjölnir training for {TOTAL_TIMESTEPS} timesteps using {env_id} (TIGHT ENTRY, REWARD FEEDBACK)...")
train_env = make_vec_env(env_id, n_envs=10, env_kwargs=dict(dt=1/30))
model = PPO(
    "MlpPolicy",
    train_env,
    learning_rate=linear_schedule(INIT_LR),
    n_steps=N_STEPS,
    batch_size=BATCH_SIZE,
    n_epochs=N_EPOCHS,
    gamma=GAMMA,
    ent_coef=ENT_COEF,
    policy_kwargs=POLICY_KWARGS,
    verbose=1,
    device="cuda"  # use "auto" to fall back to CPU when no GPU is available
)
# Start training the V33 model
model.learn(total_timesteps=TOTAL_TIMESTEPS)
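
# (Optional, not in the original script) Persist the trained policy so it can be
# reloaded later with PPO.load() instead of retraining; the filename is arbitrary.
model.save(os.path.join(LOG_DIR, "ppo_mjollnir_v33"))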
# --- Video Recording and Evaluation ---
print("\n--- Recording Trained Agent Performance ---")
# Ensure the correct V33 environment is used for evaluation
eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array", dt=1/30)])
video_length = 6000
eval_env = VecVideoRecorder(
    eval_env,
    VIDEO_DIR,
    record_video_trigger=lambda x: x == 0,
    video_length=video_length,
    name_prefix="ppo-mjollnir-agent-v33"
)
# VecEnv API: reset() returns only the batched observations (shape (n_envs, obs_dim))
obs = eval_env.reset()
for _ in range(video_length):
    action, _ = model.predict(obs, deterministic=True)
    # VecEnv API: step() returns (obs, rewards, dones, infos); finished sub-envs auto-reset
    obs, rewards, dones, infos = eval_env.step(action)
    if dones[0]:
        break
eval_env.close()
# --- Display Results ---
print("\n--- Video of Trained Agent ---")
show_video(VIDEO_DIR)