AndreVallestero · October 3, 2024 06:25
diff --git a/main.cpp b/main.cpp
 /*
 Simple benchmark for agent vision.

 This is specifically for finding the closest agent, which is not a very common use case in
  simulations. Most simulations use the data from each ray as an input parameter for their
  agents' neural net. This benchmark was made for buddhaman/AnymaEngine, which only takes
  the closest friendly and aggressive agent as inputs into the neural net. The results for
  32768 agents * 10 iterations without chunking are as follows (smaller is better):

  - 9.039s raycast: this is the old method used by AnymaEngine
  - 4.504s anglecheck_dot: this should have a performance profile similar to the existing
                implementation, except it uses a sqrt instead of atan
  - 4.204s anglecheck_cross: this only has multiplication and addition/subtraction, all of
                which can be done in 1 clock cycle. *This method gets my recommendation for
                AnymaEngine*.
  - 1.421s anglecheck_cross_simd: This is by far the fastest, but probably requires an
                engine rewrite since it needs all the agent data to be in an ECS pattern
                (contiguous blocks of memory) in order to efficiently leverage SIMD.

  The main difference in this benchmark compared to AnymaEngine is that there is no
    chunking, and instead of a raw angle, the agent's orientation is stored as a unit
    vector. No chunking was a deliberate choice to reduce complexity and to scale the load
    more easily. Unit vectors should be used instead of rads/degrees due to being simpler
    and faster in almost every scenario, especially if there are any plans to do SIMD or
    GPU offloading.

 build instructions:
  call vcvars64
  cl main.cpp /O2 /openmp /arch:AVX /MT /W3 /std:c++20 /DNDEBUG
 */

 #include <omp.h>
 #include <iostream>
 #include <random>
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <time.h>
 #include <cmath>
 #include <immintrin.h>

 #define NUM_AGENTS 32768 // Must be a power of 2 for the simd implementation to work (fixable, but needs extra logic to handle the remainder, and I'm lazy)
 #define AGENT_RANGE 70
 #define ARENA_SIZE 140
 #define AGENT_SIZE 1
 #define AGENT_FOV 72

 #define UNIT_VEC_36_X 0.809016994
 #define UNIT_VEC_36_Y 0.58778525229
 #define UNIT_VEC_12_X 0.97814760073
 #define UNIT_VEC_12_Y 0.20791169081
 float RAYS[4][2] = {{UNIT_VEC_36_X, -UNIT_VEC_36_Y}, {UNIT_VEC_12_X, -UNIT_VEC_12_Y}, {UNIT_VEC_12_X, UNIT_VEC_12_Y}, {UNIT_VEC_36_X, UNIT_VEC_36_Y}};

 struct Agent {
    float x;
    float y;
    float angle_x;
    float angle_y;
 };

 struct World {
    float* agent_x;
    float* agent_y;
    float* agent_angle_x;
    float* agent_angle_y;
 };

 /*
 x, y initial vector
 dx, dy rotation vector
 tx, ty resultant vector
 */
 inline void rotate_unit_vector(float x, float y, float dx, float dy, float* tx, float* ty) {
    *tx = x*dx - y*dy;
    *ty = x*dy + y*dx;
 }

 int* test_raycast(Agent* agents) {
    int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);

    #pragma omp parallel for 
    for (int i = 0; i < NUM_AGENTS; i++) {
        Agent agent = agents[i];
        float closest_dist = AGENT_RANGE * AGENT_RANGE;
        int closest_idx = -1;

        for (int j = 0; j < 4; j++) {
            float rx, ry;
            rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[j][0], RAYS[j][1], &rx, &ry);

            for (int k = 0; k < NUM_AGENTS; k++) {
                if (i == k) continue;

                // Relative vector from target to agent (order is important)
                Agent target = agents[k];
                float tx = agent.x - target.x;
                float ty = agent.y - target.y;

                // Coefficients of the quadratic equation
                // a is implicitly 1 for unit vectors
                float b = 2 * (rx * tx + ry * ty);
                float c = tx * tx + ty * ty - AGENT_SIZE * AGENT_SIZE;

                // overlapping
                //if (c < 0) continue;

                // Discriminant
                float discriminant = b * b - 4 * c;
                if (discriminant < 0) continue;

                // Calculate the two potential t values
                float sqrt_discriminant = sqrt(discriminant);
                float t1 = (-b - sqrt_discriminant) / 2;
                float t2 = (-b + sqrt_discriminant) / 2;

                // Check if either intersection is within the ray's range and is the closest target
                if (0 <= t1 && t1 <= AGENT_RANGE && t1 < closest_dist) {
                    closest_dist = t1;
                    closest_idx = k;
                }
                if (0 <= t2 && t2 <= AGENT_RANGE && t2 < closest_dist) {
                    closest_dist = t2;
                    closest_idx = k;
                }
            }
        }
        results[i] = closest_idx;
    }
    return results;
 }

 int* test_anglecheck_dot(Agent* agents) {
    int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);

    #pragma omp parallel for 
    for (int i = 0; i < NUM_AGENTS; i++) {
        Agent agent = agents[i];
        float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
        int closest_idx = -1;

        // Create left and right
        float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
        rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
        rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);
        float dot_left_right = left_bound_x * right_bound_x + left_bound_y * right_bound_y;

        for (int j = 0; j < NUM_AGENTS; j++) {
            if (i == j) continue;

            // Get relative vector from agent to target then normalize
            Agent target = agents[j];
            float tx = target.x - agent.x;
            float ty = target.y - agent.y;
            float t_dist_sq = tx * tx + ty * ty; // Use squared distance to avoid sqrt

            // Early exit if the target is outside of range, even without this optimization, this method is 2x faster than raycast
            if (t_dist_sq > AGENT_RANGE * AGENT_RANGE || t_dist_sq > closest_dist_sq) continue; 

            // Normalize tx, ty vector (divide by t_dist which is sqrt of t_dist_sq)
            float t_dist = sqrt(t_dist_sq); // sqrt here only when needed
            tx /= t_dist;
            ty /= t_dist;

            // Calculate dot products for the left and right bounds
            float dot_left = left_bound_x * tx + left_bound_y * ty;
            float dot_right = right_bound_x * tx + right_bound_y * ty;

            // Check if target is within the field of view and closer than the current closest
            if (dot_left > dot_left_right && dot_right > dot_left_right) {
                closest_dist_sq = t_dist_sq;
                closest_idx = j;
            }
        }

        results[i] = closest_idx;
    }
    return results;
 }

 int* test_anglecheck_cross(Agent* agents) {
    int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);

    #pragma omp parallel for 
    for (int i = 0; i < NUM_AGENTS; i++) {
        Agent agent = agents[i];
        float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
        int closest_idx = -1;

        // Create left and right
        float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
        rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
        rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);

        for (int j = 0; j < NUM_AGENTS; j++) {
            if (i == j) continue;

            // Get relative vector from agent to target
            Agent target = agents[j];
            float tx = target.x - agent.x;
            float ty = target.y - agent.y;

            // Skip if out of range, or could not be the closest
            float t_dist_sq = tx * tx + ty * ty;
            if (t_dist_sq > AGENT_RANGE * AGENT_RANGE || t_dist_sq >= closest_dist_sq) continue; 

            // Skip if to the left of the left ray
            float cross_1 = left_bound_x * ty - left_bound_y * tx;
            if (cross_1 < 0) continue;

            // Skip if to the right of the right ray
            float cross_2 = tx * right_bound_y - ty * right_bound_x;
            if (cross_2 < 0) continue;

            // Update closest
            closest_dist_sq = t_dist_sq;
            closest_idx = j;
        }
        results[i] = closest_idx;
    }
    return results;
 }

 int* test_anglecheck_cross_simd(World* world) {
    int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);

    #pragma omp parallel for 
    for (int i = 0; i < NUM_AGENTS; i++) {
        int closest_idx = -1;
        float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
        __m256 closest_dist_sq_v = _mm256_set1_ps(closest_dist_sq);

        // Create left and right boundary vectors
        float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
        rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
        rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);

        // Iterate over agents in chunks of 8 (assuming AVX which handles 8 floats at once)
        for (int j = 0; j < NUM_AGENTS; j += 8) {
            // Load target agent positions (x and y)
            __m256 target_x = _mm256_loadu_ps(&world->agent_x[j]);  // Load 8 floats starting from agents[j].x
            __m256 target_y = _mm256_loadu_ps(&world->agent_y[j]);  // Load 8 floats starting from agents[j].y

            // Compute relative vector: tx = target.x - agent.x, ty = target.y - agent.y
            __m256 tx = _mm256_sub_ps(target_x, _mm256_set1_ps(world->agent_x[i]));
            __m256 ty = _mm256_sub_ps(target_y, _mm256_set1_ps(world->agent_y[i]));

            // Compute squared distance: t_dist_sq = tx * tx + ty * ty
            __m256 t_dist_sq = _mm256_fmadd_ps(tx, tx, _mm256_mul_ps(ty, ty)); // fused multiply add

            // Mask for agents within range
            __m256 range_mask = _mm256_cmp_ps(t_dist_sq, _mm256_set1_ps(AGENT_RANGE * AGENT_RANGE), _CMP_LE_OQ);

            // Compute cross products for boundary checks
            __m256 cross_1 = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(left_bound_x), ty),
                                           _mm256_mul_ps(_mm256_set1_ps(left_bound_y), tx));
            __m256 cross_2 = _mm256_sub_ps(_mm256_mul_ps(tx, _mm256_set1_ps(right_bound_y)),
                                           _mm256_mul_ps(ty, _mm256_set1_ps(right_bound_x)));
                                              
            // Mask for agents between the rays (cross_1 > 0 and cross_2 > 0)
            __m256 ray_mask = _mm256_and_ps(_mm256_cmp_ps(cross_1, _mm256_setzero_ps(), _CMP_GT_OQ),
                                            _mm256_cmp_ps(cross_2, _mm256_setzero_ps(), _CMP_GT_OQ));

            // Combine range_mask and ray_mask to get the final valid targets
            __m256 valid_mask = _mm256_and_ps(range_mask, ray_mask);

            // Store the valid_mask and t_dist_sq results to memory to extract individual elements
            float valid_mask_arr[8], t_dist_sq_arr[8];
            _mm256_storeu_ps(valid_mask_arr, valid_mask);
            _mm256_storeu_ps(t_dist_sq_arr, t_dist_sq);

            // Extract results and check closest distance
            for (int k = 0; k < 8; ++k) {
                int candidate_idx = j + k;
                if (valid_mask_arr[k] && t_dist_sq_arr[k] < closest_dist_sq && candidate_idx != i) {
                    closest_dist_sq = t_dist_sq_arr[k];
                    closest_idx = candidate_idx;
                }
            }
        }
        results[i] = closest_idx;
    }

    return results;
 }

 void print_results(Agent* agents, int* results) {
    for (int i = 0; i < NUM_AGENTS; i++)
        std::cout << "agent: " << i << " pos " << agents[i].x << " " <<  agents[i].y << "\tangle " <<  agents[i].angle_x << " " << agents[i].angle_y
            << "|\ttarget: " << results[i] << " pos " << (results[i] != -1 ? agents[results[i]].x : -1) << " " << (results[i] != -1 ? agents[results[i]].y : -1)  << std::endl;
 }

 int main() {
    std::cout << "Generating " << NUM_AGENTS << " in a " << ARENA_SIZE << "*" << ARENA_SIZE << " area" << std::endl;
    std::minstd_rand gen(std::random_device{}());
    std::uniform_real_distribution<float> dist_arena(0, ARENA_SIZE);
    std::uniform_real_distribution<float> dist_ori_x(-1, 1);
    std::uniform_int_distribution<int> dist_ori_sign(0, 1);
    Agent agents[NUM_AGENTS];

    // This is used for an ECS-based methods, or methods than require contiguous data, for example, SIMD
    World* world = (World*)malloc(sizeof(World)); 
    world->agent_x = (float*)malloc(sizeof(float) * NUM_AGENTS);
    world->agent_y = (float*)malloc(sizeof(float) * NUM_AGENTS);
    world->agent_angle_x = (float*)malloc(sizeof(float) * NUM_AGENTS);
    world->agent_angle_y = (float*)malloc(sizeof(float) * NUM_AGENTS);

    for (int i = 0; i < NUM_AGENTS; i++) {
        Agent* agent = &agents[i];
        agent->x = dist_arena(gen);
        agent->y = dist_arena(gen);
        agent->angle_x = dist_ori_x(gen);
        agent->angle_y = sqrt(1 - agent->angle_x * agent->angle_x) * (dist_ori_sign(gen) * 2 - 1);

        world->agent_x[i] = agent->x;
        world->agent_y[i] = agent->y;
        world->agent_angle_x[i] = agent->angle_x;
        world->agent_angle_y[i] = agent->angle_y;
    }

    // All methods should return an array containing the indices of the nearest agent, or -1 if there is none
    clock_t t = clock();
    for (int i = 0; i < 10; i++)
        free(test_raycast(agents));
    t = clock() - t;
    //print_results(agents, results);
    std::cout << "Raycast took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;

    t = clock();
    for (int i = 0; i < 10; i++)
        free(test_anglecheck_dot(agents));
    t = clock() - t;
    //print_results(agents, results);
    std::cout << "Anglecheck_dot took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;

    t = clock();
    for (int i = 0; i < 10; i++)
        free(test_anglecheck_cross(agents));
    t = clock() - t;
    //print_results(agents, results);
    std::cout << "Anglecheck_cross took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;
    
    t = clock();
    for (int i = 0; i < 10; i++)
        free(test_anglecheck_cross_simd(world));
    t = clock() - t;
    //print_results(agents, results);
    std::cout << "Anglecheck_cross_simd took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;

    std::cout << "Terminated successfully" << std::endl;
    return 0;
 }
	/*
	Simple benchmark for agent vision.

	This is specifically for finding the closest agent, which is not a very common use case in
	simulations. Most simulations use the data from each ray as an input parameter for their
	agents' neural net. This benchmark was made for buddhaman/AnymaEngine, which only takes
	the closest friendly and aggressive agent as inputs into the neural net. The results for
	32768 agents * 10 iterations without chunking are as follows (smaller is better):

	- 9.039s raycast: this is the old method used by AnymaEngine
	- 4.504s anglecheck_dot: this should have a performance profile similar to the existing
	implementation, except it uses a sqrt instead of atan
	- 4.204s anglecheck_cross: this only has multiplication and addition/subtraction, all of
	which can be done in 1 clock cycle. *This method gets my recommendation for
	AnymaEngine*.
	- 1.421s anglecheck_cross_simd: This is by far the fastest, but probably requires an
	engine rewrite since it needs all the agent data to be in an ECS pattern
	(contiguous blocks of memory) in order to efficiently leverage SIMD.

	The main difference in this benchmark compared to AnymaEngine is that there is no
	chunking, and instead of a raw angle, the agent's orientation is stored as a unit
	vector. No chunking was a deliberate choice to reduce complexity and to scale the load
	more easily. Unit vectors should be used instead of rads/degrees due to being simpler
	and faster in almost every scenario, especially if there are any plans to do SIMD or
	GPU offloading.

	build instructions:
	call vcvars64
	cl main.cpp /O2 /openmp /arch:AVX /MT /W3 /std:c++20 /DNDEBUG
	*/

	#include <omp.h>
	#include <iostream>
	#include <random>
	#define _USE_MATH_DEFINES
	#include <math.h>
	#include <time.h>
	#include <cmath>
	#include <immintrin.h>

	#define NUM_AGENTS 32768 // Must be a power of 2 for the simd implementation to work (fixable, but needs extra logic to handle the remainder, and I'm lazy)
	#define AGENT_RANGE 70
	#define ARENA_SIZE 140
	#define AGENT_SIZE 1
	#define AGENT_FOV 72

	#define UNIT_VEC_36_X 0.809016994
	#define UNIT_VEC_36_Y 0.58778525229
	#define UNIT_VEC_12_X 0.97814760073
	#define UNIT_VEC_12_Y 0.20791169081
	float RAYS[4][2] = {{UNIT_VEC_36_X, -UNIT_VEC_36_Y}, {UNIT_VEC_12_X, -UNIT_VEC_12_Y}, {UNIT_VEC_12_X, UNIT_VEC_12_Y}, {UNIT_VEC_36_X, UNIT_VEC_36_Y}};

	struct Agent {
	float x;
	float y;
	float angle_x;
	float angle_y;
	};

	struct World {
	float* agent_x;
	float* agent_y;
	float* agent_angle_x;
	float* agent_angle_y;
	};

	/*
	x, y initial vector
	dx, dy rotation vector
	tx, ty resultant vector
	*/
	inline void rotate_unit_vector(float x, float y, float dx, float dy, float* tx, float* ty) {
	tx = xdx - y*dy;
	ty = xdy + y*dx;
	}

	int* test_raycast(Agent* agents) {
	int* results = (int)malloc(sizeof(int) NUM_AGENTS);

	#pragma omp parallel for
	for (int i = 0; i < NUM_AGENTS; i++) {
	Agent agent = agents[i];
	float closest_dist = AGENT_RANGE * AGENT_RANGE;
	int closest_idx = -1;

	for (int j = 0; j < 4; j++) {
	float rx, ry;
	rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[j][0], RAYS[j][1], &rx, &ry);

	for (int k = 0; k < NUM_AGENTS; k++) {
	if (i == k) continue;

	// Relative vector from target to agent (order is important)
	Agent target = agents[k];
	float tx = agent.x - target.x;
	float ty = agent.y - target.y;

	// Coefficients of the quadratic equation
	// a is implicitly 1 for unit vectors
	float b = 2 * (rx * tx + ry * ty);
	float c = tx * tx + ty * ty - AGENT_SIZE * AGENT_SIZE;

	// overlapping
	//if (c < 0) continue;

	// Discriminant
	float discriminant = b * b - 4 * c;
	if (discriminant < 0) continue;

	// Calculate the two potential t values
	float sqrt_discriminant = sqrt(discriminant);
	float t1 = (-b - sqrt_discriminant) / 2;
	float t2 = (-b + sqrt_discriminant) / 2;

	// Check if either intersection is within the ray's range and is the closest target
	if (0 <= t1 && t1 <= AGENT_RANGE && t1 < closest_dist) {
	closest_dist = t1;
	closest_idx = k;
	}
	if (0 <= t2 && t2 <= AGENT_RANGE && t2 < closest_dist) {
	closest_dist = t2;
	closest_idx = k;
	}
	}
	}
	results[i] = closest_idx;
	}
	return results;
	}

	int* test_anglecheck_dot(Agent* agents) {
	int* results = (int)malloc(sizeof(int) NUM_AGENTS);

	#pragma omp parallel for
	for (int i = 0; i < NUM_AGENTS; i++) {
	Agent agent = agents[i];
	float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
	int closest_idx = -1;

	// Create left and right
	float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
	rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
	rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);
	float dot_left_right = left_bound_x * right_bound_x + left_bound_y * right_bound_y;

	for (int j = 0; j < NUM_AGENTS; j++) {
	if (i == j) continue;

	// Get relative vector from agent to target then normalize
	Agent target = agents[j];
	float tx = target.x - agent.x;
	float ty = target.y - agent.y;
	float t_dist_sq = tx * tx + ty * ty; // Use squared distance to avoid sqrt

	// Early exit if the target is outside of range, even without this optimization, this method is 2x faster than raycast
	if (t_dist_sq > AGENT_RANGE * AGENT_RANGE \|\| t_dist_sq > closest_dist_sq) continue;

	// Normalize tx, ty vector (divide by t_dist which is sqrt of t_dist_sq)
	float t_dist = sqrt(t_dist_sq); // sqrt here only when needed
	tx /= t_dist;
	ty /= t_dist;

	// Calculate dot products for the left and right bounds
	float dot_left = left_bound_x * tx + left_bound_y * ty;
	float dot_right = right_bound_x * tx + right_bound_y * ty;

	// Check if target is within the field of view and closer than the current closest
	if (dot_left > dot_left_right && dot_right > dot_left_right) {
	closest_dist_sq = t_dist_sq;
	closest_idx = j;
	}
	}

	results[i] = closest_idx;
	}
	return results;
	}

	int* test_anglecheck_cross(Agent* agents) {
	int* results = (int)malloc(sizeof(int) NUM_AGENTS);

	#pragma omp parallel for
	for (int i = 0; i < NUM_AGENTS; i++) {
	Agent agent = agents[i];
	float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
	int closest_idx = -1;

	// Create left and right
	float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
	rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
	rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);

	for (int j = 0; j < NUM_AGENTS; j++) {
	if (i == j) continue;

	// Get relative vector from agent to target
	Agent target = agents[j];
	float tx = target.x - agent.x;
	float ty = target.y - agent.y;

	// Skip if out of range, or could not be the closest
	float t_dist_sq = tx * tx + ty * ty;
	if (t_dist_sq > AGENT_RANGE * AGENT_RANGE \|\| t_dist_sq >= closest_dist_sq) continue;

	// Skip if to the left of the left ray
	float cross_1 = left_bound_x * ty - left_bound_y * tx;
	if (cross_1 < 0) continue;

	// Skip if to the right of the right ray
	float cross_2 = tx * right_bound_y - ty * right_bound_x;
	if (cross_2 < 0) continue;

	// Update closest
	closest_dist_sq = t_dist_sq;
	closest_idx = j;
	}
	results[i] = closest_idx;
	}
	return results;
	}

	int* test_anglecheck_cross_simd(World* world) {
	int* results = (int)malloc(sizeof(int) NUM_AGENTS);

	#pragma omp parallel for
	for (int i = 0; i < NUM_AGENTS; i++) {
	int closest_idx = -1;
	float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
	__m256 closest_dist_sq_v = _mm256_set1_ps(closest_dist_sq);

	// Create left and right boundary vectors
	float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
	rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
	rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);

	// Iterate over agents in chunks of 8 (assuming AVX which handles 8 floats at once)
	for (int j = 0; j < NUM_AGENTS; j += 8) {
	// Load target agent positions (x and y)
	__m256 target_x = _mm256_loadu_ps(&world->agent_x[j]); // Load 8 floats starting from agents[j].x
	__m256 target_y = _mm256_loadu_ps(&world->agent_y[j]); // Load 8 floats starting from agents[j].y

	// Compute relative vector: tx = target.x - agent.x, ty = target.y - agent.y
	__m256 tx = _mm256_sub_ps(target_x, _mm256_set1_ps(world->agent_x[i]));
	__m256 ty = _mm256_sub_ps(target_y, _mm256_set1_ps(world->agent_y[i]));

	// Compute squared distance: t_dist_sq = tx * tx + ty * ty
	__m256 t_dist_sq = _mm256_fmadd_ps(tx, tx, _mm256_mul_ps(ty, ty)); // fused multiply add

	// Mask for agents within range
	__m256 range_mask = _mm256_cmp_ps(t_dist_sq, _mm256_set1_ps(AGENT_RANGE * AGENT_RANGE), _CMP_LE_OQ);

	// Compute cross products for boundary checks
	__m256 cross_1 = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(left_bound_x), ty),
	_mm256_mul_ps(_mm256_set1_ps(left_bound_y), tx));
	__m256 cross_2 = _mm256_sub_ps(_mm256_mul_ps(tx, _mm256_set1_ps(right_bound_y)),
	_mm256_mul_ps(ty, _mm256_set1_ps(right_bound_x)));

	// Mask for agents between the rays (cross_1 > 0 and cross_2 > 0)
	__m256 ray_mask = _mm256_and_ps(_mm256_cmp_ps(cross_1, _mm256_setzero_ps(), _CMP_GT_OQ),
	_mm256_cmp_ps(cross_2, _mm256_setzero_ps(), _CMP_GT_OQ));

	// Combine range_mask and ray_mask to get the final valid targets
	__m256 valid_mask = _mm256_and_ps(range_mask, ray_mask);

	// Store the valid_mask and t_dist_sq results to memory to extract individual elements
	float valid_mask_arr[8], t_dist_sq_arr[8];
	_mm256_storeu_ps(valid_mask_arr, valid_mask);
	_mm256_storeu_ps(t_dist_sq_arr, t_dist_sq);

	// Extract results and check closest distance
	for (int k = 0; k < 8; ++k) {
	int candidate_idx = j + k;
	if (valid_mask_arr[k] && t_dist_sq_arr[k] < closest_dist_sq && candidate_idx != i) {
	closest_dist_sq = t_dist_sq_arr[k];
	closest_idx = candidate_idx;
	}
	}
	}
	results[i] = closest_idx;
	}

	return results;
	}

	void print_results(Agent* agents, int* results) {
	for (int i = 0; i < NUM_AGENTS; i++)
	std::cout << "agent: " << i << " pos " << agents[i].x << " " << agents[i].y << "\tangle " << agents[i].angle_x << " " << agents[i].angle_y
	<< "\|\ttarget: " << results[i] << " pos " << (results[i] != -1 ? agents[results[i]].x : -1) << " " << (results[i] != -1 ? agents[results[i]].y : -1) << std::endl;
	}

	int main() {
	std::cout << "Generating " << NUM_AGENTS << " in a " << ARENA_SIZE << "*" << ARENA_SIZE << " area" << std::endl;
	std::minstd_rand gen(std::random_device{}());
	std::uniform_real_distribution<float> dist_arena(0, ARENA_SIZE);
	std::uniform_real_distribution<float> dist_ori_x(-1, 1);
	std::uniform_int_distribution<int> dist_ori_sign(0, 1);
	Agent agents[NUM_AGENTS];

	// This is used for an ECS-based methods, or methods than require contiguous data, for example, SIMD
	World* world = (World*)malloc(sizeof(World));
	world->agent_x = (float)malloc(sizeof(float) NUM_AGENTS);
	world->agent_y = (float)malloc(sizeof(float) NUM_AGENTS);
	world->agent_angle_x = (float)malloc(sizeof(float) NUM_AGENTS);
	world->agent_angle_y = (float)malloc(sizeof(float) NUM_AGENTS);

	for (int i = 0; i < NUM_AGENTS; i++) {
	Agent* agent = &agents[i];
	agent->x = dist_arena(gen);
	agent->y = dist_arena(gen);
	agent->angle_x = dist_ori_x(gen);
	agent->angle_y = sqrt(1 - agent->angle_x * agent->angle_x) * (dist_ori_sign(gen) * 2 - 1);

	world->agent_x[i] = agent->x;
	world->agent_y[i] = agent->y;
	world->agent_angle_x[i] = agent->angle_x;
	world->agent_angle_y[i] = agent->angle_y;
	}

	// All methods should return an array containing the indices of the nearest agent, or -1 if there is none
	clock_t t = clock();
	for (int i = 0; i < 10; i++)
	free(test_raycast(agents));
	t = clock() - t;
	//print_results(agents, results);
	std::cout << "Raycast took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;

	t = clock();
	for (int i = 0; i < 10; i++)
	free(test_anglecheck_dot(agents));
	t = clock() - t;
	//print_results(agents, results);
	std::cout << "Anglecheck_dot took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;

	t = clock();
	for (int i = 0; i < 10; i++)
	free(test_anglecheck_cross(agents));
	t = clock() - t;
	//print_results(agents, results);
	std::cout << "Anglecheck_cross took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;

	t = clock();
	for (int i = 0; i < 10; i++)
	free(test_anglecheck_cross_simd(world));
	t = clock() - t;
	//print_results(agents, results);
	std::cout << "Anglecheck_cross_simd took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;

	std::cout << "Terminated successfully" << std::endl;
	return 0;
	}