Skip to content

Instantly share code, notes, and snippets.

@AndreVallestero
Last active October 3, 2024 06:25
Show Gist options
  • Save AndreVallestero/2f64a124efb9dbc9c061c9b54a93c7b5 to your computer and use it in GitHub Desktop.
Save AndreVallestero/2f64a124efb9dbc9c061c9b54a93c7b5 to your computer and use it in GitHub Desktop.
Simple benchmark for agent vision.
/*
Simple benchmark for agent vision.
This is specifically for finding the closest agent, which is not a very common use case in
simulations. Most simulations use the data from each ray as an input parameter for their
agents' neural net. This benchmark was made for buddhaman/AnymaEngine, which only takes
the closest friendly and aggressive agent as inputs into the neural net. The results for
32768 agents * 10 iterations without chunking are as follows (smaller is better):
- 9.039s raycast: this is the old method used by AnymaEngine
- 4.504s anglecheck_dot: this should have a performance profile similar to the existing
implementation, except it uses a sqrt instead of atan
- 4.204s anglecheck_cross: this only has multiplication and addition/subtraction, all of
which can be done in 1 clock cycle. *This method gets my recommendation for
AnymaEngine*.
- 1.421s anglecheck_cross_simd: This is by far the fastest, but probably requires an
engine rewrite since it needs all the agent data to be in an ECS pattern
(contiguous blocks of memory) in order to efficiently leverage SIMD.
The main difference in this benchmark compared to AnymaEngine is that there is no
chunking, and instead of a raw angle, the agent's orientation is stored as a unit
vector. No chunking was a deliberate choice to reduce complexity and to scale the load
more easily. Unit vectors should be used instead of rads/degrees due to being simpler
and faster in almost every scenario, especially if there are any plans to do SIMD or
GPU offloading.
build instructions:
call vcvars64
cl main.cpp /O2 /openmp /arch:AVX /MT /W3 /std:c++20 /DNDEBUG
*/
#include <omp.h>
#include <iostream>
#include <random>
#define _USE_MATH_DEFINES
#include <math.h>
#include <time.h>
#include <cmath>
#include <immintrin.h>
#define NUM_AGENTS 32768 // Must be a power of 2 for the simd implementation to work (fixable, but needs extra logic to handle the remainder, and I'm lazy)
#define AGENT_RANGE 70
#define ARENA_SIZE 140
#define AGENT_SIZE 1
#define AGENT_FOV 72
#define UNIT_VEC_36_X 0.809016994
#define UNIT_VEC_36_Y 0.58778525229
#define UNIT_VEC_12_X 0.97814760073
#define UNIT_VEC_12_Y 0.20791169081
float RAYS[4][2] = {{UNIT_VEC_36_X, -UNIT_VEC_36_Y}, {UNIT_VEC_12_X, -UNIT_VEC_12_Y}, {UNIT_VEC_12_X, UNIT_VEC_12_Y}, {UNIT_VEC_36_X, UNIT_VEC_36_Y}};
struct Agent {
float x;
float y;
float angle_x;
float angle_y;
};
struct World {
float* agent_x;
float* agent_y;
float* agent_angle_x;
float* agent_angle_y;
};
/*
x, y initial vector
dx, dy rotation vector
tx, ty resultant vector
*/
inline void rotate_unit_vector(float x, float y, float dx, float dy, float* tx, float* ty) {
*tx = x*dx - y*dy;
*ty = x*dy + y*dx;
}
int* test_raycast(Agent* agents) {
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);
#pragma omp parallel for
for (int i = 0; i < NUM_AGENTS; i++) {
Agent agent = agents[i];
float closest_dist = AGENT_RANGE * AGENT_RANGE;
int closest_idx = -1;
for (int j = 0; j < 4; j++) {
float rx, ry;
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[j][0], RAYS[j][1], &rx, &ry);
for (int k = 0; k < NUM_AGENTS; k++) {
if (i == k) continue;
// Relative vector from target to agent (order is important)
Agent target = agents[k];
float tx = agent.x - target.x;
float ty = agent.y - target.y;
// Coefficients of the quadratic equation
// a is implicitly 1 for unit vectors
float b = 2 * (rx * tx + ry * ty);
float c = tx * tx + ty * ty - AGENT_SIZE * AGENT_SIZE;
// overlapping
//if (c < 0) continue;
// Discriminant
float discriminant = b * b - 4 * c;
if (discriminant < 0) continue;
// Calculate the two potential t values
float sqrt_discriminant = sqrt(discriminant);
float t1 = (-b - sqrt_discriminant) / 2;
float t2 = (-b + sqrt_discriminant) / 2;
// Check if either intersection is within the ray's range and is the closest target
if (0 <= t1 && t1 <= AGENT_RANGE && t1 < closest_dist) {
closest_dist = t1;
closest_idx = k;
}
if (0 <= t2 && t2 <= AGENT_RANGE && t2 < closest_dist) {
closest_dist = t2;
closest_idx = k;
}
}
}
results[i] = closest_idx;
}
return results;
}
int* test_anglecheck_dot(Agent* agents) {
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);
#pragma omp parallel for
for (int i = 0; i < NUM_AGENTS; i++) {
Agent agent = agents[i];
float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
int closest_idx = -1;
// Create left and right
float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);
float dot_left_right = left_bound_x * right_bound_x + left_bound_y * right_bound_y;
for (int j = 0; j < NUM_AGENTS; j++) {
if (i == j) continue;
// Get relative vector from agent to target then normalize
Agent target = agents[j];
float tx = target.x - agent.x;
float ty = target.y - agent.y;
float t_dist_sq = tx * tx + ty * ty; // Use squared distance to avoid sqrt
// Early exit if the target is outside of range, even without this optimization, this method is 2x faster than raycast
if (t_dist_sq > AGENT_RANGE * AGENT_RANGE || t_dist_sq > closest_dist_sq) continue;
// Normalize tx, ty vector (divide by t_dist which is sqrt of t_dist_sq)
float t_dist = sqrt(t_dist_sq); // sqrt here only when needed
tx /= t_dist;
ty /= t_dist;
// Calculate dot products for the left and right bounds
float dot_left = left_bound_x * tx + left_bound_y * ty;
float dot_right = right_bound_x * tx + right_bound_y * ty;
// Check if target is within the field of view and closer than the current closest
if (dot_left > dot_left_right && dot_right > dot_left_right) {
closest_dist_sq = t_dist_sq;
closest_idx = j;
}
}
results[i] = closest_idx;
}
return results;
}
int* test_anglecheck_cross(Agent* agents) {
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);
#pragma omp parallel for
for (int i = 0; i < NUM_AGENTS; i++) {
Agent agent = agents[i];
float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
int closest_idx = -1;
// Create left and right
float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);
for (int j = 0; j < NUM_AGENTS; j++) {
if (i == j) continue;
// Get relative vector from agent to target
Agent target = agents[j];
float tx = target.x - agent.x;
float ty = target.y - agent.y;
// Skip if out of range, or could not be the closest
float t_dist_sq = tx * tx + ty * ty;
if (t_dist_sq > AGENT_RANGE * AGENT_RANGE || t_dist_sq >= closest_dist_sq) continue;
// Skip if to the left of the left ray
float cross_1 = left_bound_x * ty - left_bound_y * tx;
if (cross_1 < 0) continue;
// Skip if to the right of the right ray
float cross_2 = tx * right_bound_y - ty * right_bound_x;
if (cross_2 < 0) continue;
// Update closest
closest_dist_sq = t_dist_sq;
closest_idx = j;
}
results[i] = closest_idx;
}
return results;
}
int* test_anglecheck_cross_simd(World* world) {
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);
#pragma omp parallel for
for (int i = 0; i < NUM_AGENTS; i++) {
int closest_idx = -1;
float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
__m256 closest_dist_sq_v = _mm256_set1_ps(closest_dist_sq);
// Create left and right boundary vectors
float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);
// Iterate over agents in chunks of 8 (assuming AVX which handles 8 floats at once)
for (int j = 0; j < NUM_AGENTS; j += 8) {
// Load target agent positions (x and y)
__m256 target_x = _mm256_loadu_ps(&world->agent_x[j]); // Load 8 floats starting from agents[j].x
__m256 target_y = _mm256_loadu_ps(&world->agent_y[j]); // Load 8 floats starting from agents[j].y
// Compute relative vector: tx = target.x - agent.x, ty = target.y - agent.y
__m256 tx = _mm256_sub_ps(target_x, _mm256_set1_ps(world->agent_x[i]));
__m256 ty = _mm256_sub_ps(target_y, _mm256_set1_ps(world->agent_y[i]));
// Compute squared distance: t_dist_sq = tx * tx + ty * ty
__m256 t_dist_sq = _mm256_fmadd_ps(tx, tx, _mm256_mul_ps(ty, ty)); // fused multiply add
// Mask for agents within range
__m256 range_mask = _mm256_cmp_ps(t_dist_sq, _mm256_set1_ps(AGENT_RANGE * AGENT_RANGE), _CMP_LE_OQ);
// Compute cross products for boundary checks
__m256 cross_1 = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(left_bound_x), ty),
_mm256_mul_ps(_mm256_set1_ps(left_bound_y), tx));
__m256 cross_2 = _mm256_sub_ps(_mm256_mul_ps(tx, _mm256_set1_ps(right_bound_y)),
_mm256_mul_ps(ty, _mm256_set1_ps(right_bound_x)));
// Mask for agents between the rays (cross_1 > 0 and cross_2 > 0)
__m256 ray_mask = _mm256_and_ps(_mm256_cmp_ps(cross_1, _mm256_setzero_ps(), _CMP_GT_OQ),
_mm256_cmp_ps(cross_2, _mm256_setzero_ps(), _CMP_GT_OQ));
// Combine range_mask and ray_mask to get the final valid targets
__m256 valid_mask = _mm256_and_ps(range_mask, ray_mask);
// Store the valid_mask and t_dist_sq results to memory to extract individual elements
float valid_mask_arr[8], t_dist_sq_arr[8];
_mm256_storeu_ps(valid_mask_arr, valid_mask);
_mm256_storeu_ps(t_dist_sq_arr, t_dist_sq);
// Extract results and check closest distance
for (int k = 0; k < 8; ++k) {
int candidate_idx = j + k;
if (valid_mask_arr[k] && t_dist_sq_arr[k] < closest_dist_sq && candidate_idx != i) {
closest_dist_sq = t_dist_sq_arr[k];
closest_idx = candidate_idx;
}
}
}
results[i] = closest_idx;
}
return results;
}
void print_results(Agent* agents, int* results) {
for (int i = 0; i < NUM_AGENTS; i++)
std::cout << "agent: " << i << " pos " << agents[i].x << " " << agents[i].y << "\tangle " << agents[i].angle_x << " " << agents[i].angle_y
<< "|\ttarget: " << results[i] << " pos " << (results[i] != -1 ? agents[results[i]].x : -1) << " " << (results[i] != -1 ? agents[results[i]].y : -1) << std::endl;
}
int main() {
std::cout << "Generating " << NUM_AGENTS << " in a " << ARENA_SIZE << "*" << ARENA_SIZE << " area" << std::endl;
std::minstd_rand gen(std::random_device{}());
std::uniform_real_distribution<float> dist_arena(0, ARENA_SIZE);
std::uniform_real_distribution<float> dist_ori_x(-1, 1);
std::uniform_int_distribution<int> dist_ori_sign(0, 1);
Agent agents[NUM_AGENTS];
// This is used for an ECS-based methods, or methods than require contiguous data, for example, SIMD
World* world = (World*)malloc(sizeof(World));
world->agent_x = (float*)malloc(sizeof(float) * NUM_AGENTS);
world->agent_y = (float*)malloc(sizeof(float) * NUM_AGENTS);
world->agent_angle_x = (float*)malloc(sizeof(float) * NUM_AGENTS);
world->agent_angle_y = (float*)malloc(sizeof(float) * NUM_AGENTS);
for (int i = 0; i < NUM_AGENTS; i++) {
Agent* agent = &agents[i];
agent->x = dist_arena(gen);
agent->y = dist_arena(gen);
agent->angle_x = dist_ori_x(gen);
agent->angle_y = sqrt(1 - agent->angle_x * agent->angle_x) * (dist_ori_sign(gen) * 2 - 1);
world->agent_x[i] = agent->x;
world->agent_y[i] = agent->y;
world->agent_angle_x[i] = agent->angle_x;
world->agent_angle_y[i] = agent->angle_y;
}
// All methods should return an array containing the indices of the nearest agent, or -1 if there is none
clock_t t = clock();
for (int i = 0; i < 10; i++)
free(test_raycast(agents));
t = clock() - t;
//print_results(agents, results);
std::cout << "Raycast took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;
t = clock();
for (int i = 0; i < 10; i++)
free(test_anglecheck_dot(agents));
t = clock() - t;
//print_results(agents, results);
std::cout << "Anglecheck_dot took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;
t = clock();
for (int i = 0; i < 10; i++)
free(test_anglecheck_cross(agents));
t = clock() - t;
//print_results(agents, results);
std::cout << "Anglecheck_cross took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;
t = clock();
for (int i = 0; i < 10; i++)
free(test_anglecheck_cross_simd(world));
t = clock() - t;
//print_results(agents, results);
std::cout << "Anglecheck_cross_simd took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl;
std::cout << "Terminated successfully" << std::endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment