Last active
October 3, 2024 06:25
-
-
Save AndreVallestero/2f64a124efb9dbc9c061c9b54a93c7b5 to your computer and use it in GitHub Desktop.
Simple benchmark for agent vision.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Simple benchmark for agent vision. | |
This is specifically for finding the closest agent, which is not a very common use case in | |
simulations. Most simulations use the data from each ray as an input parameter for their | |
agents' neural net. This benchmark was made for buddhaman/AnymaEngine, which only takes | |
the closest friendly and aggressive agent as inputs into the neural net. The results for | |
32768 agents * 10 iterations without chunking are as follows (smaller is better): | |
- 9.039s raycast: this is the old method used by AnymaEngine | |
- 4.504s anglecheck_dot: this should have a performance profile similar to the existing | |
implementation, except it uses a sqrt instead of atan | |
- 4.204s anglecheck_cross: this only has multiplication and addition/subtraction, all of | |
which can be done in 1 clock cycle. *This method gets my recommendation for | |
AnymaEngine*. | |
- 1.421s anglecheck_cross_simd: This is by far the fastest, but probably requires an | |
engine rewrite since it needs all the agent data to be in an ECS pattern | |
(contiguous blocks of memory) in order to efficiently leverage SIMD. | |
The main difference in this benchmark compared to AnymaEngine is that there is no | |
chunking, and instead of a raw angle, the agent's orientation is stored as a unit | |
vector. No chunking was a deliberate choice to reduce complexity and to scale the load | |
more easily. Unit vectors should be used instead of rads/degrees due to being simpler | |
and faster in almost every scenario, especially if there are any plans to do SIMD or | |
GPU offloading. | |
build instructions: | |
call vcvars64 | |
cl main.cpp /O2 /openmp /arch:AVX /MT /W3 /std:c++20 /DNDEBUG | |
*/ | |
#include <omp.h> | |
#include <iostream> | |
#include <random> | |
#define _USE_MATH_DEFINES | |
#include <math.h> | |
#include <time.h> | |
#include <cmath> | |
#include <immintrin.h> | |
#define NUM_AGENTS 32768 // Must be a power of 2 for the simd implementation to work (fixable, but needs extra logic to handle the remainder, and I'm lazy) | |
#define AGENT_RANGE 70 | |
#define ARENA_SIZE 140 | |
#define AGENT_SIZE 1 | |
#define AGENT_FOV 72 | |
#define UNIT_VEC_36_X 0.809016994 | |
#define UNIT_VEC_36_Y 0.58778525229 | |
#define UNIT_VEC_12_X 0.97814760073 | |
#define UNIT_VEC_12_Y 0.20791169081 | |
float RAYS[4][2] = {{UNIT_VEC_36_X, -UNIT_VEC_36_Y}, {UNIT_VEC_12_X, -UNIT_VEC_12_Y}, {UNIT_VEC_12_X, UNIT_VEC_12_Y}, {UNIT_VEC_36_X, UNIT_VEC_36_Y}}; | |
struct Agent { | |
float x; | |
float y; | |
float angle_x; | |
float angle_y; | |
}; | |
struct World { | |
float* agent_x; | |
float* agent_y; | |
float* agent_angle_x; | |
float* agent_angle_y; | |
}; | |
/* | |
x, y initial vector | |
dx, dy rotation vector | |
tx, ty resultant vector | |
*/ | |
inline void rotate_unit_vector(float x, float y, float dx, float dy, float* tx, float* ty) { | |
*tx = x*dx - y*dy; | |
*ty = x*dy + y*dx; | |
} | |
int* test_raycast(Agent* agents) { | |
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS); | |
#pragma omp parallel for | |
for (int i = 0; i < NUM_AGENTS; i++) { | |
Agent agent = agents[i]; | |
float closest_dist = AGENT_RANGE * AGENT_RANGE; | |
int closest_idx = -1; | |
for (int j = 0; j < 4; j++) { | |
float rx, ry; | |
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[j][0], RAYS[j][1], &rx, &ry); | |
for (int k = 0; k < NUM_AGENTS; k++) { | |
if (i == k) continue; | |
// Relative vector from target to agent (order is important) | |
Agent target = agents[k]; | |
float tx = agent.x - target.x; | |
float ty = agent.y - target.y; | |
// Coefficients of the quadratic equation | |
// a is implicitly 1 for unit vectors | |
float b = 2 * (rx * tx + ry * ty); | |
float c = tx * tx + ty * ty - AGENT_SIZE * AGENT_SIZE; | |
// overlapping | |
//if (c < 0) continue; | |
// Discriminant | |
float discriminant = b * b - 4 * c; | |
if (discriminant < 0) continue; | |
// Calculate the two potential t values | |
float sqrt_discriminant = sqrt(discriminant); | |
float t1 = (-b - sqrt_discriminant) / 2; | |
float t2 = (-b + sqrt_discriminant) / 2; | |
// Check if either intersection is within the ray's range and is the closest target | |
if (0 <= t1 && t1 <= AGENT_RANGE && t1 < closest_dist) { | |
closest_dist = t1; | |
closest_idx = k; | |
} | |
if (0 <= t2 && t2 <= AGENT_RANGE && t2 < closest_dist) { | |
closest_dist = t2; | |
closest_idx = k; | |
} | |
} | |
} | |
results[i] = closest_idx; | |
} | |
return results; | |
} | |
int* test_anglecheck_dot(Agent* agents) { | |
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS); | |
#pragma omp parallel for | |
for (int i = 0; i < NUM_AGENTS; i++) { | |
Agent agent = agents[i]; | |
float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison | |
int closest_idx = -1; | |
// Create left and right | |
float left_bound_x, left_bound_y, right_bound_x, right_bound_y; | |
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y); | |
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y); | |
float dot_left_right = left_bound_x * right_bound_x + left_bound_y * right_bound_y; | |
for (int j = 0; j < NUM_AGENTS; j++) { | |
if (i == j) continue; | |
// Get relative vector from agent to target then normalize | |
Agent target = agents[j]; | |
float tx = target.x - agent.x; | |
float ty = target.y - agent.y; | |
float t_dist_sq = tx * tx + ty * ty; // Use squared distance to avoid sqrt | |
// Early exit if the target is outside of range, even without this optimization, this method is 2x faster than raycast | |
if (t_dist_sq > AGENT_RANGE * AGENT_RANGE || t_dist_sq > closest_dist_sq) continue; | |
// Normalize tx, ty vector (divide by t_dist which is sqrt of t_dist_sq) | |
float t_dist = sqrt(t_dist_sq); // sqrt here only when needed | |
tx /= t_dist; | |
ty /= t_dist; | |
// Calculate dot products for the left and right bounds | |
float dot_left = left_bound_x * tx + left_bound_y * ty; | |
float dot_right = right_bound_x * tx + right_bound_y * ty; | |
// Check if target is within the field of view and closer than the current closest | |
if (dot_left > dot_left_right && dot_right > dot_left_right) { | |
closest_dist_sq = t_dist_sq; | |
closest_idx = j; | |
} | |
} | |
results[i] = closest_idx; | |
} | |
return results; | |
} | |
int* test_anglecheck_cross(Agent* agents) { | |
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS); | |
#pragma omp parallel for | |
for (int i = 0; i < NUM_AGENTS; i++) { | |
Agent agent = agents[i]; | |
float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison | |
int closest_idx = -1; | |
// Create left and right | |
float left_bound_x, left_bound_y, right_bound_x, right_bound_y; | |
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y); | |
rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y); | |
for (int j = 0; j < NUM_AGENTS; j++) { | |
if (i == j) continue; | |
// Get relative vector from agent to target | |
Agent target = agents[j]; | |
float tx = target.x - agent.x; | |
float ty = target.y - agent.y; | |
// Skip if out of range, or could not be the closest | |
float t_dist_sq = tx * tx + ty * ty; | |
if (t_dist_sq > AGENT_RANGE * AGENT_RANGE || t_dist_sq >= closest_dist_sq) continue; | |
// Skip if to the left of the left ray | |
float cross_1 = left_bound_x * ty - left_bound_y * tx; | |
if (cross_1 < 0) continue; | |
// Skip if to the right of the right ray | |
float cross_2 = tx * right_bound_y - ty * right_bound_x; | |
if (cross_2 < 0) continue; | |
// Update closest | |
closest_dist_sq = t_dist_sq; | |
closest_idx = j; | |
} | |
results[i] = closest_idx; | |
} | |
return results; | |
} | |
int* test_anglecheck_cross_simd(World* world) { | |
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS); | |
#pragma omp parallel for | |
for (int i = 0; i < NUM_AGENTS; i++) { | |
int closest_idx = -1; | |
float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison | |
__m256 closest_dist_sq_v = _mm256_set1_ps(closest_dist_sq); | |
// Create left and right boundary vectors | |
float left_bound_x, left_bound_y, right_bound_x, right_bound_y; | |
rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y); | |
rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y); | |
// Iterate over agents in chunks of 8 (assuming AVX which handles 8 floats at once) | |
for (int j = 0; j < NUM_AGENTS; j += 8) { | |
// Load target agent positions (x and y) | |
__m256 target_x = _mm256_loadu_ps(&world->agent_x[j]); // Load 8 floats starting from agents[j].x | |
__m256 target_y = _mm256_loadu_ps(&world->agent_y[j]); // Load 8 floats starting from agents[j].y | |
// Compute relative vector: tx = target.x - agent.x, ty = target.y - agent.y | |
__m256 tx = _mm256_sub_ps(target_x, _mm256_set1_ps(world->agent_x[i])); | |
__m256 ty = _mm256_sub_ps(target_y, _mm256_set1_ps(world->agent_y[i])); | |
// Compute squared distance: t_dist_sq = tx * tx + ty * ty | |
__m256 t_dist_sq = _mm256_fmadd_ps(tx, tx, _mm256_mul_ps(ty, ty)); // fused multiply add | |
// Mask for agents within range | |
__m256 range_mask = _mm256_cmp_ps(t_dist_sq, _mm256_set1_ps(AGENT_RANGE * AGENT_RANGE), _CMP_LE_OQ); | |
// Compute cross products for boundary checks | |
__m256 cross_1 = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(left_bound_x), ty), | |
_mm256_mul_ps(_mm256_set1_ps(left_bound_y), tx)); | |
__m256 cross_2 = _mm256_sub_ps(_mm256_mul_ps(tx, _mm256_set1_ps(right_bound_y)), | |
_mm256_mul_ps(ty, _mm256_set1_ps(right_bound_x))); | |
// Mask for agents between the rays (cross_1 > 0 and cross_2 > 0) | |
__m256 ray_mask = _mm256_and_ps(_mm256_cmp_ps(cross_1, _mm256_setzero_ps(), _CMP_GT_OQ), | |
_mm256_cmp_ps(cross_2, _mm256_setzero_ps(), _CMP_GT_OQ)); | |
// Combine range_mask and ray_mask to get the final valid targets | |
__m256 valid_mask = _mm256_and_ps(range_mask, ray_mask); | |
// Store the valid_mask and t_dist_sq results to memory to extract individual elements | |
float valid_mask_arr[8], t_dist_sq_arr[8]; | |
_mm256_storeu_ps(valid_mask_arr, valid_mask); | |
_mm256_storeu_ps(t_dist_sq_arr, t_dist_sq); | |
// Extract results and check closest distance | |
for (int k = 0; k < 8; ++k) { | |
int candidate_idx = j + k; | |
if (valid_mask_arr[k] && t_dist_sq_arr[k] < closest_dist_sq && candidate_idx != i) { | |
closest_dist_sq = t_dist_sq_arr[k]; | |
closest_idx = candidate_idx; | |
} | |
} | |
} | |
results[i] = closest_idx; | |
} | |
return results; | |
} | |
void print_results(Agent* agents, int* results) { | |
for (int i = 0; i < NUM_AGENTS; i++) | |
std::cout << "agent: " << i << " pos " << agents[i].x << " " << agents[i].y << "\tangle " << agents[i].angle_x << " " << agents[i].angle_y | |
<< "|\ttarget: " << results[i] << " pos " << (results[i] != -1 ? agents[results[i]].x : -1) << " " << (results[i] != -1 ? agents[results[i]].y : -1) << std::endl; | |
} | |
int main() { | |
std::cout << "Generating " << NUM_AGENTS << " in a " << ARENA_SIZE << "*" << ARENA_SIZE << " area" << std::endl; | |
std::minstd_rand gen(std::random_device{}()); | |
std::uniform_real_distribution<float> dist_arena(0, ARENA_SIZE); | |
std::uniform_real_distribution<float> dist_ori_x(-1, 1); | |
std::uniform_int_distribution<int> dist_ori_sign(0, 1); | |
Agent agents[NUM_AGENTS]; | |
// This is used for an ECS-based methods, or methods than require contiguous data, for example, SIMD | |
World* world = (World*)malloc(sizeof(World)); | |
world->agent_x = (float*)malloc(sizeof(float) * NUM_AGENTS); | |
world->agent_y = (float*)malloc(sizeof(float) * NUM_AGENTS); | |
world->agent_angle_x = (float*)malloc(sizeof(float) * NUM_AGENTS); | |
world->agent_angle_y = (float*)malloc(sizeof(float) * NUM_AGENTS); | |
for (int i = 0; i < NUM_AGENTS; i++) { | |
Agent* agent = &agents[i]; | |
agent->x = dist_arena(gen); | |
agent->y = dist_arena(gen); | |
agent->angle_x = dist_ori_x(gen); | |
agent->angle_y = sqrt(1 - agent->angle_x * agent->angle_x) * (dist_ori_sign(gen) * 2 - 1); | |
world->agent_x[i] = agent->x; | |
world->agent_y[i] = agent->y; | |
world->agent_angle_x[i] = agent->angle_x; | |
world->agent_angle_y[i] = agent->angle_y; | |
} | |
// All methods should return an array containing the indices of the nearest agent, or -1 if there is none | |
clock_t t = clock(); | |
for (int i = 0; i < 10; i++) | |
free(test_raycast(agents)); | |
t = clock() - t; | |
//print_results(agents, results); | |
std::cout << "Raycast took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl; | |
t = clock(); | |
for (int i = 0; i < 10; i++) | |
free(test_anglecheck_dot(agents)); | |
t = clock() - t; | |
//print_results(agents, results); | |
std::cout << "Anglecheck_dot took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl; | |
t = clock(); | |
for (int i = 0; i < 10; i++) | |
free(test_anglecheck_cross(agents)); | |
t = clock() - t; | |
//print_results(agents, results); | |
std::cout << "Anglecheck_cross took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl; | |
t = clock(); | |
for (int i = 0; i < 10; i++) | |
free(test_anglecheck_cross_simd(world)); | |
t = clock() - t; | |
//print_results(agents, results); | |
std::cout << "Anglecheck_cross_simd took " << ((float)t)/CLOCKS_PER_SEC << " seconds" << std::endl; | |
std::cout << "Terminated successfully" << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment