Created
May 30, 2013 10:14
-
-
Save mrmekon/5676948 to your computer and use it in GitHub Desktop.
Compares the performance of fixed-point vs floating-point arithmetic on iPhone 5 (Apple A6 processor) by evaluating a contrived 'movement update' algorithm over a large array of 'particle' structs. Intended to identify best practices for developing an efficient particle engine. I recommend testing on the device with 'Release' versions, using -O3…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
//  main.m
//  fp_profiler
//
//  Compares the performance of fixed-point vs floating-point for simple
//  "particle engine" movement loops.
//
//  Optimized for iPhone 5 (Apple A6 / ARMv7)
//
//  Created by Trevor Bentley on 5/29/13.
//  Copyright (c) 2013 Trevor Bentley. All rights reserved.
//
#import <dispatch/dispatch.h>
#include <stdint.h>
/*
 * Compile-time assertion: compiler_assert(e) fails to compile when `e` is
 * false, because the enumerator initializer becomes a division by zero.
 *
 * The enumerator name is derived from __LINE__ so the macro can be used more
 * than once in the same scope (a single fixed name would be redefined by the
 * second use). Two assertions in one scope must therefore sit on different
 * line numbers.
 */
#define COMPILER_ASSERT_PASTE_(a, b) a##b
#define COMPILER_ASSERT_PASTE(a, b) COMPILER_ASSERT_PASTE_(a, b)
#define compiler_assert(e) \
    enum { COMPILER_ASSERT_PASTE(compiler_assert_at_line_, __LINE__) = 1/(!!(e)) }
/***********************************************************************
 ** FIXED POINT
 ***********************************************************************/

/*
 * f23Q9: signed 32-bit fixed-point value with F23Q9_SHIFT (9) fractional
 * bits, i.e. the stored integer is the real value times F23Q9_SCALE (512).
 */
typedef int32_t f23Q9;

#define F23Q9_SHIFT 9
#define F23Q9_SCALE (1<<F23Q9_SHIFT)

/*
 * Integer -> fixed. Multiplies by the scale instead of left-shifting so that
 * negative inputs do not left-shift a negative value (undefined in C).
 * Unsigned inputs that hold a wrapped negative (e.g. 0xFFFFFFFB) are first
 * reinterpreted as int32_t, matching what the shift-based form produced.
 */
#define INT2F23Q9(val)   ( (f23Q9)    ((int32_t)(val) * F23Q9_SCALE ) )
/* Float -> fixed; the fractional remainder below 1/512 is truncated by the cast. */
#define FLOAT2F23Q9(val) ( (f23Q9)    ((val) * (float)F23Q9_SCALE ) )
/* Fixed -> unsigned integer via right shift (intended for non-negative values). */
#define F23Q92UINT(val)  ( (uint32_t) ((val) >> F23Q9_SHIFT ) )
/* Fixed -> signed integer; integer division truncates toward zero. */
#define F23Q92INT(val)   ( (int32_t)  ((val) / F23Q9_SCALE ) )
/* Fixed -> float. */
#define F23Q92FLOAT(val) ( (float)    ((val) / (float)F23Q9_SCALE ) )

/*
 * Fixed * fixed. The raw product carries 18 fractional bits and does not fit
 * in 32 bits for moderate operands (e.g. 3000 * 3), so it is formed in 64
 * bits before being shifted back down to 9 fractional bits.
 */
#define F23Q9_MUL(a,b)   ( (f23Q9)    (((int64_t)(a) * (b)) >> F23Q9_SHIFT) )
/*
 * Fixed / fixed. The numerator is pre-scaled in 64 bits for the same overflow
 * reason as F23Q9_MUL. `b` must be non-zero: integer division by zero is
 * undefined behaviour.
 * NOTE(review): the 64-bit divide is likely slower than a 32-bit one on
 * 32-bit targets; re-measure if the absolute benchmark numbers matter.
 */
#define F23Q9_DIV(a,b)   ( (f23Q9)    (((int64_t)(a) * F23Q9_SCALE) / (b) ) )
/***********************************************************************
 ** PARTICLES
 ***********************************************************************/

/* Byte alignment requested for the two global particle pools. */
#define CACHELINE_SIZE 64
/* Byte alignment requested for each individual particle struct. */
#define PARTICLE_ALIGNMENT 32
/* Number of particles held in each pool (4 * 2^20 entries). */
#define PARTICLE_POOL_SIZE (4*1024*1024)
/*
 * Particle whose motion state is stored in f23Q9 fixed point.
 *
 *   x, y    integer position, advanced by velocity * scale on each update
 *   color   four packed bytes (r, g, b, a)
 *   vx, vy  velocity
 *   ax, ay  acceleration folded into the velocity on each update
 *   scale   per-particle factor applied in the update step
 */
typedef struct {
    int_fast16_t x;
    int_fast16_t y;
    struct {
        uint8_t r;
        uint8_t g;
        uint8_t b;
        uint8_t a;
    } __attribute__((packed)) color;
    f23Q9 vx, vy;
    f23Q9 ax, ay;
    f23Q9 scale;
} __attribute__((aligned (PARTICLE_ALIGNMENT))) particle_t;
/*
 * Particle whose motion state is stored in single-precision floats.
 * Field-for-field counterpart of the fixed-point particle struct.
 *
 *   x, y    integer position, advanced by velocity * scale on each update
 *   color   four packed bytes (r, g, b, a)
 *   vx, vy  velocity
 *   ax, ay  acceleration folded into the velocity on each update
 *   scale   per-particle factor applied in the update step
 */
typedef struct {
    int_fast16_t x;
    int_fast16_t y;
    struct {
        uint8_t r;
        uint8_t g;
        uint8_t b;
        uint8_t a;
    } __attribute__((packed)) color;
    float vx, vy;
    float ax, ay;
    float scale;
} __attribute__((aligned (PARTICLE_ALIGNMENT))) fparticle_t;
/*
 * Global particle pools, one per numeric representation. Both are statically
 * allocated with PARTICLE_POOL_SIZE entries and CACHELINE_SIZE alignment.
 */
particle_t  __attribute__((aligned (CACHELINE_SIZE))) particles[PARTICLE_POOL_SIZE];
fparticle_t __attribute__((aligned (CACHELINE_SIZE))) fparticles[PARTICLE_POOL_SIZE];

/* Each particle must occupy no more than one PARTICLE_ALIGNMENT-sized slot. */
compiler_assert(sizeof(particle_t) <= PARTICLE_ALIGNMENT);
/*
 * Same check for the float particle, which was previously unchecked. It uses
 * a negative-array-size typedef rather than a second compiler_assert() so the
 * two checks cannot clash over the identifier that macro declares.
 */
typedef char fparticle_fits_alignment_check[(sizeof(fparticle_t) <= PARTICLE_ALIGNMENT) ? 1 : -1];
/***********************************************************************
 ** PROFILER DEFINITIONS
 ***********************************************************************/

/* How many times main() runs each benchmark; it reports the mean time. */
#define PROFILER_LOOP_COUNT 15

/* Self-check that logs results of the fixed-point macros. */
void test_fixed_macros(void);

/* Pool preparation. */
void particles_memset(void);
void particles_init(void);

/* Benchmarks. They share one signature so they can sit in a single table. */
void particles_async(dispatch_queue_t *dqueue);
void particles_applyloop(dispatch_queue_t *dqueue);
void particles_intsync(dispatch_queue_t *dqueue);
void particles_float(dispatch_queue_t *dqueue);
/***********************************************************************
 ** PROFILER FUNCTIONS
 ***********************************************************************/

/* Benchmarks in the order main() runs them; the index is the logged id. */
static void (*funcs[])(dispatch_queue_t *dqueue) = {
    particles_async,
    particles_applyloop,
    particles_intsync,
    particles_float,
};

/*
 * Number of table entries. Divides by the table's own element size rather
 * than by the size of an unrelated function-pointer type, so the count stays
 * right even if the element type changes.
 */
static const int func_count = sizeof(funcs)/sizeof(funcs[0]);
/*
 * Fills both particle pools with a recognisable non-zero byte pattern before
 * they are initialised.
 *
 * memset() converts its fill argument to unsigned char, so the 0xDEADBEEF
 * that used to be passed here only ever wrote its low byte, 0xEF. The byte
 * is now spelled out, which keeps the resulting memory contents identical
 * while avoiding the misleading (and warning-prone) 32-bit constant.
 */
void particles_memset(void) {
    const int fillByte = 0xEF;

    /* sizeof on the arrays themselves covers every element of each pool. */
    memset(particles, fillByte, sizeof(particles));
    memset(fparticles, fillByte, sizeof(fparticles));
}
/*
 * Returns a pseudo-random signed integer in [-5, 4].
 *
 * arc4random() returns an unsigned 32-bit value, so the remainder is cast to
 * a signed type *before* subtracting; `arc4random() % 10 - 5` evaluated in
 * unsigned arithmetic wraps to roughly 4.29e9 for every draw below 5.
 */
static inline int32_t particles_random_offset(void) {
    return (int32_t)(arc4random() % 10) - 5;
}

/*
 * Seeds every entry of both particle pools with pseudo-random state.
 *
 * Positions are in [0, 199] and all four colour bytes are set to 255.
 * Fixed-point velocities and accelerations are whole numbers in [-5, 4];
 * float velocities and accelerations are in [-1.0, 0.8] in steps of 0.2.
 *
 * `scale` is never zero: the update loops divide by it, and a zero divisor is
 * undefined behaviour for the integer path and produces inf/NaN (then an
 * undefined float-to-integer conversion) on the float path. Fixed-point scale
 * is 1 or 2; float scale is in [0.05, 0.5] in steps of 0.05.
 */
void particles_init(void) {
    for (int i = 0; i < PARTICLE_POOL_SIZE; ++i) {
        particle_t *p = &particles[i];

        p->x = arc4random() % 200;
        p->y = arc4random() % 200;
        memset(&(p->color), 255, sizeof(p->color));

        /*
         * Convert to fixed point by multiplying by the scale factor; this
         * avoids left-shifting a negative value.
         */
        p->vx = (f23Q9)(particles_random_offset() * F23Q9_SCALE);
        p->vy = (f23Q9)(particles_random_offset() * F23Q9_SCALE);
        p->ax = (f23Q9)(particles_random_offset() * F23Q9_SCALE);
        p->ay = (f23Q9)(particles_random_offset() * F23Q9_SCALE);
        p->scale = (f23Q9)((int32_t)(arc4random() % 2 + 1) * F23Q9_SCALE);
    }

    for (int i = 0; i < PARTICLE_POOL_SIZE; ++i) {
        fparticle_t *fp = &fparticles[i];

        fp->x = arc4random() % 200;
        fp->y = arc4random() % 200;
        memset(&(fp->color), 255, sizeof(fp->color));

        fp->vx = particles_random_offset() / 5.0f;
        fp->vy = particles_random_offset() / 5.0f;
        fp->ax = particles_random_offset() / 5.0f;
        fp->ay = particles_random_offset() / 5.0f;
        fp->scale = (arc4random() % 10 + 1) / 20.0f;
    }
}
/*
 * One fixed-point movement update for the particle that the local variable
 * `p` (a particle_t *) points at, followed by advancing `p` to the next one:
 *
 *   x  += int(vx * scale)
 *   y  += int(vy * scale)
 *   vx += scale * ax
 *   vy += ay / scale
 *
 * NOTE(review): the last step divides by p->scale, so callers need every
 * particle's scale to be non-zero.
 */
#define F23Q9_PARTICLE_STEP() \
    p->x = (int_fast16_t)(p->x + F23Q92INT(F23Q9_MUL(p->vx, p->scale))); \
    p->y = (int_fast16_t)(p->y + F23Q92INT(F23Q9_MUL(p->vy, p->scale))); \
    p->vx = p->vx + F23Q9_MUL(p->scale,p->ax); \
    p->vy = p->vy + F23Q9_DIV(p->ay,p->scale); \
    ++p

/*
 * Manually unrolled pair of updates: processes two consecutive particles and
 * leaves `p` two entries further on. Use as a statement, with a trailing `;`.
 */
#define F23Q9_UNROLLED_BLOCK() \
    do { \
        F23Q9_PARTICLE_STEP(); \
        F23Q9_PARTICLE_STEP(); \
    } while (0)
/*
 * Fixed-point benchmark: updates the pool as two halves, each submitted to
 * the queue with dispatch_async, then blocks on an empty barrier block until
 * both halves have finished.
 *
 * dqueue: pointer to the dispatch queue the work is submitted to.
 */
void particles_async(dispatch_queue_t *dqueue) {
    /* Each unrolled block handles two particles, so a half needs size/4 blocks. */
    const int blocksPerHalf = PARTICLE_POOL_SIZE >> 2;

    /* First half of the pool. */
    dispatch_async(*dqueue, ^{
        particle_t *p = particles;
        int remaining = blocksPerHalf;
        while (remaining-- > 0) {
            F23Q9_UNROLLED_BLOCK();
        }
    });

    /* Second half of the pool. */
    dispatch_async(*dqueue, ^{
        particle_t *p = &particles[PARTICLE_POOL_SIZE >> 1];
        int remaining = blocksPerHalf;
        while (remaining-- > 0) {
            F23Q9_UNROLLED_BLOCK();
        }
    });

    /* Wait for everything submitted above before returning to the timer. */
    dispatch_barrier_sync(*dqueue, ^{});
}
/*
 * Fixed-point benchmark: updates the pool as two halves using a two-iteration
 * dispatch_apply, which returns once both iterations have completed.
 *
 * dqueue: pointer to the dispatch queue the iterations are submitted to.
 */
void particles_applyloop(dispatch_queue_t *dqueue) {
    dispatch_apply(2, *dqueue, ^(size_t half) {
        /* Iteration 0 starts at the front of the pool, iteration 1 midway. */
        particle_t *p = &particles[(half * PARTICLE_POOL_SIZE) >> 1];

        /* Each unrolled block handles two particles: size/4 blocks per half. */
        int remaining = PARTICLE_POOL_SIZE >> 2;
        while (remaining-- > 0) {
            F23Q9_UNROLLED_BLOCK();
        }
    });
}
/*
 * Fixed-point benchmark: updates the whole pool on the calling thread.
 *
 * dqueue: unused; present so the function matches the benchmark signature.
 */
void particles_intsync(dispatch_queue_t *dqueue) {
    particle_t *p = particles;

    /* Each unrolled block handles two particles, hence size/2 blocks. */
    int remaining = PARTICLE_POOL_SIZE >> 1;
    while (remaining-- > 0) {
        F23Q9_UNROLLED_BLOCK();
    }
}
/*
 * Floating-point benchmark: updates the whole float pool on the calling
 * thread, two particles per loop iteration (manually unrolled).
 *
 * Per particle:
 *   x  += scale * vx   (truncated back to the integer position type)
 *   y  += scale * vy
 *   vx += scale * ax
 *   vy += ay / scale
 *
 * dqueue: unused; present so the function matches the benchmark signature.
 */
void particles_float(dispatch_queue_t *dqueue) {
    fparticle_t *const poolEnd = fparticles + PARTICLE_POOL_SIZE;

    for (fparticle_t *pair = fparticles; pair < poolEnd; pair += 2) {
        fparticle_t *const first = pair;
        fparticle_t *const second = pair + 1;

        first->x = (int_fast16_t)(first->x + first->scale*first->vx);
        first->y = (int_fast16_t)(first->y + first->scale*first->vy);
        first->vx = first->vx + first->scale*first->ax;
        first->vy = first->vy + first->ay/first->scale;

        second->x = (int_fast16_t)(second->x + second->scale*second->vx);
        second->y = (int_fast16_t)(second->y + second->scale*second->vy);
        second->vx = second->vx + second->scale*second->ax;
        second->vy = second->vy + second->ay/second->scale;
    }
}
/*
 * Logs the results of the fixed-point conversion, multiplication and division
 * macros for a handful of positive, negative, integer and fractional inputs
 * so they can be checked by eye. Nothing is asserted.
 */
void test_fixed_macros(void) {
    const int positiveInt = 127;
    const uint16_t unsignedShort = 1;
    const int negativeInt = -128;
    const float positiveFloat = 4100.12345f;
    const float negativeFloat = -14.45678f;

    /* Round trips: source value -> raw fixed-point bits -> converted back. */
    NSLog(@"Conversion:");
    const f23Q9 fixedPositiveInt = INT2F23Q9(positiveInt);
    NSLog(@"%d -> %x -> %d", positiveInt, fixedPositiveInt, F23Q92INT(fixedPositiveInt));
    const f23Q9 fixedUnsignedShort = INT2F23Q9(unsignedShort);
    NSLog(@"%u -> %x -> %u", unsignedShort, fixedUnsignedShort, F23Q92UINT(fixedUnsignedShort));
    const f23Q9 fixedNegativeInt = INT2F23Q9(negativeInt);
    NSLog(@"%d -> %x -> %d", negativeInt, fixedNegativeInt, F23Q92INT(fixedNegativeInt));
    const f23Q9 fixedPositiveFloat = FLOAT2F23Q9(positiveFloat);
    NSLog(@"%f -> %x -> %f", positiveFloat, fixedPositiveFloat, F23Q92FLOAT(fixedPositiveFloat));
    const f23Q9 fixedNegativeFloat = FLOAT2F23Q9(negativeFloat);
    NSLog(@"%f -> %x -> %f", negativeFloat, fixedNegativeFloat, F23Q92FLOAT(fixedNegativeFloat));

    /* Operands shared by the multiplication and division checks. */
    const f23Q9 three = INT2F23Q9(3);
    const f23Q9 four = INT2F23Q9(4);
    const f23Q9 minusThree = INT2F23Q9(-3);
    const f23Q9 minusFour = INT2F23Q9(-4);
    const f23Q9 onePointTwoFive = FLOAT2F23Q9(1.25);
    const f23Q9 twoPointThreeSix = FLOAT2F23Q9(2.36);
    const f23Q9 minusOnePointTwoFive = FLOAT2F23Q9(-1.25);
    const f23Q9 minusTwoPointThreeSix = FLOAT2F23Q9(-2.36);

    NSLog(@"Multiplication:");
    NSLog(@"3*4 -> %d", F23Q92INT(F23Q9_MUL(three, four)));
    NSLog(@"3*-4 -> %d", F23Q92INT(F23Q9_MUL(three, minusFour)));
    NSLog(@"-3*-4 -> %d", F23Q92INT(F23Q9_MUL(minusThree, minusFour)));
    NSLog(@"1.25*2.36 -> %f", F23Q92FLOAT(F23Q9_MUL(onePointTwoFive, twoPointThreeSix)));
    NSLog(@"1.25*-2.36 -> %f", F23Q92FLOAT(F23Q9_MUL(onePointTwoFive, minusTwoPointThreeSix)));
    NSLog(@"-1.25*-2.36 -> %f", F23Q92FLOAT(F23Q9_MUL(minusOnePointTwoFive, minusTwoPointThreeSix)));

    NSLog(@"Division:");
    NSLog(@"3/4 -> %f", F23Q92FLOAT(F23Q9_DIV(three, four)));
    NSLog(@"3/-4 -> %f", F23Q92FLOAT(F23Q9_DIV(three, minusFour)));
    NSLog(@"-3/-4 -> %f", F23Q92FLOAT(F23Q9_DIV(minusThree, minusFour)));
    NSLog(@"1.25/2.36 -> %f", F23Q92FLOAT(F23Q9_DIV(onePointTwoFive, twoPointThreeSix)));
    NSLog(@"1.25/-2.36 -> %f", F23Q92FLOAT(F23Q9_DIV(onePointTwoFive, minusTwoPointThreeSix)));
    NSLog(@"-1.25/-2.36 -> %f", F23Q92FLOAT(F23Q9_DIV(minusOnePointTwoFive, minusTwoPointThreeSix)));
}
/*
 * Entry point: logs the fixed-point macro self-check, fills and seeds the
 * particle pools, then times each benchmark in `funcs` over
 * PROFILER_LOOP_COUNT runs and logs the mean wall-clock seconds per run.
 *
 * Never returns: once profiling is done the main thread is parked so the
 * process stays alive without consuming CPU.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    /* NSLog creates autoreleased objects; give them a pool to drain into. */
    @autoreleasepool {
        CFAbsoluteTime start, end;
        dispatch_queue_t dqueue = dispatch_queue_create("com.trevorbentley.fp_profiler", DISPATCH_QUEUE_CONCURRENT);

        test_fixed_macros();

        NSLog(@"Initializing...");
        particles_memset();
        particles_init();

        NSLog(@"Profiling...");
        for (int state = 0; state < func_count; state++) {
            start = CFAbsoluteTimeGetCurrent();
            for (int i = 0; i < PROFILER_LOOP_COUNT; i++) {
                funcs[state](&dqueue);
            }
            end = CFAbsoluteTimeGetCurrent();
            NSLog(@"%d: %f", state, (end-start)/PROFILER_LOOP_COUNT);
        }
        NSLog(@"Profiling finished.");
    }

    /*
     * Park the main thread instead of spinning in an empty loop, which kept
     * one core fully busy for as long as the process lived. dispatch_main()
     * does not return.
     */
    dispatch_main();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment