Last active
March 26, 2025 08:19
-
-
Save vurtun/6a4f284f75e3133586b04b6692dac0fd to your computer and use it in GitHub Desktop.
animation sampling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <pthread.h> | |
#include <unistd.h> | |
#include <stdlib.h> | |
#define MAX_MDLS 2048 | |
#define MAX_ANIMS 256 | |
#define MAX_JOINTS 256 | |
#define ALIGN32 __attribute__((aligned(32))) | |
#if 0 | |
// ToDo instead of using fixed scalers, use start/range quantization. Also explore dynamic quantization | |
inline uint16_t EncodeFloat( float value, float const quantizationRangeStartValue, float const quantizationRangeLength ) | |
{ | |
EE_ASSERT( quantizationRangeLength != 0 ); | |
float const normalizedValue = ( value - quantizationRangeStartValue ) / quantizationRangeLength; | |
uint16_t const encodedValue = EncodeUnsignedNormalizedFloat<16>( normalizedValue ); | |
return encodedValue; | |
} | |
inline float DecodeFloat( uint16_t encodedValue, float const quantizationRangeStartValue, float const quantizationRangeLength ) | |
{ | |
EE_ASSERT( quantizationRangeLength != 0 ); | |
float const normalizedValue = DecodeUnsignedNormalizedFloat<16>( encodedValue ); | |
float const decodedValue = ( normalizedValue * quantizationRangeLength ) + quantizationRangeStartValue; | |
return decodedValue; | |
} | |
#endif | |
#define POS_SCALE 1000 // 0.001 precision (mm) | |
#define SCL_SCALE 10000 // 0.0001 precision | |
#define ROT_SCALE 10000 // 0.0001 precision for quaternions | |
struct anm { | |
int frame_cnt; | |
short *pos_x ALIGN32; | |
short *pos_y ALIGN32; | |
short *pos_z ALIGN32; | |
short *scl ALIGN32; | |
short *rot_x ALIGN32; | |
short *rot_y ALIGN32; | |
short *rot_z ALIGN32; | |
short *rot_w ALIGN32; | |
}; | |
struct mdl_anm { | |
int anim; | |
int frame; | |
int next_frame; | |
float between; | |
}; | |
struct mdl mdls[MAX_MDLS]; | |
struct anm anms[MAX_ANIMS]; | |
struct mdl_anm mdl_anms[MAX_MDLS]; | |
typedef struct { | |
float *out; | |
int start_m; | |
int end_m; | |
} thread_arg_t; | |
static pthread_t thread_pool[128]; | |
static volatile int running = 1; | |
static thread_arg_t thread_args[128]; | |
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; | |
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; | |
static int tasks_ready = 0; | |
static void | |
alloc_anim(struct anm *anm, int frame_cnt) { | |
anm->frame_cnt = frame_cnt; | |
size_t size = MAX_JOINTS * frame_cnt * sizeof(int16_t); | |
posix_memalign((void**)&anm->pos_x, 32, size); | |
posix_memalign((void**)&anm->pos_y, 32, size); | |
posix_memalign((void**)&anm->pos_z, 32, size); | |
posix_memalign((void**)&anm->scl, 32, size); | |
posix_memalign((void**)&anm->rot_x, 32, size); | |
posix_memalign((void**)&anm->rot_y, 32, size); | |
posix_memalign((void**)&anm->rot_z, 32, size); | |
posix_memalign((void**)&anm->rot_w, 32, size); | |
} | |
static void | |
load_anim(struct anm *anm, FILE *file) { | |
fread(&anm->frame_cnt, sizeof(int), 1, file); | |
alloc_anim(anm, anm->frame_cnt); | |
size_t size = MAX_JOINTS * anm->frame_cnt * sizeof(short); | |
fread(anm->pos_x, sizeof(short), size, file); | |
fread(anm->pos_y, sizeof(short), size, file); | |
fread(anm->pos_z, sizeof(short), size, file); | |
fread(anm->scl, sizeof(short), size, file); | |
fread(anm->rot_x, sizeof(short), size, file); | |
fread(anm->rot_y, sizeof(short), size, file); | |
fread(anm->rot_z, sizeof(short), size, file); | |
fread(anm->rot_w, sizeof(short), size, file); | |
} | |
static void | |
anim_calc_pose_thread(void *arg) { | |
thread_arg_t *targ = (thread_arg_t *)arg; | |
float *out = targ->out; | |
int start_m = targ->start_m; | |
int end_m = targ->end_m; | |
__m256 pos_scale = _mm256_set1_ps(1.0f / POS_SCALE); | |
__m256 scl_scale = _mm256_set1_ps(1.0f / SCL_SCALE); | |
__m256 rot_scale = _mm256_set1_ps(1.0f / ROT_SCALE); | |
for (int m = start_m; m < end_m; m++) { | |
int a = mdl_anms[m].anim; | |
const struct anm *anm = &anms[a]; | |
int frm = mdl_anms[m].frame; | |
int nxt_frm = mdl_anms[m].next_frame; | |
float t = mdl_anms[m].between; | |
__m256 t_vec = _mm256_set1_ps(t); | |
__m256 one_minus_t = _mm256_set1_ps(1.0f - t); | |
const int frm_off = frm * MAX_JOINTS; | |
const int nxt_off = nxt_frm * MAX_JOINTS; | |
_mm_prefetch((const char*)&anm->pos_x[nxt_off], _MM_HINT_T0); | |
_mm_prefetch((const char*)&anm->pos_y[nxt_off], _MM_HINT_T0); | |
_mm_prefetch((const char*)&anm->pos_z[nxt_off], _MM_HINT_T0); | |
_mm_prefetch((const char*)&anm->scl[nxt_off], _MM_HINT_T0); | |
_mm_prefetch((const char*)&anm->rot_x[nxt_off], _MM_HINT_T0); | |
_mm_prefetch((const char*)&anm->rot_y[nxt_off], _MM_HINT_T0); | |
_mm_prefetch((const char*)&anm->rot_z[nxt_off], _MM_HINT_T0); | |
_mm_prefetch((const char*)&anm->rot_w[nxt_off], _MM_HINT_T0); | |
float *out_ptr = out + m * MAX_JOINTS * 8; | |
for (int j = 0; j < MAX_JOINTS; j += 8) { | |
__m256i px0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->pos_x[frm_off + j])); | |
__m256i px1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->pos_x[nxt_off + j])); | |
__m256i py0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->pos_y[frm_off + j])); | |
__m256i py1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->pos_y[nxt_off + j])); | |
__m256i pz0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->pos_z[frm_off + j])); | |
__m256i pz1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->pos_z[nxt_off + j])); | |
__m256i scl0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->scl[frm_off + j])); | |
__m256i scl1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->scl[nxt_off + j])); | |
__m256 px0_f = _mm256_cvtepi32_ps(px0); | |
__m256 px1_f = _mm256_cvtepi32_ps(px1); | |
__m256 py0_f = _mm256_cvtepi32_ps(py0); | |
__m256 py1_f = _mm256_cvtepi32_ps(py1); | |
__m256 pz0_f = _mm256_cvtepi32_ps(pz0); | |
__m256 pz1_f = _mm256_cvtepi32_ps(pz1); | |
__m256 scl0_f = _mm256_cvtepi32_ps(scl0); | |
__m256 scl1_f = _mm256_cvtepi32_ps(scl1); | |
__m256 px_interp = _mm256_fmadd_ps(px1_f, t_vec, _mm256_mul_ps(px0_f, one_minus_t)); | |
__m256 py_interp = _mm256_fmadd_ps(py1_f, t_vec, _mm256_mul_ps(py0_f, one_minus_t)); | |
__m256 pz_interp = _mm256_fmadd_ps(pz1_f, t_vec, _mm256_mul_ps(pz0_f, one_minus_t)); | |
__m256 scl_interp = _mm256_fmadd_ps(scl1_f, t_vec, _mm256_mul_ps(scl0_f, one_minus_t)); | |
px_interp = _mm256_mul_ps(px_interp, pos_scale); | |
py_interp = _mm256_mul_ps(py_interp, pos_scale); | |
pz_interp = _mm256_mul_ps(pz_interp, pos_scale); | |
scl_interp = _mm256_mul_ps(scl_interp, scl_scale); | |
__m256i rx0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_x[frm_off + j])); | |
__m256i rx1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_x[nxt_off + j])); | |
__m256i ry0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_y[frm_off + j])); | |
__m256i ry1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_y[nxt_off + j])); | |
__m256i rz0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_z[frm_off + j])); | |
__m256i rz1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_z[nxt_off + j])); | |
__m256i rw0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_w[frm_off + j])); | |
__m256i rw1 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&anm->rot_w[nxt_off + j])); | |
__m256 rx0_f = _mm256_cvtepi32_ps(rx0); | |
__m256 rx1_f = _mm256_cvtepi32_ps(rx1); | |
__m256 ry0_f = _mm256_cvtepi32_ps(ry0); | |
__m256 ry1_f = _mm256_cvtepi32_ps(ry1); | |
__m256 rz0_f = _mm256_cvtepi32_ps(rz0); | |
__m256 rz1_f = _mm256_cvtepi32_ps(rz1); | |
__m256 rw0_f = _mm256_cvtepi32_ps(rw0); | |
__m256 rw1_f = _mm256_cvtepi32_ps(rw1); | |
__m256 rx_interp = _mm256_fmadd_ps(rx1_f, t_vec, _mm256_mul_ps(rx0_f, one_minus_t)); | |
__m256 ry_interp = _mm256_fmadd_ps(ry1_f, t_vec, _mm256_mul_ps(ry0_f, one_minus_t)); | |
__m256 rz_interp = _mm256_fmadd_ps(rz1_f, t_vec, _mm256_mul_ps(rz0_f, one_minus_t)); | |
__m256 rw_interp = _mm256_fmadd_ps(rw1_f, t_vec, _mm256_mul_ps(rw0_f, one_minus_t)); | |
__m256 q_sq = _mm256_fmadd_ps(rx_interp, rx_interp, _mm256_mul_ps(ry_interp, ry_interp)); | |
q_sq = _mm256_fmadd_ps(rz_interp, rz_interp, q_sq); | |
q_sq = _mm256_fmadd_ps(rw_interp, rw_interp, q_sq); | |
__m256 q_inv_mag = _mm256_rsqrt_ps(q_sq); | |
rx_interp = _mm256_mul_ps(rx_interp, q_inv_mag); | |
ry_interp = _mm256_mul_ps(ry_interp, q_inv_mag); | |
rz_interp = _mm256_mul_ps(rz_interp, q_inv_mag); | |
rw_interp = _mm256_mul_ps(rw_interp, q_inv_mag); | |
rx_interp = _mm256_mul_ps(rx_interp, rot_scale); | |
ry_interp = _mm256_mul_ps(ry_interp, rot_scale); | |
rz_interp = _mm256_mul_ps(rz_interp, rot_scale); | |
rw_interp = _mm256_mul_ps(rw_interp, rot_scale); | |
_mm256_store_ps(out_ptr + j * 8, px_interp); | |
_mm256_store_ps(out_ptr + j * 8 + 8, py_interp); | |
_mm256_store_ps(out_ptr + j * 8 + 16, pz_interp); | |
_mm256_store_ps(out_ptr + j * 8 + 24, scl_interp); | |
_mm256_store_ps(out_ptr + j * 8 + 32, rx_interp); | |
_mm256_store_ps(out_ptr + j * 8 + 40, ry_interp); | |
_mm256_store_ps(out_ptr + j * 8 + 48, rz_interp); | |
_mm256_store_ps(out_ptr + j * 8 + 56, rw_interp); | |
} | |
} | |
} | |
static void* | |
worker(void *arg) { | |
int tid = *(int*)arg; | |
while (running) { | |
pthread_mutex_lock(&mutex); | |
while (!tasks_ready) pthread_cond_wait(&cond, &mutex); | |
thread_arg_t targ = thread_args[tid]; | |
pthread_mutex_unlock(&mutex); | |
anim_calc_pose_thread(&targ); | |
} | |
return NULL; | |
} | |
static void | |
anim_calc_pose(float *restrict out) { | |
int num_cores = sysconf(_SC_NPROCESSORS_ONLN); | |
if (num_cores > 128) num_cores = 128; | |
if (num_cores < 1) num_cores = 1; | |
static int initialized = 0; | |
if (!initialized) { | |
for (int t = 0; t < num_cores; t++) { | |
int *tid = malloc(sizeof(int)); | |
*tid = t; | |
pthread_create(&thread_pool[t], NULL, worker, tid); | |
} | |
initialized = 1; | |
} | |
int models_per_thread = MAX_MDLS / num_cores; | |
int remainder = MAX_MDLS % num_cores; | |
pthread_mutex_lock(&mutex); | |
for (int t = 0; t < num_cores; t++) { | |
thread_args[t].out = out; | |
thread_args[t].start_m = t * models_per_thread + (t < remainder ? t : remainder); | |
thread_args[t].end_m = thread_args[t].start_m + models_per_thread + (t < remainder ? 1 : 0); | |
if (t == num_cores - 1) { | |
thread_args[t].end_m = MAX_MDLS; | |
} | |
} | |
tasks_ready = 1; | |
pthread_cond_broadcast(&cond); | |
pthread_mutex_unlock(&mutex); | |
sched_yield(); | |
pthread_mutex_lock(&mutex); | |
tasks_ready = 0; | |
pthread_mutex_unlock(&mutex); | |
} | |
extern int | |
main(void) { | |
mlockall(MCL_CURRENT); | |
return 0; | |
} | |
#if 1 | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <math.h> | |
#include "ufbx.h" | |
#define MAX_JOINTS 256 | |
#define MAX_FRAMES 1024 | |
#define MAX_COUNT (MAX_JOINTS*MAX_FRAMES) | |
static struct anm_data { | |
int frame_cnt; | |
short pos_x[MAX_COUNT]; | |
short pos_y[MAX_COUNT]; | |
short pos_z[MAX_COUNT]; | |
short scl[MAX_COUNT]; | |
short rot_x[MAX_COUNT]; | |
short rot_y[MAX_COUNT]; | |
short rot_z[MAX_COUNT]; | |
short rot_w[MAX_COUNT]; | |
} anm_data data; | |
static void | |
die(const char *fmt, ...) { | |
va_list args; | |
va_start(args, fmt); | |
vfprintf(stderr, fmt, args); | |
fprintf(stderr, "\n"); | |
va_end(args); | |
exit(1); | |
} | |
static int | |
estrtol(const char *str) { | |
char *ep = NULL; | |
long n = strtol(str, &ep, 10); | |
if (*ep != '\0' || ep == str) { | |
die("Invalid number: %s\n", str); | |
} | |
if (n < INT_MIN || n > INT_MAX) { | |
die("Number: %d is out of range\n", n); | |
} | |
return cast(int, n); | |
} | |
extern int | |
main(int argc, char *argv[]) { | |
if (argc != 3) { | |
die("Usage: %s <input.fbx> <animation> <first frame> <last frame> <fps> <output.bin>\n", argv[0]); | |
} | |
const char *input_path = argv[1]; | |
const char *animation = argv[2]; | |
int frame_start = estrtol(argv[3]); | |
int frame_end = estrtol(argv[4]); | |
int fps = estrtol(argv[5]); | |
const char *output_path = argv[6]; | |
// load FBX file | |
ufbx_load_opts opts = {0}; | |
ufbx_error error; | |
ufbx_scene *scene = ufbx_load_file(input_path, &opts, &error); | |
if (!scene) { | |
die(stderr, "Failed to load FBX: %s\n", error.description.data); | |
} | |
// find animation | |
ufbx_anim_stack *stk = 0; | |
if (scene->anim_stacks.count == 1u ) { | |
stk = scene->anim_stacks[ 0u ]; | |
} else { | |
int stk_idx = 0; | |
for (stk_idx = 0; stk_idx < scene->anim_stacks.count; stk_idx++ ) { | |
stk = scene->anim_stacks[ stk_idx ]; | |
if (!memcmp( stk->name.data, animation, stk->name.length)) { | |
break; | |
} | |
} | |
if (stk_idx >= scene->anim_stacks.count ) { | |
ufbx_free_scene(scene); | |
die(stderr, "No animations found in FBX file with name %s\n", animation); | |
} | |
} | |
ufbx_anim *anim = stk->anim; | |
// precompute and write animation data | |
data.frame_cnt = (int)((anim->time_end - anim->time_begin) * fps + 0.5); | |
if (data.frame_cnt < 1) { | |
data.frame_cnt = 1; | |
} | |
frame_start = data.frame_cnt > frame_start ? frame_cnt - 1 : frame_start; | |
frame_start = frame_start < 0 ? 0 : frame_start; | |
frame_end = frame_end < 0 ? data.frame_cnt : frame_end; | |
frame_end = frame_end > datas.frame_cnt ? data.frame_cnt : frame_end; | |
if (frame_end <= frame_start) { | |
ufbx_free_scene(scene); | |
die(stderr, "Frame count range %d to %d is invalid\n", frame_start, frame_end); | |
} | |
if (frame_end - frame_start > MAX_FRAMES) { | |
ufbx_free_scene(scene); | |
die(stderr, "Frame count range from %d to %d greater than max frames %d\n", frame_start, frame_end, MAX_FRAMES); | |
} | |
data.frame_cnt = frame_end - frame_start; | |
int joint_cnt; | |
const ufbx_node* joint_nodes[MAX_JOINTS] = {0}; | |
for (int n = 0; n < scnene->nodes.count; ++n) { | |
const ufbx_node *node = scene->nodes.data[n]; | |
if (!memcmp(node->name.data, "root", 4) || node->bone != 0){ | |
joint_nodes[joint_cnt++] = node; | |
} | |
} | |
if (!joint_cnt){ | |
die("no joint nodes found in fbx file!\n"); | |
} | |
if (joint_cnt > MAX_JOINTS) { | |
die("to many joints: %d limit is %d\n", joint_cnt, MAX_JOINTS) | |
} | |
int total_cnt = joint_cnt < MAX_JOINTS ? joint_cnt : MAX_JOINTS; | |
for (int j = 0; j < total_cnt; j++) { | |
ufbx_node *node = (j < joint_cnt) ? joint_nodes[j] : 0; | |
for (int f = frame_start; f < frame_end; f++) { | |
double time = anim->time_begin + f * (1.0 / fps); | |
ufbx_transform transform = node ? ufbx_evaluate_transform(anim, node, time) : ufbx_identity_transform; | |
int idx = j + f * MAX_JOINTS; | |
data.pos_x[idx] = (short)(transform.translation.x * POS_SCALE); | |
data.pos_y[idx] = (short)(transform.translation.y * POS_SCALE); | |
data.pos_z[idx] = (short)(transform.translation.z * POS_SCALE); | |
data.scl[idx] = (short)(transform.scale.x * SCL_SCALE); | |
float rx = transform.rotation.x; | |
float ry = transform.rotation.y; | |
float rz = transform.rotation.z; | |
float rw = transform.rotation.w; | |
data.rot_x[idx] = (short)(rx * ROT_SCALE); | |
data.rot_y[idx] = (short)(ry * ROT_SCALE); | |
data.rot_z[idx] = (short)(rz * ROT_SCALE); | |
data.rot_w[idx] = (short)(rw * ROT_SCALE); | |
} | |
} | |
{ | |
FILE *file = fopen(filename, "wb"); | |
size_t frame_size = MAX_JOINTS * data->frame_cnt; | |
fwrite(&data->frame_cnt, sizeof(int), 1, file); | |
fwrite(data->pos_x, sizeof(short), frame_size, file); | |
fwrite(data->pos_y, sizeof(short), frame_size, file); | |
fwrite(data->pos_z, sizeof(short), frame_size, file); | |
fwrite(data->scl, sizeof(short), frame_size, file); | |
fwrite(data->rot_x, sizeof(short), frame_size, file); | |
fwrite(data->rot_y, sizeof(short), frame_size, file); | |
fwrite(data->rot_z, sizeof(short), frame_size, file); | |
fwrite(data->rot_w, sizeof(short), frame_size, file); | |
fclose(file); | |
} | |
ufbx_free_scene(scene); | |
return 0; | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment