Skip to content

Instantly share code, notes, and snippets.

@AliElSaleh
Last active March 14, 2023 21:15
Show Gist options
  • Save AliElSaleh/ea8997ba669286efc51f4386d7be78ec to your computer and use it in GitHub Desktop.
Save AliElSaleh/ea8997ba669286efc51f4386d7be78ec to your computer and use it in GitHub Desktop.
Haversine Distance Problem (Loading and Parsing JSON, Performing SIMD/Non-SIMD Math) **Code will not compile** this is using my own code base/engine. Also uses a SIMD library called "simde", which provides extra functionality like trigonometric functions. You will need an AVX2 capable CPU to run the SIMD code
#include "simde/simde-math.h"
#include "simde/x86/avx2.h"
#include "simde/x86/svml.h"
#include <Windows.h>
STRUCT(HaversineParseData)
{
char* Base; // 8
u64 BytesToProcess; // 8
f64 Sum; // 8
u64 Count; // 8
u8 Pad[32];
}
__attribute__((aligned(64)));
#define NUM_HAVERSINE_THREADS 64
#define sqr(a) ((a) * (a))
internal DWORD WINAPI Internal_Main_ComputeHaversineProcessThread_Lean(LPVOID lpParameter)
{
HaversineParseData* h = (HaversineParseData*)lpParameter;
u64 count = 0;
f64 sum = 0.0f;
f64 x0 = 0.0f, y0 = 0.0f, x1 = 0.0f, y1 = 0.0f;
f64* Data[4] = {&x0, &y0, &x1, &y1};
u8 Index = 0;
char* p = h->Base;
// Parse data
while ((p-h->Base) < h->BytesToProcess)
{
if (UNLIKELY(*p == ':'))
{
p++;
// find x0, y0, x1, y1 strings-float pair, covert to float and store in the above Data array
u8 j = 0;
char Char = *p;
char FloatBuffer[16] = {0};
while (Char > ',' && Char < ':' && j < 15)
{
FloatBuffer[j] = Char;
++j;
++p;
Char = *p;
if (Char == '.')
{
FloatBuffer[j] = *p;
++j;
++p;
FloatBuffer[j] = *p;
++j;
++p;
FloatBuffer[j] = *p;
++j;
++p;
FloatBuffer[j] = *p;
++j;
++p;
FloatBuffer[j] = *p;
++j;
++p;
FloatBuffer[j] = *p;
++j;
++p;
FloatBuffer[j] = *p;
++j;
++p;
break;
}
}
*Data[Index] = strtof(FloatBuffer, NULL);
Index++;
if (Index == 4)
{
Index = 0;
//p += 11; // skip a bunch of chars
f64 dY = (y1 - y0) * DEG2RAD * 0.5;
f64 dX = (x1 - x0) * DEG2RAD * 0.5;
y0 *= DEG2RAD;
y1 *= DEG2RAD;
f64 dxSine = Sin(dX);
f64 dySine = Sin(dY);
f64 root = (sqr(dySine)) + Cos(y0) * Cos(y1) * sqr(dxSine);
sum += 12742.0 * ASin(Sqrt(root)); // 12742 = 6371 = earth km radius * 2.0f
count++;
}
}
p++;
}
h->Count = count;
h->Sum = sum;
return 0;
}
internal DWORD WINAPI Internal_Main_ComputeHaversineProcessThread_SIMD(LPVOID lpParameter)
{
HaversineParseData* h = (HaversineParseData*)lpParameter;
char* p = h->Base;
u64 count = 0;
f64 sum = 0.0;
f64* FloatArray_X0 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test);
f64* FloatArray_Y0 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test);
f64* FloatArray_X1 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test);
f64* FloatArray_Y1 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test);
f64* Data[4] = {FloatArray_X0, FloatArray_Y0, FloatArray_X1, FloatArray_Y1};
u8 Index = 0;
// Parse data
while ((p-h->Base) < h->BytesToProcess)
{
if (UNLIKELY(*p == ':'))
{
p++;
// find x0, y0, x1, y1 strings-float pair, covert to float and store in the above Data array
u8 j = 0;
char Char = *p;
char FloatBuffer[16] = {0};
while (Char > ',' && Char < ':' && j < 15)
{
FloatBuffer[j] = Char;
++j;
++p;
Char = *p;
}
u8 Offset = Index%4;
*Data[Offset] = strtof(FloatBuffer, NULL);
Data[Offset]++;
Index++;
if (Index == 4)
{
Index = 0;
count++;
}
}
p++;
}
// Perform math
simde__m256d Sum = _mm256_set1_pd(0.0);
for (u64 i = 0; i < count; i+=4)
{
simde__m256d x0 = simde_mm256_loadu_pd(FloatArray_X0);
simde__m256d x1 = simde_mm256_loadu_pd(FloatArray_X1);
simde__m256d y0 = simde_mm256_loadu_pd(FloatArray_Y0);
simde__m256d y1 = simde_mm256_loadu_pd(FloatArray_Y1);
simde__m256d dY = simde_mm256_mul_pd(simde_mm256_sub_pd(y1, y0), simde_mm256_mul_pd(simde_mm256_set1_pd(DEG2RAD), simde_mm256_set1_pd(0.5)));
simde__m256d dX = simde_mm256_mul_pd(simde_mm256_sub_pd(x1, x0), simde_mm256_mul_pd(simde_mm256_set1_pd(DEG2RAD), simde_mm256_set1_pd(0.5)));
y0 = simde_mm256_mul_pd(y0, _mm256_set1_pd(DEG2RAD));
y1 = simde_mm256_mul_pd(y1, _mm256_set1_pd(DEG2RAD));
simde__m256d dxSine = simde_mm256_sin_pd(dX);
simde__m256d dySine = simde_mm256_sin_pd(dY);
simde__m256d root = simde_mm256_add_pd(simde_mm256_mul_pd(dySine, dySine), simde_mm256_mul_pd(_mm256_mul_pd(simde_mm256_cos_pd(y0), simde_mm256_cos_pd(y1)), simde_mm256_mul_pd(dxSine, dxSine)));
Sum = simde_mm256_add_pd(Sum, simde_mm256_mul_pd(simde_mm256_set1_pd(12742.0f), simde_mm256_asin_pd(simde_mm256_sqrt_pd(root)))); // 12742 = 6371 = earth km radius * 2.0f
FloatArray_X0 += 4;
FloatArray_Y0 += 4;
FloatArray_X1 += 4;
FloatArray_Y1 += 4;
}
for (u8 i = 0; i < 8; ++i)
sum += Sum[i];
h->Count = count;
h->Sum = sum;
return 0;
}
// basically main.c
u8 Math_Compute10MillionHaversineDistances_Scalar()
{
u64 count = 0;
f64 sum = 0.0f;
f64 x0 = 0.0f, y0 = 0.0f, x1 = 0.0f, y1 = 0.0f;
u8 Index = 0;
f64* Data[4] = {&x0, &y0, &x1, &y1};
f64 TotalParseTime = 0.0;
f64 TotalMathTime = 0.0;
Clock c;
Clock_Start(&c);
HANDLE h = CreateFile("data.json", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
HANDLE fm = CreateFileMapping(h, NULL, PAGE_READONLY, 0, 0, "data.json");
LARGE_INTEGER FileSize;
GetFileSizeEx(h, &FileSize);
char* JsonData = MapViewOfFile(fm, FILE_MAP_READ, 0, 0, FileSize.QuadPart);
Clock_Tick(&c);
f64 FileOpenTime = c.ElapsedTime;
char* p = JsonData;
while (*p++)
{
if (*p == '[')
{
break;
}
}
Clock_Start(&c);
while (*p++)
{
// Parse data
if (UNLIKELY(*p == ':'))
{
p++;
Clock ParseClock;
Clock_Start(&ParseClock);
// find x0, y0, x1, y1 strings-float pair, covert to float and store in the above Data array
u8 j = 0;
char Char = *p;
char FloatBuffer[16] = {0};
while (Char > ',' && Char < ':' && j < 15)
{
FloatBuffer[j] = Char;
++j;
++p;
Char = *p;
}
*Data[Index%4] = strtof(FloatBuffer, NULL);
Index++;
Clock_Tick(&ParseClock);
TotalParseTime += ParseClock.ElapsedTime;
}
// Perform math
if (UNLIKELY(*p == '}'))
{
//p+=11; // skip a bunch of chars
//LOG_INFO("%llu | x0: %f y0: %f x1: %f y1: %f", count, x0, y0, x1, y1);
Clock MathClock;
Clock_Start(&MathClock);
f64 dY = (y1 - y0) * DEG2RAD * 0.5;
f64 dX = (x1 - x0) * DEG2RAD * 0.5;
y0 *= DEG2RAD;
y1 *= DEG2RAD;
f64 dxSine = Sin(dX);
f64 dySine = Sin(dY);
f64 root = (sqr(dySine)) + Cos(y0) * Cos(y1) * sqr(dxSine);
sum += 12742.0 * ASin(Sqrt(root)); // 12742 = 6371 = earth km radius * 2.0f
Clock_Tick(&MathClock);
TotalMathTime += MathClock.ElapsedTime;
count++;
}
if (UNLIKELY(*p == ']'))
{
break;
}
}
Clock_Tick(&c);
//Filesystem_Close(&f);
f64 avg = sum / (f64)count;
LOG_INFO("Sum of %d records: %f", count, sum);
LOG_INFO("Avg of %d records: %f", count, avg);
LOG_INFO("File open time = %f seconds", FileOpenTime);
LOG_INFO("Data parse time = %f seconds", TotalParseTime);
LOG_INFO("Math Time = %f seconds", TotalMathTime);
LOG_INFO("Total = %.2f seconds", c.ElapsedTime);
LOG_INFO("Throughput = %llu Haversine/second", (u64)(count/c.ElapsedTime));
return true;
}
// basically main.c
u8 Math_Compute10MillionHaversineDistances_MultiThreaded()
{
Clock c;
Clock_Start(&c);
HANDLE h = CreateFile("data.json", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
HANDLE fm = CreateFileMapping(h, NULL, PAGE_READONLY, 0, 0, "data.json");
LARGE_INTEGER FileSize;
GetFileSizeEx(h, &FileSize);
char* JsonData = MapViewOfFile(fm, FILE_MAP_READ, 0, 0, FileSize.QuadPart);
Clock_Tick(&c);
f64 FileOpenTime = c.ElapsedTime;
char* p = JsonData;
while (*p++)
{
if (*p == '[')
{
break;
}
}
const u8 NumThreads = NUM_HAVERSINE_THREADS;
HANDLE Threads[NUM_HAVERSINE_THREADS] = {0};
HaversineParseData ParseData[NUM_HAVERSINE_THREADS] = {0};
u64 TotalBytes = FileSize.QuadPart;
for (u8 i = 0; i < NumThreads; ++i)
{
u64 BytesToProcess = (TotalBytes/NumThreads);
if (i == NumThreads-1)
{
BytesToProcess = FileSize.QuadPart - (p-JsonData);
}
else
{
char* p2 = p;
p2 += BytesToProcess;
while (*p2)
{
if (*p2 == '}')
{
BytesToProcess++;
break;
}
BytesToProcess++;
p2++;
}
}
ParseData[i].Base = p;
ParseData[i].BytesToProcess = BytesToProcess;
Threads[i] = CreateThread(NULL, 0, Internal_Main_ComputeHaversineProcessThread_Lean, &ParseData[i], CREATE_SUSPENDED, NULL);
SetThreadPriority(Threads[i], THREAD_PRIORITY_HIGHEST);
ResumeThread(Threads[i]);
p += BytesToProcess;
}
Clock_Start(&c);
WaitForMultipleObjects(NumThreads, Threads, true, INFINITE);
Clock_Tick(&c);
u64 Count = 0;
f64 Sum = 0.0f;
for (u8 i = 0; i < NumThreads; ++i)
{
Count += ParseData[i].Count;
Sum += ParseData[i].Sum;
CloseHandle(Threads[i]);
}
//Filesystem_Close(&f);
f64 avg = Sum / (f64)Count;
LOG_INFO("Sum of %d records: %f", Count, Sum);
LOG_INFO("Avg of %d records: %f", Count, avg);
LOG_INFO("File open time = %f seconds", FileOpenTime);
//LOG_INFO("Data parse time = %f seconds", GTotalParseTime);
//LOG_INFO("Math Time = %f seconds", GTotalMathTime);
LOG_INFO("Total = %.2f seconds", c.ElapsedTime);
LOG_INFO("Throughput = %llu Haversine/second", (u64)(Count/c.ElapsedTime));
return true;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment