Last active
March 14, 2023 21:15
-
-
Save AliElSaleh/ea8997ba669286efc51f4386d7be78ec to your computer and use it in GitHub Desktop.
Haversine Distance Problem (Loading and Parsing JSON, Performing SIMD/Non-SIMD Math) **Code will not compile** this is using my own code base/engine. Also uses a SIMD library called "simde", which provides extra functionality like trigonometric functions. You will need an AVX2 capable CPU to run the SIMD code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "simde/simde-math.h" | |
#include "simde/x86/avx2.h" | |
#include "simde/x86/svml.h" | |
#include <Windows.h> | |
STRUCT(HaversineParseData) | |
{ | |
char* Base; // 8 | |
u64 BytesToProcess; // 8 | |
f64 Sum; // 8 | |
u64 Count; // 8 | |
u8 Pad[32]; | |
} | |
__attribute__((aligned(64))); | |
#define NUM_HAVERSINE_THREADS 64 | |
#define sqr(a) ((a) * (a)) | |
internal DWORD WINAPI Internal_Main_ComputeHaversineProcessThread_Lean(LPVOID lpParameter) | |
{ | |
HaversineParseData* h = (HaversineParseData*)lpParameter; | |
u64 count = 0; | |
f64 sum = 0.0f; | |
f64 x0 = 0.0f, y0 = 0.0f, x1 = 0.0f, y1 = 0.0f; | |
f64* Data[4] = {&x0, &y0, &x1, &y1}; | |
u8 Index = 0; | |
char* p = h->Base; | |
// Parse data | |
while ((p-h->Base) < h->BytesToProcess) | |
{ | |
if (UNLIKELY(*p == ':')) | |
{ | |
p++; | |
// find x0, y0, x1, y1 strings-float pair, covert to float and store in the above Data array | |
u8 j = 0; | |
char Char = *p; | |
char FloatBuffer[16] = {0}; | |
while (Char > ',' && Char < ':' && j < 15) | |
{ | |
FloatBuffer[j] = Char; | |
++j; | |
++p; | |
Char = *p; | |
if (Char == '.') | |
{ | |
FloatBuffer[j] = *p; | |
++j; | |
++p; | |
FloatBuffer[j] = *p; | |
++j; | |
++p; | |
FloatBuffer[j] = *p; | |
++j; | |
++p; | |
FloatBuffer[j] = *p; | |
++j; | |
++p; | |
FloatBuffer[j] = *p; | |
++j; | |
++p; | |
FloatBuffer[j] = *p; | |
++j; | |
++p; | |
FloatBuffer[j] = *p; | |
++j; | |
++p; | |
break; | |
} | |
} | |
*Data[Index] = strtof(FloatBuffer, NULL); | |
Index++; | |
if (Index == 4) | |
{ | |
Index = 0; | |
//p += 11; // skip a bunch of chars | |
f64 dY = (y1 - y0) * DEG2RAD * 0.5; | |
f64 dX = (x1 - x0) * DEG2RAD * 0.5; | |
y0 *= DEG2RAD; | |
y1 *= DEG2RAD; | |
f64 dxSine = Sin(dX); | |
f64 dySine = Sin(dY); | |
f64 root = (sqr(dySine)) + Cos(y0) * Cos(y1) * sqr(dxSine); | |
sum += 12742.0 * ASin(Sqrt(root)); // 12742 = 6371 = earth km radius * 2.0f | |
count++; | |
} | |
} | |
p++; | |
} | |
h->Count = count; | |
h->Sum = sum; | |
return 0; | |
} | |
internal DWORD WINAPI Internal_Main_ComputeHaversineProcessThread_SIMD(LPVOID lpParameter) | |
{ | |
HaversineParseData* h = (HaversineParseData*)lpParameter; | |
char* p = h->Base; | |
u64 count = 0; | |
f64 sum = 0.0; | |
f64* FloatArray_X0 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test); | |
f64* FloatArray_Y0 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test); | |
f64* FloatArray_X1 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test); | |
f64* FloatArray_Y1 = MemAlloc(h->BytesToProcess/4, MemoryTag_Test); | |
f64* Data[4] = {FloatArray_X0, FloatArray_Y0, FloatArray_X1, FloatArray_Y1}; | |
u8 Index = 0; | |
// Parse data | |
while ((p-h->Base) < h->BytesToProcess) | |
{ | |
if (UNLIKELY(*p == ':')) | |
{ | |
p++; | |
// find x0, y0, x1, y1 strings-float pair, covert to float and store in the above Data array | |
u8 j = 0; | |
char Char = *p; | |
char FloatBuffer[16] = {0}; | |
while (Char > ',' && Char < ':' && j < 15) | |
{ | |
FloatBuffer[j] = Char; | |
++j; | |
++p; | |
Char = *p; | |
} | |
u8 Offset = Index%4; | |
*Data[Offset] = strtof(FloatBuffer, NULL); | |
Data[Offset]++; | |
Index++; | |
if (Index == 4) | |
{ | |
Index = 0; | |
count++; | |
} | |
} | |
p++; | |
} | |
// Perform math | |
simde__m256d Sum = _mm256_set1_pd(0.0); | |
for (u64 i = 0; i < count; i+=4) | |
{ | |
simde__m256d x0 = simde_mm256_loadu_pd(FloatArray_X0); | |
simde__m256d x1 = simde_mm256_loadu_pd(FloatArray_X1); | |
simde__m256d y0 = simde_mm256_loadu_pd(FloatArray_Y0); | |
simde__m256d y1 = simde_mm256_loadu_pd(FloatArray_Y1); | |
simde__m256d dY = simde_mm256_mul_pd(simde_mm256_sub_pd(y1, y0), simde_mm256_mul_pd(simde_mm256_set1_pd(DEG2RAD), simde_mm256_set1_pd(0.5))); | |
simde__m256d dX = simde_mm256_mul_pd(simde_mm256_sub_pd(x1, x0), simde_mm256_mul_pd(simde_mm256_set1_pd(DEG2RAD), simde_mm256_set1_pd(0.5))); | |
y0 = simde_mm256_mul_pd(y0, _mm256_set1_pd(DEG2RAD)); | |
y1 = simde_mm256_mul_pd(y1, _mm256_set1_pd(DEG2RAD)); | |
simde__m256d dxSine = simde_mm256_sin_pd(dX); | |
simde__m256d dySine = simde_mm256_sin_pd(dY); | |
simde__m256d root = simde_mm256_add_pd(simde_mm256_mul_pd(dySine, dySine), simde_mm256_mul_pd(_mm256_mul_pd(simde_mm256_cos_pd(y0), simde_mm256_cos_pd(y1)), simde_mm256_mul_pd(dxSine, dxSine))); | |
Sum = simde_mm256_add_pd(Sum, simde_mm256_mul_pd(simde_mm256_set1_pd(12742.0f), simde_mm256_asin_pd(simde_mm256_sqrt_pd(root)))); // 12742 = 6371 = earth km radius * 2.0f | |
FloatArray_X0 += 4; | |
FloatArray_Y0 += 4; | |
FloatArray_X1 += 4; | |
FloatArray_Y1 += 4; | |
} | |
for (u8 i = 0; i < 8; ++i) | |
sum += Sum[i]; | |
h->Count = count; | |
h->Sum = sum; | |
return 0; | |
} | |
// basically main.c | |
u8 Math_Compute10MillionHaversineDistances_Scalar() | |
{ | |
u64 count = 0; | |
f64 sum = 0.0f; | |
f64 x0 = 0.0f, y0 = 0.0f, x1 = 0.0f, y1 = 0.0f; | |
u8 Index = 0; | |
f64* Data[4] = {&x0, &y0, &x1, &y1}; | |
f64 TotalParseTime = 0.0; | |
f64 TotalMathTime = 0.0; | |
Clock c; | |
Clock_Start(&c); | |
HANDLE h = CreateFile("data.json", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); | |
HANDLE fm = CreateFileMapping(h, NULL, PAGE_READONLY, 0, 0, "data.json"); | |
LARGE_INTEGER FileSize; | |
GetFileSizeEx(h, &FileSize); | |
char* JsonData = MapViewOfFile(fm, FILE_MAP_READ, 0, 0, FileSize.QuadPart); | |
Clock_Tick(&c); | |
f64 FileOpenTime = c.ElapsedTime; | |
char* p = JsonData; | |
while (*p++) | |
{ | |
if (*p == '[') | |
{ | |
break; | |
} | |
} | |
Clock_Start(&c); | |
while (*p++) | |
{ | |
// Parse data | |
if (UNLIKELY(*p == ':')) | |
{ | |
p++; | |
Clock ParseClock; | |
Clock_Start(&ParseClock); | |
// find x0, y0, x1, y1 strings-float pair, covert to float and store in the above Data array | |
u8 j = 0; | |
char Char = *p; | |
char FloatBuffer[16] = {0}; | |
while (Char > ',' && Char < ':' && j < 15) | |
{ | |
FloatBuffer[j] = Char; | |
++j; | |
++p; | |
Char = *p; | |
} | |
*Data[Index%4] = strtof(FloatBuffer, NULL); | |
Index++; | |
Clock_Tick(&ParseClock); | |
TotalParseTime += ParseClock.ElapsedTime; | |
} | |
// Perform math | |
if (UNLIKELY(*p == '}')) | |
{ | |
//p+=11; // skip a bunch of chars | |
//LOG_INFO("%llu | x0: %f y0: %f x1: %f y1: %f", count, x0, y0, x1, y1); | |
Clock MathClock; | |
Clock_Start(&MathClock); | |
f64 dY = (y1 - y0) * DEG2RAD * 0.5; | |
f64 dX = (x1 - x0) * DEG2RAD * 0.5; | |
y0 *= DEG2RAD; | |
y1 *= DEG2RAD; | |
f64 dxSine = Sin(dX); | |
f64 dySine = Sin(dY); | |
f64 root = (sqr(dySine)) + Cos(y0) * Cos(y1) * sqr(dxSine); | |
sum += 12742.0 * ASin(Sqrt(root)); // 12742 = 6371 = earth km radius * 2.0f | |
Clock_Tick(&MathClock); | |
TotalMathTime += MathClock.ElapsedTime; | |
count++; | |
} | |
if (UNLIKELY(*p == ']')) | |
{ | |
break; | |
} | |
} | |
Clock_Tick(&c); | |
//Filesystem_Close(&f); | |
f64 avg = sum / (f64)count; | |
LOG_INFO("Sum of %d records: %f", count, sum); | |
LOG_INFO("Avg of %d records: %f", count, avg); | |
LOG_INFO("File open time = %f seconds", FileOpenTime); | |
LOG_INFO("Data parse time = %f seconds", TotalParseTime); | |
LOG_INFO("Math Time = %f seconds", TotalMathTime); | |
LOG_INFO("Total = %.2f seconds", c.ElapsedTime); | |
LOG_INFO("Throughput = %llu Haversine/second", (u64)(count/c.ElapsedTime)); | |
return true; | |
} | |
// basically main.c | |
u8 Math_Compute10MillionHaversineDistances_MultiThreaded() | |
{ | |
Clock c; | |
Clock_Start(&c); | |
HANDLE h = CreateFile("data.json", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); | |
HANDLE fm = CreateFileMapping(h, NULL, PAGE_READONLY, 0, 0, "data.json"); | |
LARGE_INTEGER FileSize; | |
GetFileSizeEx(h, &FileSize); | |
char* JsonData = MapViewOfFile(fm, FILE_MAP_READ, 0, 0, FileSize.QuadPart); | |
Clock_Tick(&c); | |
f64 FileOpenTime = c.ElapsedTime; | |
char* p = JsonData; | |
while (*p++) | |
{ | |
if (*p == '[') | |
{ | |
break; | |
} | |
} | |
const u8 NumThreads = NUM_HAVERSINE_THREADS; | |
HANDLE Threads[NUM_HAVERSINE_THREADS] = {0}; | |
HaversineParseData ParseData[NUM_HAVERSINE_THREADS] = {0}; | |
u64 TotalBytes = FileSize.QuadPart; | |
for (u8 i = 0; i < NumThreads; ++i) | |
{ | |
u64 BytesToProcess = (TotalBytes/NumThreads); | |
if (i == NumThreads-1) | |
{ | |
BytesToProcess = FileSize.QuadPart - (p-JsonData); | |
} | |
else | |
{ | |
char* p2 = p; | |
p2 += BytesToProcess; | |
while (*p2) | |
{ | |
if (*p2 == '}') | |
{ | |
BytesToProcess++; | |
break; | |
} | |
BytesToProcess++; | |
p2++; | |
} | |
} | |
ParseData[i].Base = p; | |
ParseData[i].BytesToProcess = BytesToProcess; | |
Threads[i] = CreateThread(NULL, 0, Internal_Main_ComputeHaversineProcessThread_Lean, &ParseData[i], CREATE_SUSPENDED, NULL); | |
SetThreadPriority(Threads[i], THREAD_PRIORITY_HIGHEST); | |
ResumeThread(Threads[i]); | |
p += BytesToProcess; | |
} | |
Clock_Start(&c); | |
WaitForMultipleObjects(NumThreads, Threads, true, INFINITE); | |
Clock_Tick(&c); | |
u64 Count = 0; | |
f64 Sum = 0.0f; | |
for (u8 i = 0; i < NumThreads; ++i) | |
{ | |
Count += ParseData[i].Count; | |
Sum += ParseData[i].Sum; | |
CloseHandle(Threads[i]); | |
} | |
//Filesystem_Close(&f); | |
f64 avg = Sum / (f64)Count; | |
LOG_INFO("Sum of %d records: %f", Count, Sum); | |
LOG_INFO("Avg of %d records: %f", Count, avg); | |
LOG_INFO("File open time = %f seconds", FileOpenTime); | |
//LOG_INFO("Data parse time = %f seconds", GTotalParseTime); | |
//LOG_INFO("Math Time = %f seconds", GTotalMathTime); | |
LOG_INFO("Total = %.2f seconds", c.ElapsedTime); | |
LOG_INFO("Throughput = %llu Haversine/second", (u64)(Count/c.ElapsedTime)); | |
return true; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment