Skip to content

Instantly share code, notes, and snippets.

@hfiref0x
Created June 11, 2025 02:14
Show Gist options
  • Save hfiref0x/5d6be98a3ead6c5bd8d9fb0335138b49 to your computer and use it in GitHub Desktop.
Save hfiref0x/5d6be98a3ead6c5bd8d9fb0335138b49 to your computer and use it in GitHub Desktop.
Checksum calculation benchmark (Scalar/SSE2/AVX2)
// MSVC compiler: /O2 /arch:AVX2
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <emmintrin.h>
#include <immintrin.h>
// Default size of the benchmark buffer in megabytes, used when no
// filesize_MB argument is given. The #ifndef guard lets the build override it.
#ifndef DEFAULT_FILESIZE_MB
#define DEFAULT_FILESIZE_MB 3
#endif
// Default number of timed repetitions per checksum variant, used when no
// repetitions argument is given. The #ifndef guard lets the build override it.
#ifndef DEFAULT_REPS
#define DEFAULT_REPS 100
#endif
// Alignment in bytes requested from _aligned_malloc for the benchmark buffer.
#define ALIGNMENT 32 // 32 bytes = width of one 256-bit AVX2 register
// Scalar PE checksum.
//
// Sums the buffer as little 16-bit words with end-around carry (the running
// sum is folded back into 16 bits after every addition), adds an odd trailing
// byte if there is one, removes the contribution of the two 16-bit words at
// opt_hdr_chksum (the halves of the header's stored checksum field), and
// finally adds the file length.
//
//   base_address    start of the buffer to checksum
//   file_length     buffer length in bytes
//   opt_hdr_chksum  pointer to two USHORT values to subtract from the sum
//
// Returns the folded 16-bit sum plus file_length.
DWORD checksum_scalar(const void* base_address, ULONG file_length, const USHORT* opt_hdr_chksum) {
    const USHORT* source = (const USHORT*)base_address;
    const ULONG word_count = file_length / 2;
    ULONG partial_sum = 0;

    for (ULONG i = 0; i < word_count; ++i) {
        partial_sum += source[i];
        // End-around carry: keeps partial_sum <= 0xFFFF after every word.
        partial_sum = (partial_sum & 0xFFFF) + (partial_sum >> 16);
    }

    // A file with an odd length contributes its last byte as one more word.
    if (file_length & 1) {
        partial_sum += ((const uint8_t*)base_address)[file_length - 1];
        partial_sum = (partial_sum & 0xFFFF) + (partial_sum >> 16);
    }

    // Subtract each stored-checksum half with an end-around borrow. The
    // arithmetic must be modulo 2^16: without the mask an underflow wraps the
    // 32-bit accumulator to 0xFFFFxxxx and a single fold can no longer bring
    // it back into 16 bits. partial_sum is <= 0xFFFF on entry to each step,
    // so after masking no further fold is needed.
    partial_sum = (partial_sum - (partial_sum < opt_hdr_chksum[0]) - opt_hdr_chksum[0]) & 0xFFFF;
    partial_sum = (partial_sum - (partial_sum < opt_hdr_chksum[1]) - opt_hdr_chksum[1]) & 0xFFFF;

    return partial_sum + file_length;
}
// SSE2 PE checksum (with per-word folding).
//
// Loads the buffer 16 bytes at a time with an unaligned SSE2 load, then adds
// the eight 16-bit words one by one with end-around carry so the running sum
// goes through the same add-and-fold sequence as a plain word-by-word loop.
// Leftover words and an odd trailing byte are handled after the vector loop.
// The two 16-bit words at opt_hdr_chksum (the halves of the header's stored
// checksum field) are then subtracted and the file length is added.
//
//   base_address    start of the buffer to checksum (no alignment required)
//   file_length     buffer length in bytes
//   opt_hdr_chksum  pointer to two USHORT values to subtract from the sum
//
// Returns the folded 16-bit sum plus file_length.
DWORD checksum_sse2_pe_match(const void* base_address, ULONG file_length, const USHORT* opt_hdr_chksum) {
    const uint8_t* data8 = (const uint8_t*)base_address;
    const ULONG len = file_length;
    uint32_t sum = 0;
    ULONG i = 0;

    // Full 16-byte chunks. The bound is written as `len - i >= 16` because
    // `i + 16 <= len` wraps around when len is within 16 of the ULONG maximum
    // (i never exceeds len, so the subtraction cannot underflow).
    for (; len - i >= 16; i += 16) {
        __m128i v = _mm_loadu_si128((const __m128i*)(data8 + i));
        uint16_t tmp[8];
        _mm_storeu_si128((__m128i*)tmp, v);
        for (int k = 0; k < 8; ++k) {
            sum += tmp[k];
            // End-around carry: keeps sum <= 0xFFFF after every word.
            sum = (sum & 0xFFFF) + (sum >> 16);
        }
    }

    // Remaining whole 16-bit words (fewer than eight).
    for (; len - i >= 2; i += 2) {
        sum += *(const uint16_t*)(data8 + i);
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    // A file with an odd length contributes its last byte as one more word.
    if (len & 1) {
        sum += data8[len - 1];
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    // Subtract each stored-checksum half with an end-around borrow, modulo
    // 2^16. Without the mask an underflow wraps the 32-bit accumulator to
    // 0xFFFFxxxx and a single fold cannot bring it back into 16 bits.
    sum = (sum - (sum < opt_hdr_chksum[0]) - opt_hdr_chksum[0]) & 0xFFFF;
    sum = (sum - (sum < opt_hdr_chksum[1]) - opt_hdr_chksum[1]) & 0xFFFF;

    return (ULONG)sum + file_length;
}
// AVX2 PE checksum (with per-word folding).
//
// Loads the buffer 32 bytes at a time with an unaligned AVX2 load, then adds
// the sixteen 16-bit words one by one with end-around carry so the running
// sum goes through the same add-and-fold sequence as a plain word-by-word
// loop. Leftover words and an odd trailing byte are handled after the vector
// loop. The two 16-bit words at opt_hdr_chksum (the halves of the header's
// stored checksum field) are then subtracted and the file length is added.
//
//   base_address    start of the buffer to checksum (no alignment required)
//   file_length     buffer length in bytes
//   opt_hdr_chksum  pointer to two USHORT values to subtract from the sum
//
// Returns the folded 16-bit sum plus file_length.
DWORD checksum_avx2_pe_match(const void* base_address, ULONG file_length, const USHORT* opt_hdr_chksum) {
    const uint8_t* data8 = (const uint8_t*)base_address;
    const ULONG len = file_length;
    uint32_t sum = 0;
    ULONG i = 0;

    // Full 32-byte chunks. The bound is written as `len - i >= 32` because
    // `i + 32 <= len` wraps around when len is within 32 of the ULONG maximum
    // (i never exceeds len, so the subtraction cannot underflow).
    for (; len - i >= 32; i += 32) {
        __m256i v = _mm256_loadu_si256((const __m256i*)(data8 + i));
        uint16_t tmp[16];
        _mm256_storeu_si256((__m256i*)tmp, v);
        for (int k = 0; k < 16; ++k) {
            sum += tmp[k];
            // End-around carry: keeps sum <= 0xFFFF after every word.
            sum = (sum & 0xFFFF) + (sum >> 16);
        }
    }

    // Remaining whole 16-bit words (fewer than sixteen).
    for (; len - i >= 2; i += 2) {
        sum += *(const uint16_t*)(data8 + i);
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    // A file with an odd length contributes its last byte as one more word.
    if (len & 1) {
        sum += data8[len - 1];
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    // Subtract each stored-checksum half with an end-around borrow, modulo
    // 2^16. Without the mask an underflow wraps the 32-bit accumulator to
    // 0xFFFFxxxx and a single fold cannot bring it back into 16 bits.
    sum = (sum - (sum < opt_hdr_chksum[0]) - opt_hdr_chksum[0]) & 0xFFFF;
    sum = (sum - (sum < opt_hdr_chksum[1]) - opt_hdr_chksum[1]) & 0xFFFF;

    return (ULONG)sum + file_length;
}
// Converts a pair of performance-counter readings into elapsed seconds:
// (end - start) divided by the counter frequency, computed in double precision.
double seconds(LARGE_INTEGER start, LARGE_INTEGER end, LARGE_INTEGER freq) {
    const double elapsed_ticks = (double)(end.QuadPart - start.QuadPart);
    const double ticks_per_second = (double)freq.QuadPart;
    return elapsed_ticks / ticks_per_second;
}
// Fills the first `sz` bytes of `buf` with pseudo-random data from rand().
//
// Every byte is generated individually. Storing a whole rand() result into a
// 32-bit word would leave the upper bits of each word zero, because rand()
// never exceeds RAND_MAX (which may be as small as 32767). Writing bytes also
// removes any alignment requirement on `buf`.
//
//   buf  destination buffer; must have room for at least `sz` bytes
//   sz   number of bytes to fill (0 is allowed and writes nothing)
void fill_random(void* buf, size_t sz) {
    uint8_t* bytes = (uint8_t*)buf;
    for (size_t i = 0; i < sz; ++i) {
        // RAND_MAX is at least 32767, so bits 7..14 are always populated;
        // the higher bits are preferred over the lowest ones.
        bytes[i] = (uint8_t)((rand() >> 7) & 0xFF);
    }
}
// Prints the command-line synopsis to stdout: the program name followed by
// the two optional arguments and their compiled-in default values.
void usage(const char* prog) {
    const int default_mb = DEFAULT_FILESIZE_MB;
    const int default_reps = DEFAULT_REPS;

    printf("Usage: %s [filesize_MB] [repetitions]\n", prog);
    printf(" filesize_MB: Size of buffer in megabytes (default: %d)\n", default_mb);
    printf(" repetitions: Number of repetitions (default: %d)\n", default_reps);
}
int main(int argc, char* argv[]) {
int filesize_MB = DEFAULT_FILESIZE_MB;
int reps = DEFAULT_REPS;
if (argc > 1) {
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
usage(argv[0]);
return 0;
}
filesize_MB = atoi(argv[1]);
if (filesize_MB <= 0) {
printf("Invalid filesize_MB.\n");
usage(argv[0]);
return 1;
}
}
if (argc > 2) {
reps = atoi(argv[2]);
if (reps <= 0) {
printf("Invalid repetitions.\n");
usage(argv[0]);
return 1;
}
}
const size_t FILESIZE = (size_t)filesize_MB * 1024 * 1024;
void* buffer = _aligned_malloc(FILESIZE, ALIGNMENT);
if (!buffer) {
printf("Allocation failed\n");
return 1;
}
fill_random(buffer, FILESIZE);
USHORT checksum_field[2] = { 0,0 }; // Simulate as zero
LARGE_INTEGER freq, t0, t1;
QueryPerformanceFrequency(&freq);
DWORD result_scalar = 0, result_sse2 = 0, result_avx2 = 0;
// Warm-up
checksum_scalar(buffer, (ULONG)FILESIZE, checksum_field);
checksum_sse2_pe_match(buffer, (ULONG)FILESIZE, checksum_field);
checksum_avx2_pe_match(buffer, (ULONG)FILESIZE, checksum_field);
// Scalar benchmark
QueryPerformanceCounter(&t0);
for (int i = 0; i < reps; ++i)
result_scalar = checksum_scalar(buffer, (ULONG)FILESIZE, checksum_field);
QueryPerformanceCounter(&t1);
double time_scalar = seconds(t0, t1, freq);
// SSE2 benchmark
QueryPerformanceCounter(&t0);
for (int i = 0; i < reps; ++i)
result_sse2 = checksum_sse2_pe_match(buffer, (ULONG)FILESIZE, checksum_field);
QueryPerformanceCounter(&t1);
double time_sse2 = seconds(t0, t1, freq);
// AVX2 benchmark
QueryPerformanceCounter(&t0);
for (int i = 0; i < reps; ++i)
result_avx2 = checksum_avx2_pe_match(buffer, (ULONG)FILESIZE, checksum_field);
QueryPerformanceCounter(&t1);
double time_avx2 = seconds(t0, t1, freq);
printf("File size: %d MB, Repetitions: %d\n", filesize_MB, reps);
printf("Scalar checksum: %08X, Time: %.4f sec, %.2f MB/s\n", result_scalar, time_scalar, (filesize_MB * reps) / time_scalar);
printf("SSE2 checksum: %08X, Time: %.4f sec, %.2f MB/s\n", result_sse2, time_sse2, (filesize_MB * reps) / time_sse2);
printf("AVX2 checksum: %08X, Time: %.4f sec, %.2f MB/s\n", result_avx2, time_avx2, (filesize_MB * reps) / time_avx2);
if (result_scalar != result_sse2 || result_scalar != result_avx2)
printf("Checksums DO NOT MATCH!\n");
else
printf("Checksums match.\n");
_aligned_free(buffer);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment