Created
June 11, 2025 02:14
-
-
Save hfiref0x/5d6be98a3ead6c5bd8d9fb0335138b49 to your computer and use it in GitHub Desktop.
Checksum calculation benchmark (Scalar/SSE2/AVX2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build with the MSVC compiler using: /O2 /arch:AVX2
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <windows.h>

#include <emmintrin.h>
#include <immintrin.h>
/* Buffer size in megabytes used when none is given on the command line.
 * The #ifndef guard lets the build override it (e.g. /DDEFAULT_FILESIZE_MB=16). */
#ifndef DEFAULT_FILESIZE_MB
#define DEFAULT_FILESIZE_MB 3
#endif

/* Number of timed passes per implementation when none is given on the
 * command line. Overridable at build time in the same way. */
#ifndef DEFAULT_REPS
#define DEFAULT_REPS 100
#endif

/* Byte alignment requested for the test buffer; chosen for AVX2. */
#define ALIGNMENT 32
/*
 * Scalar PE-style checksum.
 *
 * Sums the buffer as host-order 16-bit words, folding the carry out of bit 15
 * back into the low 16 bits after every addition. A trailing odd byte, if any,
 * is added as its own word. The two 16-bit words at opt_hdr_chksum are then
 * subtracted (with borrow) from the folded sum and file_length is added.
 *
 * base_address    start of the data to sum
 * file_length     number of bytes at base_address
 * opt_hdr_chksum  two 16-bit words to remove from the sum (the value currently
 *                 stored in the image's checksum field)
 * returns         16-bit adjusted sum plus file_length
 */
DWORD checksum_scalar(const void* base_address, ULONG file_length, const USHORT* opt_hdr_chksum) {
    const USHORT* words = (const USHORT*)base_address;
    const ULONG word_count = file_length / 2;
    ULONG partial_sum = 0;

    for (ULONG i = 0; i < word_count; ++i) {
        partial_sum += words[i];
        partial_sum = (partial_sum & 0xFFFF) + (partial_sum >> 16);
    }

    if (file_length & 1) {
        partial_sum += ((const uint8_t*)base_address)[file_length - 1];
        partial_sum = (partial_sum & 0xFFFF) + (partial_sum >> 16);
    }

    /*
     * After folding, partial_sum fits in 16 bits. The subtraction has to stay
     * in 16 bits as well: done in 32 bits, a borrow wraps to 0xFFFFxxxx and a
     * later fold would add a spurious 0xFFFF into the result.
     */
    USHORT folded = (USHORT)partial_sum;
    folded = (USHORT)(folded - (folded < opt_hdr_chksum[0]));
    folded = (USHORT)(folded - opt_hdr_chksum[0]);
    folded = (USHORT)(folded - (folded < opt_hdr_chksum[1]));
    folded = (USHORT)(folded - opt_hdr_chksum[1]);

    return (DWORD)folded + file_length;
}
/*
 * SSE2 PE-style checksum (with per-word folding).
 *
 * Data is fetched 16 bytes at a time with an unaligned SSE2 load and spilled
 * to a small array; the eight 16-bit words are then accumulated one by one,
 * folding the carry back in after each addition, so the result matches the
 * scalar routine. Remaining whole words and a trailing odd byte are handled
 * the same way. Finally the two words at opt_hdr_chksum are subtracted (with
 * borrow) and file_length is added.
 *
 * base_address    start of the data to sum (no alignment required)
 * file_length     number of bytes at base_address
 * opt_hdr_chksum  two 16-bit words to remove from the sum
 * returns         16-bit adjusted sum plus file_length
 */
DWORD checksum_sse2_pe_match(const void* base_address, ULONG file_length, const USHORT* opt_hdr_chksum) {
    const uint8_t* data8 = (const uint8_t*)base_address;
    const ULONG len = file_length;
    uint32_t sum = 0;
    ULONG i = 0;

    /* Compare the remaining byte count instead of `i + 16 <= len`, which
     * wraps around when len is within 16 of the ULONG maximum. i never
     * exceeds len, so `len - i` cannot underflow. */
    for (; len - i >= 16; i += 16) {
        __m128i v = _mm_loadu_si128((const __m128i*)(data8 + i));
        uint16_t tmp[8];
        _mm_storeu_si128((__m128i*)tmp, v);
        for (int k = 0; k < 8; ++k) {
            sum += tmp[k];
            sum = (sum & 0xFFFF) + (sum >> 16);
        }
    }

    /* Leftover whole words. Assembled from bytes (low byte first, which is
     * the x86 word layout) so no aligned 16-bit access is needed. */
    for (; len - i >= 2; i += 2) {
        sum += (uint32_t)data8[i] | ((uint32_t)data8[i + 1] << 8);
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    if (len & 1) {
        sum += data8[len - 1];
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    /* sum now fits in 16 bits. Subtract in 16-bit arithmetic: in 32 bits a
     * borrow would wrap to 0xFFFFxxxx and corrupt the result. */
    uint16_t folded = (uint16_t)sum;
    folded = (uint16_t)(folded - (folded < opt_hdr_chksum[0]));
    folded = (uint16_t)(folded - opt_hdr_chksum[0]);
    folded = (uint16_t)(folded - (folded < opt_hdr_chksum[1]));
    folded = (uint16_t)(folded - opt_hdr_chksum[1]);

    return (DWORD)folded + file_length;
}
/*
 * AVX2 PE-style checksum (with per-word folding).
 *
 * Data is fetched 32 bytes at a time with an unaligned AVX2 load and spilled
 * to a small array; the sixteen 16-bit words are then accumulated one by one,
 * folding the carry back in after each addition, so the result matches the
 * scalar routine. Remaining whole words and a trailing odd byte are handled
 * the same way. Finally the two words at opt_hdr_chksum are subtracted (with
 * borrow) and file_length is added.
 *
 * base_address    start of the data to sum (no alignment required)
 * file_length     number of bytes at base_address
 * opt_hdr_chksum  two 16-bit words to remove from the sum
 * returns         16-bit adjusted sum plus file_length
 */
DWORD checksum_avx2_pe_match(const void* base_address, ULONG file_length, const USHORT* opt_hdr_chksum) {
    const uint8_t* data8 = (const uint8_t*)base_address;
    const ULONG len = file_length;
    uint32_t sum = 0;
    ULONG i = 0;

    /* Compare the remaining byte count instead of `i + 32 <= len`, which
     * wraps around when len is within 32 of the ULONG maximum. i never
     * exceeds len, so `len - i` cannot underflow. */
    for (; len - i >= 32; i += 32) {
        __m256i v = _mm256_loadu_si256((const __m256i*)(data8 + i));
        uint16_t tmp[16];
        _mm256_storeu_si256((__m256i*)tmp, v);
        for (int k = 0; k < 16; ++k) {
            sum += tmp[k];
            sum = (sum & 0xFFFF) + (sum >> 16);
        }
    }

    /* Leftover whole words. Assembled from bytes (low byte first, which is
     * the x86 word layout) so no aligned 16-bit access is needed. */
    for (; len - i >= 2; i += 2) {
        sum += (uint32_t)data8[i] | ((uint32_t)data8[i + 1] << 8);
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    if (len & 1) {
        sum += data8[len - 1];
        sum = (sum & 0xFFFF) + (sum >> 16);
    }

    /* sum now fits in 16 bits. Subtract in 16-bit arithmetic: in 32 bits a
     * borrow would wrap to 0xFFFFxxxx and corrupt the result. */
    uint16_t folded = (uint16_t)sum;
    folded = (uint16_t)(folded - (folded < opt_hdr_chksum[0]));
    folded = (uint16_t)(folded - opt_hdr_chksum[0]);
    folded = (uint16_t)(folded - (folded < opt_hdr_chksum[1]));
    folded = (uint16_t)(folded - opt_hdr_chksum[1]);

    return (DWORD)folded + file_length;
}
/*
 * Converts two performance-counter readings into elapsed seconds.
 *
 * start, end  counter values bracketing the measured region
 * freq        counter ticks per second
 * returns     (end - start) / freq as a double
 */
double seconds(LARGE_INTEGER start, LARGE_INTEGER end, LARGE_INTEGER freq) {
    /* Take the tick difference in integer arithmetic first so that large
     * absolute counter values do not lose precision in the conversion. */
    return (double)(end.QuadPart - start.QuadPart) / (double)freq.QuadPart;
}
/*
 * Fills sz bytes starting at buf with pseudo-random data from rand().
 *
 * The buffer is written one byte at a time. Storing a whole rand() result
 * into each 32-bit word would leave the upper bits permanently zero whenever
 * RAND_MAX is smaller than 2^32 - 1 (with RAND_MAX == 32767 every other
 * 16-bit word would be 0), and would also require buf to be 4-byte aligned.
 *
 * buf  destination; any alignment
 * sz   number of bytes to write; 0 writes nothing
 */
void fill_random(void* buf, size_t sz) {
    uint8_t* bytes = (uint8_t*)buf;

    for (size_t i = 0; i < sz; ++i) {
        /* RAND_MAX is guaranteed to be at least 32767, so bits 7..14 of the
         * result are always populated. */
        bytes[i] = (uint8_t)((rand() >> 7) & 0xFF);
    }
}
void usage(const char* prog) { | |
printf("Usage: %s [filesize_MB] [repetitions]\n", prog); | |
printf(" filesize_MB: Size of buffer in megabytes (default: %d)\n", DEFAULT_FILESIZE_MB); | |
printf(" repetitions: Number of repetitions (default: %d)\n", DEFAULT_REPS); | |
} | |
int main(int argc, char* argv[]) { | |
int filesize_MB = DEFAULT_FILESIZE_MB; | |
int reps = DEFAULT_REPS; | |
if (argc > 1) { | |
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { | |
usage(argv[0]); | |
return 0; | |
} | |
filesize_MB = atoi(argv[1]); | |
if (filesize_MB <= 0) { | |
printf("Invalid filesize_MB.\n"); | |
usage(argv[0]); | |
return 1; | |
} | |
} | |
if (argc > 2) { | |
reps = atoi(argv[2]); | |
if (reps <= 0) { | |
printf("Invalid repetitions.\n"); | |
usage(argv[0]); | |
return 1; | |
} | |
} | |
const size_t FILESIZE = (size_t)filesize_MB * 1024 * 1024; | |
void* buffer = _aligned_malloc(FILESIZE, ALIGNMENT); | |
if (!buffer) { | |
printf("Allocation failed\n"); | |
return 1; | |
} | |
fill_random(buffer, FILESIZE); | |
USHORT checksum_field[2] = { 0,0 }; // Simulate as zero | |
LARGE_INTEGER freq, t0, t1; | |
QueryPerformanceFrequency(&freq); | |
DWORD result_scalar = 0, result_sse2 = 0, result_avx2 = 0; | |
// Warm-up | |
checksum_scalar(buffer, (ULONG)FILESIZE, checksum_field); | |
checksum_sse2_pe_match(buffer, (ULONG)FILESIZE, checksum_field); | |
checksum_avx2_pe_match(buffer, (ULONG)FILESIZE, checksum_field); | |
// Scalar benchmark | |
QueryPerformanceCounter(&t0); | |
for (int i = 0; i < reps; ++i) | |
result_scalar = checksum_scalar(buffer, (ULONG)FILESIZE, checksum_field); | |
QueryPerformanceCounter(&t1); | |
double time_scalar = seconds(t0, t1, freq); | |
// SSE2 benchmark | |
QueryPerformanceCounter(&t0); | |
for (int i = 0; i < reps; ++i) | |
result_sse2 = checksum_sse2_pe_match(buffer, (ULONG)FILESIZE, checksum_field); | |
QueryPerformanceCounter(&t1); | |
double time_sse2 = seconds(t0, t1, freq); | |
// AVX2 benchmark | |
QueryPerformanceCounter(&t0); | |
for (int i = 0; i < reps; ++i) | |
result_avx2 = checksum_avx2_pe_match(buffer, (ULONG)FILESIZE, checksum_field); | |
QueryPerformanceCounter(&t1); | |
double time_avx2 = seconds(t0, t1, freq); | |
printf("File size: %d MB, Repetitions: %d\n", filesize_MB, reps); | |
printf("Scalar checksum: %08X, Time: %.4f sec, %.2f MB/s\n", result_scalar, time_scalar, (filesize_MB * reps) / time_scalar); | |
printf("SSE2 checksum: %08X, Time: %.4f sec, %.2f MB/s\n", result_sse2, time_sse2, (filesize_MB * reps) / time_sse2); | |
printf("AVX2 checksum: %08X, Time: %.4f sec, %.2f MB/s\n", result_avx2, time_avx2, (filesize_MB * reps) / time_avx2); | |
if (result_scalar != result_sse2 || result_scalar != result_avx2) | |
printf("Checksums DO NOT MATCH!\n"); | |
else | |
printf("Checksums match.\n"); | |
_aligned_free(buffer); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment