Created
January 6, 2023 07:10
-
-
Save daramkun/26fc51b3d42010b2b76a82d8feeaa165 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <intrin.h>
#include <immintrin.h>
#include <emmintrin.h>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <Windows.h>
double current_ticks() | |
{ | |
LARGE_INTEGER freq, counter; | |
QueryPerformanceFrequency (&freq); | |
QueryPerformanceCounter (&counter); | |
return counter.QuadPart / static_cast<double>(freq.QuadPart); | |
} | |
// Converts a 32-bit float to the bit pattern of a 16-bit half float using
// plain integer arithmetic (no SIMD).
//
// Behaviour:
//   - |value| <= FLT_EPSILON            -> +0 (0x0000, sign is dropped)
//   - too small for a normal half       -> signed zero (no subnormals produced)
//   - too large for a half, or infinity -> signed infinity (0x7c00 | sign)
//   - NaN                               -> quiet NaN (0x7e00 | sign)
//   - otherwise                         -> sign | exponent | top 10 mantissa bits
// The mantissa is truncated, not rounded, so results may differ by one ULP
// from a round-to-nearest conversion.
uint16_t convert_f32_to_f16_plain(const float value)
{
    if (std::fabs(value) <= FLT_EPSILON)
        return 0;

    // memcpy is the well-defined way to read the float's bit pattern;
    // dereferencing a reinterpret_cast'ed pointer violates strict aliasing.
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));

    const uint32_t sign = (bits >> 16) & 0x8000u;
    const uint32_t raw_exp = (bits >> 23) & 0xffu;
    // Re-bias the exponent from float (127) to half (15).
    const int exp = static_cast<int>(raw_exp) - (0x7f - 0x0f);
    const uint32_t frac = bits & 0x007fffffu;

    // Exponent does not fit in 5 bits: overflow, infinity or NaN.
    if (exp >= 31)
    {
        const bool is_nan = (raw_exp == 0xffu) && (frac != 0);
        return static_cast<uint16_t>(sign | (is_nan ? 0x7e00u : 0x7c00u));
    }

    // Below the smallest normal half: flush to signed zero.
    if (exp <= 0)
        return static_cast<uint16_t>(sign);

    // Keep the top 10 of the 23 mantissa bits so they land in bits 0..9.
    return static_cast<uint16_t>(sign | (static_cast<uint32_t>(exp) << 10) | (frac >> 13));
}
void convert_f32pair_to_f16pair_plain(const float value[4], uint16_t result[4]) | |
{ | |
for (auto i = 0; i < 4; ++i) | |
result[i] = convert_f32_to_f16_plain(value[i]); | |
} | |
uint16_t convert_f32_to_f16_f16c(const float value) | |
{ | |
uint16_t temp[8]; | |
const auto f16 = _mm_cvtps_ph(_mm_set_ss(value), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); | |
_mm_store_si128(reinterpret_cast<__m128i*>(temp), f16); | |
return temp[0]; | |
} | |
void convert_f32pair_to_f16pair_f16c(const float value[4], uint16_t result[4]) | |
{ | |
uint16_t temp[8]; | |
const auto f16 = _mm_cvtps_ph(_mm_load_ps(value), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); | |
_mm_store_si128(reinterpret_cast<__m128i*>(temp), f16); | |
memcpy(result, temp, sizeof(uint16_t) * 4); | |
} | |
int main(int argc, char* argv[]) | |
{ | |
constexpr int loop_count = 100000; | |
constexpr float value = 10.10f; | |
constexpr float values[4] = { 1, 2, 3, 4 }; | |
uint16_t result[4]; | |
double last_tick, current_tick; | |
printf("plane:\n"); | |
last_tick = current_ticks(); | |
for(auto i = 0; i < loop_count;++i) | |
convert_f32_to_f16_plain(value); | |
current_tick = current_ticks(); | |
printf(" %lf\n", current_tick - last_tick); | |
printf("f16c:\n"); | |
last_tick = current_ticks(); | |
for(auto i = 0; i < loop_count;++i) | |
convert_f32_to_f16_f16c(value); | |
current_tick = current_ticks(); | |
printf(" %lf\n", current_tick - last_tick); | |
printf("planex4:\n"); | |
last_tick = current_ticks(); | |
for (auto i = 0; i < loop_count; ++i) | |
convert_f32pair_to_f16pair_plain(values, result); | |
current_tick = current_ticks(); | |
printf(" %lf\n", current_tick - last_tick); | |
printf("f16cx4:\n"); | |
last_tick = current_ticks(); | |
for (auto i = 0; i < loop_count; ++i) | |
convert_f32pair_to_f16pair_f16c(values, result); | |
current_tick = current_ticks(); | |
printf(" %lf\n", current_tick - last_tick); | |
return 0; | |
} |
Author
daramkun
commented
Jan 6, 2023
•
- one-for-one: plain wins
- x4-for-x4: intrinsic wins
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment