Created
December 8, 2011 01:23
-
-
Save toddlipcon/1445662 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <xmmintrin.h>
/*
 * File-scope MMX zero vector. main() assigns it at startup, so the symbol is
 * kept; atol_chunk() below builds its own zero operand and does not read it.
 */
__m64 zero;

/*
 * Convert exactly four ASCII decimal digits to their numeric value.
 *
 * str must point to at least four readable bytes, each in '0'..'9'; no
 * terminator is required and none is looked at. Returns the value 0..9999.
 * Bytes outside '0'..'9' are not detected: only their low nibble is used.
 *
 * The byte order assumed here is little-endian (the first character ends up
 * in the lowest lane), which holds on the x86 targets that provide MMX.
 *
 * `static inline` is used because a plain `inline` definition in C99/C11 does
 * not emit an external definition, which makes unoptimised builds fail to
 * link.
 */
static inline long atol_chunk(const char *str) {
    /*
     * Load the four characters with memcpy rather than through a casted
     * pointer: str may be unaligned, and reading a char array through a
     * uint32_t lvalue violates strict aliasing.
     */
    uint32_t chunk;
    memcpy(&chunk, str, sizeof chunk);

    /* '0'..'9' are 0x30..0x39, so masking keeps just the digit value:
     * "1234" -> bytes 01 02 03 04. */
    __m64 digits = _mm_cvtsi32_si64((int)(chunk & 0x0f0f0f0fu));

    /* Interleave with zero bytes to widen each digit to a 16-bit lane:
     * lanes = { d0, d1, d2, d3 } with d0 being the first character. */
    __m64 lanes = _mm_unpacklo_pi8(digits, _mm_setzero_si64());

    /*
     * Multiply each lane by its decimal weight and add adjacent pairs:
     *   low  32 bits = d0 * 1000 + d1 * 100
     *   high 32 bits = d2 * 10   + d3 * 1
     */
    __m64 weights = _mm_set_pi16(1, 10, 100, 1000);
    __m64 sums = _mm_madd_pi16(lanes, weights);

    /* Extract both halves with intrinsics instead of casting the vector. */
    long thousands_hundreds = _mm_cvtsi64_si32(sums);
    long tens_ones = _mm_cvtsi64_si32(_mm_srli_si64(sums, 32));

    /* Leave MMX state so later x87 floating-point code is not corrupted. */
    _mm_empty();

    return thousands_hundreds + tens_ones;
}
/*
 * Parse a NUL-terminated string of decimal digits into a long.
 *
 * The string is consumed four digits at a time through atol_chunk(); the
 * remaining one to three digits are handled separately. arg is only read,
 * never modified.
 *
 * Unlike atol(), this accepts digits only: there is no handling of leading
 * whitespace, a sign, or trailing non-digit characters, and such input is
 * not detected. An empty string yields 0. Values that do not fit in a long
 * overflow (undefined behaviour), so callers must bound the input length.
 */
long my_atol(char *arg) {
    /* size_t avoids truncating the length of very long strings into an int. */
    size_t len = strlen(arg);
    long ret = 0;

    /* Each full chunk shifts the running value by four decimal places. */
    while (len >= 4) {
        ret = ret * 10000 + atol_chunk(arg);
        arg += 4;
        len -= 4;
    }

    /* After the loop 0..3 digits remain. */
    switch (len) {
    case 1:
        return ret * 10 + (arg[0] - '0');
    case 2:
        return ret * 100 + (arg[0] - '0') * 10 + (arg[1] - '0');
    case 3: {
        /* Left-pad to four digits so the chunk converter can be reused.
         * The braces give the declaration its own scope; a declaration
         * directly after a case label is not valid before C23. */
        char padded[4] = {'0', arg[0], arg[1], arg[2]};
        return ret * 1000 + atol_chunk(padded);
    }
    default:
        /* len == 0: nothing left to convert. */
        return ret;
    }
}
/*
 * Read the processor's time-stamp counter (x86-64 only).
 *
 * A CPUID with EAX = 0 is executed first to serialize the instruction stream
 * before RDTSC is issued. Returns the 64-bit counter assembled from EDX:EAX.
 *
 * `static inline` is used because a plain inline definition in C99/C11 emits
 * no external definition, which breaks linking in unoptimised builds.
 */
static inline uint64_t rdtsc(void) {
    uint32_t lo, hi;

    /* Serialize. CPUID overwrites EAX, EBX, ECX and EDX; the "memory"
     * clobber additionally stops the compiler from moving memory accesses
     * across the measurement point. */
    __asm__ __volatile__(
        "xorl %%eax,%%eax \n cpuid"
        :
        :
        : "%rax", "%rbx", "%rcx", "%rdx", "memory");

    /* "=A" cannot be used: on x86-64 it would pick %rax alone and yield only
     * the lower 32 bits of the TSC, so both halves are read explicitly. */
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));

    return ((uint64_t)hi << 32) | lo;
}
| int main() { | |
| char buf[100]; | |
| long sum = 0; | |
| int i; | |
| zero = _mm_set_pi16(0,0,0,0); | |
| uint64_t st = rdtsc(); | |
| sum = 0; | |
| for (i = 0; i < 10000000; i++) { | |
| sprintf(buf, "%d", i); | |
| sum += atol(buf); | |
| } | |
| uint64_t et = rdtsc(); | |
| printf("sum : %ld took %ld ticks\n", sum, (et - st)); | |
| st = rdtsc(); | |
| sum = 0; | |
| for (i = 0; i < 10000000; i++) { | |
| sprintf(buf, "%d", i); | |
| long ret = my_atol(buf); | |
| sum += ret; | |
| #ifdef CHECK | |
| if (ret != i) { | |
| printf("fail at %d != %ld\n", i, ret); | |
| exit(1); | |
| } | |
| #endif | |
| } | |
| et = rdtsc(); | |
| printf("sum : %ld took %ld ticks\n", sum, (et - st)); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment