Skip to content

Instantly share code, notes, and snippets.

@PkmX
Created November 8, 2014 06:09
Show Gist options
  • Save PkmX/b4c4e488e7a6f7717a7b to your computer and use it in GitHub Desktop.
Save PkmX/b4c4e488e7a6f7717a7b to your computer and use it in GitHub Desktop.
Parallel Programming Homework 1 - Monte Carlo π estimation with pthread
CC := gcc
CFLAGS := -std=gnu99 -Wall -pedantic -pthread -Ofast -march=native -funroll-loops -ftree-vectorize
BIN := pi
SRCS := pi.c
ZIP := HW1_0356162.zip
.PHONY: all clean zip
all: $(BIN)
$(BIN): $(SRCS)
$(CC) $(CFLAGS) $^ -o $@
clean:
-rm $(BIN)
zip: $(ZIP)
$(ZIP): $(SRCS) Makefile
zip $@ $^
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/sysinfo.h>
#include <pthread.h>
typedef int32_t intmindiv;
float intmindiv_generate(intmindiv* const rng) {
*rng *= 16807;
return (float) (*rng) / (float) 0x80000000;
}
uint64_t do_toss_impl(const uint64_t tosses) {
unsigned seed = time(NULL);
intmindiv rng = rand_r(&seed);
uint64_t in_circle = 0;
for (uint64_t i = 0; i < tosses; ++i) {
const float x = intmindiv_generate(&rng);
const float y = intmindiv_generate(&rng);
const float distance = x * x + y * y;
in_circle += (distance < 1.0f) ? 1 : 0;
}
return in_circle;
}
#ifdef __AVX__
typedef int32_t v4si __attribute__ ((vector_size(16)));
typedef int32_t v8si __attribute__ ((vector_size(32)));
typedef float v8sf __attribute__ ((vector_size(32)));
typedef v8si intmindiv8;
v8sf intmindiv8_generate(intmindiv8* const rng) {
*rng *= 16807;
return __builtin_ia32_cvtdq2ps256(*rng) / (float) 0x80000000;
}
int v8si_hsum(const v8si v) {
const v4si u = __builtin_ia32_vextractf128_si256(v, 0) + __builtin_ia32_vextractf128_si256(v, 1);
const v4si w = __builtin_ia32_phaddd128(u, u);
return __builtin_ia32_phaddd128(w, w)[0];
}
uint64_t do_toss_impl_avx(const uint64_t tosses) {
unsigned seed = time(NULL);
intmindiv8 vrng = { rand_r(&seed), rand_r(&seed), rand_r(&seed), rand_r(&seed), rand_r(&seed), rand_r(&seed), rand_r(&seed), rand_r(&seed) };
v8si in_circles = {0, 0, 0, 0, 0, 0, 0, 0};
for (uint64_t i = 0; i < tosses / 8; ++i) {
const v8sf vx = intmindiv8_generate(&vrng);
const v8sf vy = intmindiv8_generate(&vrng);
const v8sf vdistance = vx * vx + vy * vy;
const v8sf vradius = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
in_circles -= (v8si) __builtin_ia32_cmpps256(vdistance, vradius, 1);
}
return v8si_hsum(in_circles) + do_toss_impl(tosses % 8);
}
#endif // __AVX__
typedef struct toss_args_t {
uint64_t tosses;
uint64_t in_circle;
} toss_args;
void* do_toss(toss_args* const args) {
#ifdef __AVX__
args->in_circle = do_toss_impl_avx(args->tosses);
#else
args->in_circle = do_toss_impl(args->tosses);
#endif
return NULL;
}
int main(const int argc, const char * const * const argv) {
assert(argc >= 2);
const int nprocs = argc >= 3 ? atoi(argv[2]) : get_nprocs_conf();
const uint64_t tosses = atoll(argv[1]);
pthread_t threads[nprocs];
toss_args args[nprocs];
for (int n = 0; n < nprocs; ++n) {
args[n].tosses = tosses / nprocs + (n == 0 ? tosses % nprocs : 0);
args[n].in_circle = 0;
pthread_create(&threads[n], NULL, (void* (*)(void*)) do_toss, &args[n]);
}
uint64_t in_circle = 0;
for (int n = 0; n < nprocs; ++n) {
pthread_join(threads[n], NULL);
in_circle += args[n].in_circle;
}
const double pi = 4.0 * ((double) in_circle / (double) tosses);
printf("%.20g\n", pi);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment