Last active
June 26, 2019 18:16
-
-
Save cycyyy/1d8223448511cdf924391bcad836f449 to your computer and use it in GitHub Desktop.
lurker.page_rank.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <fstream> | |
#include <vector> | |
#include <map> | |
#include <string> | |
#include <sstream> | |
#include <cstdlib> | |
#include <cstdio> | |
#include <cstring> | |
#include <cmath> | |
#include <omp.h> | |
#include <immintrin.h> | |
#include <numa.h> | |
#include <unistd.h> | |
#include "peg_util.h" | |
#include <profiler.hh> | |
#include "ranker.hh" | |
#include <malloc.h> | |
//#include <papi.h> | |
using std::ifstream; | |
using std::string; | |
using std::getline; | |
using std::stringstream; | |
using std::cout; | |
using std::endl; | |
using std::to_string; | |
#define DUMP 0.85 | |
#define NUM_P_INT 16 // Number of packed intergers in one __m512i variable | |
#define ALIGNED_BYTES 64 | |
unsigned NNODES, NEDGES; | |
unsigned NUM_THREADS; | |
unsigned TILE_WIDTH; | |
unsigned CHUNK_SIZE; | |
unsigned SIZE_BUFFER_MAX; | |
unsigned ROW_STEP; | |
Profiler* p = new Profiler(); | |
//double start; | |
//double now; | |
//FILE *time_out; | |
//char *time_file = "timeline.txt"; | |
///////////////////// | |
//void page_rank(unsigned *n1s, unsigned *n2s, unsigned *graph_degrees, float *rank, float *sum) { | |
// | |
// //for(int i=0;i<10;i++) { | |
// double start_time = omp_get_wtime(); | |
// | |
//#pragma omp parallel for num_threads(NUM_THREADS) | |
// for(unsigned j=0;j<NEDGES;j++) { | |
// int n1 = n1s[j]; | |
// int n2 = n2s[j]; | |
//#pragma omp atomic | |
// sum[n2] += rank[n1]/graph_degrees[n1]; | |
// } | |
// //cout << (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1000000.0 << endl; | |
// double end_time = omp_get_wtime(); | |
// printf("%u %lf\n", NUM_THREADS, end_time - start_time); | |
// | |
// for(unsigned j = 0; j < NNODES; j++) { | |
// rank[j] = (1 - DUMP) / NNODES + DUMP * sum[j]; | |
// } | |
// //} | |
//} | |
//////////////////////////// | |
//const __m512i one_v = _mm512_set1_epi32(1); | |
//const __m512i zero_v = _mm512_set1_epi32(0); | |
//const __m512i minusone_v = _mm512_set1_epi32(-1); | |
inline void get_seq_sum(\ | |
unsigned *n1s,\ | |
unsigned *n2s,\ | |
unsigned *graph_degrees,\ | |
float *rank,\ | |
float *sum,\ | |
const unsigned &index,\ | |
const unsigned &frontier) | |
{ | |
for (unsigned i = index; i < frontier; ++i) { | |
unsigned n1 = n1s[i]; | |
unsigned n2 = n2s[i]; | |
//#pragma omp atomic | |
sum[n2] += rank[n1]/graph_degrees[n1]; | |
} | |
} | |
// Scan the data, accumulate the values with the same index. | |
// Then, store the cumulative sum to the last element in the data with the same index. | |
inline void scan_for_gather_add_scatter_conflict_safe_epi32( | |
__m512i &data, | |
__m512i indices) | |
{ | |
__m512i cd = _mm512_conflict_epi32(indices); | |
__mmask16 todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); | |
if (todo_mask) { | |
__m512i lz = _mm512_lzcnt_epi32(cd); | |
__m512i lid = _mm512_sub_epi32(_mm512_set1_epi32(31), lz); | |
while (todo_mask) { | |
__m512i todo_bcast = _mm512_broadcastmw_epi32(todo_mask); | |
__mmask16 now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, todo_bcast); | |
__m512i data_perm = _mm512_mask_permutexvar_epi32(_mm512_undefined_epi32(), now_mask, lid, data); | |
data = _mm512_mask_add_epi32(data, now_mask, data, data_perm); | |
todo_mask = _mm512_kxor(todo_mask, now_mask); | |
} | |
} | |
} | |
// Scan the data, accumulate the values with the same index. | |
// Then, store the cumulative sum to the last element in the data with the same index. | |
inline void scan_for_gather_add_scatter_conflict_safe_ps( | |
__m512 &data, | |
__m512i indices) | |
{ | |
__m512i cd = _mm512_conflict_epi32(indices); | |
__mmask16 todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); | |
if (todo_mask) { | |
__m512i lz = _mm512_lzcnt_epi32(cd); | |
__m512i lid = _mm512_sub_epi32(_mm512_set1_epi32(31), lz); | |
while (todo_mask) { | |
__m512i todo_bcast = _mm512_broadcastmw_epi32(todo_mask); | |
__mmask16 now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, todo_bcast); | |
__m512 data_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), now_mask, lid, data); | |
data = _mm512_mask_add_ps(data, now_mask, data, data_perm); | |
todo_mask = _mm512_kxor(todo_mask, now_mask); | |
} | |
} | |
} | |
inline void kernel_pageRank(\ | |
//void kernel_pageRank( | |
unsigned *n1_buffer,\ | |
unsigned *n2_buffer,\ | |
const unsigned &size_buffer,\ | |
float *sum,\ | |
float *rank,\ | |
unsigned *graph_degrees) | |
{ | |
struct timespec ts; | |
struct timespec te; | |
unsigned remainder = size_buffer % NUM_P_INT; | |
unsigned bound_edge_i = size_buffer - remainder; | |
for (unsigned edge_i = 0; edge_i < bound_edge_i; edge_i += NUM_P_INT) { | |
__m512i n1_v = _mm512_load_epi32(n1_buffer + edge_i); | |
__m512i n2_v = _mm512_load_epi32(n2_buffer + edge_i); | |
__m512 rank_v = _mm512_i32gather_ps(n1_v, rank, sizeof(float)); | |
__m512i graph_degrees_vi = _mm512_i32gather_epi32(n1_v, graph_degrees, sizeof(int)); | |
__m512 graph_degrees_v = _mm512_cvtepi32_ps(graph_degrees_vi); | |
__m512 tmp_sum = _mm512_div_ps(rank_v, graph_degrees_v); | |
scan_for_gather_add_scatter_conflict_safe_ps(tmp_sum, n2_v); | |
__m512 sum_n2_v = _mm512_i32gather_ps(n2_v, sum, sizeof(float)); | |
tmp_sum = _mm512_add_ps(tmp_sum, sum_n2_v); | |
_mm512_i32scatter_ps(sum, n2_v, tmp_sum, sizeof(float)); | |
//// Full loaded SIMD lanes | |
//__m512i n1_v = _mm512_load_epi32(n1_buffer + edge_i); | |
//__m512i n2_v = _mm512_load_epi32(n2_buffer + edge_i); | |
//__m512i conflict_n2 = _mm512_conflict_epi32(n2_v); | |
//__mmask16 todo_mask = _mm512_test_epi32_mask(conflict_n2, _mm512_set1_epi32(-1)); | |
//__m512 rank_v = _mm512_i32gather_ps(n1_v, rank, sizeof(float)); | |
//__m512i graph_degrees_vi = _mm512_i32gather_epi32(n1_v, graph_degrees, sizeof(int)); | |
//__m512 graph_degrees_v = _mm512_cvtepi32_ps(graph_degrees_vi); | |
//__m512 tmp_sum = _mm512_div_ps(rank_v, graph_degrees_v); | |
//if (todo_mask) { | |
// __m512i lz = _mm512_lzcnt_epi32(conflict_n2); | |
// __m512i lid = _mm512_sub_epi32(_mm512_set1_epi32(31), lz); | |
// while (todo_mask) { | |
// __m512i todo_bcast = _mm512_broadcastmw_epi32(todo_mask); | |
// __mmask16 now_mask = _mm512_mask_testn_epi32_mask(todo_mask, conflict_n2, todo_bcast); | |
// __m512 tmp_sum_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), now_mask, lid, tmp_sum); | |
// tmp_sum = _mm512_mask_add_ps(tmp_sum, now_mask, tmp_sum, tmp_sum_perm); | |
// todo_mask = _mm512_kxor(todo_mask, now_mask); | |
// } | |
//} | |
//__m512 sum_n2_v = _mm512_i32gather_ps(n2_v, sum, sizeof(float)); | |
//tmp_sum = _mm512_add_ps(tmp_sum, sum_n2_v); | |
//_mm512_i32scatter_ps(sum, n2_v, tmp_sum, sizeof(float)); | |
} | |
if (remainder > 0) { | |
__mmask16 in_range_m = (__mmask16) ((unsigned short) 0xFFFF >> (NUM_P_INT - remainder)); | |
__m512i n1_v = _mm512_mask_load_epi32(_mm512_undefined_epi32(), in_range_m, n1_buffer + bound_edge_i); | |
__m512i n2_v = _mm512_mask_load_epi32(_mm512_undefined_epi32(), in_range_m, n2_buffer + bound_edge_i); | |
__m512 rank_v = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), in_range_m, n1_v, rank, sizeof(float)); | |
__m512i graph_degrees_vi = _mm512_mask_i32gather_epi32(_mm512_set1_epi32(1), in_range_m, n1_v, graph_degrees, sizeof(int)); | |
__m512 graph_degrees_v = _mm512_cvtepi32_ps(graph_degrees_vi); | |
__m512 tmp_sum = _mm512_mask_div_ps(_mm512_set1_ps(0), in_range_m, rank_v, graph_degrees_v); | |
scan_for_gather_add_scatter_conflict_safe_ps(tmp_sum, n2_v); | |
__m512 sum_n2_v = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), in_range_m, n2_v, sum, sizeof(float)); | |
tmp_sum = _mm512_mask_add_ps(_mm512_undefined_ps(), in_range_m, tmp_sum, sum_n2_v); | |
_mm512_mask_i32scatter_ps(sum, in_range_m, n2_v, tmp_sum, sizeof(float)); | |
} | |
//get_seq_sum(n1_buffer, n2_buffer, graph_degrees, rank, sum, edge_i, size_buffer); | |
//#pragma omp parallel for num_threads(NUM_THREADS) | |
// for(unsigned j=0;j<NEDGES;j++) { | |
// int n1 = n1s[j]; | |
// int n2 = n2s[j]; | |
//#pragma omp atomic | |
// sum[n2] += rank[n1]/graph_degrees[n1]; | |
// } | |
} | |
inline void scheduler( | |
unsigned row_index, | |
unsigned tile_step, | |
unsigned *graph_heads, | |
unsigned *graph_tails, | |
unsigned *n1_buffer, | |
unsigned *n2_buffer, | |
unsigned *tile_offsets, | |
unsigned *tile_sizes, | |
float *sum, | |
float *rank, | |
unsigned *graph_degrees, | |
const unsigned &side_length) | |
{ | |
unsigned *sizes_buffers = (unsigned *) calloc(NUM_THREADS, sizeof(unsigned)); | |
unsigned bound_row_id = row_index + tile_step; | |
#pragma omp parallel for schedule(dynamic, CHUNK_SIZE) | |
//#pragma omp parallel for | |
for (unsigned col_id = 0; col_id < side_length; ++col_id) { | |
unsigned tid = omp_get_thread_num(); | |
unsigned *n1_buffer_base = n1_buffer + tid * SIZE_BUFFER_MAX; | |
unsigned *n2_buffer_base = n2_buffer + tid * SIZE_BUFFER_MAX; | |
//unsigned size_buffer = 0; | |
//unsigned capacity = SIZE_BUFFER_MAX; | |
unsigned size_buffer = sizes_buffers[tid]; | |
unsigned capacity = SIZE_BUFFER_MAX - size_buffer; | |
for (unsigned row_id = row_index; row_id < bound_row_id; ++row_id) { | |
unsigned tile_id = row_id * side_length + col_id; | |
if (0 == tile_sizes[tile_id]) { | |
continue; | |
} | |
// Load to buffer | |
unsigned edge_i = tile_offsets[tile_id]; | |
unsigned remain = tile_sizes[tile_id]; | |
while (remain != 0) { | |
if (capacity > 0) { | |
if (capacity > remain) { | |
// Put all remain into the buffer | |
memcpy(n1_buffer_base + size_buffer, graph_heads + edge_i, remain * sizeof(unsigned)); | |
memcpy(n2_buffer_base + size_buffer, graph_tails + edge_i, remain * sizeof(unsigned)); | |
edge_i += remain; | |
capacity -= remain; | |
size_buffer += remain; | |
remain = 0; | |
} else { | |
// Fill the buffer to full | |
memcpy(n1_buffer_base + size_buffer, graph_heads + edge_i, capacity * sizeof(unsigned)); | |
memcpy(n2_buffer_base + size_buffer, graph_tails + edge_i, capacity * sizeof(unsigned)); | |
edge_i += capacity; | |
remain -= capacity; | |
size_buffer += capacity; | |
capacity = 0; | |
} | |
} else { | |
// Buffer is full already | |
// Process what in buffer | |
kernel_pageRank(\ | |
n1_buffer_base,\ | |
n2_buffer_base,\ | |
SIZE_BUFFER_MAX,\ | |
sum,\ | |
rank,\ | |
graph_degrees); | |
capacity = SIZE_BUFFER_MAX; | |
size_buffer = 0; | |
} | |
} | |
} | |
sizes_buffers[tid] = size_buffer; | |
// Process the buffer | |
//kernel_pageRank(\ | |
// n1_buffer_base,\ | |
// n2_buffer_base,\ | |
// size_buffer,\ | |
// sum,\ | |
// rank,\ | |
// graph_degrees); | |
} | |
// Process the remains in buffer | |
#pragma omp parallel | |
{ | |
unsigned tid = omp_get_thread_num(); | |
unsigned *n1_buffer_base = n1_buffer + tid * SIZE_BUFFER_MAX; | |
unsigned *n2_buffer_base = n2_buffer + tid * SIZE_BUFFER_MAX; | |
unsigned size_buffer = sizes_buffers[tid]; | |
if (size_buffer > 0) { | |
kernel_pageRank(\ | |
n1_buffer_base,\ | |
n2_buffer_base,\ | |
size_buffer,\ | |
sum,\ | |
rank,\ | |
graph_degrees); | |
} | |
} | |
free(sizes_buffers); | |
// unsigned bound_row_id = row_index + tile_step; | |
//#pragma omp parallel for schedule(dynamic, CHUNK_SIZE) | |
// for (unsigned col_id = 0; col_id < side_length; ++col_id) { | |
// unsigned tid = omp_get_thread_num(); | |
// unsigned *n1_buffer_base = n1_buffer + tid * SIZE_BUFFER_MAX; | |
// unsigned *n2_buffer_base = n2_buffer + tid * SIZE_BUFFER_MAX; | |
// unsigned size_buffer = 0; | |
// unsigned capacity = SIZE_BUFFER_MAX; | |
// for (unsigned row_id = row_index; row_id < bound_row_id; ++row_id) { | |
// unsigned tile_id = row_id * side_length + col_id; | |
// if (0 == tile_sizes[tile_id]) { | |
// continue; | |
// } | |
// // Load to buffer | |
// unsigned edge_i = tile_offsets[tile_id]; | |
// unsigned remain = tile_sizes[tile_id]; | |
// while (remain != 0) { | |
// if (capacity > 0) { | |
// if (capacity > remain) { | |
// // Put all remain into the buffer | |
// memcpy(n1_buffer_base + size_buffer, graph_heads + edge_i, remain * sizeof(unsigned)); | |
// memcpy(n2_buffer_base + size_buffer, graph_tails + edge_i, remain * sizeof(unsigned)); | |
// edge_i += remain; | |
// capacity -= remain; | |
// size_buffer += remain; | |
// remain = 0; | |
// } else { | |
// // Fill the buffer to full | |
// memcpy(n1_buffer_base + size_buffer, graph_heads + edge_i, capacity * sizeof(unsigned)); | |
// memcpy(n2_buffer_base + size_buffer, graph_tails + edge_i, capacity * sizeof(unsigned)); | |
// edge_i += capacity; | |
// remain -= capacity; | |
// size_buffer += capacity; | |
// capacity = 0; | |
// } | |
// } else { | |
// // Buffer is full already | |
// // Process what in buffer | |
// kernel_pageRank(\ | |
// n1_buffer_base,\ | |
// n2_buffer_base,\ | |
// SIZE_BUFFER_MAX,\ | |
// sum,\ | |
// rank,\ | |
// graph_degrees); | |
// capacity = SIZE_BUFFER_MAX; | |
// size_buffer = 0; | |
// } | |
// } | |
// } | |
// // Process the buffer | |
// kernel_pageRank(\ | |
// n1_buffer_base,\ | |
// n2_buffer_base,\ | |
// size_buffer,\ | |
// sum,\ | |
// rank,\ | |
// graph_degrees); | |
// | |
// } | |
} | |
void page_rank(\ | |
unsigned *graph_heads, \ | |
unsigned *graph_tails, \ | |
unsigned *graph_degrees, \ | |
unsigned *tile_sizes, \ | |
float *rank, \ | |
float *sum, \ | |
unsigned *tile_offsets, \ | |
unsigned num_tiles, \ | |
unsigned side_length, \ | |
unsigned *n1_buffer, \ | |
unsigned *n2_buffer) | |
{ | |
//unsigned ROW_STEP = 1; | |
if (side_length < ROW_STEP) { | |
printf("Error: ROW_STEP (%u) is to large, larger than side_length (%u)\n", \ | |
ROW_STEP, side_length); | |
exit(3); | |
} | |
omp_set_num_threads(NUM_THREADS); | |
//unsigned row_index; | |
unsigned remainder = side_length % ROW_STEP; | |
unsigned bound_row_index = side_length - remainder; | |
// Cache miss | |
//CacheMissRate miss_rate; | |
double start_time = omp_get_wtime(); // Timer Starts | |
//miss_rate.measure_start(); | |
for (unsigned row_index = 0; row_index < bound_row_index; row_index += ROW_STEP) { | |
scheduler(\ | |
row_index,\ | |
//row_index + ROW_STEP, | |
ROW_STEP, | |
graph_heads,\ | |
graph_tails,\ | |
n1_buffer,\ | |
n2_buffer,\ | |
tile_offsets,\ | |
tile_sizes,\ | |
sum,\ | |
rank,\ | |
graph_degrees,\ | |
side_length); | |
} | |
// printf("\n"); | |
if (remainder > 0) { | |
scheduler(\ | |
bound_row_index,\ | |
remainder,\ | |
graph_heads,\ | |
graph_tails,\ | |
n1_buffer,\ | |
n2_buffer,\ | |
tile_offsets,\ | |
tile_sizes,\ | |
sum,\ | |
rank,\ | |
graph_degrees,\ | |
side_length); | |
} | |
//#pragma omp parallel num_threads(64) | |
// for(unsigned j = 0; j < NNODES; j++) { | |
// rank[j] = (1 - DUMP) / NNODES + DUMP * sum[j]; | |
// } | |
remainder = NNODES % NUM_P_INT; | |
unsigned bound_v_i = NNODES - remainder; | |
__m512 dump_v = _mm512_set1_ps(DUMP); | |
__m512 one_minus_dump_v = _mm512_sub_ps(_mm512_set1_ps(1.0), dump_v); | |
__m512 first_term = _mm512_div_ps(one_minus_dump_v, _mm512_set1_ps((float) NNODES)); | |
#pragma omp parallel for | |
for (unsigned v_i = 0; v_i < bound_v_i; v_i += NUM_P_INT) { | |
__m512 sum_v = _mm512_load_ps(sum + v_i); | |
__m512 second_term = _mm512_mul_ps(dump_v, sum_v); | |
_mm512_store_ps(rank + v_i, _mm512_add_ps(first_term, second_term)); | |
} | |
if (remainder > 0) { | |
unsigned short in_range_m_t = (unsigned short) 0xFFFF >> (NUM_P_INT - remainder); | |
__mmask16 in_range_m = (__mmask16) ((unsigned short) 0xFFFF >> (NUM_P_INT - remainder)); | |
__m512 sum_v = _mm512_mask_load_ps(_mm512_undefined_ps(), in_range_m, sum + bound_v_i); | |
__m512 second_term = _mm512_mask_mul_ps(_mm512_undefined_ps(), in_range_m, dump_v, sum_v); | |
_mm512_mask_store_ps(rank + bound_v_i, in_range_m, | |
_mm512_mask_add_ps(_mm512_undefined_ps(), in_range_m, first_term, second_term)); | |
} | |
double end_time = omp_get_wtime(); | |
double rt; | |
rt = end_time - start_time; | |
// printf("%u %lf\n", NUM_THREADS, rt = end_time - start_time); | |
bot_best_perform.record(rt, NUM_THREADS); | |
} | |
void print(float *rank) | |
{ | |
FILE *fout = fopen("ranks.txt", "w"); | |
for(unsigned i=0;i<NNODES;i++) { | |
//cout << rank[i] << " "; | |
fprintf(fout, "%lf\n", rank[i]); | |
} | |
//cout << endl; | |
fclose(fout); | |
} | |
void input(char filename[]) | |
{ | |
#ifdef ONEDEBUG | |
printf("input: %s\n", filename); | |
#endif | |
//string prefix = string(filename) + "_tiled-" + to_string(TILE_WIDTH); | |
//string file_name_pre = string(filename) + "_reorder"; | |
string file_name_pre = string(filename); | |
string prefix = file_name_pre + "_coo-tiled-" + to_string(TILE_WIDTH); | |
string fname = prefix + "-" + to_string(0); | |
FILE *fin = fopen(fname.c_str(), "r"); | |
if (!fin) { | |
fprintf(stderr, "cannot open file: %s\n", fname.c_str()); | |
exit(1); | |
} | |
fscanf(fin, "%u %u", &NNODES, &NEDGES); | |
fclose(fin); | |
unsigned num_tiles; | |
unsigned side_length; | |
if (NNODES % TILE_WIDTH) { | |
side_length = NNODES / TILE_WIDTH + 1; | |
} else { | |
side_length = NNODES / TILE_WIDTH; | |
} | |
num_tiles = side_length * side_length; | |
// Read the offset and number of edges for every tile | |
fname = prefix + "-offsets"; | |
fin = fopen(fname.c_str(), "r"); | |
if (!fin) { | |
fprintf(stderr, "cannot open file: %s\n", fname.c_str()); | |
exit(1); | |
} | |
//unsigned *tile_offsets = (unsigned *) malloc(num_tiles * sizeof(unsigned)); | |
//nsigned *tile_offsets = (unsigned *) p->lurker_malloc(num_tiles * sizeof(unsigned)); | |
printf("tile_offsets: "); | |
unsigned *tile_offsets = (unsigned *) p->lurker_malloc_tier(num_tiles * sizeof(unsigned), 1024); | |
for (unsigned i = 0; i < num_tiles; ++i) { | |
fscanf(fin, "%u", tile_offsets + i); | |
} | |
fclose(fin); | |
//unsigned *tile_sizes = (unsigned *) malloc(num_tiles * sizeof(unsigned)); | |
//unsigned *tile_sizes = (unsigned *) p->lurker_malloc(num_tiles * sizeof(unsigned)); | |
printf("tile_sizes: "); | |
unsigned *tile_sizes = (unsigned *) p->lurker_malloc_tier(num_tiles * sizeof(unsigned), 1024); | |
//memset(tile_sizes, 0, num_tiles * sizeof(unsigned)); | |
for (unsigned i = 0; i < num_tiles; ++i) { | |
if (i != num_tiles - 1) { | |
tile_sizes[i] = tile_offsets[i + 1] - tile_offsets[i]; | |
} else { | |
tile_sizes[i] = NEDGES - tile_offsets[i]; | |
} | |
} | |
// Read graph_degrees | |
fname = prefix + "-nneibor"; | |
fin = fopen(fname.c_str(), "r"); | |
if (!fin) { | |
fprintf(stderr, "cannot open file: %s\n", fname.c_str()); | |
exit(1); | |
} | |
//unsigned *graph_degrees = (unsigned *) malloc(NNODES * sizeof(unsigned)); | |
//unsigned *graph_degrees = (unsigned *) p->lurker_malloc(NNODES * sizeof(unsigned)); | |
printf("graph_degrees: "); | |
unsigned *graph_degrees = (unsigned *) p->lurker_malloc_tier(NNODES * sizeof(unsigned), 1); | |
for (unsigned i = 0; i < NNODES; ++i) { | |
fscanf(fin, "%u", graph_degrees + i); | |
} | |
fclose(fin); | |
// Read tiles | |
//unsigned *graph_heads = (unsigned *) _mm_malloc(NEDGES * sizeof(unsigned), ALIGNED_BYTES); | |
//unsigned *graph_tails = (unsigned *) _mm_malloc(NEDGES * sizeof(unsigned), ALIGNED_BYTES); | |
//unsigned *graph_heads = (unsigned *) p->lurker_malloc(NEDGES * sizeof(unsigned)); | |
//unsigned *graph_tails = (unsigned *) p->lurker_malloc(NEDGES * sizeof(unsigned)); | |
printf("graph_heads: "); | |
unsigned *graph_heads = (unsigned *) p->lurker_malloc_tier(NEDGES * sizeof(unsigned), 1024); | |
printf("graph_tails: "); | |
unsigned *graph_tails = (unsigned *) p->lurker_malloc_tier(NEDGES * sizeof(unsigned), 1024); | |
unsigned edge_bound = NEDGES / ALIGNED_BYTES; | |
NUM_THREADS = 64; | |
#pragma omp parallel for num_threads(NUM_HT) | |
for (unsigned i = 0; i < NUM_HT; i++) { | |
p->enroll(); | |
} | |
#pragma omp parallel num_threads(NUM_THREADS) private(fname, fin) | |
{ | |
unsigned tid = omp_get_thread_num(); | |
unsigned offset = tid * edge_bound; | |
fname = prefix + "-" + to_string(tid); | |
fin = fopen(fname.c_str(), "r"); | |
if (!fin) { | |
fprintf(stderr, "cannot open file: %s\n", fname.c_str()); | |
exit(1); | |
} | |
if (0 == tid) { | |
fscanf(fin, "%u %u", &NNODES, &NEDGES); | |
} | |
unsigned bound_index; | |
if (NUM_THREADS - 1 != tid) { | |
bound_index = edge_bound + offset; | |
} else { | |
bound_index = NEDGES; | |
} | |
for (unsigned index = offset; index < bound_index; ++index) { | |
unsigned n1; | |
unsigned n2; | |
fscanf(fin, "%u %u", &n1, &n2); | |
n1--; | |
n2--; | |
graph_heads[index] = n1; | |
graph_tails[index] = n2; | |
//unsigned n1_id = n1 / TILE_WIDTH; | |
//unsigned n2_id = n2 / TILE_WIDTH; | |
//unsigned tile_id = n1_id * side_length + n2_id; | |
} | |
fclose(fin); | |
} | |
//float *rank = (float *) _mm_malloc(NNODES * sizeof(float), ALIGNED_BYTES); | |
//float *sum = (float *) _mm_malloc(NNODES * sizeof(float), ALIGNED_BYTES); | |
float *rank = (float *) p->lurker_malloc_tier(NNODES * sizeof(float), 1); | |
float *sum = (float *) p->lurker_malloc_tier(NNODES * sizeof(float), 1); | |
// Ranker::numa_migrate(rank, NNODES * sizeof(float), 1); | |
// Ranker::numa_migrate(sum, NNODES * sizeof(float), 1); | |
// p->rank_and_move(); | |
// #pragma omp parallel for num_threads(64) | |
// for (unsigned i = 0; i < NNODES; i++) { | |
// rank[i] = 1.0; | |
// sum[i] = 0.0; | |
// } | |
// #pragma omp parallel for num_threads(64) | |
// for (unsigned i = 0; i < NNODES; i++) { | |
// rank[i] = 1.0; | |
// sum[i] = 0.0; | |
// } | |
//float *rank = (float *) numa_alloc_onnode(NNODES * sizeof(float), 1); | |
//float *sum = (float *) numa_alloc_onnode(NNODES * sizeof(float), 1); | |
//float *rank = (float *) p->lurker_malloc(NNODES * sizeof(float)); | |
//float *sum = (float *) p->lurker_malloc(NNODES * sizeof(float)); | |
#ifdef ONEDEBUG | |
unsigned bound_i = 7; | |
#else | |
unsigned bound_i = 9; | |
#endif | |
// PageRank | |
CHUNK_SIZE = 1; | |
SIZE_BUFFER_MAX = 512; | |
//SIZE_BUFFER_MAX = 400; | |
//printf("tile_size: %u\n", TILE_WIDTH);//test | |
//printf("stripe_length: %u\n", ROW_STEP); | |
unsigned k_set = 10; | |
unsigned *n1_buffer = (unsigned *) p->lurker_malloc(sizeof(unsigned) * SIZE_BUFFER_MAX * 256); | |
unsigned *n2_buffer = (unsigned *) p->lurker_malloc(sizeof(unsigned) * SIZE_BUFFER_MAX * 256); | |
#ifdef SKX | |
for (unsigned i = 1; i < 3; ++i) { | |
NUM_THREADS = (unsigned) 24 * i; | |
#else | |
for (unsigned i = 6; i < bound_i; ++i) { | |
NUM_THREADS = (unsigned) pow(2, i); | |
#endif | |
//unsigned *n1_buffer = (unsigned *) _mm_malloc(sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, ALIGNED_BYTES); | |
//unsigned *n2_buffer = (unsigned *) _mm_malloc(sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, ALIGNED_BYTES); | |
//unsigned *n1_buffer = (unsigned *) p->lurker_malloc(sizeof(unsigned) * SIZE_BUFFER_MAX * 256); | |
//unsigned *n2_buffer = (unsigned *) p->lurker_malloc(sizeof(unsigned) * SIZE_BUFFER_MAX * 256); | |
//unsigned *n1_buffer = (unsigned *) numa_alloc_onnode(sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, 1); | |
//unsigned *n2_buffer = (unsigned *) numa_alloc_onnode(sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, 1); | |
//unsigned *n1_buffer = (unsigned *) p->lurker_malloc_tier(sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, 16); | |
//unsigned *n2_buffer = (unsigned *) p->lurker_malloc_tier(sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, 16); | |
bot_best_perform.reset(); | |
// p->reset(); | |
// p->start_all(); | |
for (int k = 0; k < k_set; ++k) { | |
#pragma omp parallel for num_threads(64) | |
for (unsigned i = 0; i < NNODES; i++) { | |
rank[i] = 1.0; | |
sum[i] = 0.0; | |
} | |
#ifndef ONEDEBUG | |
//sleep(10); | |
#endif | |
struct timeval st,end; | |
struct timezone tzp; | |
gettimeofday(&st,&tzp); | |
float *rank_tmp = (float *) malloc(NNODES * sizeof(float)); | |
float *sum_tmp = (float *) malloc(NNODES * sizeof(float)); | |
memcpy(rank_tmp, rank, NNODES * sizeof(float)); | |
memcpy(sum_tmp,sum, NNODES * sizeof(float)); | |
munmap(rank, NNODES * sizeof(float) / 4096 * 4096); | |
munmap(sum, NNODES * sizeof(float) / 4096 * 4096); | |
void *a = mmap(rank, NNODES * sizeof(float) / 4096 * 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); | |
void *b = mmap(sum, NNODES * sizeof(float) / 4096 * 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); | |
Ranker::numa_migrate(rank, NNODES * sizeof(float), 1); | |
Ranker::numa_migrate(sum, NNODES * sizeof(float), 1); | |
memcpy(rank, rank_tmp, NNODES * sizeof(float)); | |
memcpy(sum,sum_tmp, NNODES * sizeof(float)); | |
free(rank_tmp); | |
free(sum_tmp); | |
gettimeofday(&end,&tzp); | |
printf("using %.0f microseconds\n",(end.tv_sec-st.tv_sec)*1.e6 + (end.tv_usec - st.tv_usec)); | |
p->start_all(); | |
page_rank(\ | |
graph_heads, \ | |
graph_tails, \ | |
graph_degrees, \ | |
tile_sizes, \ | |
rank, \ | |
sum, \ | |
tile_offsets, \ | |
num_tiles, \ | |
side_length, \ | |
n1_buffer, \ | |
n2_buffer); | |
} | |
bot_best_perform.print_average(NUM_THREADS); | |
p->stop_all(); | |
p->rank_and_move(); | |
//_mm_free(n1_buffer); | |
//_mm_free(n2_buffer); | |
//numa_free(n1_buffer, sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS); | |
//numa_free(n2_buffer, sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS); | |
//p->lurker_free_tier(n1_buffer, sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, 16); | |
//p->lurker_free_tier(n2_buffer, sizeof(unsigned) * SIZE_BUFFER_MAX * NUM_THREADS, 16); | |
} | |
p->lurker_free(n1_buffer, sizeof(unsigned) * SIZE_BUFFER_MAX * 256); | |
p->lurker_free(n2_buffer, sizeof(unsigned) * SIZE_BUFFER_MAX * 256); | |
#ifdef ONEDEBUG | |
//print(rank); | |
#endif | |
// Free memory | |
// _mm_free(graph_heads); | |
// _mm_free(graph_tails); | |
// free(graph_degrees); | |
// free(tile_offsets); | |
// free(tile_sizes); | |
// _mm_free(rank); | |
// _mm_free(sum); | |
//p->lurker_free(graph_heads, NEDGES * sizeof(unsigned)); | |
//p->lurker_free(graph_tails, NEDGES * sizeof(unsigned)); | |
//p->lurker_free(graph_degrees, NNODES * sizeof(unsigned)); | |
//p->lurker_free(tile_offsets, num_tiles * sizeof(unsigned)); | |
//p->lurker_free(tile_sizes, num_tiles * sizeof(unsigned)); | |
//p->lurker_free(rank, NNODES * sizeof(float)); | |
//p->lurker_free(sum, NNODES * sizeof(float)); | |
p->lurker_free_tier(graph_heads, NEDGES * sizeof(unsigned), 1024 * 10); | |
p->lurker_free_tier(graph_tails, NEDGES * sizeof(unsigned), 1024 * 10); | |
p->lurker_free_tier(graph_degrees, NNODES * sizeof(unsigned), 512); | |
p->lurker_free_tier(tile_offsets, num_tiles * sizeof(unsigned), 1024); | |
p->lurker_free_tier(tile_sizes, num_tiles * sizeof(unsigned), 1024); | |
p->lurker_free_tier(rank, NNODES * sizeof(float), 32); | |
p->lurker_free_tier(sum, NNODES * sizeof(float), 32); | |
//numa_free(rank, NNODES * sizeof(float)); | |
//numa_free(sum, NNODES * sizeof(float)); | |
delete p; | |
} | |
int main(int argc, char *argv[]) | |
{ | |
char *filename; | |
if (argc > 3) { | |
filename = argv[1]; | |
TILE_WIDTH = strtoul(argv[2], NULL, 0); | |
ROW_STEP = strtoul(argv[3], NULL, 0); | |
} else { | |
//filename = "/home/zpeng/benchmarks/data/pokec_combine/soc-pokec"; | |
//TILE_WIDTH = 1024; | |
//ROW_STEP = 16; | |
printf("Usage: ./pageRank <data> <tile_width> <stripe_length>\n"); | |
exit(1); | |
} | |
input(filename); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment