OpenMP Parallel Mergesort Sample
/* Iterative C program for merge sort, partly inspired by the
 * GeeksforGeeks article on bottom-up merge sort, and using code
 * from university lecture slides on co-rank merges by
 * Dr. Steven S. Lumetta, parallelized with OpenMP. */
// Compile with "gcc -fopenmp -O3 iterative_mergesort_openmp.c -o iterative_mergesort_openmp -lm"
/* Uploaded this file as a gist instead of within a proper repo since
 * the code within this file isn't quite up to par with the coding
 * standards of what I normally upload to my GitHub profile. */
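// The thread count can also be pinned at run time via the standard
// OMP_NUM_THREADS environment variable, e.g.
// "OMP_NUM_THREADS=8 ./iterative_mergesort_openmp" (illustrative).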
#include <math.h>
#include <omp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
// CPU PARAMS
#define MAX_PARALLEL_NESTING 2
#define SCALING_CONST 2
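// MAX_PARALLEL_NESTING is passed to omp_set_max_active_levels() in
// main() so that the per-merge "parallel for" nested inside the
// per-pass "parallel for" can actually spawn threads.
// SCALING_CONST scales how many threads a single large merge may
// claim in parallel_merge_using_corank() below.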
long min(long first, long second) {
  return (first < second ? first : second);
}
void merge(double* first_input_array,
           double* second_input_array,
           double* dest_array,
           long first_input_arr_len,
           long second_input_arr_len) {
  long current_index_first_arr = 0;
  long current_index_second_arr = 0;
  long current_index_dest_arr = 0;
  while (current_index_first_arr < first_input_arr_len &&
         current_index_second_arr < second_input_arr_len) {
    if (first_input_array[current_index_first_arr] <=
        second_input_array[current_index_second_arr]) {
      dest_array[current_index_dest_arr++] =
          first_input_array[current_index_first_arr++];
    } else {
      dest_array[current_index_dest_arr++] =
          second_input_array[current_index_second_arr++];
    }
  }
  if (current_index_first_arr >= first_input_arr_len) {
    for (; current_index_second_arr < second_input_arr_len;
         ++current_index_second_arr) {
      dest_array[current_index_dest_arr++] =
          second_input_array[current_index_second_arr];
    }
  } else {
    for (; current_index_first_arr < first_input_arr_len;
         ++current_index_first_arr) {
      dest_array[current_index_dest_arr++] =
          first_input_array[current_index_first_arr];
    }
  }
}
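/* Co-rank search (after Lumetta): given a starting index k into the
 * merged output, find how many elements i the first input array
 * contributes to output positions [0, k); the second array then
 * contributes j = k - i. For example (illustrative numbers), with
 * first = {1, 4, 7}, second = {2, 3, 9}, and k = 3, the first three
 * merged values are {1, 2, 3}, so i = 1 and j = 2. The function below
 * finds i by binary search over the candidate co-rank values. */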
long determine_corank(long dest_arr_starting_index,
                      double* first_input_arr,
                      long first_input_arr_len,
                      double* second_input_arr,
                      long second_input_arr_len) {
  long low, high;
  if (dest_arr_starting_index > second_input_arr_len) {
    low = dest_arr_starting_index - second_input_arr_len;
  } else {
    low = 0;
  }
  if (dest_arr_starting_index < first_input_arr_len) {
    high = dest_arr_starting_index;
  } else {
    high = first_input_arr_len;
  }
  while (low < high) {
    long current_index_first_arr = low + (high - low) / 2;
    long current_index_second_arr =
        dest_arr_starting_index - current_index_first_arr;
    if (current_index_first_arr > 0 &&
        current_index_second_arr < second_input_arr_len &&
        first_input_arr[current_index_first_arr - 1] >
            second_input_arr[current_index_second_arr]) {
      high = current_index_first_arr - 1;
    } else if (current_index_second_arr > 0 &&
               current_index_first_arr < first_input_arr_len &&
               first_input_arr[current_index_first_arr] <=
                   second_input_arr[current_index_second_arr - 1]) {
      low = current_index_first_arr + 1;
    } else {
      return current_index_first_arr;
    }
  }
  return low;
}
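/* Parallel merge: the merged output is cut into equally sized chunks,
 * one per thread; each thread uses determine_corank to locate the
 * slice of each input array that maps onto its output chunk, then
 * merges those slices independently of the other threads. */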
void parallel_merge_using_corank(double* first_input_array,
                                 double* second_input_array,
                                 double* dest_array,
                                 long first_input_array_len,
                                 long second_input_array_len,
                                 long num_of_threads,
                                 long overall_arr_size) {
  long min_array_portion_len =
      min(first_input_array_len, second_input_array_len);
  if (min_array_portion_len <= 0) {
    min_array_portion_len = 1;
  }
  long total_thread_count = 1 + ((num_of_threads * SCALING_CONST - 1) /
                                 (overall_arr_size / min_array_portion_len));
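  // The line above grants this merge a share of num_of_threads *
  // SCALING_CONST threads proportional to its size relative to the
  // whole array, rounded up. Illustrative numbers: with 8 threads,
  // SCALING_CONST = 2, overall_arr_size = 1024, and
  // min_array_portion_len = 128, this yields
  // 1 + (16 - 1) / (1024 / 128) = 2 threads for this merge.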
#pragma omp parallel for shared(total_thread_count) \
    num_threads(total_thread_count)
  for (long thread_id = 0; thread_id < total_thread_count; ++thread_id) {
    long dest_arr_chunk_size =
        1 + ((first_input_array_len + second_input_array_len - 1) /
             total_thread_count);
    long dest_arr_starting_index = thread_id * dest_arr_chunk_size;
    dest_arr_starting_index =
        min(first_input_array_len + second_input_array_len,
            dest_arr_starting_index);
    long dest_arr_ending_index = dest_arr_starting_index + dest_arr_chunk_size;
    dest_arr_ending_index = min(first_input_array_len + second_input_array_len,
                                dest_arr_ending_index);
    long first_arr_starting_index = determine_corank(
        dest_arr_starting_index, first_input_array, first_input_array_len,
        second_input_array, second_input_array_len);
    long first_arr_ending_index = determine_corank(
        dest_arr_ending_index, first_input_array, first_input_array_len,
        second_input_array, second_input_array_len);
    long second_arr_starting_index =
        dest_arr_starting_index - first_arr_starting_index;
    long second_arr_ending_index =
        dest_arr_ending_index - first_arr_ending_index;
    merge(&first_input_array[first_arr_starting_index],
          &second_input_array[second_arr_starting_index],
          &dest_array[dest_arr_starting_index],
          first_arr_ending_index - first_arr_starting_index,
          second_arr_ending_index - second_arr_starting_index);
  }
}
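/* Note: the inner "parallel for" above only fans out into its own
 * thread team when nested parallelism is permitted; main() arranges
 * this via omp_set_max_active_levels(MAX_PARALLEL_NESTING). */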
/* Iterative parallel_merge_sort function to sort (*arr)[0..n-1] */
void parallel_merge_sort(double** arr, double** arr_aux, long n, long num_threads) {
  long curr_size;   // Current size of the subarrays being merged;
                    // varies from 1 to n/2
  long left_start;  // Starting index of the left subarray to be merged
  double* arr_deref = *arr;
  double* arr_aux_deref = *arr_aux;
  // Seed the auxiliary buffer with the input data; each bottom-up pass
  // below reads from *arr_aux and writes into *arr. (Copying in the
  // other direction would clobber the caller's data with the
  // uninitialized contents of *arr_aux.)
#pragma omp parallel for shared(arr, arr_aux, n)
  for (long current_index = 0; current_index < n; ++current_index) {
    arr_aux_deref[current_index] = arr_deref[current_index];
  }
  long serial_merge_threshold = (n >> ((int)ceil(log(num_threads) / log(2))));
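  // Merge pairs no larger than roughly n / num_threads elements per
  // side run serially; only bigger ones pay the overhead of a nested
  // parallel merge. Illustrative numbers: n = 1048576 and
  // num_threads = 8 give a threshold of 1048576 >> 3 = 131072.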
  // Merge subarrays in bottom-up manner. First merge subarrays of
  // size 1 to create sorted subarrays of size 2, then merge subarrays
  // of size 2 to create sorted subarrays of size 4, and so on.
  for (curr_size = 1; curr_size <= n - 1; curr_size = 2 * curr_size) {
    // Pick the starting point of each subarray pair of the current size
#pragma omp parallel for shared(arr, arr_aux, curr_size, n) private(left_start)
    for (left_start = 0; left_start < n - 1; left_start += 2 * curr_size) {
      // Find the ending point of the left subarray; mid + 1 is the
      // starting point of the right subarray
      long mid = min(left_start + curr_size - 1, n - 1);
      long right_end = min(left_start + 2 * curr_size - 1, n - 1);
      long arr_left_len = mid - left_start + 1;
      long arr_right_len = right_end - mid;
      if (arr_left_len <= serial_merge_threshold &&
          arr_right_len <= serial_merge_threshold) {
        merge(&arr_aux_deref[left_start], &arr_aux_deref[mid + 1],
              &arr_deref[left_start], arr_left_len, arr_right_len);
      } else {
        parallel_merge_using_corank(&arr_aux_deref[left_start],
                                    &arr_aux_deref[mid + 1],
                                    &arr_deref[left_start], arr_left_len,
                                    arr_right_len, num_threads, n);
      }
    }
    // Swap the buffer pointers instead of copying: the array just
    // written becomes the source of the next pass
    double* tmp_array_ptr = *arr_aux;
    *arr_aux = *arr;
    *arr = tmp_array_ptr;
    arr_deref = *arr;
    arr_aux_deref = *arr_aux;
  }
  // Every pass ends with a swap, which leaves the most recently merged
  // data in *arr_aux; swap one final time so the caller finds the
  // fully sorted array in *arr
  double* tmp_array_ptr = *arr_aux;
  *arr_aux = *arr;
  *arr = tmp_array_ptr;
}
/* Driver program to test above functions */
int main() {
  omp_set_max_active_levels(MAX_PARALLEL_NESTING);
  srand(100);
  long N;
  printf("Enter array size: ");
  int blah = scanf("%ld", &N);
  // Hack to suppress compiler warning about ignoring scanf's result
  (void)blah;
  double* arr = malloc(N * sizeof(*arr));
  double* arr_aux = malloc(N * sizeof(*arr));
  if (arr == NULL || arr_aux == NULL) {
    fprintf(stderr, "Failed to allocate two arrays of %ld doubles!\n", N);
    return EXIT_FAILURE;
  }
  for (long i = 0; i < N; i++) {
    arr[i] = rand();
  }
  long num_threads = 0;
  // Every thread stores the same value, so the unsynchronized write is benign
#pragma omp parallel shared(num_threads)
  { num_threads = omp_get_num_threads(); }
  double start_time = omp_get_wtime();
  parallel_merge_sort(&arr, &arr_aux, N, num_threads);
  double end_time = omp_get_wtime();
  bool sorted_correctly = true;
  for (long i = 1; i < N; i++) {
    if (arr[i] < arr[i - 1]) {
      printf("Sorting failed using parallel_merge_sort!\n");
      sorted_correctly = false;
      break;
    }
  }
  if (sorted_correctly) {
    printf("Sorting succeeded using parallel_merge_sort!\n");
  }
  printf(
      "Time taken to sort array of length %ld "
      "using %ld threads: %lf seconds.\n",
      N, num_threads, end_time - start_time);
  free(arr);
  free(arr_aux);
  return 0;
}