OpenMP Parallel Mergesort Sample
/* Iterative C program for merge sort, partly inspired by the
 * GeeksforGeeks article on bottom-up merge sort, and using code
 * from university lecture slides on co-rank merges by
 * Dr. Steven S. Lumetta, parallelized with OpenMP. */
// Compile with "gcc -fopenmp -O3 iterative_mergesort_openmp.c -o iterative_mergesort_openmp -lm"
/* Uploaded this file as a gist instead of within a proper repo since
 * the code within this file isn't quite up to par with the coding
 * standards of what I normally upload to my GitHub profile. */
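// The thread count can also be pinned at run time via the standard
// OMP_NUM_THREADS environment variable, e.g.
// "OMP_NUM_THREADS=8 ./iterative_mergesort_openmp" (illustrative).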
#include <math.h>
#include <omp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
// CPU PARAMS
#define MAX_PARALLEL_NESTING 2
#define SCALING_CONST 2
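// MAX_PARALLEL_NESTING is passed to omp_set_max_active_levels() in
// main() so that the per-merge "parallel for" nested inside the
// per-pass "parallel for" can actually spawn threads.
// SCALING_CONST scales how many threads a single large merge may
// claim in parallel_merge_using_corank() below.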
long min(long first, long second) {
  return (first < second ? first : second);
}
void merge(double* first_input_array,
           double* second_input_array,
           double* dest_array,
           long first_input_arr_len,
           long second_input_arr_len) {
  long current_index_first_arr = 0;
  long current_index_second_arr = 0;
  long current_index_dest_arr = 0;
  while (current_index_first_arr < first_input_arr_len &&
         current_index_second_arr < second_input_arr_len) {
    if (first_input_array[current_index_first_arr] <=
        second_input_array[current_index_second_arr]) {
      dest_array[current_index_dest_arr++] =
          first_input_array[current_index_first_arr++];
    } else {
      dest_array[current_index_dest_arr++] =
          second_input_array[current_index_second_arr++];
    }
  }
  if (current_index_first_arr >= first_input_arr_len) {
    for (; current_index_second_arr < second_input_arr_len;
         ++current_index_second_arr) {
      dest_array[current_index_dest_arr++] =
          second_input_array[current_index_second_arr];
    }
  } else {
    for (; current_index_first_arr < first_input_arr_len;
         ++current_index_first_arr) {
      dest_array[current_index_dest_arr++] =
          first_input_array[current_index_first_arr];
    }
  }
}
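/* Co-rank search (after Lumetta): given a starting index k into the
 * merged output, find how many elements i the first input array
 * contributes to output positions [0, k); the second array then
 * contributes j = k - i. For example (illustrative numbers), with
 * first = {1, 4, 7}, second = {2, 3, 9}, and k = 3, the first three
 * merged values are {1, 2, 3}, so i = 1 and j = 2. The function below
 * finds i by binary search over the candidate co-rank values. */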
long determine_corank(long dest_arr_starting_index,
                      double* first_input_arr,
                      long first_input_arr_len,
                      double* second_input_arr,
                      long second_input_arr_len) {
  long low, high;
  if (dest_arr_starting_index > second_input_arr_len) {
    low = dest_arr_starting_index - second_input_arr_len;
  } else {
    low = 0;
  }
  if (dest_arr_starting_index < first_input_arr_len) {
    high = dest_arr_starting_index;
  } else {
    high = first_input_arr_len;
  }
  while (low < high) {
    long current_index_first_arr = low + (high - low) / 2;
    long current_index_second_arr =
        dest_arr_starting_index - current_index_first_arr;
    if (current_index_first_arr > 0 &&
        current_index_second_arr < second_input_arr_len &&
        first_input_arr[current_index_first_arr - 1] >
            second_input_arr[current_index_second_arr]) {
      high = current_index_first_arr - 1;
    } else if (current_index_second_arr > 0 &&
               current_index_first_arr < first_input_arr_len &&
               first_input_arr[current_index_first_arr] <=
                   second_input_arr[current_index_second_arr - 1]) {
      low = current_index_first_arr + 1;
    } else {
      return current_index_first_arr;
    }
  }
  return low;
}
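/* Parallel merge: the merged output is cut into equally sized chunks,
 * one per thread; each thread uses determine_corank to locate the
 * slice of each input array that maps onto its output chunk, then
 * merges those slices independently of the other threads. */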
void parallel_merge_using_corank(double* first_input_array,
                                 double* second_input_array,
                                 double* dest_array,
                                 long first_input_array_len,
                                 long second_input_array_len,
                                 long num_of_threads,
                                 long overall_arr_size) {
  long min_array_portion_len =
      min(first_input_array_len, second_input_array_len);
  if (min_array_portion_len <= 0) {
    min_array_portion_len = 1;
  }
  long total_thread_count = 1 + ((num_of_threads * SCALING_CONST - 1) /
                                 (overall_arr_size / min_array_portion_len));
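  // The line above grants this merge a share of num_of_threads *
  // SCALING_CONST threads proportional to its size relative to the
  // whole array, rounded up. Illustrative numbers: with 8 threads,
  // SCALING_CONST = 2, overall_arr_size = 1024, and
  // min_array_portion_len = 128, this yields
  // 1 + (16 - 1) / (1024 / 128) = 2 threads for this merge.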
#pragma omp parallel for shared(total_thread_count) \
    num_threads(total_thread_count)
  for (long thread_id = 0; thread_id < total_thread_count; ++thread_id) {
    long dest_arr_chunk_size =
        1 + ((first_input_array_len + second_input_array_len - 1) /
             total_thread_count);
    long dest_arr_starting_index = thread_id * dest_arr_chunk_size;
    dest_arr_starting_index =
        min(first_input_array_len + second_input_array_len,
            dest_arr_starting_index);
    long dest_arr_ending_index = dest_arr_starting_index + dest_arr_chunk_size;
    dest_arr_ending_index = min(first_input_array_len + second_input_array_len,
                                dest_arr_ending_index);
    long first_arr_starting_index = determine_corank(
        dest_arr_starting_index, first_input_array, first_input_array_len,
        second_input_array, second_input_array_len);
    long first_arr_ending_index = determine_corank(
        dest_arr_ending_index, first_input_array, first_input_array_len,
        second_input_array, second_input_array_len);
    long second_arr_starting_index =
        dest_arr_starting_index - first_arr_starting_index;
    long second_arr_ending_index =
        dest_arr_ending_index - first_arr_ending_index;
    merge(&first_input_array[first_arr_starting_index],
          &second_input_array[second_arr_starting_index],
          &dest_array[dest_arr_starting_index],
          first_arr_ending_index - first_arr_starting_index,
          second_arr_ending_index - second_arr_starting_index);
  }
}
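/* Note: the inner "parallel for" above only fans out into its own
 * thread team when nested parallelism is permitted; main() arranges
 * this via omp_set_max_active_levels(MAX_PARALLEL_NESTING). */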
/* Iterative parallel_merge_sort function to sort (*arr)[0..n-1] */
void parallel_merge_sort(double** arr, double** arr_aux, long n, long num_threads) {
  long curr_size;   // Current size of the subarrays being merged;
                    // varies from 1 to n/2
  long left_start;  // Starting index of the left subarray to be merged
  double* arr_deref = *arr;
  double* arr_aux_deref = *arr_aux;
  // Seed the auxiliary buffer with the input data; each bottom-up pass
  // below reads from *arr_aux and writes into *arr. (Copying in the
  // other direction would clobber the caller's data with the
  // uninitialized contents of *arr_aux.)
#pragma omp parallel for shared(arr, arr_aux, n)
  for (long current_index = 0; current_index < n; ++current_index) {
    arr_aux_deref[current_index] = arr_deref[current_index];
  }
  long serial_merge_threshold = (n >> ((int)ceil(log(num_threads) / log(2))));
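  // Merge pairs no larger than roughly n / num_threads elements per
  // side run serially; only bigger ones pay the overhead of a nested
  // parallel merge. Illustrative numbers: n = 1048576 and
  // num_threads = 8 give a threshold of 1048576 >> 3 = 131072.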
  // Merge subarrays in bottom-up manner. First merge subarrays of
  // size 1 to create sorted subarrays of size 2, then merge subarrays
  // of size 2 to create sorted subarrays of size 4, and so on.
  for (curr_size = 1; curr_size <= n - 1; curr_size = 2 * curr_size) {
    // Pick the starting point of each subarray pair of the current size
#pragma omp parallel for shared(arr, arr_aux, curr_size, n) private(left_start)
    for (left_start = 0; left_start < n - 1; left_start += 2 * curr_size) {
      // Find the ending point of the left subarray; mid + 1 is the
      // starting point of the right subarray
      long mid = min(left_start + curr_size - 1, n - 1);
      long right_end = min(left_start + 2 * curr_size - 1, n - 1);
      long arr_left_len = mid - left_start + 1;
      long arr_right_len = right_end - mid;
      if (arr_left_len <= serial_merge_threshold &&
          arr_right_len <= serial_merge_threshold) {
        merge(&arr_aux_deref[left_start], &arr_aux_deref[mid + 1],
              &arr_deref[left_start], arr_left_len, arr_right_len);
      } else {
        parallel_merge_using_corank(&arr_aux_deref[left_start],
                                    &arr_aux_deref[mid + 1],
                                    &arr_deref[left_start], arr_left_len,
                                    arr_right_len, num_threads, n);
      }
    }
    // Swap the buffer pointers instead of copying: the array just
    // written becomes the source of the next pass
    double* tmp_array_ptr = *arr_aux;
    *arr_aux = *arr;
    *arr = tmp_array_ptr;
    arr_deref = *arr;
    arr_aux_deref = *arr_aux;
  }
  // Every pass ends with a swap, which leaves the most recently merged
  // data in *arr_aux; swap one final time so the caller finds the
  // fully sorted array in *arr
  double* tmp_array_ptr = *arr_aux;
  *arr_aux = *arr;
  *arr = tmp_array_ptr;
}
/* Driver program to test above functions */
int main() {
  omp_set_max_active_levels(MAX_PARALLEL_NESTING);
  srand(100);
  long N;
  printf("Enter array size: ");
  int blah = scanf("%ld", &N);
  // Hack to suppress compiler warning about ignoring scanf's result
  (void)blah;
  double* arr = malloc(N * sizeof(*arr));
  double* arr_aux = malloc(N * sizeof(*arr));
  if (arr == NULL || arr_aux == NULL) {
    fprintf(stderr, "Failed to allocate two arrays of %ld doubles!\n", N);
    return EXIT_FAILURE;
  }
  for (long i = 0; i < N; i++) {
    arr[i] = rand();
  }
  long num_threads = 0;
  // Every thread stores the same value, so the unsynchronized write is benign
#pragma omp parallel shared(num_threads)
  { num_threads = omp_get_num_threads(); }
  double start_time = omp_get_wtime();
  parallel_merge_sort(&arr, &arr_aux, N, num_threads);
  double end_time = omp_get_wtime();
  bool sorted_correctly = true;
  for (long i = 1; i < N; i++) {
    if (arr[i] < arr[i - 1]) {
      printf("Sorting failed using parallel_merge_sort!\n");
      sorted_correctly = false;
      break;
    }
  }
  if (sorted_correctly) {
    printf("Sorting succeeded using parallel_merge_sort!\n");
  }
  printf(
      "Time taken to sort array of length %ld "
      "using %ld threads: %lf seconds.\n",
      N, num_threads, end_time - start_time);
  free(arr);
  free(arr_aux);
  return 0;
}