Skip to content

Instantly share code, notes, and snippets.

@ashwanirathee
Last active March 11, 2025 07:16
Show Gist options
  • Save ashwanirathee/b2bd7b9ad81179b48863c4074ff0258a to your computer and use it in GitHub Desktop.
Save ashwanirathee/b2bd7b9ad81179b48863c4074ff0258a to your computer and use it in GitHub Desktop.
Sort2
import matplotlib.pyplot as plt
sizes1 = [10**1, 10**2, 10**3, 10**4, 10**5, 10**6, 10**7]
cpu_times = [.002759, .003312, .002829, .005624, .022991, .233091, 2.803243]
gpu_times = [.721383, .142897, .107374, .105827, .210548, .309812, 1.781547]
recursive_cpu_times = [.002717, .002820, .002900, .004930, .025398, .259715, 2.869863]
recursive_gpu_times = [.120757, .119873, .146225, .184894, 1.020488, 8.396131, 85.096905]
iterative_cpu_times = [.002658, .002787, .002795, .010535, .025303, .266396, 2.875699]
iterative_gpu_times = [.102239, .104223, .151719, .106983, .132566, .481036, 3.875283]
# Create the plot
plt.figure(figsize=(8, 5))
plt.plot(sizes1, cpu_times, marker='o', linestyle='-', label="std::sort")
plt.plot(sizes1, gpu_times, marker='s', linestyle='-', label="thrust::sort")
plt.plot(sizes1, iterative_cpu_times, marker='o', linestyle='-', label="CPU Iterative")
plt.plot(sizes1, iterative_gpu_times, marker='s', linestyle='-', label="GPU Iterative")
plt.plot(sizes1, recursive_cpu_times, marker='o', linestyle='-', label="CPU Recursive")
plt.plot(sizes1, recursive_gpu_times, marker='s', linestyle='-', label="GPU Recursive")
# Log scale for better visualization
plt.xscale("log")
plt.yscale("log")
# Labels and title
plt.xlabel("Array Size")
plt.ylabel("Time (seconds)")
# plt.title("CPU vs GPU Merge Sort Execution Time")
plt.title("Merge Sort Execution Time Comparison")
plt.legend()
plt.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.savefig("merge_sort_comparison.png")
# Show the plot
plt.show()
#!/bin/bash
# Compile CUDA and C++ programs
/usr/local/cuda/bin/nvcc -rdc=true -o default_gpu default_gpu.cu
# g++ -o iterative_merge_sort_cpu merge_sort.cpp
g++ -o default_cpu default_cpu.cpp
# Function to convert nanoseconds to seconds with precision
convert_to_seconds() {
echo "scale=6; $1 / 1000000000" | bc
}
# Initialize arrays to store results
cpu_results=()
gpu_results=()
# Loop for different power values for both CPU and GPU merge sort
for power in 1 2 3 4 5 6 7; do
echo "Power:" $power
# Measure CPU and GPU merge sort execution times
cpu_start=$(date +%s%N)
./default_cpu $power
cpu_end=$(date +%s%N)
cpu_results+=($(convert_to_seconds $((cpu_end - cpu_start))))
gpu_start=$(date +%s%N)
./default_gpu $power
gpu_end=$(date +%s%N)
gpu_results+=($(convert_to_seconds $((gpu_end - gpu_start))))
done
# Print results together
echo "CPU Merge Sort times (seconds): [${cpu_results[@]}]"
echo "GPU Merge Sort times (seconds): [${gpu_results[@]}]"
rm ./default_cpu
rm ./default_gpu
#include <iostream>
#include <vector>
#include <algorithm>
#include <random>
#include <chrono>
#include "utils.h"
using namespace std;
using namespace std::chrono;
void mergeSort(uint8_t* arr, long long size) {
std::sort(arr, arr + size);
}
int main(int argc, char* argv[]) {
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <array_size>" << endl;
return 1;
}
int p = atoi(argv[1]);
long long n = pow(10, p);
uint8_t* arr = generateRandomArray(n);
uint8_t* arrCopy = new uint8_t[n];
std::copy(arr, arr + n, arrCopy);
auto start = std::chrono::high_resolution_clock::now();
mergeSort(arr, n);
auto end = std::chrono::high_resolution_clock::now();
auto duration = duration_cast<microseconds>(end - start);
std::sort(arrCopy, arrCopy + n);
bool is_correct = true;
for (long long i = 0; i < n; i++) {
if (arr[i] != arrCopy[i]) {
is_correct = false;
cout << "Mismatch at index " << i << ": CPU sorted " << arr[i] << " != CPU sorted " << arrCopy[i] << endl;
break;
}
}
if (!is_correct) {
cout << "CPU Merge Sort has errors!" << endl;
}
delete[] arr;
delete[] arrCopy;
cout << "CPU Iterative Sort took: " << duration.count() << " micro s" << endl;
return 0;
}
#include <iostream>
#include <vector>
#include <algorithm>
#include <random>
#include <chrono>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include "utils.h"
using namespace std;
using namespace std::chrono;
void mergeSort(uint8_t* d_arr, long long size) {
thrust::device_vector<uint8_t> d_vec(d_arr, d_arr + size);
thrust::sort(d_vec.begin(), d_vec.end());
thrust::copy(d_vec.begin(), d_vec.end(), d_arr);
}
int main(int argc, char* argv[]) {
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <array_size>" << endl;
return 1;
}
int p = atoi(argv[1]);
long long n = pow(10, p);
uint8_t* arr = generateRandomArray(n);
uint8_t* arrCopy = new uint8_t[n];
std::copy(arr, arr + n, arrCopy);
uint8_t* d_arr;
cudaMalloc(&d_arr, n * sizeof(uint8_t));
cudaMemcpy(d_arr, arr, n * sizeof(uint8_t), cudaMemcpyHostToDevice);
auto start = std::chrono::high_resolution_clock::now();
mergeSort(d_arr, n);
auto end = std::chrono::high_resolution_clock::now();
auto duration = duration_cast<microseconds>(end - start);
cudaMemcpy(arr, d_arr, n * sizeof(uint8_t), cudaMemcpyDeviceToHost);
std::sort(arrCopy, arrCopy + n);
bool is_correct = true;
for (long long i = 0; i < n; i++) {
if (arr[i] != arrCopy[i]) {
is_correct = false;
cout << "Mismatch at index " << i << ": CUDA sorted " << arr[i] << " != CPU sorted " << arrCopy[i] << endl;
break;
}
}
if (!is_correct) {
cout << "CUDA Merge Sort has errors!" << endl;
}
delete[] arr;
delete[] arrCopy;
cudaFree(d_arr);
cout << "CUDA Iterative Merge Sort took: " << duration.count() << " micro s" << endl;
return 0;
}
#include <iostream>
#include <chrono>
#include "utils.h"
#include <algorithm>
#include <cstring>
using namespace std;
using namespace std::chrono;
void mergeKernel(uint8_t* arr, uint8_t* temp, long long left, long long mid, long long right) {
long long i = left, j = mid + 1, k = left;
while (i <= mid && j <= right)
temp[k++] = (arr[i] <= arr[j]) ? arr[i++] : arr[j++];
while (i <= mid) temp[k++] = arr[i++];
while (j <= right) temp[k++] = arr[j++];
for (long long l = left; l <= right; l++) {
arr[l] = temp[l];
}
}
void mergeSort(uint8_t* arr, uint8_t* temp, long long n) {
long long left, mid, right, size;
for (size = 1; size < n; size *= 2) {
for (left = 0; left < n - size; left += 2 * size) {
mid = left + size - 1;
right = std::min(left + 2 * size - 1, n - 1);
mergeKernel(arr, temp, left, mid, right);
}
}
}
int main(int argc, char* argv[]) {
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <array_size>" << endl;
return 1;
}
int p = atoi(argv[1]);
long long n = static_cast<int>(pow(10, p));
uint8_t* arr = generateRandomArray(n);
uint8_t* temp = new uint8_t[n]; // Allocate temp array
uint8_t* arrCopy = new uint8_t[n]; // Copy for CPU sorting
std::copy(arr, arr + n, arrCopy);
// Start timing
auto start = high_resolution_clock::now();
mergeSort(arr, temp, n);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
std::sort(arrCopy, arrCopy + n);
bool is_correct = true;
for (long long i = 0; i < n; i++) {
if (arr[i] != arrCopy[i]) {
is_correct = false;
cout << "Mismatch at index " << i << ": CPU sorted " << arr[i] << " != CPU sorted " << arrCopy[i] << endl;
break;
}
}
if (!is_correct) {
cout << "CPU Merge Sort has errors!" << endl;
}
delete[] arr;
delete[] temp;
delete[] arrCopy;
cout << "CPU Iterative Sort took: " << duration.count() << " micro s" << endl;
return 0;
}
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include "utils.h"
#include <algorithm>
using namespace std;
using namespace std::chrono;
#define THREADS_PER_BLOCK 256
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error: " << cudaGetErrorString(err) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
__global__ void mergeKernel(uint8_t* arr, uint8_t* temp, long long curr_size, long long n) {
long long index = blockIdx.x * blockDim.x + threadIdx.x;
long long left = 2 * curr_size * index;
if (left >= n) return; // left < 0 handles the case where the number overfloweed into negative
long long mid = min(left + curr_size - 1, n - 1);
long long right = min(left + 2 * curr_size - 1, n - 1);
long long i = left, j = mid + 1, k = left;
while (i <= mid && j <= right)
temp[k++] = (arr[i] <= arr[j]) ? arr[i++] : arr[j++];
while (i <= mid) temp[k++] = arr[i++];
while (j <= right) temp[k++] = arr[j++];
}
void mergeSort(uint8_t* arr, uint8_t* temp, long long n) {
bool flipflop = true;
long long numThreads, gridSize;
long long size; // size means the merge arrays sizes
for (size = 1; size < n; size *= 2) {
numThreads = max(n / (2 * size), (long long)1);
gridSize = (numThreads + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
mergeKernel<<<gridSize, THREADS_PER_BLOCK>>>(flipflop ? arr : temp, flipflop ? temp : arr, size, n);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
flipflop = !flipflop;
}
if (!flipflop) CUDA_CHECK(cudaMemcpy(arr, temp, n * sizeof(uint8_t), cudaMemcpyDeviceToDevice));
}
int main(int argc, char* argv[]) {
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <array_size>" << endl;
return 1;
}
int p = atoi(argv[1]);
long long n = pow(10, p);
uint8_t* h_arr = generateRandomArray(n);
uint8_t* h_arr_copy = new uint8_t[n]; // Copy for CPU sorting
memcpy(h_arr_copy, h_arr, n * sizeof(uint8_t));
uint8_t *d_arr, *d_temp;
cudaMalloc(&d_arr, n * sizeof(uint8_t));
cudaMalloc(&d_temp, n * sizeof(uint8_t));
// Copy input data to GPU
cudaMemcpy(d_arr, h_arr, n * sizeof(uint8_t), cudaMemcpyHostToDevice);
// Start timing
auto start = high_resolution_clock::now();
mergeSort(d_arr, d_temp, n);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
cudaMemcpy(h_arr, d_arr, n * sizeof(uint8_t), cudaMemcpyDeviceToHost);
std::sort(h_arr_copy, h_arr_copy + n);
bool is_correct = true;
for (int i = 0; i < n; i++) {
if (h_arr[i] != h_arr_copy[i]) {
is_correct = false;
cout << "Mismatch at index " << i << ": CUDA sorted " << h_arr[i] << " != CPU sorted " << h_arr_copy[i] << endl;
break;
}
}
if (!is_correct) {
cout << "CUDA Merge Sort has errors!" << endl;
}
cudaFree(d_arr);
cudaFree(d_temp);
delete[] h_arr;
cout << "CUDA Iterative Merge Sort took: " << duration.count() << " micro s" << endl;
return 0;
}
#include <iostream>
#include <vector>
#include <chrono>
#include "utils.h"
#include <algorithm>
#include <chrono>
using namespace std;
using namespace std::chrono;
void merge(uint8_t* arr, uint8_t* temp, long long left, long long mid, long long right) {
long long i = left, j = mid + 1, k = left;
// Merge directly into temp array
while (i <= mid && j <= right) {
temp[k++] = (arr[i] <= arr[j]) ? arr[i++] : arr[j++];
}
while (i <= mid) temp[k++] = arr[i++];
while (j <= right) temp[k++] = arr[j++];
for (long long i = left; i <= right; i++) {
arr[i] = temp[i];
}
}
void mergeSort(uint8_t* arr, uint8_t* temp, long long left, long long right) {
long long mid;
if (left < right) {
mid = left + (right - left) / 2;
mergeSort(arr, temp, left, mid);
mergeSort(arr, temp, mid + 1, right);
merge(arr, temp, left, mid, right);
}
}
int main(int argc, char* argv[]) {
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <array_size>" << endl;
return 1;
}
int p = atoi(argv[1]);
long long n = pow(10, p);
uint8_t* arr = generateRandomArray(n);
uint8_t* temp = new uint8_t[n]; // Allocate temp array
uint8_t* arrCopy = new uint8_t[n];
std::copy(arr, arr + n, arrCopy);
auto start = std::chrono::high_resolution_clock::now();
mergeSort(arr, temp, 0, n - 1);
auto end = std::chrono::high_resolution_clock::now();
auto duration = duration_cast<microseconds>(end - start);
std::sort(arrCopy, arrCopy + n);
bool is_correct = true;
for (long long i = 0; i < n; i++) {
if (arr[i] != arrCopy[i]) {
is_correct = false;
cout << "Mismatch at index " << i << ": CPU sorted " << arr[i] << " != CPU sorted " << arrCopy[i] << endl;
break;
}
}
if (!is_correct) {
cout << "CPU Merge Sort has errors!" << endl;
}
delete[] arr;
delete[] temp;
delete[] arrCopy;
cout << "CPU Iterative Sort took: " << duration.count() << " micro s" << endl;
return 0;
}
#include <iostream>
#include <cuda_runtime.h>
#include "utils.h"
#include <chrono>
#include <algorithm>
using namespace std;
using namespace std::chrono;
__global__ void merge(uint8_t* arr, uint8_t* temp, long long left, long long mid, long long right) {
long long i = left, j = mid + 1, k = left;
// Merge directly into temp array
while (i <= mid && j <= right) {
temp[k++] = (arr[i] <= arr[j]) ? arr[i++] : arr[j++];
}
while (i <= mid) temp[k++] = arr[i++];
while (j <= right) temp[k++] = arr[j++];
for (long long i = left; i <= right; i++) {
arr[i] = temp[i];
}
}
void mergeSort(uint8_t* arr, uint8_t* temp, long long left, long long right) {
long long mid;
if (left < right) {
mid = left + (right - left) / 2;
mergeSort(arr, temp, left, mid);
mergeSort(arr, temp, mid + 1, right);
merge<<<1, 1>>>(arr, temp, left, mid, right); // Call merge kernel
cudaDeviceSynchronize(); // Wait for kernel execution
}
}
int main(int argc, char* argv[]) {
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <array_size>" << endl;
return 1;
}
int p = atoi(argv[1]);
long long n = pow(10, p);
uint8_t* arr = generateRandomArray(n);
uint8_t* arrCopy = new uint8_t[n];
std::copy(arr, arr + n, arrCopy);
uint8_t *cu_arr, *cu_temp;
cudaMalloc(&cu_arr, n * sizeof(uint8_t)); // Allocate memory on GPU
cudaMalloc(&cu_temp, n * sizeof(uint8_t));
cudaMemcpy(cu_arr, arr, n * sizeof(uint8_t), cudaMemcpyHostToDevice); // Copy data to GPU
auto start = std::chrono::high_resolution_clock::now();
mergeSort(cu_arr, cu_temp, 0, n - 1);;
auto end = std::chrono::high_resolution_clock::now();
auto duration = duration_cast<microseconds>(end - start);
cudaMemcpy(arr, cu_arr, n * sizeof(uint8_t), cudaMemcpyDeviceToHost);
std::sort(arrCopy, arrCopy + n);
bool is_correct = true;
for (long long i = 0; i < n; i++) {
if (arr[i] != arrCopy[i]) {
is_correct = false;
cout << "Mismatch at index " << i << ": CUDA sorted " << arr[i] << " != CPU sorted " << arrCopy[i] << endl;
break;
}
}
if (!is_correct) {
cout << "CUDA Merge Sort has errors!" << endl;
}
// free memory
cudaFree(cu_arr);
cudaFree(cu_temp);
delete[] arr;
delete[] arrCopy;
cout << "CUDA Iterative Merge Sort took: " << duration.count() << " micro s" << endl;
return 0;
}
#!/bin/bash
# Compile CUDA and C++ programs
/usr/local/cuda/bin/nvcc -o iterative_merge_sort_gpu iterative_merge_sort.cu
g++ -o iterative_merge_sort_cpu iterative_merge_sort.cpp
# Function to convert nanoseconds to seconds with precision
convert_to_seconds() {
echo "scale=6; $1 / 1000000000" | bc
}
# Initialize arrays to store results
cpu_results=()
gpu_results=()
# Loop for different power values for both CPU and GPU merge sort
for power in 1 2 3 4 5 6 7; do
echo "Power:" $power
# Measure CPU and GPU merge sort execution times
cpu_start=$(date +%s%N)
./iterative_merge_sort_cpu $power
cpu_end=$(date +%s%N)
cpu_results+=($(convert_to_seconds $((cpu_end - cpu_start))))
gpu_start=$(date +%s%N)
./iterative_merge_sort_gpu $power
gpu_end=$(date +%s%N)
gpu_results+=($(convert_to_seconds $((gpu_end - gpu_start))))
done
# Print results together
echo "CPU Merge Sort times (seconds): [${cpu_results[@]}]"
echo "GPU Merge Sort times (seconds): [${gpu_results[@]}]"
rm ./iterative_merge_sort_cpu
rm ./iterative_merge_sort_gpu
#!/bin/bash
# Compile CUDA and C++ programs
/usr/local/cuda/bin/nvcc -rdc=true -o merge_sort_gpu merge_sort.cu
# g++ -o iterative_merge_sort_cpu merge_sort.cpp
g++ -o merge_sort_cpu merge_sort.cpp
# Function to convert nanoseconds to seconds with precision
convert_to_seconds() {
echo "scale=6; $1 / 1000000000" | bc
}
# Initialize arrays to store results
cpu_results=()
gpu_results=()
# Loop for different power values for both CPU and GPU merge sort
for power in 1 2 3 4 5 6 7; do
echo "Power:" $power
# Measure CPU and GPU merge sort execution times
cpu_start=$(date +%s%N)
./merge_sort_cpu $power
cpu_end=$(date +%s%N)
cpu_results+=($(convert_to_seconds $((cpu_end - cpu_start))))
gpu_start=$(date +%s%N)
./merge_sort_gpu $power
gpu_end=$(date +%s%N)
gpu_results+=($(convert_to_seconds $((gpu_end - gpu_start))))
done
# Print results together
echo "CPU Merge Sort times (seconds): [${cpu_results[@]}]"
echo "GPU Merge Sort times (seconds): [${gpu_results[@]}]"
rm ./merge_sort_cpu
rm ./merge_sort_gpu
#include <vector>
#include <random>
using namespace std;
void printVector(const vector<int>& arr) {
for (int num : arr) {
cout << num << " ";
}
cout << endl;
}
#include <iostream>
#include <random>
using namespace std;
int* generateRandomArray(long int size, int minValue = 1, int maxValue = 100) {
int* arr = new int[size]; // Dynamically allocate memory for the array
random_device rd; // Obtain a random seed
mt19937 gen(rd()); // Mersenne Twister PRNG with a fixed seed (1)
uniform_int_distribution<int> dist(minValue, maxValue); // Uniform distribution
for (int i = 0; i < size; i++) {
arr[i] = dist(gen);
}
return arr; // Caller must delete[] this array after use
}
#include <cstdint> // For uint8_t
uint8_t* generateRandomArray(long long size, uint8_t minValue = 0, uint8_t maxValue = 255) {
uint8_t* arr = new uint8_t[size]; // Dynamically allocate memory for the array
std::random_device rd;
std::mt19937 gen(1);
std::uniform_int_distribution<uint8_t> dist(minValue, maxValue);
for (long long i = 0; i < size; i++) {
arr[i] = dist(gen);
}
return arr; // Caller must delete[] this array after use
}
bool sort_test(int* arr, int n){
for(int i=1;i<n;i++){
if(arr[i] < arr[i-1]){
cout << "Merge Sort is problematic:" << arr[i-1] << "_" << arr[i] << endl;
return false;
}
}
return true;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment