This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
class Vector: | |
def __init__(self, w, h, d, n=None): | |
self.w = w | |
self.h = h | |
self.d = d | |
self.n = n | |
self.half_w = w // 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) { | |
const uint32_t threads_per_block = blockDim.x; | |
const uint32_t total_blocks = gridDim.x; | |
const uint32_t block_id = blockIdx.x; | |
const uint32_t thread_id = threadIdx.x; | |
const uint32_t total_threads = total_blocks * threads_per_block; | |
const uint32_t idx = block_id * threads_per_block + thread_id; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <chrono> | |
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) { | |
const uint32_t threads_per_block = blockDim.x; | |
const uint32_t total_blocks = gridDim.x; | |
const uint32_t block_id = blockIdx.x; | |
const uint32_t thread_id = threadIdx.x; | |
const uint32_t total_threads = total_blocks * threads_per_block; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <torch/extension.h> | |
#include <cstdint> | |
__global__ void kernel_vector_add(uint32_t N, float* a, float* b, float* c) { | |
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; | |
uint32_t max_threads = gridDim.x * blockDim.x; | |
for (uint32_t i = idx; i < N; i += max_threads) { | |
c[i] = a[i] + b[i]; | |
} |