Created
July 8, 2016 07:43
-
-
Save JonathanRaiman/c6a55691c3dba8969abc636f621c8792 to your computer and use it in GitHub Desktop.
CUDA / CPU one file Array library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#define XINLINE __device__ __host__ | |
#define MAX_DIM 10 | |
#define INDENT_INCREMENT 2 | |
namespace functor { | |
struct Fill { | |
double fill_; | |
XINLINE Fill(double fill) : fill_(fill) {} | |
template<typename T> | |
T XINLINE operator()(const T& val) { | |
return fill_; | |
} | |
}; | |
struct Increment { | |
double inc_; | |
XINLINE Increment(double inc) : inc_(inc) {} | |
template<typename T> | |
T XINLINE operator()(const T& val) { | |
return val + inc_; | |
} | |
}; | |
struct Decrement { | |
double dec_; | |
XINLINE Decrement(double dec) : dec_(dec) {} | |
template<typename T> | |
T XINLINE operator()(const T& val) { | |
return val + dec_; | |
} | |
}; | |
} | |
namespace saver { | |
struct Increment { | |
template<typename T> | |
static void XINLINE save(T& left, const T& right) { | |
left += right; | |
} | |
}; | |
struct Decrement { | |
template<typename T> | |
static void XINLINE save(T& left, const T& right) { | |
left -= right; | |
} | |
}; | |
struct Assign { | |
template<typename T> | |
static void XINLINE save(T& left, const T& right) { | |
left = right; | |
} | |
}; | |
} | |
// Integer ceiling division for positive operands, e.g. div_ceil(10, 3) == 4.
int div_ceil(int numer, int denom) {
    return (numer + denom - 1) / denom;
}
// Recursion terminator: no arguments left to write.
void make_message(std::stringstream* out) {}
// Stream the first argument into the buffer, then recurse on the rest.
template<typename Head, typename... Tail>
void make_message(std::stringstream* out, const Head& head, const Tail&... tail) {
    (*out) << head;
    make_message(out, tail...);
}
// Build a string by streaming every argument in order (mini formatter used
// for error messages throughout this file).
template<typename... Args>
std::string make_message(const Args&... args) {
    std::stringstream ss;
    make_message(&ss, args...);
    return ss.str();
}
struct Dimension { | |
int sizes_[MAX_DIM]; | |
int ndim_; | |
Dimension(const std::vector<int>& sizes) : ndim_(sizes.size()) { | |
for (int i = 0; i < sizes.size();i++) { | |
sizes_[i] = sizes[i]; | |
} | |
} | |
XINLINE Dimension(std::initializer_list<int> sizes) : ndim_(sizes.size()) { | |
int i = 0; | |
for (auto iter = sizes.begin(); iter != sizes.end(); iter++) { | |
sizes_[i] = *iter; | |
i++; | |
} | |
} | |
XINLINE ~Dimension() { | |
} | |
int XINLINE ndim() const { | |
return ndim_; | |
} | |
int XINLINE operator[](int dim) const { | |
return sizes_[dim]; | |
} | |
void XINLINE set_dim(int dim, int value) { | |
sizes_[dim] = value; | |
} | |
Dimension(const Dimension& other) : ndim_(other.ndim_) { | |
for (int i = 0; i < ndim_;i++) { | |
sizes_[i] = other.sizes_[i]; | |
} | |
} | |
Dimension& XINLINE operator=(const Dimension& other) { | |
ndim_ = other.ndim(); | |
for (int i = 0; i < other.ndim(); i++) { | |
sizes_[i] = other[i]; | |
} | |
return *this; | |
} | |
int XINLINE numel() const { | |
int volume = 1; | |
for (int i = 0; i < ndim_; i++) { | |
volume *= sizes_[i]; | |
} | |
return volume; | |
} | |
Dimension XINLINE subsize() const { | |
Dimension subdim({}); | |
subdim.ndim_ = ndim_ - 1; | |
for (int i = 1; i < ndim_; i++) { | |
subdim.set_dim(i - 1, sizes_[i]); | |
} | |
return subdim; | |
} | |
static int XINLINE index2offset(const Dimension& sizes, const Dimension& indices) { | |
int offset = 0; | |
int volume = 1; | |
for (int i = indices.ndim() - 1; i >= 0; i--) { | |
offset += indices[i] * volume; | |
volume *= sizes[i]; | |
} | |
return offset; | |
} | |
}; | |
std::ostream& operator<<(std::ostream& stream, const Dimension& dims) { | |
stream << "("; | |
for (int i = 0; i < dims.ndim();i++) { | |
stream << dims[i]; | |
if (i != dims.ndim() - 1) { | |
stream << ", "; | |
} else { | |
stream << ")"; | |
} | |
} | |
return stream; | |
} | |
// Where an array's memory lives.
enum DEVICE_T {
    DEVICE_T_CPU,
    DEVICE_T_GPU
};
// Stream a human-readable device name ("cpu" or "gpu").
std::ostream& operator<<(std::ostream& stream, const DEVICE_T& device) {
    if (device == DEVICE_T_CPU) {
        stream << "cpu";
    } else {
        stream << "gpu";
    }
    return stream;
}
// Base holding the state every array variant shares: a shape, a raw element
// pointer, a flat element offset into that pointer, and the device the memory
// lives on. Does not own ptr_; ownership is the derived type's concern.
template<typename Container, typename T>
struct ArrayLike {
    Dimension sizes_;
    T* ptr_;        // base pointer to the underlying buffer
    int offset_;    // element (not byte) offset added to every access
    const DEVICE_T device_;
    // Total number of elements.
    int XINLINE numel() const {
        return sizes_.numel();
    }
    int XINLINE offset() const {
        return offset_;
    }
    const Dimension& XINLINE size() const {
        return sizes_;
    }
    // Flat (linearized) element access; no bounds checking.
    T& XINLINE operator[](const int& index) {
        return *(ptr_ + offset_ + index);
    }
    const T& XINLINE operator[](const int& index) const {
        return *(ptr_ + offset_ + index);
    }
    // Multi-dimensional access via row-major index flattening.
    T& XINLINE operator[](const Dimension& indices) {
        int idx_offset = Dimension::index2offset(this->sizes_, indices);
        return *(this->ptr_ + this->offset_ + idx_offset);
    }
    const T& XINLINE operator[](const Dimension& indices) const {
        int idx_offset = Dimension::index2offset(this->sizes_, indices);
        return *(this->ptr_ + this->offset_ + idx_offset);
    }
    int XINLINE ndim() const {
        return sizes_.ndim();
    }
    // NOTE(review): members initialize in declaration order (sizes_, ptr_,
    // offset_, device_), not init-list order — harmless here since the
    // initializers are independent of one another.
    ArrayLike(const Dimension& sizes, const DEVICE_T& dev, T* ptr, const int& offset)
        : sizes_(sizes),
          device_(dev),
          ptr_(ptr),
          offset_(offset) {}
};
// Non-owning view of an array's shape/pointer/offset/device. This is the type
// passed by value into the __global__ kernels below (unlike Array, which
// maintains a reference count and frees memory on destruction).
template<typename T>
struct ArrayReference : public ArrayLike<ArrayReference<T>, T> {
    using ArrayLike<ArrayReference<T>, T>::ArrayLike;
};
// Elementwise map on the gpu: dest[i] = func(arr[i]) for every flat index i.
// Uses a grid-stride loop (i advances by the total thread count), so any grid
// size covers the whole array.
template<typename Functor, typename T>
void __global__
unary_kernel(Functor func,
             const ArrayReference<T> arr,
             ArrayReference<T> dest) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    int size = arr.numel();
    for (int i = idx; i < size; i += stride) {
        dest[i] = func(arr[i]);
    }
}
// Elementwise combine on the gpu: dest[i] = func(arr[i], arr2[i]).
// Grid-stride loop over flat indices. The bound comes from arr only; the
// launcher (launch_binary_kernel) checks arr and arr2 have equal numel.
template<typename Functor, typename T>
void __global__
binary_kernel(Functor func,
              const ArrayReference<T> arr,
              const ArrayReference<T> arr2,
              ArrayReference<T> dest) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    int size = arr.numel();
    for (int i = idx; i < size; i += stride) {
        dest[i] = func(arr[i], arr2[i]);
    }
}
template<typename Saver, typename SrcT, typename IndexT> | |
void scatter_saver_cpu(ArrayReference<SrcT> source, | |
const ArrayReference<IndexT> indices, | |
const ArrayReference<SrcT> updates) { | |
int source_cols = source.size()[1]; | |
int size = indices.numel(); | |
for (int j = 0; j < size; ++j) { | |
auto scatter_index = indices[j]; | |
for (int col_idx = 0; col_idx < source_cols; ++col_idx) { | |
Saver::save(source[{scatter_index, col_idx}], updates[{j, col_idx}]); | |
} | |
} | |
} | |
// Gpu scatter-reduction. Threads stride over destination *columns*; each
// thread applies every update row serially for its columns, so no two threads
// ever write the same (row, column) slot — accumulation needs no atomics.
template<typename Saver, typename SrcT, typename IndexT>
__global__
void scatter_saver_reduction_kernel(
        ArrayReference<SrcT> source,
        const ArrayReference<IndexT> indices,
        const ArrayReference<SrcT> updates) {
    int source_cols = source.size()[1];
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    int size = indices.numel();
    for (int col_idx = idx; col_idx < source_cols; col_idx += stride) {
        for (int j = 0; j < size; j++) {
            auto scatter_index = indices[j];
            Saver::save(source[{scatter_index, col_idx}], updates[{j, col_idx}]);
        }
    }
}
// __global__ scatter_saver_atomic(real_t* source, real_t* indices, real_t* updates) { | |
// } | |
// Launch unary_kernel so that dest[i] = func(arr[i]) for every element.
// Grid is sized by element count and capped at max_blocks; the kernel's
// grid-stride loop covers any remainder.
template<typename Functor, typename Container, typename DestContainer>
void launch_unary_kernel(const Functor& func,
                         const Container& arr,
                         DestContainer dest) {
    const int NT = 128;            // threads per block
    const int max_blocks = 40960;  // cap on grid size
    int grid_size = div_ceil(arr.numel(), NT);
    grid_size = std::min(grid_size, max_blocks);
    auto view = arr.const_view();
    auto dest_view = dest.view();
    // default stream, no dynamic shared memory
    unary_kernel<<<grid_size, NT, 0, NULL>>>(
        func, view, dest_view
    );
}
// Launch binary_kernel so that dest[i] = func(arr1[i], arr2[i]). Validates
// that both inputs have the same element count before launching. Grid is
// sized by element count and capped at max_blocks; the kernel's grid-stride
// loop covers any remainder.
template<typename Functor, typename Container1, typename Container2, typename DestContainer>
void launch_binary_kernel(const Functor& func,
                          const Container1& arr1,
                          const Container2& arr2,
                          DestContainer dest) {
    const int NT = 128;            // threads per block
    const int max_blocks = 40960;  // cap on grid size
    if (arr1.numel() != arr2.numel()) {
        throw std::runtime_error(
            make_message(
                "error: inputs to binary elementwise operation"
                " must have the same number of elements (got "
                "left.numel() = ", arr1.numel(),
                ", right.numel() = ", arr2.numel(), ")."
            )
        );
    }
    int grid_size = div_ceil(arr1.numel(), NT);
    grid_size = std::min(grid_size, max_blocks);
    auto view1 = arr1.const_view();
    auto view2 = arr2.const_view();
    auto dest_view = dest.view();
    // default stream, no dynamic shared memory
    binary_kernel<<<grid_size, NT, 0, NULL>>>(
        func, view1, view2, dest_view
    );
}
struct Device { | |
static int gpu_device; | |
static void set_gpu_device(int device) { | |
if (gpu_device != device) { | |
int count; | |
cudaGetDeviceCount(&count); | |
if (count == 0) { | |
throw std::runtime_error( | |
"no gpu devices found" | |
); | |
} else { | |
std::cout << count << " gpu device" << (count == 1 ? " " : "s ") << "found" << std::endl; | |
} | |
auto status = cudaSetDevice(device); | |
if (status != cudaSuccess) { | |
throw std::runtime_error( | |
make_message( | |
"could not set the gpu device to ", | |
device, ", reason = ", cudaGetErrorString(status) | |
) | |
); | |
} | |
gpu_device = device; | |
} | |
} | |
}; | |
int Device::gpu_device = -1; | |
// Forward declaration (defined after Array): the proxy type returned by
// Array<T>::operator[](const Array<int>&) for scatter updates.
template<typename T>
struct ArrayGather;
// Owning, reference-counted n-dimensional array on the cpu or the gpu.
// Copies share the same buffer; the integer behind owners_ counts how many
// Array objects point at it, and the last one to die frees the memory.
template<typename T>
struct Array : public ArrayLike<Array<T>, T> {
    // Manual refcount shared by every copy of this array.
    std::shared_ptr<int> owners_;
    // Empty 0-dim cpu array with no allocation.
    Array() : ArrayLike<Array<T>, T>({}, DEVICE_T_CPU, nullptr, 0),
              owners_(std::make_shared<int>(1)) {}
    // Allocate sizes_.numel() elements on this array's device.
    void allocate_data() {
        if (this->device_ == DEVICE_T_CPU) {
            allocate_data_cpu();
        } else {
            allocate_data_gpu();
        }
    }
    void allocate_data_cpu() {
        auto ptr = malloc(this->sizes_.numel() * sizeof(T));
        if (ptr == NULL) {
            throw std::runtime_error(
                make_message(
                    "could not allocate ",
                    this->sizes_.numel() * sizeof(T),
                    " bytes on the cpu"
                )
            );
        } else {
            this->ptr_ = (T*)ptr;
        }
    }
    // NOTE(review): always allocates on gpu 0, regardless of any previously
    // selected device — confirm this is intended.
    void allocate_data_gpu() {
        Device::set_gpu_device(0);
        auto status = cudaMalloc(&this->ptr_, this->sizes_.numel() * sizeof(T));
        if (status != cudaSuccess) {
            throw std::runtime_error(
                make_message(
                    "could not allocate ",
                    this->sizes_.numel() * sizeof(T),
                    " bytes on the gpu, reason = ",
                    cudaGetErrorString(status)
                )
            );
        }
    }
    // Freshly-allocated (uninitialized) array of the given shape and device.
    Array(const Dimension& sizes, DEVICE_T dev)
        : ArrayLike<Array<T>, T>(sizes, dev, nullptr, 0),
          owners_(std::make_shared<int>(1)) {
        allocate_data();
    }
    // View of row `idx` with the leading axis dropped, e.g. (5, 3) -> (3,).
    // Shares storage with this array.
    Array<T> subtensor(int idx) const {
        auto arr = slice(idx, idx + 1);
        arr.sizes_ = arr.sizes_.subsize();
        return arr;
    }
    // View of rows [start, end) along the leading axis; shares storage.
    Array<T> slice(int start, int end) const {
        Array<T> arr = *this;
        // elements in one leading-axis "row"
        int subvolume = 1;
        for (int i = 1; i < this->ndim(); i++) {
            subvolume *= this->sizes_[i];
        }
        arr.offset_ = this->offset_ + start * subvolume;
        arr.sizes_.set_dim(0, end - start);
        return arr;
    }
    // Flat element access, re-declared because the Array<int> overload below
    // would otherwise hide the versions inherited from ArrayLike.
    T& XINLINE operator[](const int& index) {
        return *(this->ptr_ + this->offset_ + index);
    }
    const T& XINLINE operator[](const int& index) const {
        return *(this->ptr_ + this->offset_ + index);
    }
    // Multi-dimensional access via row-major flattening.
    T& XINLINE operator[](const Dimension& indices) {
        int idx_offset = Dimension::index2offset(this->sizes_, indices);
        return *(this->ptr_ + this->offset_ + idx_offset);
    }
    const T& XINLINE operator[](const Dimension& indices) const {
        int idx_offset = Dimension::index2offset(this->sizes_, indices);
        return *(this->ptr_ + this->offset_ + idx_offset);
    }
    // Fancy indexing: returns a scatter proxy (defined after ArrayGather).
    ArrayGather<T> operator[](const Array<int>& indices);
    // Copy of this array's data on `dev`; returns *this (shared, not copied)
    // if the data is already there. NOTE(review): copies numel() elements
    // starting at offset_, which assumes the view is contiguous.
    Array<T> to_device(DEVICE_T dev) const {
        if (this->device_ == dev) {
            return *this;
        } else {
            Array<T> arr(this->sizes_, dev);
            // direction depends on where this array currently lives
            auto copy_type = this->device_ == DEVICE_T_CPU ?
                cudaMemcpyHostToDevice :
                cudaMemcpyDeviceToHost;
            auto status = cudaMemcpy(
                arr.ptr_ + arr.offset_,
                this->ptr_ + this->offset_,
                this->sizes_.numel() * sizeof(T),
                copy_type
            );
            if (status != cudaSuccess) {
                std::string reason;
                throw std::runtime_error(
                    make_message(
                        "could not copy ",
                        this->sizes_.numel() * sizeof(T),
                        " bytes from ",
                        this->device_ == DEVICE_T_CPU ?
                            "cpu to gpu" : "gpu to cpu",
                        ", reason = ",
                        cudaGetErrorString(status)
                    )
                );
            }
            return arr;
        }
    }
    // Copies share the buffer and bump the shared owner count.
    Array(const Array<T>& other)
        : ArrayLike<Array<T>, T>(other.sizes_, other.device_, other.ptr_, other.offset_),
          owners_(other.owners_) {
        (*owners_) += 1;
    }
    void free_data() {
        if (this->device_ == DEVICE_T_CPU) {
            free_data_cpu();
        } else {
            free_data_gpu();
        }
    }
    void free_data_cpu() {
        free((void*)this->ptr_);
    }
    void free_data_gpu() {
        auto status = cudaFree((void*)this->ptr_);
        if (status != cudaSuccess) {
            throw std::runtime_error(
                make_message(
                    "could not free memory on device : ",
                    status == cudaErrorInvalidDevicePointer ?
                        "cudaErrorInvalidDevicePointer" :
                        "cudaErrorInitializationError"
                )
            );
        }
    }
    // Last owner standing releases the buffer.
    ~Array() {
        (*owners_) -= 1;
        if (this->ptr_ != nullptr && (*owners_) == 0) {
            free_data();
        }
    }
    void print() const {
        return print(0, false);
    }
    // Scalar fill / in-place add / in-place subtract (defined out of line).
    Array& operator=(const double& other);
    Array& operator+=(const double& other);
    Array& operator-=(const double& other);
    // Recursive pretty-printer. `indent` is the current left margin and
    // `print_comma` marks non-final siblings along the parent axis.
    void print(int indent, bool print_comma) const {
        if (this->device_ == DEVICE_T_CPU) {
            if (this->ndim() == 1) {
                std::cout << std::string(indent, ' ') << "[";
                for (int i = 0; i < this->sizes_[0]; i++) {
                    std::cout << (*this)[i];
                    if (i != this->sizes_[0] - 1) {
                        std::cout << " ";
                    }
                }
                if (print_comma) {
                    std::cout << "],\n";
                } else {
                    std::cout << "]\n";
                }
            } else if (this->ndim() > 1) {
                std::cout << std::string(indent, ' ') << "[\n";
                for (int i = 0; i < this->sizes_[0]; i++) {
                    auto child = subtensor(i);
                    child.print(indent + INDENT_INCREMENT, i != this->sizes_[0] - 1);
                }
                if (print_comma) {
                    std::cout << std::string(indent, ' ') << "],\n";
                } else {
                    std::cout << std::string(indent, ' ') << "]\n";
                }
            } else {
                std::cout << std::string(indent, ' ') << "()\n" << std::endl;
            }
        } else {
            // gpu arrays are copied to the cpu first; subtensors of the copy
            // are cpu arrays, so only the top level takes this branch and the
            // dropped indent/print_comma arguments are harmless
            auto arr = to_device(DEVICE_T_CPU);
            arr.print();
        }
    }
    // Kernel-friendly, non-owning views of this array.
    ArrayReference<T> view() {
        return ArrayReference<T>(
            this->sizes_,
            this->device_,
            this->ptr_,
            this->offset_
        );
    }
    const ArrayReference<T> const_view() const {
        return ArrayReference<T>(
            this->sizes_,
            this->device_,
            this->ptr_,
            this->offset_
        );
    }
};
template<typename T> | |
Array<T>& Array<T>::operator=(const double& other) { | |
if (this->device_ == DEVICE_T_CPU) { | |
for (int i = 0; i < this->numel();i++) { | |
(*this)[i] = other; | |
} | |
} else { | |
launch_unary_kernel(functor::Fill(other), *this, *this); | |
} | |
return *this; | |
} | |
template<typename T> | |
Array<T>& Array<T>::operator+=(const double& other) { | |
if (this->device_ == DEVICE_T_CPU) { | |
for (int i = 0; i < this->numel();i++) { | |
(*this)[i] += other; | |
} | |
} else { | |
launch_unary_kernel(functor::Increment(other), *this, *this); | |
} | |
return *this; | |
} | |
template<typename T> | |
Array<T>& Array<T>::operator-=(const double& other) { | |
if (this->device_ == DEVICE_T_CPU) { | |
for (int i = 0; i < this->numel();i++) { | |
(*this)[i] -= other; | |
} | |
} else { | |
launch_unary_kernel(functor::Decrement(other), *this, *this); | |
} | |
return *this; | |
} | |
// Dispatch a scatter-reduction (see saver::*) to the cpu loop or the gpu
// kernel depending on where `source` lives. Each row j of `updates` is folded
// into row indices[j] of `source`. NOTE(review): when source is on the gpu,
// indices and updates are assumed gpu-resident too — confirm at call sites.
template<typename Saver, typename SrcT, typename IndexT>
void scatter_saver(Array<SrcT> source,
                   const Array<IndexT>& indices,
                   const Array<SrcT>& updates,
                   cudaStream_t stream=NULL) {
    if (source.device_ == DEVICE_T_GPU) {
        const int NT = 128;            // threads per block
        const int max_blocks = 40960;  // cap on grid size
        // grid covers source *columns*: each thread owns whole columns, so
        // concurrent accumulation never races on an element
        int grid_size = div_ceil(source.size()[1], NT);
        grid_size = std::min(grid_size, max_blocks);
        auto source_view = source.view();
        auto indices_view = indices.const_view();
        auto updated_view = updates.const_view();
        scatter_saver_reduction_kernel<Saver><<<grid_size, NT, 0, stream>>>(
            source_view,
            indices_view,
            updated_view
        );
    } else {
        auto source_view = source.view();
        auto indices_view = indices.const_view();
        auto updated_view = updates.const_view();
        scatter_saver_cpu<Saver>(
            source_view,
            indices_view,
            updated_view
        );
    }
}
// Proxy returned by Array::operator[](Array<int>): captures the target array
// and the row indices so that +=, -= and = route to scatter_saver with the
// matching saver policy.
template<typename T>
struct ArrayGather {
    Array<T> array_;     // destination (shares storage with the indexed array)
    Array<int> indices_; // one target row index per update row
    ArrayGather(const Array<T>& array, const Array<int>& indices)
        : array_(array), indices_(indices) {}
    // Scatter-add rows of `updates` into array_ at indices_.
    ArrayGather<T>& operator+=(const Array<T>& updates) {
        scatter_saver<saver::Increment>(array_, indices_, updates);
        return *this;
    }
    // Scatter-assign; with repeated indices, updates are applied in row
    // order, so later rows overwrite earlier ones.
    ArrayGather<T>& operator=(const Array<T>& updates) {
        scatter_saver<saver::Assign>(array_, indices_, updates);
        return *this;
    }
    // Scatter-subtract rows of `updates` from array_ at indices_.
    ArrayGather<T>& operator-=(const Array<T>& updates) {
        scatter_saver<saver::Decrement>(array_, indices_, updates);
        return *this;
    }
};
// Fancy-indexing entry point: source[indices] yields a scatter proxy rather
// than data (see ArrayGather above).
template<typename T>
ArrayGather<T> Array<T>::operator[](const Array<int>& indices) {
    return ArrayGather<T>(*this, indices);
}
// Demo: build a 5x3 gpu matrix, then scatter-add, scatter-subtract, and
// scatter-assign rows of `updates` into it at (partly repeated) row indices,
// printing the matrix after each step.
int main() {
    int dim = 5;
    int cols = 3;
    Array<float> source({dim, cols}, DEVICE_T_GPU);
    source = 0;  // gpu fill kernel
    Array<float> updates({dim, cols}, DEVICE_T_GPU);
    updates = 2;
    // index vector is built on the cpu, then copied to the gpu
    Array<int> indices({dim}, DEVICE_T_CPU);
    int i = 0;
    for (auto index : {0, 0, 2, 1, 2}) {
        indices[i++] = index;
    }
    indices.print();
    auto indices_gpu = indices.to_device(DEVICE_T_GPU);
    source.print();
    // increment repeatedly at this location:
    source[indices_gpu] += updates;
    source.print();
    // decrement repeatedly at this location:
    source[indices_gpu] -= updates;
    source.print();
    // not well defined in many to one setup
    source[indices_gpu] = updates;
    source.print();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment