// CUDA / CPU one-file Array library
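//
// A minimal n-dimensional Array container that can live on the host (cpu) or
// on a CUDA device (gpu), with reference-counted storage, elementwise scalar
// operations, and gather/scatter updates through integer index arrays.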
#include <algorithm>
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <cuda_runtime.h>
#define XINLINE __device__ __host__
#define MAX_DIM 10
#define INDENT_INCREMENT 2
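// Elementwise functors: each one maps a single input value to an output value
// and is applied by the unary kernels below.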
namespace functor {
struct Fill {
double fill_;
XINLINE Fill(double fill) : fill_(fill) {}
template<typename T>
T XINLINE operator()(const T& val) {
return fill_;
}
};
struct Increment {
double inc_;
XINLINE Increment(double inc) : inc_(inc) {}
template<typename T>
T XINLINE operator()(const T& val) {
return val + inc_;
}
};
struct Decrement {
double dec_;
XINLINE Decrement(double dec) : dec_(dec) {}
template<typename T>
T XINLINE operator()(const T& val) {
return val - dec_;
}
};
}
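// Savers describe how a freshly computed value is folded into an existing
// destination element: accumulate, subtract, or plain overwrite.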
namespace saver {
struct Increment {
template<typename T>
static void XINLINE save(T& left, const T& right) {
left += right;
}
};
struct Decrement {
template<typename T>
static void XINLINE save(T& left, const T& right) {
left -= right;
}
};
struct Assign {
template<typename T>
static void XINLINE save(T& left, const T& right) {
left = right;
}
};
}
int div_ceil(int a, int b) {
return (a + b - 1) / b;
}
void make_message(std::stringstream* ss) {}
template<typename Arg, typename... Args>
void make_message(std::stringstream* ss, const Arg& arg, const Args&... args) {
(*ss) << arg;
make_message(ss, args...);
}
template<typename... Args>
std::string make_message(const Args&... args) {
std::stringstream ss;
make_message(&ss, args...);
return ss.str();
}
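// Fixed-capacity shape descriptor: stores up to MAX_DIM extents inline so it
// can be passed by value into CUDA kernels without any heap allocation.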
struct Dimension {
int sizes_[MAX_DIM];
int ndim_;
Dimension(const std::vector<int>& sizes) : ndim_(sizes.size()) {
for (size_t i = 0; i < sizes.size(); i++) {
sizes_[i] = sizes[i];
}
}
XINLINE Dimension(std::initializer_list<int> sizes) : ndim_(sizes.size()) {
int i = 0;
for (auto iter = sizes.begin(); iter != sizes.end(); iter++) {
sizes_[i] = *iter;
i++;
}
}
XINLINE ~Dimension() {
}
int XINLINE ndim() const {
return ndim_;
}
int XINLINE operator[](int dim) const {
return sizes_[dim];
}
void XINLINE set_dim(int dim, int value) {
sizes_[dim] = value;
}
Dimension(const Dimension& other) : ndim_(other.ndim_) {
for (int i = 0; i < ndim_; i++) {
sizes_[i] = other.sizes_[i];
}
}
Dimension& XINLINE operator=(const Dimension& other) {
ndim_ = other.ndim();
for (int i = 0; i < other.ndim(); i++) {
sizes_[i] = other[i];
}
return *this;
}
int XINLINE numel() const {
int volume = 1;
for (int i = 0; i < ndim_; i++) {
volume *= sizes_[i];
}
return volume;
}
Dimension XINLINE subsize() const {
Dimension subdim({});
subdim.ndim_ = ndim_ - 1;
for (int i = 1; i < ndim_; i++) {
subdim.set_dim(i - 1, sizes_[i]);
}
return subdim;
}
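// Flatten an n-dimensional index into a row-major offset. For sizes (5, 3)
// and indices {1, 2} this yields 1 * 3 + 2 = 5.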
static int XINLINE index2offset(const Dimension& sizes, const Dimension& indices) {
int offset = 0;
int volume = 1;
for (int i = indices.ndim() - 1; i >= 0; i--) {
offset += indices[i] * volume;
volume *= sizes[i];
}
return offset;
}
};
std::ostream& operator<<(std::ostream& stream, const Dimension& dims) {
stream << "(";
for (int i = 0; i < dims.ndim(); i++) {
stream << dims[i];
if (i != dims.ndim() - 1) {
stream << ", ";
}
}
return stream << ")";
}
enum DEVICE_T {
DEVICE_T_CPU,
DEVICE_T_GPU
};
std::ostream& operator<<(std::ostream& stream, const DEVICE_T& device) {
return stream << ((device == DEVICE_T_CPU) ? "cpu" : "gpu");
}
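// CRTP base shared by owning Arrays and non-owning ArrayReferences: a shape,
// a raw pointer, an element offset, and the device the memory lives on.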
template<typename Container, typename T>
struct ArrayLike {
Dimension sizes_;
T* ptr_;
int offset_;
const DEVICE_T device_;
int XINLINE numel() const {
return sizes_.numel();
}
int XINLINE offset() const {
return offset_;
}
const Dimension& XINLINE size() const {
return sizes_;
}
T& XINLINE operator[](const int& index) {
return *(ptr_ + offset_ + index);
}
const T& XINLINE operator[](const int& index) const {
return *(ptr_ + offset_ + index);
}
T& XINLINE operator[](const Dimension& indices) {
int idx_offset = Dimension::index2offset(this->sizes_, indices);
return *(this->ptr_ + this->offset_ + idx_offset);
}
const T& XINLINE operator[](const Dimension& indices) const {
int idx_offset = Dimension::index2offset(this->sizes_, indices);
return *(this->ptr_ + this->offset_ + idx_offset);
}
int XINLINE ndim() const {
return sizes_.ndim();
}
ArrayLike(const Dimension& sizes, const DEVICE_T& dev, T* ptr, const int& offset)
: sizes_(sizes),
ptr_(ptr),
offset_(offset),
device_(dev) {}
};
template<typename T>
struct ArrayReference : public ArrayLike<ArrayReference<T>, T> {
using ArrayLike<ArrayReference<T>, T>::ArrayLike;
};
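// The kernels below use the grid-stride loop pattern: each thread starts at
// its global index and hops by the total thread count, so any grid size can
// cover an array of any length.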
template<typename Functor, typename T>
__global__ void
unary_kernel(Functor func,
const ArrayReference<T> arr,
ArrayReference<T> dest) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int size = arr.numel();
for (int i = idx; i < size; i += stride) {
dest[i] = func(arr[i]);
}
}
template<typename Functor, typename T>
__global__ void
binary_kernel(Functor func,
const ArrayReference<T> arr,
const ArrayReference<T> arr2,
ArrayReference<T> dest) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int size = arr.numel();
for (int i = idx; i < size; i += stride) {
dest[i] = func(arr[i], arr2[i]);
}
}
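// CPU scatter: for each row j of `updates`, fold it into row indices[j] of
// `source`, column by column. Repeated indices are applied sequentially.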
template<typename Saver, typename SrcT, typename IndexT>
void scatter_saver_cpu(ArrayReference<SrcT> source,
const ArrayReference<IndexT> indices,
const ArrayReference<SrcT> updates) {
int source_cols = source.size()[1];
int size = indices.numel();
for (int j = 0; j < size; ++j) {
auto scatter_index = indices[j];
for (int col_idx = 0; col_idx < source_cols; ++col_idx) {
Saver::save(source[{scatter_index, col_idx}], updates[{j, col_idx}]);
}
}
}
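// GPU scatter: threads parallelize across columns while each thread walks the
// rows serially, so two threads never touch the same source element even when
// an index repeats (no atomics needed).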
template<typename Saver, typename SrcT, typename IndexT>
__global__
void scatter_saver_reduction_kernel(
ArrayReference<SrcT> source,
const ArrayReference<IndexT> indices,
const ArrayReference<SrcT> updates) {
int source_cols = source.size()[1];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int size = indices.numel();
for (int col_idx = idx; col_idx < source_cols; col_idx += stride) {
for (int j = 0; j < size; j++) {
auto scatter_index = indices[j];
Saver::save(source[{scatter_index, col_idx}], updates[{j, col_idx}]);
}
}
}
// __global__ void scatter_saver_atomic(real_t* source, real_t* indices, real_t* updates) {
// }
template<typename Functor, typename Container, typename DestContainer>
void launch_unary_kernel(const Functor& func,
const Container& arr,
DestContainer dest) {
const int NT = 128;
const int max_blocks = 40960;
// one grid-stride thread per element; cap the grid at max_blocks
int grid_size = div_ceil(arr.numel(), NT);
grid_size = std::min(grid_size, max_blocks);
auto view = arr.const_view();
auto dest_view = dest.view();
unary_kernel<<<grid_size, NT, 0, NULL>>>(
func, view, dest_view
);
}
template<typename Functor, typename Container1, typename Container2, typename DestContainer>
void launch_binary_kernel(const Functor& func,
const Container1& arr1,
const Container2& arr2,
DestContainer dest) {
const int NT = 128;
const int max_blocks = 40960;
// one grid-stride thread per element pair; cap the grid at max_blocks
if (arr1.numel() != arr2.numel()) {
throw std::runtime_error(
make_message(
"error: inputs to binary elementwise operation"
" must have the same number of elements (got "
"left.numel() = ", arr1.numel(),
", right.numel() = ", arr2.numel(), ")."
)
);
}
int grid_size = div_ceil(arr1.numel(), NT);
grid_size = std::min(grid_size, max_blocks);
auto view1 = arr1.const_view();
auto view2 = arr2.const_view();
auto dest_view = dest.view();
binary_kernel<<<grid_size, NT, 0, NULL>>>(
func, view1, view2, dest_view
);
}
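// Process-wide record of which CUDA device is active, so cudaSetDevice is
// only called when the device actually changes.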
struct Device {
static int gpu_device;
static void set_gpu_device(int device) {
if (gpu_device != device) {
int count;
cudaGetDeviceCount(&count);
if (count == 0) {
throw std::runtime_error(
"no gpu devices found"
);
} else {
std::cout << count << " gpu device" << (count == 1 ? " " : "s ") << "found" << std::endl;
}
auto status = cudaSetDevice(device);
if (status != cudaSuccess) {
throw std::runtime_error(
make_message(
"could not set the gpu device to ",
device, ", reason = ", cudaGetErrorString(status)
)
);
}
gpu_device = device;
}
}
};
int Device::gpu_device = -1;
template<typename T>
struct ArrayGather;
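// Owning, reference-counted n-dimensional array. Copies share the same
// buffer; the last owner frees it on the device where it was allocated.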
template<typename T>
struct Array : public ArrayLike<Array<T>, T> {
std::shared_ptr<int> owners_;
Array() : ArrayLike<Array<T>, T>({}, DEVICE_T_CPU, nullptr, 0),
owners_(std::make_shared<int>(1)) {}
void allocate_data() {
if (this->device_ == DEVICE_T_CPU) {
allocate_data_cpu();
} else {
allocate_data_gpu();
}
}
void allocate_data_cpu() {
auto ptr = malloc(this->sizes_.numel() * sizeof(T));
if (ptr == NULL) {
throw std::runtime_error(
make_message(
"could not allocate ",
this->sizes_.numel() * sizeof(T),
" bytes on the cpu"
)
);
} else {
this->ptr_ = (T*)ptr;
}
}
void allocate_data_gpu() {
Device::set_gpu_device(0);
auto status = cudaMalloc(&this->ptr_, this->sizes_.numel() * sizeof(T));
if (status != cudaSuccess) {
throw std::runtime_error(
make_message(
"could not allocate ",
this->sizes_.numel() * sizeof(T),
" bytes on the gpu, reason = ",
cudaGetErrorString(status)
)
);
}
}
Array(const Dimension& sizes, DEVICE_T dev)
: ArrayLike<Array<T>, T>(sizes, dev, nullptr, 0),
owners_(std::make_shared<int>(1)) {
allocate_data();
}
Array<T> subtensor(int idx) const {
auto arr = slice(idx, idx + 1);
arr.sizes_ = arr.sizes_.subsize();
return arr;
}
Array<T> slice(int start, int end) const {
Array<T> arr = *this;
int subvolume = 1;
for (int i = 1; i < this->ndim(); i++) {
subvolume *= this->sizes_[i];
}
arr.offset_ = this->offset_ + start * subvolume;
arr.sizes_.set_dim(0, end - start);
return arr;
}
T& XINLINE operator[](const int& index) {
return *(this->ptr_ + this->offset_ + index);
}
const T& XINLINE operator[](const int& index) const {
return *(this->ptr_ + this->offset_ + index);
}
T& XINLINE operator[](const Dimension& indices) {
int idx_offset = Dimension::index2offset(this->sizes_, indices);
return *(this->ptr_ + this->offset_ + idx_offset);
}
const T& XINLINE operator[](const Dimension& indices) const {
int idx_offset = Dimension::index2offset(this->sizes_, indices);
return *(this->ptr_ + this->offset_ + idx_offset);
}
ArrayGather<T> operator[](const Array<int>& indices);
Array<T> to_device(DEVICE_T dev) const {
if (this->device_ == dev) {
return *this;
} else {
Array<T> arr(this->sizes_, dev);
auto copy_type = this->device_ == DEVICE_T_CPU ?
cudaMemcpyHostToDevice :
cudaMemcpyDeviceToHost;
// copy the payload across the host/device boundary
auto status = cudaMemcpy(
arr.ptr_ + arr.offset_,
this->ptr_ + this->offset_,
this->sizes_.numel() * sizeof(T),
copy_type
);
if (status != cudaSuccess) {
throw std::runtime_error(
make_message(
"could not copy ",
this->sizes_.numel() * sizeof(T),
" bytes from ",
this->device_ == DEVICE_T_CPU ?
"cpu to gpu" : "gpu to cpu",
", reason = ",
cudaGetErrorString(status)
)
);
}
return arr;
}
}
Array(const Array<T>& other)
: ArrayLike<Array<T>, T>(other.sizes_, other.device_, other.ptr_, other.offset_),
owners_(other.owners_) {
(*owners_) += 1;
}
void free_data() {
if (this->device_ == DEVICE_T_CPU) {
free_data_cpu();
} else {
free_data_gpu();
}
}
void free_data_cpu() {
free((void*)this->ptr_);
}
void free_data_gpu() {
auto status = cudaFree((void*)this->ptr_);
if (status != cudaSuccess) {
throw std::runtime_error(
make_message(
"could not free memory on device : ",
status == cudaErrorInvalidDevicePointer ?
"cudaErrorInvalidDevicePointer" :
"cudaErrorInitializationError"
)
);
}
}
~Array() {
(*owners_) -= 1;
if (this->ptr_ != nullptr && (*owners_) == 0) {
free_data();
}
}
void print() const {
return print(0, false);
}
Array& operator=(const double& other);
Array& operator+=(const double& other);
Array& operator-=(const double& other);
void print(int indent, bool print_comma) const {
if (this->device_ == DEVICE_T_CPU) {
if (this->ndim() == 1) {
std::cout << std::string(indent, ' ') << "[";
for (int i = 0; i < this->sizes_[0]; i++) {
std::cout << (*this)[i];
if (i != this->sizes_[0] - 1) {
std::cout << " ";
}
}
if (print_comma) {
std::cout << "],\n";
} else {
std::cout << "]\n";
}
} else if (this->ndim() > 1) {
std::cout << std::string(indent, ' ') << "[\n";
for (int i = 0; i < this->sizes_[0]; i++) {
auto child = subtensor(i);
child.print(indent + INDENT_INCREMENT, i != this->sizes_[0] - 1);
}
if (print_comma) {
std::cout << std::string(indent, ' ') << "],\n";
} else {
std::cout << std::string(indent, ' ') << "]\n";
}
} else {
std::cout << std::string(indent, ' ') << "()" << std::endl;
}
} else {
auto arr = to_device(DEVICE_T_CPU);
arr.print();
}
}
ArrayReference<T> view() {
return ArrayReference<T>(
this->sizes_,
this->device_,
this->ptr_,
this->offset_
);
}
const ArrayReference<T> const_view() const {
return ArrayReference<T>(
this->sizes_,
this->device_,
this->ptr_,
this->offset_
);
}
};
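// Scalar assignment and in-place arithmetic: a plain loop on the cpu, a
// grid-stride unary kernel on the gpu.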
template<typename T>
Array<T>& Array<T>::operator=(const double& other) {
if (this->device_ == DEVICE_T_CPU) {
for (int i = 0; i < this->numel(); i++) {
(*this)[i] = other;
}
} else {
launch_unary_kernel(functor::Fill(other), *this, *this);
}
return *this;
}
template<typename T>
Array<T>& Array<T>::operator+=(const double& other) {
if (this->device_ == DEVICE_T_CPU) {
for (int i = 0; i < this->numel(); i++) {
(*this)[i] += other;
}
} else {
launch_unary_kernel(functor::Increment(other), *this, *this);
}
return *this;
}
template<typename T>
Array<T>& Array<T>::operator-=(const double& other) {
if (this->device_ == DEVICE_T_CPU) {
for (int i = 0; i < this->numel(); i++) {
(*this)[i] -= other;
}
} else {
launch_unary_kernel(functor::Decrement(other), *this, *this);
}
return *this;
}
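// Dispatch scatter updates to the cpu loop or the gpu kernel depending on
// where `source` lives. `indices` and `updates` are assumed to live on the
// same device as `source`: the views are raw pointers and nothing is checked.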
template<typename Saver, typename SrcT, typename IndexT>
void scatter_saver(Array<SrcT> source,
const Array<IndexT>& indices,
const Array<SrcT>& updates,
cudaStream_t stream=NULL) {
if (source.device_ == DEVICE_T_GPU) {
const int NT = 128;
const int max_blocks = 40960;
// divide up by each embedding dimension (to avoid data-races in accumulation)
int grid_size = div_ceil(source.size()[1], NT);
grid_size = std::min(grid_size, max_blocks);
auto source_view = source.view();
auto indices_view = indices.const_view();
auto updated_view = updates.const_view();
scatter_saver_reduction_kernel<Saver><<<grid_size, NT, 0, stream>>>(
source_view,
indices_view,
updated_view
);
} else {
auto source_view = source.view();
auto indices_view = indices.const_view();
auto updated_view = updates.const_view();
scatter_saver_cpu<Saver>(
source_view,
indices_view,
updated_view
);
}
}
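// Proxy returned by arr[indices]: records the target array and the index
// array so that +=, -=, and = can be routed through scatter_saver.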
template<typename T>
struct ArrayGather {
Array<T> array_;
Array<int> indices_;
ArrayGather(const Array<T>& array, const Array<int>& indices)
: array_(array), indices_(indices) {}
ArrayGather<T>& operator+=(const Array<T>& updates) {
scatter_saver<saver::Increment>(array_, indices_, updates);
return *this;
}
ArrayGather<T>& operator=(const Array<T>& updates) {
scatter_saver<saver::Assign>(array_, indices_, updates);
return *this;
}
ArrayGather<T>& operator-=(const Array<T>& updates) {
scatter_saver<saver::Decrement>(array_, indices_, updates);
return *this;
}
};
template<typename T>
ArrayGather<T> Array<T>::operator[](const Array<int>& indices) {
return ArrayGather<T>(*this, indices);
}
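// Demo: build a 5x3 gpu array, then scatter-increment, scatter-decrement, and
// scatter-assign rows of it through a (possibly repeating) index array.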
int main() {
int dim = 5;
int cols = 3;
Array<float> source({dim, cols}, DEVICE_T_GPU);
source = 0;
Array<float> updates({dim, cols}, DEVICE_T_GPU);
updates = 2;
Array<int> indices({dim}, DEVICE_T_CPU);
int i = 0;
for (auto index : {0, 0, 2, 1, 2}) {
indices[i++] = index;
}
indices.print();
auto indices_gpu = indices.to_device(DEVICE_T_GPU);
source.print();
// increment repeatedly at this location:
source[indices_gpu] += updates;
source.print();
// decrement repeatedly at this location:
source[indices_gpu] -= updates;
source.print();
// plain assignment is not well defined when several updates map to the same
// row: the last writer wins in an unspecified order
source[indices_gpu] = updates;
source.print();
}
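// Build sketch (an assumption, not part of the original gist): compile as a
// single .cu file, e.g.
//   nvcc -std=c++11 array.cu -o array
// Using std::initializer_list inside __device__ code may additionally require
// --expt-relaxed-constexpr on some nvcc versions.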