@ShigekiKarita
Created July 18, 2017 11:01
#include <cassert>
#include <cstdio>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
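
// unique_ptr deleter that releases device memory with cudaFree
// and reports (but does not throw on) failure.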
struct CudaDeleter {
    void operator()(void *p) {
        cudaError_t res = cudaFree(p);
        if (res != cudaSuccess) {
            std::cout << "Error freeing: " << cudaGetErrorString(res) << std::endl;
        }
    }
};
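
// Allocate `size` bytes on the device and wrap the pointer in a
// unique_ptr so the memory is freed automatically.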
template <typename T>
std::unique_ptr<T, CudaDeleter> make_unique_cuda(size_t size) {
    void *pMemory = nullptr;
    cudaError_t res = cudaMalloc(&pMemory, size);
    if (res != cudaSuccess) {
        std::cout << "Error allocating pMemory: " << cudaGetErrorString(res) << std::endl;
        // a bare `throw;` with no active exception would call std::terminate
        throw std::runtime_error(cudaGetErrorString(res));
    }
    return std::unique_ptr<T, CudaDeleter>(static_cast<T*>(pMemory));
}

constexpr auto inf = std::numeric_limits<float>::infinity();
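
// Maps an N-dimensional index to a flat (column-major) offset,
// usable from both host and device code.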
template <size_t N>
struct MultiIndex {
    using type = MultiIndex<N>;
    const size_t size_list[N];

    template <typename ... Ts>
    __host__ __device__
    size_t operator()(const Ts ... indices) const {
        constexpr size_t M = sizeof...(indices);
        static_assert(M == N, "fancy indexing is not supported yet");
        const size_t index_list[M] = { indices ... };
        size_t ret = 0;
        size_t stride = 1;
        for (size_t m = 0; m < M; ++m) {
            assert(index_list[m] < size_list[m]);
            ret += stride * index_list[m];
            stride *= size_list[m];  // the stride is the product of all leading extents
        }
        return ret;
    }

    // Copy this index object into device memory.
    std::unique_ptr<type, CudaDeleter> to_gpu() {
        auto d = make_unique_cuda<type>(sizeof(type));
        cudaMemcpy(d.get(), this, sizeof(*this), cudaMemcpyHostToDevice);
        return d;  // no std::move needed; it would inhibit copy elision
    }
};
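
// Pretty-print the extents, e.g. "MultiIndex<2>(6, 8)".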
template <size_t N>
std::ostream& operator<<(std::ostream &o, const MultiIndex<N>& mindex) {
    o << "MultiIndex<" << N << ">(";
    for (size_t n = 0; n < N; ++n) {
        o << mindex.size_list[n];
        if (n != N - 1) {
            o << ", ";
        }
    }
    o << ")";
    return o;
}

template <typename ... Ts>
MultiIndex<sizeof...(Ts)> make_index(Ts ... sizes) {
    return {{sizes ...}};
}
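
// Pointwise distance functors usable on host and device.
// Note: L2 as defined here is sqrt(a^2 + b^2), the norm of the pair,
// not the Euclidean distance |a - b|.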
struct L1 {
    template <typename T>
    __device__ __host__ T operator()(T a, T b) const {
        return a > b ? a - b : b - a;  // |a - b| without calling std::fabs, which is host-only
    }
};

struct L2 {
    template <typename T>
    __device__ __host__ T operator()(T a, T b) const {
        return sqrt(a * a + b * b);  // unqualified sqrt picks the CUDA overload in device code
    }
};
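
// Reference implementation: dynamic time warping on the host.
// dpmat is an (alen x blen) DP table with a one-cell boundary at row/column 0.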
template <typename Dist, typename Vec>
auto dtw(const Vec& a, const Vec& b) -> typename Vec::value_type {
    size_t alen = a.size() + 1;
    size_t blen = b.size() + 1;
    auto dpmat = Vec(alen * blen);
    auto at = make_index(alen, blen);
    constexpr auto dist = Dist();
    for (size_t i = 1; i < alen; ++i) {
        dpmat[at(i, 0ul)] = inf;
    }
    for (size_t j = 1; j < blen; ++j) {
        dpmat[at(0ul, j)] = inf;
    }
    dpmat[at(0ul, 0ul)] = 0;
    for (size_t i = 1; i < alen; ++i) {
        for (size_t j = 1; j < blen; ++j) {
            // because of the boundary, cell (i, j) matches a[i-1] with b[j-1]
            auto cost = dist(a[i-1], b[j-1]);
            dpmat[at(i, j)] = cost + std::min({
                dpmat[at(i-1, j)],
                dpmat[at(i-1, j-1)],
                dpmat[at(i, j-1)]});
        }
    }
    // debug print of the DP table
    for (size_t i = 0; i < alen; ++i) {
        for (size_t j = 0; j < blen; ++j) {
            std::cout << dpmat[at(i, j)] << ", ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
    return dpmat[at(alen-1, blen-1)];
}
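
// Single-thread CUDA port of the DP above; launched <<<1, 1>>>, no parallelism yet.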
template <typename Dist>
__global__ void dtw_cuda(const float* a, const float* b, float* dpmat, const MultiIndex<2>* at_ptr) {
    const auto& at = *at_ptr;
    const auto alen = at.size_list[0];
    const auto blen = at.size_list[1];
    constexpr auto dist = Dist();
    for (size_t i = 1; i < alen; ++i) {
        dpmat[at(i, 0ul)] = inf;
    }
    for (size_t j = 1; j < blen; ++j) {
        dpmat[at(0ul, j)] = inf;
    }
    dpmat[at(0ul, 0ul)] = 0;
    for (size_t i = 1; i < alen; ++i) {
        for (size_t j = 1; j < blen; ++j) {
            auto cost = dist(a[i-1], b[j-1]);  // same one-cell boundary shift as the host version
            dpmat[at(i, j)] =
                cost + min(min(
                    dpmat[at(i-1, j)],
                    dpmat[at(i-1, j-1)]),
                    dpmat[at(i, j-1)]);
        }
    }
    printf("ans: %f\n", dpmat[at(alen-1, blen-1)]);
}
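
// Flattened storage for variable-length sequences plus the number of
// active sequences at each timestep (cf. PyTorch's PackedSequence).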
template <typename Storage>
struct PackedSequence {
    Storage storage;
    size_t* batch_sizes;
};
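
// Recover per-sequence lengths from per-timestep batch sizes. The same
// function also maps lengths back to batch sizes: the two views are
// transposes of each other.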
template <typename Iter>
std::vector<size_t> batch_to_length(const Iter batch_begin, const Iter batch_end) {
    // the input must be sorted in descending order, so *batch_begin is the sequence count
    std::vector<size_t> length_list(*batch_begin);
    for (Iter b = batch_begin; b != batch_end; ++b) {
        for (size_t i = 0; i < *b; ++i) {
            ++length_list[i];
        }
    }
    return length_list;
}
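
// Compare the CUDA DTW against the host reference on a small example.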
void test_dtw() {
    const float ra[] = {1,1,1,1,1};
    const float rb[] = {1,1,2,2,1,3,3};
    thrust::host_vector<float> a(ra, ra + 5);
    thrust::host_vector<float> b(rb, rb + 7);
    thrust::device_vector<float> da = a;
    thrust::device_vector<float> db = b;
    size_t alen = a.size() + 1;
    size_t blen = b.size() + 1;
    // alen * blen elements; dpmat(alen, blen) would mean alen copies of the value blen
    thrust::device_vector<float> dpmat(alen * blen);
    auto at = make_index(alen, blen);
    std::cout << at << std::endl;
    // keep the device copy of `at` alive until the asynchronous kernel has finished
    auto d_at = at.to_gpu();
    dtw_cuda<L1><<<1, 1>>>(thrust::raw_pointer_cast(da.data()),
                           thrust::raw_pointer_cast(db.data()),
                           thrust::raw_pointer_cast(dpmat.data()),
                           d_at.get());
    cudaDeviceSynchronize();
    thrust::host_vector<float> result = dpmat;
    std::cout << result[at(alen-1, blen-1)] << std::endl;
    for (size_t i = 0; i < alen; ++i) {
        for (size_t j = 0; j < blen; ++j) {
            std::cout << result[at(i, j)] << ", ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
    std::cout << dtw<L1>(a, b) << " == 6 ?" << std::endl;
    // with L2 as defined above: 3*sqrt(2) + 2*sqrt(5) + 2*sqrt(10)
    std::cout << dtw<L2>(a, b) << " == 13.0393 ?" << std::endl;
}
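
// Check that batch_to_length converts batch sizes to lengths and back.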
void test_packed_sequence() {
    // four sequences in time-major layout; entries past each
    // sequence's length (7, 9, 10, 11) are padding
    std::vector<float> padded = {
        0, 1, 2, 3,
        4, 5, 6, 7,
        8, 9, 10, 11
    };
    std::vector<size_t> lens = {3, 2, 2, 1};
    std::vector<float> packed = {
        0, 1, 2, 3,
        4, 5, 6,
        8
    };
    std::vector<size_t> batchs = {4, 3, 1};
    PackedSequence<float*> p{packed.data(), batchs.data()};
    const auto length_b = batch_to_length(batchs.cbegin(), batchs.cend());
    for (size_t i = 0; i < length_b.size(); ++i) {
        assert(lens[i] == length_b[i]);
    }
    const auto batchs_l = batch_to_length(lens.cbegin(), lens.cend());
    for (size_t i = 0; i < batchs_l.size(); ++i) {
        assert(batchs[i] == batchs_l[i]);
    }
}

int main() {
    test_dtw();
    test_packed_sequence();
}