@ShigekiKarita
Created July 18, 2017 11:01
#include <cassert>
#include <cstdio>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
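
// unique_ptr deleter that releases device memory with cudaFree
// and reports (but does not throw on) failure.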
struct CudaDeleter {
    void operator()(void *p) {
        cudaError_t res = cudaFree(p);
        if (res != cudaSuccess) {
            std::cout << "Error freeing: " << cudaGetErrorString(res) << std::endl;
        }
    }
};
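
// Allocate `size` bytes on the device and wrap the pointer in a
// unique_ptr so the memory is freed automatically.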
template <typename T>
std::unique_ptr<T, CudaDeleter> make_unique_cuda(size_t size) {
    void *pMemory = nullptr;
    cudaError_t res = cudaMalloc(&pMemory, size);
    if (res != cudaSuccess) {
        std::cout << "Error allocating pMemory: " << cudaGetErrorString(res) << std::endl;
        // a bare `throw;` with no active exception would call std::terminate
        throw std::runtime_error(cudaGetErrorString(res));
    }
    return std::unique_ptr<T, CudaDeleter>(static_cast<T*>(pMemory));
}

constexpr auto inf = std::numeric_limits<float>::infinity();
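
// Maps an N-dimensional index to a flat (column-major) offset,
// usable from both host and device code.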
template <size_t N>
struct MultiIndex {
    using type = MultiIndex<N>;
    const size_t size_list[N];

    template <typename ... Ts>
    __host__ __device__
    size_t operator()(const Ts ... indices) const {
        constexpr size_t M = sizeof...(indices);
        static_assert(M == N, "fancy indexing is not supported yet");
        const size_t index_list[M] = { indices ... };
        size_t ret = 0;
        size_t stride = 1;
        for (size_t m = 0; m < M; ++m) {
            assert(index_list[m] < size_list[m]);
            ret += stride * index_list[m];
            stride *= size_list[m];  // the stride is the product of all leading extents
        }
        return ret;
    }

    // Copy this index object into device memory.
    std::unique_ptr<type, CudaDeleter> to_gpu() {
        auto d = make_unique_cuda<type>(sizeof(type));
        cudaMemcpy(d.get(), this, sizeof(*this), cudaMemcpyHostToDevice);
        return d;  // no std::move needed; it would inhibit copy elision
    }
};
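
// Pretty-print the extents, e.g. "MultiIndex<2>(6, 8)".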
template <size_t N>
std::ostream& operator<<(std::ostream &o, const MultiIndex<N>& mindex) {
    o << "MultiIndex<" << N << ">(";
    for (size_t n = 0; n < N; ++n) {
        o << mindex.size_list[n];
        if (n != N - 1) {
            o << ", ";
        }
    }
    o << ")";
    return o;
}

template <typename ... Ts>
MultiIndex<sizeof...(Ts)> make_index(Ts ... sizes) {
    return {{sizes ...}};
}
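
// Pointwise distance functors usable on host and device.
// Note: L2 as defined here is sqrt(a^2 + b^2), the norm of the pair,
// not the Euclidean distance |a - b|.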
struct L1 {
    template <typename T>
    __device__ __host__ T operator()(T a, T b) const {
        return a > b ? a - b : b - a;  // |a - b| without calling std::fabs, which is host-only
    }
};

struct L2 {
    template <typename T>
    __device__ __host__ T operator()(T a, T b) const {
        return sqrt(a * a + b * b);  // unqualified sqrt picks the CUDA overload in device code
    }
};
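
// Reference implementation: dynamic time warping on the host.
// dpmat is an (alen x blen) DP table with a one-cell boundary at row/column 0.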
template <typename Dist, typename Vec>
auto dtw(const Vec& a, const Vec& b) -> typename Vec::value_type {
    size_t alen = a.size() + 1;
    size_t blen = b.size() + 1;
    auto dpmat = Vec(alen * blen);
    auto at = make_index(alen, blen);
    constexpr auto dist = Dist();
    for (size_t i = 1; i < alen; ++i) {
        dpmat[at(i, 0ul)] = inf;
    }
    for (size_t j = 1; j < blen; ++j) {
        dpmat[at(0ul, j)] = inf;
    }
    dpmat[at(0ul, 0ul)] = 0;
    for (size_t i = 1; i < alen; ++i) {
        for (size_t j = 1; j < blen; ++j) {
            // because of the boundary, cell (i, j) matches a[i-1] with b[j-1]
            auto cost = dist(a[i-1], b[j-1]);
            dpmat[at(i, j)] = cost + std::min({
                dpmat[at(i-1, j)],
                dpmat[at(i-1, j-1)],
                dpmat[at(i, j-1)]});
        }
    }
    // debug print of the DP table
    for (size_t i = 0; i < alen; ++i) {
        for (size_t j = 0; j < blen; ++j) {
            std::cout << dpmat[at(i, j)] << ", ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
    return dpmat[at(alen-1, blen-1)];
}
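
// Single-thread CUDA port of the DP above; launched <<<1, 1>>>, no parallelism yet.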
template <typename Dist>
__global__ void dtw_cuda(const float* a, const float* b, float* dpmat, const MultiIndex<2>* at_ptr) {
    const auto& at = *at_ptr;
    const auto alen = at.size_list[0];
    const auto blen = at.size_list[1];
    constexpr auto dist = Dist();
    for (size_t i = 1; i < alen; ++i) {
        dpmat[at(i, 0ul)] = inf;
    }
    for (size_t j = 1; j < blen; ++j) {
        dpmat[at(0ul, j)] = inf;
    }
    dpmat[at(0ul, 0ul)] = 0;
    for (size_t i = 1; i < alen; ++i) {
        for (size_t j = 1; j < blen; ++j) {
            auto cost = dist(a[i-1], b[j-1]);  // same one-cell boundary shift as the host version
            dpmat[at(i, j)] =
                cost + min(min(
                    dpmat[at(i-1, j)],
                    dpmat[at(i-1, j-1)]),
                    dpmat[at(i, j-1)]);
        }
    }
    printf("ans: %f\n", dpmat[at(alen-1, blen-1)]);
}
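
// Flattened storage for variable-length sequences plus the number of
// active sequences at each timestep (cf. PyTorch's PackedSequence).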
template <typename Storage>
struct PackedSequence {
    Storage storage;
    size_t* batch_sizes;
};
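
// Recover per-sequence lengths from per-timestep batch sizes. The same
// function also maps lengths back to batch sizes: the two views are
// transposes of each other.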
template <typename Iter>
std::vector<size_t> batch_to_length(const Iter batch_begin, const Iter batch_end) {
    // the input must be sorted in descending order, so *batch_begin is the sequence count
    std::vector<size_t> length_list(*batch_begin);
    for (Iter b = batch_begin; b != batch_end; ++b) {
        for (size_t i = 0; i < *b; ++i) {
            ++length_list[i];
        }
    }
    return length_list;
}
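
// Compare the CUDA DTW against the host reference on a small example.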
void test_dtw() {
    const float ra[] = {1,1,1,1,1};
    const float rb[] = {1,1,2,2,1,3,3};
    thrust::host_vector<float> a(ra, ra + 5);
    thrust::host_vector<float> b(rb, rb + 7);
    thrust::device_vector<float> da = a;
    thrust::device_vector<float> db = b;
    size_t alen = a.size() + 1;
    size_t blen = b.size() + 1;
    // alen * blen elements; dpmat(alen, blen) would mean alen copies of the value blen
    thrust::device_vector<float> dpmat(alen * blen);
    auto at = make_index(alen, blen);
    std::cout << at << std::endl;
    // keep the device copy of `at` alive until the asynchronous kernel has finished
    auto d_at = at.to_gpu();
    dtw_cuda<L1><<<1, 1>>>(thrust::raw_pointer_cast(da.data()),
                           thrust::raw_pointer_cast(db.data()),
                           thrust::raw_pointer_cast(dpmat.data()),
                           d_at.get());
    cudaDeviceSynchronize();
    thrust::host_vector<float> result = dpmat;
    std::cout << result[at(alen-1, blen-1)] << std::endl;
    for (size_t i = 0; i < alen; ++i) {
        for (size_t j = 0; j < blen; ++j) {
            std::cout << result[at(i, j)] << ", ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
    std::cout << dtw<L1>(a, b) << " == 6 ?" << std::endl;
    // with L2 as defined above: 3*sqrt(2) + 2*sqrt(5) + 2*sqrt(10)
    std::cout << dtw<L2>(a, b) << " == 13.0393 ?" << std::endl;
}
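
// Check that batch_to_length converts batch sizes to lengths and back.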
void test_packed_sequence() {
    // four sequences in time-major layout; entries past each
    // sequence's length (7, 9, 10, 11) are padding
    std::vector<float> padded = {
        0, 1, 2, 3,
        4, 5, 6, 7,
        8, 9, 10, 11
    };
    std::vector<size_t> lens = {3, 2, 2, 1};
    std::vector<float> packed = {
        0, 1, 2, 3,
        4, 5, 6,
        8
    };
    std::vector<size_t> batchs = {4, 3, 1};
    PackedSequence<float*> p{packed.data(), batchs.data()};
    const auto length_b = batch_to_length(batchs.cbegin(), batchs.cend());
    for (size_t i = 0; i < length_b.size(); ++i) {
        assert(lens[i] == length_b[i]);
    }
    const auto batchs_l = batch_to_length(lens.cbegin(), lens.cend());
    for (size_t i = 0; i < batchs_l.size(); ++i) {
        assert(batchs[i] == batchs_l[i]);
    }
}

int main() {
    test_dtw();
    test_packed_sequence();
}