ShigekiKarita · July 18, 2017 00:47
diff --git a/dtw.cu b/dtw.cu
 #include <assert.h>
 #include <thrust/host_vector.h>

 #include <thrust/device_vector.h>

 #include <algorithm> 
 #include <iostream>
 #include <limits>
 #include <cmath>
 #include <utility>

 const static auto inf = std::numeric_limits<float>::infinity();


 template <size_t N>
 struct MultiIndex {
    const size_t size_list[N];

    template <typename ... Ts>
    __host__ __device__
    size_t operator()(const Ts ... indices) const {
        constexpr size_t M = sizeof...(indices);
        static_assert(M == N, "fancy indexing is not supported yet");

        const size_t indice_list[M] = { indices ... };
        size_t ret = 0;
        for (size_t m = 0; m < M; ++m) {
            assert(indice_list[m] < size_list[m]);
            ret += (m > 0 ? size_list[m-1] : 1) * indice_list[m];
        }
        return ret;
    }
 };

 template <typename ... Ts>
 MultiIndex<sizeof...(Ts)> make_index(Ts ... sizes) {
    return {{sizes ...}};
 }

 struct L1 {
    template <typename T>
    T operator()(T a, T b) {
        return std::abs(a - b);
    }
 };

 struct L2 {
    template <typename T>
    T operator()(T a, T b) {
        return std::sqrt(a * a + b * b);
    }
 };

 template <typename Dist, typename Vec>
 auto dtw(const Vec& a, const Vec& b) -> decltype(a[0]) {
    size_t alen = a.size() + 1;
    size_t blen = b.size() + 1;
    auto dpmat = Vec(alen * blen);
    auto at = make_index(alen, blen);
    auto dist = Dist();

    for (size_t i = 1; i < alen; ++i) {
        dpmat[at(i, 0ul)] = inf;
    }
    for (size_t j = 1; j < blen; ++j) {
        dpmat[at(0ul, j)] = inf;
    }
    dpmat[at(0ul, 0ul)] = 0;

    for (size_t i = 1; i < alen; ++i) {
        for (size_t j = 1; j < blen; ++j) {
            auto cost = dist(a[i], b[j]);
            dpmat[at(i, j)] = cost + std::min({
                    dpmat[at(i-1, j)],
                    dpmat[at(i-1, j-1)],
                    dpmat[at(i,   j-1)]});
        }
    }
    return dpmat[at(alen-1, blen-1)];
 }


 template <typename Storage, typename Indexer>
 __global__ void dtw_cuda(const Storage a, const Storage b, Storage dpmat, Indexer at) {
    const auto alen = at.size_list[0];
    const auto blen = at.size_list[1];

    for (size_t i = 1; i < alen; ++i) {
        dpmat[at(i, 0ul)] = inf;
    }
    for (size_t j = 1; j < blen; ++j) {
        dpmat[at(0ul, j)] = inf;
    }
    dpmat[at(0ul, 0ul)] = 0;

    for (size_t i = 1; i < alen; ++i) {
        for (size_t j = 1; j < blen; ++j) {
            auto cost = fabs(a[i] - b[j]);
            dpmat[at(i, j)] = cost + fminf(fminf(
                    dpmat[at(i-1, j)],
                    dpmat[at(i-1, j-1)]),
                    dpmat[at(i,   j-1)]);
        }
    }
 }


 template <typename Storage>
 struct PackedSequence {
    Storage storage;
    size_t* batch_sizes;
 };

 template <typename Iter>
 std::vector<size_t> batch_to_length(const Iter batch_begin, const Iter batch_end) {
    std::vector<size_t> length_list(*batch_begin); // must be sorted with a descend order
    for (Iter b = batch_begin; b != batch_end; ++b) {
        for (size_t i = 0; i < *b; ++i) {
            ++length_list[i];
        }
    }
    return std::move(length_list);
 }


 void test_packed_sequence() {
    std::vector<float> padded = {
        0, 1, 2, 3,
        4, 5, 6, 7,
        8, 9, 10, 110
    };
    std::vector<size_t> lens = {3, 2, 2, 1};

    std::vector<float> packed = {
        0, 1, 2, 3,
        4, 5, 6,
        8
    };
    std::vector<size_t> batchs = {4, 3, 1};

    PackedSequence<float*> p{packed.data(), batchs.data()} ;

    const auto length_b = batch_to_length(batchs.cbegin(), batchs.cend());
    for (size_t i = 0; i < length_b.size(); ++i) {
        assert(lens[i] == length_b[i]);
    }

    const auto batchs_l = batch_to_length(lens.cbegin(), lens.cend());
    for (size_t i = 0; i < batchs_l.size(); ++i) {
        assert(batchs[i] == batchs_l[i]);
    }
 }

 int main() {
    const float ra[] = {1,1,1,1,1};
    const float rb[] = {1,1,2,2,1,3,3};
    thrust::host_vector<float> a(ra, ra + 5);
    thrust::host_vector<float> b(rb, rb + 7);

    thrust::device_vector<float> da = a;
    thrust::device_vector<float> db = b;
    size_t alen = a.size() + 1;
    size_t blen = b.size() + 1;
    thrust::device_vector<float> dpmat(alen, blen);
    const auto at = make_index(alen, blen);
    dtw_cuda<<<1, 1>>>(thrust::raw_pointer_cast(da.data()),
                       thrust::raw_pointer_cast(db.data()),
                       thrust::raw_pointer_cast(dpmat.data()),
                       at);


    thrust::host_vector<float> result(dpmat.size());
    thrust::copy(dpmat.begin(), dpmat.end(), result.begin());

    std::cout << result[at(alen-1, blen-1)] << std::endl;;

    for (size_t i = 0; i < alen; ++i) {
        for (size_t j = 0; j < blen; ++j) {
            std::cout << result[at(i, j)] << ", ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;

    std::cout << dtw<L1>(a, b) << " == 6 ?" << std::endl;
    std::cout << dtw<L2>(a, b) << " == 13.3006 ?" << std::endl;
    test_packed_sequence();
 }
	#include <assert.h>
	#include <thrust/host_vector.h>

	#include <thrust/device_vector.h>

	#include <algorithm>
	#include <iostream>
	#include <limits>
	#include <cmath>
	#include <utility>

	const static auto inf = std::numeric_limits<float>::infinity();


	template <size_t N>
	struct MultiIndex {
	const size_t size_list[N];

	template <typename ... Ts>
	__host__ __device__
	size_t operator()(const Ts ... indices) const {
	constexpr size_t M = sizeof...(indices);
	static_assert(M == N, "fancy indexing is not supported yet");

	const size_t indice_list[M] = { indices ... };
	size_t ret = 0;
	for (size_t m = 0; m < M; ++m) {
	assert(indice_list[m] < size_list[m]);
	ret += (m > 0 ? size_list[m-1] : 1) * indice_list[m];
	}
	return ret;
	}
	};

	template <typename ... Ts>
	MultiIndex<sizeof...(Ts)> make_index(Ts ... sizes) {
	return {{sizes ...}};
	}

	struct L1 {
	template <typename T>
	T operator()(T a, T b) {
	return std::abs(a - b);
	}
	};

	struct L2 {
	template <typename T>
	T operator()(T a, T b) {
	return std::sqrt(a * a + b * b);
	}
	};

	template <typename Dist, typename Vec>
	auto dtw(const Vec& a, const Vec& b) -> decltype(a[0]) {
	size_t alen = a.size() + 1;
	size_t blen = b.size() + 1;
	auto dpmat = Vec(alen * blen);
	auto at = make_index(alen, blen);
	auto dist = Dist();

	for (size_t i = 1; i < alen; ++i) {
	dpmat[at(i, 0ul)] = inf;
	}
	for (size_t j = 1; j < blen; ++j) {
	dpmat[at(0ul, j)] = inf;
	}
	dpmat[at(0ul, 0ul)] = 0;

	for (size_t i = 1; i < alen; ++i) {
	for (size_t j = 1; j < blen; ++j) {
	auto cost = dist(a[i], b[j]);
	dpmat[at(i, j)] = cost + std::min({
	dpmat[at(i-1, j)],
	dpmat[at(i-1, j-1)],
	dpmat[at(i, j-1)]});
	}
	}
	return dpmat[at(alen-1, blen-1)];
	}


	template <typename Storage, typename Indexer>
	__global__ void dtw_cuda(const Storage a, const Storage b, Storage dpmat, Indexer at) {
	const auto alen = at.size_list[0];
	const auto blen = at.size_list[1];

	for (size_t i = 1; i < alen; ++i) {
	dpmat[at(i, 0ul)] = inf;
	}
	for (size_t j = 1; j < blen; ++j) {
	dpmat[at(0ul, j)] = inf;
	}
	dpmat[at(0ul, 0ul)] = 0;

	for (size_t i = 1; i < alen; ++i) {
	for (size_t j = 1; j < blen; ++j) {
	auto cost = fabs(a[i] - b[j]);
	dpmat[at(i, j)] = cost + fminf(fminf(
	dpmat[at(i-1, j)],
	dpmat[at(i-1, j-1)]),
	dpmat[at(i, j-1)]);
	}
	}
	}


	template <typename Storage>
	struct PackedSequence {
	Storage storage;
	size_t* batch_sizes;
	};

	template <typename Iter>
	std::vector<size_t> batch_to_length(const Iter batch_begin, const Iter batch_end) {
	std::vector<size_t> length_list(*batch_begin); // must be sorted with a descend order
	for (Iter b = batch_begin; b != batch_end; ++b) {
	for (size_t i = 0; i < *b; ++i) {
	++length_list[i];
	}
	}
	return std::move(length_list);
	}


	void test_packed_sequence() {
	std::vector<float> padded = {
	0, 1, 2, 3,
	4, 5, 6, 7,
	8, 9, 10, 110
	};
	std::vector<size_t> lens = {3, 2, 2, 1};

	std::vector<float> packed = {
	0, 1, 2, 3,
	4, 5, 6,
	8
	};
	std::vector<size_t> batchs = {4, 3, 1};

	PackedSequence<float*> p{packed.data(), batchs.data()} ;

	const auto length_b = batch_to_length(batchs.cbegin(), batchs.cend());
	for (size_t i = 0; i < length_b.size(); ++i) {
	assert(lens[i] == length_b[i]);
	}

	const auto batchs_l = batch_to_length(lens.cbegin(), lens.cend());
	for (size_t i = 0; i < batchs_l.size(); ++i) {
	assert(batchs[i] == batchs_l[i]);
	}
	}

	int main() {
	const float ra[] = {1,1,1,1,1};
	const float rb[] = {1,1,2,2,1,3,3};
	thrust::host_vector<float> a(ra, ra + 5);
	thrust::host_vector<float> b(rb, rb + 7);

	thrust::device_vector<float> da = a;
	thrust::device_vector<float> db = b;
	size_t alen = a.size() + 1;
	size_t blen = b.size() + 1;
	thrust::device_vector<float> dpmat(alen, blen);
	const auto at = make_index(alen, blen);
	dtw_cuda<<<1, 1>>>(thrust::raw_pointer_cast(da.data()),
	thrust::raw_pointer_cast(db.data()),
	thrust::raw_pointer_cast(dpmat.data()),
	at);


	thrust::host_vector<float> result(dpmat.size());
	thrust::copy(dpmat.begin(), dpmat.end(), result.begin());

	std::cout << result[at(alen-1, blen-1)] << std::endl;;

	for (size_t i = 0; i < alen; ++i) {
	for (size_t j = 0; j < blen; ++j) {
	std::cout << result[at(i, j)] << ", ";
	}
	std::cout << std::endl;
	}
	std::cout << std::endl;

	std::cout << dtw<L1>(a, b) << " == 6 ?" << std::endl;
	std::cout << dtw<L2>(a, b) << " == 13.3006 ?" << std::endl;
	test_packed_sequence();
	}