Last active
December 3, 2015 08:05
-
-
Save Riyaaaaa/b8b87cf180d5f9892ef5 to your computer and use it in GitHub Desktop.
C++ AMPによるGPGPU入門 ref: http://qiita.com/Riyaaaa_a/items/40054b893e70b54f5a26
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <amp.h> | |
using namespace concurrency; | |
int main() { | |
int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; | |
array_view<int> av(11, v); | |
parallel_for_each(av.get_extent(), [=](index<1> idx) restrict(amp) { | |
av[idx] += 1; | |
}); | |
for(unsigned int i = 0; i < av.get_extent().size(); i++) | |
std::cout << static_cast<char>(av(i)); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<iostream> | |
#include<algorithm> | |
#include<vector> | |
using concurrency::accelerator; | |
std::vector<accelerator> findAccelerators(){ | |
std::vector<accelerator> accels; | |
accels = accelerator::get_all(); | |
for(int i=0; i<accels.size(); i++){ | |
std::wcout << i+1 << "th device = " << accels[i].get_description() << "\n"; | |
} | |
//emulatorのアクセラレータを削除します | |
accels.erase(std::remove_if(accels.begin(),accels.end(),[](accelerator& accel){return accel.get_is_emulated();}),accels.end()); | |
return accels; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0th device = NVIDIA GeForce GTX 670 | |
1th device = Microsoft Basic Render Driver | |
2th device = Software Adapter | |
3th device = CPU accelerator | |
accelerator: NVIDIA GeForce GTX 670 | |
version of the accelerator: 720896 | |
memory: 1.98681 [GB] | |
is supporting double precision: yes | |
is attached to a display: yes | |
is supporting cpu shared memory: yes | |
4 | |
1 | |
5760/5760 | |
1 | |
----------------cpu calculation succeeded--------------- | |
score 1.94933[s] | |
-------------------parallel calculation---------------- | |
rows/cols 810/1440 | |
----------------gpu calculation succeeded--------------- | |
score 0.0314382[s] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Prints a summary of one accelerator: description, version, dedicated memory,
// and whether it supports double precision, has a display attached, and
// supports CPU shared memory.
//
// accel: the accelerator to describe; it is only read.
void getAccelDiscription(const accelerator& accel){
    // The C++ AMP reference documents get_dedicated_memory() as a size in
    // kilobytes. Convert with one consistent base (1024 * 1024) instead of
    // mixing 1024 and 1000, which skewed the reported figure.
    const double memoryInGB = accel.get_dedicated_memory() / 1024. / 1024.;

    std::wcout << "accelerator: " << accel.get_description() << std::endl;
    std::cout << "version of the accelerator: " << accel.get_version() << std::endl;
    std::cout << "memory: " << memoryInGB << " [GB]" << std::endl;
    std::cout << "is supporting double precision: " << (accel.get_supports_double_precision() ? "yes" : "no") << std::endl;
    std::cout << "is attached to a display: " << (accel.get_has_display() ? "yes" : "no") << std::endl;
    std::cout << "is supporting cpu shared memory: " << (accel.get_supports_cpu_shared_memory() ? "yes" : "no") << std::endl;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Returns an iterator to the element of accels with the largest
// get_dedicated_memory() value. For an empty vector the result is accels.end().
std::vector<accelerator>::iterator getBiggestMemoryAccelerator(std::vector<accelerator>& accels){
    // Strict "less than" on dedicated memory, as std::max_element expects.
    const auto hasLessMemory = [](const accelerator& lhs, const accelerator& rhs){
        return lhs.get_dedicated_memory() < rhs.get_dedicated_memory();
    };
    return std::max_element(accels.begin(), accels.end(), hasLessMemory);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
template < | |
typename _Value_type, | |
int _Rank | |
> | |
friend class array; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Forward declaration of the array_view class template; the rank defaults to 1
// when it is not given.
template <typename _Value_type, int _Rank = 1> class array_view;
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include"amp.h" | |
#include<array> | |
#include<iostream> | |
template<class T,int dim,class F> | |
void accessArray(concurrency::array<T,dim>& vGArray,F&& function){ | |
concurrency::array_view<T,dim> vGArrayView = vGArray; //concurrency::arrayのラッパーを作成 | |
function(vGArrayView); //array_viewはcpu側からアクセス可能 | |
} | |
template<class T, int dim> | |
std::unique_ptr<concurrency::array<T, dim>> createArray(const concurrency::accelerator& accel, int size) { | |
return std::make_unique<concurrency::array<T, dim>>(size, accel.get_default_view()); | |
} | |
int main() { | |
constexpr int dim = 1; | |
const int size = 100; | |
concurrency::accelerator accel = *getBiggestMemoryAccelerator(findAccelerators()); | |
auto vGArray(createArray<int, dim>(accel, size)); | |
accessArray(*vGArray, [&](auto& _array) { | |
for (int i = 0; i<size; i++)_array[i] = i; | |
}); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int main() { | |
constexpr int dim = 1; | |
const int size = 100; | |
std::array<int,size> arr; | |
concurrency::accelerator accel = *getBiggestMemoryAccelerator(findAccelerators()); | |
concurrency::extent<dim> ex; | |
ex[0] = size; | |
concurrency::array_view<int, dim> view(size,reinterpret_cast<int*>(&arr[0])); //iteratorに対応していないので仕方なくbegin()ではなくポインタをキャストしてます | |
parallel_for_each(accel.get_default_view(), | |
ex, | |
[=](concurrency::index<dim> gindex) restrict(amp) { | |
view[gindex] = 114514; //array_viewはコピーキャプチャ可能 | |
} | |
); | |
view.synchronize(); //メモリ参照元と同期します | |
for (int i = 0; i<size; i++) { | |
std::cout << arr[i] << ","; | |
} | |
std::cout << std::endl; | |
return 0; | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Forward declaration of the index class template, parameterised on its rank.
template <int _Rank> class index;
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
template<class T,int Rank,typename... Args> | |
T& accessArrayByIndex(const concurrency::array_view<T,Rank>& a,Args... indexes) restrict(amp) | |
{ | |
static_assert(sizeof...(indexes) == Rank,"number of index is incorrect"); | |
concurrency::index<Rank> idx(indexes...); | |
return a[idx]; | |
} | |
int main(void){ | |
constexpr int COLS=6,ROWS=4; | |
std::array<std::array<float,COLS>,ROWS> data={ | |
1,2,3,4,5,6, | |
7,8,9,10,11,12, | |
1,2,3,4,5,6, | |
7,8,9,10,11,12 | |
}; | |
concurrency::array_view<float,2> data_view(ROWS,COLS,reinterpret_cast<float*>(&data[0][0])); | |
//print data_view[3][2] by index<2>; | |
std::cout << accessArrayByIndex(data_view,3,2); | |
std::cout << std::endl; | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include"opencv_include.h" //これはopencvの環境構築用のヘッダです | |
#include"amp.h" | |
#include"index/tiled_index_modules.hpp" | |
#include<iostream> | |
#include<chrono> | |
// Loads "image_middle.jpg" as grayscale, averages it once on the CPU and once
// through convolutionCalculateAverage on the given accelerator, prints the
// elapsed time of each pass, and finally shows the accelerator result next to
// the loaded image.
//
// accel: accelerator passed on to convolutionCalculateAverage.
//
// If the image cannot be loaded, an error is written to std::cerr and the
// function returns without doing any work.
void image_processing_test(concurrency::accelerator& accel)
{
    cv::Mat input;
    cv::Mat_<float> gray,gray_cpu;
    input = cv::imread("image_middle.jpg",cv::IMREAD_GRAYSCALE);
    if (input.empty()) {
        // Nothing below is meaningful on an empty matrix.
        std::cerr << "failed to load image_middle.jpg" << std::endl;
        return;
    }
    input.convertTo(gray, CV_32FC1);
    input.convertTo(gray_cpu, CV_32FC1);

    // Scale the working copy from 0..255 down to 0..1.
    for (int rows = 0; rows < input.rows; rows++) {
        for (int cols = 0; cols < input.cols; cols++) {
            gray(rows, cols) /= 255.;
        }
    }

    // Layout diagnostics of the float image.
    std::cout << gray.elemSize1() << std::endl;
    std::cout << gray.channels() << std::endl;
    std::cout << gray.step << "/" << gray.elemSize() * gray.cols << std::endl;
    std::cout << gray.isContinuous() << std::endl;

    constexpr int convolution_size = 15;
    // Number of samples in the (2 * convolution_size + 1)^2 window, computed
    // with integer arithmetic instead of pow().
    constexpr int window_side = 2 * convolution_size + 1;
    constexpr float window_area = static_cast<float>(window_side * window_side);

    {
        // CPU pass: mean over the window around each pixel. A neighbour that
        // falls outside the image contributes the centre pixel instead.
        const auto now = std::chrono::steady_clock::now();
        for (int rows = 0; rows < gray.rows; rows++) {
            for (int cols = 0; cols < gray.cols; cols++) {
                // Accumulate as float from zero; pixels are read as floats,
                // not as single bytes of the underlying buffer.
                float sum = 0.f;
                const float center = gray(rows, cols);
                for (int y = -convolution_size; y <= convolution_size; y++) {
                    for (int x = -convolution_size; x <= convolution_size; x++) {
                        const int r = rows + y;
                        const int c = cols + x;
                        const bool inside = r >= 0 && r < gray.rows && c >= 0 && c < gray.cols;
                        sum += inside ? gray(r, c) : center;
                    }
                }
                gray_cpu(rows, cols) = sum / window_area;
            }
        }
        const auto after = std::chrono::steady_clock::now();
        std::cout << "----------------cpu calculation succeeded---------------" << std::endl;
        std::chrono::duration<double> diff = after - now;
        std::cout << "score " << diff.count() << "[s]" << std::endl;
    }

    {
        // Accelerator pass. The raw float buffer is handed over as one
        // rows x cols block, which assumes gray is stored continuously
        // (its isContinuous() value is printed above).
        const auto now = std::chrono::steady_clock::now();
        auto result = convolutionCalculateAverage<float, convolution_size*2, convolution_size*2>(reinterpret_cast<float*>(gray.data), input.rows, input.cols, accel);
        const auto after = std::chrono::steady_clock::now();
        std::cout << "----------------gpu calculation succeeded---------------" << std::endl;
        std::chrono::duration<double> diff = after - now;
        std::cout << "score " << diff.count() << "[s]" << std::endl;

        // Copy the averaged values back into gray for display.
        for (int rows = 0; rows < input.rows; rows++) {
            for (int cols = 0; cols < input.cols; cols++) {
                gray(rows, cols) = result[rows*input.cols + cols];
            }
        }
    }

    cv::namedWindow("window", CV_WINDOW_AUTOSIZE);
    cv::namedWindow("window2", CV_WINDOW_AUTOSIZE);
    cv::imshow("window", gray);
    cv::imshow("window2", input);
    cv::waitKey(0);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
template<typename T,int TILE_COLS, int TILE_ROWS> | |
std::unique_ptr<T[]> convolutionCalculateAverage(T* data, int rows, int cols,const concurrency::accelerator& accel) | |
{ | |
std::unique_ptr<T[]> average(new T[rows*cols]); | |
concurrency::array_view<T, 2> data_view(rows, cols, data); | |
concurrency::array_view<float, 2> average_view(rows, cols, reinterpret_cast<float*>(average.get())); | |
std::cout << "\n-------------------parallel calculation-----------------" << std::endl; | |
std::cout << "rows/cols " << rows << "/" << cols << std::endl; | |
average_view.discard_data(); | |
parallel_for_each( | |
data_view.get_extent().tile<TILE_ROWS, TILE_COLS>(), | |
[=](concurrency::tiled_index<TILE_ROWS, TILE_COLS> idx) restrict(amp) { | |
tile_static T nums[TILE_ROWS][TILE_COLS]; | |
nums[idx.local[1]][idx.local[0]] = data_view[idx.global]; | |
idx.barrier.wait(); | |
T sum=0; | |
for (int i = 0; i<TILE_ROWS; i++) { | |
for (int j = 0; j<TILE_COLS; j++) { | |
sum += nums[i][j]; | |
} | |
} | |
average_view[idx.global] = sum / static_cast<T>(TILE_ROWS*TILE_COLS); | |
} | |
); | |
average_view.synchronize(); | |
return std::move(average); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment