diff --git a/src/ngraph/runtime/gpu/gpu_backend.cpp b/src/ngraph/runtime/gpu/gpu_backend.cpp
index 2f1472e4..f54d4ce0 100644
--- a/src/ngraph/runtime/gpu/gpu_backend.cpp
+++ b/src/ngraph/runtime/gpu/gpu_backend.cpp
@@ -162,6 +162,7 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
+ std::cout << "call " << func->get_name() << std::endl;
bool rc = true;
validate_call(func, outputs, inputs);
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp
index 71cdd614..8f0e00ff 100644
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -175,9 +175,39 @@ void runtime::gpu::GPU_ExternalFunction::emit_op(GPU_ExternalFunction* external_
const std::vector<GPUTensorWrapper>& args,
const std::vector<GPUTensorWrapper>& out)
{
+ if (node->is_parameter() || node->is_constant())
+ {
+ return;
+ }
+
+
+
+ // writer.block_begin();
+ // // manifest calls
+ // writer << "std::cout << \" exec " << node->get_name() << "\" << std::endl;\n";
+ // size_t i;
+ // i = 0;
+ // writer << "std::string origin;\n";
+ // writer << "// inputs\n";
+ // for (auto& tensor : args)
+ // {
+ // writer << "origin = runtime::gpu::is_device_pointer(" << tensor.get_name() << ") ? \"DEVICE\" : \"HOST\";\n";
+ // writer << "std::cout << \" input " << i << ": [\" << origin << \"] " << node->get_argument(i)->get_name() << "[" << tensor.get_name() << "]" << "\" << std::endl;\n";
+ // i++;
+ // }
+ // i = 0;
+ // writer << "// outputs\n";
+ // for (auto& tensor : out)
+ // {
+ // writer << "origin = runtime::gpu::is_device_pointer(" << tensor.get_name() << ") ? \"DEVICE\" : \"HOST\";\n";
+ // writer << "std::cout << \" output " << i << ": [\" << origin << \"] " << node->get_argument(i)->get_name() << "[" << tensor.get_name() << "]" << "\" << std::endl;\n";
+ // i++;
+ // }
+ // writer.block_end();
+
auto emit_function = GPU_Emitter::get_emit_function(*node);
emit_function(external_function, writer, node, args, out);
-};
+}
const size_t runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction::s_memory_pool_alignment = 64;
@@ -200,6 +230,8 @@ const string& runtime::gpu::GPU_ExternalFunction::get_pch_header_source()
{
static string s_pch_header_source = R"(
// Generated by the nGraph GPU backend
+#include <unordered_map>
+#include <cstring>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -423,6 +455,52 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
<< "gpu::GPURuntimeContext* ctx) __attribute__ ((optnone))\n";
m_writer.block_begin();
{
+ m_writer << "std::unordered_map<int, void*> input_map;\n";
+ m_writer << "std::unordered_map<int, void*> output_map;\n";
+ m_writer << "bool is_device;\n";
+ for (auto i = 0; i < current_function->get_parameters().size(); i++)
+ {
+ m_writer << "is_device = runtime::gpu::is_device_pointer(inputs[" << i << "]);\n";
+ m_writer << "if (!is_device)\n";
+ m_writer.block_begin();
+ {
+ //m_writer << "std::cout << \"inputs[" << i << "] is on HOST\" << std::endl;\n";
+ m_writer << "if (input_map.count(" << i << ") == 0)\n";
+ m_writer.block_begin();
+ {
+ const auto& param = current_function->get_parameters().at(i);
+ auto size = shape_size(param->get_output_shape(0)) * param->get_element_type().size();
+ m_writer << "input_map.insert({" << i << ", inputs[" << i << "]});\n";
+ m_writer << "inputs[" << i << "] = runtime::gpu::create_gpu_buffer(" << size << ", " << "inputs[" << i << "]);\n";
+ }
+ m_writer.block_end();
+ }
+ m_writer.block_end();
+ }
+
+ for (auto i = 0; i < current_function->get_results().size(); i++)
+ {
+ m_writer << "is_device = runtime::gpu::is_device_pointer(outputs[" << i << "]);\n";
+ m_writer << "if (!is_device)\n";
+ m_writer.block_begin();
+ {
+ //m_writer << "std::cout << \"outputs[" << i << "] is on HOST\" << std::endl;\n";
+ m_writer << "if (output_map.count(" << i << ") == 0)\n";
+ m_writer.block_begin();
+ {
+ const auto& res = current_function->get_results().at(i);
+ auto size = shape_size(res->get_output_shape(0)) * res->get_element_type().size();
+ m_writer << "output_map.insert({" << i << ", outputs[" << i << "]});\n";
+ m_writer << "outputs[" << i << "] = runtime::gpu::create_gpu_buffer(" << size << ", " << "outputs[" << i << "]);\n";
+ }
+ m_writer.block_end();
+ }
+ m_writer.block_end();
+ }
+
+
+
+
m_writer << "m_runtime_context = ctx;\n";
// set constant pointers during the first run
m_writer << "invoke_constant_mem_ptr();\n";
@@ -533,6 +611,44 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
emit_debug_function_exit(node.get());
}
}
+ m_writer << "static std::vector<size_t> inputs_size_in_bytes =\n";
+ m_writer.block_begin();
+ for (auto i = 0; i < current_function->get_parameters().size(); i++)
+ {
+ const auto& param = current_function->get_parameters().at(i);
+ auto size = shape_size(param->get_output_shape(0)) * param->get_element_type().size();
+ m_writer << size << ",\n";
+ }
+ m_writer.block_end();
+ m_writer << ";\n";
+
+ m_writer << "for (auto& entry : input_map)\n";
+ m_writer.block_begin();
+ {
+ m_writer << "runtime::gpu::free_gpu_buffer(inputs[entry.first]);\n";
+ m_writer << "inputs[entry.first] = entry.second;\n";
+ }
+ m_writer.block_end();
+
+ m_writer << "static std::vector<size_t> outputs_size_in_bytes =\n";
+ m_writer.block_begin();
+ for (auto i = 0; i < current_function->get_results().size(); i++)
+ {
+ const auto& res = current_function->get_results().at(i);
+ auto size = shape_size(res->get_output_shape(0)) * res->get_element_type().size();
+ m_writer << size << ",\n";
+ }
+ m_writer.block_end();
+ m_writer << ";\n";
+
+ m_writer << "for (auto& entry : output_map)\n";
+ m_writer.block_begin();
+ {
+ m_writer << "runtime::gpu::cuda_memcpyDtH(entry.second, outputs[entry.first], outputs_size_in_bytes[entry.first]);\n";
+ m_writer << "runtime::gpu::free_gpu_buffer(outputs[entry.first]);\n";
+ m_writer << "outputs[entry.first] = entry.second;\n";
+ }
+ m_writer.block_end();
}
m_writer.block_end(); // End generated function
}
@@ -602,6 +718,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_writer << common_function_string << "\n";
emit_functions();
+ //std::cout << m_function_name << std::endl;
// allocate device buffers for primitive arguments and workspace
allocator->close();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
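The emit_functions() hunks above make the generated entry point tolerant of host-resident arguments: each input or output pointer that is not already on the device is recorded in a map, replaced by a freshly allocated GPU buffer for the duration of the call (create_gpu_buffer(size, ptr) appears to allocate a device buffer and initialize it from ptr), and restored afterwards, with a device-to-host copy for the outputs. Below is a minimal hand-written sketch of the same staging scheme in plain CUDA runtime calls; staged_call, run, and on_device are illustrative names for this sketch, not part of the patch.

#include <cuda_runtime.h>
#include <cstddef>
#include <unordered_map>
#include <vector>

// Same classification trick as the patch's is_device_pointer().
static bool on_device(const void* p)
{
    cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, p) != cudaSuccess)
    {
        cudaGetLastError(); // clear the sticky error left by plain host pointers
        return false;
    }
    return attr.devicePointer != nullptr;
}

// Swap host pointers for temporary device buffers around a call, then
// restore them, copying results back for the outputs.
static void staged_call(std::vector<void*>& inputs,
                        const std::vector<size_t>& in_bytes,
                        std::vector<void*>& outputs,
                        const std::vector<size_t>& out_bytes,
                        void (*run)(void** in, void** out))
{
    std::unordered_map<size_t, void*> in_map;
    std::unordered_map<size_t, void*> out_map;
    for (size_t i = 0; i < inputs.size(); i++)
    {
        if (!on_device(inputs[i]))
        {
            void* dev = nullptr;
            cudaMalloc(&dev, in_bytes[i]);
            cudaMemcpy(dev, inputs[i], in_bytes[i], cudaMemcpyHostToDevice);
            in_map.insert({i, inputs[i]}); // remember the original host pointer
            inputs[i] = dev;
        }
    }
    for (size_t i = 0; i < outputs.size(); i++)
    {
        if (!on_device(outputs[i]))
        {
            void* dev = nullptr;
            cudaMalloc(&dev, out_bytes[i]);
            out_map.insert({i, outputs[i]});
            outputs[i] = dev;
        }
    }
    run(inputs.data(), outputs.data());
    for (auto& entry : in_map)
    {
        cudaFree(inputs[entry.first]);
        inputs[entry.first] = entry.second;
    }
    for (auto& entry : out_map)
    {
        cudaMemcpy(entry.second, outputs[entry.first], out_bytes[entry.first],
                   cudaMemcpyDeviceToHost);
        cudaFree(outputs[entry.first]);
        outputs[entry.first] = entry.second;
    }
}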
diff --git a/src/ngraph/runtime/gpu/gpu_invoke.cpp b/src/ngraph/runtime/gpu/gpu_invoke.cpp
index cc5d6470..9622d56d 100644
--- a/src/ngraph/runtime/gpu/gpu_invoke.cpp
+++ b/src/ngraph/runtime/gpu/gpu_invoke.cpp
@@ -16,6 +16,8 @@
#include <string>
+#include <cuda_runtime.h>
+
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
@@ -32,3 +34,24 @@ extern "C" void* ngraph::runtime::gpu::invoke_memory_primitive(const GPURuntimeC
{
return ctx->gpu_memory_primitives[primitive_index]();
}
+
+
+extern "C" bool is_device_pointer(const void *ptr)
+{
+ bool is_device_ptr = false;
+ cudaPointerAttributes attributes;
+ auto err = cudaPointerGetAttributes(&attributes, ptr);
+ if(err != cudaSuccess)
+ {
+ err = cudaGetLastError();
+ err = cudaGetLastError();
+ return is_device_ptr;
+ }
+
+ if(attributes.devicePointer != nullptr)
+ {
+ is_device_ptr = true;
+ }
+
+ return is_device_ptr;
+}
diff --git a/src/ngraph/runtime/gpu/gpu_invoke.hpp b/src/ngraph/runtime/gpu/gpu_invoke.hpp
index 67b28cfc..573ef54a 100644
--- a/src/ngraph/runtime/gpu/gpu_invoke.hpp
+++ b/src/ngraph/runtime/gpu/gpu_invoke.hpp
@@ -31,6 +31,7 @@ namespace ngraph
void** result);
extern "C" void* invoke_memory_primitive(const GPURuntimeContext* ctx,
size_t primitive_index);
+ extern "C" bool is_device_pointer(const void *ptr);
}
}
}
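The new is_device_pointer() helper leans on cudaPointerGetAttributes. For an ordinary host pointer that CUDA has never seen, pre-11 runtimes return cudaErrorInvalidValue and leave a sticky error behind, which the cudaGetLastError() calls clear (one call suffices; the patch calls it twice defensively). On CUDA 11+ the query succeeds for unregistered host memory but reports a null devicePointer, so the check still holds. A standalone usage sketch; main() and the test allocations are illustrative:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

static bool is_device_pointer(const void* ptr)
{
    cudaPointerAttributes attributes;
    if (cudaPointerGetAttributes(&attributes, ptr) != cudaSuccess)
    {
        // Unregistered host memory fails the query on older runtimes;
        // clear the sticky error so later CUDA calls are unaffected.
        cudaGetLastError();
        return false;
    }
    return attributes.devicePointer != nullptr;
}

int main()
{
    void* dev = nullptr;
    cudaMalloc(&dev, 64);
    void* host = std::malloc(64);

    std::printf("dev:  %d\n", is_device_pointer(dev));  // prints 1
    std::printf("host: %d\n", is_device_pointer(host)); // prints 0

    cudaFree(dev);
    std::free(host);
    return 0;
}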
diff --git a/src/ngraph/runtime/gpu/gpu_memory_manager.cpp b/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
index 396b1dcf..924104f0 100644
--- a/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
+++ b/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
@@ -73,6 +73,9 @@ void runtime::gpu::GPUMemoryManager::allocate()
m_buffer_offset = ngraph::pass::MemoryManager::align(
m_buffer_offset, runtime::gpu::GPUMemoryManager::alignment);
// the back most node is always empty, fill it here
+ static double total = 0.0;
+ total += m_buffer_offset;
+ std::cout << "constants (argspace): " << total / 1024. / 1024. << " MiB" << std::endl;
m_argspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(m_buffer_offset);
m_argspace_mem.back().size = m_buffer_offset;
// copy buffered kernel arguments to device
@@ -88,6 +91,9 @@ void runtime::gpu::GPUMemoryManager::allocate()
auto workspace_size = m_workspace_manager->max_allocated();
if (workspace_size)
{
+ static double total = 0.0;
+ total += workspace_size;
+ std::cout << "workspace: " << total / 1024. / 1024. << " MiB" << std::endl;
m_workspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(workspace_size);
m_workspace_mem.back().size = workspace_size;
m_workspace_mem.push_back({nullptr, 0});
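The memory-manager hunks use function-local statics to keep a running total of bytes handed out across repeated allocate() calls; each static lives in its own block, so the argspace and workspace tallies stay independent. The same idiom in isolation (names are illustrative):

#include <cstdio>
#include <cstddef>

static void log_allocation(size_t bytes)
{
    // A function-local static persists across calls, giving a running
    // total without any global bookkeeping.
    static double total = 0.0;
    total += static_cast<double>(bytes);
    std::printf("allocated so far: %.2f MiB\n", total / 1024.0 / 1024.0);
}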
diff --git a/src/ngraph/runtime/gpu/gpu_tensor.cpp b/src/ngraph/runtime/gpu/gpu_tensor.cpp
index c61de7ac..cf6f3289 100644
--- a/src/ngraph/runtime/gpu/gpu_tensor.cpp
+++ b/src/ngraph/runtime/gpu/gpu_tensor.cpp
@@ -14,6 +14,7 @@
// limitations under the License.
//*****************************************************************************
+#include <algorithm>
#include <memory>
#include <cuda_runtime.h>
@@ -23,6 +24,8 @@
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
+#include "ngraph/util.hpp"
+
using namespace ngraph;
using namespace std;
@@ -37,6 +40,7 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
std::make_shared<ngraph::descriptor::layout::DenseTensorLayout>(*m_descriptor));
m_buffer_size = shape_size(shape) * element_type.size();
+
if (memory_pointer != nullptr)
{
m_allocated_buffer_pool = memory_pointer;
@@ -44,6 +48,9 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
}
else if (m_buffer_size > 0)
{
+ static double total = 0.0;
+ total += m_buffer_size;
+ std::cout << "create_tensor: " << total / 1024. / 1024. << " MiB" << std::endl;
m_allocated_buffer_pool = runtime::gpu::create_gpu_buffer(m_buffer_size);
}
}
@@ -60,10 +67,37 @@ runtime::gpu::GPUTensor::~GPUTensor()
runtime::gpu::free_gpu_buffer(m_allocated_buffer_pool);
}
}
+static bool is_device_pointer(const void *ptr) {
+ bool is_device_ptr = false;
+ cudaPointerAttributes attributes;
+ auto err = cudaPointerGetAttributes(&attributes, ptr);
+ if(err != cudaSuccess)
+ {
+ err = cudaGetLastError();
+ err = cudaGetLastError();
+ return is_device_ptr;
+ }
+
+ if(attributes.devicePointer != nullptr)
+ {
+ is_device_ptr = true;
+ }
+
+ return is_device_ptr;
+}
void runtime::gpu::GPUTensor::write(const void* source, size_t tensor_offset, size_t n)
{
- CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice));
+ if (is_device_pointer(source))
+ {
+ CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyDeviceToDevice));
+ }
+ else
+ {
+ CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice));
+ }
+ cudaGetLastError();
+ cudaGetLastError();
}
void runtime::gpu::GPUTensor::read(void* target, size_t tensor_offset, size_t n) const |
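The gpu_tensor.cpp hunk makes GPUTensor::write() accept a source pointer on either side of the bus by classifying it first. On platforms with unified virtual addressing (standard on 64-bit CUDA), the runtime can also infer the copy direction itself, as in this sketch (write_buffer is an illustrative name, not part of the patch):

#include <cuda_runtime.h>
#include <cstddef>

// cudaMemcpyDefault asks the runtime to deduce host/device from the
// pointer values, subsuming the explicit is_device_pointer() dispatch.
static void write_buffer(void* device_dst, const void* src, size_t n)
{
    cudaMemcpy(device_dst, src, n, cudaMemcpyDefault);
}

The patch's explicit branch keeps the host/device decision observable, which fits the debugging focus of the rest of the diff.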