@csullivan
Created December 13, 2018 18:52
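
This patch instruments the nGraph GPU backend so that compiled functions tolerate host-resident input and output pointers: the generated entry function probes each pointer with cudaPointerGetAttributes, stages host buffers into freshly allocated device buffers before execution, and afterwards copies results back to the host and frees the staging buffers. It also adds running-total logging for argspace, workspace, and tensor allocations, plus a call trace (a per-op trace is left commented out).
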
diff --git a/src/ngraph/runtime/gpu/gpu_backend.cpp b/src/ngraph/runtime/gpu/gpu_backend.cpp
index 2f1472e4..f54d4ce0 100644
--- a/src/ngraph/runtime/gpu/gpu_backend.cpp
+++ b/src/ngraph/runtime/gpu/gpu_backend.cpp
@@ -162,6 +162,7 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
+ std::cout << "call " << func->get_name() << std::endl;
bool rc = true;
validate_call(func, outputs, inputs);
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp
index 71cdd614..8f0e00ff 100644
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -175,9 +175,39 @@ void runtime::gpu::GPU_ExternalFunction::emit_op(GPU_ExternalFunction* external_
const std::vector<GPUTensorWrapper>& args,
const std::vector<GPUTensorWrapper>& out)
{
+ if (node->is_parameter() || node->is_constant())
+ {
+ return; // parameters and constants have no kernels to emit
+ }
+
+
+
+ // writer.block_begin();
+ // // manifest calls
+ // writer << "std::cout << \" exec " << node->get_name() << "\" << std::endl;\n";
+ // size_t i;
+ // i = 0;
+ // writer << "std::string origin;\n";
+ // writer << "// inputs\n";
+ // for (auto& tensor : args)
+ // {
+ // writer << "origin = runtime::gpu::is_device_pointer(" << tensor.get_name() << ") ? \"DEVICE\" : \"HOST\";\n";
+ // writer << "std::cout << \" input " << i << ": [\" << origin << \"] " << node->get_argument(i)->get_name() << "[" << tensor.get_name() << "]" << "\" << std::endl;\n";
+ // i++;
+ // }
+ // i = 0;
+ // writer << "// outputs\n";
+ // for (auto& tensor : out)
+ // {
+ // writer << "origin = runtime::gpu::is_device_pointer(" << tensor.get_name() << ") ? \"DEVICE\" : \"HOST\";\n";
+ // writer << "std::cout << \" output " << i << ": [\" << origin << \"] " << node->get_argument(i)->get_name() << "[" << tensor.get_name() << "]" << "\" << std::endl;\n";
+ // i++;
+ // }
+ // writer.block_end();
+
auto emit_function = GPU_Emitter::get_emit_function(*node);
emit_function(external_function, writer, node, args, out);
-};
+}
const size_t runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction::s_memory_pool_alignment = 64;
@@ -200,6 +230,8 @@ const string& runtime::gpu::GPU_ExternalFunction::get_pch_header_source()
{
static string s_pch_header_source = R"(
// Generated by the nGraph GPU backend
+#include <unordered_map>
+#include <cstring>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -423,6 +455,52 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
<< "gpu::GPURuntimeContext* ctx) __attribute__ ((optnone))\n";
m_writer.block_begin();
{
+ m_writer << "std::unordered_map<int, void*> input_map;\n";
+ m_writer << "std::unordered_map<int, void*> output_map;\n";
+ m_writer << "bool is_device;\n";
+ for (size_t i = 0; i < current_function->get_parameters().size(); i++)
+ {
+ m_writer << "is_device = runtime::gpu::is_device_pointer(inputs[" << i << "]);\n";
+ m_writer << "if (!is_device)\n";
+ m_writer.block_begin();
+ {
+ //m_writer << "std::cout << \"inputs[" << i << "] is on HOST\" << std::endl;\n";
+ m_writer << "if (input_map.count(" << i << ") == 0)\n";
+ m_writer.block_begin();
+ {
+ const auto& param = current_function->get_parameters().at(i);
+ auto size = shape_size(param->get_output_shape(0)) * param->get_element_type().size();
+ m_writer << "input_map.insert({" << i << ", inputs[" << i << "]});\n";
+ m_writer << "inputs[" << i << "] = runtime::gpu::create_gpu_buffer(" << size << ", " << "inputs[" << i << "]);\n";
+ }
+ m_writer.block_end();
+ }
+ m_writer.block_end();
+ }
+
+ for (size_t i = 0; i < current_function->get_results().size(); i++)
+ {
+ m_writer << "is_device = runtime::gpu::is_device_pointer(outputs[" << i << "]);\n";
+ m_writer << "if (!is_device)\n";
+ m_writer.block_begin();
+ {
+ //m_writer << "std::cout << \"outputs[" << i << "] is on HOST\" << std::endl;\n";
+ m_writer << "if (output_map.count(" << i << ") == 0)\n";
+ m_writer.block_begin();
+ {
+ const auto& res = current_function->get_results().at(i);
+ auto size = shape_size(res->get_output_shape(0)) * res->get_element_type().size();
+ m_writer << "output_map.insert({" << i << ", outputs[" << i << "]});\n";
+ m_writer << "outputs[" << i << "] = runtime::gpu::create_gpu_buffer(" << size << ", " << "outputs[" << i << "]);\n";
+ }
+ m_writer.block_end();
+ }
+ m_writer.block_end();
+ }
+
+
+
+
m_writer << "m_runtime_context = ctx;\n";
// set constant pointers during the first run
m_writer << "invoke_constant_mem_ptr();\n";
@@ -533,6 +611,44 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
emit_debug_function_exit(node.get());
}
}
+ m_writer << "static std::vector<size_t> inputs_size_in_bytes =\n";
+ m_writer.block_begin();
+ for (size_t i = 0; i < current_function->get_parameters().size(); i++)
+ {
+ const auto& param = current_function->get_parameters().at(i);
+ auto size = shape_size(param->get_output_shape(0)) * param->get_element_type().size();
+ m_writer << size << ",\n";
+ }
+ m_writer.block_end();
+ m_writer << ";\n";
+
+ m_writer << "for (auto& entry : input_map)\n";
+ m_writer.block_begin();
+ {
+ m_writer << "runtime::gpu::free_gpu_buffer(inputs[entry.first]);\n";
+ m_writer << "inputs[entry.first] = entry.second;\n";
+ }
+ m_writer.block_end();
+
+ m_writer << "static std::vector<size_t> outputs_size_in_bytes =\n";
+ m_writer.block_begin();
+ for (size_t i = 0; i < current_function->get_results().size(); i++)
+ {
+ const auto& res = current_function->get_results().at(i);
+ auto size = shape_size(res->get_output_shape(0)) * res->get_element_type().size();
+ m_writer << size << ",\n";
+ }
+ m_writer.block_end();
+ m_writer << ";\n";
+
+ m_writer << "for (auto& entry : output_map)\n";
+ m_writer.block_begin();
+ {
+ m_writer << "runtime::gpu::cuda_memcpyDtH(entry.second, outputs[entry.first], outputs_size_in_bytes[entry.first]);\n";
+ m_writer << "runtime::gpu::free_gpu_buffer(outputs[entry.first]);\n";
+ m_writer << "outputs[entry.first] = entry.second;\n";
+ }
+ m_writer.block_end();
}
m_writer.block_end(); // End generated function
}
@@ -602,6 +718,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_writer << common_function_string << "\n";
emit_functions();
+ //std::cout << m_function_name << std::endl;
// allocate device buffers for primitive arguments and workspace
allocator->close();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
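
For reference, the host-staging pattern that the emitted code above implements looks roughly like the following when written out directly. This is a minimal standalone sketch using the plain CUDA runtime API instead of the backend's create_gpu_buffer/free_gpu_buffer wrappers (whose exact signatures are not shown in this diff); call_with_staging, pointer_is_on_device, and input_sizes are hypothetical names, and error handling is elided.

#include <cuda_runtime.h>
#include <unordered_map>

static bool pointer_is_on_device(const void* ptr)
{
    cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess)
    {
        cudaGetLastError(); // clear the sticky error raised for host pointers
        return false;
    }
    return attr.devicePointer != nullptr;
}

void call_with_staging(void** inputs, const size_t* input_sizes, size_t n_inputs)
{
    std::unordered_map<size_t, void*> input_map; // original host pointers
    for (size_t i = 0; i < n_inputs; i++)
    {
        if (!pointer_is_on_device(inputs[i]))
        {
            input_map.insert({i, inputs[i]});
            void* staged = nullptr;
            cudaMalloc(&staged, input_sizes[i]);
            cudaMemcpy(staged, inputs[i], input_sizes[i], cudaMemcpyHostToDevice);
            inputs[i] = staged; // the compiled function now sees a device pointer
        }
    }

    // ... run the compiled kernels against inputs here ...

    for (auto& entry : input_map)
    {
        cudaFree(inputs[entry.first]);      // release the staging buffer
        inputs[entry.first] = entry.second; // restore the caller's host pointer
    }
}
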
diff --git a/src/ngraph/runtime/gpu/gpu_invoke.cpp b/src/ngraph/runtime/gpu/gpu_invoke.cpp
index cc5d6470..9622d56d 100644
--- a/src/ngraph/runtime/gpu/gpu_invoke.cpp
+++ b/src/ngraph/runtime/gpu/gpu_invoke.cpp
@@ -16,6 +16,8 @@
#include <string>
+#include <cuda_runtime.h>
+
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
@@ -32,3 +34,24 @@ extern "C" void* ngraph::runtime::gpu::invoke_memory_primitive(const GPURuntimeC
{
return ctx->gpu_memory_primitives[primitive_index]();
}
+
+
+extern "C" bool is_device_pointer(const void *ptr)
+{
+ bool is_device_ptr = false;
+ cudaPointerAttributes attributes;
+ auto err = cudaPointerGetAttributes(&attributes, ptr);
+ if(err != cudaSuccess)
+ {
+ err = cudaGetLastError(); // consume the sticky error cudaPointerGetAttributes raises for plain host pointers
+ err = cudaGetLastError(); // defensive second clear; should now return cudaSuccess
+ return is_device_ptr;
+ }
+
+ if(attributes.devicePointer != nullptr)
+ {
+ is_device_ptr = true;
+ }
+
+ return is_device_ptr;
+}
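
A quick sanity check for the helper above (a hypothetical standalone test, assuming a CUDA device is present and the object file above is linked in):

#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

extern "C" bool is_device_pointer(const void* ptr);

int main()
{
    int host_value = 0;
    void* device_buffer = nullptr;
    cudaMalloc(&device_buffer, sizeof(int));

    assert(!is_device_pointer(&host_value));  // plain host pointer
    assert(is_device_pointer(device_buffer)); // cudaMalloc'd pointer

    cudaFree(device_buffer);
    std::printf("ok\n");
    return 0;
}

Note that on CUDA 11 and later, cudaPointerGetAttributes no longer returns an error for unregistered host pointers (it reports cudaMemoryTypeUnregistered instead), so the error-clearing branch above is only exercised on older toolkits such as the one this 2018 patch targets.
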
diff --git a/src/ngraph/runtime/gpu/gpu_invoke.hpp b/src/ngraph/runtime/gpu/gpu_invoke.hpp
index 67b28cfc..573ef54a 100644
--- a/src/ngraph/runtime/gpu/gpu_invoke.hpp
+++ b/src/ngraph/runtime/gpu/gpu_invoke.hpp
@@ -31,6 +31,7 @@ namespace ngraph
void** result);
extern "C" void* invoke_memory_primitive(const GPURuntimeContext* ctx,
size_t primitive_index);
+ extern "C" bool is_device_pointer(const void *ptr);
}
}
}
diff --git a/src/ngraph/runtime/gpu/gpu_memory_manager.cpp b/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
index 396b1dcf..924104f0 100644
--- a/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
+++ b/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
@@ -73,6 +73,9 @@ void runtime::gpu::GPUMemoryManager::allocate()
m_buffer_offset = ngraph::pass::MemoryManager::align(
m_buffer_offset, runtime::gpu::GPUMemoryManager::alignment);
// the back most node is always empty, fill it here
+ static double total = 0.0;
+ total += m_buffer_offset;
+ std::cout << "constants (argspace): " << total / 1024. / 1024. << " MiB" << std::endl;
m_argspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(m_buffer_offset);
m_argspace_mem.back().size = m_buffer_offset;
// copy buffered kernel arguments to device
@@ -88,6 +91,9 @@ void runtime::gpu::GPUMemoryManager::allocate()
auto workspace_size = m_workspace_manager->max_allocated();
if (workspace_size)
{
+ static double total = 0.0;
+ total += workspace_size;
+ std::cout << "workspace: " << total / 1024. / 1024. << " MiB" << std::endl;
m_workspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(workspace_size);
m_workspace_mem.back().size = workspace_size;
m_workspace_mem.push_back({nullptr, 0});
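
The allocation logging above relies on function-local static accumulators that persist across calls. A small helper of the same shape (hypothetical, not part of the patch) could deduplicate the repeated copies; the caller owns the static, so each allocation site keeps its own running total:

#include <cstddef>
#include <cstdio>

// Add `bytes` to the caller-owned accumulator and print the total in MiB.
static void log_running_total(const char* tag, double& total, size_t bytes)
{
    total += static_cast<double>(bytes);
    std::printf("%s: %.3f MiB\n", tag, total / 1024.0 / 1024.0);
}

// usage at an allocation site:
//   static double argspace_total = 0.0;
//   log_running_total("constants (argspace)", argspace_total, m_buffer_offset);
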
diff --git a/src/ngraph/runtime/gpu/gpu_tensor.cpp b/src/ngraph/runtime/gpu/gpu_tensor.cpp
index c61de7ac..cf6f3289 100644
--- a/src/ngraph/runtime/gpu/gpu_tensor.cpp
+++ b/src/ngraph/runtime/gpu/gpu_tensor.cpp
@@ -14,6 +14,7 @@
// limitations under the License.
//*****************************************************************************
+#include <algorithm>
#include <memory>
#include <cuda_runtime.h>
@@ -23,6 +24,8 @@
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
+#include "ngraph/util.hpp"
+
using namespace ngraph;
using namespace std;
@@ -37,6 +40,7 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
std::make_shared<ngraph::descriptor::layout::DenseTensorLayout>(*m_descriptor));
m_buffer_size = shape_size(shape) * element_type.size();
+
if (memory_pointer != nullptr)
{
m_allocated_buffer_pool = memory_pointer;
@@ -44,6 +48,9 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
}
else if (m_buffer_size > 0)
{
+ static double total = 0.0;
+ total += m_buffer_size;
+ std::cout << "create_tensor: " << total /1024. /1024.<< " MiB" << std::endl;
m_allocated_buffer_pool = runtime::gpu::create_gpu_buffer(m_buffer_size);
}
}
@@ -60,10 +67,37 @@ runtime::gpu::GPUTensor::~GPUTensor()
runtime::gpu::free_gpu_buffer(m_allocated_buffer_pool);
}
}
+static bool is_device_pointer(const void *ptr) { // file-local duplicate of the helper added in gpu_invoke.cpp
+ bool is_device_ptr = false;
+ cudaPointerAttributes attributes;
+ auto err = cudaPointerGetAttributes(&attributes, ptr);
+ if(err != cudaSuccess)
+ {
+ err = cudaGetLastError(); // consume the sticky error cudaPointerGetAttributes raises for plain host pointers
+ err = cudaGetLastError(); // defensive second clear; should now return cudaSuccess
+ return is_device_ptr;
+ }
+
+ if(attributes.devicePointer != nullptr)
+ {
+ is_device_ptr = true;
+ }
+
+ return is_device_ptr;
+}
void runtime::gpu::GPUTensor::write(const void* source, size_t tensor_offset, size_t n)
{
- CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice));
+ if (is_device_pointer(source))
+ {
+ CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyDeviceToDevice));
+ }
+ else
+ {
+ CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice));
+ }
+ cudaGetLastError(); // defensively clear any lingering error state
+ cudaGetLastError();
}
void runtime::gpu::GPUTensor::read(void* target, size_t tensor_offset, size_t n) const
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment