@csullivan
Created December 13, 2018 18:52
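
This patch instruments the nGraph GPU backend so that compiled functions tolerate host-resident input and output pointers: the generated entry function probes each pointer with cudaPointerGetAttributes, stages host buffers into freshly allocated device buffers before execution, and afterwards copies results back to the host and frees the staging buffers. It also adds running-total logging for argspace, workspace, and tensor allocations, plus a call trace (a per-op trace is left commented out).
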
diff --git a/src/ngraph/runtime/gpu/gpu_backend.cpp b/src/ngraph/runtime/gpu/gpu_backend.cpp
index 2f1472e4..f54d4ce0 100644
--- a/src/ngraph/runtime/gpu/gpu_backend.cpp
+++ b/src/ngraph/runtime/gpu/gpu_backend.cpp
@@ -162,6 +162,7 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
+ std::cout << "call " << func->get_name() << std::endl;
bool rc = true;
validate_call(func, outputs, inputs);
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp
index 71cdd614..8f0e00ff 100644
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -175,9 +175,39 @@ void runtime::gpu::GPU_ExternalFunction::emit_op(GPU_ExternalFunction* external_
const std::vector<GPUTensorWrapper>& args,
const std::vector<GPUTensorWrapper>& out)
{
+ if (node->is_parameter() || node->is_constant())
+ {
+ return; // parameters and constants have no kernels to emit
+ }
+
+
+
+ // writer.block_begin();
+ // // manifest calls
+ // writer << "std::cout << \" exec " << node->get_name() << "\" << std::endl;\n";
+ // size_t i;
+ // i = 0;
+ // writer << "std::string origin;\n";
+ // writer << "// inputs\n";
+ // for (auto& tensor : args)
+ // {
+ // writer << "origin = runtime::gpu::is_device_pointer(" << tensor.get_name() << ") ? \"DEVICE\" : \"HOST\";\n";
+ // writer << "std::cout << \" input " << i << ": [\" << origin << \"] " << node->get_argument(i)->get_name() << "[" << tensor.get_name() << "]" << "\" << std::endl;\n";
+ // i++;
+ // }
+ // i = 0;
+ // writer << "// outputs\n";
+ // for (auto& tensor : out)
+ // {
+ // writer << "origin = runtime::gpu::is_device_pointer(" << tensor.get_name() << ") ? \"DEVICE\" : \"HOST\";\n";
+ // writer << "std::cout << \" output " << i << ": [\" << origin << \"] " << node->get_argument(i)->get_name() << "[" << tensor.get_name() << "]" << "\" << std::endl;\n";
+ // i++;
+ // }
+ // writer.block_end();
+
auto emit_function = GPU_Emitter::get_emit_function(*node);
emit_function(external_function, writer, node, args, out);
-};
+}
const size_t runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction::s_memory_pool_alignment = 64;
@@ -200,6 +230,8 @@ const string& runtime::gpu::GPU_ExternalFunction::get_pch_header_source()
{
static string s_pch_header_source = R"(
// Generated by the nGraph GPU backend
+#include <unordered_map>
+#include <cstring>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -423,6 +455,52 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
<< "gpu::GPURuntimeContext* ctx) __attribute__ ((optnone))\n";
m_writer.block_begin();
{
+ m_writer << "std::unordered_map<int, void*> input_map;\n";
+ m_writer << "std::unordered_map<int, void*> output_map;\n";
+ m_writer << "bool is_device;\n";
+ for (size_t i = 0; i < current_function->get_parameters().size(); i++)
+ {
+ m_writer << "is_device = runtime::gpu::is_device_pointer(inputs[" << i << "]);\n";
+ m_writer << "if (!is_device)\n";
+ m_writer.block_begin();
+ {
+ //m_writer << "std::cout << \"inputs[" << i << "] is on HOST\" << std::endl;\n";
+ m_writer << "if (input_map.count(" << i << ") == 0)\n";
+ m_writer.block_begin();
+ {
+ const auto& param = current_function->get_parameters().at(i);
+ auto size = shape_size(param->get_output_shape(0)) * param->get_element_type().size();
+ m_writer << "input_map.insert({" << i << ", inputs[" << i << "]});\n";
+ m_writer << "inputs[" << i << "] = runtime::gpu::create_gpu_buffer(" << size << ", " << "inputs[" << i << "]);\n";
+ }
+ m_writer.block_end();
+ }
+ m_writer.block_end();
+ }
+
+ for (size_t i = 0; i < current_function->get_results().size(); i++)
+ {
+ m_writer << "is_device = runtime::gpu::is_device_pointer(outputs[" << i << "]);\n";
+ m_writer << "if (!is_device)\n";
+ m_writer.block_begin();
+ {
+ //m_writer << "std::cout << \"outputs[" << i << "] is on HOST\" << std::endl;\n";
+ m_writer << "if (output_map.count(" << i << ") == 0)\n";
+ m_writer.block_begin();
+ {
+ const auto& res = current_function->get_results().at(i);
+ auto size = shape_size(res->get_output_shape(0)) * res->get_element_type().size();
+ m_writer << "output_map.insert({" << i << ", outputs[" << i << "]});\n";
+ m_writer << "outputs[" << i << "] = runtime::gpu::create_gpu_buffer(" << size << ", " << "outputs[" << i << "]);\n";
+ }
+ m_writer.block_end();
+ }
+ m_writer.block_end();
+ }
+
+
+
+
m_writer << "m_runtime_context = ctx;\n";
// set constant pointers during the first run
m_writer << "invoke_constant_mem_ptr();\n";
@@ -533,6 +611,44 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
emit_debug_function_exit(node.get());
}
}
+ m_writer << "static std::vector<size_t> inputs_size_in_bytes =\n";
+ m_writer.block_begin();
+ for (size_t i = 0; i < current_function->get_parameters().size(); i++)
+ {
+ const auto& param = current_function->get_parameters().at(i);
+ auto size = shape_size(param->get_output_shape(0)) * param->get_element_type().size();
+ m_writer << size << ",\n";
+ }
+ m_writer.block_end();
+ m_writer << ";\n";
+
+ m_writer << "for (auto& entry : input_map)\n";
+ m_writer.block_begin();
+ {
+ m_writer << "runtime::gpu::free_gpu_buffer(inputs[entry.first]);\n";
+ m_writer << "inputs[entry.first] = entry.second;\n";
+ }
+ m_writer.block_end();
+
+ m_writer << "static std::vector<size_t> outputs_size_in_bytes =\n";
+ m_writer.block_begin();
+ for (size_t i = 0; i < current_function->get_results().size(); i++)
+ {
+ const auto& res = current_function->get_results().at(i);
+ auto size = shape_size(res->get_output_shape(0)) * res->get_element_type().size();
+ m_writer << size << ",\n";
+ }
+ m_writer.block_end();
+ m_writer << ";\n";
+
+ m_writer << "for (auto& entry : output_map)\n";
+ m_writer.block_begin();
+ {
+ m_writer << "runtime::gpu::cuda_memcpyDtH(entry.second, outputs[entry.first], outputs_size_in_bytes[entry.first]);\n";
+ m_writer << "runtime::gpu::free_gpu_buffer(outputs[entry.first]);\n";
+ m_writer << "outputs[entry.first] = entry.second;\n";
+ }
+ m_writer.block_end();
}
m_writer.block_end(); // End generated function
}
@@ -602,6 +718,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_writer << common_function_string << "\n";
emit_functions();
+ //std::cout << m_function_name << std::endl;
// allocate device buffers for primitive arguments and workspace
allocator->close();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
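
For reference, the host-staging pattern that the emitted code above implements looks roughly like the following when written out directly. This is a minimal standalone sketch using the plain CUDA runtime API instead of the backend's create_gpu_buffer/free_gpu_buffer wrappers (whose exact signatures are not shown in this diff); call_with_staging, pointer_is_on_device, and input_sizes are hypothetical names, and error handling is elided.

#include <cuda_runtime.h>
#include <unordered_map>

static bool pointer_is_on_device(const void* ptr)
{
    cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess)
    {
        cudaGetLastError(); // clear the sticky error raised for host pointers
        return false;
    }
    return attr.devicePointer != nullptr;
}

void call_with_staging(void** inputs, const size_t* input_sizes, size_t n_inputs)
{
    std::unordered_map<size_t, void*> input_map; // original host pointers
    for (size_t i = 0; i < n_inputs; i++)
    {
        if (!pointer_is_on_device(inputs[i]))
        {
            input_map.insert({i, inputs[i]});
            void* staged = nullptr;
            cudaMalloc(&staged, input_sizes[i]);
            cudaMemcpy(staged, inputs[i], input_sizes[i], cudaMemcpyHostToDevice);
            inputs[i] = staged; // the compiled function now sees a device pointer
        }
    }

    // ... run the compiled kernels against inputs here ...

    for (auto& entry : input_map)
    {
        cudaFree(inputs[entry.first]);      // release the staging buffer
        inputs[entry.first] = entry.second; // restore the caller's host pointer
    }
}
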
diff --git a/src/ngraph/runtime/gpu/gpu_invoke.cpp b/src/ngraph/runtime/gpu/gpu_invoke.cpp
index cc5d6470..9622d56d 100644
--- a/src/ngraph/runtime/gpu/gpu_invoke.cpp
+++ b/src/ngraph/runtime/gpu/gpu_invoke.cpp
@@ -16,6 +16,8 @@
#include <string>
+#include <cuda_runtime.h>
+
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
@@ -32,3 +34,24 @@ extern "C" void* ngraph::runtime::gpu::invoke_memory_primitive(const GPURuntimeC
{
return ctx->gpu_memory_primitives[primitive_index]();
}
+
+
+extern "C" bool is_device_pointer(const void *ptr)
+{
+ bool is_device_ptr = false;
+ cudaPointerAttributes attributes;
+ auto err = cudaPointerGetAttributes(&attributes, ptr);
+ if(err != cudaSuccess)
+ {
+ err = cudaGetLastError(); // consume the sticky error cudaPointerGetAttributes raises for plain host pointers
+ err = cudaGetLastError(); // defensive second clear; should now return cudaSuccess
+ return is_device_ptr;
+ }
+
+ if(attributes.devicePointer != nullptr)
+ {
+ is_device_ptr = true;
+ }
+
+ return is_device_ptr;
+}
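
A quick sanity check for the helper above (a hypothetical standalone test, assuming a CUDA device is present and the object file above is linked in):

#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

extern "C" bool is_device_pointer(const void* ptr);

int main()
{
    int host_value = 0;
    void* device_buffer = nullptr;
    cudaMalloc(&device_buffer, sizeof(int));

    assert(!is_device_pointer(&host_value));  // plain host pointer
    assert(is_device_pointer(device_buffer)); // cudaMalloc'd pointer

    cudaFree(device_buffer);
    std::printf("ok\n");
    return 0;
}

Note that on CUDA 11 and later, cudaPointerGetAttributes no longer returns an error for unregistered host pointers (it reports cudaMemoryTypeUnregistered instead), so the error-clearing branch above is only exercised on older toolkits such as the one this 2018 patch targets.
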
diff --git a/src/ngraph/runtime/gpu/gpu_invoke.hpp b/src/ngraph/runtime/gpu/gpu_invoke.hpp
index 67b28cfc..573ef54a 100644
--- a/src/ngraph/runtime/gpu/gpu_invoke.hpp
+++ b/src/ngraph/runtime/gpu/gpu_invoke.hpp
@@ -31,6 +31,7 @@ namespace ngraph
void** result);
extern "C" void* invoke_memory_primitive(const GPURuntimeContext* ctx,
size_t primitive_index);
+ extern "C" bool is_device_pointer(const void *ptr);
}
}
}
diff --git a/src/ngraph/runtime/gpu/gpu_memory_manager.cpp b/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
index 396b1dcf..924104f0 100644
--- a/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
+++ b/src/ngraph/runtime/gpu/gpu_memory_manager.cpp
@@ -73,6 +73,9 @@ void runtime::gpu::GPUMemoryManager::allocate()
m_buffer_offset = ngraph::pass::MemoryManager::align(
m_buffer_offset, runtime::gpu::GPUMemoryManager::alignment);
// the back most node is always empty, fill it here
+ static double total = 0.0;
+ total += m_buffer_offset;
+ std::cout << "constants (argspace): " << total / 1024. / 1024. << " MiB" << std::endl;
m_argspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(m_buffer_offset);
m_argspace_mem.back().size = m_buffer_offset;
// copy buffered kernel arguments to device
@@ -88,6 +91,9 @@ void runtime::gpu::GPUMemoryManager::allocate()
auto workspace_size = m_workspace_manager->max_allocated();
if (workspace_size)
{
+ static double total = 0.0;
+ total += workspace_size;
+ std::cout << "workspace: " << total / 1024. / 1024. << " MiB" << std::endl;
m_workspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(workspace_size);
m_workspace_mem.back().size = workspace_size;
m_workspace_mem.push_back({nullptr, 0});
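
The allocation logging above relies on function-local static accumulators that persist across calls. A small helper of the same shape (hypothetical, not part of the patch) could deduplicate the repeated copies; the caller owns the static, so each allocation site keeps its own running total:

#include <cstddef>
#include <cstdio>

// Add `bytes` to the caller-owned accumulator and print the total in MiB.
static void log_running_total(const char* tag, double& total, size_t bytes)
{
    total += static_cast<double>(bytes);
    std::printf("%s: %.3f MiB\n", tag, total / 1024.0 / 1024.0);
}

// usage at an allocation site:
//   static double argspace_total = 0.0;
//   log_running_total("constants (argspace)", argspace_total, m_buffer_offset);
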
diff --git a/src/ngraph/runtime/gpu/gpu_tensor.cpp b/src/ngraph/runtime/gpu/gpu_tensor.cpp
index c61de7ac..cf6f3289 100644
--- a/src/ngraph/runtime/gpu/gpu_tensor.cpp
+++ b/src/ngraph/runtime/gpu/gpu_tensor.cpp
@@ -14,6 +14,7 @@
// limitations under the License.
//*****************************************************************************
+#include <algorithm>
#include <memory>
#include <cuda_runtime.h>
@@ -23,6 +24,8 @@
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
+#include "ngraph/util.hpp"
+
using namespace ngraph;
using namespace std;
@@ -37,6 +40,7 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
std::make_shared<ngraph::descriptor::layout::DenseTensorLayout>(*m_descriptor));
m_buffer_size = shape_size(shape) * element_type.size();
+
if (memory_pointer != nullptr)
{
m_allocated_buffer_pool = memory_pointer;
@@ -44,6 +48,9 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
}
else if (m_buffer_size > 0)
{
+ static double total = 0.0;
+ total += m_buffer_size;
+ std::cout << "create_tensor: " << total /1024. /1024.<< " MiB" << std::endl;
m_allocated_buffer_pool = runtime::gpu::create_gpu_buffer(m_buffer_size);
}
}
@@ -60,10 +67,37 @@ runtime::gpu::GPUTensor::~GPUTensor()
runtime::gpu::free_gpu_buffer(m_allocated_buffer_pool);
}
}
+static bool is_device_pointer(const void *ptr) { // file-local duplicate of the helper added in gpu_invoke.cpp
+ bool is_device_ptr = false;
+ cudaPointerAttributes attributes;
+ auto err = cudaPointerGetAttributes(&attributes, ptr);
+ if(err != cudaSuccess)
+ {
+ err = cudaGetLastError(); // consume the sticky error cudaPointerGetAttributes raises for plain host pointers
+ err = cudaGetLastError(); // defensive second clear; should now return cudaSuccess
+ return is_device_ptr;
+ }
+
+ if(attributes.devicePointer != nullptr)
+ {
+ is_device_ptr = true;
+ }
+
+ return is_device_ptr;
+}
void runtime::gpu::GPUTensor::write(const void* source, size_t tensor_offset, size_t n)
{
- CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice));
+ if (is_device_pointer(source))
+ {
+ CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyDeviceToDevice));
+ }
+ else
+ {
+ CUDA_RT_SAFE_CALL(cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice));
+ }
+ cudaGetLastError(); // defensively clear any lingering error state
+ cudaGetLastError();
}
void runtime::gpu::GPUTensor::read(void* target, size_t tensor_offset, size_t n) const
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment