Skip to content

Instantly share code, notes, and snippets.

@csullivan
Last active October 1, 2018 22:44
Show Gist options
  • Save csullivan/36d2fd2eebd55788dac2b8c25315784d to your computer and use it in GitHub Desktop.
diff --git a/src/ngraph/runtime/cpu/cpu_external_function.cpp b/src/ngraph/runtime/cpu/cpu_external_function.cpp
index bc30f4d1..4fbd85aa 100644
--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -22,6 +22,7 @@
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
+#include <algorithm>
// Kill clang diagnostics bug
#pragma clang diagnostic push
@@ -1122,6 +1123,14 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
} while (propagate_further);
}
+static void print_cpu_f32_tensor(const void* p, size_t element_count, size_t element_size) // Debug helper: dump the first element_count values of a CPU tensor to stdout as float32.
+{
+ std::vector<float> local(element_count); // host-side staging copy; assumes the tensor holds f32 (element_size == 4) -- TODO confirm at call sites
+ size_t size_in_bytes = element_size * element_count;
+ memcpy(local.data(), p, size_in_bytes); // raw byte copy; p must point at least size_in_bytes of readable memory
+ std::cout << "{" << join(local) << "}" << std::endl; // join() is an ngraph utility; presumably comma-separates -- verify against ngraph/util.hpp
+}
+
void runtime::cpu::CPU_ExternalFunction::build()
{
if (m_is_built)
@@ -1136,23 +1145,23 @@ void runtime::cpu::CPU_ExternalFunction::build()
// nv_cwi is required only by some frontends
// in which case they should run this pass(CPUWorkspaceInsertion) explicitly
NodeVector nv_cwi;
- pass_manager.register_pass<ngraph::pass::NopElimination>();
+ //pass_manager.register_pass<ngraph::pass::NopElimination>();
// TODO (pruthvi): Enable all the disabeled RNN fusion graph pass after fixing
// failing mxnet unit tests.
// pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>();
// pass_manager.register_pass<runtime::cpu::pass::RNNFusion>();
// pass_manager.register_pass<runtime::cpu::pass::ConcatInputs>();
- pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
- pass_manager.register_pass<runtime::cpu::pass::CPUBatchFusion>();
- pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
- pass_manager.register_pass<ngraph::pass::CoreFusion>();
- pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
- pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>();
- pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi);
- pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
+ // pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
+ // pass_manager.register_pass<runtime::cpu::pass::CPUBatchFusion>();
+ // pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
+ // pass_manager.register_pass<ngraph::pass::CoreFusion>();
+ // pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
+ // pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>();
+ // pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi);
+ // pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
- pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
- pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
+ // pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
+ // pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(size_t(s_memory_pool_alignment), true);
pass_manager.run_passes(m_function, false);
@@ -1353,7 +1362,47 @@ void runtime::cpu::CPU_ExternalFunction::build()
enable_nodename_list.emplace_back(make_pair(enable, node->get_name()));
}
- executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
+
+ //std::function<void(CPURuntimeContext*)>
+ std::vector<std::string> node_names;
+ std::vector<std::vector<void*>> input_tensors;
+ std::vector<std::vector<size_t>> input_tensor_sizes;
+ std::vector<std::vector<void*>> output_tensors;
+ std::vector<std::vector<size_t>> output_tensor_sizes;
+ for (shared_ptr<Node> node : m_function->get_ordered_ops())
+ {
+ node_names.push_back(node->get_name());
+ if (node->is_parameter() || node->is_constant())
+ {
+ continue;
+ }
+ std::vector<void*> tensors;
+ std::vector<size_t> sizes;
+ for (const descriptor::Input& input : node->get_inputs())
+ {
+ const descriptor::Output& output = input.get_output();
+ shared_ptr<descriptor::TensorView> tv = output.get_tensor_ptr();
+ tensors.push_back(&tensor_data[tv->get_name()]);
+ sizes.push_back(tv->size());
+ }
+ input_tensors.push_back(tensors);
+ input_tensor_sizes.push_back(sizes);
+ tensors.clear();
+ sizes.clear();
+
+ for (const descriptor::Output& output : node->get_outputs())
+ {
+ shared_ptr<descriptor::TensorView> tv = output.get_tensor_ptr();
+ tensors.push_back(&tensor_data[tv->get_name()]);
+ sizes.push_back(tv->size());
+ }
+ output_tensors.push_back(tensors);
+ output_tensor_sizes.push_back(sizes);
+ }
+
+
+
+ executor = [&,node_names, input_tensors, input_tensor_sizes, output_tensors, output_tensor_sizes](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
cpu::Timestamp start_ts;
int profiler_count = 0;
@@ -1480,6 +1529,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
}
else
{
+ size_t node_idx = 0;
for (const auto& p : enables)
{
if (p.first(ctx) || ctx->first_iteration)
@@ -1490,7 +1540,27 @@ void runtime::cpu::CPU_ExternalFunction::build()
{
start_ts = cpu::Clock::now();
}
+
+ std::cout << node_names.at(node_idx+1) << " inputs:" << std::endl;
+ for (size_t n = 0; n < input_tensors.at(node_idx).size(); n++)
+ {
+ void* tensor = *static_cast<void**>(input_tensors.at(node_idx)[n]);
+ auto& size = input_tensor_sizes.at(node_idx)[n];
+ print_cpu_f32_tensor(tensor, std::min<size_t>(size/4, 10), 4);
+ }
+ std::cout << std::endl;
+
(*functor)(ctx);
+ std::cout << node_names.at(node_idx+1) << " outputs:" << std::endl;
+ for (size_t n = 0; n < output_tensors.at(node_idx).size(); n++)
+ {
+ void* tensor = *static_cast<void**>(output_tensors.at(node_idx)[n]);
+ auto& size = output_tensor_sizes.at(node_idx)[n];
+ print_cpu_f32_tensor(tensor, std::min<size_t>(size/4, 10), 4);
+ }
+ std::cout << std::endl;
+
+
if (runtime::cpu::IsTracingEnabled())
{
ctx->op_durations[profiler_count++] =
@@ -1499,6 +1569,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
.count();
}
+ node_idx++;
std::advance(functor, 1);
}
}
@@ -1511,6 +1582,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
ctx->op_durations[profiler_count++] = 0;
}
}
+ node_idx++;
std::advance(functor, p.second);
}
}
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp
index 94a28c08..23935104 100644
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -23,6 +23,7 @@
#include <mutex>
#include <string>
#include <tuple>
+#include <algorithm>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
@@ -586,6 +587,26 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
emit_debug_function_entry(node.get());
}
+ auto print_tensor_values = [&](const std::shared_ptr<Node>& local_node, // Emits debug-print statements into the generated GPU source for this node's tensors.
+ const vector<GPU_TensorViewWrapper>& tensors,
+ std::string tensor_kind) // label written before the values, e.g. "inputs" or "outputs"
+ {
+ if (local_node->get_arguments().size() > 0){ // only instrument nodes that have arguments (skips argument-less nodes)
+ m_writer << "std::cout << \"" << local_node->get_name() << " " << tensor_kind << ":\" << std::endl;\n";
+ for (auto& tensor : tensors)
+ {
+ // m_writer << "std::cout << \" " << tensor.get_name() << ":\" << std::endl;\n";
+ m_writer << "runtime::gpu::print_gpu_f32_tensor("
+ << tensor.get_name() << ", "
+ << std::min<size_t>(tensor.get_size(), 10) << ", " // cap the dump at 10 elements per tensor
+ << tensor.get_element_type().size() << ");\n";
+ }
+ m_writer << "std::cout << std::endl;\n";
+ }
+ }; // NOTE(review): captures m_writer by reference via [&]; used only within this emission pass
+
+ print_tensor_values(node, in, "inputs");
+
// Emit operation body
auto it = m_node_function_map.find(node.get());
if (it == m_node_function_map.end())
@@ -614,6 +635,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
{
emit_debug_function_exit(node.get());
}
+ print_tensor_values(node, out, "outputs");
}
}
m_writer.block_end(); // End generated function
@@ -642,11 +664,11 @@ void runtime::gpu::GPU_ExternalFunction::compile()
auto allocator = std::make_shared<runtime::gpu::GPUAllocator>(
m_shared_context->m_primitive_emitter->get_memory_allocator());
- m_pass_manager.register_pass<ngraph::pass::LikeReplacement>();
+ // m_pass_manager.register_pass<ngraph::pass::LikeReplacement>();
m_pass_manager
.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
- m_pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
+ // m_pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
m_pass_manager.register_pass<ngraph::pass::Liveness>();
m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
@@ -662,8 +684,8 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_pass_manager.register_pass<ngraph::pass::CommonFunctionCollection>(
femitter, m_node_function_map, common_function_string);
- string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
- m_pass_manager.register_pass<ngraph::pass::DumpSorted>(dump_filename);
+ // string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
+ // m_pass_manager.register_pass<ngraph::pass::DumpSorted>(dump_filename);
m_pass_manager.run_passes(m_function);
@csullivan
Copy link
Author

This patch applies cleanly on top of commit NervanaSystems/ngraph@76a0e18.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment