Last active: October 1, 2018 22:44
-
-
Save csullivan/36d2fd2eebd55788dac2b8c25315784d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/ngraph/runtime/cpu/cpu_external_function.cpp b/src/ngraph/runtime/cpu/cpu_external_function.cpp | |
index bc30f4d1..4fbd85aa 100644 | |
--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp | |
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp | |
@@ -22,6 +22,7 @@ | |
#include <typeindex> | |
#include <typeinfo> | |
#include <unordered_map> | |
+#include <algorithm> | |
// Kill clang diagnostics bug | |
#pragma clang diagnostic push | |
@@ -1122,6 +1123,14 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output( | |
} while (propagate_further); | |
} | |
+static void print_cpu_f32_tensor(const void* p, size_t element_count, size_t element_size) | |
+{ | |
+ std::vector<float> local(element_count); | |
+ size_t size_in_bytes = element_size * element_count; | |
+ memcpy(local.data(), p, size_in_bytes); | |
+ std::cout << "{" << join(local) << "}" << std::endl; | |
+} | |
+ | |
void runtime::cpu::CPU_ExternalFunction::build() | |
{ | |
if (m_is_built) | |
@@ -1136,23 +1145,23 @@ void runtime::cpu::CPU_ExternalFunction::build() | |
// nv_cwi is required only by some frontends | |
// in which case they should run this pass(CPUWorkspaceInsertion) explicitly | |
NodeVector nv_cwi; | |
- pass_manager.register_pass<ngraph::pass::NopElimination>(); | |
+ //pass_manager.register_pass<ngraph::pass::NopElimination>(); | |
// TODO (pruthvi): Enable all the disabeled RNN fusion graph pass after fixing | |
// failing mxnet unit tests. | |
// pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>(); | |
// pass_manager.register_pass<runtime::cpu::pass::RNNFusion>(); | |
// pass_manager.register_pass<runtime::cpu::pass::ConcatInputs>(); | |
- pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>(); | |
- pass_manager.register_pass<runtime::cpu::pass::CPUBatchFusion>(); | |
- pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>(); | |
- pass_manager.register_pass<ngraph::pass::CoreFusion>(); | |
- pass_manager.register_pass<runtime::cpu::pass::CPUFusion>(); | |
- pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>(); | |
- pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi); | |
- pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this); | |
+ // pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>(); | |
+ // pass_manager.register_pass<runtime::cpu::pass::CPUBatchFusion>(); | |
+ // pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>(); | |
+ // pass_manager.register_pass<ngraph::pass::CoreFusion>(); | |
+ // pass_manager.register_pass<runtime::cpu::pass::CPUFusion>(); | |
+ // pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>(); | |
+ // pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi); | |
+ // pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this); | |
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this); | |
- pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>(); | |
- pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>(); | |
+ // pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>(); | |
+ // pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>(); | |
pass_manager.register_pass<ngraph::pass::Liveness>(); | |
pass_manager.register_pass<ngraph::pass::MemoryLayout>(size_t(s_memory_pool_alignment), true); | |
pass_manager.run_passes(m_function, false); | |
@@ -1353,7 +1362,47 @@ void runtime::cpu::CPU_ExternalFunction::build() | |
enable_nodename_list.emplace_back(make_pair(enable, node->get_name())); | |
} | |
- executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) { | |
+ | |
+ //std::function<void(CPURuntimeContext*)> | |
+ std::vector<std::string> node_names; | |
+ std::vector<std::vector<void*>> input_tensors; | |
+ std::vector<std::vector<size_t>> input_tensor_sizes; | |
+ std::vector<std::vector<void*>> output_tensors; | |
+ std::vector<std::vector<size_t>> output_tensor_sizes; | |
+ for (shared_ptr<Node> node : m_function->get_ordered_ops()) | |
+ { | |
+ node_names.push_back(node->get_name()); | |
+ if (node->is_parameter() || node->is_constant()) | |
+ { | |
+ continue; | |
+ } | |
+ std::vector<void*> tensors; | |
+ std::vector<size_t> sizes; | |
+ for (const descriptor::Input& input : node->get_inputs()) | |
+ { | |
+ const descriptor::Output& output = input.get_output(); | |
+ shared_ptr<descriptor::TensorView> tv = output.get_tensor_ptr(); | |
+ tensors.push_back(&tensor_data[tv->get_name()]); | |
+ sizes.push_back(tv->size()); | |
+ } | |
+ input_tensors.push_back(tensors); | |
+ input_tensor_sizes.push_back(sizes); | |
+ tensors.clear(); | |
+ sizes.clear(); | |
+ | |
+ for (const descriptor::Output& output : node->get_outputs()) | |
+ { | |
+ shared_ptr<descriptor::TensorView> tv = output.get_tensor_ptr(); | |
+ tensors.push_back(&tensor_data[tv->get_name()]); | |
+ sizes.push_back(tv->size()); | |
+ } | |
+ output_tensors.push_back(tensors); | |
+ output_tensor_sizes.push_back(sizes); | |
+ } | |
+ | |
+ | |
+ | |
+ executor = [&,node_names, input_tensors, input_tensor_sizes, output_tensors, output_tensor_sizes](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) { | |
cpu::Timestamp start_ts; | |
int profiler_count = 0; | |
@@ -1480,6 +1529,7 @@ void runtime::cpu::CPU_ExternalFunction::build() | |
} | |
else | |
{ | |
+ size_t node_idx = 0; | |
for (const auto& p : enables) | |
{ | |
if (p.first(ctx) || ctx->first_iteration) | |
@@ -1490,7 +1540,27 @@ void runtime::cpu::CPU_ExternalFunction::build() | |
{ | |
start_ts = cpu::Clock::now(); | |
} | |
+ | |
+ std::cout << node_names.at(node_idx+1) << " inputs:" << std::endl; | |
+ for (size_t n = 0; n < input_tensors.at(node_idx).size(); n++) | |
+ { | |
+ void* tensor = *static_cast<void**>(input_tensors.at(node_idx)[n]); | |
+ auto& size = input_tensor_sizes.at(node_idx)[n]; | |
+ print_cpu_f32_tensor(tensor, std::min<size_t>(size/4, 10), 4); | |
+ } | |
+ std::cout << std::endl; | |
+ | |
(*functor)(ctx); | |
+ std::cout << node_names.at(node_idx+1) << " outputs:" << std::endl; | |
+ for (size_t n = 0; n < output_tensors.at(node_idx).size(); n++) | |
+ { | |
+ void* tensor = *static_cast<void**>(output_tensors.at(node_idx)[n]); | |
+ auto& size = output_tensor_sizes.at(node_idx)[n]; | |
+ print_cpu_f32_tensor(tensor, std::min<size_t>(size/4, 10), 4); | |
+ } | |
+ std::cout << std::endl; | |
+ | |
+ | |
if (runtime::cpu::IsTracingEnabled()) | |
{ | |
ctx->op_durations[profiler_count++] = | |
@@ -1499,6 +1569,7 @@ void runtime::cpu::CPU_ExternalFunction::build() | |
.count(); | |
} | |
+ node_idx++; | |
std::advance(functor, 1); | |
} | |
} | |
@@ -1511,6 +1582,7 @@ void runtime::cpu::CPU_ExternalFunction::build() | |
ctx->op_durations[profiler_count++] = 0; | |
} | |
} | |
+ node_idx++; | |
std::advance(functor, p.second); | |
} | |
} | |
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp | |
index 94a28c08..23935104 100644 | |
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp | |
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp | |
@@ -23,6 +23,7 @@ | |
#include <mutex> | |
#include <string> | |
#include <tuple> | |
+#include <algorithm> | |
#include "ngraph/descriptor/input.hpp" | |
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp" | |
@@ -586,6 +587,26 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions() | |
emit_debug_function_entry(node.get()); | |
} | |
+ auto print_tensor_values = [&](const std::shared_ptr<Node>& local_node, | |
+ const vector<GPU_TensorViewWrapper>& tensors, | |
+ std::string tensor_kind) | |
+ { | |
+ if (local_node->get_arguments().size() > 0){ | |
+ m_writer << "std::cout << \"" << local_node->get_name() << " " << tensor_kind << ":\" << std::endl;\n"; | |
+ for (auto& tensor : tensors) | |
+ { | |
+ // m_writer << "std::cout << \" " << tensor.get_name() << ":\" << std::endl;\n"; | |
+ m_writer << "runtime::gpu::print_gpu_f32_tensor(" | |
+ << tensor.get_name() << ", " | |
+ << std::min<size_t>(tensor.get_size(), 10) << ", " | |
+ << tensor.get_element_type().size() << ");\n"; | |
+ } | |
+ m_writer << "std::cout << std::endl;\n"; | |
+ } | |
+ }; | |
+ | |
+ print_tensor_values(node, in, "inputs"); | |
+ | |
// Emit operation body | |
auto it = m_node_function_map.find(node.get()); | |
if (it == m_node_function_map.end()) | |
@@ -614,6 +635,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions() | |
{ | |
emit_debug_function_exit(node.get()); | |
} | |
+ print_tensor_values(node, out, "outputs"); | |
} | |
} | |
m_writer.block_end(); // End generated function | |
@@ -642,11 +664,11 @@ void runtime::gpu::GPU_ExternalFunction::compile() | |
auto allocator = std::make_shared<runtime::gpu::GPUAllocator>( | |
m_shared_context->m_primitive_emitter->get_memory_allocator()); | |
- m_pass_manager.register_pass<ngraph::pass::LikeReplacement>(); | |
+ // m_pass_manager.register_pass<ngraph::pass::LikeReplacement>(); | |
m_pass_manager | |
.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>(); | |
- m_pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this); | |
+ // m_pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this); | |
m_pass_manager.register_pass<ngraph::pass::Liveness>(); | |
m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment); | |
@@ -662,8 +684,8 @@ void runtime::gpu::GPU_ExternalFunction::compile() | |
m_pass_manager.register_pass<ngraph::pass::CommonFunctionCollection>( | |
femitter, m_node_function_map, common_function_string); | |
- string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt"); | |
- m_pass_manager.register_pass<ngraph::pass::DumpSorted>(dump_filename); | |
+ // string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt"); | |
+ // m_pass_manager.register_pass<ngraph::pass::DumpSorted>(dump_filename); | |
m_pass_manager.run_passes(m_function); | |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
This patch applies on top of commit NervanaSystems/ngraph@76a0e18.