@csullivan
Created November 30, 2018 23:54
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp
index 71cdd614..e836f16b 100644
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -561,7 +561,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_shared_context->m_primitive_emitter->get_memory_allocator());
ngraph::pass::Manager pass_manager;
-#if CUDNN_VERSION >= 7200
+#if CUDNN_VERSION >= 9200
// recurrent network fusion
pass_manager.register_pass<runtime::gpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::gpu::pass::RNNFusion>();
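The only functional change in this file raises the cuDNN version guard, which keeps the recurrent-fusion passes above from being registered on the cuDNN 7.x builds current at the time. A minimal sketch of the version encoding this relies on, assuming cuDNN's usual pre-9 scheme (illustrative only, not part of the patch):

#include <cstdio>
#include <cudnn.h>

int main()
{
    // cudnn.h defines CUDNN_VERSION as MAJOR * 1000 + MINOR * 100 + PATCHLEVEL,
    // so 7200 corresponds to cuDNN 7.2.0; a guard of 9200 therefore compiles the
    // fusion passes out on any cuDNN 7.x install.
    std::printf("cuDNN %d.%d.%d -> CUDNN_VERSION %d\n",
                CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL, CUDNN_VERSION);
    return 0;
}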
diff --git a/src/ngraph/runtime/gpu/pass/gpu_layout.cpp b/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
index b29bb7ca..e42731e1 100644
--- a/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
+++ b/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
@@ -23,6 +23,7 @@
#include "gpu_layout.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/concat.hpp"
+#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/topk.hpp"
@@ -48,23 +49,29 @@ namespace ngraph
{
ngraph::replace_node(concat, first_arg);
}
- // else
- // {
- // bool is_broadcast = true;
- // for (auto& arg : concat->get_arguments())
- // {
- // if (arg != first_arg)
- // {
- // is_broadcast = false;
- // }
- // }
- // if (is_broadcast)
- // {
- // auto result_shape = concat->get_shape();
- // auto broadcast = std::make_shared<ngraph::op::Broadcast>(first_arg, result_shape, AxisSet{concat->get_concatenation_axis()});
- // ngraph::replace_node(concat, broadcast);
- // }
- // }
+ else
+ {
+ bool is_broadcast = true;
+ for (auto& arg : concat->get_arguments())
+ {
+ if (arg != first_arg)
+ {
+ is_broadcast = false;
+ }
+ }
+ if (is_broadcast && ngraph::shape_size(first_arg->get_shape()) == 1)
+ {
+ auto arg_shape = first_arg->get_shape();
+ auto axis = concat->get_concatenation_axis();
+ auto out_shape = arg_shape;
+ out_shape.erase(out_shape.begin() + axis);
+ auto reshape = std::make_shared<ngraph::op::Reshape>(first_arg, ngraph::get_default_order(arg_shape.size()), out_shape);
+
+ auto result_shape = concat->get_shape();
+ auto broadcast = std::make_shared<ngraph::op::Broadcast>(reshape, result_shape, AxisSet{axis});
+ ngraph::replace_node(concat, broadcast);
+ }
+ }
}
template <>
void GPULayout::LAYOUT_DECL(ngraph::op::ReplaceSlice)
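The reinstated else branch above handles a Concat whose arguments are all the same single-element tensor: the Reshape drops the concatenation axis (op::Broadcast expects its broadcast axes to be absent from the input shape), and the Broadcast then expands that element to the Concat's output shape. A standalone illustration of why the rewrite is value-preserving, in plain C++ rather than nGraph ops (names here are illustrative only):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const float x = 3.5f;
    const std::size_t n = 4;

    // Concatenating n copies of the one-element tensor {x} along an axis...
    std::vector<float> concat_result;
    for (std::size_t i = 0; i < n; ++i)
    {
        concat_result.push_back(x);
    }

    // ...yields the same values as broadcasting x to the concat's output shape.
    std::vector<float> broadcast_result(n, x);

    assert(concat_result == broadcast_result);
    return 0;
}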
diff --git a/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp b/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
index 29384380..e1b4feb6 100644
--- a/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
+++ b/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
@@ -50,7 +50,7 @@
#define RETURN_IF_FALSE(cond, message) \
if (!(cond)) \
{ \
- NGRAPH_DEBUG << message; \
+ NGRAPH_DEBUG << "[FAILURE] " << message; \
return false; \
}
@@ -77,20 +77,21 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
if (m.get_match_root()->get_element_type() != element::f32)
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
if (m.get_match_root()->get_outputs().size() != pattern_map[input]->get_outputs().size())
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< "input= " << pattern_map[input]->get_name() << "size dont match!";
return false;
}
auto sigmoid_node = std::make_shared<op::Sigmoid>(pattern_map[input]);
ngraph::replace_node(m.get_match_root(), sigmoid_node);
+ NGRAPH_DEBUG << "[SUCCESS] Fused op::Sigmoid";
return true;
};
@@ -134,9 +135,15 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
std::make_shared<op::Reshape>(weights_i2h, AxisVector{1, 0}, Shape{100, 400});
auto dot_1 = std::make_shared<op::Dot>(input_xt, weights_i2h_reshape);
- auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
- auto broadcast_bias_i2h = std::make_shared<op::Broadcast>(bias_i2h, Shape{10, 400}, AxisSet{0});
- auto add_1 = std::make_shared<op::Add>(dot_1, broadcast_bias_i2h);
+ auto broadcast_pred = [](std::shared_ptr<Node> n) {
+ return ((std::dynamic_pointer_cast<op::Broadcast>(n) != nullptr) ||
+ (std::dynamic_pointer_cast<op::Reshape>(n) != nullptr));
+ };
+ //auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
+ //auto broadcast_bias_i2h = std::make_shared<op::Broadcast>(bias_i2h, Shape{10, 400}, AxisSet{0});
+ auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
+ auto skip_broadcast_i2h = std::make_shared<pattern::op::Skip>(bias_i2h, broadcast_pred);
+ auto add_1 = std::make_shared<op::Add>(dot_1, skip_broadcast_i2h);
auto hidden_ht = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 50});
auto weights_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400, 50});
@@ -144,9 +151,12 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
std::make_shared<op::Reshape>(weights_h2h, AxisVector{1, 0}, Shape{50, 400});
auto dot_2 = std::make_shared<op::Dot>(hidden_ht, param2_2_reshape);
- auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
- auto broadcast_bias_h2h = std::make_shared<op::Broadcast>(bias_h2h, Shape{10, 400}, AxisSet{0});
- auto add_2 = std::make_shared<op::Add>(dot_2, broadcast_bias_h2h);
+ //auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
+ //auto broadcast_bias_h2h = std::make_shared<op::Broadcast>(bias_h2h, Shape{10, 400}, AxisSet{0});
+ //auto add_2 = std::make_shared<op::Add>(dot_2, broadcast_bias_h2h);
+ auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
+ auto skip_broadcast_h2h = std::make_shared<pattern::op::Skip>(bias_h2h, broadcast_pred);
+ auto add_2 = std::make_shared<op::Add>(dot_2, skip_broadcast_h2h);
auto X = std::make_shared<op::Add>(add_2, add_1);
// construct forget gate
@@ -193,7 +203,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
if (m.get_match_root()->get_element_type() != element::f32)
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
@@ -205,11 +215,12 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
if (input_xt_rank != 2 || hidden_ht_rank != 2 || weights_i2h_rank != 2 ||
weights_h2h_rank != 2)
{
+ NGRAPH_DEBUG << "[FAILURE] Rank of input/hidden data or weights is not equal to two";
return false;
}
- RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
- "Bias should have rank of 1 for Rnn op");
+ // RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
+ // "Bias should have rank of 1 for Rnn op");
// Determine which is ht_1 and xt. but if both xt and ht_1 have the same shape we need to capture this
// reliably in the RNN fusion.
@@ -333,6 +344,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
// find the user's for {ht} and replace them with lstm_goe_0
ngraph::replace_node(m.get_match_root(), ht_output);
+ NGRAPH_DEBUG << "[SUCCESS] Fused LSTM (op::Rnn)";
return true;
};
auto m = std::make_shared<pattern::Matcher>(ht, callback);
@@ -445,6 +457,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
// dont fuse, if the PM didn't discover all the cells belonging to RNN layer.
// we dont want to throw an assertion, if pattern matcher cannot discover all
// nodes belonging to RNN, instead we will return and can compute LSTM cell wise
+ NGRAPH_DEBUG << "[FAILURE] Could not find all LSTM cells to fuse into single RNN layer";
return false;
}
@@ -457,6 +470,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
auto num_of_lstm_matched = m.get_number_of_recurrent_matches();
if (num_of_lstm_matched <= 1)
{
+ NGRAPH_DEBUG << "[FAILURE] Found only one recurrent match for single-layer rnn fusion";
return false;
}
@@ -597,6 +611,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
NGRAPH_DEBUG << "End of recurrent fusion call back "
<< "matched_node: " << m.get_match_root()->get_name();
+ NGRAPH_DEBUG << "[SUCCESS] Fused Single-Layer Rnn";
return true;
};
@@ -686,9 +701,11 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
pattern::recurrent_graph_rewrite_callback callback =
[src_layer_label, src_iter_label, params_label, state_iter_label, rnn_ht_label](
pattern::RecurrentMatcher& m) {
+ NGRAPH_DEBUG << "In callback for Multi-Layer Rnn fusion";
if (m.get_number_of_recurrent_matches() <= 1)
{
+ NGRAPH_DEBUG << "[FAILURE] Found only one recurrent match for multi-layer rnn fusion";
return false;
}
@@ -705,7 +722,7 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
{
if (src_nodes[i]->get_shape()[1] != rnn_ht_out_nodes[i]->get_shape()[1])
{
- NGRAPH_DEBUG << "Not fusing since the feature sizes for xt and ht_1 dont match";
+ NGRAPH_DEBUG << "[FAILURE] Not fusing since the feature sizes for xt and ht_1 dont match";
return false;
}
}
@@ -846,7 +863,7 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
}
}
}
-
+ NGRAPH_DEBUG << "[SUCCESS] Fused Multi-Layer Rnn";
return true;
};
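The pattern change in construct_lstm_fprop replaces the explicit Broadcast of a rank-1 bias with a pattern::op::Skip wrapped around a rank-2 bias label, so the same pattern matches whether the bias reaches the Add through an op::Broadcast, an op::Reshape (which the gpu_layout change above can now produce), or directly. A minimal sketch of that idiom factored into a helper; the function name and include paths are assumptions, while the pattern calls mirror the ones in the diff:

#include <memory>

#include "ngraph/op/add.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/pattern/op/skip.hpp"

using namespace ngraph;

// Build the "dot + optionally-broadcast bias" sub-pattern used in the LSTM matcher.
std::shared_ptr<Node> make_biased_dot_pattern(std::shared_ptr<Node> dot)
{
    auto broadcast_or_reshape = [](std::shared_ptr<Node> n) {
        return std::dynamic_pointer_cast<op::Broadcast>(n) != nullptr ||
               std::dynamic_pointer_cast<op::Reshape>(n) != nullptr;
    };
    auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
    // pattern::op::Skip lets the matcher either step over a node satisfying the
    // predicate (a Broadcast or Reshape feeding the bias) or bind the label directly.
    auto maybe_broadcast_bias = std::make_shared<pattern::op::Skip>(bias, broadcast_or_reshape);
    return std::make_shared<op::Add>(dot, maybe_broadcast_bias);
}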
diff --git a/test/backend_test.in.cpp b/test/backend_test.in.cpp
index 6b3b06ba..6b071359 100644
--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
@@ -21,6 +21,7 @@
#include <cstdlib>
#include <random>
#include <string>
+#include <fstream>
#include "gtest/gtest.h"
#include "ngraph/autodiff/adjoints.hpp"
@@ -5931,3 +5932,75 @@ NGRAPH_TEST(${BACKEND_NAME}, shape_of_5d)
vector<uint64_t> expected{2, 4, 8, 16, 32};
EXPECT_EQ(expected, read_vector<uint64_t>(result));
}
+
+NGRAPH_TEST(${BACKEND_NAME}, compare_bks)
+{
+ stringstream ss("temp.json");
+ shared_ptr<Function> func = ngraph::deserialize(ss);
+
+ NodeVector new_results;
+ for (auto n : func->get_ordered_ops())
+ {
+ // don't include op::Results, otherwise the Function c-tor will complain
+ std::cout << "Node element type: " << n->get_element_type() << std::endl;
+ if (!n->is_output() && !n->is_parameter() && !n->is_constant() && !(n->get_outputs().size() > 1)
+ && n->get_element_type() == element::f32)
+ {
+ // place conditionals here if you want to only make certain ops an output/result node
+ if (auto node = std::dynamic_pointer_cast<op::Sum>(n))
+ {
+ if (node->get_shape().size() == 2)
+ {
+ new_results.push_back(n);
+ break;
+ }
+ }
+ }
+ }
+
+ // no need to include the original results; they are subsumed by new_results
+ auto new_func = make_shared<Function>(new_results, func->get_parameters());
+
+ // serialize new_func for later use; I use this for splicing a small graph out of a larger one
+ // (comment out these lines, including the exit(0) below, to run the backend comparison instead)
+ string js = serialize(new_func, 4);
+ std::ofstream outfile;
+ outfile.open("gnmt_first_layers.json");
+ outfile << js;
+ outfile.close();
+ if (new_func) exit(0);
+
+ test::Uniform<float> rng(10.0f, 20.0f, 2112);
+ vector<vector<float>> args;
+ // for (shared_ptr<op::Parameter> param : new_func->get_parameters())
+ // {
+ // vector<float> tensor_val(shape_size(param->get_shape()));
+ // rng.initialize(tensor_val);
+ // args.push_back(tensor_val);
+ // }
+
+ auto& params = new_func->get_parameters();
+ std::vector<float> tensor_val(shape_size(params.front()->get_shape()), 1.0f);
+ args.push_back(tensor_val);
+ tensor_val.back() = 2112.0;
+ args.push_back(tensor_val);
+
+ auto cpu_func = clone_function(*new_func);
+ auto bk_func = clone_function(*new_func);
+ auto cpu_results = execute(cpu_func, args, "CPU");
+ auto bk_results = execute(bk_func, args, "${BACKEND_NAME}");
+ for (size_t i = 0; i < cpu_results.size(); i++)
+ {
+ std::cout << "Comparing results for " << new_results.at(i)->get_name() <<std::endl;
+ if (auto node = dynamic_pointer_cast<op::GetOutputElement>(new_results.at(i)))
+ {
+ std::cout << " Parent node: ";
+ for (auto& p : node->get_arguments())
+ {
+ std::cout << " " << p->get_name() << std::endl;
+ std::cout << " nargs: " << p->get_arguments().size() << std::endl;
+ }
+ }
+ EXPECT_TRUE(test::all_close_f(cpu_results.at(i), bk_results.at(i)));
+ }
+}
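The new compare_bks test splices a small sub-graph out of a serialized model by promoting selected intermediate nodes to results, then runs identical clones of that Function on the CPU backend and on ${BACKEND_NAME} and compares the outputs element-wise. A minimal sketch of the comparison core, assuming the same test helpers this file already uses (clone_function, execute, test::all_close_f); the helper name outputs_match is illustrative only:

// Assumed to be available via the test suite's usual headers
// (clone_function/execute from the test utilities, test::all_close_f).
bool outputs_match(std::shared_ptr<Function> func,
                   const std::vector<std::vector<float>>& args,
                   const std::string& backend_name)
{
    // Clone so each backend compiles its own copy of the graph.
    auto ref_func = clone_function(*func);
    auto test_func = clone_function(*func);

    auto ref_results = execute(ref_func, args, "CPU");
    auto test_results = execute(test_func, args, backend_name);

    for (size_t i = 0; i < ref_results.size(); ++i)
    {
        if (!test::all_close_f(ref_results.at(i), test_results.at(i)))
        {
            return false;
        }
    }
    return true;
}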