@csullivan
Created November 30, 2018 23:54
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp
index 71cdd614..e836f16b 100644
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -561,7 +561,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_shared_context->m_primitive_emitter->get_memory_allocator());
ngraph::pass::Manager pass_manager;
-#if CUDNN_VERSION >= 7200
+#if CUDNN_VERSION >= 9200
// recurrent network fusion
pass_manager.register_pass<runtime::gpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::gpu::pass::RNNFusion>();
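The only functional change in this file raises the cuDNN version guard, which keeps the recurrent-fusion passes above from being registered on the cuDNN 7.x builds current at the time. A minimal sketch of the version encoding this relies on, assuming cuDNN's usual pre-9 scheme (illustrative only, not part of the patch):

#include <cstdio>
#include <cudnn.h>

int main()
{
    // cudnn.h defines CUDNN_VERSION as MAJOR * 1000 + MINOR * 100 + PATCHLEVEL,
    // so 7200 corresponds to cuDNN 7.2.0; a guard of 9200 therefore compiles the
    // fusion passes out on any cuDNN 7.x install.
    std::printf("cuDNN %d.%d.%d -> CUDNN_VERSION %d\n",
                CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL, CUDNN_VERSION);
    return 0;
}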
diff --git a/src/ngraph/runtime/gpu/pass/gpu_layout.cpp b/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
index b29bb7ca..e42731e1 100644
--- a/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
+++ b/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
@@ -23,6 +23,7 @@
#include "gpu_layout.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/concat.hpp"
+#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/topk.hpp"
@@ -48,23 +49,29 @@ namespace ngraph
{
ngraph::replace_node(concat, first_arg);
}
- // else
- // {
- // bool is_broadcast = true;
- // for (auto& arg : concat->get_arguments())
- // {
- // if (arg != first_arg)
- // {
- // is_broadcast = false;
- // }
- // }
- // if (is_broadcast)
- // {
- // auto result_shape = concat->get_shape();
- // auto broadcast = std::make_shared<ngraph::op::Broadcast>(first_arg, result_shape, AxisSet{concat->get_concatenation_axis()});
- // ngraph::replace_node(concat, broadcast);
- // }
- // }
+ else
+ {
+ bool is_broadcast = true;
+ for (auto& arg : concat->get_arguments())
+ {
+ if (arg != first_arg)
+ {
+ is_broadcast = false;
+ }
+ }
+ if (is_broadcast && ngraph::shape_size(first_arg->get_shape()) == 1)
+ {
+ auto arg_shape = first_arg->get_shape();
+ auto axis = concat->get_concatenation_axis();
+ auto out_shape = arg_shape;
+ out_shape.erase(out_shape.begin() + axis);
+ auto reshape = std::make_shared<ngraph::op::Reshape>(first_arg, ngraph::get_default_order(arg_shape.size()), out_shape);
+
+ auto result_shape = concat->get_shape();
+ auto broadcast = std::make_shared<ngraph::op::Broadcast>(reshape, result_shape, AxisSet{axis});
+ ngraph::replace_node(concat, broadcast);
+ }
+ }
}
template <>
void GPULayout::LAYOUT_DECL(ngraph::op::ReplaceSlice)
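The reinstated else branch above handles a Concat whose arguments are all the same single-element tensor: the Reshape drops the concatenation axis (op::Broadcast expects its broadcast axes to be absent from the input shape), and the Broadcast then expands that element to the Concat's output shape. A standalone illustration of why the rewrite is value-preserving, in plain C++ rather than nGraph ops (names here are illustrative only):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const float x = 3.5f;
    const std::size_t n = 4;

    // Concatenating n copies of the one-element tensor {x} along an axis...
    std::vector<float> concat_result;
    for (std::size_t i = 0; i < n; ++i)
    {
        concat_result.push_back(x);
    }

    // ...yields the same values as broadcasting x to the concat's output shape.
    std::vector<float> broadcast_result(n, x);

    assert(concat_result == broadcast_result);
    return 0;
}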
diff --git a/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp b/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
index 29384380..e1b4feb6 100644
--- a/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
+++ b/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
@@ -50,7 +50,7 @@
#define RETURN_IF_FALSE(cond, message) \
if (!(cond)) \
{ \
- NGRAPH_DEBUG << message; \
+ NGRAPH_DEBUG << "[FAILURE] " << message; \
return false; \
}
@@ -77,20 +77,21 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
if (m.get_match_root()->get_element_type() != element::f32)
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
if (m.get_match_root()->get_outputs().size() != pattern_map[input]->get_outputs().size())
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< "input= " << pattern_map[input]->get_name() << "size dont match!";
return false;
}
auto sigmoid_node = std::make_shared<op::Sigmoid>(pattern_map[input]);
ngraph::replace_node(m.get_match_root(), sigmoid_node);
+ NGRAPH_DEBUG << "[SUCCESS] Fused op::Sigmoid";
return true;
};
@@ -134,9 +135,15 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
std::make_shared<op::Reshape>(weights_i2h, AxisVector{1, 0}, Shape{100, 400});
auto dot_1 = std::make_shared<op::Dot>(input_xt, weights_i2h_reshape);
- auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
- auto broadcast_bias_i2h = std::make_shared<op::Broadcast>(bias_i2h, Shape{10, 400}, AxisSet{0});
- auto add_1 = std::make_shared<op::Add>(dot_1, broadcast_bias_i2h);
+ auto broadcast_pred = [](std::shared_ptr<Node> n) {
+ return ((std::dynamic_pointer_cast<op::Broadcast>(n) != nullptr) ||
+ (std::dynamic_pointer_cast<op::Reshape>(n) != nullptr));
+ };
+ //auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
+ //auto broadcast_bias_i2h = std::make_shared<op::Broadcast>(bias_i2h, Shape{10, 400}, AxisSet{0});
+ auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
+ auto skip_broadcast_i2h = std::make_shared<pattern::op::Skip>(bias_i2h, broadcast_pred);
+ auto add_1 = std::make_shared<op::Add>(dot_1, skip_broadcast_i2h);
auto hidden_ht = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 50});
auto weights_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400, 50});
@@ -144,9 +151,12 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
std::make_shared<op::Reshape>(weights_h2h, AxisVector{1, 0}, Shape{50, 400});
auto dot_2 = std::make_shared<op::Dot>(hidden_ht, param2_2_reshape);
- auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
- auto broadcast_bias_h2h = std::make_shared<op::Broadcast>(bias_h2h, Shape{10, 400}, AxisSet{0});
- auto add_2 = std::make_shared<op::Add>(dot_2, broadcast_bias_h2h);
+ //auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
+ //auto broadcast_bias_h2h = std::make_shared<op::Broadcast>(bias_h2h, Shape{10, 400}, AxisSet{0});
+ //auto add_2 = std::make_shared<op::Add>(dot_2, broadcast_bias_h2h);
+ auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
+ auto skip_broadcast_h2h = std::make_shared<pattern::op::Skip>(bias_h2h, broadcast_pred);
+ auto add_2 = std::make_shared<op::Add>(dot_2, skip_broadcast_h2h);
auto X = std::make_shared<op::Add>(add_2, add_1);
// construct forget gate
@@ -193,7 +203,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
if (m.get_match_root()->get_element_type() != element::f32)
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
@@ -205,11 +215,12 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
if (input_xt_rank != 2 || hidden_ht_rank != 2 || weights_i2h_rank != 2 ||
weights_h2h_rank != 2)
{
+ NGRAPH_DEBUG << "[FAILURE] Rank of input/hidden data or weights is not equal to two";
return false;
}
- RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
- "Bias should have rank of 1 for Rnn op");
+ // RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
+ // "Bias should have rank of 1 for Rnn op");
// Determine which is ht_1 and xt. but if both xt and ht_1 have the same shape we need to capture this
// reliably in the RNN fusion.
@@ -333,6 +344,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
// find the user's for {ht} and replace them with lstm_goe_0
ngraph::replace_node(m.get_match_root(), ht_output);
+ NGRAPH_DEBUG << "[SUCCESS] Fused LSTM (op::Rnn)";
return true;
};
auto m = std::make_shared<pattern::Matcher>(ht, callback);
@@ -445,6 +457,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
// dont fuse, if the PM didn't discover all the cells belonging to RNN layer.
// we dont want to throw an assertion, if pattern matcher cannot discover all
// nodes belonging to RNN, instead we will return and can compute LSTM cell wise
+ NGRAPH_DEBUG << "[FAILURE] Could not find all LSTM cells to fuse into single RNN layer";
return false;
}
@@ -457,6 +470,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
auto num_of_lstm_matched = m.get_number_of_recurrent_matches();
if (num_of_lstm_matched <= 1)
{
+ NGRAPH_DEBUG << "[FAILURE] Found only one recurrent match for single-layer rnn fusion";
return false;
}
@@ -597,6 +611,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
NGRAPH_DEBUG << "End of recurrent fusion call back "
<< "matched_node: " << m.get_match_root()->get_name();
+ NGRAPH_DEBUG << "[SUCCESS] Fused Single-Layer Rnn";
return true;
};
@@ -686,9 +701,11 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
pattern::recurrent_graph_rewrite_callback callback =
[src_layer_label, src_iter_label, params_label, state_iter_label, rnn_ht_label](
pattern::RecurrentMatcher& m) {
+ NGRAPH_DEBUG << "In callback for Multi-Layer Rnn fusion";
if (m.get_number_of_recurrent_matches() <= 1)
{
+ NGRAPH_DEBUG << "[FAILURE] Found only one recurrent match for multi-layer rnn fusion";
return false;
}
@@ -705,7 +722,7 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
{
if (src_nodes[i]->get_shape()[1] != rnn_ht_out_nodes[i]->get_shape()[1])
{
- NGRAPH_DEBUG << "Not fusing since the feature sizes for xt and ht_1 dont match";
+ NGRAPH_DEBUG << "[FAILURE] Not fusing since the feature sizes for xt and ht_1 dont match";
return false;
}
}
@@ -846,7 +863,7 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
}
}
}
-
+ NGRAPH_DEBUG << "[SUCCESS] Fused Multi-Layer Rnn";
return true;
};
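The pattern change in construct_lstm_fprop replaces the explicit Broadcast of a rank-1 bias with a pattern::op::Skip wrapped around a rank-2 bias label, so the same pattern matches whether the bias reaches the Add through an op::Broadcast, an op::Reshape (which the gpu_layout change above can now produce), or directly. A minimal sketch of that idiom factored into a helper; the function name and include paths are assumptions, while the pattern calls mirror the ones in the diff:

#include <memory>

#include "ngraph/op/add.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/pattern/op/skip.hpp"

using namespace ngraph;

// Build the "dot + optionally-broadcast bias" sub-pattern used in the LSTM matcher.
std::shared_ptr<Node> make_biased_dot_pattern(std::shared_ptr<Node> dot)
{
    auto broadcast_or_reshape = [](std::shared_ptr<Node> n) {
        return std::dynamic_pointer_cast<op::Broadcast>(n) != nullptr ||
               std::dynamic_pointer_cast<op::Reshape>(n) != nullptr;
    };
    auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
    // pattern::op::Skip lets the matcher either step over a node satisfying the
    // predicate (a Broadcast or Reshape feeding the bias) or bind the label directly.
    auto maybe_broadcast_bias = std::make_shared<pattern::op::Skip>(bias, broadcast_or_reshape);
    return std::make_shared<op::Add>(dot, maybe_broadcast_bias);
}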
diff --git a/test/backend_test.in.cpp b/test/backend_test.in.cpp
index 6b3b06ba..6b071359 100644
--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
@@ -21,6 +21,7 @@
#include <cstdlib>
#include <random>
#include <string>
+#include <fstream>
#include "gtest/gtest.h"
#include "ngraph/autodiff/adjoints.hpp"
@@ -5931,3 +5932,75 @@ NGRAPH_TEST(${BACKEND_NAME}, shape_of_5d)
vector<uint64_t> expected{2, 4, 8, 16, 32};
EXPECT_EQ(expected, read_vector<uint64_t>(result));
}
+
+NGRAPH_TEST(${BACKEND_NAME}, compare_bks)
+{
+ stringstream ss("temp.json");
+ shared_ptr<Function> func = ngraph::deserialize(ss);
+
+ NodeVector new_results;
+ for (auto n : func->get_ordered_ops())
+ {
+ // don't include op::Results, otherwise the Function c-tor will complain
+ std::cout << "Node element type: " << n->get_element_type() << std::endl;
+ if (!n->is_output() && !n->is_parameter() && !n->is_constant() && !(n->get_outputs().size() > 1)
+ && n->get_element_type() == element::f32)
+ {
+ // place conditionals here if you want to only make certain ops an output/result node
+ if (auto node = std::dynamic_pointer_cast<op::Sum>(n))
+ {
+ if (node->get_shape().size() == 2)
+ {
+ new_results.push_back(n);
+ break;
+ }
+ }
+ }
+ }
+
+ // no need to include the original results; they are subsumed by new_results
+ auto new_func = make_shared<Function>(new_results, func->get_parameters());
+
+ // serialize new_func for later use; I use this for splicing a small graph out of a larger one
+ // (comment out these lines, including the exit(0) below, to run the backend comparison instead)
+ string js = serialize(new_func, 4);
+ std::ofstream outfile;
+ outfile.open("gnmt_first_layers.json");
+ outfile << js;
+ outfile.close();
+ if (new_func) exit(0);
+
+ test::Uniform<float> rng(10.0f, 20.0f, 2112);
+ vector<vector<float>> args;
+ // for (shared_ptr<op::Parameter> param : new_func->get_parameters())
+ // {
+ // vector<float> tensor_val(shape_size(param->get_shape()));
+ // rng.initialize(tensor_val);
+ // args.push_back(tensor_val);
+ // }
+
+ auto& params = new_func->get_parameters();
+ std::vector<float> tensor_val(shape_size(params.front()->get_shape()), 1.0f);
+ args.push_back(tensor_val);
+ tensor_val.back() = 2112.0;
+ args.push_back(tensor_val);
+
+ auto cpu_func = clone_function(*new_func);
+ auto bk_func = clone_function(*new_func);
+ auto cpu_results = execute(cpu_func, args, "CPU");
+ auto bk_results = execute(bk_func, args, "${BACKEND_NAME}");
+ for (size_t i = 0; i < cpu_results.size(); i++)
+ {
+ std::cout << "Comparing results for " << new_results.at(i)->get_name() <<std::endl;
+ if (auto node = dynamic_pointer_cast<op::GetOutputElement>(new_results.at(i)))
+ {
+ std::cout << " Parent node: ";
+ for (auto& p : node->get_arguments())
+ {
+ std::cout << " " << p->get_name() << std::endl;
+ std::cout << " nargs: " << p->get_arguments().size() << std::endl;
+ }
+ }
+ EXPECT_TRUE(test::all_close_f(cpu_results.at(i), bk_results.at(i)));
+ }
+}
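The new compare_bks test splices a small sub-graph out of a serialized model by promoting selected intermediate nodes to results, then runs identical clones of that Function on the CPU backend and on ${BACKEND_NAME} and compares the outputs element-wise. A minimal sketch of the comparison core, assuming the same test helpers this file already uses (clone_function, execute, test::all_close_f); the helper name outputs_match is illustrative only:

// Assumed to be available via the test suite's usual headers
// (clone_function/execute from the test utilities, test::all_close_f).
bool outputs_match(std::shared_ptr<Function> func,
                   const std::vector<std::vector<float>>& args,
                   const std::string& backend_name)
{
    // Clone so each backend compiles its own copy of the graph.
    auto ref_func = clone_function(*func);
    auto test_func = clone_function(*func);

    auto ref_results = execute(ref_func, args, "CPU");
    auto test_results = execute(test_func, args, backend_name);

    for (size_t i = 0; i < ref_results.size(); ++i)
    {
        if (!test::all_close_f(ref_results.at(i), test_results.at(i)))
        {
            return false;
        }
    }
    return true;
}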