Created November 30, 2018 23:54
diff --git a/src/ngraph/runtime/gpu/gpu_external_function.cpp b/src/ngraph/runtime/gpu/gpu_external_function.cpp
index 71cdd614..e836f16b 100644
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -561,7 +561,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_shared_context->m_primitive_emitter->get_memory_allocator());
ngraph::pass::Manager pass_manager;
-#if CUDNN_VERSION >= 7200
+#if CUDNN_VERSION >= 9200
// recurrent network fusion
pass_manager.register_pass<runtime::gpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::gpu::pass::RNNFusion>();
diff --git a/src/ngraph/runtime/gpu/pass/gpu_layout.cpp b/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
index b29bb7ca..e42731e1 100644
--- a/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
+++ b/src/ngraph/runtime/gpu/pass/gpu_layout.cpp
@@ -23,6 +23,7 @@
#include "gpu_layout.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/concat.hpp"
+#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/topk.hpp"
@@ -48,23 +49,29 @@ namespace ngraph
{
ngraph::replace_node(concat, first_arg);
}
- // else
- // {
- // bool is_broadcast = true;
- // for (auto& arg : concat->get_arguments())
- // {
- // if (arg != first_arg)
- // {
- // is_broadcast = false;
- // }
- // }
- // if (is_broadcast)
- // {
- // auto result_shape = concat->get_shape();
- // auto broadcast = std::make_shared<ngraph::op::Broadcast>(first_arg, result_shape, AxisSet{concat->get_concatenation_axis()});
- // ngraph::replace_node(concat, broadcast);
- // }
- // }
+ else
+ {
+ bool is_broadcast = true;
+ for (auto& arg : concat->get_arguments())
+ {
+ if (arg != first_arg)
+ {
+ is_broadcast = false;
+ }
+ }
+ if (is_broadcast && ngraph::shape_size(first_arg->get_shape()) == 1)
+ {
+ auto arg_shape = first_arg->get_shape();
+ auto axis = concat->get_concatenation_axis();
+ auto out_shape = arg_shape;
+ out_shape.erase(out_shape.begin() + axis);
+ auto reshape = std::make_shared<ngraph::op::Reshape>(first_arg, ngraph::get_default_order(arg_shape.size()), out_shape);
+
+ auto result_shape = concat->get_shape();
+ auto broadcast = std::make_shared<ngraph::op::Broadcast>(reshape, result_shape, AxisSet{axis});
+ ngraph::replace_node(concat, broadcast);
+ }
+ }
}
template <>
void GPULayout::LAYOUT_DECL(ngraph::op::ReplaceSlice)
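
Note on the gpu_layout change above: when every argument of the Concat is the same node and that node holds exactly one element, the pass now drops the concatenation axis with a Reshape and re-expands to the full Concat output shape with a Broadcast. Below is a minimal standalone sketch of just that shape arithmetic, in plain C++ with std::vector standing in for ngraph::Shape; the function names are illustrative, not nGraph API.

// Sketch of the shape handling in the Concat -> Reshape + Broadcast rewrite above.
// Assumes a single-element argument repeated num_args times along concat_axis.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

using Shape = std::vector<size_t>;

// Shape given to the Reshape: the argument shape with the concat axis removed.
Shape reshape_shape(Shape arg_shape, size_t concat_axis)
{
    assert(concat_axis < arg_shape.size());
    arg_shape.erase(arg_shape.begin() + concat_axis);
    return arg_shape;
}

// Shape of the Concat result (and of the replacement Broadcast): the argument
// shape with the concat axis dimension scaled by the number of arguments.
Shape concat_result_shape(Shape arg_shape, size_t concat_axis, size_t num_args)
{
    assert(concat_axis < arg_shape.size());
    arg_shape[concat_axis] *= num_args;
    return arg_shape;
}

int main()
{
    Shape arg{1, 1};                          // a single-element argument
    Shape r = reshape_shape(arg, 0);          // {1}
    Shape b = concat_result_shape(arg, 0, 8); // {8, 1}
    std::cout << r.size() << " " << b[0] << "x" << b[1] << "\n"; // prints "1 8x1"
    return 0;
}

The shape_size(first_arg->get_shape()) == 1 guard in the patch keeps the rewrite to the unambiguous case where the repeated argument is a single element, so concatenating N identical copies is exactly a broadcast along the concat axis.
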
diff --git a/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp b/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
index 29384380..e1b4feb6 100644
--- a/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
+++ b/src/ngraph/runtime/gpu/pass/gpu_rnn_fusion.cpp
@@ -50,7 +50,7 @@
#define RETURN_IF_FALSE(cond, message) \
if (!(cond)) \
{ \
- NGRAPH_DEBUG << message; \
+ NGRAPH_DEBUG << "[FAILURE] " << message; \
return false; \
}
@@ -77,20 +77,21 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
if (m.get_match_root()->get_element_type() != element::f32)
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
if (m.get_match_root()->get_outputs().size() != pattern_map[input]->get_outputs().size())
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< "input= " << pattern_map[input]->get_name() << "size dont match!";
return false;
}
auto sigmoid_node = std::make_shared<op::Sigmoid>(pattern_map[input]);
ngraph::replace_node(m.get_match_root(), sigmoid_node);
+ NGRAPH_DEBUG << "[SUCCESS] Fused op::Sigmoid";
return true;
};
@@ -134,9 +135,15 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
std::make_shared<op::Reshape>(weights_i2h, AxisVector{1, 0}, Shape{100, 400});
auto dot_1 = std::make_shared<op::Dot>(input_xt, weights_i2h_reshape);
- auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
- auto broadcast_bias_i2h = std::make_shared<op::Broadcast>(bias_i2h, Shape{10, 400}, AxisSet{0});
- auto add_1 = std::make_shared<op::Add>(dot_1, broadcast_bias_i2h);
+ auto broadcast_pred = [](std::shared_ptr<Node> n) {
+ return ((std::dynamic_pointer_cast<op::Broadcast>(n) != nullptr) ||
+ (std::dynamic_pointer_cast<op::Reshape>(n) != nullptr));
+ };
+ //auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
+ //auto broadcast_bias_i2h = std::make_shared<op::Broadcast>(bias_i2h, Shape{10, 400}, AxisSet{0});
+ auto bias_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
+ auto skip_broadcast_i2h = std::make_shared<pattern::op::Skip>(bias_i2h, broadcast_pred);
+ auto add_1 = std::make_shared<op::Add>(dot_1, skip_broadcast_i2h);
auto hidden_ht = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 50});
auto weights_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400, 50});
@@ -144,9 +151,12 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
std::make_shared<op::Reshape>(weights_h2h, AxisVector{1, 0}, Shape{50, 400});
auto dot_2 = std::make_shared<op::Dot>(hidden_ht, param2_2_reshape);
- auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
- auto broadcast_bias_h2h = std::make_shared<op::Broadcast>(bias_h2h, Shape{10, 400}, AxisSet{0});
- auto add_2 = std::make_shared<op::Add>(dot_2, broadcast_bias_h2h);
+ //auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
+ //auto broadcast_bias_h2h = std::make_shared<op::Broadcast>(bias_h2h, Shape{10, 400}, AxisSet{0});
+ //auto add_2 = std::make_shared<op::Add>(dot_2, broadcast_bias_h2h);
+ auto bias_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 400});
+ auto skip_broadcast_h2h = std::make_shared<pattern::op::Skip>(bias_h2h, broadcast_pred);
+ auto add_2 = std::make_shared<op::Add>(dot_2, skip_broadcast_h2h);
auto X = std::make_shared<op::Add>(add_2, add_1);
// construct forget gate
@@ -193,7 +203,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
if (m.get_match_root()->get_element_type() != element::f32)
{
- NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
+ NGRAPH_DEBUG << "[FAILURE] mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
@@ -205,11 +215,12 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
if (input_xt_rank != 2 || hidden_ht_rank != 2 || weights_i2h_rank != 2 ||
weights_h2h_rank != 2)
{
+ NGRAPH_DEBUG << "[FAILURE] Rank of input/hidden data or weights is not equal to two";
return false;
}
- RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
- "Bias should have rank of 1 for Rnn op");
+ // RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
+ // "Bias should have rank of 1 for Rnn op");
// Determine which is ht_1 and xt. but if both xt and ht_1 have the same shape we need to capture this
// reliably in the RNN fusion.
@@ -333,6 +344,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
// find the user's for {ht} and replace them with lstm_goe_0
ngraph::replace_node(m.get_match_root(), ht_output);
+ NGRAPH_DEBUG << "[SUCCESS] Fused LSTM (op::Rnn)";
return true;
};
auto m = std::make_shared<pattern::Matcher>(ht, callback);
@@ -445,6 +457,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
// dont fuse, if the PM didn't discover all the cells belonging to RNN layer.
// we dont want to throw an assertion, if pattern matcher cannot discover all
// nodes belonging to RNN, instead we will return and can compute LSTM cell wise
+ NGRAPH_DEBUG << "[FAILURE] Could not find all LSTM cells to fuse into single RNN layer";
return false;
}
@@ -457,6 +470,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
auto num_of_lstm_matched = m.get_number_of_recurrent_matches();
if (num_of_lstm_matched <= 1)
{
+ NGRAPH_DEBUG << "[FAILURE] Found only one recurrent match for single-layer rnn fusion";
return false;
}
@@ -597,6 +611,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
NGRAPH_DEBUG << "End of recurrent fusion call back "
<< "matched_node: " << m.get_match_root()->get_name();
+ NGRAPH_DEBUG << "[SUCCESS] Fused Single-Layer Rnn";
return true;
};
@@ -686,9 +701,11 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
pattern::recurrent_graph_rewrite_callback callback =
[src_layer_label, src_iter_label, params_label, state_iter_label, rnn_ht_label](
pattern::RecurrentMatcher& m) {
+ NGRAPH_DEBUG << "In callback for Multi-Layer Rnn fusion";
if (m.get_number_of_recurrent_matches() <= 1)
{
+ NGRAPH_DEBUG << "[FAILURE] Found only one recurrent match for multi-layer rnn fusion";
return false;
}
@@ -705,7 +722,7 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
{
if (src_nodes[i]->get_shape()[1] != rnn_ht_out_nodes[i]->get_shape()[1])
{
- NGRAPH_DEBUG << "Not fusing since the feature sizes for xt and ht_1 dont match";
+ NGRAPH_DEBUG << "[FAILURE] Not fusing since the feature sizes for xt and ht_1 don't match";
return false;
}
}
@@ -846,7 +863,7 @@ void ngraph::runtime::gpu::pass::MultiLayerRNNFusion::construct_multi_layer_rnn_
}
}
}
-
+ NGRAPH_DEBUG << "[SUCCESS] Fused Multi-Layer Rnn";
return true;
};
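
The LSTM pattern change above swaps the hard-coded Broadcast on each bias for a pattern::op::Skip guarded by a predicate, so the matcher accepts a bias that reaches the Add directly or through a Broadcast or a Reshape. Below is a minimal standalone sketch of that predicate idea in plain C++; Node, Broadcast, Reshape, and Add here are stand-in types, not nGraph's.

// Standalone sketch of the broadcast_pred predicate used in the LSTM pattern above.
#include <functional>
#include <iostream>
#include <memory>

struct Node { virtual ~Node() = default; };
struct Broadcast : Node {};
struct Reshape : Node {};
struct Add : Node {};

using NodePredicate = std::function<bool(std::shared_ptr<Node>)>;

int main()
{
    // Accept a node if it is a Broadcast or a Reshape, so a matcher can
    // optionally skip over it between the bias label and the Add.
    NodePredicate broadcast_pred = [](std::shared_ptr<Node> n) {
        return std::dynamic_pointer_cast<Broadcast>(n) != nullptr ||
               std::dynamic_pointer_cast<Reshape>(n) != nullptr;
    };

    std::shared_ptr<Node> nodes[] = {
        std::make_shared<Broadcast>(), std::make_shared<Reshape>(), std::make_shared<Add>()};
    for (const auto& n : nodes)
    {
        std::cout << broadcast_pred(n) << " "; // prints "1 1 0 "
    }
    std::cout << "\n";
    return 0;
}

In the patch the same lambda is defined once and reused for both the i2h and h2h bias inputs, which is why it is hoisted out as broadcast_pred.
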
diff --git a/test/backend_test.in.cpp b/test/backend_test.in.cpp
index 6b3b06ba..6b071359 100644
--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
@@ -21,6 +21,7 @@
#include <cstdlib>
#include <random>
#include <string>
+#include <fstream>
#include "gtest/gtest.h"
#include "ngraph/autodiff/adjoints.hpp"
@@ -5931,3 +5932,75 @@ NGRAPH_TEST(${BACKEND_NAME}, shape_of_5d)
vector<uint64_t> expected{2, 4, 8, 16, 32};
EXPECT_EQ(expected, read_vector<uint64_t>(result));
}
+
+NGRAPH_TEST(${BACKEND_NAME}, compare_bks)
+{
+ std::ifstream json_file("temp.json"); // read the serialized graph from disk
+ shared_ptr<Function> func = ngraph::deserialize(json_file);
+
+ NodeVector new_results;
+ for (auto n : func->get_ordered_ops())
+ {
+ // don't include op::Result nodes, otherwise the Function c-tor will complain
+ std::cout << "Node element type: " << n->get_element_type() << std::endl;
+ if (!n->is_output() && !n->is_parameter() && !n->is_constant() && !(n->get_outputs().size()>1)
+ && n->get_element_type() == element::f32)
+ {
+ // place conditionals here if you want to only make certain ops an output/result node
+ if (auto node = std::dynamic_pointer_cast<op::Sum>(n))
+ {
+ if (node->get_shape().size() == 2)
+ {
+ new_results.push_back(n);
+ break;
+ }
+ }
+ }
+ }
+
+ // no need to include the original results; they are subsumed by new_results
+ auto new_func = make_shared<Function>(new_results, func->get_parameters());
+
+ // serialize new_func for later use
+ // (used here for splicing a small graph out of a larger one)
+ string js = serialize(new_func, 4);
+ std::ofstream outfile;
+ outfile.open("gnmt_first_layers.json");
+ outfile << js;
+ outfile.close();
+ if (new_func) exit(0); // early exit: for now only the spliced-graph serialization above is needed
+
+ test::Uniform<float> rng(10.0f, 20.0f, 2112);
+ vector<vector<float>> args;
+ // for (shared_ptr<op::Parameter> param : new_func->get_parameters())
+ // {
+ // vector<float> tensor_val(shape_size(param->get_shape()));
+ // rng.initialize(tensor_val);
+ // args.push_back(tensor_val);
+ // }
+
+ auto& params = new_func->get_parameters();
+ std::vector<float> tensor_val(shape_size(params.front()->get_shape()), 1.0f);
+ args.push_back(tensor_val);
+ tensor_val.back() = 2112.0;
+ args.push_back(tensor_val);
+
+ auto cpu_func = clone_function(*new_func);
+ auto bk_func = clone_function(*new_func);
+ auto cpu_results = execute(cpu_func, args, "CPU");
+ auto bk_results = execute(bk_func, args, "${BACKEND_NAME}");
+ for (size_t i = 0; i < cpu_results.size(); i++)
+ {
+ std::cout << "Comparing results for " << new_results.at(i)->get_name() << std::endl;
+ if (auto node = dynamic_pointer_cast<op::GetOutputElement>(new_results.at(i)))
+ {
+ std::cout << " Parent node: ";
+ for (auto& p : node->get_arguments())
+ {
+ std::cout << " " << p->get_name() << std::endl;
+ std::cout << " nargs: " << p->get_arguments().size() << std::endl;
+ }
+ }
+ EXPECT_TRUE(test::all_close_f(cpu_results.at(i), bk_results.at(i)));
+ }
+}
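
Usage note on the new compare_bks test (hedged, since the surrounding harness is not shown in this patch): it loads a serialized graph from temp.json, promotes the first rank-2 op::Sum it finds to a result node, writes the spliced sub-graph to gnmt_first_layers.json, and, once the early exit is removed, runs identical inputs through both the CPU backend and ${BACKEND_NAME}, comparing each output with test::all_close_f. Assuming the usual nGraph gtest setup, it can be run in isolation with a gtest filter such as --gtest_filter='GPU.compare_bks' on the backend's unit test binary; the exact suite name depends on which backend the test file is instantiated for.
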