Created
October 1, 2021 18:56
-
-
Save albertz/21e00a500e41eb0c8d27a8519e763f0e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from returnn.config import Config
from returnn.tf.engine import Engine
import sys

# Extend the module search path with "tests" before importing the helper
# module below (presumably test_TFNetworkLayer lives in that directory).
sys.path.append("tests")
from test_TFNetworkLayer import make_feed_dict
# Feature dimension used in the "extern_data" shapes of this script.
n_in = 40
# Checkpoint name: written via save_model() and later given as the "load" option.
model_filename = "test-703.model.001"
def make_net_dict():
    """Return the network dict used by this script.

    The network consists of a single layer named "output" of class "conv"
    that reads from "data": 3x3 filter, "same" padding, 32 output channels,
    no activation, with bias.
    """
    return {
        "output": {
            "class": "conv",
            "from": "data",
            "filter_size": [3, 3],
            "padding": "same",
            "n_out": 32,
            "activation": None,
            "with_bias": True,
        }
    }
# Stage 1: build the network for training with data shape (None, n_in, 1)
# and save a checkpoint under model_filename.
config = Config({
    "extern_data": {"data": {"shape": (None, n_in, 1)}},
    "task": "train",
    "network": make_net_dict(),
})
engine = Engine(config=config)
engine.init_train_from_config()
engine.save_model(model_filename)

# Stage 2: reuse the same config, but swap the last two data dims to
# (None, 1, n_in), switch the task to "eval" and load the checkpoint
# that was just saved.
config.typed_dict["extern_data"]["data"]["shape"] = (None, 1, n_in)
config.typed_dict["task"] = "eval"
config.typed_dict["load"] = model_filename
engine = Engine(config=config)
engine.init_network_from_config()
net = engine.network
out = net.get_layer("output").output
# Evaluate the conv layer output on generated feed data
# (presumably this is where the convolution error surfaces -- confirm).
engine.tf_session.run(out.placeholder, feed_dict=make_feed_dict(net.extern_data))
Note, this TF code:
import tensorflow as tf
import numpy
tf.compat.v1.disable_eager_execution()

# NOTE: n_in = 1 deliberately does not match the 40 input channels of `x`
# below; this mismatch is what the snippet is meant to reproduce.
n_in = 1
n_out = 32
filter_size = (3, 3)
filter_shape = list(filter_size) + [n_in, n_out]  # [3, 3, 1, 32]

with tf.Graph().as_default() as graph:
    with tf.compat.v1.Session(graph=graph) as session:
        x = tf.compat.v1.placeholder(tf.float32, (None, None, 1, 40))  # [B,T,1,40]
        filters = tf.compat.v1.get_variable(name="W", shape=filter_shape)
        y = tf.compat.v1.nn.convolution(x, filter=filters, padding="SAME")
        session.run(y, feed_dict={x: numpy.zeros((3, 4, 1, 40))})
This produces an error similar to the CPU error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Depth of output (32) is not a multiple of the number of groups (40) for '{{node convolution}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](Placeholder, convolution/ReadVariableOp)' with input shapes: [?,?,1,40], [3,3,1,32].
This code produces the same convolution error on GPU:
import tensorflow as tf
import numpy
tf.compat.v1.disable_eager_execution()

with tf.Graph().as_default() as graph:
    with tf.compat.v1.Session(graph=graph) as session:
        x = tf.compat.v1.placeholder(tf.float32, (None, None, 1, 40))  # [B,T,1,40]
        # The filter's input-channel dim is left unknown (None) in the graph;
        # the concrete [3,3,1,32] filter is only supplied through the feed dict.
        filters = tf.compat.v1.placeholder(tf.float32, (3, 3, None, 32))
        y = tf.compat.v1.nn.convolution(x, filter=filters, padding="SAME")
        session.run(
            y,
            feed_dict={
                x: numpy.zeros((3, 4, 1, 40)),
                filters: numpy.zeros((3, 3, 1, 32)),
            },
        )
Error:
2021-10-01 23:05:27.951528: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6173 MB memory: -> device: 0, name: NVIDIA GeForce RTX 2070, pci bus id:0000:09:00.0, compute capability: 7.5
2021-10-01 23:05:28.331213: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8204
2021-10-01 23:05:28.866860: W tensorflow/core/framework/op_kernel.cc:1692] OP_REQUIRES failed at conv_ops.cc:1276 : Not found: No algorithm worked!
Traceback (most recent call last):
File "/home/az/.local/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1375, in _do_call
return fn(*args)
File "/home/az/.local/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1359, in _run_fn
return self._call_tf_sessionrun(options, feed_dict, fetch_list,
File "/home/az/.local/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1451, in _call_tf_sessionrun
return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found.
(0) Not found: No algorithm worked!
[[{{node convolution}}]]
(1) Not found: No algorithm worked!
[[{{node convolution}}]]
[[convolution/_5]]
0 successful operations.
0 derived errors ignored.
So I reported this here: tensorflow/tensorflow#52223
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The model checkpoint loading mismatch should produce an error, but it does not. I reported that here: tensorflow/tensorflow#52220