Created
July 16, 2015 07:53
-
-
Save anonymous/c76dcb3e40bb488ca993 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
After doing this, you may fix the identity used for this commit with: | |
git commit --amend --reset-author | |
1 file changed, 1 insertion(+), 1 deletion(-) | |
ubuntu@ip-10-0-0-156:~/nmt$ git push | |
warning: push.default is unset; its implicit value is changing in | |
Git 2.0 from 'matching' to 'simple'. To squelch this message | |
and maintain the current behavior after the default changes, use: | |
git config --global push.default matching | |
To squelch this message and adopt the new behavior now, use: | |
git config --global push.default simple | |
When push.default is set to 'matching', git will push local branches | |
to the remote branches that already exist with the same name. | |
In Git 2.0, Git will default to the more conservative 'simple' | |
behavior, which only pushes the current branch to the corresponding | |
remote branch that 'git pull' uses to update the current branch. | |
See 'git help config' and search for 'push.default' for further information. | |
(the 'simple' mode was introduced in Git 1.7.11. Use the similar mode | |
'current' instead of 'simple' if you sometimes use older versions of Git) | |
Counting objects: 9, done. | |
Delta compression using up to 8 threads. | |
Compressing objects: 100% (4/4), done. | |
Writing objects: 100% (5/5), 444 bytes | 0 bytes/s, done. | |
Total 5 (delta 2), reused 0 (delta 0) | |
To ssh://[email protected]:2413/chugen-shu/nmt | |
5f55d01..3b748bf master -> master | |
ubuntu@ip-10-0-0-156:~/nmt$ CUDA_LAUNCH_BLOCKING=1 python ./trails/lstm_encdec/train.py /home/ubuntu/data/pickles/remt1.v80k_40k.unk | |
pos.b80.trun40.rev.pack --valid /home/ubuntu/data/pickles/remt1.v80k_40k.unkpos.b80.trun40.rev.valid.pack --source_size 80000 --targ | |
et_size 40001 --hidden_size 1024 --approx --word_embed 1000 --arch one_layer_search --encoder_mask --optimizer sgd --lr 0.01 --train | |
_size 12500 --predict | |
Using gpu device 0: GRID K520 | |
INFO:deepy.networks.network:deepy version = 0.1.7 | |
INFO:deepy.layers.layer:create weight W_embed: (80000, 1000) | |
INFO:deepy.layers.layer:create weight W_embed: (40001, 1000) | |
INFO:deepy.layers.layer:create weight W_h: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_h: (1024,) | |
INFO:deepy.layers.layer:create weight W_i: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_h: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_h: (1024,) | |
INFO:deepy.layers.layer:create weight W_i: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_h: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_h: (1024,) | |
INFO:deepy.layers.layer:create weight W_i: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_i2: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_ua: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_wa: (1024, 1024) | |
INFO:deepy.layers.layer:create weight W_va: (1024,) | |
INFO:deepy.layers.layer:create weight W_dense4: (1024, 600) | |
INFO:deepy.layers.layer:create bias B_dense4: (600,) | |
INFO:deepy.layers.layer:create weight W_dense5: (600, 40001) | |
INFO:deepy.layers.layer:create bias B_dense5: (40001,) | |
INFO:deepy.dataset.ondisk_dataset:Cache on memory | |
INFO:deepy.trainers.trainers:changing optimization method to 'SGD' | |
INFO:deepy.networks.network:network inputs: x | |
INFO:deepy.networks.network:network targets: y mask | |
INFO:deepy.networks.network:network parameters: W_embed W_embed W_h B_h W_i W_h B_h W_i W_h B_h W_i W_i2 W_va W_wa W_ua W_dense4 B_d | |
ense4 W_dense5 B_dense5 | |
INFO:deepy.networks.network:parameter count: 156121305 | |
INFO:deepy.trainers.trainers:monitor list: J | |
INFO:deepy.trainers.trainers:compile evaluation function | |
/usr/local/lib/python2.7/dist-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may i | |
ndicate binary incompatibility | |
from scan_perform.scan_perform import * | |
INFO:deepy.trainers.trainers:compiling SGDTrainer learning function | |
INFO:deepy.trainers.optimize:optimize method=SGD parameters=[W_embed, W_embed, W_h, B_h, W_i, W_h, B_h, W_i, W_h, B_h, W_i, W_i2, W_ | |
va, W_wa, W_ua, W_dense4, B_dense4, W_dense5, B_dense5] | |
INFO:deepy.trainers.optimize:ada_family_core: [('gparams', [Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{s | |
witch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no | |
_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace | |
}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Ele | |
mwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0]), ('learnin | |
g_rate', learning_rate), ('eps', 1e-06), ('beta', 0.0), ('params', [W_embed, W_embed, W_h, B_h, W_i, W_h, B_h, W_i, W_h, B_h, W_i, W | |
_i2, W_va, W_wa, W_ua, W_dense4, B_dense4, W_dense5, B_dense5]), ('rho', 0.95), ('gsum_regularization', 0.0001), ('method', 'SGD')] | |
INFO:deepy.trainers.trainers:Added 0 free parameters for optimization | |
INFO:deepy.trainers.trainers:network updates: | |
INFO:deepy.trainers.trainers:learning updates: W_embed W_embed W_h B_h W_i W_h B_h W_i W_h B_h W_i W_i2 W_va W_wa W_ua W_dense4 B_de | |
nse4 W_dense5 B_dense5 | |
Timing Info | |
----------- | |
--> <time> <% time> - <total time> <% total time>' | |
<time> computation time for this node | |
<% time> fraction of total computation time for this node | |
<total time> time for this node + total times for this node's ancestors | |
<% total time> total time for this node over total computation time | |
N.B.: | |
* Times include the node time and the function overhead. | |
* <total time> and <% total time> may over-count computation times | |
if inputs to a node share a common ancestor and should be viewed as a | |
loose upper bound. Their intended use is to help rule out potential nodes | |
to remove when optimizing a graph because their <total time> is very low. | |
HostFromGpu [@A] '' 559 | |
Traceback (most recent call last): | |
File "./trails/lstm_encdec/train.py", line 217, in <module> | |
trainer = SGDTrainer(model, training_config) | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 340, in __init__ | |
super(SGDTrainer, self).__init__(network, config, "SGD") | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 320, in __init__ | |
theano.printing.debugprint(self.learning_func, open("/tmp/dbg.txt", "w")) | |
File "/usr/local/lib/python2.7/dist-packages/theano/printing.py", line 145, in debugprint | |
profile=p) | |
File "/usr/local/lib/python2.7/dist-packages/theano/compile/debugmode.py", line 649, in debugprint | |
debugprint(i, new_prefix, depth=depth - 1, done=done, | |
TypeError: unsupported operand type(s) for -: 'file' and 'int' | |
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:73 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 7.344402e+00s | |
Number of Apply nodes: 232 | |
Theano Optimizer time: 6.673198e+00s | |
Theano validate time: 9.848905e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 6.316910e-01s | |
Import time 7.022858e-02s | |
Time in all call to theano.grad() 1.048867e+00s | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/dnn.py:206 | |
Time in 1 calls to Function.__call__: 2.479553e-05s | |
Time in Function.fn.__call__: 8.106232e-06s (32.692%) | |
Time in thunks: 9.536743e-07s (3.846%) | |
Total compile time: 3.535144e+00s | |
Number of Apply nodes: 1 | |
Theano Optimizer time: 2.193451e-05s | |
Theano validate time: 0.000000e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.528808e+00s | |
Import time 1.091957e-03s | |
Time in all call to theano.grad() 1.048867e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
100.0% 100.0% 0.000s 9.54e-07s C 1 1 theano.sandbox.cuda.dnn.DnnVersion | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
100.0% 100.0% 0.000s 9.54e-07s C 1 1 <theano.sandbox.cuda.dnn.DnnVersion object at 0x7f1a0e5ba55 | |
0> | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
100.0% 100.0% 0.000s 9.54e-07s 1 0 <theano.sandbox.cuda.dnn.DnnVersion object at 0x7f1a0e5ba550>() | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/tensor/blas_c.py:733 | |
Time in 1 calls to Function.__call__: 4.439354e-04s | |
Time in Function.fn.__call__: 4.110336e-04s (92.589%) | |
Time in thunks: 3.530979e-04s (79.538%) | |
Total compile time: 1.492095e-02s | |
Number of Apply nodes: 5 | |
Theano Optimizer time: 5.284071e-03s | |
Theano validate time: 1.003742e-04s | |
Theano Linker time (includes C, CUDA code generation/compiling): 4.110098e-03s | |
Import time 3.008842e-04s | |
Time in all call to theano.grad() 1.048867e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
64.6% 64.6% 0.000s 7.60e-05s C 3 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
29.2% 93.7% 0.000s 1.03e-04s C 1 1 theano.sandbox.cuda.blas.GpuGemv | |
6.3% 100.0% 0.000s 2.22e-05s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
64.6% 64.6% 0.000s 7.60e-05s C 3 3 GpuFromHost | |
29.2% 93.7% 0.000s 1.03e-04s C 1 1 GpuGemv{no_inplace} | |
6.3% 100.0% 0.000s 2.22e-05s C 1 1 HostFromGpu | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
52.4% 52.4% 0.000s 1.85e-04s 1 2 GpuFromHost(aa) | |
29.2% 81.6% 0.000s 1.03e-04s 1 3 GpuGemv{no_inplace}(GpuFromHost.0, TensorConstant{1.0}, GpuFromHost.0, | |
GpuFromHost.0, TensorConstant{0.0}) | |
6.8% 88.3% 0.000s 2.38e-05s 1 1 GpuFromHost(xx) | |
6.3% 94.6% 0.000s 2.22e-05s 1 4 HostFromGpu(GpuGemv{no_inplace}.0) | |
5.4% 100.0% 0.000s 1.91e-05s 1 0 GpuFromHost(yy) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:319 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.803392e+01s | |
Number of Apply nodes: 730 | |
Theano Optimizer time: 2.555789e+01s | |
Theano validate time: 4.052165e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 2.352550e+00s | |
Import time 2.984858e-02s | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Time in all call to theano.grad() 1.048867e+00s | |
Function profiling | |
================== | |
Message: Sum of all(4) printed profiles at exit excluding Scan op profile. | |
Time in 2 calls to Function.__call__: 4.687309e-04s | |
Time in Function.fn.__call__: 4.191399e-04s (89.420%) | |
Time in thunks: 3.540516e-04s (75.534%) | |
Total compile time: 3.892838e+01s | |
Number of Apply nodes: 232 | |
Theano Optimizer time: 3.223640e+01s | |
Theano validate time: 5.038059e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 6.517159e+00s | |
Import time 1.014700e-01s | |
Time in all call to theano.grad() 1.048867e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
64.4% 64.4% 0.000s 7.60e-05s C 3 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
29.1% 93.5% 0.000s 1.03e-04s C 1 1 theano.sandbox.cuda.blas.GpuGemv | |
6.3% 99.7% 0.000s 2.22e-05s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
0.3% 100.0% 0.000s 9.54e-07s C 1 1 theano.sandbox.cuda.dnn.DnnVersion | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
64.4% 64.4% 0.000s 7.60e-05s C 3 3 GpuFromHost | |
29.1% 93.5% 0.000s 1.03e-04s C 1 1 GpuGemv{no_inplace} | |
6.3% 99.7% 0.000s 2.22e-05s C 1 1 HostFromGpu | |
0.3% 100.0% 0.000s 9.54e-07s C 1 1 <theano.sandbox.cuda.dnn.DnnVersion object at 0x7f1a0e5ba550 | |
> | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
52.3% 52.3% 0.000s 1.85e-04s 1 2 GpuFromHost(aa) | |
29.1% 81.3% 0.000s 1.03e-04s 1 3 GpuGemv{no_inplace}(GpuFromHost.0, TensorConstant{1.0}, GpuFromHost.0, | |
GpuFromHost.0, TensorConstant{0.0}) | |
6.7% 88.1% 0.000s 2.38e-05s 1 1 GpuFromHost(xx) | |
6.3% 94.3% 0.000s 2.22e-05s 1 4 HostFromGpu(GpuGemv{no_inplace}.0) | |
5.4% 99.7% 0.000s 1.91e-05s 1 0 GpuFromHost(yy) | |
0.3% 100.0% 0.000s 9.54e-07s 1 0 <theano.sandbox.cuda.dnn.DnnVersion object at 0x7f1a0e5ba550>() | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
ubuntu@ip-10-0-0-156:~/nmt$ gist /tmp/dbg.txt | |
Error: Cannot gist empty files | |
ubuntu@ip-10-0-0-156:~/nmt$ gist /tmp/dbg.txt | |
Error: Cannot gist empty files | |
ubuntu@ip-10-0-0-156:~/nmt$ CUDA_LAUNCH_BLOCKING=1 python ./trails/lstm_encdec/train.py /home/ubuntu/data/pickles/remt1.v80k_40k.unk | |
pos.b80.trun40.rev.pack --valid /home/ubuntu/data/pickles/remt1.v80k_40k.unkpos.b80.trun40.rev.valid.pack --source_size 80000 --targ | |
et_size 40001 --hidden_size 1024 --approx --word_embed 1000 --arch one_layer_search --encoder_mask --optimizer sgd --lr 0.01 --train | |
_size 12500 --predict | |
Using gpu device 0: GRID K520 | |
INFO:deepy.networks.network:deepy version = 0.1.7 | |
INFO:deepy.layers.layer:create weight W_embed: (80000, 1000) | |
INFO:deepy.layers.layer:create weight W_embed: (40001, 1000) | |
INFO:deepy.layers.layer:create weight W_h: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_h: (1024,) | |
INFO:deepy.layers.layer:create weight W_i: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_h: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_h: (1024,) | |
INFO:deepy.layers.layer:create weight W_i: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_h: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_h: (1024,) | |
INFO:deepy.layers.layer:create weight W_i: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_i2: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_ua: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_wa: (1024, 1024) | |
INFO:deepy.layers.layer:create weight W_va: (1024,) | |
INFO:deepy.layers.layer:create weight W_dense4: (1024, 600) | |
INFO:deepy.layers.layer:create bias B_dense4: (600,) | |
INFO:deepy.layers.layer:create weight W_dense5: (600, 40001) | |
INFO:deepy.layers.layer:create bias B_dense5: (40001,) | |
INFO:deepy.dataset.ondisk_dataset:Cache on memory | |
INFO:deepy.trainers.trainers:changing optimization method to 'SGD' | |
INFO:deepy.networks.network:network inputs: x | |
INFO:deepy.networks.network:network targets: y mask | |
INFO:deepy.networks.network:network parameters: W_embed W_embed W_h B_h W_i W_h B_h W_i W_h B_h W_i W_i2 W_va W_wa W_ua W_dense4 B_d | |
ense4 W_dense5 B_dense5 | |
INFO:deepy.networks.network:parameter count: 156121305 | |
INFO:deepy.trainers.trainers:monitor list: J | |
INFO:deepy.trainers.trainers:compile evaluation function | |
/usr/local/lib/python2.7/dist-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may i | |
ndicate binary incompatibility | |
from scan_perform.scan_perform import * | |
INFO:deepy.trainers.trainers:compiling SGDTrainer learning function | |
INFO:deepy.trainers.optimize:optimize method=SGD parameters=[W_embed, W_embed, W_h, B_h, W_i, W_h, B_h, W_i, W_h, B_h, W_i, W_i2, W_ | |
va, W_wa, W_ua, W_dense4, B_dense4, W_dense5, B_dense5] | |
INFO:deepy.trainers.optimize:ada_family_core: [('gparams', [Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{s | |
witch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no | |
_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace | |
}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Ele | |
mwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0]), ('learnin | |
g_rate', learning_rate), ('eps', 1e-06), ('beta', 0.0), ('params', [W_embed, W_embed, W_h, B_h, W_i, W_h, B_h, W_i, W_h, B_h, W_i, W | |
_i2, W_va, W_wa, W_ua, W_dense4, B_dense4, W_dense5, B_dense5]), ('rho', 0.95), ('gsum_regularization', 0.0001), ('method', 'SGD')] | |
INFO:deepy.trainers.trainers:Added 0 free parameters for optimization | |
INFO:deepy.trainers.trainers:network updates: | |
INFO:deepy.trainers.trainers:learning updates: W_embed W_embed W_h B_h W_i W_h B_h W_i W_h B_h W_i W_i2 W_va W_wa W_ua W_dense4 B_de | |
nse4 W_dense5 B_dense5 | |
[2] > /home/ubuntu/deepy/deepy/trainers/trainers.py(321)__init__() | |
-> theano.printing.debugprint(self.learning_func, open("/tmp/dbg.txt", "w")) | |
(Pdb++) theano.printing.debugprint(self.learning_func, open("/tmp/dbg.txt", "w")) | |
Timing Info | |
----------- | |
--> <time> <% time> - <total time> <% total time>' | |
<time> computation time for this node | |
<% time> fraction of total computation time for this node | |
<total time> time for this node + total times for this node's ancestors | |
<% total time> total time for this node over total computation time | |
N.B.: | |
* Times include the node time and the function overhead. | |
* <total time> and <% total time> may over-count computation times | |
if inputs to a node share a common ancestor and should be viewed as a | |
loose upper bound. Their intended use is to help rule out potential nodes | |
to remove when optimizing a graph because their <total time> is very low. | |
HostFromGpu [@A] '' 559 | |
*** TypeError: unsupported operand type(s) for -: 'file' and 'int' | |
(Pdb++) theano.printing.debugprint(self.learning_func, file=open("/tmp/dbg.txt", "w")) | |
<open file '/tmp/dbg.txt', mode 'w' at 0x7fcdfe346ed0> | |
(Pdb++) | |
Traceback (most recent call last): | |
File "./trails/lstm_encdec/train.py", line 217, in <module> | |
trainer = SGDTrainer(model, training_config) | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 341, in __init__ | |
super(SGDTrainer, self).__init__(network, config, "SGD") | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 321, in __init__ | |
theano.printing.debugprint(self.learning_func, open("/tmp/dbg.txt", "w")) | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 321, in __init__ | |
theano.printing.debugprint(self.learning_func, open("/tmp/dbg.txt", "w")) | |
File "/usr/lib/python2.7/bdb.py", line 49, in trace_dispatch | |
return self.dispatch_line(frame) | |
File "/usr/lib/python2.7/bdb.py", line 68, in dispatch_line | |
if self.quitting: raise BdbQuit | |
bdb.BdbQuit | |
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:73 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 7.403779e+00s | |
Number of Apply nodes: 232 | |
Theano Optimizer time: 6.725634e+00s | |
Theano validate time: 9.744573e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 6.389041e-01s | |
Import time 7.395124e-02s | |
Time in all call to theano.grad() 1.065451e+00s | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/dnn.py:206 | |
Time in 1 calls to Function.__call__: 2.288818e-05s | |
Time in Function.fn.__call__: 7.152557e-06s (31.250%) | |
Total compile time: 3.538999e+00s | |
Number of Apply nodes: 1 | |
Theano Optimizer time: 2.217293e-05s | |
Theano validate time: 0.000000e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.532910e+00s | |
Import time 1.065016e-03s | |
Time in all call to theano.grad() 1.065451e+00s | |
No execution time accumulated (hint: try config profiling.time_thunks=1) | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/tensor/blas_c.py:733 | |
Time in 1 calls to Function.__call__: 6.258488e-04s | |
Time in Function.fn.__call__: 5.919933e-04s (94.590%) | |
Time in thunks: 5.378723e-04s (85.943%) | |
Total compile time: 1.489615e-02s | |
Number of Apply nodes: 5 | |
Theano Optimizer time: 5.370140e-03s | |
Theano validate time: 1.072884e-04s | |
Theano Linker time (includes C, CUDA code generation/compiling): 4.125834e-03s | |
Import time 3.008842e-04s | |
Time in all call to theano.grad() 1.065451e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
76.6% 76.6% 0.000s 1.37e-04s C 3 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
19.3% 95.9% 0.000s 1.04e-04s C 1 1 theano.sandbox.cuda.blas.GpuGemv | |
4.1% 100.0% 0.000s 2.22e-05s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
76.6% 76.6% 0.000s 1.37e-04s C 3 3 GpuFromHost | |
19.3% 95.9% 0.000s 1.04e-04s C 1 1 GpuGemv{no_inplace} | |
4.1% 100.0% 0.000s 2.22e-05s C 1 1 HostFromGpu | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
40.2% 40.2% 0.000s 2.16e-04s 1 1 GpuFromHost(xx) | |
32.9% 73.0% 0.000s 1.77e-04s 1 2 GpuFromHost(aa) | |
19.3% 92.4% 0.000s 1.04e-04s 1 3 GpuGemv{no_inplace}(GpuFromHost.0, TensorConstant{1.0}, GpuFromHost.0, | |
GpuFromHost.0, TensorConstant{0.0}) | |
4.1% 96.5% 0.000s 2.22e-05s 1 4 HostFromGpu(GpuGemv{no_inplace}.0) | |
3.5% 100.0% 0.000s 1.88e-05s 1 0 GpuFromHost(yy) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:319 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.812242e+01s | |
Number of Apply nodes: 730 | |
Theano Optimizer time: 2.564678e+01s | |
Theano validate time: 3.940237e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 2.356268e+00s | |
Import time 3.264451e-02s | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Time in all call to theano.grad() 1.065451e+00s | |
Function profiling | |
================== | |
Message: Sum of all(4) printed profiles at exit excluding Scan op profile. | |
Time in 2 calls to Function.__call__: 6.487370e-04s | |
Time in Function.fn.__call__: 5.991459e-04s (92.356%) | |
Time in thunks: 5.378723e-04s (82.911%) | |
Total compile time: 3.908009e+01s | |
Number of Apply nodes: 232 | |
Theano Optimizer time: 3.237780e+01s | |
Theano validate time: 4.915767e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 6.532208e+00s | |
Import time 1.079617e-01s | |
Time in all call to theano.grad() 1.065451e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
76.6% 76.6% 0.000s 1.37e-04s C 3 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
19.3% 95.9% 0.000s 1.04e-04s C 1 1 theano.sandbox.cuda.blas.GpuGemv | |
4.1% 100.0% 0.000s 2.22e-05s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
0.0% 100.0% 0.000s 0.00e+00s C 1 1 theano.sandbox.cuda.dnn.DnnVersion | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
76.6% 76.6% 0.000s 1.37e-04s C 3 3 GpuFromHost | |
19.3% 95.9% 0.000s 1.04e-04s C 1 1 GpuGemv{no_inplace} | |
4.1% 100.0% 0.000s 2.22e-05s C 1 1 HostFromGpu | |
0.0% 100.0% 0.000s 0.00e+00s C 1 1 <theano.sandbox.cuda.dnn.DnnVersion object at 0x7fce106b0550 | |
> | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
40.2% 40.2% 0.000s 2.16e-04s 1 1 GpuFromHost(xx) | |
32.9% 73.0% 0.000s 1.77e-04s 1 2 GpuFromHost(aa) | |
19.3% 92.4% 0.000s 1.04e-04s 1 3 GpuGemv{no_inplace}(GpuFromHost.0, TensorConstant{1.0}, GpuFromHost.0, | |
GpuFromHost.0, TensorConstant{0.0}) | |
4.1% 96.5% 0.000s 2.22e-05s 1 4 HostFromGpu(GpuGemv{no_inplace}.0) | |
3.5% 100.0% 0.000s 1.88e-05s 1 0 GpuFromHost(yy) | |
0.0% 100.0% 0.000s 0.00e+00s 1 0 <theano.sandbox.cuda.dnn.DnnVersion object at 0x7fce106b0550>() | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
ubuntu@ip-10-0-0-156:~/nmt$ gist /tmp/dbg.txt | |
https://gist.github.com/a0ba5d0f0f7097c1b3ce | |
ubuntu@ip-10-0-0-156:~/nmt$ CUDA_LAUNCH_BLOCKING=1 python ./trails/lstm_encdec/train.py /home/ubuntu/data/pickles/remt1.v80k_40k.unk | |
pos.b80.trun40.rev.pack --valid /home/ubuntu/data/pickles/remt1.v80k_40k.unkpos.b80.trun40.rev.valid.pack --source_size 80000 --targ | |
et_size 40001 --hidden_size 1024 --approx --word_embed 1000 --arch lstm_one_layer_search --encoder_mask --optimizer sgd --lr 0.01 -- | |
train_size 12500 --predict /home/ubuntu/data/models/lstm_sch_1lay_v80k40k_1024h_trun40_emb_msk_pred_adam.gz | |
Using gpu device 0: GRID K520 | |
usage: train.py [-h] [--valid VALID] [--save SAVE] [--load LOAD] | |
[--hidden_size HIDDEN_SIZE] [--source_size SOURCE_SIZE] | |
[--target_size TARGET_SIZE] [--train_size TRAIN_SIZE] | |
[--arch ARCH] [--data_transmit] [--iter_offset ITER_OFFSET] | |
[--approx] [--sample] [--target_vocab TARGET_VOCAB] [--lr LR] | |
[--gradient_clip GRADIENT_CLIP] [--word_embed WORD_EMBED] | |
[--encoder_mask] [--optimizer OPTIMIZER] [--predict] | |
[--avoid_compute_embed_norm] | |
data | |
train.py: error: unrecognized arguments: /home/ubuntu/data/models/lstm_sch_1lay_v80k40k_1024h_trun40_emb_msk_pred_adam.gz | |
ubuntu@ip-10-0-0-156:~/nmt$ CUDA_LAUNCH_BLOCKING=1 python ./trails/lstm_encdec/train.py /home/ubuntu/data/pickles/remt1.v80k_40k.unk | |
pos.b80.trun40.rev.pack --valid /home/ubuntu/data/pickles/remt1.v80k_40k.unkpos.b80.trun40.rev.valid.pack --source_size 80000 --targ | |
et_size 40001 --hidden_size 1024 --approx --word_embed 1000 --arch lstm_one_layer_search --encoder_mask --optimizer sgd --lr 0.01 -- | |
train_size 12500 --predict /home/ubuntu/data/models/lstm_sch_1lay_v80k40k_1024h_trun40_emb_msk_p^C | |
ubuntu@ip-10-0-0-156:~/nmt$ ^CDA_LAUNCH_BLOCKING=1 python ./trails/lstm_encdec/train.py /home/ubuntu/data/pickles/remt1.v80k_40k.unk | |
pos.b80.trun40.rev.pack --valid /home/ubuntu/data/pickles/remt1.v80k_40k.unkpos.b80.trun40.rev.valid.pack --source_size 80000 --targ | |
et_size 40001 --hidden_size 1024 --approx --word_embed 1000 --arch one_layer_search --encoder_mask --optimizer sgd --lr 0.01 --train | |
_size 12500 --predict | |
ubuntu@ip-10-0-0-156:~/nmt$ CUDA_LAUNCH_BLOCKING=1 python ./trails/lstm_encdec/train.py /home/ubuntu/data/pickles/remt1.v80k_40k.unk | |
pos.b80.trun40.rev.pack --valid /home/ubuntu/data/pickles/remt1.v80k_40k.unkpos.b80.trun40.rev.valid.pack --source_size 80000 --targ | |
et_size 40001 --hidden_size 1024 --approx --word_embed 1000 --arch lstm_one_layer_search --encoder_mask --optimizer sgd --lr 0.01 -- | |
train_size 12500 --predict | |
Using gpu device 0: GRID K520 | |
^CTraceback (most recent call last): | |
File "./trails/lstm_encdec/train.py", line 10, in <module> | |
from deepy import * | |
File "/home/ubuntu/deepy/deepy/__init__.py", line 6, in <module> | |
from conf import * | |
File "/home/ubuntu/deepy/deepy/conf/__init__.py", line 4, in <module> | |
from nn_config import NetworkConfig | |
File "/home/ubuntu/deepy/deepy/conf/nn_config.py", line 4, in <module> | |
from deepy.utils import UniformInitializer | |
File "/home/ubuntu/deepy/deepy/utils/__init__.py", line 4, in <module> | |
from functions import * | |
File "/home/ubuntu/deepy/deepy/utils/functions.py", line 5, in <module> | |
import theano | |
File "/usr/local/lib/python2.7/dist-packages/theano/__init__.py", line 98, in <module> | |
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1() | |
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/tests/test_driver.py", line 32, in test_nvidia_driver1 | |
profile=False) | |
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function.py", line 286, in function | |
output_keys=output_keys) | |
File "/usr/local/lib/python2.7/dist-packages/theano/compile/pfunc.py", line 511, in pfunc | |
on_unused_input=on_unused_input, output_keys=output_keys) | |
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function_module.py", line 1483, in orig_function | |
output_keys = output_keys).create( | |
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function_module.py", line 1113, in __init__ | |
theano.gof.cc.get_module_cache().refresh() | |
File "/usr/local/lib/python2.7/dist-packages/theano/gof/cc.py", line 85, in get_module_cache | |
return cmodule.get_module_cache(config.compiledir, init_args=init_args) | |
File "/usr/local/lib/python2.7/dist-packages/theano/gof/cmodule.py", line 1389, in get_module_cache | |
_module_cache = ModuleCache(dirname, **init_args) | |
File "/usr/local/lib/python2.7/dist-packages/theano/gof/cmodule.py", line 616, in __init__ | |
self.refresh() | |
File "/usr/local/lib/python2.7/dist-packages/theano/gof/cmodule.py", line 723, in refresh | |
key_data = cPickle.load(f) | |
File "/usr/local/lib/python2.7/dist-packages/theano/scalar/basic.py", line 3318, in __setstate__ | |
self.init_fgraph() | |
File "/usr/local/lib/python2.7/dist-packages/theano/scalar/basic.py", line 3137, in init_fgraph | |
gof.MergeOptimizer().optimize(fgraph) | |
File "/usr/local/lib/python2.7/dist-packages/theano/gof/opt.py", line 77, in optimize | |
self.add_requirements(fgraph) | |
File "/usr/local/lib/python2.7/dist-packages/theano/gof/opt.py", line 580, in add_requirements | |
def add_requirements(self, fgraph): | |
KeyboardInterrupt | |
ubuntu@ip-10-0-0-156:~/nmt$ CUDA_LAUNCH_BLOCKING=1 python ./trails/lstm_encdec/train.py /home/ubuntu/data/pickles/remt1.v80k_40k.unk | |
pos.b80.trun40.rev.pack --valid /home/ubuntu/data/pickles/remt1.v80k_40k.unkpos.b80.trun40.rev.valid.pack --source_size 80000 --targ | |
et_size 40001 --hidden_size 1024 --approx --word_embed 1000 --arch lstm_one_layer_search --encoder_mask --optimizer sgd --lr 0.01 -- | |
train_size 12500 --predict --avoid_compute_embed_norm | |
Using gpu device 0: GRID K520 | |
INFO:deepy.networks.network:deepy version = 0.1.7 | |
INFO:deepy.layers.layer:create weight W_embed: (80000, 1000) | |
INFO:deepy.layers.layer:create weight W_embed: (40001, 1000) | |
INFO:deepy.layers.layer:create weight W_wi: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_ui: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_i: (1024,) | |
INFO:deepy.layers.layer:create weight W_wf: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_uf: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_f: (1024,) | |
INFO:deepy.layers.layer:create weight W_wc: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_uc: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_c: (1024,) | |
INFO:deepy.layers.layer:create weight W_wo: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_uo: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_o: (1024,) | |
INFO:deepy.layers.layer:create weight W_wi: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_ui: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_i: (1024,) | |
INFO:deepy.layers.layer:create weight W_wf: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_uf: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_f: (1024,) | |
INFO:deepy.layers.layer:create weight W_wc: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_uc: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_c: (1024,) | |
INFO:deepy.layers.layer:create weight W_wo: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_uo: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_o: (1024,) | |
INFO:deepy.layers.layer:create weight W_wi: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_ui: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_i: (1024,) | |
INFO:deepy.layers.layer:create weight W_wf: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_uf: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_f: (1024,) | |
INFO:deepy.layers.layer:create weight W_wc: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_uc: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_c: (1024,) | |
INFO:deepy.layers.layer:create weight W_wo: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_uo: (1024, 1024) | |
INFO:deepy.layers.layer:create bias B_o: (1024,) | |
INFO:deepy.layers.layer:create weight W_wi2: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_wf2: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_wc2: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_wo2: (1000, 1024) | |
INFO:deepy.layers.layer:create weight W_ua: (2048, 1024) | |
INFO:deepy.layers.layer:create weight W_wa: (1024, 1024) | |
INFO:deepy.layers.layer:create weight W_va: (1024,) | |
INFO:deepy.layers.layer:create weight W_dense4: (1024, 600) | |
INFO:deepy.layers.layer:create bias B_dense4: (600,) | |
INFO:deepy.layers.layer:create weight W_dense5: (600, 40001) | |
INFO:deepy.layers.layer:create bias B_dense5: (40001,) | |
INFO:deepy.dataset.ondisk_dataset:Cache on memory | |
INFO:deepy.trainers.trainers:changing optimization method to 'SGD' | |
INFO:deepy.networks.network:network inputs: x | |
INFO:deepy.networks.network:network targets: y mask | |
INFO:deepy.networks.network:network parameters: W_embed W_embed W_wi W_ui B_i W_wc W_uc B_c W_wf W_uf B_f W_wo W_uo B_o W_wi W_ui B_ | |
i W_wc W_uc B_c W_wf W_uf B_f W_wo W_uo B_o W_wi W_ui B_i W_wc W_uc B_c W_wf W_uf B_f W_wo W_uo B_o W_wi2 W_wf2 W_wc2 W_wo2 W_va W_w | |
a W_ua W_dense4 B_dense4 W_dense5 B_dense5 | |
INFO:deepy.networks.network:parameter count: 181075161 | |
INFO:deepy.trainers.trainers:monitor list: J | |
INFO:deepy.trainers.trainers:compile evaluation function | |
/usr/local/lib/python2.7/dist-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may i | |
ndicate binary incompatibility | |
from scan_perform.scan_perform import * | |
INFO:deepy.trainers.trainers:compiling SGDTrainer learning function | |
INFO:deepy.trainers.optimize:optimize method=SGD parameters=[W_embed, W_embed, W_wi, W_ui, B_i, W_wc, W_uc, B_c, W_wf, W_uf, B_f, W_ | |
wo, W_uo, B_o, W_wi, W_ui, B_i, W_wc, W_uc, B_c, W_wf, W_uf, B_f, W_wo, W_uo, B_o, W_wi, W_ui, B_i, W_wc, W_uc, B_c, W_wf, W_uf, B_f | |
, W_wo, W_uo, B_o, W_wi2, W_wf2, W_wc2, W_wo2, W_va, W_wa, W_ua, W_dense4, B_dense4, W_dense5, B_dense5] | |
INFO:deepy.trainers.optimize:ada_family_core: [('gparams', [Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{s | |
witch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no | |
_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace | |
}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Ele | |
mwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{sw | |
itch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_ | |
inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace} | |
.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elem | |
wise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{swi | |
tch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_i | |
nplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}. | |
0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0, Elemwise{switch,no_inplace}.0]), ('l | |
earning_rate', learning_rate), ('eps', 1e-06), ('beta', 0.0), ('params', [W_embed, W_embed, W_wi, W_ui, B_i, W_wc, W_uc, B_c, W_wf, | |
W_uf, B_f, W_wo, W_uo, B_o, W_wi, W_ui, B_i, W_wc, W_uc, B_c, W_wf, W_uf, B_f, W_wo, W_uo, B_o, W_wi, W_ui, B_i, W_wc, W_uc, B_c, W_ | |
wf, W_uf, B_f, W_wo, W_uo, B_o, W_wi2, W_wf2, W_wc2, W_wo2, W_va, W_wa, W_ua, W_dense4, B_dense4, W_dense5, B_dense5]), ('rho', 0.95 | |
), ('gsum_regularization', 0.0001), ('method', 'SGD')] | |
INFO:deepy.trainers.trainers:Added 0 free parameters for optimization | |
INFO:deepy.trainers.trainers:network updates: | |
INFO:deepy.trainers.trainers:learning updates: W_embed W_embed W_wi W_ui B_i W_wc W_uc B_c W_wf W_uf B_f W_wo W_uo B_o W_wi W_ui B_i | |
W_wc W_uc B_c W_wf W_uf B_f W_wo W_uo B_o W_wi W_ui B_i W_wc W_uc B_c W_wf W_uf B_f W_wo W_uo B_o W_wi2 W_wf2 W_wc2 W_wo2 W_va W_wa | |
W_ua W_dense4 B_dense4 W_dense5 B_dense5 | |
INFO:deepy.networks.network:saving parameters to /tmp/default_model.gz | |
INFO:deepy.utils.train_logger:Save training log to /tmp/default_model.log | |
INFO:deepy.trainers.trainers:valid (iter=1) J=10.60 * | |
> 2%^CINFO:deepy.trainers.trainers:interrupted! | |
^CTraceback (most recent call last): | |
File "./trails/lstm_encdec/train.py", line 229, in <module> | |
controllers=[ScheduledLearningRateAnnealer(trainer, iter_start_halving=5 - args.iter_offset, max_iters=10 - args.iter_offset)]) | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 272, in run | |
for _ in self.train(train_set, valid_set=valid_set, test_set=test_set, train_size=train_size): | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 176, in train | |
self.set_params(*self.best_params) | |
File "/home/ubuntu/deepy/deepy/trainers/trainers.py", line 115, in set_params | |
param.set_value(target) | |
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/var.py", line 136, in set_value | |
self.container.value = value # this will copy a numpy ndarray | |
File "/usr/local/lib/python2.7/dist-packages/theano/gof/link.py", line 383, in __set__ | |
**kwargs) | |
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/type.py", line 94, in filter_inplace | |
strict, old_data) | |
KeyboardInterrupt | |
^C^C^CException KeyboardInterrupt in <module 'threading' from '/usr/lib/python2.7/threading.pyc'> ignored | |
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:73 | |
Time in 16 calls to Function.__call__: 9.254583e+00s | |
Time in Function.fn.__call__: 9.252455e+00s (99.977%) | |
Time in thunks: 9.208560e+00s (99.503%) | |
Total compile time: 1.111225e+01s | |
Number of Apply nodes: 344 | |
Theano Optimizer time: 9.731210e+00s | |
Theano validate time: 2.255249e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.302246e+00s | |
Import time 8.582520e-02s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
55.9% 55.9% 5.147s 1.07e-01s Py 48 3 theano.scan_module.scan_op.Scan | |
30.9% 86.8% 2.847s 1.62e-02s C 176 11 theano.sandbox.cuda.blas.GpuDot22 | |
6.9% 93.8% 0.640s 4.00e-02s C 16 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
4.9% 98.7% 0.452s 4.03e-03s C 112 7 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.5% 99.2% 0.047s 9.72e-05s Py 480 30 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.3% 99.4% 0.025s 1.57e-03s C 16 1 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.2% 99.7% 0.021s 4.35e-04s Py 48 3 theano.sandbox.cuda.basic_ops.GpuAdvancedSubtensor1 | |
0.1% 99.8% 0.013s 1.35e-04s C 96 6 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.007s 1.51e-04s C 48 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 99.9% 0.003s 2.00e-06s C 1696 106 theano.tensor.elemwise.Elemwise | |
0.0% 99.9% 0.001s 3.77e-05s C 32 2 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.0% 100.0% 0.001s 1.26e-06s C 704 44 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.001s 1.42e-06s C 576 36 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.0% 100.0% 0.001s 1.39e-06s C 496 31 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.001s 3.24e-05s Py 16 1 theano.tensor.basic.ARange | |
0.0% 100.0% 0.000s 2.26e-06s C 208 13 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
0.0% 100.0% 0.000s 1.14e-05s Py 32 2 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.0% 100.0% 0.000s 7.79e-07s C 448 28 theano.tensor.basic.ScalarFromTensor | |
0.0% 100.0% 0.000s 2.14e-05s C 16 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
0.0% 100.0% 0.000s 1.12e-05s C 16 1 theano.tensor.basic.Join | |
... (remaining 6 Classes account for 0.01%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
42.3% 42.3% 3.892s 2.43e-01s Py 16 1 forall_inplace,gpu,scan_fn} | |
30.9% 73.2% 2.847s 1.62e-02s C 176 11 GpuDot22 | |
13.6% 86.8% 1.255s 3.92e-02s Py 32 2 forall_inplace,gpu,scan_fn} | |
6.9% 93.8% 0.640s 4.00e-02s C 16 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
2.5% 96.2% 0.226s 4.72e-03s C 48 3 GpuElemwise{Mul}[(0, 0)] | |
2.4% 98.7% 0.225s 7.02e-03s C 32 2 GpuElemwise{Add}[(0, 0)] | |
0.4% 99.1% 0.040s 1.47e-04s Py 272 17 GpuReshape{2} | |
0.3% 99.4% 0.025s 1.57e-03s C 16 1 GpuJoin | |
0.2% 99.6% 0.021s 4.35e-04s Py 48 3 GpuAdvancedSubtensor1 | |
0.1% 99.7% 0.013s 1.35e-04s C 96 6 GpuAlloc{memset_0=True} | |
0.1% 99.8% 0.007s 1.51e-04s C 48 3 GpuFromHost | |
0.1% 99.9% 0.007s 3.19e-05s Py 208 13 GpuReshape{3} | |
0.0% 99.9% 0.001s 3.77e-05s C 32 2 GpuCAReduce{add}{1} | |
0.0% 99.9% 0.001s 1.39e-06s C 496 31 MakeVector | |
0.0% 99.9% 0.001s 5.96e-06s C 112 7 Elemwise{sub,no_inplace} | |
0.0% 99.9% 0.001s 3.97e-05s C 16 1 GpuElemwise{Composite{(log(clip(i0, i1, i2)) * i3)}}[(0, 0)] | |
0.0% 99.9% 0.001s 3.24e-05s Py 16 1 ARange | |
0.0% 99.9% 0.000s 1.09e-06s C 400 25 Shape_i{0} | |
0.0% 99.9% 0.000s 1.50e-06s C 272 17 Shape_i{1} | |
0.0% 99.9% 0.000s 2.30e-05s C 16 1 GpuElemwise{Composite{((-i0) / i1)}}[(0, 0)] | |
... (remaining 55 Ops account for 0.06%(0.01s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
42.3% 42.3% 3.892s 2.43e-01s 16 309 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
24.0% 66.3% 2.214s 1.38e-01s 16 324 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
7.2% 73.5% 0.661s 4.13e-02s 16 298 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSub | |
tensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{ | |
memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuf | |
6.9% 80.4% 0.640s 4.00e-02s 16 336 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(Gp | |
uContiguous.0) | |
6.4% 86.9% 0.594s 3.71e-02s 16 297 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSu | |
btensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
loc{memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDim | |
2.4% 89.3% 0.221s 1.38e-02s 16 329 GpuElemwise{Add}[(0, 0)](GpuReshape{3}.0, GpuDimShuffle{x,x,0}.0) | |
2.3% 91.6% 0.215s 1.34e-02s 16 333 GpuElemwise{Mul}[(0, 0)](GpuReshape{2}.0, GpuDimShuffle{0,x}.0) | |
1.2% 92.8% 0.112s 7.01e-03s 16 307 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.7% 93.5% 0.061s 3.78e-03s 16 313 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 94.1% 0.058s 3.61e-03s 16 148 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 94.7% 0.058s 3.60e-03s 16 151 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 95.4% 0.058s 3.59e-03s 16 145 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 96.0% 0.057s 3.59e-03s 16 146 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 96.6% 0.057s 3.59e-03s 16 149 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 97.2% 0.057s 3.59e-03s 16 147 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 97.9% 0.057s 3.59e-03s 16 152 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 98.5% 0.057s 3.59e-03s 16 150 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.3% 98.8% 0.025s 1.57e-03s 16 304 GpuJoin(TensorConstant{2}, GpuDimShuffle{1,0,2}.0, GpuSubtensor{::, ::i | |
nt64}.0) | |
0.2% 99.0% 0.021s 1.29e-03s 16 312 GpuReshape{2}(GpuDimShuffle{1,0,2}.0, MakeVector.0) | |
0.2% 99.1% 0.014s 8.71e-04s 16 142 GpuReshape{2}(GpuDimShuffle{1,0,2}.0, MakeVector.0) | |
... (remaining 324 Apply instances account for 0.86%(0.08s) of the runtime) | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/dnn.py:206 | |
Time in 1 calls to Function.__call__: 2.312660e-05s | |
Time in Function.fn.__call__: 6.914139e-06s (29.897%) | |
Total compile time: 3.566572e+00s | |
Number of Apply nodes: 1 | |
Theano Optimizer time: 2.098083e-05s | |
Theano validate time: 0.000000e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.560112e+00s | |
Import time 1.200914e-03s | |
Time in all call to theano.grad() 8.831870e+00s | |
No execution time accumulated (hint: try config profiling.time_thunks=1) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 16 calls of the op (for a total of 381 steps) 5.888572e-01s | |
Total time spent in calling the VM 5.220029e-01s (88.647%) | |
Total overhead (computing slices..) 6.685424e-02s (11.353%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
84.0% 84.0% 0.434s 2.85e-04s C 1524 4 theano.sandbox.cuda.blas.GpuGemm | |
10.2% 94.2% 0.053s 4.61e-05s C 1143 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.4% 99.5% 0.028s 3.63e-05s C 762 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.002s 3.15e-06s C 762 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
84.0% 84.0% 0.434s 2.85e-04s C 1524 4 GpuGemm{no_inplace} | |
5.4% 89.3% 0.028s 3.63e-05s C 762 2 GpuFromHost | |
3.8% 93.1% 0.020s 5.16e-05s C 381 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
3.7% 96.9% 0.019s 5.07e-05s C 381 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
2.7% 99.5% 0.014s 3.61e-05s C 381 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.002s 3.15e-06s C 762 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
21.6% 21.6% 0.112s 2.93e-04s 381 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
21.0% 42.6% 0.108s 2.84e-04s 381 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
20.7% 63.3% 0.107s 2.81e-04s 381 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
20.7% 84.0% 0.107s 2.81e-04s 381 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
3.8% 87.8% 0.020s 5.16e-05s 381 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.7% 91.5% 0.019s 5.07e-05s 381 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.0% 94.5% 0.016s 4.09e-05s 381 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.7% 97.2% 0.014s 3.61e-05s 381 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.3% 99.5% 0.012s 3.18e-05s 381 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.002s 3.97e-06s 381 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.001s 2.32e-06s 381 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 16 calls of the op (for a total of 381 steps) 6.570778e-01s | |
Total time spent in calling the VM 5.623438e-01s (85.583%) | |
Total overhead (computing slices..) 9.473395e-02s (14.417%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
78.6% 78.6% 0.438s 2.87e-04s C 1524 4 theano.sandbox.cuda.blas.GpuGemm | |
15.6% 94.3% 0.087s 7.60e-05s C 1143 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.3% 99.5% 0.029s 3.84e-05s C 762 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.003s 3.53e-06s C 762 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
78.6% 78.6% 0.438s 2.87e-04s C 1524 4 GpuGemm{no_inplace} | |
9.4% 88.0% 0.052s 1.37e-04s C 381 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
5.3% 93.3% 0.029s 3.84e-05s C 762 2 GpuFromHost | |
3.6% 96.9% 0.020s 5.30e-05s C 381 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
2.6% 99.5% 0.015s 3.83e-05s C 381 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.003s 3.53e-06s C 762 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
20.4% 20.4% 0.114s 2.99e-04s 381 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
19.6% 40.1% 0.109s 2.87e-04s 381 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
19.3% 59.4% 0.107s 2.82e-04s 381 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
19.3% 78.6% 0.107s 2.81e-04s 381 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
9.4% 88.0% 0.052s 1.37e-04s 381 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.6% 91.6% 0.020s 5.30e-05s 381 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.0% 94.6% 0.017s 4.37e-05s 381 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.6% 97.2% 0.015s 3.83e-05s 381 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.3% 99.5% 0.013s 3.31e-05s 381 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.002s 4.43e-06s 381 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.001s 2.63e-06s 381 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 16 calls of the op (for a total of 632 steps) 3.881260e+00s | |
Total time spent in calling the VM 3.777264e+00s (97.321%) | |
Total overhead (computing slices..) 1.039963e-01s (2.679%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
36.8% 36.8% 1.384s 4.38e-04s C 3160 5 theano.sandbox.cuda.blas.GpuDot22 | |
35.1% 71.9% 1.318s 2.61e-04s C 5056 8 theano.sandbox.cuda.blas.GpuGemm | |
18.3% 90.2% 0.689s 1.82e-04s C 3792 6 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.2% 95.4% 0.195s 3.09e-04s C 632 1 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.8% 97.3% 0.069s 1.09e-04s C 632 1 theano.sandbox.cuda.blas.GpuGemv | |
0.9% 98.2% 0.035s 2.73e-05s Py 1264 2 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 98.9% 0.027s 4.31e-05s C 632 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 99.4% 0.019s 2.96e-05s C 632 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
0.4% 99.8% 0.014s 2.27e-05s C 632 1 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.003s 7.27e-07s C 4424 7 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.1% 99.9% 0.003s 2.24e-06s C 1264 2 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.001s 5.34e-07s C 2528 4 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.001s 4.34e-07s C 1264 2 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.000s 5.48e-07s C 632 1 theano.sandbox.cuda.basic_ops.GpuContiguous | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
36.8% 36.8% 1.384s 4.38e-04s C 3160 5 GpuDot22 | |
35.1% 71.9% 1.318s 2.61e-04s C 5056 8 GpuGemm{inplace} | |
7.6% 79.4% 0.284s 4.50e-04s C 632 1 GpuElemwise{mul,no_inplace} | |
5.3% 84.7% 0.198s 3.13e-04s C 632 1 GpuElemwise{add,no_inplace} | |
5.2% 89.9% 0.195s 3.09e-04s C 632 1 GpuCAReduce{add}{0,1,0} | |
3.4% 93.3% 0.129s 2.04e-04s C 632 1 GpuElemwise{Tanh}[(0, 0)] | |
1.8% 95.2% 0.069s 1.09e-04s C 632 1 GpuGemv{inplace} | |
1.1% 96.2% 0.040s 6.33e-05s C 632 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))},no_inplace} | |
0.9% 97.2% 0.035s 2.73e-05s Py 1264 2 GpuReshape{2} | |
0.7% 97.9% 0.027s 4.31e-05s C 632 1 GpuFromHost | |
0.7% 98.6% 0.027s 4.28e-05s C 632 1 GpuElemwise{Composite{(scalar_sigmoid((i0 + i1)) * tanh(i2)) | |
},no_inplace} | |
0.5% 99.1% 0.019s 2.96e-05s C 632 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
0.4% 99.5% 0.014s 2.27e-05s C 632 1 GpuAlloc{memset_0=True} | |
0.3% 99.8% 0.011s 1.79e-05s C 632 1 GpuElemwise{Mul}[(0, 0)] | |
0.1% 99.8% 0.002s 3.92e-06s C 632 1 Elemwise{Cast{float32}} | |
0.0% 99.9% 0.001s 6.19e-07s C 1264 2 Shape_i{0} | |
0.0% 99.9% 0.001s 4.50e-07s C 1264 2 Shape_i{1} | |
0.0% 99.9% 0.001s 4.34e-07s C 1264 2 MakeVector | |
0.0% 99.9% 0.001s 8.45e-07s C 632 1 GpuDimShuffle{0} | |
0.0% 99.9% 0.001s 8.12e-07s C 632 1 GpuDimShuffle{0,1,x,x} | |
... (remaining 7 Ops account for 0.08%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
8.2% 8.2% 0.309s 4.89e-04s 632 31 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wo_copy[cuda]) | |
8.0% 16.3% 0.302s 4.78e-04s 632 28 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wc_copy[cuda]) | |
8.0% 24.3% 0.301s 4.76e-04s 632 30 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wf_copy[cuda]) | |
8.0% 32.3% 0.301s 4.76e-04s 632 29 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wi_copy[cuda]) | |
7.6% 39.8% 0.284s 4.50e-04s 632 26 GpuElemwise{mul,no_inplace}(GpuDimShuffle{0,1,x}.0, <CudaNdarrayType(fl | |
oat32, 3D)>) | |
5.3% 45.1% 0.198s 3.13e-04s 632 11 GpuElemwise{add,no_inplace}(GpuDimShuffle{0,x,1}.0, GpuDimShuffle{0,1,2 | |
}.0) | |
5.2% 50.3% 0.195s 3.09e-04s 632 27 GpuCAReduce{add}{0,1,0}(GpuElemwise{mul,no_inplace}.0) | |
4.6% 54.8% 0.171s 2.71e-04s 632 3 GpuDot22(<CudaNdarrayType(float32, matrix)>, W_wa_copy[cuda]) | |
4.4% 59.2% 0.165s 2.61e-04s 632 39 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
4.4% 63.6% 0.165s 2.61e-04s 632 35 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wo2_copy[cuda], TensorConstant{1.0}) | |
4.4% 68.0% 0.165s 2.61e-04s 632 38 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
4.4% 72.4% 0.165s 2.61e-04s 632 36 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
4.4% 76.8% 0.165s 2.61e-04s 632 37 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
4.4% 81.2% 0.164s 2.60e-04s 632 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wf2_copy[cuda], TensorConstant{1.0}) | |
4.4% 85.5% 0.164s 2.60e-04s 632 32 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wc2_copy[cuda], TensorConstant{1.0}) | |
4.4% 89.9% 0.164s 2.60e-04s 632 33 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wi2_copy[cuda], TensorConstant{1.0}) | |
3.4% 93.3% 0.129s 2.04e-04s 632 15 GpuElemwise{Tanh}[(0, 0)](GpuReshape{2}.0) | |
1.8% 95.2% 0.069s 1.09e-04s 632 17 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, GpuEle | |
mwise{Tanh}[(0, 0)].0, GpuDimShuffle{0}.0, TensorConstant{0.0}) | |
1.1% 96.2% 0.040s 6.33e-05s 632 40 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0) | |
0.7% 97.0% 0.027s 4.31e-05s 632 9 GpuFromHost(Elemwise{Cast{float32}}.0) | |
... (remaining 22 Apply instances account for 3.04%(0.11s) of the runtime) | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/tensor/blas_c.py:733 | |
Time in 1 calls to Function.__call__: 2.198935e-03s | |
Time in Function.fn.__call__: 2.163887e-03s (98.406%) | |
Time in thunks: 2.101898e-03s (95.587%) | |
Total compile time: 1.613784e-02s | |
Number of Apply nodes: 5 | |
Theano Optimizer time: 5.366087e-03s | |
Theano validate time: 1.032352e-04s | |
Theano Linker time (includes C, CUDA code generation/compiling): 4.407883e-03s | |
Import time 3.850460e-04s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
93.7% 93.7% 0.002s 6.57e-04s C 3 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
5.2% 99.0% 0.000s 1.10e-04s C 1 1 theano.sandbox.cuda.blas.GpuGemv | |
1.0% 100.0% 0.000s 2.19e-05s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
93.7% 93.7% 0.002s 6.57e-04s C 3 3 GpuFromHost | |
5.2% 99.0% 0.000s 1.10e-04s C 1 1 GpuGemv{no_inplace} | |
1.0% 100.0% 0.000s 2.19e-05s C 1 1 HostFromGpu | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
81.6% 81.6% 0.002s 1.72e-03s 1 2 GpuFromHost(aa) | |
10.9% 92.6% 0.000s 2.30e-04s 1 0 GpuFromHost(yy) | |
5.2% 97.8% 0.000s 1.10e-04s 1 3 GpuGemv{no_inplace}(GpuFromHost.0, TensorConstant{1.0}, GpuFromHost.0, | |
GpuFromHost.0, TensorConstant{0.0}) | |
1.1% 99.0% 0.000s 2.41e-05s 1 1 GpuFromHost(xx) | |
1.0% 100.0% 0.000s 2.19e-05s 1 4 HostFromGpu(GpuGemv{no_inplace}.0) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:319 | |
Time in 297 calls to Function.__call__: 6.786967e+02s | |
Time in Function.fn.__call__: 6.786298e+02s (99.990%) | |
Time in thunks: 6.739271e+02s (99.297%) | |
Total compile time: 1.045786e+02s | |
Number of Apply nodes: 1348 | |
Theano Optimizer time: 8.847273e+01s | |
Theano validate time: 1.659854e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.573371e+01s | |
Import time 2.074945e-01s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
63.8% 63.8% 430.046s 2.41e-01s Py 1782 6 theano.scan_module.scan_op.Scan | |
19.8% 83.6% 133.139s 1.18e-02s C 11286 38 theano.sandbox.cuda.blas.GpuDot22 | |
6.6% 90.2% 44.408s 2.93e-03s C 15147 51 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
2.7% 92.8% 17.878s 8.85e-04s C 20196 68 theano.sandbox.cuda.basic_ops.GpuElemwise | |
1.5% 94.3% 10.271s 3.46e-02s C 297 1 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1 | |
1.5% 95.8% 9.947s 3.35e-02s C 297 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
1.4% 97.2% 9.632s 3.24e-02s C 297 1 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad | |
1.0% 98.2% 6.487s 4.37e-03s C 1485 5 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.7% 98.9% 4.421s 2.48e-04s C 17820 60 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.4% 99.3% 2.981s 1.43e-03s Py 2079 7 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.2% 99.5% 1.539s 7.40e-05s Py 20790 70 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.2% 99.7% 1.464s 1.90e-04s C 7722 26 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.0% 99.8% 0.317s 3.56e-04s Py 891 3 theano.sandbox.cuda.basic_ops.GpuAdvancedSubtensor1 | |
0.0% 99.8% 0.278s 9.37e-04s Py 297 1 theano.sandbox.cuda.basic_ops.GpuSplit | |
0.0% 99.9% 0.272s 1.81e-06s C 150579 507 theano.tensor.elemwise.Elemwise | |
0.0% 99.9% 0.222s 7.47e-04s C 297 1 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.0% 99.9% 0.162s 2.73e-04s C 594 2 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1_dev20 | |
0.0% 100.0% 0.150s 6.32e-05s C 2376 8 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 100.0% 0.067s 1.62e-06s C 41283 139 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.060s 2.83e-06s C 21384 72 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
... (remaining 13 Classes account for 0.03%(0.18s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
45.5% 45.5% 306.715s 1.03e+00s Py 297 1 forall_inplace,gpu,grad_of_scan_fn} | |
19.8% 65.3% 133.139s 1.18e-02s C 11286 38 GpuDot22 | |
9.2% 74.5% 61.915s 1.04e-01s Py 594 2 forall_inplace,gpu,grad_of_scan_fn} | |
7.9% 82.3% 53.129s 1.79e-01s Py 297 1 forall_inplace,gpu,scan_fn} | |
4.7% 87.1% 31.945s 3.36e-03s C 9504 32 GpuCAReduce{pre=sqr,red=add}{1,1} | |
1.8% 88.9% 12.223s 2.06e-02s C 594 2 GpuCAReduce{add}{1,1,0} | |
1.5% 90.4% 10.271s 3.46e-02s C 297 1 GpuAdvancedIncSubtensor1{inplace,inc} | |
1.5% 91.9% 9.947s 3.35e-02s C 297 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
1.4% 93.3% 9.632s 3.24e-02s C 297 1 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo | |
='accurate'} | |
1.2% 94.6% 8.287s 1.40e-02s Py 594 2 forall_inplace,gpu,scan_fn} | |
1.0% 95.5% 6.487s 4.37e-03s C 1485 5 GpuDot22Scalar | |
0.9% 96.4% 6.301s 4.82e-04s C 13068 44 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i | |
3, i4, ((i5 * i4) / i6))) * i7))}}[(0, 0)] | |
0.7% 97.1% 4.421s 2.48e-04s C 17820 60 GpuAlloc{memset_0=True} | |
0.5% 97.6% 3.673s 6.18e-03s C 594 2 GpuElemwise{add,no_inplace} | |
0.5% 98.2% 3.644s 4.09e-03s C 891 3 GpuElemwise{mul,no_inplace} | |
0.5% 98.7% 3.467s 5.84e-03s C 594 2 GpuElemwise{Mul}[(0, 0)] | |
0.4% 99.1% 2.974s 2.00e-03s Py 1485 5 GpuFlatten{2} | |
0.2% 99.3% 1.288s 9.64e-05s Py 13365 45 GpuReshape{2} | |
0.1% 99.4% 0.517s 3.48e-04s C 1485 5 GpuIncSubtensor{Inc;:int64:} | |
0.1% 99.5% 0.494s 1.39e-04s C 3564 12 GpuIncSubtensor{InplaceInc;int64::} | |
... (remaining 161 Ops account for 0.51%(3.46s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
45.5% 45.5% 306.715s 1.03e+00s 297 1063 forall_inplace,gpu,grad_of_scan_fn}(Shape_i{1}.0, GpuDimShuffle{0,2,1} | |
.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64: | |
int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=Tru | |
e}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
7.9% 53.4% 53.129s 1.79e-01s 297 959 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
6.4% 59.8% 42.972s 1.45e-01s 297 1044 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
5.8% 65.5% 38.873s 1.31e-01s 297 1136 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
5.1% 70.7% 34.667s 1.17e-01s 297 999 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
4.8% 75.5% 32.447s 1.09e-01s 297 1043 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
3.4% 78.9% 23.042s 7.76e-02s 297 1137 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
1.8% 80.8% 12.410s 4.18e-02s 297 1050 GpuCAReduce{pre=sqr,red=add}{1,1}(GpuDimShuffle{0,1}.0) | |
1.8% 82.5% 12.040s 4.05e-02s 297 1042 GpuCAReduce{add}{1,1,0}(GpuReshape{3}.0) | |
1.5% 84.1% 10.271s 3.46e-02s 297 1031 GpuAdvancedIncSubtensor1{inplace,inc}(GpuAlloc{memset_0=True}.0, GpuEl | |
emwise{Composite{((i0 * i1 * i2) / (i3 * i4))},no_inplace}.0, Elemwise{Composite{((i0 * i1) + i2)}}.0) | |
1.5% 85.5% 9.947s 3.35e-02s 297 1019 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(G | |
puContiguous.0) | |
1.4% 87.0% 9.632s 3.24e-02s 297 1037 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate | |
'}(GpuContiguous.0, GpuContiguous.0) | |
0.6% 87.6% 4.149s 1.40e-02s 297 881 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSub | |
tensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{ | |
memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuf | |
0.6% 88.2% 4.138s 1.39e-02s 297 906 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSu | |
btensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
loc{memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDim | |
0.5% 88.7% 3.584s 1.21e-02s 297 1004 GpuElemwise{add,no_inplace}(GpuReshape{3}.0, GpuDimShuffle{x,x,0}.0) | |
0.5% 89.2% 3.438s 1.16e-02s 297 1012 GpuElemwise{mul,no_inplace}(GpuReshape{2}.0, GpuDimShuffle{0,x}.0) | |
0.5% 89.7% 3.342s 1.13e-02s 297 1039 GpuElemwise{Mul}[(0, 0)](GpuDimShuffle{0,1}.0, GpuDimShuffle{0,x}.0) | |
0.4% 90.2% 2.833s 9.54e-03s 297 1346 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i3, i4, ((i5 | |
* i4) / i6))) * i7))}}[(0, 0)](W_embed, GpuFromHost.0, CudaNdarrayConstant{[[ 0.01]]}, GpuFromHost.0, GpuAdvancedIncSubtensor1_dev2 | |
0{inplace,inc}.0, CudaNdarrayConstant{[[ 3.]]}, GpuDimShuffle{x,x}.0, GpuDimShuffle{x,x}.0) | |
0.4% 90.6% 2.804s 9.44e-03s 297 1116 GpuDot22(GpuFlatten{2}.0, GpuReshape{2}.0) | |
0.4% 91.0% 2.800s 9.43e-03s 297 1119 GpuDot22Scalar(GpuFlatten{2}.0, GpuReshape{2}.0, TensorConstant{3.0}) | |
... (remaining 1328 Apply instances account for 9.01%(60.69s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 298 calls of the op (for a total of 2621 steps) 4.064922e+00s | |
Total time spent in calling the VM 3.681994e+00s (90.580%) | |
Total overhead (computing slices..) 3.829279e-01s (9.420%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
83.7% 83.7% 3.048s 2.91e-04s C 10484 4 theano.sandbox.cuda.blas.GpuGemm | |
10.3% 94.1% 0.377s 4.79e-05s C 7863 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.4% 99.5% 0.197s 3.77e-05s C 5242 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.019s 3.61e-06s C 5242 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
83.7% 83.7% 3.048s 2.91e-04s C 10484 4 GpuGemm{no_inplace} | |
5.4% 89.1% 0.197s 3.77e-05s C 5242 2 GpuFromHost | |
3.8% 93.0% 0.140s 5.34e-05s C 2621 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
3.8% 96.8% 0.138s 5.26e-05s C 2621 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
2.7% 99.5% 0.099s 3.78e-05s C 2621 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.019s 3.61e-06s C 5242 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
21.8% 21.8% 0.792s 3.02e-04s 2621 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
21.1% 42.9% 0.768s 2.93e-04s 2621 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
20.4% 63.3% 0.745s 2.84e-04s 2621 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
20.4% 83.7% 0.743s 2.84e-04s 2621 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
3.8% 87.6% 0.140s 5.34e-05s 2621 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.8% 91.3% 0.138s 5.26e-05s 2621 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.0% 94.4% 0.110s 4.21e-05s 2621 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.7% 97.1% 0.099s 3.78e-05s 2621 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.4% 99.5% 0.087s 3.33e-05s 2621 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.012s 4.62e-06s 2621 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.007s 2.60e-06s 2621 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 298 calls of the op (for a total of 2621 steps) 4.061944e+00s | |
Total time spent in calling the VM 3.678530e+00s (90.561%) | |
Total overhead (computing slices..) 3.834136e-01s (9.439%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
83.8% 83.8% 3.046s 2.90e-04s C 10484 4 theano.sandbox.cuda.blas.GpuGemm | |
10.3% 94.0% 0.373s 4.74e-05s C 7863 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.5% 99.5% 0.199s 3.80e-05s C 5242 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.018s 3.49e-06s C 5242 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
83.8% 83.8% 3.046s 2.90e-04s C 10484 4 GpuGemm{no_inplace} | |
5.5% 89.2% 0.199s 3.80e-05s C 5242 2 GpuFromHost | |
3.8% 93.0% 0.138s 5.27e-05s C 2621 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
3.8% 96.8% 0.137s 5.22e-05s C 2621 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
2.7% 99.5% 0.098s 3.74e-05s C 2621 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.018s 3.49e-06s C 5242 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
21.7% 21.7% 0.790s 3.01e-04s 2621 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
21.1% 42.9% 0.768s 2.93e-04s 2621 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
20.5% 63.3% 0.744s 2.84e-04s 2621 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
20.4% 83.8% 0.743s 2.84e-04s 2621 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
3.8% 87.6% 0.138s 5.27e-05s 2621 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.8% 91.3% 0.137s 5.22e-05s 2621 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.1% 94.4% 0.111s 4.25e-05s 2621 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.7% 97.1% 0.098s 3.74e-05s 2621 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.4% 99.5% 0.088s 3.35e-05s 2621 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.011s 4.38e-06s 2621 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.007s 2.59e-06s 2621 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 298 calls of the op (for a total of 9881 steps) 5.314603e+01s | |
Total time spent in calling the VM 5.168440e+01s (97.250%) | |
Total overhead (computing slices..) 1.461627e+00s (2.750%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
42.3% 42.3% 21.728s 4.40e-04s C 49405 5 theano.sandbox.cuda.blas.GpuDot22 | |
40.4% 82.7% 20.740s 2.62e-04s C 79048 8 theano.sandbox.cuda.blas.GpuGemm | |
10.1% 92.8% 5.189s 8.75e-05s C 59286 6 theano.sandbox.cuda.basic_ops.GpuElemwise | |
2.9% 95.7% 1.479s 1.50e-04s C 9881 1 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.1% 96.8% 0.555s 2.81e-05s Py 19762 2 theano.sandbox.cuda.basic_ops.GpuReshape | |
1.0% 97.8% 0.525s 5.32e-05s C 9881 1 theano.sandbox.cuda.blas.GpuGemv | |
0.9% 98.7% 0.474s 4.80e-05s C 9881 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.6% 99.3% 0.298s 3.02e-05s C 9881 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
0.4% 99.7% 0.212s 2.15e-05s C 9881 1 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.8% 0.055s 8.02e-07s C 69167 7 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.1% 99.9% 0.042s 2.14e-06s C 19762 2 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.023s 5.93e-07s C 39524 4 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.010s 5.06e-07s C 19762 2 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.006s 6.45e-07s C 9881 1 theano.sandbox.cuda.basic_ops.GpuContiguous | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
42.3% 42.3% 21.728s 4.40e-04s C 49405 5 GpuDot22 | |
40.4% 82.7% 20.740s 2.62e-04s C 79048 8 GpuGemm{inplace} | |
3.5% 86.2% 1.773s 1.79e-04s C 9881 1 GpuElemwise{mul,no_inplace} | |
2.9% 89.1% 1.479s 1.50e-04s C 9881 1 GpuCAReduce{add}{0,1,0} | |
2.6% 91.6% 1.312s 1.33e-04s C 9881 1 GpuElemwise{add,no_inplace} | |
1.7% 93.3% 0.848s 8.59e-05s C 9881 1 GpuElemwise{Tanh}[(0, 0)] | |
1.3% 94.5% 0.642s 6.50e-05s C 9881 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))},no_inplace} | |
1.1% 95.6% 0.555s 2.81e-05s Py 19762 2 GpuReshape{2} | |
1.0% 96.6% 0.525s 5.32e-05s C 9881 1 GpuGemv{inplace} | |
0.9% 97.5% 0.474s 4.80e-05s C 9881 1 GpuFromHost | |
0.9% 98.4% 0.437s 4.43e-05s C 9881 1 GpuElemwise{Composite{(scalar_sigmoid((i0 + i1)) * tanh(i2)) | |
},no_inplace} | |
0.6% 99.0% 0.298s 3.02e-05s C 9881 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
0.4% 99.4% 0.212s 2.15e-05s C 9881 1 GpuAlloc{memset_0=True} | |
0.3% 99.7% 0.176s 1.78e-05s C 9881 1 GpuElemwise{Mul}[(0, 0)] | |
0.1% 99.8% 0.037s 3.76e-06s C 9881 1 Elemwise{Cast{float32}} | |
0.0% 99.8% 0.013s 6.49e-07s C 19762 2 Shape_i{0} | |
0.0% 99.8% 0.011s 5.38e-07s C 19762 2 Shape_i{1} | |
0.0% 99.9% 0.010s 5.06e-07s C 19762 2 MakeVector | |
0.0% 99.9% 0.009s 9.41e-07s C 9881 1 GpuDimShuffle{0} | |
0.0% 99.9% 0.009s 8.76e-07s C 9881 1 GpuDimShuffle{0,1,x,x} | |
... (remaining 7 Ops account for 0.10%(0.05s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
9.4% 9.4% 4.842s 4.90e-04s 9881 31 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wo_copy[cuda]) | |
9.2% 18.7% 4.749s 4.81e-04s 9881 28 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wc_copy[cuda]) | |
9.2% 27.9% 4.725s 4.78e-04s 9881 30 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wf_copy[cuda]) | |
9.2% 37.1% 4.722s 4.78e-04s 9881 29 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wi_copy[cuda]) | |
5.2% 42.3% 2.690s 2.72e-04s 9881 3 GpuDot22(<CudaNdarrayType(float32, matrix)>, W_wa_copy[cuda]) | |
5.1% 47.4% 2.600s 2.63e-04s 9881 35 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wo2_copy[cuda], TensorConstant{1.0}) | |
5.1% 52.4% 2.598s 2.63e-04s 9881 39 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
5.1% 57.5% 2.597s 2.63e-04s 9881 38 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
5.1% 62.6% 2.596s 2.63e-04s 9881 37 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
5.1% 67.6% 2.595s 2.63e-04s 9881 36 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
5.0% 72.7% 2.588s 2.62e-04s 9881 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wf2_copy[cuda], TensorConstant{1.0}) | |
5.0% 77.7% 2.583s 2.61e-04s 9881 32 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wc2_copy[cuda], TensorConstant{1.0}) | |
5.0% 82.7% 2.582s 2.61e-04s 9881 33 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wi2_copy[cuda], TensorConstant{1.0}) | |
3.5% 86.2% 1.773s 1.79e-04s 9881 26 GpuElemwise{mul,no_inplace}(GpuDimShuffle{0,1,x}.0, <CudaNdarrayType(fl | |
oat32, 3D)>) | |
2.9% 89.1% 1.479s 1.50e-04s 9881 27 GpuCAReduce{add}{0,1,0}(GpuElemwise{mul,no_inplace}.0) | |
2.6% 91.6% 1.312s 1.33e-04s 9881 11 GpuElemwise{add,no_inplace}(GpuDimShuffle{0,x,1}.0, GpuDimShuffle{0,1,2 | |
}.0) | |
1.7% 93.3% 0.848s 8.59e-05s 9881 15 GpuElemwise{Tanh}[(0, 0)](GpuReshape{2}.0) | |
1.3% 94.5% 0.642s 6.50e-05s 9881 40 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0) | |
1.0% 95.5% 0.525s 5.32e-05s 9881 17 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, GpuEle | |
mwise{Tanh}[(0, 0)].0, GpuDimShuffle{0}.0, TensorConstant{0.0}) | |
0.9% 96.5% 0.474s 4.80e-05s 9881 9 GpuFromHost(Elemwise{Cast{float32}}.0) | |
... (remaining 22 Apply instances account for 3.54%(1.82s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 297 calls of the op (for a total of 9841 steps) 3.057151e+02s | |
Total time spent in calling the VM 2.697911e+02s (88.249%) | |
Total overhead (computing slices..) 3.592400e+01s (11.751%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
43.3% 43.3% 116.105s 4.07e-04s C 285389 29 theano.sandbox.cuda.blas.GpuDot22 | |
31.4% 74.6% 84.122s 3.17e-04s C 265707 27 theano.sandbox.cuda.blas.GpuGemm | |
19.0% 93.7% 51.008s 1.10e-04s C 462527 47 theano.sandbox.cuda.basic_ops.GpuElemwise | |
2.7% 96.4% 7.295s 6.18e-05s C 118092 12 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.8% 97.2% 2.231s 2.83e-05s Py 78728 8 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 97.9% 1.760s 5.96e-05s C 29523 3 theano.sandbox.cuda.blas.GpuGemv | |
0.6% 98.5% 1.567s 3.18e-05s C 49205 5 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.5% 98.9% 1.313s 6.67e-05s C 19682 2 theano.sandbox.cuda.blas.GpuGer | |
0.3% 99.3% 0.838s 4.26e-05s C 19682 2 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.2% 99.5% 0.557s 5.66e-05s C 9841 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.2% 99.7% 0.533s 2.71e-05s C 19682 2 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad | |
0.1% 99.8% 0.388s 1.16e-06s C 334594 34 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.1% 99.9% 0.305s 3.10e-05s C 9841 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
0.0% 100.0% 0.100s 5.10e-06s C 19682 2 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.054s 6.80e-07s C 78728 8 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.031s 7.96e-07s C 39364 4 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.022s 5.61e-07s C 39364 4 theano.sandbox.cuda.basic_ops.GpuContiguous | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
43.3% 43.3% 116.105s 4.07e-04s C 285389 29 GpuDot22 | |
30.2% 73.4% 80.882s 3.16e-04s C 255866 26 GpuGemm{inplace} | |
10.0% 83.4% 26.697s 2.09e-04s C 127933 13 GpuElemwise{add,no_inplace} | |
2.4% 85.8% 6.452s 2.19e-04s C 29523 3 GpuElemwise{mul,no_inplace} | |
1.8% 87.6% 4.794s 2.44e-04s C 19682 2 GpuElemwise{Composite{(((i0 * i1) + (i2 * i1)) + i3)},no_in | |
place} | |
1.4% 88.9% 3.628s 7.37e-05s C 49205 5 GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} | |
1.2% 90.2% 3.275s 1.11e-04s C 29523 3 GpuCAReduce{add}{0,1,0} | |
1.2% 91.4% 3.240s 3.29e-04s C 9841 1 GpuGemm{no_inplace} | |
0.8% 92.2% 2.277s 4.63e-05s C 49205 5 GpuElemwise{Mul}[(0, 0)] | |
0.8% 93.0% 2.235s 3.24e-05s C 68887 7 GpuCAReduce{add}{1,0} | |
0.7% 93.7% 1.785s 9.07e-05s C 19682 2 GpuCAReduce{add}{0,0,1} | |
0.6% 94.3% 1.567s 3.18e-05s C 49205 5 GpuAlloc{memset_0=True} | |
0.6% 94.9% 1.546s 1.57e-04s C 9841 1 GpuElemwise{Composite{tanh((i0 + i1))},no_inplace} | |
0.5% 95.4% 1.313s 6.67e-05s C 19682 2 GpuGer{inplace} | |
0.5% 95.8% 1.209s 3.07e-05s Py 39364 4 GpuReshape{2} | |
0.4% 96.2% 1.094s 5.56e-05s C 19682 2 GpuGemv{inplace} | |
0.3% 96.6% 0.925s 4.70e-05s C 19682 2 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)] | |
0.3% 96.9% 0.844s 2.86e-05s C 29523 3 GpuElemwise{Composite{scalar_sigmoid((i0 + i1))}}[(0, 1)] | |
0.2% 97.1% 0.665s 6.76e-05s C 9841 1 GpuGemv{no_inplace} | |
0.2% 97.4% 0.612s 3.11e-05s C 19682 2 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace} | |
... (remaining 33 Ops account for 2.64%(7.09s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
2.2% 2.2% 5.779s 5.87e-04s 9841 59 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inpla | |
ce}.0, W_wo_copy.T_replace[cuda]) | |
2.1% 4.3% 5.764s 5.86e-04s 9841 79 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, W_w | |
i_copy.T_replace[cuda]) | |
2.1% 6.5% 5.762s 5.85e-04s 9841 116 GpuDot22(GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, W_wc_copy. | |
T_replace[cuda]) | |
2.1% 8.6% 5.733s 5.83e-04s 9841 69 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_wf_copy.T_replace[cuda]) | |
2.1% 10.7% 5.724s 5.82e-04s 9841 82 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, W_w | |
f_copy.T_replace[cuda]) | |
2.1% 12.9% 5.722s 5.81e-04s 9841 74 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_wi_copy.T_replace[cuda]) | |
2.1% 15.0% 5.718s 5.81e-04s 9841 106 GpuDot22(GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, W_wc_copy. | |
T_replace[cuda]) | |
2.0% 17.0% 5.343s 5.43e-04s 9841 103 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuDimShuffle{1,0}.0, | |
GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, TensorConstant{1.0}) | |
2.0% 19.0% 5.336s 5.42e-04s 9841 95 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuDimShuffle{1,0}.0, | |
GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, TensorConstant{1.0}) | |
2.0% 21.0% 5.334s 5.42e-04s 9841 125 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuDimShuffle{1,0}.0, | |
GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, TensorConstant{1.0}) | |
1.8% 22.8% 4.888s 4.97e-04s 9841 86 GpuDot22(GpuDimShuffle{1,0}.0, GpuElemwise{Composite{(((i0 * i1) * i2) | |
* i3)}}[(0, 2)].0) | |
1.8% 24.6% 4.883s 4.96e-04s 9841 110 GpuDot22(GpuDimShuffle{1,0}.0, GpuElemwise{Composite{((i0 * i1) * i2)}} | |
[(0, 0)].0) | |
1.8% 26.4% 4.877s 4.96e-04s 9841 78 GpuDot22(GpuDimShuffle{1,0}.0, GpuElemwise{Composite{(((i0 * i1) * i2) | |
* i3)},no_inplace}.0) | |
1.8% 28.2% 4.811s 4.89e-04s 9841 41 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wf_copy[cuda]) | |
1.8% 30.0% 4.757s 4.83e-04s 9841 38 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wo_copy[cuda]) | |
1.8% 31.8% 4.747s 4.82e-04s 9841 40 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wc_copy[cuda]) | |
1.8% 33.5% 4.745s 4.82e-04s 9841 39 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wi_copy[cuda]) | |
1.6% 35.1% 4.339s 4.41e-04s 9841 136 GpuElemwise{Composite{(((i0 * i1) + (i2 * i1)) + i3)},no_inplace}(GpuEl | |
emwise{Composite{(((i0 + i1) + i2) + i3)}}[(0, 0)].0, GpuDimShuffle{0,1,x}.0, GpuElemwise{Composite{((i0 + i1) + i2)}}[(0, 0)].0, <C | |
udaNdarrayType(float32, 3D)>) | |
1.2% 36.4% 3.319s 3.37e-04s 9841 121 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
1.2% 37.6% 3.303s 3.36e-04s 9841 132 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
... (remaining 171 Apply instances account for 62.39%(167.34s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 297 calls of the op (for a total of 2611 steps) 3.853760e+01s | |
Total time spent in calling the VM 1.844704e+01s (47.868%) | |
Total overhead (computing slices..) 2.009057e+01s (52.132%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
46.6% 46.6% 8.528s 2.97e-04s C 28721 11 theano.sandbox.cuda.blas.GpuGemm | |
25.0% 71.7% 4.578s 2.92e-04s C 15666 6 theano.sandbox.cuda.blas.GpuDot22 | |
23.4% 95.0% 4.272s 5.28e-05s C 80941 31 theano.sandbox.cuda.basic_ops.GpuElemwise | |
3.4% 98.4% 0.617s 3.38e-05s C 18277 7 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.4% 99.8% 0.264s 5.06e-05s C 5222 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.2% 100.0% 0.027s 5.26e-06s C 5222 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
30.1% 30.1% 5.506s 3.01e-04s C 18277 7 GpuGemm{inplace} | |
25.0% 55.1% 4.578s 2.92e-04s C 15666 6 GpuDot22 | |
16.5% 71.7% 3.022s 2.89e-04s C 10444 4 GpuGemm{no_inplace} | |
13.1% 84.7% 2.386s 1.14e-04s C 20888 8 GpuElemwise{add,no_inplace} | |
3.4% 88.1% 0.617s 3.38e-05s C 18277 7 GpuCAReduce{add}{1,0} | |
1.5% 89.6% 0.280s 3.57e-05s C 7833 3 GpuElemwise{mul,no_inplace} | |
1.4% 91.1% 0.264s 5.06e-05s C 5222 2 GpuFromHost | |
1.4% 92.5% 0.255s 3.25e-05s C 7833 3 GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} | |
1.2% 93.7% 0.220s 2.80e-05s C 7833 3 GpuElemwise{Composite{scalar_sigmoid((i0 + i1))}}[(0, 1)] | |
1.0% 94.7% 0.184s 3.52e-05s C 5222 2 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace} | |
0.7% 95.4% 0.133s 5.09e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) + ((i2 * i1) + (i3 * i4))) | |
+ i5)},no_inplace} | |
0.7% 96.1% 0.133s 2.54e-05s C 5222 2 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)] | |
0.7% 96.8% 0.128s 4.90e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inp | |
lace} | |
0.6% 97.5% 0.117s 2.23e-05s C 5222 2 GpuElemwise{sub,no_inplace} | |
0.4% 97.9% 0.081s 3.08e-05s C 2611 1 GpuElemwise{Composite{tanh(((i0 * i1) + (i2 * i3)))},no_inpl | |
ace} | |
0.4% 98.3% 0.078s 2.99e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)] | |
0.4% 98.8% 0.078s 2.97e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)] | |
0.4% 99.2% 0.075s 2.87e-05s C 2611 1 GpuElemwise{Composite{tanh((i0 + i1))}}[(0, 1)] | |
0.4% 99.6% 0.073s 2.79e-05s C 2611 1 GpuElemwise{Composite{((i0 * i1) * (i2 - sqr(i3)))}}[(0, 0)] | |
0.3% 99.8% 0.053s 2.04e-05s C 2611 1 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)] | |
... (remaining 1 Ops account for 0.15%(0.03s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
4.5% 4.5% 0.827s 3.17e-04s 2611 31 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_ui_copy.T_replace[cuda]) | |
4.5% 9.0% 0.823s 3.15e-04s 2611 52 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 13.5% 0.813s 3.11e-04s 2611 21 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inpla | |
ce}.0, W_uo_copy.T_replace[cuda]) | |
4.4% 17.9% 0.812s 3.11e-04s 2611 40 GpuGemm{inplace}(GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}, Gp | |
uElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 22.3% 0.811s 3.11e-04s 2611 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{(((i0 * i1) * i2) * i3)},no_inplace}.0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 26.7% 0.805s 3.08e-04s 2611 48 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.3% 31.1% 0.794s 3.04e-04s 2611 37 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, W_u | |
i_copy.T_replace[cuda]) | |
4.2% 35.3% 0.770s 2.95e-04s 2611 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
4.2% 39.5% 0.763s 2.92e-04s 2611 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
4.1% 43.6% 0.752s 2.88e-04s 2611 56 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, TensorConstant{1.0}) | |
4.1% 47.7% 0.752s 2.88e-04s 2611 43 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, TensorConstant{1.0}) | |
4.1% 51.8% 0.751s 2.88e-04s 2611 44 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, TensorConstant{1.0}) | |
4.1% 55.9% 0.745s 2.85e-04s 2611 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
4.1% 59.9% 0.745s 2.85e-04s 2611 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
3.9% 63.9% 0.716s 2.74e-04s 2611 30 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.9% 67.8% 0.714s 2.74e-04s 2611 51 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 | |
* i1) * i2)}}[(0, 0)].0) | |
3.9% 71.7% 0.714s 2.73e-04s 2611 33 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.2% 74.9% 0.582s 2.23e-04s 2611 54 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.2% 78.0% 0.576s 2.21e-04s 2611 58 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.1% 81.1% 0.569s 2.18e-04s 2611 53 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
... (remaining 39 Apply instances account for 18.89%(3.45s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 297 calls of the op (for a total of 2611 steps) 2.270440e+01s | |
Total time spent in calling the VM 1.848128e+01s (81.400%) | |
Total overhead (computing slices..) 4.223124e+00s (18.600%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
46.5% 46.5% 8.523s 2.97e-04s C 28721 11 theano.sandbox.cuda.blas.GpuGemm | |
25.0% 71.6% 4.588s 2.93e-04s C 15666 6 theano.sandbox.cuda.blas.GpuDot22 | |
23.5% 95.0% 4.299s 5.31e-05s C 80941 31 theano.sandbox.cuda.basic_ops.GpuElemwise | |
3.4% 98.4% 0.619s 3.39e-05s C 18277 7 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.5% 99.9% 0.266s 5.10e-05s C 5222 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.1% 100.0% 0.027s 5.12e-06s C 5222 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
30.0% 30.0% 5.501s 3.01e-04s C 18277 7 GpuGemm{inplace} | |
25.0% 55.1% 4.588s 2.93e-04s C 15666 6 GpuDot22 | |
16.5% 71.6% 3.022s 2.89e-04s C 10444 4 GpuGemm{no_inplace} | |
13.2% 84.7% 2.411s 1.15e-04s C 20888 8 GpuElemwise{add,no_inplace} | |
3.4% 88.1% 0.619s 3.39e-05s C 18277 7 GpuCAReduce{add}{1,0} | |
1.5% 89.6% 0.277s 3.53e-05s C 7833 3 GpuElemwise{mul,no_inplace} | |
1.5% 91.1% 0.266s 5.10e-05s C 5222 2 GpuFromHost | |
1.4% 92.5% 0.264s 3.37e-05s C 7833 3 GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} | |
1.2% 93.7% 0.220s 2.81e-05s C 7833 3 GpuElemwise{Composite{scalar_sigmoid((i0 + i1))}}[(0, 1)] | |
1.0% 94.7% 0.181s 3.47e-05s C 5222 2 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace} | |
0.7% 95.4% 0.133s 5.09e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) + ((i2 * i1) + (i3 * i4))) | |
+ i5)},no_inplace} | |
0.7% 96.1% 0.132s 2.52e-05s C 5222 2 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)] | |
0.7% 96.8% 0.129s 4.92e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inp | |
lace} | |
0.6% 97.5% 0.117s 2.25e-05s C 5222 2 GpuElemwise{sub,no_inplace} | |
0.4% 97.9% 0.079s 3.04e-05s C 2611 1 GpuElemwise{Composite{tanh(((i0 * i1) + (i2 * i3)))},no_inpl | |
ace} | |
0.4% 98.3% 0.078s 2.98e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)] | |
0.4% 98.8% 0.077s 2.96e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)] | |
0.4% 99.2% 0.075s 2.87e-05s C 2611 1 GpuElemwise{Composite{tanh((i0 + i1))}}[(0, 1)] | |
0.4% 99.6% 0.072s 2.77e-05s C 2611 1 GpuElemwise{Composite{((i0 * i1) * (i2 - sqr(i3)))}}[(0, 0)] | |
0.3% 99.9% 0.053s 2.03e-05s C 2611 1 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)] | |
... (remaining 1 Ops account for 0.15%(0.03s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
4.5% 4.5% 0.825s 3.16e-04s 2611 31 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_ui_copy.T_replace[cuda]) | |
4.5% 9.0% 0.820s 3.14e-04s 2611 52 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 13.4% 0.814s 3.12e-04s 2611 21 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inpla | |
ce}.0, W_uo_copy.T_replace[cuda]) | |
4.4% 17.8% 0.812s 3.11e-04s 2611 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{(((i0 * i1) * i2) * i3)},no_inplace}.0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 22.3% 0.809s 3.10e-04s 2611 40 GpuGemm{inplace}(GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}, Gp | |
uElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 26.7% 0.805s 3.08e-04s 2611 48 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.3% 31.0% 0.794s 3.04e-04s 2611 37 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, W_u | |
i_copy.T_replace[cuda]) | |
4.2% 35.2% 0.771s 2.95e-04s 2611 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
4.2% 39.4% 0.762s 2.92e-04s 2611 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
4.1% 43.5% 0.753s 2.88e-04s 2611 56 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, TensorConstant{1.0}) | |
4.1% 47.6% 0.752s 2.88e-04s 2611 43 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, TensorConstant{1.0}) | |
4.1% 51.7% 0.751s 2.88e-04s 2611 44 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, TensorConstant{1.0}) | |
4.1% 55.7% 0.745s 2.85e-04s 2611 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
4.1% 59.8% 0.744s 2.85e-04s 2611 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
4.0% 63.8% 0.724s 2.77e-04s 2611 51 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 | |
* i1) * i2)}}[(0, 0)].0) | |
3.9% 67.7% 0.716s 2.74e-04s 2611 30 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.9% 71.6% 0.715s 2.74e-04s 2611 33 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.2% 74.8% 0.592s 2.27e-04s 2611 54 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.1% 77.9% 0.574s 2.20e-04s 2611 58 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.1% 81.0% 0.573s 2.19e-04s 2611 53 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
... (remaining 39 Apply instances account for 18.95%(3.47s) of the runtime) | |
Function profiling | |
================== | |
Message: Sum of all(4) printed profiles at exit excluding Scan op profile. | |
Time in 315 calls to Function.__call__: 6.879535e+02s | |
Time in Function.fn.__call__: 6.878845e+02s (99.990%) | |
Time in thunks: 6.831378e+02s (99.300%) | |
Total compile time: 1.192735e+02s | |
Number of Apply nodes: 344 | |
Theano Optimizer time: 9.820933e+01s | |
Theano validate time: 1.885482e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 2.060047e+01s | |
Import time 2.949057e-01s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
63.7% 63.7% 435.193s 2.38e-01s Py 1830 9 theano.scan_module.scan_op.Scan | |
19.9% 83.6% 135.986s 1.19e-02s C 11462 49 theano.sandbox.cuda.blas.GpuDot22 | |
6.5% 90.1% 44.409s 2.93e-03s C 15179 53 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
2.7% 92.8% 18.330s 9.03e-04s C 20308 75 theano.sandbox.cuda.basic_ops.GpuElemwise | |
1.5% 94.3% 10.587s 3.38e-02s C 313 2 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
1.5% 95.8% 10.271s 3.46e-02s C 297 1 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1 | |
1.4% 97.3% 9.632s 3.24e-02s C 297 1 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad | |
0.9% 98.2% 6.487s 4.37e-03s C 1485 5 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.6% 98.9% 4.434s 2.47e-04s C 17916 66 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.4% 99.3% 2.982s 1.41e-03s Py 2111 9 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.2% 99.5% 1.586s 7.46e-05s Py 21270 100 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.2% 99.7% 1.464s 1.90e-04s C 7722 26 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.0% 99.8% 0.338s 3.60e-04s Py 939 6 theano.sandbox.cuda.basic_ops.GpuAdvancedSubtensor1 | |
0.0% 99.8% 0.278s 9.37e-04s Py 297 1 theano.sandbox.cuda.basic_ops.GpuSplit | |
0.0% 99.9% 0.275s 1.81e-06s C 152275 613 theano.tensor.elemwise.Elemwise | |
0.0% 99.9% 0.247s 7.89e-04s C 313 2 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.0% 99.9% 0.162s 2.73e-04s C 594 2 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1_dev20 | |
0.0% 100.0% 0.159s 6.57e-05s C 2427 14 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 100.0% 0.068s 1.62e-06s C 41987 183 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.061s 2.82e-06s C 21592 85 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
... (remaining 15 Classes account for 0.03%(0.19s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
44.9% 44.9% 306.715s 1.03e+00s Py 297 1 forall_inplace,gpu,grad_of_scan_fn} | |
19.9% 64.8% 135.986s 1.19e-02s C 11462 49 GpuDot22 | |
9.1% 73.9% 61.915s 1.04e-01s Py 594 2 forall_inplace,gpu,grad_of_scan_fn} | |
8.3% 82.2% 57.021s 1.82e-01s Py 313 2 forall_inplace,gpu,scan_fn} | |
4.7% 86.9% 31.945s 3.36e-03s C 9504 32 GpuCAReduce{pre=sqr,red=add}{1,1} | |
1.8% 88.7% 12.223s 2.06e-02s C 594 2 GpuCAReduce{add}{1,1,0} | |
1.5% 90.2% 10.587s 3.38e-02s C 313 2 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
1.5% 91.7% 10.271s 3.46e-02s C 297 1 GpuAdvancedIncSubtensor1{inplace,inc} | |
1.4% 93.1% 9.632s 3.24e-02s C 297 1 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo | |
='accurate'} | |
1.4% 94.5% 9.542s 1.52e-02s Py 626 4 forall_inplace,gpu,scan_fn} | |
0.9% 95.5% 6.487s 4.37e-03s C 1485 5 GpuDot22Scalar | |
0.9% 96.4% 6.301s 4.82e-04s C 13068 44 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i | |
3, i4, ((i5 * i4) / i6))) * i7))}}[(0, 0)] | |
0.6% 97.1% 4.434s 2.47e-04s C 17916 66 GpuAlloc{memset_0=True} | |
0.5% 97.6% 3.693s 5.75e-03s C 642 5 GpuElemwise{Mul}[(0, 0)] | |
0.5% 98.1% 3.673s 6.18e-03s C 594 2 GpuElemwise{add,no_inplace} | |
0.5% 98.7% 3.644s 4.09e-03s C 891 3 GpuElemwise{mul,no_inplace} | |
0.4% 99.1% 2.974s 2.00e-03s Py 1485 5 GpuFlatten{2} | |
0.2% 99.3% 1.328s 9.74e-05s Py 13637 62 GpuReshape{2} | |
0.1% 99.4% 0.517s 3.48e-04s C 1485 5 GpuIncSubtensor{Inc;:int64:} | |
0.1% 99.5% 0.494s 1.39e-04s C 3564 12 GpuIncSubtensor{InplaceInc;int64::} | |
... (remaining 172 Ops account for 0.55%(3.75s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
44.9% 44.9% 306.715s 1.03e+00s 297 1063 forall_inplace,gpu,grad_of_scan_fn}(Shape_i{1}.0, GpuDimShuffle{0,2,1} | |
.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64: | |
int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=Tru | |
e}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
7.8% 52.7% 53.129s 1.79e-01s 297 959 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
6.3% 59.0% 42.972s 1.45e-01s 297 1044 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
5.7% 64.7% 38.873s 1.31e-01s 297 1136 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
5.1% 69.7% 34.667s 1.17e-01s 297 999 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
4.7% 74.5% 32.447s 1.09e-01s 297 1043 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
3.4% 77.9% 23.042s 7.76e-02s 297 1137 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
1.8% 79.7% 12.410s 4.18e-02s 297 1050 GpuCAReduce{pre=sqr,red=add}{1,1}(GpuDimShuffle{0,1}.0) | |
1.8% 81.4% 12.040s 4.05e-02s 297 1042 GpuCAReduce{add}{1,1,0}(GpuReshape{3}.0) | |
1.5% 82.9% 10.271s 3.46e-02s 297 1031 GpuAdvancedIncSubtensor1{inplace,inc}(GpuAlloc{memset_0=True}.0, GpuEl | |
emwise{Composite{((i0 * i1 * i2) / (i3 * i4))},no_inplace}.0, Elemwise{Composite{((i0 * i1) + i2)}}.0) | |
1.5% 84.4% 9.947s 3.35e-02s 297 1019 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(G | |
puContiguous.0) | |
1.4% 85.8% 9.632s 3.24e-02s 297 1037 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate | |
'}(GpuContiguous.0, GpuContiguous.0) | |
0.6% 86.4% 4.149s 1.40e-02s 297 881 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSub | |
tensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{ | |
memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuf | |
0.6% 87.0% 4.138s 1.39e-02s 297 906 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSu | |
btensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
loc{memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDim | |
0.6% 87.6% 3.892s 2.43e-01s 16 309 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
0.5% 88.1% 3.584s 1.21e-02s 297 1004 GpuElemwise{add,no_inplace}(GpuReshape{3}.0, GpuDimShuffle{x,x,0}.0) | |
0.5% 88.6% 3.438s 1.16e-02s 297 1012 GpuElemwise{mul,no_inplace}(GpuReshape{2}.0, GpuDimShuffle{0,x}.0) | |
0.5% 89.1% 3.342s 1.13e-02s 297 1039 GpuElemwise{Mul}[(0, 0)](GpuDimShuffle{0,1}.0, GpuDimShuffle{0,x}.0) | |
0.4% 89.5% 2.833s 9.54e-03s 297 1346 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i3, i4, ((i5 | |
* i4) / i6))) * i7))}}[(0, 0)](W_embed, GpuFromHost.0, CudaNdarrayConstant{[[ 0.01]]}, GpuFromHost.0, GpuAdvancedIncSubtensor1_dev2 | |
0{inplace,inc}.0, CudaNdarrayConstant{[[ 3.]]}, GpuDimShuffle{x,x}.0, GpuDimShuffle{x,x}.0) | |
0.4% 89.9% 2.804s 9.44e-03s 297 1116 GpuDot22(GpuFlatten{2}.0, GpuReshape{2}.0) | |
... (remaining 1678 Apply instances account for 10.07%(68.81s) of the runtime) | |
ubuntu@ip-10-0-0-156:~/nmt$ :command-prompt -p 'save history to filename:' -I '~/tmux.history' 'capture-pane -S -32768 ; save-buffer | |
%1 ; delete-buffer'^C | |
ubuntu@ip-10-0-0-156:~/nmt$ ::^C | |
ubuntu@ip-10-0-0-156:~/nmt$ zls | |
No command 'zls' found, did you mean: | |
Command 'als' from package 'atool' (universe) | |
Command 'bls' from package 'bacula-sd' (main) | |
Command 'fls' from package 'sleuthkit' (universe) | |
Command 'jls' from package 'sleuthkit' (universe) | |
Command 'ls' from package 'coreutils' (main) | |
Command 'ols' from package 'speech-tools' (universe) | |
Command 'tls' from package 'python-tlslite' (universe) | |
Command 'ils' from package 'sleuthkit' (universe) | |
Command 'hls' from package 'hfsutils' (main) | |
Command 'zfs' from package 'zfs-fuse' (universe) | |
zls: command not found | |
ubuntu@ip-10-0-0-156:~/nmt$ ls | |
bin -.log nmt README.md test trails | |
ubuntu@ip-10-0-0-156:~/nmt$ cat . | |
./ ../ .git/ .gitignore .idea/ .remote-sync.json | |
ubuntu@ip-10-0-0-156:~/nmt$ cat . | |
./ ../ .git/ .gitignore .idea/ .remote-sync.json | |
ubuntu@ip-10-0-0-156:~/nmt$ cat .t^C | |
ubuntu@ip-10-0-0-156:~/nmt$ cd | |
ubuntu@ip-10-0-0-156:~$ ls | |
archives data deepy nmt update.sh | |
ubuntu@ip-10-0-0-156:~$ cd - | |
/home/ubuntu/nmt | |
ubuntu@ip-10-0-0-156:~/nmt$ ls | |
bin -.log nmt README.md test trails | |
ubuntu@ip-10-0-0-156:~/nmt$ ls -lhg | |
total 20K | |
drwxrwxr-x 2 ubuntu 4.0K Jul 13 22:01 bin | |
-rw-rw-r-- 1 ubuntu 72 Jul 16 04:23 -.log | |
drwxrwxr-x 2 ubuntu 4.0K Jul 16 05:17 nmt | |
-rw-rw-r-- 1 ubuntu 0 Jul 8 05:03 README.md | |
drwxrwxr-x 2 ubuntu 4.0K Jul 8 05:03 test | |
drwxrwxr-x 3 ubuntu 4.0K Jul 8 05:03 trails | |
ubuntu@ip-10-0-0-156:~/nmt$ git status | |
On branch master | |
Your branch is up-to-date with 'origin/master'. | |
nothing to commit, working directory clean | |
ubuntu@ip-10-0-0-156:~/nmt$ git rm -.log | |
error: unknown switch `.' | |
usage: git rm [options] [--] <file>... | |
-n, --dry-run dry run | |
-q, --quiet do not list removed files | |
--cached only remove from the index | |
-f, --force override the up-to-date check | |
-r allow recursive removal | |
--ignore-unmatch exit with a zero status even if nothing matched | |
ubuntu@ip-10-0-0-156:~/nmt$ git status | |
On branch master | |
Your branch is up-to-date with 'origin/master'. | |
nothing to commit, working directory clean | |
ubuntu@ip-10-0-0-156:~/nmt$ rm -.lo | |
rm: invalid option -- '.' | |
Try 'rm --help' for more information. | |
ubuntu@ip-10-0-0-156:~/nmt$ rm -.log | |
rm: invalid option -- '.' | |
Try 'rm ./-.log' to remove the file ‘-.log’. | |
Try 'rm --help' for more information. | |
ubuntu@ip-10-0-0-156:~/nmt$ rm "-.log" | |
rm: invalid option -- '.' | |
Try 'rm ./-.log' to remove the file ‘-.log’. | |
Try 'rm --help' for more information. | |
ubuntu@ip-10-0-0-156:~/nmt$ ls | |
bin -.log nmt README.md test trails | |
ubuntu@ip-10-0-0-156:~/nmt$ ls -lh | |
total 20K | |
drwxrwxr-x 2 ubuntu ubuntu 4.0K Jul 13 22:01 bin | |
-rw-rw-r-- 1 ubuntu ubuntu 72 Jul 16 04:23 -.log | |
drwxrwxr-x 2 ubuntu ubuntu 4.0K Jul 16 05:17 nmt | |
-rw-rw-r-- 1 ubuntu ubuntu 0 Jul 8 05:03 README.md | |
drwxrwxr-x 2 ubuntu ubuntu 4.0K Jul 8 05:03 test | |
drwxrwxr-x 3 ubuntu ubuntu 4.0K Jul 8 05:03 trails | |
ubuntu@ip-10-0-0-156:~/nmt$ rm *.log | |
rm: invalid option -- '.' | |
Try 'rm ./-.log' to remove the file ‘-.log’. | |
Try 'rm --help' for more information. | |
ubuntu@ip-10-0-0-156:~/nmt$ mv *.log a.log | |
mv: invalid option -- '.' | |
Try 'mv --help' for more information. | |
ubuntu@ip-10-0-0-156:~/nmt$ ls | |
bin -.log nmt README.md test trails | |
ubuntu@ip-10-0-0-156:~/nmt$ cd . | |
ubuntu@ip-10-0-0-156:~/nmt$ ls | |
bin -.log nmt README.md test trails | |
ubuntu@ip-10-0-0-156:~/nmt$ cat . | |
./ ../ .git/ .gitignore .idea/ .remote-sync.json | |
ubuntu@ip-10-0-0-156:~/nmt$ cat . | |
./ ../ .git/ .gitignore .idea/ .remote-sync.json | |
ubuntu@ip-10-0-0-156:~/nmt$ cls | |
No command 'cls' found, but there are 18 similar ones | |
cls: command not found | |
ubuntu@ip-10-0-0-156:~/nmt$ ls | |
bin -.log nmt README.md test trails | |
ubuntu@ip-10-0-0-156:~/nmt$ rm *.log | |
rm: invalid option -- '.' | |
Try 'rm ./-.log' to remove the file ‘-.log’. | |
Try 'rm --help' for more information. | |
ubuntu@ip-10-0-0-156:~/nmt$ rm \-.log | |
rm: invalid option -- '.' | |
Try 'rm ./-.log' to remove the file ‘-.log’. | |
Try 'rm --help' for more information. | |
ubuntu@ip-10-0-0-156:~/nmt$ rm \\-.log | |
rm: cannot remove ‘\\-.log’: No such file or directory | |
ubuntu@ip-10-0-0-156:~/nmt$ rm "\-.log" | |
rm: cannot remove ‘\\-.log’: No such file or directory | |
ubuntu@ip-10-0-0-156:~/nmt$ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment