I start the container with
$nvidia-docker run -e TERM=dumb -v /usr/local/cuda:/usr/local/cuda -i -t kaixhin/torch /bin/bash
and I was able to run luarocks install cutorch
. But when I test the library, I get the following CUDA error:
require 'cutorch'
THCudaCheck FAIL file=/tmp/luarocks_cutorch-scm-1-7401/cutorch/lib/THC/THCGeneral.c line=70 error=35 : CUDA driver version is insufficient for CUDA runtime version
/root/torch/install/share/lua/5.1/trepl/init.lua:389: cuda runtime error (35) : CUDA driver version is insufficient for CUDA runtime version at /tmp/luarocks_cutorch-scm-1-7401/cutorch/lib/THC/THCGeneral.c:70
stack traceback:
[C]: in function 'error'
/root/torch/install/share/lua/5.1/trepl/init.lua:389: in function 'require'
[string "_RESULT={require 'cutorch'}"]:1: in main chunk
[C]: in function 'xpcall'
/root/torch/install/share/lua/5.1/trepl/init.lua:661: in function 'repl'
/root/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:204: in main chunk
[C]: at 0x00406670
There are also more errors related to the Lua libraries. The file Imports.lua
has two imported files: OneHot.lua
and Constant.lua
; both give errors. The OneHot.lua file was copied from https://raw.githubusercontent.com/Element-Research/dpnn/master/OneHot.lua, which now returns a 404 status.
require 'nn'
require 'nngraph'
require 'optim'
package.path = package.path .. ';util/?.lua'
require 'LogSumExp'
require 'Cond'
require 'EpochDropout'
require 'Predicate'
require 'PrintNoNewline'
require 'TruncatedBackprop'
require 'IO'
require 'Callback'
require 'OneHot' -- error loading OneHot ???
/root/torch/install/share/lua/5.1/trepl/init.lua:389: /root/torch/install/share/lua/5.1/torch/init.lua:102: class nn.OneHot has been already assigned a parent class
stack traceback:
[C]: in function 'error'
/root/torch/install/share/lua/5.1/trepl/init.lua:389: in function 'require'
[string "require 'OneHot' -- error loading OneHot ???"]:1: in main chunk
[C]: in function 'xpcall'
/root/torch/install/share/lua/5.1/trepl/init.lua:679: in function 'repl'
/root/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:204: in main chunk
[C]: at 0x00406670
package.path = package.path .. ';../torch-util/?.lua'
require 'Constant' -- why error ??
/root/torch/install/share/lua/5.1/trepl/init.lua:389: /root/torch/install/share/lua/5.1/torch/init.lua:102: class nn.Constant has been already assigned a parent class
stack traceback:
[C]: in function 'error'
/root/torch/install/share/lua/5.1/trepl/init.lua:389: in function 'require'
[string "require 'Constant' -- why error ?? "]:1: in main chunk
[C]: in function 'xpcall'
/root/torch/install/share/lua/5.1/trepl/init.lua:679: in function 'repl'
/root/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:204: in main chunk
[C]: at 0x00406670
Strangely, though, when using the Docker image douglasorr/cutorch_7.5_352.41
, the same two imports, OneHot.lua
and Constant.lua
, load fine.
However, the douglasorr/cutorch_7.5_352.41
image finds no CUDA-capable device:
Training a SPEN on the Data using tag_cmd.sh
tag_cmd.sh: 5: [: 0: unexpected operator
-gradient_clip 1.0 -optim_method adam -evaluation_frequency 25 -save_frequency 25 -adam_epsilon 1e-8 -batches_per_epoch 100 -learning_rate_decay 0.0 -learning_rate 0.001 -num_epochs 30 -training_mode pretrain_unaries
running in tag-runs/Tue_Jul__3_07:23:03_UTC_2018
{
cudnn : 1
inference_rtol : 1e-05
profile : 0
batch_size : 10
icnn : 0
problem : "SequenceTagging"
init_line_search_step : 1
inference_learning_rate : 0.1
inference_learning_rate_power : 1
init_classifier : ""
test_list : "./data/sequence/crf-data.test.list"
inference_learning_rate_decay : 0
train_list : "./data/sequence/crf-data.train.list"
max_inference_iters : 20
evaluate_classifier_only : 0
gpuid : 0
training_configs : "tag-runs/Tue_Jul__3_07:23:03_UTC_2018/train-config"
results_file : ""
first_iter_to_penalize_convergence : 10
continuous_outputs : 0
penalize_all_iterates : 0
mirror_descent : 1
out_dir : "tag-runs/Tue_Jul__3_07:23:03_UTC_2018/results"
line_search : 1
model_file : "tag-runs/Tue_Jul__3_07:23:03_UTC_2018/model-"
evaluate_spen_only : 0
init_full_net : ""
entropy_weight : 1
unconstrained_iterates : 1
problem_config : "tag-runs/Tue_Jul__3_07:23:03_UTC_2018/problem-config"
convergence_regularization_weight : 0
training_method : "E2E"
inference_momentum : 0.5
init_at_local_prediction : 0
shuffle : 1
learn_inference_hyperparams : 1
first_iter_to_apply_loss : 10
}
USING GPU 0
/opt/torch/install/bin/luajit: /opt/torch/install/share/lua/5.1/trepl/init.lua:363: cuda runtime error (38) : no CUDA-capable device is detected at /opt/torch/extra/cutorch/lib/THC/THCGeneral.c:16
stack traceback:
[C]: in function 'error'
/opt/torch/install/share/lua/5.1/trepl/init.lua:363: in function 'require'
main.lua:21: in main chunk
[C]: in function 'dofile'
/opt/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:131: in main chunk
[C]: at 0x00405d30