CUDA Tesla K80 Errors
Created June 9, 2015 13:49

cuda-memcheck output:
========= CUDA-MEMCHECK
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemsetAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2e4263]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x232029]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x11a11d]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x119840]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0xe7f12]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_Z6corrMMP11CudaNdarrayS0_S0_iiiii + 0x63d) [0x2c2d]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_ZN62_GLOBAL__N__38_tmpxft_000046f7_00000000_9_mod_cpp1_ii_9233ac7753__struct_compiled_op_6b20a8021c12925d87ac05c6d9b33c6c3runEv + 0x625) [0x37d5]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x36eb]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4128]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x3a00]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x47b7]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x77891]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x5a96f]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0xb4b2c]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCode + 0x32) [0xf9ed2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_FileExFlags + 0xb0) [0x119e10]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_SimpleFileExFlags + 0xef) [0x119fef]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (Py_Main + 0xca4) [0x12f8f4]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:python [0x649]
=========
========= Program hit cudaErrorInvalidResourceHandle (error 33) due to "invalid resource handle" on CUDA API call to cudaEventRecord.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2e4263]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x231732]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x2140a]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x11a2b2]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x119840]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0xe7f12]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_Z6corrMMP11CudaNdarrayS0_S0_iiiii + 0x63d) [0x2c2d]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_ZN62_GLOBAL__N__38_tmpxft_000046f7_00000000_9_mod_cpp1_ii_9233ac7753__struct_compiled_op_6b20a8021c12925d87ac05c6d9b33c6c3runEv + 0x625) [0x37d5]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x36eb]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4128]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x3a00]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x47b7]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x77891]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x5a96f]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0xb4b2c]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCode + 0x32) [0xf9ed2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_FileExFlags + 0xb0) [0x119e10]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_SimpleFileExFlags + 0xef) [0x119fef]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (Py_Main + 0xca4) [0x12f8f4]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:python [0x649]
=========
========= ERROR SUMMARY: 2 errors
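
Both failures are raised inside libcublas.so.7.0 while Theano's corrMM op is executing. On a multi-GPU machine like this one, an invalid argument on cudaMemsetAsync together with an invalid resource handle on cudaEventRecord often points at a stream, event, or cuBLAS handle that belongs to a different device context than the one currently active. A minimal sketch of the usual first check, pinning the process to a single K80 die before any CUDA library initializes; the device index '0' is an assumption, any of the four devices listed by deviceQuery below would do:

    import os

    # Expose only one die to the process *before* Theano (and thus cuBLAS)
    # initializes, so every handle, stream and event lives in the same
    # device context. Index '0' is an assumed choice.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    os.environ.setdefault('THEANO_FLAGS', 'device=gpu0,floatX=float32')

    import theano  # prints "Using gpu device 0: Tesla K80" on success

Re-running the pinned process under cuda-memcheck then shows whether the invalid-handle error is tied to the multi-device setup or survives on a single device.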

nvidia-smi output:
Tue Jun  9 08:45:54 2015
+------------------------------------------------------+
| NVIDIA-SMI 346.59     Driver Version: 346.59         |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla K80           Off  | 0000:83:00.0     Off |                    0 |
| N/A   34C    P0    59W / 149W |     55MiB / 11519MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           Off  | 0000:84:00.0     Off |                    0 |
| N/A   32C    P0    75W / 149W |     55MiB / 11519MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla K80           Off  | 0000:87:00.0     Off |                    0 |
| N/A   34C    P0    60W / 149W |     55MiB / 11519MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla K80           Off  | 0000:88:00.0     Off |                    0 |
| N/A   31C    P0    75W / 149W |     55MiB / 11519MiB |     97%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
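
One oddity in this snapshot: GPU 3 reports 97% utilization while the process table is empty, which may just be a transient sampling artifact but is worth watching while the job runs. A small sketch for polling the same figures programmatically, using nvidia-smi's standard CSV query properties:

    import subprocess

    # Poll the same per-GPU figures as the table above via the CSV interface.
    out = subprocess.check_output([
        'nvidia-smi',
        '--query-gpu=index,name,utilization.gpu,memory.used,memory.total',
        '--format=csv,noheader'])
    print(out)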

./deviceQuery output:
NVIDIA_CUDA-7.0_Samples/bin/x86_64/linux/release/deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 4 CUDA Capable device(s)
Device 0: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 131 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Device 1: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 132 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Device 2: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 135 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Device 3: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 136 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
> Peer access from Tesla K80 (GPU0) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU0) -> Tesla K80 (GPU2) : Yes
> Peer access from Tesla K80 (GPU0) -> Tesla K80 (GPU3) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU1) : No
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU2) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU3) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU2) : No
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU3) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU0) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU1) : No
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU2) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU0) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU2) : No
> Peer access from Tesla K80 (GPU3) -> Tesla K80 (GPU0) : Yes
> Peer access from Tesla K80 (GPU3) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU3) -> Tesla K80 (GPU2) : Yes
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.0, CUDA Runtime Version = 7.0, NumDevs = 4, Device0 = Tesla K80, Device1 = Tesla K80, Device2 = Tesla K80, Device3 = Tesla K80
Result = PASS
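
deviceQuery confirms four compute-capability 3.7 devices (two K80 boards) with peer access between the dies. When chasing the CUBLAS error below, it also helps to confirm which of the four devices Theano actually bound; a sketch using the introspection helpers of the old sandbox.cuda backend that ships with Theano 0.7, assuming they are present in this build:

    import theano.sandbox.cuda as cuda

    # Report which CUDA device this Theano process is actually bound to.
    if cuda.cuda_available:
        print('device %d: %s' % (cuda.active_device_number(),
                                 cuda.active_device_name()))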

Theano traceback:
Traceback (most recent call last):
  File "/home/<xy>net/<xy>nn/main.py", line 281, in <module>
    args.func(args)
  File "/home/<xy>net/<xy>nn/main.py", line 72, in train
    nn.train(data_provider)
  File "/home/<xy>net/<xy>nn/NN/neural_net.py", line 399, in train
    cost = self._train(batch_offset[0], batch_offset[1])
  File "/home/<xy>net/anaconda/lib/python2.7/site-packages/theano/compile/function_module.py", line 606, in __call__
    storage_map=self.fn.storage_map)
  File "/home/<xy>net/anaconda/lib/python2.7/site-packages/theano/compile/function_module.py", line 595, in __call__
    outputs = self.fn()
RuntimeError: GpuCorrMM encountered a CUBLAS error: an internal operation failed
This could be a known bug in CUDA, please see the GpuCorrMM() documentation.
Apply node that caused the error: GpuCorrMM{valid, (1, 1)}(GpuContiguous.0, GpuContiguous.0)
Inputs types: [CudaNdarrayType(float32, 4D), CudaNdarrayType(float32, 4D)]
Inputs shapes: [(3, 5, 128, 128), (3, 5, 118, 118)]
Inputs strides: [(81920, 16384, 128, 1), (69620, 13924, 118, 1)]
Inputs values: ['not shown', 'not shown']
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
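
The failing Apply node is GpuCorrMM{valid, (1, 1)} on inputs of shapes (3, 5, 128, 128) and (3, 5, 118, 118): a batch of 3 five-channel 128x128 images correlated against 118x118 feature maps, i.e. the weight-gradient step of a valid-mode convolution with 11x11 kernels (128 - 118 + 1 = 11). A minimal repro sketch built from those shapes, with the two flags suggested in the HINTs; the kernel count of 5 and the device/floatX values are assumptions, and whether the graph lowers to GpuCorrMM depends on the convolution optimizer in use (e.g. optimizer_including=conv_gemm):

    import os
    # Debug flags from the HINTs above; device/floatX are assumed values.
    os.environ.setdefault(
        'THEANO_FLAGS',
        'device=gpu0,floatX=float32,'
        'optimizer=fast_compile,exception_verbosity=high')

    import numpy as np
    import theano
    import theano.tensor as T

    images = T.tensor4('images')            # fed with (3, 5, 128, 128) below
    kernels = theano.shared(
        np.random.randn(5, 5, 11, 11).astype('float32'), name='kernels')

    out = T.nnet.conv2d(images, kernels, border_mode='valid')  # (3, 5, 118, 118)
    grad = T.grad(out.sum(), kernels)  # the correlation that GpuCorrMM performs

    f = theano.function([images], grad)
    x = np.random.randn(3, 5, 128, 128).astype('float32')
    print(f(x).shape)                  # expected: (5, 5, 11, 11)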

NVIDIA drivers:
<xy>net@train-k80:~/kaggle$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 346.59 Tue Mar 31 14:10:31 PDT 2015
GCC version: gcc version 4.8.2 (Ubuntu 4.8.2-19ubuntu1)

Theano:
<xy>net@train-k80:~/kaggle$ python -c 'import theano; print theano.__version__'
Using gpu device 0: Tesla K80
0.7.0
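
For bug reports like this one, the same facts can be gathered in one short script; a sketch, assuming nvidia-smi is on the PATH:

    import subprocess
    import theano  # prints the "Using gpu device ..." banner on import

    print(open('/proc/driver/nvidia/version').read())     # NVRM + GCC versions
    print(subprocess.check_output(['nvidia-smi', '-L']))  # one line per GPU
    print('Theano ' + theano.__version__)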