Theano + cuSOLVER
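This gist has two parts: a standalone pycuda/scikit-cuda script that checks cuSOLVER's dense LU routines (cusolverDnSgetrf / cusolverDnSgetrs) against scipy.linalg.solve, and a Theano GPU Op wrapping the same solve.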
import numpy as np

try:
    import pygpu
    pygpu_available = True
except ImportError:
    pygpu_available = False

try:
    import pycuda.driver
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    pycuda_available = True
except ImportError:
    pycuda_available = False

try:
    import skcuda.cusolver as solver
    scikits_cuda_available = True
except (ImportError, Exception):
    scikits_cuda_available = False
def cusolver_solve(a, b, d, n, overwrite_a=False, overwrite_b=False):
    """Solve a (d, d) system with n right-hand sides via cuSOLVER's LU routines.

    `a` and `b` are pycuda GPUArrays; getrf/getrs always work in place, so
    copies are made unless overwriting is explicitly allowed.
    """
    a = a if overwrite_a else a.copy()
    b = b if overwrite_b else b.copy()
    h = solver.cusolverDnCreate()
    # Set up work buffers:
    Lwork = solver.cusolverDnSgetrf_bufferSize(h, d, d, a.gpudata, d)
    workspace_gpu = gpuarray.zeros(Lwork, np.float32)
    devipiv_gpu = gpuarray.zeros(d, np.int32)
    devinfo_gpu = gpuarray.zeros(1, np.int32)
    # LU-factorize a in place, then solve with the factors (op 0 = no transpose)
    solver.cusolverDnSgetrf(h, d, d, a.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata, devinfo_gpu.gpudata)
    solver.cusolverDnSgetrs(h, 0, d, n, a.gpudata, d, devipiv_gpu.gpudata, b.gpudata, d, devinfo_gpu.gpudata)
    solver.cusolverDnDestroy(h)
    return b
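# A note on error reporting (not in the original gist): the calls above do not
# surface failures; cuSOLVER writes a LAPACK-style status into devinfo_gpu
# instead. Inside cusolver_solve one could check it after the getrf call, e.g.:
#
#     info = int(devinfo_gpu.get()[0])  # 0 = success, i > 0 = U[i-1, i-1] is zero
#     if info != 0:
#         raise RuntimeError("cusolverDnSgetrf failed with devinfo = %d" % info)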
if __name__ == '__main__':
    import scipy as sp
    import scipy.linalg

    rows = 100
    cols = 37
    # Use a symmetric matrix so that cuSOLVER's column-major view of the
    # row-major buffer (i.e. a.T) is the same matrix.
    a = np.random.random((rows, rows)).astype("float32")
    a = np.dot(a, a.T)
    b = np.random.random((rows, cols)).astype("float32")
    solved = sp.linalg.solve(a, b)

    h = solver.cusolverDnCreate()
    d, n = b.shape
    # Need to copy the transposed matrix because .T only returns a view:
    x_gpu = gpuarray.to_gpu(a.copy())
    y_gpu = gpuarray.to_gpu(b.T.copy())
    # Set up work buffers:
    Lwork = solver.cusolverDnSgetrf_bufferSize(h, d, d, x_gpu.gpudata, d)
    workspace_gpu = gpuarray.zeros(Lwork, np.float32)
    devipiv_gpu = gpuarray.zeros(d, np.int32)
    devinfo_gpu = gpuarray.zeros(1, np.int32)
    solver.cusolverDnSgetrf(h, d, d, x_gpu.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata, devinfo_gpu.gpudata)
    solver.cusolverDnSgetrs(h, 0, d, n, x_gpu.gpudata, d, devipiv_gpu.gpudata, y_gpu.gpudata, d, devinfo_gpu.gpudata)
    cusolved = y_gpu.get().T
    print(np.linalg.norm(np.dot(a, solved) - b))
    print(np.linalg.norm(np.dot(a, cusolved) - b))
    solver.cusolverDnDestroy(h)

    # Same solve again through the helper:
    x_gpu = gpuarray.to_gpu(a.copy())
    y_gpu = gpuarray.to_gpu(b.T.copy())
    cusolved = cusolver_solve(x_gpu, y_gpu, d, n, overwrite_a=True, overwrite_b=True).get().T
    print(np.linalg.norm(np.dot(a, solved) - b))
    print(np.linalg.norm(np.dot(a, cusolved) - b))
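The second file wraps the same getrf/getrs sequence as a Theano GPU Op: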
from __future__ import absolute_import, print_function, division

import numpy as np
import theano
from theano import Op
import theano.tensor as T
from theano.gpuarray.basic_ops import (gpu_contiguous, as_gpuarray_variable,
                                       infer_context_name)
from theano.gpuarray.type import get_context

try:
    import pygpu
    pygpu_available = True
except ImportError:
    pygpu_available = False

try:
    import pycuda.driver
    pycuda_available = True
except ImportError:
    pycuda_available = False

try:
    import skcuda
    import skcuda.cusolver as solver
    scikits_cuda_available = True
except (ImportError, Exception):
    scikits_cuda_available = False

try:
    import scipy.linalg
    imported_scipy = True
except ImportError:
    # some ops (e.g. Cholesky, Solve, A_Xinv_b) won't work
    imported_scipy = False
class CuSolve(Op):
    __props__ = ('context_name', 'overwrite_A', 'overwrite_b')
    _f16_ok = False

    def __init__(self, context_name=None,
                 overwrite_A=False,
                 overwrite_b=False):
        self.context_name = context_name
        self.overwrite_A = overwrite_A
        self.overwrite_b = overwrite_b

    def __repr__(self):
        return 'CuSolve{%s}' % str(self._props())

    def get_params(self, node):
        return get_context(self.context_name), self.overwrite_A, self.overwrite_b

    def infer_shape(self, node, shapes):
        Ashape, Bshape = shapes
        rows = Ashape[0]
        if len(Bshape) == 1:  # b is a vector
            # NOTE: make_node below only accepts a matrix b, so this branch is unused.
            return [(rows,)]
        else:
            cols = Bshape[1]  # b is a matrix
            return [(rows, cols)]

    def make_node(self, a, b):
        if not scikits_cuda_available:
            raise RuntimeError("skcuda is needed for CuSolve")
        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuSolve")
        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuSolve")
        a = gpu_contiguous(as_gpuarray_variable(a, infer_context_name(a)))
        b = gpu_contiguous(as_gpuarray_variable(b, infer_context_name(b)))
        assert a.dtype == "float32"
        assert a.ndim == 2
        assert b.dtype == "float32"
        assert b.ndim == 2
        return theano.Apply(self, [a, b], [b.type()])
    def make_thunk(self, node, storage_map, _, _2, impl=None):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        # Initialize the cuda context to the input's.
        with node.inputs[0].type.context:
            skcuda.misc.init()

        # Copy b into the output, as the cuSOLVER solve always works in place.
        # A is always copied and b always duplicated here, so the
        # overwrite_A/overwrite_b props are not consulted by this thunk.
        def thunk():
            A = inputs[0][0].copy()
            b_init = inputs[1][0]
            b = outputs[0]
            a_shape = A.shape
            b_shape = b_init.shape
            # A must be square
            assert a_shape[0] == a_shape[1]
            # First dimension of b must match A
            assert b_shape[0] == a_shape[0]
            # Only allocate if there is no previous allocation of the
            # right size.
            output_shape = b_shape
            d, n = output_shape
            if b[0] is None or b[0].shape != output_shape:
                b[0] = pygpu.zeros(output_shape,
                                   context=A.context,
                                   dtype='float32')
            with A.context:
                # Sync GPU variables before computation
                A.sync()
                b_init.sync()
                # NOTE: unlike the standalone script, b is copied without
                # transposition; cuSOLVER reads the buffers column-major,
                # while these pygpu arrays are row-major.
                b[0][:] = b_init
                b = b[0]
                b.sync()
                # Initialize the cuSOLVER handle
                h = solver.cusolverDnCreate()
                # Set up work buffers (pivots and info are int32 for cuSOLVER):
                work = solver.cusolverDnSgetrf_bufferSize(h, d, d, A.gpudata, d)
                workspace_gpu = pygpu.zeros(work, context=A.context, dtype=np.float32)
                devipiv_gpu = pygpu.zeros(d, context=A.context, dtype=np.int32)
                devinfo_gpu = pygpu.zeros(1, context=A.context, dtype=np.int32)
                # Do the LU decomposition of A in place
                solver.cusolverDnSgetrf(h, d, d, A.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata,
                                        devinfo_gpu.gpudata)
                # Do the actual linear system solve (op 0 = no transpose)
                solver.cusolverDnSgetrs(h, 0, d, n, A.gpudata, d, devipiv_gpu.gpudata, b.gpudata, d,
                                        devinfo_gpu.gpudata)
                # Destroy the handle
                solver.cusolverDnDestroy(h)
                # Sync results to ensure the output contains the completed computation
                pycuda.driver.Context.synchronize()

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk
def cu_solve(a, b):
return CuSolve()(a, b)
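# The Op defines no gradient. As a hypothetical sketch (not in the original
# gist), one could express it with two extra solves, using the same identity
# as Theano's CPU Solve op: for x = solve(A, b) with upstream gradient g,
# db = solve(A.T, g) and dA = -db.dot(x.T).
def cu_solve_grad(a, b, g):
    x = cu_solve(a, b)
    b_bar = cu_solve(a.T, g)    # solve(A.T, g)
    a_bar = -T.dot(b_bar, x.T)  # -solve(A.T, g) x.T
    return a_bar, b_bar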
if __name__ == '__main__':
    a_in = theano.tensor.fmatrix()
    b_in = theano.tensor.fmatrix()
    c = cu_solve(a_in, b_in)
    f = theano.function([a_in, b_in], c)

    a_np = np.random.randn(20, 20).astype(theano.config.floatX)
    a_np = np.dot(a_np, a_np.T)
    b_np = np.random.randn(20, 5).astype(theano.config.floatX)
    sp_x = scipy.linalg.solve(a_np, b_np)
    cu_x = f(a_np, b_np)
    print(np.linalg.norm(np.dot(a_np, sp_x) - b_np))
    print(np.linalg.norm(np.dot(a_np, cu_x) - b_np))
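Note that the Op is float32-only (_f16_ok = False, plus the dtype asserts in make_node), and that this check, like the standalone script, uses a symmetric a_np, so cuSOLVER's column-major view of the row-major buffer coincides with the matrix itself.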