Theano + cuSOLVER
A standalone pycuda/scikit-cuda check that cuSOLVER's dense LU routines (getrf/getrs) reproduce scipy.linalg.solve:
import numpy as np
try:
    import pygpu
    pygpu_available = True
except ImportError:
    pygpu_available = False
try:
    import pycuda.driver
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    pycuda_available = True
except ImportError:
    pycuda_available = False
try:
    import skcuda.cusolver as solver
    scikits_cuda_available = True
except (ImportError, Exception):
    scikits_cuda_available = False
def cusolver_solve(a, b, d, n, overwrite_a=False, overwrite_b=False):
    """Solve a x = b for a d x d matrix `a` and a d x n right-hand side `b`.

    Both `a` and `b` must be float32 GPUArrays laid out in column-major
    (Fortran) order, since cuSOLVER follows the LAPACK convention.
    """
    a = a if overwrite_a else a.copy()
    b = b if overwrite_b else b.copy()
    h = solver.cusolverDnCreate()
    # Set up work buffers:
    Lwork = solver.cusolverDnSgetrf_bufferSize(h, d, d, a.gpudata, d)
    workspace_gpu = gpuarray.zeros(Lwork, np.float32)
    devipiv_gpu = gpuarray.zeros(d, np.int32)
    devinfo_gpu = gpuarray.zeros(1, np.int32)
    # LU factorization of `a` in place, then the solve using its factors:
    solver.cusolverDnSgetrf(h, d, d, a.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata, devinfo_gpu.gpudata)
    solver.cusolverDnSgetrs(h, 0, d, n, a.gpudata, d, devipiv_gpu.gpudata, b.gpudata, d, devinfo_gpu.gpudata)
    solver.cusolverDnDestroy(h)
    return b
if __name__ == '__main__':
    import scipy as sp
    import scipy.linalg
    rows = 100
    cols = 37
    a = np.random.random((rows, rows)).astype("float32")
    # Make `a` symmetric, so passing the row-major array to the
    # column-major cuSOLVER routines needs no transposition.
    a = np.dot(a, a.T)
    b = np.random.random((rows, cols)).astype("float32")
    solved = sp.linalg.solve(a, b)
    h = solver.cusolverDnCreate()
    d, n = b.shape
    # cuSOLVER expects column-major data, so copy b transposed; the copy is
    # explicit because .T only returns a view:
    x_gpu = gpuarray.to_gpu(a.copy())
    y_gpu = gpuarray.to_gpu(b.T.copy())
    # Set up work buffers:
    Lwork = solver.cusolverDnSgetrf_bufferSize(h, d, d, x_gpu.gpudata, d)
    workspace_gpu = gpuarray.zeros(Lwork, np.float32)
    devipiv_gpu = gpuarray.zeros(d, np.int32)
    devinfo_gpu = gpuarray.zeros(1, np.int32)
    solver.cusolverDnSgetrf(h, d, d, x_gpu.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata, devinfo_gpu.gpudata)
    solver.cusolverDnSgetrs(h, 0, d, n, x_gpu.gpudata, d, devipiv_gpu.gpudata, y_gpu.gpudata, d, devinfo_gpu.gpudata)
    cusolved = y_gpu.get().T
    print(np.linalg.norm(np.dot(a, solved) - b))
    print(np.linalg.norm(np.dot(a, cusolved) - b))
    solver.cusolverDnDestroy(h)
    # The same solve through the helper, overwriting the freshly copied inputs:
    x_gpu = gpuarray.to_gpu(a.copy())
    y_gpu = gpuarray.to_gpu(b.T.copy())
    cusolved = cusolver_solve(x_gpu, y_gpu, d, n,
                              overwrite_a=True, overwrite_b=True).get().T
    print(np.linalg.norm(np.dot(a, solved) - b))
    print(np.linalg.norm(np.dot(a, cusolved) - b))
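Neither run above inspects devinfo_gpu, which is where getrf and getrs report failures. A minimal sketch of such a check, assuming the same pycuda setup (check_devinfo is an illustrative name, not part of the gist):

def check_devinfo(devinfo_gpu, routine):
    # getrf/getrs write 0 on success and -i if argument i was invalid;
    # getrf additionally writes i > 0 when U[i-1, i-1] is exactly zero,
    # i.e. the matrix is singular.
    info = int(devinfo_gpu.get()[0])
    if info < 0:
        raise ValueError("%s: argument %d had an illegal value" % (routine, -info))
    if info > 0:
        raise np.linalg.LinAlgError("%s: singular matrix, U[%d, %d] is zero" % (routine, info - 1, info - 1))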
The same getrf/getrs routines, wrapped as a Theano Op for the gpuarray backend:
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano import Op
import theano.tensor as T
from theano.gpuarray.basic_ops import (gpu_contiguous, as_gpuarray_variable,
                                       infer_context_name)
from theano.gpuarray.type import get_context
try:
    import pygpu
    pygpu_available = True
except ImportError:
    pygpu_available = False
try:
    import pycuda.driver
    pycuda_available = True
except ImportError:
    pycuda_available = False
try:
    import skcuda
    import skcuda.misc
    import skcuda.cusolver as solver
    scikits_cuda_available = True
except (ImportError, Exception):
    scikits_cuda_available = False
try:
    import scipy.linalg
    imported_scipy = True
except ImportError:
    # Some ops (e.g. Cholesky, Solve, A_Xinv_b) won't work without SciPy.
    imported_scipy = False
class CuSolve(Op):
    __props__ = ('context_name', 'overwrite_A', 'overwrite_b')
    _f16_ok = False

    def __init__(self, context_name=None,
                 overwrite_A=False,
                 overwrite_b=False):
        self.context_name = context_name
        self.overwrite_A = overwrite_A
        self.overwrite_b = overwrite_b

    def __repr__(self):
        return 'CuSolve{%s}' % str(self._props())

    def get_params(self, node):
        return get_context(self.context_name), self.overwrite_A, self.overwrite_b

    def infer_shape(self, node, shapes):
        Ashape, Bshape = shapes
        rows = Ashape[0]
        if len(Bshape) == 1:  # b is a vector
            return [(rows,)]
        else:  # b is a matrix
            cols = Bshape[1]
            return [(rows, cols)]
    def make_node(self, a, b):
        if not scikits_cuda_available:
            raise RuntimeError("skcuda is needed for CuSolve")
        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuSolve")
        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuSolve")
        a = gpu_contiguous(as_gpuarray_variable(a, infer_context_name(a)))
        b = gpu_contiguous(as_gpuarray_variable(b, infer_context_name(b)))
        assert a.dtype == "float32"
        assert a.ndim == 2
        assert b.dtype == "float32"
        assert b.ndim == 2
        return theano.Apply(self, [a, b], [b.type()])
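    # Not part of the original gist: a possible gradient, following the same
    # formulas as Theano's CPU Solve op (b_bar = A^{-T} c_bar,
    # A_bar = -b_bar c^T). It assumes the transposed system can itself be
    # handled by a second CuSolve applied to A.T.
    def L_op(self, inputs, outputs, output_gradients):
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]
        b_bar = CuSolve(context_name=self.context_name)(A.T, c_bar)
        A_bar = -T.dot(b_bar, c.T)
        return [A_bar, b_bar]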
    def make_thunk(self, node, storage_map, _, _2, impl=None):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        # Initialize the CUDA context to the input's.
        with node.inputs[0].type.context:
            skcuda.misc.init()

        # Copy b into the output buffer, as the cuSOLVER solve is performed
        # in place. Note that cuSOLVER follows the column-major LAPACK
        # convention, so the row-major A is factored as its transpose; the
        # test below uses a symmetric A, for which this is harmless.
        def thunk():
            A = inputs[0][0].copy()
            b_init = inputs[1][0]
            b = outputs[0]
            a_shape = A.shape
            output_shape = b_init.shape
            # A must be square and b's first dimension must match A.
            assert a_shape[0] == a_shape[1]
            assert output_shape[0] == a_shape[0]
            d, n = output_shape
            # Only allocate if there is no previous allocation of the
            # right size.
            if b[0] is None or b[0].shape != output_shape:
                b[0] = pygpu.zeros(output_shape,
                                   context=A.context,
                                   dtype='float32')
            with A.context:
                # Sync GPU variables before the computation.
                A.sync()
                b_init.sync()
                b[0][:] = b_init
                b = b[0]
                b.sync()
                # Initialize the cuSOLVER handle.
                h = solver.cusolverDnCreate()
                # Set up work buffers; the pivot and info buffers are int32,
                # as getrf expects.
                work = solver.cusolverDnSgetrf_bufferSize(h, d, d, A.gpudata, d)
                workspace_gpu = pygpu.zeros(work, context=A.context, dtype=np.float32)
                devipiv_gpu = pygpu.zeros(d, context=A.context, dtype=np.int32)
                devinfo_gpu = pygpu.zeros(1, context=A.context, dtype=np.int32)
                # LU decomposition of A, in place.
                solver.cusolverDnSgetrf(h, d, d, A.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata,
                                        devinfo_gpu.gpudata)
                # The actual linear system solve, in place in b.
                solver.cusolverDnSgetrs(h, 0, d, n, A.gpudata, d, devipiv_gpu.gpudata, b.gpudata, d,
                                        devinfo_gpu.gpudata)
                # Destroy the handle.
                solver.cusolverDnDestroy(h)
                # Sync so the output contains the completed computation.
                pycuda.driver.Context.synchronize()

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk
def cu_solve(a, b):
    return CuSolve()(a, b)


a_in = theano.tensor.fmatrix()
b_in = theano.tensor.fmatrix()
c = cu_solve(a_in, b_in)
f = theano.function([a_in, b_in], c)
a_np = np.random.randn(20, 20).astype(theano.config.floatX)
# Make the test matrix symmetric positive definite.
a_np = np.dot(a_np, a_np.T)
b_np = np.random.randn(20, 5).astype(theano.config.floatX)
sp_x = scipy.linalg.solve(a_np, b_np)
cu_x = f(a_np, b_np)
print(np.linalg.norm(np.dot(a_np, sp_x) - b_np))
print(np.linalg.norm(np.dot(a_np, cu_x) - b_np))
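Because CuSolve is an ordinary Theano Op, it composes with the rest of a graph. As a sketch, a least-squares fit through the normal equations (X, y and lstsq are illustrative names; X.T X is symmetric, matching the assumption above):

X = theano.tensor.fmatrix()
y = theano.tensor.fmatrix()
beta = cu_solve(T.dot(X.T, X), T.dot(X.T, y))
lstsq = theano.function([X, y], beta)
X_np = np.random.randn(50, 20).astype(theano.config.floatX)
y_np = np.random.randn(50, 5).astype(theano.config.floatX)
print(np.linalg.norm(lstsq(X_np, y_np) - np.linalg.lstsq(X_np, y_np)[0]))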