Scott Gray scott-gray

scott-gray / gaussian_pool.py

Last active September 11, 2016 23:21

Custom pooling kernels

	#!/usr/bin/python

	import numpy as np
	import pycuda.driver as drv
	from pycuda.tools import context_dependent_memoize
	from pycuda.compiler import SourceModule


	class GaussianPool(object):

scott-gray / simple_fprop_direct.py

Created April 29, 2016 17:38

	import numpy as np

	def ceil_div(x, y):
		"""Return the quotient x / y rounded up (toward positive infinity).

		Implemented as a negated floor division, so no floating-point
		conversion is involved when both operands are integers.
		"""
		# Floor-dividing the negated numerator and negating the result
		# rounds the true quotient up instead of down.
		return -(-x // y)

	def out_dim(S, X, padding, strides):
		"""Return ceil_div(X - S + 1 + 2*padding, strides).

		NOTE(review): presumably S is the filter extent and X the input
		extent along one axis, making the result the output extent of a
		strided, padded convolution -- confirm against the callers.
		"""
		return ceil_div(X - S + 1 + 2*padding, strides)

	def fconv_slice(q, S, X, padding, strides):

scott-gray / tid_aware.cu

Created June 9, 2020 17:42


	// A case for making the compiler more threadIdx aware in conditional code.

	// Proposed solution:
	// Walk the dependencies of any predicate gating a shfl.sync to look for threadIdx.
	// Simulate all 1024 values of threadIdx with full predicate expression to see if it's warp uniform.
	// Or you can also check if only a single thread is active for other optimizations (like in the atomic add case).
	// This can't be that complicated to do.

	__device__ __forceinline__ float shfl_xor(float var, int laneMask)

scott-gray / memset_bench.py

Created August 19, 2020 04:40

	#!/usr/bin/env python

	import pycuda.driver as drv
	from pycuda.autoinit import context, device
	from pycuda.compiler import SourceModule

	SMs = drv.Context.get_device().get_attributes()[drv.device_attribute.MULTIPROCESSOR_COUNT]

	print(device.name())