This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import numpy as np | |
import pycuda.driver as drv | |
from pycuda.tools import context_dependent_memoize | |
from pycuda.compiler import SourceModule | |
class GaussianPool(object): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def ceil_div(x, y):
    """Return ``x`` divided by ``y``, rounded toward positive infinity.

    Uses floor division only: flooring the negated numerator and negating
    the result again turns the floor into a ceiling, so no float
    conversion is involved.
    """
    return -(-x // y)
def out_dim(S, X, padding, strides):
    """Return ``ceil((X - S + 1 + 2*padding) / strides)``.

    NOTE(review): presumably ``S`` is the filter extent and ``X`` the input
    extent along one axis, giving the output extent of a strided, padded
    convolution -- confirm against callers.
    """
    # Number of positions available before striding is applied.
    span = X - S + 1 + 2 * padding
    return ceil_div(span, strides)
def fconv_slice(q, S, X, padding, strides): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// A case for making the compiler more threadIdx aware in conditional code. | |
// Proposed solution: | |
// Walk the dependencies of any predicate gating a shfl.sync to look for threadIdx. | |
// Simulate all 1024 values of threadIdx with full predicate expression to see if it's warp uniform. | |
// Or you can also check if only a single thread is active for other optimizations (like in that atomic add). | |
// This can't be that complicated to do. | |
__device__ __forceinline__ float shfl_xor(float var, int laneMask) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import pycuda.driver as drv | |
from pycuda.autoinit import context, device | |
from pycuda.compiler import SourceModule | |
SMs = drv.Context.get_device().get_attributes()[drv.device_attribute.MULTIPROCESSOR_COUNT] | |
print(device.name()) |