template <typename T>
struct LaunchConvOp<GPUDevice, T> {
  static void launch(OpKernelContext* ctx, bool use_cudnn,
                     const Tensor& input_param, const Tensor& filter,
                     int stride, const Eigen::PaddingType& padding,
                     Tensor* output) {
    auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
    // There's branching here for three separate paths.
    // First, we check if the CUDA platform is registered, and fall back to
    // using Eigen on the GPU if not.
    if (use_cudnn) {
      Tensor input = input_param;
      if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
        // ... 1x1 filter, so call cublas directly ...
        bool blas_launch_status =
            stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f,
                                 b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n)
                .ok();
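
To make the 1x1-filter shortcut concrete, here is a small NumPy sketch (plain Python, not TensorFlow code; the shapes are made up for illustration) of why a 1x1 convolution can be handed straight to a single GEMM like the ThenBlasGemm call above:

import numpy as np

n, h, w, c_in, c_out = 2, 4, 4, 3, 5
x = np.random.randn(n, h, w, c_in)        # NHWC input
f = np.random.randn(1, 1, c_in, c_out)    # 1x1 filter, HWIO layout

# Viewed as a GEMM: m = n*h*w rows, k = c_in, and the output has c_out columns.
gemm = np.dot(x.reshape(n * h * w, c_in), f.reshape(c_in, c_out))
conv = gemm.reshape(n, h, w, c_out)       # same values a 1x1 convolution would produce

A 1x1 filter touches each spatial location independently, so the reshape costs nothing, which is exactly why this path skips cuDNN and calls cuBLAS.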
struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
  static void launch(
      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
      Tensor* out) {
    const uint64 m = a.dim_size(1 - dim_pair[0].first);
    const uint64 k = a.dim_size(dim_pair[0].first);
    const uint64 n = b.dim_size(1 - dim_pair[0].second);
    // ... options for transposing the input matrices to the format cuBLAS expects ...

    // Get a Stream for this GPUDevice.
    auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
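
As a quick sanity check on those dimension computations, here is the same arithmetic in plain Python (the concrete shapes and the no-transpose dim_pair are hypothetical):

a_shape, b_shape = (128, 64), (64, 32)   # a is m x k, b is k x n
dim_pair = [(1, 0)]                      # contract a's axis 1 with b's axis 0

m = a_shape[1 - dim_pair[0][0]]          # 128
k = a_shape[dim_pair[0][0]]              # 64
n = b_shape[1 - dim_pair[0][1]]          # 32

When transpose_a or transpose_b is set, dim_pair simply names the other axis, and m, k, n fall out of the same three lines.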
// Load GPU kernel
gpu::StreamExecutor stream_exec{PlatformKind::kCuda};
gcudacc::kernel::MyKernel my_kernel{&stream_exec};
bool ok = stream_exec.GetKernel(gcudacc::spec::MyKernelSpec(), &my_kernel);
if (!ok) { ... }
// Allocate device memory on the GPU to hold the kernel's output
gpu::DeviceMemory<int> result = stream_exec.AllocateZeroed<int>();
if (result == nullptr) { ... }
class MatMulOp : public OpKernel {
 public:
  explicit MatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& a = ctx->input(0);
    const Tensor& b = ctx->input(1);
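
Those two GetAttr calls are where the transpose keyword arguments from the Python API end up. A short usage sketch (standard tf.matmul usage, nothing specific to this kernel beyond what the constructor above reads):

import tensorflow as tf

a = tf.constant([[1., 2.], [3., 4.]])
b = tf.constant([[5., 6.], [7., 8.]])
c = tf.matmul(a, b, transpose_a=True)    # sets the "transpose_a" attr on the MatMul node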
// TODO(jeff,sanjay): Session tests
// . Create and delete
// . Extend graph
// . Run
>>> import tensorflow as tf
>>> a = tf.Variable(tf.zeros([784, 10], name='a'))
>>> print str(tf.get_default_graph().as_graph_def())
node {
  name: "a"
  op: "Const"
  attr {
    key: "dtype"
    value {
      type: DT_FLOAT
# load training and test data from disk
import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# describe model
import tensorflow as tf
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
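
For completeness, this is roughly how the rest of the classic tutorial trains that model (standard 0.x-era API; the learning rate, batch size, and step count are illustrative, not taken from the text above):

y_ = tf.placeholder(tf.float32, [None, 10])                       # true labels
cross_entropy = -tf.reduce_sum(y_ * tf.log(y))                    # cross-entropy loss
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

sess = tf.Session()
sess.run(tf.initialize_all_variables())
for _ in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})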
src_device: /job:host_a/replica:0/task:0/cpu:0
src_incarnation: 82
dst_device: /job:host_b/replica:0/task:0/cpu:3
tensor_name: a
frame_id: 0
iter_id: 0
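
A hedged sketch of how those fields are used: the Send and Recv ops on either side of the edge agree on a single key string built from them. The ';'-joined layout below just strings together the fields listed above; the exact encoding the runtime uses for src_incarnation is an assumption here.

def make_rendezvous_key(src_device, src_incarnation, dst_device,
                        tensor_name, frame_id, iter_id):
    return ";".join([src_device, str(src_incarnation), dst_device,
                     tensor_name, "%d:%d" % (frame_id, iter_id)])

make_rendezvous_key("/job:host_a/replica:0/task:0/cpu:0", 82,
                    "/job:host_b/replica:0/task:0/cpu:3", "a", 0, 0)
# "/job:host_a/replica:0/task:0/cpu:0;82;/job:host_b/replica:0/task:0/cpu:3;a;0:0"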
TEST_F(GraphPartitionTest, CrossDeviceData) {
  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
  Node* a1 = Input(in_.opts().WithName("A1"));
  Node* b1 = Input(in_.opts().WithName("B1"));
  Cross(a1, b1, in_.opts().WithName("B2"));

  Partition(ToGraphDef(), &partitions_);
  EXPECT_EQ(2, partitions_.size());
  // ... there are some additional assertions, but they require a bit of unwinding
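
To see what that size-2 expectation is really asserting, here is a toy partitioner in plain Python (a conceptual model, not the real Partition() implementation, and it assumes the A* and B* nodes land on different devices as the test name suggests): nodes are grouped by device, and each cross-device edge is replaced by a _Send on the producer's partition plus a matching _Recv on the consumer's.

def partition(devices, edges):
    # devices: {node_name: device}, edges: [(src, dst)]
    parts = {}
    for name, dev in devices.items():
        parts.setdefault(dev, []).append(name)
    for src, dst in edges:
        if devices[src] != devices[dst]:
            parts[devices[src]].append("_Send(" + src + ")")
            parts[devices[dst]].append("_Recv(" + src + ")")
    return parts

# A1 on one device feeding B1/B2 on another yields exactly two partitions,
# with a _Send/_Recv pair carrying A1's output across the device boundary.
partition({"A1": "cpu:0", "B1": "cpu:1", "B2": "cpu:1"},
          [("A1", "B2"), ("B1", "B2")])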