Kaushik Kulkarni kaushikcfd

Kernels obtained during mass matrix matvec.

Zero Initialization kernel

#include <CL/cl.h>
#include "petsc.h"
#include "petscvec.h"
#include "petscviennacl.h"
#include <iostream>

Initialization kernel

char kernel_source[] = "#define lid(N) ((int) get_local_id(N))\n"
"#define gid(N) ((int) get_group_id(N))\n"
"#if __OPENCL_C_VERSION__ < 120\n"
"#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
"#endif\n"
"\n"
"__kernel void __attribute__ ((reqd_work_group_size(33, 1, 1))) wrap_zero(int const start, int const end, __global double *__restrict__ dat0)\n"

Generated code for the function --

extern "C" void wrap_form0_cell_integral_otherwise(int const start, int const end, Vec dat0, Vec dat1, Vec dat2, cl_mem map0, const int map0_size)
{
    if(end == start)
    {
        // no need to go any further
        return;
    }

Solving Helmholtz for 2**19 elements.
This is the log for the GPU implementation. The solve takes a total of ~27 seconds.

------------------------------------------------------------------------------------------------------------------------
Event                Count      Time (sec)     Flop                              --- Global ---  --- Stage ----  Total
                   Max Ratio  Max     Ratio   Max  Ratio  Mess   AvgLen  Reduct  %T %F %M %L %R  %T %F %M %L %R Mflop/s
------------------------------------------------------------------------------------------------------------------------
                                                             
--- Event Stage 0: Main Stage

Time (all values are in seconds)

Kernel	CPU	GeForce Titan	Tesla K40
VecMDot	3.92	7.50	11.2
VecNorm	0.70	2.92	3.64
MV	5.64	4.01	3.53
Complete Solve	16.6	25.5	34.5

Bandwidth comparison on Titan V (in GB/s)

Platform	`N=2**18`	`N=2**21`	`N=2**24`
ViennaCL	36	195	485
Thrust	3.5	26	171
CUB	32	152	482

Before transformations

---------------------------------------------------------------------------
KERNEL: wrap_form0_cell_integral_otherwise
---------------------------------------------------------------------------
ARGUMENTS:
dat0: type: np_atomic:dtype('float64'), shape: (None), dim_tags: (N0:stride:1) aspace: global
dat1: type: np:dtype('float64'), shape: (None, 2), dim_tags: (N1:stride:2, N0:stride:1) aspace: global
dat2: type: np:dtype('float64'), shape: (None), dim_tags: (N0:stride:1) aspace: global

Mass matrix

Timings for 1 matvec (in s)

Element	Number of cells	Single cell per thread	Thread transposition
	73728	2.80e-5	3.05e-5
P1	1179648	1.01e-4	1.355e-4
	4718592	2.91e-4	4.32e-4
	73728	4.83e-5	5.21e-5

	import pyopencl as cl
	import pyopencl.clrandom as cl_random
	from pyopencl.reduction import ReductionKernel
	import numpy as np
	from mako.template import Template

	# Create an OpenCL context and a command queue bound to that context.
	ctx = cl.create_some_context()
	queue = cl.CommandQueue(ctx)
	# NOTE(review): presumably the problem size (2**10 = 1024 elements) -- confirm against the rest of the script.
	n = 2**10
	# NOTE(review): presumably the number of work-groups used for the launch -- confirm against the rest of the script.
	num_groups = 128

	from pymbolic import parse
	from pymbolic.mapper import WalkMapper


	class EdMapper(WalkMapper):
	def __init__(self):
	self.lines = []

	def map_variable(self, expr):
	if expr.name == 'a':