Skip to content

Instantly share code, notes, and snippets.

@kavinderd
Created April 19, 2017 22:36
Show Gist options
  • Save kavinderd/aca155c3724c269b4de1f6285bf69516 to your computer and use it in GitHub Desktop.
Save kavinderd/aca155c3724c269b4de1f6285bf69516 to your computer and use it in GitHub Desktop.
/*
 * format_c is a data structure that holds a column vector.
 *
 * The struct carries a tag so that the `values` member can refer to the
 * type while it is still being defined; the typedef name alone is not yet
 * visible inside the braces.
 *
 * NOTE(review): the element type of `values` is kept as format_c, as in the
 * original sketch; a real implementation would presumably store the
 * converted column values (e.g. Datum) here -- confirm.
 */
typedef struct format_c {
    struct format_c *values;    /* elements of this column vector */
    int row_count;              /* row count associated with this vector */
} format_c;
/*
 * format_batch is a data structure that holds an array of column vectors
 * (aka a row batch), together with the tuples formed from them and the
 * position of the next tuple to hand back to the caller.
 */
typedef struct {
    format_c *values;       /* array of col_count column vectors */
    int row_count;          /* number of rows in the batch */
    int col_count;          /* number of columns (attributes) in the batch */
    HeapTuples *tuples;     /* tuples formed from the column vectors */
    int cursor;             /* index into `tuples` of the next tuple to return */
} format_batch;
/*
* rowbatchformatter_import() takes a batch of rows from a buffer and converts them
* into HAWQ Tuples. This pseudo code gives a glimpse of what the implementation looks like
* and how the performance would be improved by converting whole batches of data
*
* Since HAWQ/GPDB/Postgres is based around tuple by tuple execution, this function is invoked
* with the expectation that it will return a single Tuple. To leverage the performance of row batches
* but still fitting into the current execution model this function holds resolved row batches
* in my memory and first checks this in memory data structure to return data. If the in memory structure
* is empty only then does it conver the next row batch.
*/
Datum
rowbatchformatter_import(PG_FUNCTION_ARGS)
{
//format_batch is a data structure that holds an array of column vectors (aka, row batch)
format_batch *myData;
int nrows;
//Get the Schema for the table
batchdesc = FORMATTER_GET_TUPDESC(fcinfo);
/*
* Get our internal description of the formatter
* RowBatch format will have both number of rows
* and number of attributes
*/
nrows = tupdesc->nrows;
ncolumns = tupdesc->natts;
myData = (format_t *) FORMATTER_GET_USER_CTX(fcinfo);
/*
* Initialize the context structure
*/
if (myData == NULL)
{
myData = palloc(sizeof(format_batch));
myData->row_count = nrows;
myData->col_count = ncolumns;
myData->values = palloc(sizeof(format_t) *ncolumns);
for (i = 0; i < ncolumns; i++) {
/* To leverage vectorization whole column vectors need
* to be converted into their appropriate HAWQ type
*/
format_c *col_vector = palloc(sizeof(format_c));
for(int j = 0; j < nrows; j++) {
/*
* Assuming auto-vectorization will be triggered by the compiler
* to optimize this loop
*
*/
col_vector[j] = readTypeFromBuffer(data_buf[col_offset(i)]);
}
myData->values[i] = col_vector;
}
//This function would take the array of columns and form an array of tuples
//using vectorized execution
myData->tuples = heap_form_tuples_vectorized(tupdesc, myData->values);
myData->cursor = 0;
}
return myData->tuples[myData->cursor++];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment