/*
 * format_c is a data structure that holds a single column vector: the
 * converted Datum values for one attribute across all rows in the batch.
 */
typedef struct
{
	Datum	   *values;
	int			row_count;
} format_c;
/*
 * format_batch is a data structure that holds an array of column vectors
 * (aka a row batch), plus the tuples formed from them and a cursor into
 * that tuple array.
 */
typedef struct
{
	format_c  **values;		/* one column vector per attribute */
	int			row_count;
	int			col_count;
	HeapTuple  *tuples;		/* the row batch converted to heap tuples */
	int			cursor;		/* next tuple to hand back to the executor */
} format_batch;
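/*
 * Tiny illustrative accessor (assumed name, not part of the formatter API):
 * the batch stores data column-major, so attribute `col` of row `row` lives
 * at values[col]->values[row], while tuples[row] is the same row after it
 * has been re-formed as a heap tuple.
 */
static inline Datum
format_batch_get(format_batch *batch, int col, int row)
{
	return batch->values[col]->values[row];
}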
/*
 * rowbatchformatter_import() takes a batch of rows from a buffer and converts
 * them into HAWQ tuples. This pseudocode gives a glimpse of what the
 * implementation looks like and how performance would be improved by
 * converting whole batches of data at a time.
 *
 * Since HAWQ/GPDB/Postgres is built around tuple-by-tuple execution, this
 * function is invoked with the expectation that it will return a single
 * tuple. To leverage the performance of row batches while still fitting into
 * the current execution model, the function holds resolved row batches in
 * memory and first checks that in-memory structure for data to return. Only
 * when the in-memory structure is empty does it convert the next row batch.
 */
Datum
rowbatchformatter_import(PG_FUNCTION_ARGS)
{
	/* format_batch holds an array of column vectors (aka a row batch) */
	format_batch *myData;
	TupleDesc	tupdesc;
	char	   *data_buf;
	int			nrows;
	int			ncolumns;
	int			i;

	/* Get the schema for the table */
	tupdesc = FORMATTER_GET_TUPDESC(fcinfo);

	/* Raw input bytes for this batch */
	data_buf = FORMATTER_GET_DATABUF(fcinfo);

	/*
	 * Get our internal description of the formatter.
	 * The RowBatch format carries both the number of rows
	 * and the number of attributes.
	 */
	nrows = tupdesc->nrows;		/* pseudocode: batch row count */
	ncolumns = tupdesc->natts;

	myData = (format_batch *) FORMATTER_GET_USER_CTX(fcinfo);

	/*
	 * Initialize the context structure on the first call.
	 */
	if (myData == NULL)
	{
		myData = palloc(sizeof(format_batch));
		myData->row_count = nrows;
		myData->col_count = ncolumns;
		myData->values = palloc(sizeof(format_c *) * ncolumns);

		for (i = 0; i < ncolumns; i++)
		{
			/*
			 * To leverage vectorization, whole column vectors are
			 * converted into their appropriate HAWQ types.
			 */
			format_c   *col_vector = palloc(sizeof(format_c));

			col_vector->row_count = nrows;
			col_vector->values = palloc(sizeof(Datum) * nrows);

			for (int j = 0; j < nrows; j++)
			{
				/*
				 * Assuming auto-vectorization will be triggered by the
				 * compiler to optimize this loop. col_offset() and
				 * readTypeFromBuffer() are pseudocode helpers.
				 */
				col_vector->values[j] = readTypeFromBuffer(data_buf[col_offset(i)]);
			}
			myData->values[i] = col_vector;
		}

		/*
		 * This function would take the array of columns and form an array
		 * of tuples using vectorized execution.
		 */
		myData->tuples = heap_form_tuples_vectorized(tupdesc, myData->values);
		myData->cursor = 0;
		FORMATTER_SET_USER_CTX(fcinfo, myData);
	}

	/*
	 * Pseudocode: once cursor reaches row_count, the next row batch would be
	 * converted the same way before returning.
	 */
	return myData->tuples[myData->cursor++];
}
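/*
 * Hypothetical sketch (not an existing HAWQ/Postgres API) of what
 * heap_form_tuples_vectorized() referenced above could look like: it takes
 * the table's TupleDesc and the array of already-converted column vectors,
 * gathers each row's Datums out of the columnar layout, and builds one heap
 * tuple per row with the standard heap_form_tuple() call. NULL handling and
 * error checking are omitted for brevity; a real implementation would do the
 * per-column work in batch to benefit from vectorization.
 */
static HeapTuple *
heap_form_tuples_vectorized(TupleDesc tupdesc, format_c **columns)
{
	int			ncolumns = tupdesc->natts;
	int			nrows = columns[0]->row_count;
	HeapTuple  *tuples = palloc(sizeof(HeapTuple) * nrows);
	Datum	   *row_values = palloc(sizeof(Datum) * ncolumns);
	bool	   *row_nulls = palloc0(sizeof(bool) * ncolumns);	/* assume no NULLs */

	for (int row = 0; row < nrows; row++)
	{
		/* Gather this row's value from every column vector */
		for (int col = 0; col < ncolumns; col++)
			row_values[col] = columns[col]->values[row];

		tuples[row] = heap_form_tuple(tupdesc, row_values, row_nulls);
	}

	pfree(row_values);
	pfree(row_nulls);
	return tuples;
}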