/*
 * format_c is a data structure that holds a single column vector: the
 * converted Datum values for one attribute across all rows in the batch.
 */
typedef struct
{
	Datum	   *values;
	int			row_count;
} format_c;
/*
 * format_batch is a data structure that holds an array of column vectors
 * (aka a row batch), plus the tuples formed from them and a cursor into
 * that tuple array.
 */
typedef struct
{
	format_c  **values;		/* one column vector per attribute */
	int			row_count;
	int			col_count;
	HeapTuple  *tuples;		/* the row batch converted to heap tuples */
	int			cursor;		/* next tuple to hand back to the executor */
} format_batch;
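/*
 * Tiny illustrative accessor (assumed name, not part of the formatter API):
 * the batch stores data column-major, so attribute `col` of row `row` lives
 * at values[col]->values[row], while tuples[row] is the same row after it
 * has been re-formed as a heap tuple.
 */
static inline Datum
format_batch_get(format_batch *batch, int col, int row)
{
	return batch->values[col]->values[row];
}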
/*
 * rowbatchformatter_import() takes a batch of rows from a buffer and converts
 * them into HAWQ tuples. This pseudocode gives a glimpse of what the
 * implementation looks like and how performance would be improved by
 * converting whole batches of data at a time.
 *
 * Since HAWQ/GPDB/Postgres is built around tuple-by-tuple execution, this
 * function is invoked with the expectation that it will return a single
 * tuple. To leverage the performance of row batches while still fitting into
 * the current execution model, the function holds resolved row batches in
 * memory and first checks that in-memory structure for data to return. Only
 * when the in-memory structure is empty does it convert the next row batch.
 */
Datum
rowbatchformatter_import(PG_FUNCTION_ARGS)
{
	/* format_batch holds an array of column vectors (aka a row batch) */
	format_batch *myData;
	TupleDesc	tupdesc;
	char	   *data_buf;
	int			nrows;
	int			ncolumns;
	int			i;

	/* Get the schema for the table */
	tupdesc = FORMATTER_GET_TUPDESC(fcinfo);

	/* Raw input bytes for this batch */
	data_buf = FORMATTER_GET_DATABUF(fcinfo);

	/*
	 * Get our internal description of the formatter.
	 * The RowBatch format carries both the number of rows
	 * and the number of attributes.
	 */
	nrows = tupdesc->nrows;		/* pseudocode: batch row count */
	ncolumns = tupdesc->natts;

	myData = (format_batch *) FORMATTER_GET_USER_CTX(fcinfo);

	/*
	 * Initialize the context structure on the first call.
	 */
	if (myData == NULL)
	{
		myData = palloc(sizeof(format_batch));
		myData->row_count = nrows;
		myData->col_count = ncolumns;
		myData->values = palloc(sizeof(format_c *) * ncolumns);

		for (i = 0; i < ncolumns; i++)
		{
			/*
			 * To leverage vectorization, whole column vectors are
			 * converted into their appropriate HAWQ types.
			 */
			format_c   *col_vector = palloc(sizeof(format_c));

			col_vector->row_count = nrows;
			col_vector->values = palloc(sizeof(Datum) * nrows);

			for (int j = 0; j < nrows; j++)
			{
				/*
				 * Assuming auto-vectorization will be triggered by the
				 * compiler to optimize this loop. col_offset() and
				 * readTypeFromBuffer() are pseudocode helpers.
				 */
				col_vector->values[j] = readTypeFromBuffer(data_buf[col_offset(i)]);
			}
			myData->values[i] = col_vector;
		}

		/*
		 * This function would take the array of columns and form an array
		 * of tuples using vectorized execution.
		 */
		myData->tuples = heap_form_tuples_vectorized(tupdesc, myData->values);
		myData->cursor = 0;
		FORMATTER_SET_USER_CTX(fcinfo, myData);
	}

	/*
	 * Pseudocode: once cursor reaches row_count, the next row batch would be
	 * converted the same way before returning.
	 */
	return myData->tuples[myData->cursor++];
}
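/*
 * Hypothetical sketch (not an existing HAWQ/Postgres API) of what
 * heap_form_tuples_vectorized() referenced above could look like: it takes
 * the table's TupleDesc and the array of already-converted column vectors,
 * gathers each row's Datums out of the columnar layout, and builds one heap
 * tuple per row with the standard heap_form_tuple() call. NULL handling and
 * error checking are omitted for brevity; a real implementation would do the
 * per-column work in batch to benefit from vectorization.
 */
static HeapTuple *
heap_form_tuples_vectorized(TupleDesc tupdesc, format_c **columns)
{
	int			ncolumns = tupdesc->natts;
	int			nrows = columns[0]->row_count;
	HeapTuple  *tuples = palloc(sizeof(HeapTuple) * nrows);
	Datum	   *row_values = palloc(sizeof(Datum) * ncolumns);
	bool	   *row_nulls = palloc0(sizeof(bool) * ncolumns);	/* assume no NULLs */

	for (int row = 0; row < nrows; row++)
	{
		/* Gather this row's value from every column vector */
		for (int col = 0; col < ncolumns; col++)
			row_values[col] = columns[col]->values[row];

		tuples[row] = heap_form_tuple(tupdesc, row_values, row_nulls);
	}

	pfree(row_values);
	pfree(row_nulls);
	return tuples;
}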