Last active
March 26, 2024 18:09
-
-
Save mhkeller/855ca5c0a6582e4ead7d36e6f8169fdd to your computer and use it in GitHub Desktop.
Convert a parquet file to bytes representing an arrow table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "parquet-to-arrow", | |
"version": "1.0.0", | |
"description": "", | |
"scripts": { | |
}, | |
"dependencies": { | |
"apache-arrow": "^15.0.2", | |
"arrow-js-ffi": "^0.4.1", | |
"parquet-wasm": "^0.6.0-beta.2" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Converts a parquet file on disk to an arrow file in memory | |
* Adapted from the README example in https://github.com/kylebarron/parquet-wasm?tab=readme-ov-file | |
* | |
* @param {string} filepath | |
* @returns {buffer} | |
*/ | |
const arrow = require("apache-arrow"); | |
const { parseRecordBatch } = require("arrow-js-ffi"); | |
const { readFileSync } = require("fs"); | |
const { readParquet, wasmMemory } = require("parquet-wasm"); | |
module.exports = async function parquetToArrow(filepath) { | |
// A reference to the WebAssembly memory object. | |
const WASM_MEMORY = wasmMemory(); | |
const buf = readFileSync(filepath); | |
const parquetUint8Array = new Uint8Array(Buffer.from(buf)); | |
const wasmArrowTable = readParquet(parquetUint8Array).intoFFI(); | |
const recordBatches = []; | |
for (let i = 0; i < wasmArrowTable.numBatches(); i++) { | |
// Note: Unless you know what you're doing, setting `true` below is recommended to _copy_ | |
// table data from WebAssembly into JavaScript memory. This may become the default in the | |
// future. | |
const recordBatch = parseRecordBatch( | |
WASM_MEMORY.buffer, | |
wasmArrowTable.arrayAddr(i), | |
wasmArrowTable.schemaAddr(), | |
true | |
); | |
recordBatches.push(recordBatch); | |
} | |
const table = new arrow.Table(recordBatches); | |
// Skip this step converting it to bytes if you just want the table | |
const ipcStream = arrow.tableToIPC(table, 'stream'); | |
const bytes = Buffer.from(ipcStream, 'utf-8'); | |
// VERY IMPORTANT! You must call `drop` on the Wasm table object when you're done using it | |
// to release the Wasm memory. | |
// Note that any access to the pointers in this table is undefined behavior after this call. | |
// Calling any `wasmArrowTable` method will error. | |
wasmArrowTable.drop(); | |
return bytes; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment