Skip to content

Instantly share code, notes, and snippets.

@tobilg
Last active August 21, 2024 10:41
Show Gist options
  • Save tobilg/4d1a49a37ee2da795ea71c9e4dd81d9b to your computer and use it in GitHub Desktop.
Save tobilg/4d1a49a37ee2da795ea71c9e4dd81d9b to your computer and use it in GitHub Desktop.
import { Table, Vector, Field, Utf8, Type, Schema } from 'apache-arrow';
/**
 * Cast all columns with complex data types in an Apache Arrow Table to strings.
 *
 * A column counts as "complex" when its field type id is Struct, List, Map,
 * Dictionary, FixedSizeList or FixedSizeBinary. Each value of such a column is
 * replaced by its JSON encoding and the field is re-declared as Utf8; every
 * other column and field is passed through untouched.
 *
 * - Null/undefined cells stay null instead of becoming the text "null".
 * - bigint values (at any nesting depth) are written as decimal strings,
 *   because JSON.stringify throws a TypeError on bigint.
 * - Field metadata and schema metadata are carried over to the new schema.
 *
 * NOTE(review): `table.columns`, `Vector.from(...)` and the
 * `new Table(columns, schema)` argument order are kept exactly as written.
 * They do not appear to exist in apache-arrow >= 9 (`table.getChildAt(i)` and
 * `vectorFromArray(values, new Utf8())` look like the closest equivalents) —
 * confirm against the installed library version.
 *
 * @param {Table} table - The Apache Arrow Table
 * @returns {Table} - A new Table with all complex data type columns cast to strings
 */
function castComplexColumnsToString(table: Table): Table {
  const schemaFields = table.schema.fields;

  // Type ids whose values get JSON-encoded.
  const complexTypeIds = new Set<number>([
    Type.Struct,
    Type.List,
    Type.Map,
    Type.Dictionary,
    Type.FixedSizeList,
    Type.FixedSizeBinary,
  ]);
  const isComplexType = (type: { typeId: number }): boolean =>
    complexTypeIds.has(type.typeId);

  // JSON.stringify invokes the replacer before it rejects bigint, so mapping
  // bigint to a decimal string here avoids the TypeError.
  const bigintAsString = (_key: string, value: unknown): unknown =>
    typeof value === 'bigint' ? value.toString() : value;

  // Null-preserving JSON encoding of a single cell.
  const toJsonString = (value: unknown): string | null => {
    if (value === null || value === undefined) {
      return null;
    }
    // JSON.stringify can still return undefined at runtime (e.g. for a
    // function value); treat that as a missing value as well.
    return JSON.stringify(value, bigintAsString) ?? null;
  };

  // New schema: complex fields become Utf8, keeping name, nullability and
  // metadata; all other fields are reused as-is.
  const newSchema = new Schema(
    schemaFields.map((field) =>
      isComplexType(field.type)
        ? new Field(field.name, new Utf8(), field.nullable, field.metadata)
        : field
    ),
    table.schema.metadata
  );

  // Transform each column if it is of a complex type.
  const newColumns = table.columns.map((column, index) => {
    const field = schemaFields[index];
    if (field === undefined || !isComplexType(field.type)) {
      return column;
    }
    const jsonValues: (string | null)[] = [];
    for (let row = 0; row < column.length; row++) {
      jsonValues.push(toJsonString(column.get(row)));
    }
    return Vector.from(jsonValues);
  });

  // Create and return the new table.
  return new Table(newColumns, newSchema);
}
// Example usage:
// Assume you have an Apache Arrow Table 'table'
// const newTable = castComplexColumnsToString(table);
@tobilg
Copy link
Author

tobilg commented Aug 21, 2024

Claude's suggestion (also wrong):

import { Table, DataType, Field, type FieldOptions, type TableOptions } from 'apache-arrow';

/**
 * Build a new Table whose schema declares every Struct, List or Union field
 * as a string field (keeping the field's name, metadata and nullability),
 * while all other fields are reused unchanged.
 *
 * Only the schema is rewritten: the original `table.chunks` are handed to the
 * new Table as-is, so no cell value is converted by this function.
 *
 * NOTE(review): the surrounding thread describes this suggestion as wrong;
 * whether `DataType.string()`, `Table.Schema`, `table.chunks` and the
 * `FieldOptions` / `TableOptions` types exist in apache-arrow should be
 * confirmed before use.
 */
function castComplexColumnsToString(table: Table): Table {
  const remappedFields = table.schema.fields.map((original): Field => {
    const isNested =
      original.type instanceof DataType.Struct ||
      original.type instanceof DataType.List ||
      original.type instanceof DataType.Union;

    // Non-nested fields keep their original Field object.
    if (!isNested) {
      return original;
    }

    const options: FieldOptions = {
      metadata: original.metadata,
      nullable: original.nullable,
    };
    return new Field(original.name, DataType.string(), options);
  });

  const tableOptions: TableOptions = {
    schema: new Table.Schema(remappedFields),
  };

  return new Table(table.chunks, tableOptions);
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment