Skip to content

Instantly share code, notes, and snippets.

@yenson-lau
Last active January 10, 2022 06:37
Show Gist options
  • Save yenson-lau/3daa461a579ec2a4dcae3aeffc41a7c1 to your computer and use it in GitHub Desktop.
Save yenson-lau/3daa461a579ec2a4dcae3aeffc41a7c1 to your computer and use it in GitHub Desktop.
Convert a pandas DataFrame into an Arrow bytestream and process it with Julia
from julia.api import Julia
import pyarrow as pa
def process_dataframe_jl(df):
    """Process a DataFrame in Julia and return the resulting DataFrame.

    The frame is serialized to an Arrow bytestream, handed to the Julia
    function ``process_arrowbytes`` (defined by ``process_dataframe.jl``),
    and the bytestream that function returns is converted back into a
    DataFrame.

    Args:
        df: The DataFrame to send to Julia.

    Returns:
        The DataFrame decoded from the bytestream returned by Julia.
    """
    # Convert df into an Arrow bytestream.
    arrow_bytes = df_to_arrowbytes(df)

    # Create / get the Julia instance.
    jl = Julia(compiled_modules=False)

    # Load the script that defines the Julia function. The Julia expression
    # must be complete: the closing parenthesis of include(...) is required,
    # otherwise Julia rejects the string as an incomplete expression.
    # NOTE(review): the script path is relative -- presumably resolved
    # against the current working directory; confirm for your deployment.
    jl.eval('include("process_dataframe.jl")')

    # The import has to happen after the include above, because only then
    # does the function exist in julia.Main.
    from julia.Main import process_arrowbytes

    # Process the bytestream in Julia; the result is an Arrow bytestream
    # containing the resulting DataFrame.
    result_bytes = process_arrowbytes(arrow_bytes)

    # Convert the bytes back into a DataFrame.
    return arrowbytes_to_df(result_bytes)
def df_to_arrowbytes(df):
    """Serialize a DataFrame into an Arrow IPC stream held in a bytearray.

    Args:
        df: The DataFrame to serialize.

    Returns:
        A ``bytearray`` containing the Arrow stream for ``df``.
    """
    record_batch = pa.record_batch(df)
    buffer_stream = pa.BufferOutputStream()

    # Leaving the `with` block closes the writer before the buffer is read.
    with pa.ipc.new_stream(buffer_stream, record_batch.schema) as stream_writer:
        stream_writer.write_batch(record_batch)

    arrow_buffer = buffer_stream.getvalue()
    return bytearray(arrow_buffer.to_pybytes())
def arrowbytes_to_df(arrowbytes):
    """Decode an Arrow IPC bytestream back into a DataFrame.

    Args:
        arrowbytes: The Arrow stream to read.

    Returns:
        The DataFrame produced by reading the whole stream.
    """
    stream_reader = pa.ipc.open_stream(arrowbytes)
    return stream_reader.read_pandas()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment