Skip to content

Instantly share code, notes, and snippets.

@simicd
Last active June 24, 2020 18:58
Show Gist options
  • Save simicd/d700b557be69127fb57d7d81dd405c75 to your computer and use it in GitHub Desktop.
Save simicd/d700b557be69127fb57d7d81dd405c75 to your computer and use it in GitHub Desktop.
# Compare how much resident memory (RSS) the process gains when the same
# column is loaded from CSV, Parquet and Arrow IPC files.
#
# Each figure is the RSS difference between two consecutive checkpoints, so
# every object loaded earlier is intentionally kept alive until the end.
#
# NOTE(review): relies on `os`, `psutil`, `pandas as pd` and `pyarrow as pa`
# being imported earlier in the file -- the imports are not visible here.

# The single column every reader extracts.
_COLUMN = "Flipper Length (mm)"


def _rss_mib():
    """Return the current process's resident set size in whole MiB."""
    # `>> 20` divides the byte count by 2**20 (bytes -> MiB), truncating.
    return psutil.Process(os.getpid()).memory_info().rss >> 20


# Measure initial memory consumption
memory_init = _rss_mib()

# Read csv: the whole file is parsed, then one column is selected
col_csv = pd.read_csv("penguin-dataset.csv")[_COLUMN]
memory_post_csv = _rss_mib()

# Read parquet: only the requested column is loaded
col_parquet = pd.read_parquet("penguin-dataset.parquet", columns=[_COLUMN])
memory_post_parquet = _rss_mib()

# Read Arrow using file API
with pa.OSFile('penguin-dataset.arrow', 'rb') as source:
    # Must run inside the `with` body: the reader needs the file still open.
    col_arrow_file = pa.ipc.open_file(source).read_all().column(_COLUMN).to_pandas()
memory_post_arrowos = _rss_mib()

# Read Arrow with memory-mapped API with missing values
# The map is deliberately left open: the column read from it may still
# reference the mapped memory, and the measurement should not be disturbed.
source = pa.memory_map('penguin-dataset.arrow', 'r')
table_mmap = pa.ipc.RecordBatchFileReader(source).read_all().column(_COLUMN)
col_arrow_mapped = table_mmap.to_pandas()
memory_post_arrowmmap = _rss_mib()

# Read Arrow with memory-mapped API without missing values (zero-copy)
# NOTE(review): zero-copy presumably holds because this file has no NaNs, so
# to_pandas() need not materialise a new array -- confirm against pyarrow docs.
source = pa.memory_map('penguin-dataset-nonan.arrow', 'r')
table_mmap_zc = pa.ipc.RecordBatchFileReader(source).read_all().column(_COLUMN)
col_arrow_mapped_zc = table_mmap_zc.to_pandas()
memory_post_arrowmmap_zc = _rss_mib()

# Display memory consumption (values are MiB deltas between checkpoints)
print(
    f"csv memory consumption: {memory_post_csv - memory_init}\n"
    f"Parquet memory consumption: {memory_post_parquet - memory_post_csv}\n"
    f"Arrow file memory consumption: {memory_post_arrowos - memory_post_parquet}\n"
    f"Arrow mapped (no zero-copy) memory consumption: {memory_post_arrowmmap - memory_post_arrowos}\n"
    f"Arrow mapped (zero-copy) memory consumption: {memory_post_arrowmmap_zc - memory_post_arrowmmap}\n"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment