Last active
June 24, 2020 18:56
-
-
Save simicd/72815e7f639502d35047541c80773d8a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read csv and calculate mean | |
%%timeit | |
pd.read_csv("penguin-dataset.csv")["Flipper Length (mm)"].mean() | |
# Read parquet and calculate mean | |
%%timeit | |
pd.read_parquet("penguin-dataset.parquet", columns=["Flipper Length (mm)"]).mean() | |
# Read Arrow using file API and calculate mean | |
%%timeit | |
with pa.OSFile('penguin-dataset.arrow', 'rb') as source: | |
table = pa.ipc.open_file(source).read_all().column("Flipper Length (mm)") | |
result = table.to_pandas().mean() | |
# Read Arrow with memory-mapped API with missing values | |
%%timeit | |
source = pa.memory_map('penguin-dataset.arrow', 'r') | |
table = pa.ipc.RecordBatchFileReader(source).read_all().column("Flipper Length (mm)") | |
result = table.to_pandas().mean() | |
# Read Arrow with memory-mapped API without missing values (zero-copy) | |
%%timeit | |
source = pa.memory_map('penguin-dataset-nonan.arrow', 'r') | |
table = pa.ipc.RecordBatchFileReader(source).read_all().column("Flipper Length (mm)") | |
result = table.to_pandas().mean() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment