@dennisseah
Created November 22, 2022 20:41
CSV vs Parquet file format
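
The script below benchmarks pandas CSV output against Parquet in three flavors (uncompressed, gzip and snappy) on synthetic profile data generated with Faker. For 10,000, 50,000 and 100,000 rows it records file size on disk, write time and read time, then plots each metric with matplotlib.
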
# pinned versions used for this benchmark
# pandas = "1.5.1"
# faker = "15.3.2"
# matplotlib = "3.6.2"
# pyarrow = "10.0.0"
from faker import Faker
from timeit import default_timer as timer
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys

# One series per format for file size (fs, MB), read time (tr, ms) and write
# time (tw, ms). Each series starts with a 0 so the curves begin at the origin;
# a matching 0 is prepended to x_axis before plotting.
x_axis = [10000, 50000, 100000]
series_fs_csv, series_fs_par, series_fs_par_gz, series_fs_par_snappy = [0], [0], [0], [0]
series_tr_csv, series_tr_par, series_tr_par_gz, series_tr_par_snappy = [0], [0], [0], [0]
series_tw_csv, series_tw_par, series_tw_par_gz, series_tw_par_snappy = [0], [0], [0], [0]


def generate(count: int):
    """Build a DataFrame of `count` fake profiles and benchmark each file format."""
    faker = Faker()
    profiles = [faker.simple_profile() for _ in range(count)]
    data = {
        "username": [p["username"] for p in profiles],
        "name": [p["name"] for p in profiles],
        "sex": [p["sex"] for p in profiles],
        "address": [p["address"].replace("\n", " ") for p in profiles],
        "mail": [p["mail"] for p in profiles],
        "birthdate": [pd.to_datetime(p["birthdate"]) for p in profiles],
    }
    df = pd.DataFrame(data)

    # CSV
    start = timer()
    df.to_csv("test.csv", index=False)
    series_tw_csv.append(1000 * (timer() - start))
    series_fs_csv.append(os.path.getsize("test.csv") / (1024 * 1024))
    start = timer()
    df_csv = pd.read_csv("test.csv")  # read back only to time it
    series_tr_csv.append(1000 * (timer() - start))
    os.remove("test.csv")

    # Parquet, uncompressed
    start = timer()
    df.to_parquet("test.parquet", index=False, compression=None)
    series_tw_par.append(1000 * (timer() - start))
    series_fs_par.append(os.path.getsize("test.parquet") / (1024 * 1024))
    start = timer()
    df_par = pd.read_parquet("test.parquet")
    series_tr_par.append(1000 * (timer() - start))
    os.remove("test.parquet")

    # Parquet, gzip-compressed
    start = timer()
    df.to_parquet("test.parquet.gz", index=False, compression="gzip")
    series_tw_par_gz.append(1000 * (timer() - start))
    series_fs_par_gz.append(os.path.getsize("test.parquet.gz") / (1024 * 1024))
    start = timer()
    df_par_gz = pd.read_parquet("test.parquet.gz")
    series_tr_par_gz.append(1000 * (timer() - start))
    os.remove("test.parquet.gz")

    # Parquet, snappy-compressed
    start = timer()
    df.to_parquet("test.parquet.snappy", index=False, compression="snappy")
    series_tw_par_snappy.append(1000 * (timer() - start))
    series_fs_par_snappy.append(os.path.getsize("test.parquet.snappy") / (1024 * 1024))
    start = timer()
    df_par_snappy = pd.read_parquet("test.parquet.snappy")
    series_tr_par_snappy.append(1000 * (timer() - start))
    os.remove("test.parquet.snappy")


def plot(x_series, title, y_label, series_csv, series_par, series_par_gz,
         series_par_snappy, min_val, max_val):
    """Plot one metric for all four formats against the number of rows."""
    plt.plot(x_series, series_csv, label="csv", linestyle="-")
    plt.plot(x_series, series_par, label="parquet", linestyle="-.")
    plt.plot(x_series, series_par_gz, label="parquet gz", linestyle=":")
    plt.plot(x_series, series_par_snappy, label="parquet snappy", linestyle="--")
    plt.ylim([min_val - 1, max_val])
    plt.xlim([0, 100000])
    plt.xlabel("# of rows")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()


fs_max_val = tr_max_val = tw_max_val = 0
fs_min_val = sys.maxsize

for i in x_axis:
    generate(i)

x_axis.insert(0, 0)

for x in [series_tr_csv, series_tr_par, series_tr_par_gz, series_tr_par_snappy]:
    tr_max_val = max(tr_max_val, max(x))
    print(x)

for x in [series_tw_csv, series_tw_par, series_tw_par_gz, series_tw_par_snappy]:
    tw_max_val = max(tw_max_val, max(x))
    print(x)

for x in [series_fs_csv, series_fs_par, series_fs_par_gz, series_fs_par_snappy]:
    fs_max_val = max(fs_max_val, max(x))
    fs_min_val = min(fs_min_val, min(x))
    print(x)

plot(x_axis, "file size", "file size (in MBytes)", series_fs_csv, series_fs_par,
     series_fs_par_gz, series_fs_par_snappy, fs_min_val, fs_max_val)
plot(x_axis, "time taken to write", "time (in msec)", series_tw_csv, series_tw_par,
     series_tw_par_gz, series_tw_par_snappy, 0, tw_max_val)
plot(x_axis, "time taken to read", "time (in msec)", series_tr_csv, series_tr_par,
     series_tr_par_gz, series_tr_par_snappy, 0, tr_max_val)
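
The frames read back from disk (df_csv, df_par, df_par_gz, df_par_snappy) are only used to time the reads. A quick sanity check could confirm that each format round-trips the data before the timings are compared. The helper below is a minimal sketch and a hypothetical addition, not part of the original gist; it assumes it is called inside generate() while those frames are still in scope, and it re-parses birthdate for the CSV case because CSV does not preserve dtypes.

def assert_round_trip(df, df_csv, df_par, df_par_gz, df_par_snappy):
    # Hypothetical helper: verify each format round-trips the generated data.
    # CSV stores birthdate as text, so re-parse it before comparing.
    df_csv = df_csv.assign(birthdate=pd.to_datetime(df_csv["birthdate"]))
    pd.testing.assert_frame_equal(df, df_csv)
    # Parquet preserves dtypes, so the frames should compare directly.
    pd.testing.assert_frame_equal(df, df_par)
    pd.testing.assert_frame_equal(df, df_par_gz)
    pd.testing.assert_frame_equal(df, df_par_snappy)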