Skip to content

Instantly share code, notes, and snippets.

@zacharysyoung
Last active February 24, 2023 01:37
Show Gist options
  • Select an option

  • Save zacharysyoung/597f46f9902149b9a3308197e23a6147 to your computer and use it in GitHub Desktop.

Select an option

Save zacharysyoung/597f46f9902149b9a3308197e23a6147 to your computer and use it in GitHub Desktop.
  1. Download all the .py scripts and run.sh
  2. pip install json-stream
  3. sh run.sh

run.sh calls the run_*.py scripts, which will run gen_json.py to generate three JSON test files of varying size.

The generated JSON looks like:

{
     "0": {"foo": "bar"},
     "1": {"foo": "bar"},
     "2": {"foo": "bar"},
     "3": {"foo": "bar"},
     ...
}

for 100_000, 1_000_000, and 10_000_000 {foo: bar} objects.

read_json.py and transform_json.py will then read and transform the generated JSON.

Those three .py runners will be calling /usr/bin/time and capturing the output to get a rough metric for run time and memory usage of the two different methods, standard and stream.

The stats show that json-stream has a flat memory curve for processing 100_000, 1_000_000, and 10_000_000 objects. It does take more time to read and transform, though:

Generate

Method Items Real (s) User (s) Sys (s) Mem (MB)
standard 1e+05 0.19 0.17 0.01 45.84
standard 1e+06 2.00 1.93 0.06 372.97
standard 1e+07 21.67 20.46 1.03 3480.29
stream 1e+05 0.18 0.15 0.00 7.28
stream 1e+06 1.43 1.41 0.02 7.69
stream 1e+07 14.41 14.07 0.20 7.58

Read

Method Items Real (s) User (s) Sys (s) Mem (MB)
standard 1e+05 0.05 0.04 0.01 48.28
standard 1e+06 0.58 0.50 0.05 390.17
standard 1e+07 7.69 6.73 0.80 3875.81
stream 1e+05 0.32 0.31 0.01 7.70
stream 1e+06 2.96 2.94 0.02 7.69
stream 1e+07 29.88 29.65 0.17 7.77

Transform

Method Items Real (s) User (s) Sys (s) Mem (MB)
standard 1e+05 0.19 0.17 0.01 48.05
standard 1e+06 1.83 1.75 0.07 388.83
standard 1e+07 20.16 19.15 0.91 3875.49
stream 1e+05 0.63 0.61 0.01 7.61
stream 1e+06 6.06 6.02 0.03 7.92
stream 1e+07 61.44 60.89 0.35 8.44
#!/usr/bin/env python3
import json
import sys
from json_stream import streamable_dict
@streamable_dict
def yield_obj(n: int):
    """Lazily yield n ("<index>", {"foo": "bar"}) pairs as a streamable dict.

    The `streamable_dict` wrapper lets `json.dump` serialize the pairs
    without ever materializing the whole mapping in memory.
    """
    for i in range(n):
        yield str(i), {"foo": "bar"}
def gen_standard(n: int):
    """Write gen/{n}.json by building the full n-item dict in memory first.

    Peak memory is O(n) — this is the baseline the streaming variant is
    compared against.
    """
    with open(f"gen/{n}.json", "w") as f:
        # Materialize every (key, value) pair before dumping.
        json.dump(dict(list(yield_obj(n))), f, indent=1)
def gen_stream(n: int):
    """Write gen/{n}.json by streaming pairs straight into the encoder.

    Nothing is materialized: `json.dump` pulls items from the streamable
    dict one at a time, keeping memory flat.
    """
    out_path = f"gen/{n}.json"
    with open(out_path, "w") as f:
        json.dump(yield_obj(n), f, indent=1)
# CLI: gen_json.py {standard|stream} <n_items>
# An unrecognized method is silently a no-op (matches the runners' usage).
method, n = sys.argv[1], int(sys.argv[2])
if method == "standard":
    gen_standard(n)
elif method == "stream":
    gen_stream(n)
#!/usr/bin/env python3
import json
import sys
import json_stream
def read_standard(fname: str):
    """Parse the whole JSON file into memory, then walk its top-level keys.

    The iteration does no work — it only forces a full traversal so the
    benchmark measures parse cost, not processing cost.
    """
    with open(fname) as f:
        data = json.load(f)
    for _key in data:
        pass
def read_stream(fname: str):
    """Walk the file's top-level keys lazily with json-stream.

    Memory stays roughly flat regardless of file size, since the document
    is never fully loaded.
    """
    with open(fname) as f:
        stream = json_stream.load(f)
        for _key in stream:
            pass
# CLI: read_json.py {standard|stream} <path-to-json>
# An unrecognized method is silently a no-op (matches the runners' usage).
method, fname = sys.argv[1], sys.argv[2]
if method == "standard":
    read_standard(fname)
elif method == "stream":
    read_stream(fname)
#!/bin/sh
# Benchmark driver: generate the JSON test files, then time the read and
# transform stages. Any failing stage aborts the whole run.
mkdir -p gen

for runner in run_gen run_read run_transform; do
    "./${runner}.py" || exit 1
done

# Optional: use a utility to transform the CSV stat files to Markdown tables
# and build the stats section of the README.
#
# STATS_MD='stats.md'
# echo '# Runnings' > $STATS_MD
#
# { echo ; echo '## Generate'; echo ; } >> $STATS_MD
# gocsv viewmd stats_gen.csv >> $STATS_MD
#
# { echo ; echo '## Read'; echo ; } >> $STATS_MD
# gocsv viewmd stats_read.csv >> $STATS_MD
#
# { echo ; echo '## Transform'; echo ; } >> $STATS_MD
# gocsv viewmd stats_transform.csv >> $STATS_MD
#
# # Right-align the header separator for the numerical columns
# old='|----------|-------|----------|----------|---------|----------|'
# new='|----------|------:|---------:|---------:|--------:|---------:|'
# sed "s/$old/$new/g" $STATS_MD > tmp && mv tmp $STATS_MD
#!/usr/bin/env python3
import csv
import re
import subprocess
from typing import TypedDict
import glob, os, sys
# The two JSON-handling methods under test, passed to gen_json.py.
METHODS = ["standard", "stream"]
# Object counts for the three generated test files.
GEN_ITEMS = [100_000, 1_000_000, 10_000_000]
# Path template for generated JSON files; "{}" is the item count (or a glob).
GEN_TMPL = "gen/{}.json"
# Output CSV that collects the parsed /usr/bin/time stats.
STATS_CSV = "stats_gen.csv"
def clean():
    """Remove previously generated JSON files and the stats CSV.

    Missing files are ignored, so clean-up is idempotent.
    """
    for path in glob.glob(GEN_TMPL.format("*")) + [STATS_CSV]:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
class TimeResults(TypedDict):
    """Parsed stats from one `/usr/bin/time -l` run."""

    # timings (seconds)
    real: float
    user: float
    sys: float
    # peak memory footprint (bytes)
    mem: int


def parse_time_results(stat_str: str) -> TimeResults:
    """Extract stats from multiline output of `/usr/bin/time -l`.

    Parameters:
        stat_str: full text produced by `/usr/bin/time -l` (stderr folded
            into stdout by the caller).

    Returns:
        TimeResults with real/user/sys seconds and peak memory in bytes.

    Raises:
        ValueError: if the output does not match the expected format.
            (Explicit raises replace `assert`, which is stripped under
            `python -O`.)
    """
    lines = stat_str.splitlines()
    # First line looks like: "        0.00 real         0.00 user         0.00 sys"
    m_timings = re.match(
        r"\s+(\d+\.\d+) real\s+(\d+\.\d+) user\s+(\d+\.\d+) sys", lines[0]
    )
    if m_timings is None:
        raise ValueError(f"could not parse timings from: {lines[0]!r}")
    treal = float(m_timings.group(1))
    tuser = float(m_timings.group(2))
    tsys = float(m_timings.group(3))
    # Last line looks like: "          951040  peak memory footprint"
    m_memory = re.match(r"\s+(\d+)\s+peak memory footprint", lines[-1])
    if m_memory is None:
        raise ValueError(f"could not parse memory from: {lines[-1]!r}")
    mem = int(m_memory.group(1))
    return TimeResults(real=treal, user=tuser, sys=tsys, mem=mem)
def header() -> list[str]:
    """Column names for the stats CSV."""
    return [
        "Method",
        "Items",
        "Real (s)",
        "User (s)",
        "Sys (s)",
        "Mem (MB)",
    ]
def to_row(method: str, n_items: int, stats: TimeResults) -> list[str]:
def _to_MB(x: str | int | float) -> float:
return float(x) / (1024 * 1024)
return [
method,
f"{n_items:.0g}",
f"{stats['real']:.2f}",
f"{stats['user']:.2f}",
f"{stats['sys']:.2f}",
f"{_to_MB(stats['mem']):.2f}",
]
def main():
    """Time gen_json.py for each method/size and write the stats CSV.

    With a single `clean` argument, only removes previous outputs.
    """
    # Handle cmd-line args, if any
    if len(sys.argv) > 1:
        if sys.argv[1] == "clean":
            clean()
            sys.exit(0)
        # Was a placeholder-free f-string naming the wrong script.
        print(f"usage: {sys.argv[0]} [clean]", file=sys.stderr)
        sys.exit(1)

    # Clean up previous outputs
    clean()

    # `with` guarantees the stats file is closed on any exit path.
    with open(STATS_CSV, "w", newline="") as f_stats:
        stats_writer = csv.writer(f_stats)
        stats_writer.writerow(header())
        for method in METHODS:
            for n_items in GEN_ITEMS:
                # `2>&1` folds /usr/bin/time's stderr into the captured stdout.
                cmd = f"/usr/bin/time -l ./gen_json.py {method} {n_items} 2>&1"
                print(f"Running `{cmd}`")
                # subprocess.run waits for completion; check=True surfaces a
                # failing benchmark instead of a confusing parse error later.
                # (The old `p.stderr` check was dead code: stderr was never piped.)
                proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, check=True)
                stats = parse_time_results(proc.stdout.decode("utf-8"))
                stats_writer.writerow(to_row(method, n_items, stats))
                # Flush so partial results survive an interrupted run.
                f_stats.flush()


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import csv
import re
import subprocess
from typing import TypedDict
import glob, os, sys
# The two JSON-handling methods under test, passed to read_json.py.
METHODS = ["standard", "stream"]
# Object counts of the pre-generated test files (made by run_gen.py).
GEN_ITEMS = [100_000, 1_000_000, 10_000_000]
# NOTE(review): unused in this script — kept for symmetry with run_gen.py.
GEN_TMPL = "gen/{}.json"
# Output CSV that collects the parsed /usr/bin/time stats.
STATS_CSV = "stats_read.csv"
def clean():
    """Remove a previous stats CSV, if present (idempotent)."""
    try:
        os.remove(STATS_CSV)
    except FileNotFoundError:
        pass
class TimeResults(TypedDict):
    """Parsed stats from one `/usr/bin/time -l` run."""

    # timings (seconds)
    real: float
    user: float
    sys: float
    # peak memory footprint (bytes)
    mem: int


def parse_time_results(stat_str: str) -> TimeResults:
    """Extract stats from multiline output of `/usr/bin/time -l`.

    Parameters:
        stat_str: full text produced by `/usr/bin/time -l` (stderr folded
            into stdout by the caller).

    Returns:
        TimeResults with real/user/sys seconds and peak memory in bytes.

    Raises:
        ValueError: if the output does not match the expected format.
            (Explicit raises replace `assert`, which is stripped under
            `python -O`.)
    """
    lines = stat_str.splitlines()
    # First line looks like: "        0.00 real         0.00 user         0.00 sys"
    m_timings = re.match(
        r"\s+(\d+\.\d+) real\s+(\d+\.\d+) user\s+(\d+\.\d+) sys", lines[0]
    )
    if m_timings is None:
        raise ValueError(f"could not parse timings from: {lines[0]!r}")
    treal = float(m_timings.group(1))
    tuser = float(m_timings.group(2))
    tsys = float(m_timings.group(3))
    # Last line looks like: "          951040  peak memory footprint"
    m_memory = re.match(r"\s+(\d+)\s+peak memory footprint", lines[-1])
    if m_memory is None:
        raise ValueError(f"could not parse memory from: {lines[-1]!r}")
    mem = int(m_memory.group(1))
    return TimeResults(real=treal, user=tuser, sys=tsys, mem=mem)
def header() -> list[str]:
    """Column names for the stats CSV."""
    return [
        "Method",
        "Items",
        "Real (s)",
        "User (s)",
        "Sys (s)",
        "Mem (MB)",
    ]
def to_row(method: str, n_items: int, stats: TimeResults) -> list[str]:
def _to_MB(x: str | int | float) -> float:
return float(x) / (1024 * 1024)
return [
method,
f"{n_items:.0g}",
f"{stats['real']:.2f}",
f"{stats['user']:.2f}",
f"{stats['sys']:.2f}",
f"{_to_MB(stats['mem']):.2f}",
]
def main():
    """Time read_json.py for each method/size and write the stats CSV.

    With a single `clean` argument, only removes previous outputs.
    """
    # Handle cmd-line args, if any
    if len(sys.argv) > 1:
        if sys.argv[1] == "clean":
            clean()
            sys.exit(0)
        # Was a placeholder-free f-string naming the wrong script.
        print(f"usage: {sys.argv[0]} [clean]", file=sys.stderr)
        sys.exit(1)

    # Clean up previous outputs
    clean()

    # `with` guarantees the stats file is closed on any exit path.
    with open(STATS_CSV, "w", newline="") as f_stats:
        stats_writer = csv.writer(f_stats)
        stats_writer.writerow(header())
        for method in METHODS:
            for n_items in GEN_ITEMS:
                # `2>&1` folds /usr/bin/time's stderr into the captured stdout.
                cmd = f"/usr/bin/time -l ./read_json.py {method} gen/{n_items}.json 2>&1"
                print(f"Running `{cmd}`")
                # subprocess.run waits for completion; check=True surfaces a
                # failing benchmark instead of a confusing parse error later.
                # (The old `p.stderr` check was dead code: stderr was never piped.)
                proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, check=True)
                stats = parse_time_results(proc.stdout.decode("utf-8"))
                stats_writer.writerow(to_row(method, n_items, stats))
                # Flush so partial results survive an interrupted run.
                f_stats.flush()


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import csv
import re
import subprocess
from typing import TypedDict
import glob, os, sys
# The two JSON-handling methods under test, passed to transform_json.py.
METHODS = ["standard", "stream"]
# Object counts of the pre-generated input files (made by run_gen.py).
GEN_ITEMS = [100_000, 1_000_000, 10_000_000]
# Path template for transformed output files; "{}" is the item count (or a glob).
GEN_TMPL = "gen/{}_out.json"
# Output CSV that collects the parsed /usr/bin/time stats.
STATS_CSV = "stats_transform.csv"
def clean():
    """Remove previously transformed output files and the stats CSV.

    Missing files are ignored, so clean-up is idempotent.
    """
    for path in glob.glob(GEN_TMPL.format("*")) + [STATS_CSV]:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
class TimeResults(TypedDict):
    """Parsed stats from one `/usr/bin/time -l` run."""

    # timings (seconds)
    real: float
    user: float
    sys: float
    # peak memory footprint (bytes)
    mem: int


def parse_time_results(stat_str: str) -> TimeResults:
    """Extract stats from multiline output of `/usr/bin/time -l`.

    Parameters:
        stat_str: full text produced by `/usr/bin/time -l` (stderr folded
            into stdout by the caller).

    Returns:
        TimeResults with real/user/sys seconds and peak memory in bytes.

    Raises:
        ValueError: if the output does not match the expected format.
            (Explicit raises replace `assert`, which is stripped under
            `python -O`.)
    """
    lines = stat_str.splitlines()
    # First line looks like: "        0.00 real         0.00 user         0.00 sys"
    m_timings = re.match(
        r"\s+(\d+\.\d+) real\s+(\d+\.\d+) user\s+(\d+\.\d+) sys", lines[0]
    )
    if m_timings is None:
        raise ValueError(f"could not parse timings from: {lines[0]!r}")
    treal = float(m_timings.group(1))
    tuser = float(m_timings.group(2))
    tsys = float(m_timings.group(3))
    # Last line looks like: "          951040  peak memory footprint"
    m_memory = re.match(r"\s+(\d+)\s+peak memory footprint", lines[-1])
    if m_memory is None:
        raise ValueError(f"could not parse memory from: {lines[-1]!r}")
    mem = int(m_memory.group(1))
    return TimeResults(real=treal, user=tuser, sys=tsys, mem=mem)
def header() -> list[str]:
    """Column names for the stats CSV."""
    return [
        "Method",
        "Items",
        "Real (s)",
        "User (s)",
        "Sys (s)",
        "Mem (MB)",
    ]
def to_row(method: str, n_items: int, stats: TimeResults) -> list[str]:
def _to_MB(x: str | int | float) -> float:
return float(x) / (1024 * 1024)
return [
method,
f"{n_items:.0g}",
f"{stats['real']:.2f}",
f"{stats['user']:.2f}",
f"{stats['sys']:.2f}",
f"{_to_MB(stats['mem']):.2f}",
]
def main():
    """Time transform_json.py for each method/size and write the stats CSV.

    With a single `clean` argument, only removes previous outputs.
    """
    # Handle cmd-line args, if any
    if len(sys.argv) > 1:
        if sys.argv[1] == "clean":
            clean()
            sys.exit(0)
        # Was a placeholder-free f-string naming the wrong script.
        print(f"usage: {sys.argv[0]} [clean]", file=sys.stderr)
        sys.exit(1)

    # Clean up previous outputs
    clean()

    # `with` guarantees the stats file is closed on any exit path.
    with open(STATS_CSV, "w", newline="") as f_stats:
        stats_writer = csv.writer(f_stats)
        stats_writer.writerow(header())
        for method in METHODS:
            for n_items in GEN_ITEMS:
                # `2>&1` folds /usr/bin/time's stderr into the captured stdout.
                cmd = f"/usr/bin/time -l ./transform_json.py {method} gen/{n_items}.json 2>&1"
                print(f"Running `{cmd}`")
                # subprocess.run waits for completion; check=True surfaces a
                # failing benchmark instead of a confusing parse error later.
                # (The old `p.stderr` check was dead code: stderr was never piped.)
                proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, check=True)
                stats = parse_time_results(proc.stdout.decode("utf-8"))
                stats_writer.writerow(to_row(method, n_items, stats))
                # Flush so partial results survive an interrupted run.
                f_stats.flush()


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import json
import sys
import json_stream
from json_stream.writer import streamable_dict
FOOBARBAZ = "FooBarBaz"
def out_name(in_name: str) -> str:
    """Return the output path for *in_name*: "gen/N.json" -> "gen/N_out.json".

    Anchored to the ".json" suffix: the original used
    `in_name.replace(".json", "_out.json")`, which substitutes EVERY
    occurrence and would mangle a path with ".json" elsewhere in it
    (e.g. a "data.json/" directory component). Names without the suffix
    are returned unchanged.
    """
    if in_name.endswith(".json"):
        return in_name.removesuffix(".json") + "_out.json"
    return in_name
def transform_standard(fname: str):
    """Load the whole file, set "foo" -> "BAR" for odd keys, dump to the _out file.

    Peak memory is O(file size): the entire document lives in memory
    during the transform.
    """
    with open(fname) as f_in:
        data = json.load(f_in)
    # Mutate in place: objects under odd-numbered keys get {"foo": "BAR"}.
    for key in data:
        if int(key) % 2:
            data[key]["foo"] = "BAR"
    with open(out_name(fname), "w") as f_out:
        json.dump(data, f_out, indent=1)
def transform_stream(fname: str):
    """Stream-transform: set "foo" -> "BAR" for odd keys, without full loading.

    Items flow from json_stream's lazy reader through a streamable-dict
    generator straight into json.dump, so memory stays roughly flat.
    """

    @streamable_dict
    def _updated(items):
        for key, value in items.items():
            # Convert the transient stream node into a plain dict we can edit.
            obj = json_stream.to_standard_types(value)
            if int(key) % 2:
                obj["foo"] = "BAR"
            yield key, obj

    with open(fname) as f_in:
        stream = json_stream.load(f_in)
        with open(out_name(fname), "w") as f_out:
            json.dump(_updated(stream), f_out, indent=1)
# CLI: transform_json.py {standard|stream} <path-to-json>
# An unrecognized method is silently a no-op (matches the runners' usage).
method, fname = sys.argv[1], sys.argv[2]
if method == "standard":
    transform_standard(fname)
elif method == "stream":
    transform_stream(fname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment