Last active
December 5, 2023 06:11
-
-
Save deanmalmgren/fd1714799dc5b5643b87 to your computer and use it in GitHub Desktop.
Comparison of various ways to write lots of irrational floats to disk in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 function calls in 6.465 seconds | |
Ordered by: internal time | |
ncalls tottime percall cumtime percall filename:lineno(function) | |
1 6.461 6.461 6.465 6.465 similarity_profiling.py:20(print_approach) | |
1 0.004 0.004 0.004 0.004 {open} | |
1 0.000 0.000 0.000 0.000 {range} | |
1 0.000 0.000 6.465 6.465 <string>:1(<module>) | |
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} | |
1006 function calls in 4.621 seconds | |
Ordered by: internal time | |
ncalls tottime percall cumtime percall filename:lineno(function) | |
1000 4.606 0.005 4.606 0.005 {method 'writerow' of '_csv.writer' objects} | |
1 0.011 0.011 4.621 4.621 similarity_profiling.py:31(csv_approach) | |
1 0.004 0.004 0.004 0.004 {open} | |
1 0.000 0.000 0.000 0.000 {range} | |
1 0.000 0.000 4.621 4.621 <string>:1(<module>) | |
1 0.000 0.000 0.000 0.000 {_csv.writer} | |
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} | |
12040 function calls in 4.819 seconds | |
Ordered by: internal time | |
ncalls tottime percall cumtime percall filename:lineno(function) | |
1000 4.460 0.004 4.817 0.005 {method 'writerow' of '_csv.writer' objects} | |
1000 0.337 0.000 0.337 0.000 {built-in method compress} | |
1000 0.008 0.000 0.357 0.000 gzip.py:227(write) | |
1001 0.007 0.000 0.007 0.000 {zlib.crc32} | |
1010 0.002 0.000 0.002 0.000 {method 'write' of 'file' objects} | |
1000 0.001 0.000 0.002 0.000 gzip.py:150(_check_closed) | |
1001 0.001 0.000 0.001 0.000 {isinstance} | |
1 0.001 0.001 0.001 0.001 {method 'close' of 'file' objects} | |
1 0.001 0.001 4.819 4.819 similarity_profiling.py:44(csvgz_approach) | |
1002 0.001 0.000 0.001 0.000 gzip.py:367(closed) | |
4000 0.000 0.000 0.000 0.000 {len} | |
1 0.000 0.000 0.000 0.000 {built-in method flush} | |
1 0.000 0.000 0.000 0.000 {open} | |
1 0.000 0.000 0.000 0.000 {zlib.compressobj} | |
1 0.000 0.000 0.000 0.000 gzip.py:45(__init__) | |
1 0.000 0.000 0.000 0.000 gzip.py:164(_write_gzip_header) | |
1 0.000 0.000 4.819 4.819 <string>:1(<module>) | |
1 0.000 0.000 0.000 0.000 {range} | |
1 0.000 0.000 0.001 0.001 gzip.py:371(close) | |
1 0.000 0.000 0.000 0.000 gzip.py:157(_init_write) | |
3 0.000 0.000 0.000 0.000 gzip.py:19(write32u) | |
3 0.000 0.000 0.000 0.000 {_struct.pack} | |
1 0.000 0.000 0.000 0.000 gzip.py:27(open) | |
1 0.000 0.000 0.000 0.000 {_csv.writer} | |
1 0.000 0.000 0.000 0.000 posixpath.py:112(basename) | |
1 0.000 0.000 0.000 0.000 {time.time} | |
1 0.000 0.000 0.000 0.000 {method 'replace' of 'str' objects} | |
1 0.000 0.000 0.000 0.000 {method 'rfind' of 'str' objects} | |
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} | |
1 0.000 0.000 0.000 0.000 {chr} | |
1 0.000 0.000 0.000 0.000 {method 'endswith' of 'str' objects} | |
1005 function calls in 0.374 seconds | |
Ordered by: internal time | |
ncalls tottime percall cumtime percall filename:lineno(function) | |
1 0.307 0.307 0.374 0.374 similarity_profiling.py:56(array_approach) | |
1000 0.064 0.000 0.064 0.000 {method 'tofile' of 'array.array' objects} | |
1 0.003 0.003 0.003 0.003 {open} | |
1 0.000 0.000 0.000 0.000 {range} | |
1 0.000 0.000 0.374 0.374 <string>:1(<module>) | |
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} | |
2005 function calls in 0.238 seconds | |
Ordered by: internal time | |
ncalls tottime percall cumtime percall filename:lineno(function) | |
1000 0.172 0.000 0.172 0.000 {numpy.core.multiarray.array} | |
1000 0.053 0.000 0.053 0.000 {method 'tofile' of 'numpy.ndarray' objects} | |
1 0.010 0.010 0.238 0.238 similarity_profiling.py:69(numpy_approach) | |
1 0.003 0.003 0.003 0.003 {open} | |
1 0.000 0.000 0.000 0.000 {range} | |
1 0.000 0.000 0.238 0.238 <string>:1(<module>) | |
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} | |
3005 function calls in 0.178 seconds | |
Ordered by: internal time | |
ncalls tottime percall cumtime percall filename:lineno(function) | |
1000 0.069 0.000 0.069 0.000 {_struct.pack} | |
1000 0.058 0.000 0.058 0.000 {method 'write' of 'file' objects} | |
1 0.048 0.048 0.178 0.178 similarity_profiling.py:81(struct_approach) | |
1 0.003 0.003 0.003 0.003 {open} | |
1000 0.000 0.000 0.000 0.000 {len} | |
1 0.000 0.000 0.000 0.000 {range} | |
1 0.000 0.000 0.178 0.178 <string>:1(<module>) | |
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Simple test to see how effective the various methods mentioned | |
[here](http://stackoverflow.com/questions/807863/how-to-output-list-of-floats-to-a-binary-file-in-python) | |
are for writing a lot of floating point numbers to disk | |
""" | |
import cProfile | |
import csv | |
import array | |
import math | |
import struct | |
import numpy | |
import gzip | |
# Benchmark payload shared by every approach below: three non-trivial floating
# point values (often irrational) followed by three zeros, with that group of
# six repeated 1000 times to form one long row of numbers.
floats = ([math.pi, math.sqrt(23.0), 1.0 / 9.0] + [0.0] * 3) * 1000
def print_approach():
    """Write the rows to disk as plain text, one printed list per line.

    Opens ``print.dat`` for writing and, 1000 times, writes the string form
    of the module-level ``floats`` list followed by a newline. These are the
    same characters the Python 2 statement ``print >> stream, floats``
    emits, but going through ``stream.write`` keeps the function runnable on
    Python 3 as well, where ``print >> stream`` raises ``TypeError``.

    PROS: easy to read/write code and can also read file by eye
    CONS: the slowest of the approaches in this file
    """
    with open('print.dat', 'w') as stream:
        for _ in range(1000):
            # str() is deliberately re-evaluated on every pass: formatting
            # the floats as text is the cost this approach is measuring.
            stream.write(str(floats) + '\n')
def csv_approach():
    """Write the rows to disk as comma-separated text via the csv module.

    Opens ``csv.dat`` for writing and hands the module-level ``floats`` list
    to a ``csv.writer`` as a single row, 1000 times; the writer converts each
    floating point number to its string representation.

    PROS: easy to read/write using python. can also read file by eye
    CONS: very slow
    """
    with open('csv.dat', 'w') as out:
        # Bind the row-writing method once; the loop only has to call it.
        write_row = csv.writer(out).writerow
        for _ in range(1000):
            write_row(floats)
def csvgz_approach():
    """Write the rows as csv text through a gzip-compressed stream.

    Same as ``csv_approach`` except that the output file, ``csvgz.dat``, is
    opened with ``gzip.open`` to see whether compressing saves I/O time. The
    module-level ``floats`` list is written as one csv row, 1000 times.

    The stream mode depends on the interpreter: Python 3's ``csv.writer``
    produces ``str`` and therefore needs a text-mode gzip stream, while
    Python 2's produces byte strings and keeps the original binary mode.

    PROS: readable code and relatively readable output on disk
    CONS: slower than csv_approach
    """
    import sys  # local import: only needed to choose the stream mode

    if sys.version_info[0] >= 3:
        # Text mode so str rows are accepted; newline='' leaves the line
        # endings produced by csv.writer untouched.
        opened = gzip.open('csvgz.dat', 'wt', newline='')
    else:
        opened = gzip.open('csvgz.dat', 'wb')

    with opened as stream:
        writer = csv.writer(stream)
        for _ in range(1000):
            writer.writerow(floats)
def array_approach():
    """Write the rows to disk as raw bytes using the standard-library array
    module.

    Opens ``array.dat`` in binary mode and, 1000 times, builds an
    ``array.array`` with type code ``'d'`` from the module-level ``floats``
    list and dumps it to the file with ``tofile``.

    PROS: very fast, reasonably readable code
    CONS: can't read the resulting binary output file
    """
    with open('array.dat', 'wb') as out:
        for _ in range(1000):
            # The array is rebuilt on every pass so the list-to-array
            # conversion is included in the measured time.
            array.array('d', floats).tofile(out)
def numpy_approach():
    """Write the rows to disk as raw bytes using numpy.

    Opens ``numpy.dat`` in binary mode and, 1000 times, converts the
    module-level ``floats`` list to a numpy array and dumps it to the file
    with ``tofile``.

    PROS: very fast, and the code is relatively easy to read
    CONS: still 30% slower than struct_approach
    """
    with open('numpy.dat', 'wb') as out:
        for _ in range(1000):
            # The array is rebuilt on every pass so the list-to-array
            # conversion is included in the measured time.
            numpy.array(floats).tofile(out)
def struct_approach():
    """Write the rows to disk as raw bytes using the standard-library struct
    module.

    Opens ``struct.dat`` in binary mode and, 1000 times, packs every value of
    the module-level ``floats`` list with one ``'d'`` format character per
    value and writes the packed bytes to the file.

    PROS: the fastest of the approaches in this file
    CONS: kinda janky code and can't read the resulting binary output file
    """
    with open('struct.dat', 'wb') as out:
        for _ in range(1000):
            # One 'd' per value, e.g. 'ddd' for three floats.
            layout = 'd' * len(floats)
            packed = struct.pack(layout, *floats)
            out.write(packed)
# Profile each approach in turn; every report is sorted by internal time.
for _approach in ('print_approach', 'csv_approach', 'csvgz_approach',
                  'array_approach', 'numpy_approach', 'struct_approach'):
    cProfile.run(_approach + '()', sort='time')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment