Skip to content

Instantly share code, notes, and snippets.

@deanmalmgren
Last active December 5, 2023 06:11
Show Gist options
  • Save deanmalmgren/fd1714799dc5b5643b87 to your computer and use it in GitHub Desktop.
Save deanmalmgren/fd1714799dc5b5643b87 to your computer and use it in GitHub Desktop.
Comparison of various ways to write lots of irrational floats to disk in Python
5 function calls in 6.465 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
1 6.461 6.461 6.465 6.465 similarity_profiling.py:20(print_approach)
1 0.004 0.004 0.004 0.004 {open}
1 0.000 0.000 0.000 0.000 {range}
1 0.000 0.000 6.465 6.465 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1006 function calls in 4.621 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
1000 4.606 0.005 4.606 0.005 {method 'writerow' of '_csv.writer' objects}
1 0.011 0.011 4.621 4.621 similarity_profiling.py:31(csv_approach)
1 0.004 0.004 0.004 0.004 {open}
1 0.000 0.000 0.000 0.000 {range}
1 0.000 0.000 4.621 4.621 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 {_csv.writer}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
12040 function calls in 4.819 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
1000 4.460 0.004 4.817 0.005 {method 'writerow' of '_csv.writer' objects}
1000 0.337 0.000 0.337 0.000 {built-in method compress}
1000 0.008 0.000 0.357 0.000 gzip.py:227(write)
1001 0.007 0.000 0.007 0.000 {zlib.crc32}
1010 0.002 0.000 0.002 0.000 {method 'write' of 'file' objects}
1000 0.001 0.000 0.002 0.000 gzip.py:150(_check_closed)
1001 0.001 0.000 0.001 0.000 {isinstance}
1 0.001 0.001 0.001 0.001 {method 'close' of 'file' objects}
1 0.001 0.001 4.819 4.819 similarity_profiling.py:44(csvgz_approach)
1002 0.001 0.000 0.001 0.000 gzip.py:367(closed)
4000 0.000 0.000 0.000 0.000 {len}
1 0.000 0.000 0.000 0.000 {built-in method flush}
1 0.000 0.000 0.000 0.000 {open}
1 0.000 0.000 0.000 0.000 {zlib.compressobj}
1 0.000 0.000 0.000 0.000 gzip.py:45(__init__)
1 0.000 0.000 0.000 0.000 gzip.py:164(_write_gzip_header)
1 0.000 0.000 4.819 4.819 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 {range}
1 0.000 0.000 0.001 0.001 gzip.py:371(close)
1 0.000 0.000 0.000 0.000 gzip.py:157(_init_write)
3 0.000 0.000 0.000 0.000 gzip.py:19(write32u)
3 0.000 0.000 0.000 0.000 {_struct.pack}
1 0.000 0.000 0.000 0.000 gzip.py:27(open)
1 0.000 0.000 0.000 0.000 {_csv.writer}
1 0.000 0.000 0.000 0.000 posixpath.py:112(basename)
1 0.000 0.000 0.000 0.000 {time.time}
1 0.000 0.000 0.000 0.000 {method 'replace' of 'str' objects}
1 0.000 0.000 0.000 0.000 {method 'rfind' of 'str' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1 0.000 0.000 0.000 0.000 {chr}
1 0.000 0.000 0.000 0.000 {method 'endswith' of 'str' objects}
1005 function calls in 0.374 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.307 0.307 0.374 0.374 similarity_profiling.py:56(array_approach)
1000 0.064 0.000 0.064 0.000 {method 'tofile' of 'array.array' objects}
1 0.003 0.003 0.003 0.003 {open}
1 0.000 0.000 0.000 0.000 {range}
1 0.000 0.000 0.374 0.374 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
2005 function calls in 0.238 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
1000 0.172 0.000 0.172 0.000 {numpy.core.multiarray.array}
1000 0.053 0.000 0.053 0.000 {method 'tofile' of 'numpy.ndarray' objects}
1 0.010 0.010 0.238 0.238 similarity_profiling.py:69(numpy_approach)
1 0.003 0.003 0.003 0.003 {open}
1 0.000 0.000 0.000 0.000 {range}
1 0.000 0.000 0.238 0.238 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
3005 function calls in 0.178 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
1000 0.069 0.000 0.069 0.000 {_struct.pack}
1000 0.058 0.000 0.058 0.000 {method 'write' of 'file' objects}
1 0.048 0.048 0.178 0.178 similarity_profiling.py:81(struct_approach)
1 0.003 0.003 0.003 0.003 {open}
1000 0.000 0.000 0.000 0.000 {len}
1 0.000 0.000 0.000 0.000 {range}
1 0.000 0.000 0.178 0.178 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
"""Simple test to see how effective the various methods mentioned
[here](http://stackoverflow.com/questions/807863/how-to-output-list-of-floats-to-a-binary-file-in-python)
are for writing a lot of floating point numbers to disk
"""
import cProfile
import csv
import array
import math
import struct
import numpy
import gzip
# Benchmark payload shared by every approach below: a six-value pattern
# (three values with long decimal expansions followed by three zeros)
# repeated 1000 times, giving a flat list of 6000 floats.
floats = 1000 * [
    math.pi,
    math.sqrt(23.0),
    1.0 / 9.0,
    0.0,
    0.0,
    0.0,
]
def print_approach():
    """Write the floats to ``print.dat`` as plain text, one list per line.

    Each of the 1000 rows is the ``str()`` form of the module-level
    ``floats`` list followed by a newline, which is the same text the
    Python 2 statement ``print >> stream, floats`` produces.

    PROS: easy to read/write code and can also read file by eye
    CONS: very slow
    """
    with open('print.dat', 'w') as stream:
        for _ in range(1000):
            # ``print >> stream, floats`` is Python 2-only syntax; on
            # Python 3 it parses as a ``>>`` expression and raises
            # TypeError. An explicit write gives the same text on both.
            # The list is re-formatted for every row on purpose: that
            # conversion is the cost this benchmark is measuring.
            stream.write(str(floats) + '\n')
def csv_approach():
    """Write the floats to ``csv.dat`` as comma-separated text.

    The module-level ``floats`` list is emitted 1000 times, one csv row
    per repetition, with ``csv.writer`` converting each number to its
    string representation.

    PROS: easy to read/write using python. can also read file by eye
    CONS: very slow
    """
    with open('csv.dat', 'w') as out:
        write_row = csv.writer(out).writerow
        for _ in range(1000):
            write_row(floats)
def csvgz_approach():
    """Write the same csv rows as ``csv_approach`` through a gzip stream.

    The output goes to ``csvgz.dat``; the point is to see whether
    compressing the text saves I/O time.

    PROS: readable code and relatively readable output on disk
    CONS: slower than csv_approach
    """
    with gzip.open('csvgz.dat', 'wb') as out:
        write_row = csv.writer(out).writerow
        for _ in range(1000):
            write_row(floats)
def array_approach():
    """Write the floats to ``array.dat`` as raw bytes via the array module.

    On each of the 1000 repetitions the module-level ``floats`` list is
    copied into a fresh ``array.array`` with typecode ``'d'`` and dumped
    to the binary stream with ``tofile``. Only the standard library is
    needed.

    PROS: very fast, reasonably readable code
    CONS: can't read the resulting binary output file
    """
    with open('array.dat', 'wb') as out:
        for _ in range(1000):
            array.array('d', floats).tofile(out)
def numpy_approach():
    """Write the floats to ``numpy.dat`` using numpy.

    On each of the 1000 repetitions the module-level ``floats`` list is
    converted to a new ``numpy`` array and written to the binary stream
    with ``ndarray.tofile``.

    PROS: v v fast and code that is relatively easy to read
    CONS: still 30% slower than struct_approach
    """
    with open('numpy.dat', 'wb') as out:
        for _ in range(1000):
            numpy.array(floats).tofile(out)
def struct_approach():
    """Write the floats to ``struct.dat`` using python's struct module.

    On each of the 1000 repetitions the module-level ``floats`` list is
    packed into one byte string of ``'d'`` items and written to the
    binary stream. Only the standard library is needed.

    PROS: v v v v fast
    CONS: kinda janky code and can't read the resulting binary output file
    """
    with open('struct.dat', 'wb') as out:
        for _ in range(1000):
            # A repeat count ('<n>d') means exactly the same to struct as
            # the format character 'd' written out n times.
            layout = '%dd' % len(floats)
            out.write(struct.pack(layout, *floats))
# Profile every approach in turn; each report is sorted by internal time.
for statement in (
    'print_approach()',
    'csv_approach()',
    'csvgz_approach()',
    'array_approach()',
    'numpy_approach()',
    'struct_approach()',
):
    cProfile.run(statement, sort='time')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment