deanmalmgren · December 5, 2023 06:11
diff --git a/results.txt b/results.txt
         5 function calls in 6.465 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    6.461    6.461    6.465    6.465 similarity_profiling.py:20(print_approach)
        1    0.004    0.004    0.004    0.004 {open}
        1    0.000    0.000    0.000    0.000 {range}
        1    0.000    0.000    6.465    6.465 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


         1006 function calls in 4.621 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1000    4.606    0.005    4.606    0.005 {method 'writerow' of '_csv.writer' objects}
        1    0.011    0.011    4.621    4.621 similarity_profiling.py:31(csv_approach)
        1    0.004    0.004    0.004    0.004 {open}
        1    0.000    0.000    0.000    0.000 {range}
        1    0.000    0.000    4.621    4.621 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {_csv.writer}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


         12040 function calls in 4.819 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1000    4.460    0.004    4.817    0.005 {method 'writerow' of '_csv.writer' objects}
     1000    0.337    0.000    0.337    0.000 {built-in method compress}
     1000    0.008    0.000    0.357    0.000 gzip.py:227(write)
     1001    0.007    0.000    0.007    0.000 {zlib.crc32}
     1010    0.002    0.000    0.002    0.000 {method 'write' of 'file' objects}
     1000    0.001    0.000    0.002    0.000 gzip.py:150(_check_closed)
     1001    0.001    0.000    0.001    0.000 {isinstance}
        1    0.001    0.001    0.001    0.001 {method 'close' of 'file' objects}
        1    0.001    0.001    4.819    4.819 similarity_profiling.py:44(csvgz_approach)
     1002    0.001    0.000    0.001    0.000 gzip.py:367(closed)
     4000    0.000    0.000    0.000    0.000 {len}
        1    0.000    0.000    0.000    0.000 {built-in method flush}
        1    0.000    0.000    0.000    0.000 {open}
        1    0.000    0.000    0.000    0.000 {zlib.compressobj}
        1    0.000    0.000    0.000    0.000 gzip.py:45(__init__)
        1    0.000    0.000    0.000    0.000 gzip.py:164(_write_gzip_header)
        1    0.000    0.000    4.819    4.819 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {range}
        1    0.000    0.000    0.001    0.001 gzip.py:371(close)
        1    0.000    0.000    0.000    0.000 gzip.py:157(_init_write)
        3    0.000    0.000    0.000    0.000 gzip.py:19(write32u)
        3    0.000    0.000    0.000    0.000 {_struct.pack}
        1    0.000    0.000    0.000    0.000 gzip.py:27(open)
        1    0.000    0.000    0.000    0.000 {_csv.writer}
        1    0.000    0.000    0.000    0.000 posixpath.py:112(basename)
        1    0.000    0.000    0.000    0.000 {time.time}
        1    0.000    0.000    0.000    0.000 {method 'replace' of 'str' objects}
        1    0.000    0.000    0.000    0.000 {method 'rfind' of 'str' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.000    0.000    0.000    0.000 {chr}
        1    0.000    0.000    0.000    0.000 {method 'endswith' of 'str' objects}


         1005 function calls in 0.374 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.307    0.307    0.374    0.374 similarity_profiling.py:56(array_approach)
     1000    0.064    0.000    0.064    0.000 {method 'tofile' of 'array.array' objects}
        1    0.003    0.003    0.003    0.003 {open}
        1    0.000    0.000    0.000    0.000 {range}
        1    0.000    0.000    0.374    0.374 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


         2005 function calls in 0.238 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1000    0.172    0.000    0.172    0.000 {numpy.core.multiarray.array}
     1000    0.053    0.000    0.053    0.000 {method 'tofile' of 'numpy.ndarray' objects}
        1    0.010    0.010    0.238    0.238 similarity_profiling.py:69(numpy_approach)
        1    0.003    0.003    0.003    0.003 {open}
        1    0.000    0.000    0.000    0.000 {range}
        1    0.000    0.000    0.238    0.238 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


         3005 function calls in 0.178 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1000    0.069    0.000    0.069    0.000 {_struct.pack}
     1000    0.058    0.000    0.058    0.000 {method 'write' of 'file' objects}
        1    0.048    0.048    0.178    0.178 similarity_profiling.py:81(struct_approach)
        1    0.003    0.003    0.003    0.003 {open}
     1000    0.000    0.000    0.000    0.000 {len}
        1    0.000    0.000    0.000    0.000 {range}
        1    0.000    0.000    0.178    0.178 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
diff --git a/write_profiler.py b/write_profiler.py
 """Simple test to see how effective the various methods mentioned
 [here](http://stackoverflow.com/questions/807863/how-to-output-list-of-floats-to-a-binary-file-in-python)
 are for writing a lot of floating point numbers to disk
 """

 import cProfile
 import csv
 import array
 import math
 import struct
 import numpy
 import gzip


 # Sample payload shared by every approach below: three non-trivial doubles
 # (pi, sqrt(23), 1/9) followed by three zeros, repeated 1000 times -- a flat
 # list of 6000 floats to be exported to disk in various formats.
 floats = [math.pi, math.sqrt(23.0), 1.0/9.0, 0.0, 0.0, 0.0] * 1000


 def print_approach():
    """Dump the float list as text using the Python 2 ``print`` statement.

    Writes 1000 lines to ``print.dat``; each line is the printed form of the
    whole ``floats`` list.

    PROS: trivial code, and the output file can be read by eye
    CONS: the slowest of the approaches profiled in this script
    """
    with open('print.dat', 'w') as out:
        for _ in range(1000):
            # "print chevron" form: send the output to `out`, not stdout
            print >> out, floats


 def csv_approach():
    """Write each floating point number as text, in csv format.

    Writes 1000 rows to ``csv.dat`` through ``csv.writer``; every row is the
    full ``floats`` list converted to its string representation.

    PROS: easy to read/write using python. can also read file by eye
    CONS: far slower than the binary approaches (array, numpy, struct)
    """
    # This script is Python 2 (see the print statement in print_approach), and
    # Python 2's csv module expects a binary-mode file: in text mode the
    # writer's own '\r\n' row terminator is newline-translated a second time
    # on platforms where that matters (e.g. Windows). 'wb' also matches what
    # csvgz_approach already does.
    with open('csv.dat', 'wb') as stream:
        writer = csv.writer(stream)
        for _ in range(1000):
            writer.writerow(floats)


 def csvgz_approach():
    """Write the csv rows through a gzip stream instead of a plain file.

    Writes 1000 csv rows of ``floats`` into the gzip-compressed
    ``csvgz.dat`` to see whether compressing the output saves I/O time.

    PROS: readable code and relatively readable output on disk
    CONS: slower than csv_approach
    """
    with gzip.open('csvgz.dat', 'wb') as gz_stream:
        write_row = csv.writer(gz_stream).writerow
        for _ in range(1000):
            write_row(floats)


 def array_approach():
    """Write the floats as bytes with the standard library ``array`` module.

    Each of the 1000 passes builds an ``array.array`` of doubles (typecode
    'd') from ``floats`` and dumps it to ``array.dat`` with ``tofile``.

    PROS: very fast, reasonably readable code
    CONS: the resulting binary output file can't be read by eye
    """
    with open('array.dat', 'wb') as out:
        for _ in range(1000):
            # the array is rebuilt on every pass, exactly as profiled
            array.array('d', floats).tofile(out)


 def numpy_approach():
    """Write the floats to disk via a numpy array.

    Each of the 1000 passes converts ``floats`` to a numpy array and writes
    it to ``numpy.dat`` with ``tofile``.

    PROS: very fast, and the code is relatively easy to read
    CONS: still roughly 30% slower than struct_approach
    """
    with open('numpy.dat', 'wb') as out:
        for _ in range(1000):
            # the list -> ndarray conversion is repeated on every pass
            numpy.array(floats).tofile(out)


 def struct_approach():
    """Write the floats to disk with the standard library ``struct`` module.

    Each of the 1000 passes packs every value in ``floats`` as a double
    (one 'd' format code per value) and writes the packed string to
    ``struct.dat``.

    PROS: the fastest of the approaches profiled in this script
    CONS: kinda janky code and can't read the resulting binary output file
    """
    with open('struct.dat', 'wb') as out:
        for _ in range(1000):
            # format string is 'd' repeated once per float, e.g. 'ddd...'
            out.write(struct.pack('d' * len(floats), *floats))


 # profile each approach in turn; every report is sorted by internal time
 for approach in ('print_approach', 'csv_approach', 'csvgz_approach',
                  'array_approach', 'numpy_approach', 'struct_approach'):
    cProfile.run('%s()' % approach, sort='time')
	5 function calls in 6.465 seconds

	Ordered by: internal time

	ncalls tottime percall cumtime percall filename:lineno(function)
	1 6.461 6.461 6.465 6.465 similarity_profiling.py:20(print_approach)
	1 0.004 0.004 0.004 0.004 {open}
	1 0.000 0.000 0.000 0.000 {range}
	1 0.000 0.000 6.465 6.465 <string>:1(<module>)
	1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}


	1006 function calls in 4.621 seconds

	Ordered by: internal time

	ncalls tottime percall cumtime percall filename:lineno(function)
	1000 4.606 0.005 4.606 0.005 {method 'writerow' of '_csv.writer' objects}
	1 0.011 0.011 4.621 4.621 similarity_profiling.py:31(csv_approach)
	1 0.004 0.004 0.004 0.004 {open}
	1 0.000 0.000 0.000 0.000 {range}
	1 0.000 0.000 4.621 4.621 <string>:1(<module>)
	1 0.000 0.000 0.000 0.000 {_csv.writer}
	1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}


	12040 function calls in 4.819 seconds

	Ordered by: internal time

	ncalls tottime percall cumtime percall filename:lineno(function)
	1000 4.460 0.004 4.817 0.005 {method 'writerow' of '_csv.writer' objects}
	1000 0.337 0.000 0.337 0.000 {built-in method compress}
	1000 0.008 0.000 0.357 0.000 gzip.py:227(write)
	1001 0.007 0.000 0.007 0.000 {zlib.crc32}
	1010 0.002 0.000 0.002 0.000 {method 'write' of 'file' objects}
	1000 0.001 0.000 0.002 0.000 gzip.py:150(_check_closed)
	1001 0.001 0.000 0.001 0.000 {isinstance}
	1 0.001 0.001 0.001 0.001 {method 'close' of 'file' objects}
	1 0.001 0.001 4.819 4.819 similarity_profiling.py:44(csvgz_approach)
	1002 0.001 0.000 0.001 0.000 gzip.py:367(closed)
	4000 0.000 0.000 0.000 0.000 {len}
	1 0.000 0.000 0.000 0.000 {built-in method flush}
	1 0.000 0.000 0.000 0.000 {open}
	1 0.000 0.000 0.000 0.000 {zlib.compressobj}
	1 0.000 0.000 0.000 0.000 gzip.py:45(__init__)
	1 0.000 0.000 0.000 0.000 gzip.py:164(_write_gzip_header)
	1 0.000 0.000 4.819 4.819 <string>:1(<module>)
	1 0.000 0.000 0.000 0.000 {range}
	1 0.000 0.000 0.001 0.001 gzip.py:371(close)
	1 0.000 0.000 0.000 0.000 gzip.py:157(_init_write)
	3 0.000 0.000 0.000 0.000 gzip.py:19(write32u)
	3 0.000 0.000 0.000 0.000 {_struct.pack}
	1 0.000 0.000 0.000 0.000 gzip.py:27(open)
	1 0.000 0.000 0.000 0.000 {_csv.writer}
	1 0.000 0.000 0.000 0.000 posixpath.py:112(basename)
	1 0.000 0.000 0.000 0.000 {time.time}
	1 0.000 0.000 0.000 0.000 {method 'replace' of 'str' objects}
	1 0.000 0.000 0.000 0.000 {method 'rfind' of 'str' objects}
	1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
	1 0.000 0.000 0.000 0.000 {chr}
	1 0.000 0.000 0.000 0.000 {method 'endswith' of 'str' objects}


	1005 function calls in 0.374 seconds

	Ordered by: internal time

	ncalls tottime percall cumtime percall filename:lineno(function)
	1 0.307 0.307 0.374 0.374 similarity_profiling.py:56(array_approach)
	1000 0.064 0.000 0.064 0.000 {method 'tofile' of 'array.array' objects}
	1 0.003 0.003 0.003 0.003 {open}
	1 0.000 0.000 0.000 0.000 {range}
	1 0.000 0.000 0.374 0.374 <string>:1(<module>)
	1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}


	2005 function calls in 0.238 seconds

	Ordered by: internal time

	ncalls tottime percall cumtime percall filename:lineno(function)
	1000 0.172 0.000 0.172 0.000 {numpy.core.multiarray.array}
	1000 0.053 0.000 0.053 0.000 {method 'tofile' of 'numpy.ndarray' objects}
	1 0.010 0.010 0.238 0.238 similarity_profiling.py:69(numpy_approach)
	1 0.003 0.003 0.003 0.003 {open}
	1 0.000 0.000 0.000 0.000 {range}
	1 0.000 0.000 0.238 0.238 <string>:1(<module>)
	1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}


	3005 function calls in 0.178 seconds

	Ordered by: internal time

	ncalls tottime percall cumtime percall filename:lineno(function)
	1000 0.069 0.000 0.069 0.000 {_struct.pack}
	1000 0.058 0.000 0.058 0.000 {method 'write' of 'file' objects}
	1 0.048 0.048 0.178 0.178 similarity_profiling.py:81(struct_approach)
	1 0.003 0.003 0.003 0.003 {open}
	1000 0.000 0.000 0.000 0.000 {len}
	1 0.000 0.000 0.000 0.000 {range}
	1 0.000 0.000 0.178 0.178 <string>:1(<module>)
	1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
	"""Simple test to see how effective the various methods mentioned
	[here](http://stackoverflow.com/questions/807863/how-to-output-list-of-floats-to-a-binary-file-in-python)
	are for writing a lot of floating point numbers to disk
	"""

	import cProfile
	import csv
	import array
	import math
	import struct
	import numpy
	import gzip


	# Sample payload shared by every approach below: three non-trivial doubles
	# (pi, sqrt(23), 1/9) followed by three zeros, repeated 1000 times -- a flat
	# list of 6000 floats to be exported to disk in various formats.
	floats = [math.pi, math.sqrt(23.0), 1.0/9.0, 0.0, 0.0, 0.0] * 1000


	def print_approach():
	    """Write to disk by just printing (Python 2 ``print`` statement).

	    Writes 1000 lines to ``print.dat``; each line is the printed form of
	    the whole ``floats`` list.

	    PROS: easy to read/write code and can also read file by eye
	    CONS: the slowest of the approaches profiled in this script
	    """
	    with open('print.dat', 'w') as stream:
	        for i in range(1000):
	            # "print chevron" form: send the output to `stream`
	            print >> stream, floats


	def csv_approach():
	    """Write each floating point number as text, in csv format.

	    Writes 1000 rows to ``csv.dat`` through ``csv.writer``; every row is
	    the full ``floats`` list converted to its string representation.

	    PROS: easy to read/write using python. can also read file by eye
	    CONS: far slower than the binary approaches (array, numpy, struct)
	    """
	    # This script is Python 2 (see the print statement in print_approach),
	    # and Python 2's csv module expects a binary-mode file: in text mode
	    # the writer's own '\r\n' row terminator is newline-translated a second
	    # time on platforms where that matters (e.g. Windows). 'wb' also
	    # matches what csvgz_approach already does.
	    with open('csv.dat', 'wb') as stream:
	        writer = csv.writer(stream)
	        for i in range(1000):
	            writer.writerow(floats)


	def csvgz_approach():
	    """Try gzipping the csv output to see if that saves I/O time.

	    Writes 1000 csv rows of ``floats`` into the gzip-compressed
	    ``csvgz.dat``.

	    PROS: readable code and relatively readable output on disk
	    CONS: slower than csv_approach
	    """
	    with gzip.open('csvgz.dat', 'wb') as stream:
	        writer = csv.writer(stream)
	        for i in range(1000):
	            writer.writerow(floats)


	def array_approach():
	    """Write the floats as bytes with the standard library ``array`` module.

	    Each of the 1000 passes builds an ``array.array`` of doubles (typecode
	    'd') from ``floats`` and dumps it to ``array.dat`` with ``tofile``.

	    PROS: very fast, reasonably readable code
	    CONS: can't read the resulting binary output file
	    """
	    with open('array.dat', 'wb') as stream:
	        for i in range(1000):
	            a = array.array('d', floats)
	            a.tofile(stream)


	def numpy_approach():
	    """Write the floats to disk using numpy.

	    Each of the 1000 passes converts ``floats`` to a numpy array and
	    writes it to ``numpy.dat`` with ``tofile``.

	    PROS: very fast, and the code is relatively easy to read
	    CONS: still roughly 30% slower than struct_approach
	    """
	    with open('numpy.dat', 'wb') as stream:
	        for i in range(1000):
	            a = numpy.array(floats)
	            a.tofile(stream)


	def struct_approach():
	    """Write the floats to disk with the standard library ``struct`` module.

	    Each of the 1000 passes packs every value in ``floats`` as a double
	    (one 'd' format code per value) and writes the packed string to
	    ``struct.dat``.

	    PROS: the fastest of the approaches profiled in this script
	    CONS: kinda janky code and can't read the resulting binary output file
	    """
	    with open('struct.dat', 'wb') as stream:
	        for i in range(1000):
	            # 'd' repeated once per value, with the list unpacked into
	            # positional arguments (both '*' had been lost in this copy)
	            s = struct.pack('d'*len(floats), *floats)
	            stream.write(s)


	# profile each approach in turn; every report is sorted by internal time
	for approach in ('print_approach', 'csv_approach', 'csvgz_approach',
	                 'array_approach', 'numpy_approach', 'struct_approach'):
	    cProfile.run('%s()' % approach, sort='time')