
@zacharysyoung
Last active December 4, 2021 02:58

Processing 30GB of text

Working out how to read in 30GB worth of text that has no line-breaking record separator... at least, there wasn't a newline in the short example:

28807644'~'0'~'Maun FCU'~'US#@#@#28855353'~'0'~'WNB Holdings LLC'~'US#@#@#29212330'~'0'~'Idaho First Bank'~'US#@#@#29278777'~'0'~'Republic Bank of Arizona'~'US#@#@#29633181'~'0'~'Friendly Hills Bank'~'US#@#@#29760145'~'0'~'The Freedom Bank of Virginia'~'US#@#@#100504846'~'0'~'Community First Fund Federal Credit Union'~'US#@#@#

To make this right:

  • replace field separator '~' with ,
  • replace record separator #@#@# with \n
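
In memory, that's just two replace calls. A minimal sketch, assuming the whole file could fit in RAM (a 30GB file can't, which is what main.py's chunked approach solves; in.txt and out.csv are placeholder names):

with open('in.txt') as f_in, open('out.csv', 'w') as f_out:
    f_out.write(f_in.read().replace("'~'", ',').replace('#@#@#', '\n'))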

This solution has a number of components:

main.py

The Python solution (uses buffered read/write).

Generate a test file first, link that test file to my_file.txt, then run with a chunk-size argument: python3 main.py [1K|32K|1M].
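
For example, a full run might look like this (the magnitude and chunk size are illustrative):

python3 gen_test.py 6 > e6test.txt
ln -fs e6test.txt my_file.txt
python3 main.py 32K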

gen_test.py

A helper that takes that small example and multiplies it to get KBs, MBs, or GBs of test data (it also uses a buffered-write approach).

Run python3 gen_test.py [MAGNITUDE] > test.txt, where MAGNITUDE is the power of ten for the number of copies of the sample string to generate (i.e., 10**MAGNITUDE copies).

For example, python3 gen_test.py 6 > e6test.txt to create a file with one million copies of the sample string.
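
Each copy of the sample string is roughly 330 bytes, so expect about 330 * 10**MAGNITUDE bytes of output. A quick estimate (the 330-byte figure is approximate):

STR_SZ = 330  # approximate bytes per copy of the sample string
for mag in range(9):
    print(f'magnitude {mag}: ~{STR_SZ * 10**mag / 1024**3:.3f} GB')
# magnitude 8 comes to roughly 31 GB, about the size of the
# original 30GB problem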

gen_target.py

Like gen_test.py, but creates what the processor should produce given similarly sized input from gen_test.py.

Run python3 gen_target.py [MAGNITUDE] > target.csv.

test.sh

A simple test harness for evaluating time, memory usage, and correctness compared to target files.

Run sh test.sh CHUNK_SZ where CHUNK_SZ is the chunk size in main.py you'd like to evaluate.

Redirect to a file, like results_[CHUNK_SZ].txt, to save and collect the results, and maybe run them through process_results.py.

process_results.py

Parses the results_[CHUNK_SZ].txt files and creates a CSV with the metrics.

Run python3 process_results.py CHUNK_SZ [CHUNK_SZ_N ...], e.g. python3 process_results.py 1K 32K 1M > metrics.csv.

gen_target.py

#!/usr/bin/env python3
import sys

CHUNK_SZ = 64 * 1024

STR = '''
28807644,0,Maun FCU,US
28855353,0,WNB Holdings LLC,US
29212330,0,Idaho First Bank,US
29278777,0,Republic Bank of Arizona,US
29633181,0,Friendly Hills Bank,US
29760145,0,The Freedom Bank of Virginia,US
100504846,0,Community First Fund Federal Credit Union,US
'''.lstrip()  # need last newline
STR_SZ = len(STR)

mag = int(sys.argv[1])  # the 10**number of times to write STR

str_ct = int(CHUNK_SZ / STR_SZ)
str_rem = CHUNK_SZ % STR_SZ
# print(f'STR will fit into CHUNK {str_ct} times, with a remainder of {str_rem} empty slots ({(str_rem/CHUNK_SZ)*100:.1f}% wasted space per write)')

write_chunk_ct = int(10**mag / str_ct)
write_str_rem = 10**mag % str_ct
# print(f'Will need to write STR {write_chunk_ct + write_str_rem} times')

write = sys.stdout.write
write('a,b,c,d\n')
for i in range(write_chunk_ct):
    write(STR * str_ct)
write(STR * write_str_rem)

gen_test.py

#!/usr/bin/env python3
import sys

CHUNK_SZ = 64 * 1024

STR = "28807644'~'0'~'Maun FCU'~'US#@#@#28855353'~'0'~'WNB Holdings LLC'~'US#@#@#29212330'~'0'~'Idaho First Bank'~'US#@#@#29278777'~'0'~'Republic Bank of Arizona'~'US#@#@#29633181'~'0'~'Friendly Hills Bank'~'US#@#@#29760145'~'0'~'The Freedom Bank of Virginia'~'US#@#@#100504846'~'0'~'Community First Fund Federal Credit Union'~'US#@#@#"
STR_SZ = len(STR)

mag = int(sys.argv[1])  # the 10**number of times to write STR

str_ct = int(CHUNK_SZ / STR_SZ)
str_rem = CHUNK_SZ % STR_SZ
# print(f'STR will fit into CHUNK {str_ct} times, with a remainder of {str_rem} empty slots ({(str_rem/CHUNK_SZ)*100:.1f}% wasted space per write)')

write_chunk_ct = int(10**mag / str_ct)
write_str_rem = 10**mag % str_ct
# print(f'Will need to write STR {write_chunk_ct + write_str_rem} times')

for i in range(write_chunk_ct):
    sys.stdout.write(STR * str_ct)
sys.stdout.write(STR * write_str_rem)

main.py

import sys

CHUNK_SZES = {'1K': 1024, '32K': 32 * 1024, '1M': 1024 * 1024}

FS = "'~'"
RS = '#@#@#'

# With chars repeated in the separators, check most specific (least ambiguous)
# to least specific (most ambiguous) to definitively catch a partial with the
# fewest number of checks
PARTIAL_RSES = ['#@#@', '#@#', '#@', '#']
PARTIAL_FSES = ["'~", "'"]
ALL_PARTIALS = PARTIAL_FSES + PARTIAL_RSES

chunk_sz = CHUNK_SZES[sys.argv[1]]

f_out = open('out.csv', 'w')
f_out.write('a,b,c,d\n')

f_in = open('my_file.txt')

line = ''
while True:
    # Read chunks till no more, then break out
    chunk = f_in.read(chunk_sz)
    if not chunk:
        break

    # Any previous partial separator, plus new chunk
    line += chunk

    # Check end of line for a partial FS or RS (when separators are more than
    # one char)
    final_partial = ''
    if line.endswith(FS) or line.endswith(RS):
        pass  # Write-out will replace complete FS or RS
    else:
        for partial in ALL_PARTIALS:
            if line.endswith(partial):
                final_partial = partial
                line = line[:-len(partial)]
                break

    # Process/write chunk
    f_out.write(line
                .replace(FS, ',')
                .replace(RS, '\n'))

    # Add partial back, to be completed next chunk
    line = final_partial

# Clean up
f_in.close()
f_out.close()
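
The trailing-partial carry-over is the crux of main.py: if a chunk boundary falls inside '#@#@#', a per-chunk replace() silently misses it. A self-contained sketch of the failure mode and the fix (not part of the gist; it mirrors only the record-separator handling and omits field separators for brevity):

#!/usr/bin/env python3
RS = '#@#@#'
record = "28807644'~'0'~'Maun FCU'~'US" + RS
data = record * 2

# Force a chunk boundary inside the record separator: the first chunk
# ends with the partial '#@#'
cut = len(record) - 2
chunks = [data[:cut], data[cut:]]

# Naive per-chunk replace: the split separator survives, fusing records
naive = ''.join(c.replace(RS, '\n') for c in chunks)
print(repr(naive.splitlines()[0]))  # two records fused by '#@#@#'

# Carry-over, as in main.py: hold back a trailing partial separator and
# prepend it to the next chunk
out, carry = [], ''
for chunk in chunks:
    line = carry + chunk
    carry = ''
    if not line.endswith(RS):  # a complete RS will be replaced below
        for partial in ('#@#@', '#@#', '#@', '#'):  # most specific first
            if line.endswith(partial):
                carry = partial
                line = line[:-len(partial)]
                break
    out.append(line.replace(RS, '\n'))
print(repr(''.join(out)))  # two records, each on its own line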

process_results.py

#!/usr/bin/env python3
import csv
import sys
from collections import defaultdict

# Turn a set of files like:
#
#   e0test.txt
#   0.08 real 0.01 user 0.00 sys
#   5067840 peak memory footprint
#   e1test.txt
#   0.02 real 0.01 user 0.00 sys
#   4981760 peak memory footprint
#   e2test.txt
#   0.02 real 0.01 user 0.00 sys
#   4994048 peak memory footprint
#
# into:
#
#   | File size | 1K (s) | 1K (MB) |
#   |-----------|--------|---------|
#   | e0        | 0.08   | 4.8     |
#   | e1        | 0.02   | 4.8     |
#   | e2        | 0.02   | 4.8     |

# BUF_SIZES = ['1K', '32K', '1M']
F_SIZES = [f'e{i}' for i in range(0, 9)]

buf_sizes = sys.argv[1:]

# Iterate buffer-sized results files
results = defaultdict(dict)
for buf_size in buf_sizes:
    f = open(f'results_{buf_size}.txt')
    # For every file-size result group, consume 3 lines
    while True:
        try:
            fsz = next(f).strip()[:2]
            line_time = next(f).strip().split(' ')
            line_mem = next(f).strip().split(' ')
        except StopIteration:
            break
        except Exception as e:
            print(f'error: {e}')
            sys.exit(1)
        assert fsz in F_SIZES, f'{fsz} is not a valid F_SIZE'
        assert line_time[1] == 'real'
        assert line_mem[2] == 'peak'
        t = line_time[0]
        m = int(line_mem[0])
        results[fsz][buf_size] = (t, m)
    f.close()

w = csv.writer(sys.stdout)

header = ['File size']
for buf_size in buf_sizes:
    header += [f'{buf_size} (s)']
    header += [f'{buf_size} (MB)']
w.writerow(header)

for fsz, buf_szs in results.items():
    row = [fsz]
    for buf_sz in buf_szs:
        t, m = results[fsz][buf_sz]
        # Format, convert mem size to MB
        row += [f'{t}', f'{m/1024**2:.1f}']
    w.writerow(row)

test.sh

#!/bin/sh
chunkSize=$1
mags="0 1 2 3" # 4 5 6" # 7 8"
for mag in $mags; do
    testFile="e${mag}test.txt"
    targetFile="e${mag}target.csv"
    # Make sure there's something to test against
    if [ ! -f "$testFile" ]; then
        echo "Generating $testFile"
        ./gen_test.py "$mag" >"$testFile"
    fi
    if [ ! -f "$targetFile" ]; then
        echo "Generating $targetFile"
        ./gen_target.py "$mag" >"$targetFile"
    fi
    echo "$testFile"
    # main.py reads my_file.txt and writes to out.csv
    ln -fs "$testFile" my_file.txt
    # Run and get stats
    /usr/bin/time -l python3 main.py "$chunkSize" 2>&1 \
        | grep 'real\|peak'
    # Test
    cmp "$targetFile" out.csv
done