Skip to content

Instantly share code, notes, and snippets.

@santa4nt
Last active December 16, 2015 08:48
Show Gist options
  • Select an option

  • Save santa4nt/5408293 to your computer and use it in GitHub Desktop.

Select an option

Save santa4nt/5408293 to your computer and use it in GitHub Desktop.
Combine the last three gists: generate random ints, sort them by chunking, and verify sorted final output.
#!/usr/bin/env python
import os
import sys
import array
import struct
NUMRANDS = 10 ** 6 # the number of random integers we want to generate (one million)
BUFSIZE = 1000 # the number of integers we buffer internally before flushing to output
a = array.array('i')
for i in xrange(NUMRANDS): # generate one million 32-bit integers
# generate the integer (signed) from the OS' random source of byte strings
randint = struct.unpack('@i', os.urandom(4))[0]
a.append(randint) # buffer the random integer into an array
if len(a) >= BUFSIZE: # flush to output buffer once we have enough
a.tofile(sys.stdout)
del a[:]
if a:
a.tofile(sys.stdout)
#!/usr/bin/env python
import os
import sys
import array
import heapq
import struct
import tempfile
BUFSIZE = 4096 # read 1000 * 4-byte (32-bit) numbers at a time
TBUFSIZE = 256 # read each sorted tempfile 64 numbers at a time
input = sys.stdin
output = sys.stdout
tempsorts = [] # holds temporary file names whose contents contain ordered numbers
sortiters = [] # holds iterators over the content of tempsorts above
# 1. Read the input into blocks of BUFSIZE bytes, sort them, and dump into a tempfile
# read the first block of the input into the buffer
buf = input.read(BUFSIZE)
while buf:
a = array.array('i', buf) # if buf was full (all (BUFSIZE=4096) bytes), then len(a) == 1024 (we have 1024 random ints)
s = array.array('i', sorted(a))
# emit the sorted output to a tempfile
with tempfile.NamedTemporaryFile(mode='w+b', bufsize=BUFSIZE, dir='.', delete=False) as f:
s.tofile(f.file)
tempsorts.append(f.name)
# read the next block of the input into the buffer
buf = input.read(BUFSIZE)
# create an iterator (really, a generator) for each temp file output we have
def tempiter(fname):
with open(fname, 'rb') as f:
buf = f.read(TBUFSIZE) # read 64 numbers (256) bytes at a time from each temp file
while buf:
for n in array.array('i', buf):
yield n # then the generator yields one number at a time therein
buf = f.read(TBUFSIZE) # read the next 256-byte block from this tempfile
# create len(tempsorts) number of such an iterator
map(lambda f: sortiters.append(tempiter(f)), tempsorts)
assert len(sortiters) == len(tempsorts) # sanity check
""" To test the iterators we created: """
"""
penult = sortiters[-2] # second last block should have exactly 1024 sorted numbers
counter = 0
for n in penult:
counter += 1
print ('%d,' % n),
print
print counter
assert counter == BUFSIZE / 4
"""
# 2. Merge sorted output from each tempfile, using a heap queue that can contain at least len(tempsorts) numbers
out = [] # temporary heap
# a. First, insert the head of each input iterator into the heap
for i in xrange(len(sortiters)):
it = sortiters[i]
item = next(it)
heapq.heappush(out, (item, i)) # also include the index for the merging loop in step (b)
# b. Loop while the heap is not empty
a = array.array('i') # temporary buffer to store merged sorted output
while out:
item, i = heapq.heappop(out)
# buffer the item we just popped from the heap
a.append(item)
# if the buffer is "full" (reached a pre-set capacity), flush to output
if len(a) >= (BUFSIZE / 4):
a.tofile(output)
del a[:]
# refresh the heap with new item from input iterator at index i
item = next(sortiters[i], None)
if item is not None:
heapq.heappush(out, (item, i))
# finally, clean up tempfiles
map(lambda f: os.unlink(f), tempsorts)
#!/usr/bin/env python
import sys
import array
BUFSIZE = 4096 # read 1000 * 4-byte (32-bit) numbers at a time
input = sys.stdin
largest = None
buf = input.read(BUFSIZE)
while buf:
a = array.array('i', buf)
for n in a:
if a < largest:
# found a new number that is larger than currently known largest number!
print 0
sys.exit(-1)
else:
largest = a
buf = input.read(BUFSIZE)
# the input is sorted!
print 1
sys.exit(0)
@santa4nt
Copy link
Author

$ ./genrandints.py | ./sortints.py > sortedints
$ ./vrfsrtints.py < sortedints 
1
$ du --si -b sortedints 
4000000 sortedints
$ ./genrandints.py | ./sortints.py | ./vrfsrtints.py
1

@santa4nt
Copy link
Author

$ time ./genrandints.py | ./sortints.py > sortedints

real    0m3.803s
user    0m2.312s
sys     0m1.768s

And since verification has to be linear:

$ time ./genrandints.py | ./sortints.py | ./vrfsrtints.py
1

real    0m21.829s
user    0m21.117s
sys     0m1.840s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment