Skip to content

Instantly share code, notes, and snippets.

@santa4nt
Last active December 16, 2015 08:39
Show Gist options
  • Select an option

  • Save santa4nt/5407704 to your computer and use it in GitHub Desktop.

Select an option

Save santa4nt/5407704 to your computer and use it in GitHub Desktop.
Sort a large number of (signed) 4-byte integers in machine format, chunks at a time.
#!/usr/bin/env python
import os
import sys
import array
import heapq
import struct
import tempfile
BUFSIZE = 4096 # read 1000 * 4-byte (32-bit) numbers at a time
TBUFSIZE = 256 # read each sorted tempfile 64 numbers at a time
input = sys.stdin
output = sys.stdout
tempsorts = [] # holds temporary file names whose contents contain ordered numbers
sortiters = [] # holds iterators over the content of tempsorts above
# 1. Read the input into blocks of BUFSIZE bytes, sort them, and dump into a tempfile
# read the first block of the input into the buffer
buf = input.read(BUFSIZE)
while buf:
a = array.array('i', buf) # if buf was full (all (BUFSIZE=4096) bytes), then len(a) == 1024 (we have 1024 random ints)
s = array.array('i', sorted(a))
# emit the sorted output to a tempfile
with tempfile.NamedTemporaryFile(mode='w+b', bufsize=BUFSIZE, dir='.', delete=False) as f:
s.tofile(f.file)
tempsorts.append(f.name)
# read the next block of the input into the buffer
buf = input.read(BUFSIZE)
# create an iterator (really, a generator) for each temp file output we have
def tempiter(fname):
with open(fname, 'rb') as f:
buf = f.read(TBUFSIZE) # read 64 numbers (256) bytes at a time from each temp file
while buf:
for n in array.array('i', buf):
yield n # then the generator yields one number at a time therein
buf = f.read(TBUFSIZE) # read the next 256-byte block from this tempfile
# create len(tempsorts) number of such an iterator
map(lambda f: sortiters.append(tempiter(f)), tempsorts)
assert len(sortiters) == len(tempsorts) # sanity check
""" To test the iterators we created: """
"""
penult = sortiters[-2] # second last block should have exactly 1024 sorted numbers
counter = 0
for n in penult:
counter += 1
print ('%d,' % n),
print
print counter
assert counter == BUFSIZE / 4
"""
# 2. Merge sorted output from each tempfile, using a heap queue that can contain at least len(tempsorts) numbers
out = [] # temporary heap
# a. First, insert the head of each input iterator into the heap
for i in xrange(len(sortiters)):
it = sortiters[i]
item = next(it)
heapq.heappush(out, (item, i)) # also include the index for the merging loop in step (b)
# b. Loop while the heap is not empty
a = array.array('i') # temporary buffer to store merged sorted output
while out:
item, i = heapq.heappop(out)
# buffer the item we just popped from the heap
a.append(item)
# if the buffer is "full" (reached a pre-set capacity), flush to output
if len(a) >= (BUFSIZE / 4):
a.tofile(output)
del a[:]
# refresh the heap with new item from input iterator at index i
item = next(sortiters[i], None)
if item is not None:
heapq.heappush(out, (item, i))
# finally, clean up tempfiles
map(lambda f: os.unlink(f), tempsorts)
@santa4nt
Copy link
Author

$ ./sortints.py < randints > sortedints
$ du -b --si sortedints 
4.0M    sortedints
$ python -c "import array; f = open('sortedints', 'rb'); buf = f.read(40); print array.array('i', buf); f.close()"
array('i', [-2147483479, -2147482182, -2147477010, -2147479446, -2147479798, -2147469225, -2147472137, -2147430253, -2147468777, -2147442541])

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment