Last active
December 16, 2015 08:39
-
-
Save santa4nt/5407704 to your computer and use it in GitHub Desktop.
Sort a large number of (signed) 4-byte integers in machine format, chunks at a time.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import os | |
| import sys | |
| import array | |
| import heapq | |
| import struct | |
| import tempfile | |
| BUFSIZE = 4096 # read 1000 * 4-byte (32-bit) numbers at a time | |
| TBUFSIZE = 256 # read each sorted tempfile 64 numbers at a time | |
| input = sys.stdin | |
| output = sys.stdout | |
| tempsorts = [] # holds temporary file names whose contents contain ordered numbers | |
| sortiters = [] # holds iterators over the content of tempsorts above | |
| # 1. Read the input into blocks of BUFSIZE bytes, sort them, and dump into a tempfile | |
| # read the first block of the input into the buffer | |
| buf = input.read(BUFSIZE) | |
| while buf: | |
| a = array.array('i', buf) # if buf was full (all (BUFSIZE=4096) bytes), then len(a) == 1024 (we have 1024 random ints) | |
| s = array.array('i', sorted(a)) | |
| # emit the sorted output to a tempfile | |
| with tempfile.NamedTemporaryFile(mode='w+b', bufsize=BUFSIZE, dir='.', delete=False) as f: | |
| s.tofile(f.file) | |
| tempsorts.append(f.name) | |
| # read the next block of the input into the buffer | |
| buf = input.read(BUFSIZE) | |
| # create an iterator (really, a generator) for each temp file output we have | |
| def tempiter(fname): | |
| with open(fname, 'rb') as f: | |
| buf = f.read(TBUFSIZE) # read 64 numbers (256) bytes at a time from each temp file | |
| while buf: | |
| for n in array.array('i', buf): | |
| yield n # then the generator yields one number at a time therein | |
| buf = f.read(TBUFSIZE) # read the next 256-byte block from this tempfile | |
| # create len(tempsorts) number of such an iterator | |
| map(lambda f: sortiters.append(tempiter(f)), tempsorts) | |
| assert len(sortiters) == len(tempsorts) # sanity check | |
| """ To test the iterators we created: """ | |
| """ | |
| penult = sortiters[-2] # second last block should have exactly 1024 sorted numbers | |
| counter = 0 | |
| for n in penult: | |
| counter += 1 | |
| print ('%d,' % n), | |
| print counter | |
| assert counter == BUFSIZE / 4 | |
| """ | |
| # 2. Merge sorted output from each tempfile, using a heap queue that can contain at least len(tempsorts) numbers | |
| out = [] # temporary heap | |
| # a. First, insert the head of each input iterator into the heap | |
| for i in xrange(len(sortiters)): | |
| it = sortiters[i] | |
| item = next(it) | |
| heapq.heappush(out, (item, i)) # also include the index for the merging loop in step (b) | |
| # b. Loop while the heap is not empty | |
| a = array.array('i') # temporary buffer to store merged sorted output | |
| while out: | |
| item, i = heapq.heappop(out) | |
| # buffer the item we just popped from the heap | |
| a.append(item) | |
| # if the buffer is "full" (reached a pre-set capacity), flush to output | |
| if len(a) >= (BUFSIZE / 4): | |
| a.tofile(output) | |
| del a[:] | |
| # refresh the heap with new item from input iterator at index i | |
| item = next(sortiters[i], None) | |
| if item is not None: | |
| heapq.heappush(out, (item, i)) | |
| # finally, clean up tempfiles | |
| map(lambda f: os.unlink(f), tempsorts) |
Author
santa4nt
commented
Apr 17, 2013
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment