Created
January 18, 2019 22:07
-
-
Save hobbsh/797306d8df44d6df5cee1894acc07b6b to your computer and use it in GitHub Desktop.
Sort arbitrary sized file filled with random integers by N biggest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import random | |
import sys | |
import threading, Queue | |
from heapq import nlargest | |
import os | |
# job queue | |
queue = Queue.Queue() | |
# result queue | |
result = [] | |
FILE = 'nums.txt' | |
# From: http://effbot.org/zone/wide-finder.htm | |
class Worker(threading.Thread): | |
def run(self): | |
while 1: | |
args = queue.get() | |
if args is None: | |
break | |
result.append(process(*args)) | |
queue.task_done() | |
# Sort chunk by largest number and return it | |
def process(file, chunk): | |
f = open(file, "rb") | |
f.seek(chunk[0]) | |
chunk_list = f.read(chunk[1]).split('\n') | |
largest = max(chunk_list) | |
return largest | |
# Chunk input file into 1M chunks - increase size for bigger files | |
def getchunks(fname, size=1024*1024): | |
f = open(fname) | |
while 1: | |
start = f.tell() | |
f.seek(size, 1) | |
s = f.readline() | |
yield start, f.tell() - start | |
if not s: | |
break | |
#Seed a new nums.txt | |
def seed(nums): | |
with open(FILE, 'w+') as f: | |
for i in range(0,nums): | |
line = str(random.randint(1, 99999999999999)) | |
f.write(line+'\n') | |
def sort(): | |
try: | |
num_biggest = int(sys.argv[1]) | |
except ValueError: | |
print('Biggest number value must be a valid integer: i.e. python nums.py 100') | |
sys.exit(1) | |
# Create 4 worker threads | |
for i in range(4): | |
w = Worker() | |
w.setDaemon(1) | |
w.start() | |
# Generate chunks of file and push them to queue for workers to process | |
for chunk in getchunks(FILE): | |
queue.put((FILE, chunk)) | |
# Start processing | |
queue.join() | |
#Sort result list by N biggest numbers | |
biggest_nums = nlargest(num_biggest, result) | |
for num in biggest_nums: | |
print(num) | |
if __name__ == '__main__': | |
#seed(1000000000) #seed 100 million ints | |
sort() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment