I first worked with the file downloaded locally to make sure I got the result right. Those drafts are not included here.
After that, there are four solutions:
- A simple one that fetches the entire payload in one request.
- A streamed one, same idea, but processing the response in chunks.
- A parallel approach that speeds up the download using HTTP range requests (see the sketch below).
- A variant of the previous one that parallelizes the computation as well.
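The solutions themselves are not reproduced here, but as a rough sketch of the range-request idea, using only the standard library (the URL, part count, and buffer size are made up for illustration):

```python
from concurrent.futures import ThreadPoolExecutor
from urllib.request import Request, urlopen

URL = "https://example-bucket.s3.amazonaws.com/data.txt"  # hypothetical
PARTS = 8
BUFSIZE = 4096

def content_length(url):
    # A HEAD request tells us the total size, so we can split the ranges.
    request = Request(url, method="HEAD")
    with urlopen(request) as response:
        return int(response.headers["Content-Length"])

def fetch_range(url, start, end):
    # Ask for just the bytes in [start, end], streaming by chunks.
    request = Request(url, headers={"Range": f"bytes={start}-{end}"})
    chunks = []
    with urlopen(request) as response:
        while chunk := response.read(BUFSIZE):
            chunks.append(chunk)
    return b"".join(chunks)

size = content_length(URL)
bounds = [(i * size // PARTS, (i + 1) * size // PARTS - 1) for i in range(PARTS)]
with ThreadPoolExecutor(max_workers=PARTS) as pool:
    parts = list(pool.map(lambda b: fetch_range(URL, *b), bounds))
payload = b"".join(parts)
```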
On my MacBook Pro with fiber over Wi-Fi (~280 Mbps) I get these times:
| Solution          | Runtime    | Memory |
|-------------------|------------|--------|
| simple            | ~5 min     | High   |
| streamed          | 2 min 55 s | Low    |
| parallel          | 1 min 20 s | High   |
| parallel streamed | 53 s       | Low    |
Since this is dominated by I/O, times vary, but they stay in that ballpark.
- S3 does not support compression, so there is no need to send `Accept-Encoding`.
- The average of a collection of numbers can be computed on the fly, incrementally; there is no need to collect them all, sum them, and divide. The first three solutions compute it that way. This is relevant since there are 10,906,858 data points. The fourth solution does that per process, and then computes the grand total from the partials (see the incremental-mean sketch after this list).
- Those formulas have divisions. Division is not exact even with arbitrary-precision decimals, but it is with rational numbers. I compared the performance of rationals and floats in this problem: the average with rationals needed about 3 minutes, while floats needed 20 seconds. The exact result computed with rationals is `1.7506631158120882`, and floats yield `1.7506631158119936`, which is not that bad (`1.75066311581206` in the 4th solution). A real use case should decide whether those decimals are important; the solutions use floats given this tiny error (see the rationals vs. floats sketch after this list).
- In general, it is better to reuse buffers where possible.
- As a rule of thumb, it is better to delegate as much as possible to the built-in functions, since they are often written in C. This is particularly important in buffered solutions (see the built-ins sketch after this list).
- I played a bit with buffer sizes: 4096 is a typical one that works well, 8192 made no measurable difference, and 1 MB slowed things down.
- Concatenation of bytearrays has terrible performance. The simple parallel downloader used it in some initial versions, and I could not understand why downloads were so slow. Lists are faster for that use case (see the last sketch after this list).
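A minimal sketch of the incremental mean mentioned above (the function names and the `(count, average)` pairs are my own illustration, not the repository's API):

```python
def mean(numbers):
    # Running mean: avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n.
    # No need to keep the 10,906,858 data points around.
    average = 0.0
    for n, x in enumerate(numbers, start=1):
        average += (x - average) / n
    return average

def combine(partials):
    # Grand mean from per-process partials, as in the fourth solution:
    # the count-weighted mean of the partial means.
    total = sum(count for count, _ in partials)
    return sum(count * average for count, average in partials) / total
```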
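The rationals vs. floats comparison can be reproduced with the standard library's `fractions` module. This is just the gist, with made-up sample data, not the code I timed:

```python
from fractions import Fraction

floats = [0.5, 1.25, 2.0]  # made-up sample; the real data comes from the payload

exact = sum(Fraction(x) for x in floats) / len(floats)  # exact rational mean
approx = sum(floats) / len(floats)                      # float mean, tiny error
print(float(exact), approx)
```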
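As an example of leaning on built-ins in the buffered solutions, record boundaries can be located with `bytes.find`, which runs in C, instead of scanning byte by byte at the Python level. The buffer contents here are made up:

```python
buffer = b"1.5\n2.25\n0.75\n"  # made-up chunk of the payload

# Let find() (written in C) locate each newline instead of a Python loop.
pos = 0
while (end := buffer.find(b"\n", pos)) != -1:
    number = float(buffer[pos:end])  # float() parsing also happens in C
    pos = end + 1
```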
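For the concatenation issue, the fix was the usual accumulate-then-join pattern: repeated concatenation into a new object copies everything received so far on every step, which is quadratic, while appending chunks to a list and joining once at the end is linear. A sketch with hypothetical chunks:

```python
chunks = [b"a" * 4096 for _ in range(1000)]  # stand-in for downloaded chunks

# Quadratic: each concatenation allocates and copies a fresh buffer.
slow = bytearray()
for chunk in chunks:
    slow = slow + chunk

# Linear: collect references in a list, join once at the end.
fast = b"".join(chunks)
```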
It's been about 14 years since I did any Python. If the dog that flies the helicopter had a Twitter account, he would post a picture of me captioned "I have no idea what I am doing".