@mattrobenolt
Created July 18, 2012 02:01
# Data set is a large SQL dump. Most lines are around 1MB.
# At the smaller (default) chunk size, CPU usage jumps ridiculously high.
# Shrinking the chunk size further burns even more CPU, at a higher rate.
# Setting the chunk size to 0.5MB or even 1MB smooths usage out to a realistic level.
# The obvious bottleneck is testing each chunk with "splitlines()".
import requests

# The original 2012 snippet passed prefetch=False; modern requests spells this stream=True.
res = requests.get('https://s3.amazonaws.com/littlesis/public-data/littlesis-data.sql', stream=True)
bytes_total = int(res.headers['content-length'])
bytes_read = 0
for line in res.iter_lines(chunk_size=10 * 1024):  # default chunk size
    line_len = len(line)
    bytes_read += line_len
    print(line_len)
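
# Why small chunks hurt: a minimal sketch (not requests' actual implementation,
# just an assumed buffering scheme) of iter_lines-style iteration. Each chunk is
# appended to a pending buffer that is re-split with splitlines(); with ~1MB lines
# and small chunks, the same pending bytes get rescanned over and over, which
# matches the CPU spike described above.

```python
import io

def iter_lines_sketch(stream, chunk_size):
    """Yield complete lines from a byte stream, buffering partial lines."""
    pending = b""
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            break
        # Re-splitting pending + chunk is the hot spot: pending can hold
        # most of a 1MB line, and it is rescanned on every small chunk.
        lines = (pending + chunk).splitlines()
        if not chunk.endswith(b"\n"):
            pending = lines.pop()  # carry the incomplete trailing line over
        else:
            pending = b""
        for line in lines:
            yield line
    if pending:
        yield pending

data = io.BytesIO(b"alpha\nbeta\ngamma\n")
print(list(iter_lines_sketch(data, 4)))  # -> [b'alpha', b'beta', b'gamma']
```

# With a chunk size near the typical line length (0.5-1MB here), each line is
# split roughly once instead of being rescanned dozens of times.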