Qiwei's answer to the optimization take-home test
1. The average runtime of the program is 52 seconds.
2. I split the program into four small functions, each serving a single purpose: read_content, get_links, download_video and download_image. This lets me measure the memory usage of each part with memory_profiler. According to its output, read_content takes up the most memory, about 48 MiB; the others use less: get_links 40 MiB, download_video 42 MiB and download_image 45 MiB.
3. In this program, the variable contents, the JSON object parsed from contents.json, occupies the most memory.
4. 1) The first way to optimize this script is to skip the get_links step: instead of collecting all the mp4 and png links into lists (get_links is just a for loop), iterate over each JSON object and download the links directly from there (see the first sketch after this list). However, this only reduces average runtime by 1-2 seconds, which in my opinion is not effective.
2) The second way to optimize this script is to use Python's threading module to download the links in parallel instead of one by one, which should reduce runtime. Using multithreading for download_mp4 alone brings runtime down to 35 seconds from the original 52 seconds. Applying multithreading to both download_mp4 and download_png, as the script below does, brings average runtime down to 10 seconds. The tradeoff is higher peak memory usage, now about 60 MiB.
3) The third way to optimize this script is to use ijson with its yajl2 backend (which is faster than ijson's default backend), so that we can parse the JSON incrementally instead of loading the whole string into memory; in this case all we need is "download_urls" (see the second sketch below). Although this brings peak memory usage down to 37 MiB, runtime is about 49 seconds, slower than the second solution, so I decided not to use ijson.
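A minimal sketch of the first approach (assuming contents has already been parsed as in the script below, and reusing its download_mp4 and download_png helpers):

for content in contents.values():
    # Download straight from the parsed JSON; no intermediate link lists.
    download_mp4(content["download_urls"]["mp4"])
    download_png(content["download_urls"]["png"])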
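And a minimal sketch of the third approach (assuming the yajl C library is installed for the yajl2 backend, and the same contents.json layout as below: a top-level object whose values each carry a "download_urls" dict):

import ijson.backends.yajl2 as ijson

mp4_links, png_links = [], []
with open('contents.json', 'rb') as f:
    # ijson.parse streams (prefix, event, value) tuples, so the whole
    # document is never materialized in memory at once.
    for prefix, event, value in ijson.parse(f):
        if prefix.endswith('.download_urls.mp4'):
            mp4_links.append(value)
        elif prefix.endswith('.download_urls.png'):
            png_links.append(value)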
#!/usr/bin/env python3
# The goal of this script is to download all the mp4 and png
# data for each content.
import imghdr
import logging
import json
import tempfile
import urllib.request
import time
import threading
from contextlib import closing
from memory_profiler import profile

begin = time.time()
logging.basicConfig(level=logging.INFO)
# First, read the contents file into memory, then parse it into json.
@profile
def read_content():
    with open('contents.json') as f:
        contents_str = f.read()
    contents = json.loads(contents_str)
    return contents

contents = read_content()
# Then, loop over each content, and get its mp4 and png values.
@profile
def get_links():
    logging.info("Extracting mp4 and png links.")
    mp4_links = []
    png_links = []
    for content in contents.values():
        mp4 = content["download_urls"]["mp4"]
        png = content["download_urls"]["png"]
        png_links.append(png)
        mp4_links.append(mp4)
    return mp4_links, png_links

mp4_links, png_links = get_links()
# For the sake of runtime and reproducibility let's just download the
# first LIMIT links (50 here). You can reduce this if you want.
LIMIT = 50

def download_mp4(mp4):
    logging.info("Downloading {}".format(mp4))
    # closing() guarantees the connection is released when the request is
    # done; the response body is intentionally not saved (see the note below).
    with closing(urllib.request.urlopen(mp4)):
        pass
# Now, download each mp4 link, one thread per link.
@profile
def download_video_parallel():
    logging.info("Downloading mp4 links.")
    threads = [threading.Thread(target=download_mp4, args=(mp4,))
               for mp4 in mp4_links[:LIMIT]]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

download_video_parallel()
# Let's not save the file to avoid cluttering our filesystem.
# What's a fast and succinct way of detecting whether what we downloaded
# was an MP4 file?
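# One quick heuristic (a sketch, not exercised by this script): ISO base
# media files such as MP4 carry an "ftyp" box at byte offset 4, so
# checking a small prefix of the response is fast and succinct:
#     head = r.read(12)
#     is_mp4 = head[4:8] == b"ftyp"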
# Now, download each png link.
# However, we want to figure out if what we downloaded was a PNG file.
# If it's not a PNG file, then it should not be in the filesystem after
# this script.
def download_png(png):
    logging.info("Downloading {}".format(png))
    with closing(urllib.request.urlopen(png)) as r:
        data = r.read()
    # imghdr sniffs the image type from the first bytes of the payload.
    if imghdr.what(None, h=data) != "png":
        # Bail out before creating the temp file, so nothing non-PNG
        # ends up in the filesystem.
        logging.info("Image not a PNG; not saving.")
        return
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        f.write(data)
@profile
def download_image():
    logging.info("Downloading png links.")
    threads = [threading.Thread(target=download_png, args=(png,))
               for png in png_links[:LIMIT]]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

download_image()

end = time.time()
print("running time: {}s".format(end - begin))
Filename: download_videos_and_images.py

Line #    Mem usage    Increment   Line Contents
================================================
    19     18.6 MiB      0.0 MiB   @profile(stream=output)
    20                             def read_content():
    21     18.6 MiB      0.0 MiB       with open('contents.json') as f:
    22     26.3 MiB      7.7 MiB           contents_str = f.read()
    23     47.9 MiB     21.6 MiB       contents = json.loads(contents_str)
    24     47.9 MiB      0.0 MiB       return contents

Filename: download_videos_and_images.py

Line #    Mem usage    Increment   Line Contents
================================================
    28     40.3 MiB      0.0 MiB   @profile(stream=output)
    29                             def get_links():
    30     40.3 MiB      0.0 MiB       logging.info("Extracting mp4 and png links.")
    31     40.3 MiB      0.0 MiB       mp4_links = []
    32     40.3 MiB      0.0 MiB       png_links = []
    33     40.3 MiB      0.0 MiB       for content in contents.values():
    34     40.3 MiB      0.0 MiB           mp4 = content["download_urls"]["mp4"]
    35     40.3 MiB      0.0 MiB           png = content["download_urls"]["png"]
    36     40.3 MiB      0.0 MiB           png_links.append(png)
    37     40.3 MiB      0.0 MiB           mp4_links.append(mp4)
    38     40.3 MiB      0.0 MiB       return mp4_links, png_links

Filename: download_videos_and_images.py

Line #    Mem usage    Increment   Line Contents
================================================
    50     40.3 MiB      0.0 MiB   @profile(stream=output)
    51                             def download_video_parallel():
    52     40.3 MiB      0.0 MiB       logging.info("Downloading mp4 links.")
    53     40.3 MiB      0.0 MiB       threads = [threading.Thread(target=download_mp4,args = (mp4,)) for mp4 in mp4_links[:LIMIT]]
    54     53.4 MiB     13.0 MiB       for t in threads:
    55     53.4 MiB      0.0 MiB           t.start()
    56     53.4 MiB      0.0 MiB       for t in threads:
    57     46.5 MiB     -6.9 MiB           t.join()

Filename: download_videos_and_images.py

Line #    Mem usage    Increment   Line Contents
================================================
    79     42.8 MiB      0.0 MiB   @profile(stream=output)
    80                             def download_image():
    81     42.8 MiB      0.0 MiB       logging.info("Downloading png links.")
    82     42.8 MiB      0.0 MiB       threads = [threading.Thread(target=download_png,args=(png,)) for png in png_links[:LIMIT]]
    83     53.1 MiB     10.2 MiB       for t in threads:
    84     53.1 MiB      0.0 MiB           t.start()
    85     63.9 MiB     10.9 MiB       for t in threads:
    86     63.9 MiB      0.0 MiB           t.join()

running time: 9.577131032943726s