thesession.org ABC file scraper
#!/usr/bin/env python2.7
"""
Scrape thesession.org for all the yummy ABC files
"""
import os
import itertools as IT
from collections import deque
from operator import itemgetter
from urlparse import urljoin

import grequests

CHUNK_SIZE = 5

def chunk(_iter, chunk_size=CHUNK_SIZE):
    """
    Chunks up the given iterator into `chunk_size` lists to make batching
    easier
    """
    buf = []
    for item in _iter:
        buf.append(item)
        if len(buf) == chunk_size:
            yield buf
            buf = []
    # yield whatever is left over, but don't emit an empty trailing chunk
    if buf:
        yield buf
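
# Illustrative sketch (hypothetical values): with a chunk size of 3,
#   list(chunk(range(7), chunk_size=3))  ->  [[0, 1, 2], [3, 4, 5], [6]]
# so the final chunk may be shorter than chunk_size.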

def find_best_abc(content, _id):
    """
    Find the best ABC file referenced in the given thesession.org tune's page.
    I was going to use a regex for this, but this explicit find is actually
    substantially faster.  It works on the premise that the best ABC file will
    be closer to the bottom of the page.
    """
    start = content.rfind("/tunes/{}/abc/".format(_id))
    end = content[start:].find('"')
    return content[start:start + end]
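
# Illustrative only (the path below is made up): on the page for tune 42 whose
# HTML contains href="/tunes/42/abc/3" near the bottom, find_best_abc(content, 42)
# would return "/tunes/42/abc/3" -- i.e. everything from the last occurrence of
# "/tunes/42/abc/" up to the closing double quote.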

def get_abc_files(chunk_size=CHUNK_SIZE):
    """
    Iterate through thesession.org tune pages sequentially and extract the
    best ABC file from each.  We only look at pages that we haven't already
    scraped, and we batch up our requests in chunks.
    """
    collect = deque()
    for i in IT.count(1):
        if not os.path.exists("data/{:08d}.abc".format(i)):
            print "Adding request for: ", i
            request = grequests.request("GET", "https://thesession.org/tunes/{}".format(i))
            collect.append((i, request))
        if len(collect) == chunk_size:
            print "Issuing requests"
            pages = grequests.map(IT.imap(itemgetter(1), collect))
            for (_id, _), page in IT.izip(collect, pages):
                if page is None:
                    # a failed request maps to None; skip it and it will be
                    # retried on the next run since no data file was written
                    continue
                abc = find_best_abc(page.content, _id)
                print "Parsing request for id: ", _id, abc
                yield (_id, urljoin("https://thesession.org/", abc))
            collect.clear()
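
# Sketch of how the generator is meant to be consumed (ids are hypothetical):
#   for _id, url in get_abc_files():
#       ...  # url is an absolute link like "https://thesession.org/tunes/1234/abc/..."
# main() below does exactly this, but pulls the items in chunks so the ABC
# downloads can also be issued as batched requests.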

def main():
    """
    Perform the scrape on thesession.org
    """
    abc_list = get_abc_files()
    for abc_files in chunk(abc_list):
        print "Downloading abc files: ", len(abc_files)
        data = grequests.map(grequests.get(u[1]) for u in abc_files)
        for (_id, _), abc in IT.izip(abc_files, data):
            if abc is None:
                # skip failed downloads; they'll be picked up again next run
                continue
            print "Writing: ", _id
            with open("data/{:08d}.abc".format(_id), "wb+") as fh:
                fh.write(abc.content)


if __name__ == "__main__":
    main()
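
Note on running this: the script assumes a data/ directory already exists alongside it (tunes are written out as data/00000001.abc and so on) and that the grequests library is installed; with those in place it should run directly under Python 2.7.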