thesession.org ABC file scraper
#!/usr/bin/env python2.7
"""
Scrape thesession.org for all the yummy ABC files
"""
import os
import itertools as IT
from collections import deque
from operator import itemgetter
from urlparse import urljoin

import grequests

CHUNK_SIZE = 5

def chunk(_iter, chunk_size=CHUNK_SIZE):
    """
    Chunks up the given iterator into `chunk_size` lists to make batching
    easier
    """
    buf = []
    for item in _iter:
        buf.append(item)
        if len(buf) == chunk_size:
            yield buf
            buf = []
    # yield whatever is left over, but don't emit an empty trailing chunk
    if buf:
        yield buf
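
# Illustrative sketch (hypothetical values): with a chunk size of 3,
#   list(chunk(range(7), chunk_size=3))  ->  [[0, 1, 2], [3, 4, 5], [6]]
# so the final chunk may be shorter than chunk_size.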

def find_best_abc(content, _id):
    """
    Find the best ABC file referenced in the given thesession.org tune's page.
    I was going to use a regex for this, but this explicit find is actually
    substantially faster.  It works on the premise that the best ABC file will
    be closer to the bottom of the page.
    """
    start = content.rfind("/tunes/{}/abc/".format(_id))
    end = content[start:].find('"')
    return content[start:start + end]
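
# Illustrative only (the path below is made up): on the page for tune 42 whose
# HTML contains href="/tunes/42/abc/3" near the bottom, find_best_abc(content, 42)
# would return "/tunes/42/abc/3" -- i.e. everything from the last occurrence of
# "/tunes/42/abc/" up to the closing double quote.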

def get_abc_files(chunk_size=CHUNK_SIZE):
    """
    Iterate through thesession.org tune pages sequentially and extract the
    best ABC file from each.  We only look at pages that we haven't already
    scraped, and we batch up our requests in chunks.
    """
    collect = deque()
    for i in IT.count(1):
        if not os.path.exists("data/{:08d}.abc".format(i)):
            print "Adding request for: ", i
            request = grequests.request("GET", "https://thesession.org/tunes/{}".format(i))
            collect.append((i, request))
        if len(collect) == chunk_size:
            print "Issuing requests"
            pages = grequests.map(IT.imap(itemgetter(1), collect))
            for (_id, _), page in IT.izip(collect, pages):
                if page is None:
                    # a failed request maps to None; skip it and it will be
                    # retried on the next run since no data file was written
                    continue
                abc = find_best_abc(page.content, _id)
                print "Parsing request for id: ", _id, abc
                yield (_id, urljoin("https://thesession.org/", abc))
            collect.clear()
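
# Sketch of how the generator is meant to be consumed (ids are hypothetical):
#   for _id, url in get_abc_files():
#       ...  # url is an absolute link like "https://thesession.org/tunes/1234/abc/..."
# main() below does exactly this, but pulls the items in chunks so the ABC
# downloads can also be issued as batched requests.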

def main():
    """
    Perform the scrape on thesession.org
    """
    abc_list = get_abc_files()
    for abc_files in chunk(abc_list):
        print "Downloading abc files: ", len(abc_files)
        data = grequests.map(grequests.get(u[1]) for u in abc_files)
        for (_id, _), abc in IT.izip(abc_files, data):
            if abc is None:
                # skip failed downloads; they'll be picked up again next run
                continue
            print "Writing: ", _id
            with open("data/{:08d}.abc".format(_id), "wb+") as fh:
                fh.write(abc.content)


if __name__ == "__main__":
    main()
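
Note on running this: the script assumes a data/ directory already exists alongside it (tunes are written out as data/00000001.abc and so on) and that the grequests library is installed; with those in place it should run directly under Python 2.7.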