Created
July 2, 2010 20:05
-
-
Save djsutherland/461843 to your computer and use it in GitHub Desktop.
download mp3s from betterpropaganda.com in bulk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import lxml.html | |
import urllib | |
import Queue | |
import re | |
import sys | |
import threading | |
# The saved page to scrape is named by the first command-line argument.
# Fail with a usage message instead of a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s SAVED_PAGE" % sys.argv[0])
doc = lxml.html.parse(sys.argv[1])

# Download links are <a> elements directly inside a <p> whose href
# mentions "downloadSong".
links = doc.xpath('//p/a[contains(@href, "downloadSong")]')

# Work queue shared by the downloader threads; each entry is a
# (song id, output filename) tuple.
pool = Queue.Queue()

# Serialises writes to stdout made from the downloader threads.
stdout_lock = threading.Lock()

# Download URL template; the %s placeholder is filled with the song id.
url_base = 'http://betterpropaganda.com/mp3_download.ashx?id=%s'
class Downloader(threading.Thread):
    """Worker thread that downloads songs taken from the shared ``pool``.

    Each queue entry is a ``(song id, filename)`` tuple. The song id is
    substituted into ``url_base`` and the response is saved to ``filename``.
    """

    def run(self):
        """Download queued songs until the queue stays empty for 3 seconds.

        A failed download is reported and skipped so that a single bad
        song does not terminate the worker.
        """
        while True:
            # Only the queue read is guarded by Queue.Empty; keeping the
            # download out of this try makes the two failure modes distinct.
            try:
                song_id, filename = pool.get(timeout=3)
            except Queue.Empty:
                break

            with stdout_lock:
                print("downloading %s" % filename)

            try:
                urllib.urlretrieve(url_base % song_id, filename)
            except IOError as exc:
                # Network and file errors surface as IOError here; report
                # and carry on with the next queue entry.
                with stdout_lock:
                    print("FAILED %s: %s" % (filename, exc))
# threads to download 5 at a time
threads = [Downloader() for i in range(5)]

# tell the threads what to download
id_re = re.compile(r'javascript:downloadSong\((\d+)\)')
name_re = re.compile(r'^\[ play \] \| \[ download mp3 \] (.*?)\s?- "(.*)"')


def esc(s):
    """Make *s* filename-friendly: drop parentheses, turn ' ' and '/' into '_'."""
    return s.replace('(', '').replace(')', '') \
            .replace(' ', '_').replace('/', '_')


for el in links:
    href = el.get('href')
    # The XPath only checks that the href *contains* "downloadSong", so the
    # id pattern may still fail to apply; skip such links instead of crashing.
    id_match = id_re.search(href)
    if not id_match:
        print("SKIPPING unrecognised link '%s'" % href)
        continue
    song_id = id_match.group(1)

    # The artist and title are parsed from the text of the enclosing <p>.
    p_text = el.getparent().text_content()
    match = name_re.match(p_text)
    if not match:
        print("DIDN'T MATCH '%s'" % p_text)
        # Include the song id so several unparseable entries do not all
        # overwrite the same file.
        name = "unknown-%s.mp3" % song_id
    else:
        name = '-'.join(map(esc, match.groups())) + '.mp3'
    pool.put((song_id, name))
# Launch every worker first, then block until each one has finished.
for worker in threads:
    worker.start()
for worker in threads:
    worker.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment