djsutherland · July 2, 2010 20:05
diff --git a/better_prop_download.py b/better_prop_download.py
 #!/usr/bin/env python

 import lxml.html
 import urllib
 import Queue
 import re
 import sys
 import threading

 # The page to scrape is given as the first command-line argument; bail out
 # with a usage message instead of an IndexError when it is missing.
 if len(sys.argv) < 2:
    sys.exit("usage: %s PAGE" % sys.argv[0])

 doc = lxml.html.parse(sys.argv[1])
 # A download link is an <a> directly inside a <p> whose href mentions
 # "downloadSong".
 links = doc.xpath('//p/a[contains(@href, "downloadSong")]')

 # shared state for the worker threads, which download 5 at a time
 pool = Queue.Queue()            # work queue of (song id, filename) pairs
 stdout_lock = threading.Lock()  # keeps the workers' print output from interleaving
 url_base = 'http://betterpropaganda.com/mp3_download.ashx?id=%s'
 class Downloader(threading.Thread):
    def run(self):
        while True:
            try:
                url, file = pool.get(timeout=3)
                with stdout_lock:
                    print "downloading %s" % file
                urllib.urlretrieve(url_base % url, file)
            except Queue.Empty:
                break
 # Five workers, created here and started only after the queue is filled.
 threads = [Downloader() for _ in range(5)]

 # tell the threads what to download

 # The numeric song id is the argument of the link's javascript call.
 id_re = re.compile(r'javascript:downloadSong\((\d+)\)')

 # Paragraph text looks like: [ play ] | [ download mp3 ] ARTIST - "TITLE"
 name_re = re.compile(r'^\[ play \] \| \[ download mp3 \] (.*?)\s?- "(.*)"')


 def esc(s):
    """Return ``s`` with parentheses dropped and spaces/slashes as underscores."""
    for old, new in (('(', ''), (')', ''), (' ', '_'), ('/', '_')):
        s = s.replace(old, new)
    return s

 for el in links:
    id = id_re.match(el.get('href')).group(1)

    p_text = el.getparent().text_content()
    match = name_re.match(p_text)
    if not match:
        print "DIDN'T MATCH '%s'" % p_text
        name = "unknown.mp3"
    else:
        name = '-'.join(map(esc, match.groups())) + '.mp3'

    pool.put( (id, name) )

 # do the downloading: launch every worker first, then wait for all of them
 for worker in threads:
    worker.start()
 for worker in threads:
    worker.join()
	#!/usr/bin/env python

	import lxml.html
	import urllib
	import Queue
	import re
	import sys
	import threading

	# The page to scrape is given as the first command-line argument; bail out
	# with a usage message instead of an IndexError when it is missing.
	if len(sys.argv) < 2:
	    sys.exit("usage: %s PAGE" % sys.argv[0])

	doc = lxml.html.parse(sys.argv[1])
	# A download link is an <a> directly inside a <p> whose href mentions
	# "downloadSong".
	links = doc.xpath('//p/a[contains(@href, "downloadSong")]')

	# shared state for the worker threads, which download 5 at a time
	pool = Queue.Queue()            # work queue of (song id, filename) pairs
	stdout_lock = threading.Lock()  # keeps the workers' print output from interleaving
	url_base = 'http://betterpropaganda.com/mp3_download.ashx?id=%s'
	class Downloader(threading.Thread):
	def run(self):
	while True:
	try:
	url, file = pool.get(timeout=3)
	with stdout_lock:
	print "downloading %s" % file
	urllib.urlretrieve(url_base % url, file)
	except Queue.Empty:
	break
	# Five workers, created here and started only after the queue is filled.
	threads = [Downloader() for _ in range(5)]

	# tell the threads what to download

	# The numeric song id is the argument of the link's javascript call.
	id_re = re.compile(r'javascript:downloadSong\((\d+)\)')

	# Paragraph text looks like: [ play ] | [ download mp3 ] ARTIST - "TITLE"
	# The separator must be an escaped literal pipe and both groups need their
	# ``*`` quantifiers; without them the pattern never matches that text.
	name_re = re.compile(r'^\[ play \] \| \[ download mp3 \] (.*?)\s?- "(.*)"')


	def esc(s):
	    """Return ``s`` with parentheses dropped and spaces/slashes as underscores."""
	    for old, new in (('(', ''), (')', ''), (' ', '_'), ('/', '_')):
	        s = s.replace(old, new)
	    return s

	for el in links:
	id = id_re.match(el.get('href')).group(1)

	p_text = el.getparent().text_content()
	match = name_re.match(p_text)
	if not match:
	print "DIDN'T MATCH '%s'" % p_text
	name = "unknown.mp3"
	else:
	name = '-'.join(map(esc, match.groups())) + '.mp3'

	pool.put( (id, name) )

	# do the downloading: launch every worker first, then wait for all of them
	for worker in threads:
	    worker.start()
	for worker in threads:
	    worker.join()