Created
April 24, 2013 16:57
-
-
Save gerjantd/5453685 to your computer and use it in GitHub Desktop.
Groovy: scrape mp3 links in html and download all
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.18') | |
import org.cyberneko.html.parsers.SAXParser | |
/**
 * Downloads the resource at the given URL into the current working directory.
 *
 * The local file name is the last "/"-separated token of the address.
 *
 * @param address URL of the resource to fetch, given as a string
 */
def download(address)
{
    def fileName = address.tokenize("/")[-1]
    // withInputStream / withOutputStream close both streams on every path,
    // including when the copy throws, so no file or network handle leaks.
    new URL(address).withInputStream { input ->
        new File(fileName).withOutputStream { output ->
            output << input
        }
    }
}
// Page that is scanned for links; also used as the prefix for every href found.
def url = 'http://foo.bar/audio'
// Parse the page with XmlSlurper backed by the NekoHTML SAX parser.
def html = new XmlSlurper(new SAXParser()).parse(url)
// '**' visits every node depth-first; keep those whose href attribute contains "mp3".
def mp3 = html.'**'.findAll { it.@href =~ 'mp3' }
// size() has to be a method call: the property form `mp3.size` on a List is
// spread over the elements and does not yield the element count.
def total = mp3.size()
def i = 0
mp3.each {
    // NOTE(review): assumes each href is relative to `url` -- confirm for the target page.
    def link = "${url}/${it.@href}"
    download link
    println "downloaded file ${++i} of ${total}: ${link}"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment