Skip to content

Instantly share code, notes, and snippets.

@bdkosher
Last active August 2, 2016 13:46
Show Gist options
  • Save bdkosher/fe42c12b11edfbb7394a45869dd6dbe8 to your computer and use it in GitHub Desktop.
Save bdkosher/fe42c12b11edfbb7394a45869dd6dbe8 to your computer and use it in GitHub Desktop.
Command line utility for batch downloading.
// Command-line definition for the batch downloader.
// Every option takes exactly one value; the input file itself is a positional argument.
def cli = new CliBuilder(usage:'batchdl [options] <inputfile>')
cli.start(args:1, argName:'int', 'starting index (1-based) of input file to download')
cli.size(args:1, argName:'int', 'number of URLs to download')
cli.dest(args:1, argName:'dir', 'where the files should be downloaded to')
cli.log(args:1, argName: 'logfile', 'where output should be logged')

// A falsy result from parse() is treated as "the command line could not be parsed":
// show the usage text and stop with a non-zero exit status.
def options = cli.parse(args)
if (!options) {
    cli.usage()
    System.exit(1)
}
// ---- Resolve and validate the positional argument and the options ----

// The first positional argument names a text file holding one URL per line.
String inputFile = options.arguments()[0]
if (!inputFile) {
    println "Must provide an input file containing a URL per line"
    cli.usage()
    System.exit(1)
}
def input = new File(inputFile)
if (input.exists() && input.isDirectory()) {
    println "Invalid input file. $input is a directory."
    System.exit(1)
}
// Fail early with a readable message instead of a stack trace from eachLine later on.
if (!input.exists()) {
    println "Invalid input file. $input does not exist."
    System.exit(1)
}

// Defaults: start at the first line and process at most 100 URLs.
// Math.abs() means a negative value is silently treated as its magnitude.
int start = options.start ? Math.abs(options.start.toInteger()) : 1
int size = options.size ? Math.abs(options.size.toInteger()) : 100

// Downloads go to the current directory unless -dest is given.
def dest = options.dest ? new File(options.dest) : new File('.')
if (dest.exists() && !dest.isDirectory()) {
    println "Invalid destination directory. $dest is a file."
    System.exit(1)
}
// Create a missing destination up front; otherwise every download (and the default
// log file, which lives inside dest) would fail on the missing directory.
if (!dest.exists() && !dest.mkdirs()) {
    println "Invalid destination directory. $dest could not be created."
    System.exit(1)
}

// Timestamp pattern shared by the default log file name and the per-download file names.
def fmt = 'yyyy-MM-dd_HHmmss'
def defaultLogfileName = "batchdl-${new Date().format(fmt)}.log"

// Honour the -log option (the option is declared as 'log'; reading 'options.cli'
// never matched anything, so a user-supplied log file was always ignored).
def log = options.log ? new File(options.log) : new File(dest, defaultLogfileName)
if (log.exists() && log.isDirectory()) {
    // A directory was given: place the default-named log file inside it.
    log = new File(log, defaultLogfileName)
}
// ---- Download loop ----
// Walks the input file line by line (line numbers are compared against the 1-based
// 'start' index), downloads each URL into 'dest' and appends one CSV record per URL
// to the log file:
//   <input line number>,<SUCCESS|ERROR>,<elapsed seconds>,<url>,<bytes written | error message>
int downloaded = 0
input.eachLine { url, lineNumber ->
    // Skip everything before the requested starting line.
    if (lineNumber < start) return

    // The eachLine closure cannot break out of the iteration, so the script
    // exits directly once the requested number of URLs has been processed.
    if (downloaded >= size) {
        println "Processing complete."
        System.exit(0)
    }

    println "Downloading data from $url"
    // Derive a file name from the URL: slashes become dashes, the first ':' is dropped,
    // and a timestamp is appended.
    def out = new File(dest, "${url.replaceAll('/', '-') - ':'}_${new Date().format(fmt)}.out")

    // Declared outside the try so the catch block can report elapsed time as well.
    long startTime = System.nanoTime()
    try {
        new URL(url).withInputStream { stream ->
            long bytesWritten = java.nio.file.Files.copy(stream, out.toPath())
            // Elapsed time is measured from startTime (not 'start', which is the line index);
            // the record is keyed by the input line number.
            log << "$lineNumber,SUCCESS,${(System.nanoTime() - startTime) / 1e9},$url,$bytesWritten \n"
        }
    } catch (e) {
        // Best effort: record the failure and carry on with the next URL.
        log << "$lineNumber,ERROR,${(System.nanoTime() - startTime) / 1e9},$url,$e.message \n"
    }

    // Failed attempts count towards the batch size too.
    ++downloaded
}
println "Processed $downloaded of $size entries"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment