Skip to content

Instantly share code, notes, and snippets.

@chao-he
Last active December 29, 2015 02:09
Show Gist options
  • Save chao-he/7598272 to your computer and use it in GitHub Desktop.
Save chao-he/7598272 to your computer and use it in GitHub Desktop.
python version of wget
#!/usr/bin/python26
from tornado.httpclient import HTTPRequest, AsyncHTTPClient
from tornado.ioloop import IOLoop
from tornado.options import parse_command_line
from tornado.options import options,define
from tornado.escape import utf8
from urllib import quote_plus
from collections import deque
import simplejson as json
import logging
# Command-line options, registered with tornado.options and read back as
# ``options.<name>`` after parse_command_line() has run.
define("i", default="url.txt",
       help="input file containing one URL per line")
define("o", default="data",
       help="output directory that receives the part-NNNNN files")
# NOTE: the option keeps its original (misspelled) name so that existing
# command lines using --conncurrent continue to work.
define("conncurrent", default=500,
       help="download concurrency level")
# User-Agent header value sent with every request.
UA = "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)"
class Downloader(object):
    """Fetch every URL listed in a text file and store the response bodies.

    URLs are read line by line from ``ifile`` and fetched through tornado's
    ``AsyncHTTPClient``.  Each successful response is appended to a part
    file ``<ofile>/part-NNNNN`` as one ``url<TAB>body<NEWLINE>`` record; a
    new part file is started every ``PART_SIZE`` records.
    """

    # Number of records written to a part file before the next one is opened.
    PART_SIZE = 10000

    def __init__(self, ifile, ofile, concurrent=None):
        """Open the URL list and reset all counters.

        ifile      -- path of the text file holding one URL per line.
        ofile      -- directory that receives the part files.
        concurrent -- number of URLs scheduled per feed round; defaults to
                      the ``--conncurrent`` command-line option.
        """
        # Resolve the concurrency first so a failure here cannot leak the
        # input file handle.
        if concurrent is None:
            concurrent = options.conncurrent
        # Feed() reads this attribute; it was previously never assigned,
        # which made every Feed() call fail with AttributeError.
        self.conncurrent = concurrent
        self.input = open(ifile)
        self.ofile = ofile
        self.outgoing = 0      # requests issued but not yet answered
        self.failures = 0      # responses that came back with an error
        self.total_downs = 0   # responses written to disk
        self.total_input = 0   # URLs read from the input file and scheduled
        self.output = None     # currently open part file, if any

    def OpenNewFile(self):
        """Close the current part file (if any) and open the next one."""
        import os

        # Floor division keeps the part number an integer on Python 3 too.
        num = self.total_downs // self.PART_SIZE
        if self.output:
            self.output.close()
            self.output = None
        # Create the output directory on demand instead of failing on the
        # first successful download.
        if self.ofile and not os.path.isdir(self.ofile):
            os.makedirs(self.ofile)
        self.output = open("%s/part-%05d" % (self.ofile, num), "w")

    def UrlGet(self, url):
        """Start an asynchronous fetch of ``url``; ``Save`` gets the result."""
        try:
            AsyncHTTPClient().fetch(
                HTTPRequest(url, headers={"User-Agent": UA}, request_timeout=30),
                callback=self.Save
            )
            # Counted only once the request has actually been handed over.
            self.outgoing += 1
        except Exception:
            logging.error("cannot fetch %s", url, exc_info=True)

    def Save(self, resp):
        """Fetch callback: account for the response and store it on success."""
        self.outgoing -= 1
        if resp.error:
            logging.error("%s => %s", resp.request.url, resp.error)
            self.failures += 1
        else:
            self.SaveOnce(resp)

    def SaveOnce(self, resp):
        """Append one ``url<TAB>body<NEWLINE>`` record to the current part file."""
        # Rotate to a fresh part file on the first record and then after
        # every PART_SIZE records.
        if self.total_downs % self.PART_SIZE == 0:
            self.OpenNewFile()
        self.output.write(resp.request.url)
        self.output.write("\t")
        self.output.write(resp.body)
        self.output.write("\n")
        self.total_downs += 1

    def Feed(self):
        """Schedule the next batch of URLs from the input file.

        Does nothing while at least ``2 * conncurrent`` requests are still
        outstanding.  On end of file the input is closed and ``self.input``
        is set to ``None``.  Blank lines are skipped and not counted.
        """
        try:
            if self.outgoing >= 2 * self.conncurrent:
                return
            for _ in range(self.conncurrent):
                line = self.input.readline()
                if not line:
                    self.input.close()
                    self.input = None
                    logging.info("total input = %d", self.total_input)
                    break
                url = line.strip()
                if not url:
                    # An empty URL can never be fetched; skip it.
                    continue
                self.total_input += 1
                self.UrlGet(url)
        except Exception:
            logging.error("error in schedule", exc_info=True)

    def RunOnce(self):
        """Periodic tick: feed new URLs, log progress, stop or reschedule."""
        if self.input:
            self.Feed()
        logging.info("input = %d, downloads = %d, failures = %d, inprogress = %d",
                     self.total_input,
                     self.total_downs,
                     self.failures,
                     self.outgoing
                     )
        loop = IOLoop.instance()
        # Must test self.input: the bare name ``input`` is the builtin
        # function, which is never None, so the loop would never stop.
        if self.input is None and self.outgoing == 0:
            if self.output:
                self.output.close()
            loop.stop()
        else:
            # Not finished yet: run again in one second.
            loop.add_timeout(loop.time() + 1, self.RunOnce)
if __name__ == "__main__":
    # Select the curl-based HTTP client implementation, capped at 100
    # simultaneous clients, before any client instance is created.
    AsyncHTTPClient.configure(
        "tornado.curl_httpclient.CurlAsyncHTTPClient",
        max_clients=100,
    )
    # Populate ``options`` from the command line.
    parse_command_line()
    downloader = Downloader(options.i, options.o)
    # Queue the first tick, then run the loop until RunOnce stops it.
    loop = IOLoop.instance()
    loop.add_callback(downloader.RunOnce)
    loop.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment