rande · August 5, 2010 13:19
diff --git a/gistfile1.py b/gistfile1.py
 import threading, time, urllib
 import fileinput, io

 class UrlDownloadThread(threading.Thread):
   """Thread that requests one URL and records whether it answered with 200.

   Attributes:
     url: the URL handed to the constructor.
     error: ``None`` until ``run`` detects a problem; afterwards a
       human-readable message describing the failure.
   """

   def __init__(self, url):
     """Remember ``url``; the request itself is performed by ``run``."""
     threading.Thread.__init__(self)
     self.url = url
     # Always defined, so callers can inspect it without an AttributeError
     # on threads that succeeded or have not run yet.
     self.error = None

   def run(self):
     """Open ``self.url`` and set ``self.error`` if anything goes wrong.

     Only the status code is inspected; the response body is not read.
     """
     # Python 2 exposes urlopen on ``urllib`` itself, Python 3 moved it to
     # ``urllib.request``.
     opener = getattr(urllib, 'urlopen', None)
     if opener is None:
       from urllib.request import urlopen as opener

     try:
       response = opener(self.url)
     except IOError as exc:
       # Without this, a failed request would only produce an uncaught
       # traceback in the thread and leave nothing on the object to inspect.
       self.error = 'download failed : %s (%s)' % (self.url, exc)
       return

     try:
       if response.getcode() != 200:
         self.error = 'http code is not 200 : %s' % self.url
     finally:
       # Release the handle returned by urlopen.
       response.close()

 class CacheCrawler:
   """Request every URL listed in a text file with a fixed pool of threads.

   Attributes:
     number: how many download threads are kept busy at once.
     pool: maps a slot index to the thread currently assigned to that slot.
     pos: initialised to 0; not referenced by any method of this class.
     stream: the open URL list, consumed one line at a time.
     count: how many URLs ``get_next_url`` has handed out so far.
   """

   def __init__(self, number, file):
     """Open the URL list at path ``file`` for a pool of ``number`` threads."""
     self.number = number
     self.pool   = {}
     self.pos    = 0
     self.stream = io.open(file)
     self.count  = 0

   def start(self):
     """Run the crawl and return once every started download has finished.

     Fills up to ``number`` slots, then polls every 0.1 s and hands the next
     URL to any slot whose thread has ended. When the list is exhausted the
     still-running threads are joined and the URL list is closed.
     """
     print("starting thread")
     try:
       # Initial fill: one thread per slot while URLs are available.
       for slot in range(self.number):
         url = self.get_next_url()
         if not url:
           break
         self._launch(slot, url)

       # An empty pool can never free a slot, so polling it would spin
       # forever; treat it as "nothing left to do".
       exhausted = not self.pool
       while not exhausted:
         time.sleep(0.1)
         for slot in self.pool:
           if self.pool[slot].is_alive():
             continue

           url = self.get_next_url()
           if not url:
             exhausted = True
             break

           # ``count`` is the number of URLs handed out, not the number of
           # completed requests.
           if self.count % 100 == 0:
             print("urls downloaded : %s" % self.count)

           self._launch(slot, url)

       print('no more url to fetch')
       # Let in-flight requests finish before reporting completion.
       for worker in self.pool.values():
         worker.join()
     finally:
       self.stream.close()

   def _launch(self, slot, url):
     """Start a download thread for ``url`` and store it under ``slot``."""
     worker = UrlDownloadThread(url)
     self.pool[slot] = worker
     worker.start()

   def get_next_url(self):
     """Return the next non-blank URL, or an empty string when none is left.

     Surrounding whitespace (including the line terminator) is stripped and
     blank lines are skipped; ``count`` grows by one per URL returned.
     """
     if self.stream.closed:
       return ''

     while True:
       line = self.stream.readline()
       if not line:
         # End of file.
         return line
       url = line.strip()
       if url:
         self.count += 1
         return url

 # Script entry: crawl the recipe URL list with five concurrent threads.
 CacheCrawler(number=5, file="urls/recipes.txt").start()
	import threading, time, urllib
	import fileinput, io

	class UrlDownloadThread(threading.Thread):
	  """Thread that requests one URL and records whether it answered with 200.

	  Attributes:
	    url: the URL handed to the constructor.
	    error: ``None`` until ``run`` detects a problem; afterwards a
	      human-readable message describing the failure.
	  """

	  def __init__(self, url):
	    """Remember ``url``; the request itself is performed by ``run``."""
	    threading.Thread.__init__(self)
	    self.url = url
	    # Always defined, so callers can inspect it without an AttributeError
	    # on threads that succeeded or have not run yet.
	    self.error = None

	  def run(self):
	    """Open ``self.url`` and set ``self.error`` if anything goes wrong.

	    Only the status code is inspected; the response body is not read.
	    """
	    # Python 2 exposes urlopen on ``urllib`` itself, Python 3 moved it to
	    # ``urllib.request``.
	    opener = getattr(urllib, 'urlopen', None)
	    if opener is None:
	      from urllib.request import urlopen as opener

	    try:
	      response = opener(self.url)
	    except IOError as exc:
	      # Without this, a failed request would only produce an uncaught
	      # traceback in the thread and leave nothing on the object to inspect.
	      self.error = 'download failed : %s (%s)' % (self.url, exc)
	      return

	    try:
	      if response.getcode() != 200:
	        self.error = 'http code is not 200 : %s' % self.url
	    finally:
	      # Release the handle returned by urlopen.
	      response.close()

	class CacheCrawler:
	  """Request every URL listed in a text file with a fixed pool of threads.

	  Attributes:
	    number: how many download threads are kept busy at once.
	    pool: maps a slot index to the thread currently assigned to that slot.
	    pos: initialised to 0; not referenced by any method of this class.
	    stream: the open URL list, consumed one line at a time.
	    count: how many URLs ``get_next_url`` has handed out so far.
	  """

	  def __init__(self, number, file):
	    """Open the URL list at path ``file`` for a pool of ``number`` threads."""
	    self.number = number
	    self.pool = {}
	    self.pos = 0
	    self.stream = io.open(file)
	    self.count = 0

	  def start(self):
	    """Run the crawl and return once every started download has finished.

	    Fills up to ``number`` slots, then polls every 0.1 s and hands the next
	    URL to any slot whose thread has ended. When the list is exhausted the
	    still-running threads are joined and the URL list is closed.
	    """
	    print("starting thread")
	    try:
	      # Initial fill: one thread per slot while URLs are available.
	      for slot in range(self.number):
	        url = self.get_next_url()
	        if not url:
	          break
	        self._launch(slot, url)

	      # An empty pool can never free a slot, so polling it would spin
	      # forever; treat it as "nothing left to do".
	      exhausted = not self.pool
	      while not exhausted:
	        time.sleep(0.1)
	        for slot in self.pool:
	          if self.pool[slot].is_alive():
	            continue

	          url = self.get_next_url()
	          if not url:
	            exhausted = True
	            break

	          # ``count`` is the number of URLs handed out, not the number of
	          # completed requests.
	          if self.count % 100 == 0:
	            print("urls downloaded : %s" % self.count)

	          self._launch(slot, url)

	      print('no more url to fetch')
	      # Let in-flight requests finish before reporting completion.
	      for worker in self.pool.values():
	        worker.join()
	    finally:
	      self.stream.close()

	  def _launch(self, slot, url):
	    """Start a download thread for ``url`` and store it under ``slot``."""
	    worker = UrlDownloadThread(url)
	    self.pool[slot] = worker
	    worker.start()

	  def get_next_url(self):
	    """Return the next non-blank URL, or an empty string when none is left.

	    Surrounding whitespace (including the line terminator) is stripped and
	    blank lines are skipped; ``count`` grows by one per URL returned.
	    """
	    if self.stream.closed:
	      return ''

	    while True:
	      line = self.stream.readline()
	      if not line:
	        # End of file.
	        return line
	      url = line.strip()
	      if url:
	        self.count += 1
	        return url

	# Script entry: crawl the recipe URL list with five concurrent threads.
	CacheCrawler(number=5, file="urls/recipes.txt").start()