Skip to content

Instantly share code, notes, and snippets.

@Fluxx
Last active December 15, 2015 21:09
Show Gist options
  • Select an option

  • Save Fluxx/5323263 to your computer and use it in GitHub Desktop.

Select an option

Save Fluxx/5323263 to your computer and use it in GitHub Desktop.
4a5,7
> import Queue
> import threading
> import heapq
13,16c16,29
< class WebSpider(object):
< def __init__(self):
< self.results = {}
<
---
> class WebSpider(threading.Thread):
> def __init__(self, urls, results):
> threading.Thread.__init__(self)
> self.results = results
> self.urls = urls
>
>
> def run(self):
> while True:
> try:
> self.fetch(self.urls.get(False))
> except Queue.Empty:
> break
>
26c39,40
< self.results[url] = resp.read()
---
> self.results.put((url, resp.read()), True)
> #self.results[url] = resp.read()
30,36c44,48
< def fetch_all(self, iterable):
< for url in iterable:
< self.fetch(url)
<
< def most_vowels(self, num=10):
< """
< Return a list of the URLs which had the most vowels.
---
> #def fetch_all(self, iterable):
> # for url in iterable:
> # self.fetch(url)
>
>
38c50,55
< Each item in the list is:
---
>
> def most_vowels(self, num=10):
> """
> Return a list of the URLs which had the most vowels.
>
> Each item in the list is:
41,43c58,66
< """
< scored_urls = [(u, b, count_vowels(b)) for u, b in self.results.items()]
< return sorted(scored_urls, key=lambda x: x[-1], reverse=True)[:num]
---
> """
> #scored_urls = [(u, b, count_vowels(b)) for u, b in self.results.items()]
> #return sorted(scored_urls, key=lambda x: x[-1], reverse=True)[:num]
> h = heapq.heapq()
> # TODO: limit the heap size to num entries
> for u,b in results:
> h.heappush((u, b))
>
> # TODO: return the heap contents
47c70
< urls = []
---
> urls = Queue.Queue()
51c74
< urls.append(line.strip())
---
> urls.put(line.strip())
53,54c76,86
< spider = WebSpider()
< spider.fetch_all(urls)
---
> results = Queue.Queue()
> spiders = set([WebSpider(urls, results) for _ in range(100)])
> for s in spiders:
> s.run()
> while len(spiders) > 0:
> if s.is_alive():
> del spiders[s]
> try:
> s.join()
> except RuntimeError: pass
>
59,60c91,92
< for url, content, score in spider.most_vowels(10):
< print '%-35s %-50s %d' % (url, content, score)
---
> #for url, content, score in spider.most_vowels(10):
> # print '%-35s %-50s %d' % (url, content, score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment