@hemanth
Created March 29, 2013 18:02
Simple web crawler with Python's Twisted
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Simple parallel web crawler (Python 2).
# Needs: PyOpenSSL and Twisted 12.3+.
import re

from BeautifulSoup import BeautifulSoup
from twisted.internet import defer, task
from twisted.python import log
from twisted.web.client import getPage


def parallel(iterable, count, func, *args, **named):
    # Run func over iterable with `count` concurrent consumers
    # sharing a single generator of Deferreds.
    coop = task.Cooperator()
    work = (func(elem, *args, **named) for elem in iterable)
    return defer.DeferredList([coop.coiterate(work) for i in xrange(count)])


def union(p, q):
    # Print each link in p that is not already in q, and record it.
    for e in p:
        if e not in q:
            print e
            q.append(e)


def extractLinks(html):
    # Collect every absolute http:// href on the page.
    soup = BeautifulSoup(html)
    return [str(anchor['href'])
            for anchor in soup.findAll('a', attrs={'href': re.compile("^http://")})
            if anchor['href']]


def crawlPage(url, urlList):
    d = getPage(url)
    d.addCallback(extractLinks)
    d.addCallback(union, urlList)
    d.addErrback(log.err)
    return d


def crawler(urls):
    # Leftover stub: defined but never called anywhere.
    urls = list(urls)


def main(reactor, *args):
    urls = list(args)
    return parallel(urls, len(urls), crawlPage, urls)


if __name__ == '__main__':
    task.react(main, ["http://h3manth.com", "http://www.test.com"])  # can pass any list of URLs
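
The gist above is Python 2 era: twisted.web.client.getPage has since been removed from Twisted, and the bare BeautifulSoup import belongs to BeautifulSoup 3. Below is a rough Python 3 sketch of the same Cooperator-based parallel crawl; treq and beautifulsoup4 are assumed substitutes (pip install treq beautifulsoup4), not part of the original gist. Only the page-fetching layer changes; the callback chain stays the same shape.

#!/usr/bin/env python3
# Python 3 sketch of the crawl above, assuming treq and beautifulsoup4.
import re

import treq
from bs4 import BeautifulSoup
from twisted.internet import defer, task
from twisted.python import log


def parallel(iterable, count, func, *args, **named):
    # `count` concurrent consumers share one generator of Deferreds.
    coop = task.Cooperator()
    work = (func(elem, *args, **named) for elem in iterable)
    return defer.DeferredList([coop.coiterate(work) for _ in range(count)])


def extractLinks(html):
    soup = BeautifulSoup(html, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=re.compile(r"^https?://"))]


def union(links, seen):
    for link in links:
        if link not in seen:
            print(link)
            seen.append(link)


def crawlPage(url, seen):
    d = treq.get(url)
    d.addCallback(treq.text_content)  # Deferred firing with the body as str
    d.addCallback(extractLinks)
    d.addCallback(union, seen)
    d.addErrback(log.err)  # log and swallow per-page failures
    return d


def main(reactor, *urls):
    urls = list(urls)
    return parallel(urls, len(urls), crawlPage, urls)


if __name__ == "__main__":
    task.react(main, ["http://h3manth.com", "http://www.test.com"])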
@theSage21
I do not understand why crawler(urls) is there.
