from pysqlite2 import dbapi2 as sqlite
import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin

# Create a list of words to ignore
ignorewords=set(['the','of','to','and','a','in','is','it'])

class crawler:
  # Initialize the crawler with the name of the database
  def __init__(self,dbname):
    self.con=sqlite.connect(dbname)

  def __del__(self):
    self.con.close()

  def dbcommit(self):
    self.con.commit()

  # Auxiliary function for getting an entry id and adding
  # it if it's not present
  def getentryid(self,table,field,value,createnew=True):
    cur=self.con.execute(
      "select rowid from %s where %s='%s'" % (table,field,value))
    res=cur.fetchone()
    if res==None:
      cur=self.con.execute(
        "insert into %s (%s) values ('%s')" % (table,field,value))
      return cur.lastrowid
    else:
      return res[0]
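  # Building SQL with string formatting, as above, is open to injection
  # from crawled values. A safer variant would pass the value through
  # sqlite's '?' parameter substitution (table and field names cannot be
  # parameterized, so they stay in the format string). A minimal sketch,
  # not part of the original code:
  #
  #   cur=self.con.execute(
  #     "select rowid from %s where %s=?" % (table,field), (value,))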
  # Index an individual page
  def addtoindex(self,url,soup):
    if self.isindexed(url): return
    print 'Indexing '+url

    # Get the individual words
    text=self.gettextonly(soup)
    words=self.separatewords(text)

    # Get the URL id
    urlid=self.getentryid('urllist','url',url)

    # Link each word to this url
    for i in range(len(words)):
      word=words[i]
      if word in ignorewords: continue
      wordid=self.getentryid('wordlist','word',word)
      self.con.execute("insert into wordlocation(urlid,wordid,location) \
        values (%d,%d,%d)" % (urlid,wordid,i))

  # Extract the text from an HTML page (no tags)
  def gettextonly(self,soup):
    v=soup.string
    if v==None:
      c=soup.contents
      resulttext=''
      for t in c:
        subtext=self.gettextonly(t)
        resulttext+=subtext+'\n'
      return resulttext
    else:
      return v.strip()

  # Separate the words by any non-alphanumeric character
  def separatewords(self,text):
    splitter=re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if s!='']
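  # For example, under Python 2 semantics (where re.split ignores
  # zero-length matches), separatewords('Hello, World!') would return
  # ['hello','world'].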
  # Return true if this url is already indexed
  def isindexed(self,url):
    u=self.con.execute(
      "select rowid from urllist where url='%s'" % url).fetchone()
    if u!=None:
      # Check if it has actually been crawled
      v=self.con.execute(
        'select * from wordlocation where urlid=%d' % u[0]).fetchone()
      if v!=None: return True
    return False

  # Add a link between two pages (left as a stub here)
  def addlinkref(self,urlFrom,urlTo,linkText):
    pass

  # Starting with a list of pages, do a breadth
  # first search to the given depth, indexing pages
  # as we go
  def crawl(self,pages,depth=2):
    for i in range(depth):
      newpages=set()
      for page in pages:
        try:
          c=urllib2.urlopen(page)
        except:
          print "Could not open %s" % page
          continue
        soup=BeautifulSoup(c.read())
        self.addtoindex(page,soup)

        links=soup('a')
        for link in links:
          if ('href' in dict(link.attrs)):
            url=urljoin(page,link['href'])
            if url.find("'")!=-1: continue
            url=url.split('#')[0] # remove location portion
            if url[0:4]=='http' and not self.isindexed(url):
              newpages.add(url)
            linkText=self.gettextonly(link)
            self.addlinkref(page,url,linkText)

        self.dbcommit()

      pages=newpages

  # Create the database tables
  def createindextables(self):
    self.con.execute('create table urllist(url)')
    self.con.execute('create table wordlist(word)')
    self.con.execute('create table wordlocation(urlid,wordid,location)')
    self.con.execute('create table link(fromid integer,toid integer)')
    self.con.execute('create table linkwords(wordid,linkid)')
    self.con.execute('create index wordidx on wordlist(word)')
    self.con.execute('create index urlidx on urllist(url)')
    self.con.execute('create index wordurlidx on wordlocation(wordid)')
    self.con.execute('create index urltoidx on link(toid)')
    self.con.execute('create index urlfromidx on link(fromid)')
    self.dbcommit()
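# A minimal usage sketch. The database filename and seed URL below are
# illustrative placeholders, not taken from the code above. Note that
# createindextables() will raise an error if the tables already exist,
# so it should be run only once per database file.
if __name__=='__main__':
  pagelist=['http://example.com/']
  c=crawler('searchindex.db')
  c.createindextables()
  c.crawl(pagelist)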