A simple web crawler for collecting words and links of a given language from the WWW
#!/usr/local/bin/python3

import re
import sys
import sqlite3
from urllib.request import urlopen, Request
from urllib.error import HTTPError, URLError
from multiprocessing import Process

# TODO Fill this table with other language regexps
lang_regexps = {
    "kn_IN": '([\u0c80-\u0cf2]+)[^\u0c80-\u0cf2]*',
    "ml_IN": '([\u0d00-\u0d7f]+)[^\u0d00-\u0d7f]*'
}
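
# A sketch of how the table above could be extended (this entry is an
# assumption, not part of the original gist): Devanagari occupies the
# Unicode range U+0900-U+097F, so a Hindi entry could be added as
#     "hi_IN": '([\u0900-\u097f]+)[^\u0900-\u097f]*'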

def get_regexp(language):
    """Return the word-matching regexp for the given locale, or None."""
    if language:
        return lang_regexps.get(language, None)
    return None

class Crawler(object):
    """Crawl a single URL, storing words of the given language and
    same-domain links in an SQLite database."""

    def __init__(self, finish_callback, url, lang):
        """
        finish_callback : function called with the URL once crawling
                          of that URL has finished
        """
        self.callback = finish_callback
        self.urlregexp = re.compile(r'<a\s*\S*href=[\'"](.*?)[\'"].*?>')
        # TODO create a regex look up for each indian language and get regex from there
        self.wordregexp = re.compile(get_regexp(lang))
        self.url = url
        self.db_connection = None
        self.lang = lang
        self.domain = self.__getdomain__()

    def crawl(self):
        if not self.db_connection:
            self.__initialize_db__()
        response = None
        self.__update_url()
        try:
            request = Request(url=self.url)
            request.add_header("User-Agent", "Python 3.0 Crawler")
            response = urlopen(request).read().decode("utf-8")
        except HTTPError as e:
            print("Something went wrong. I got error code {}".format(e.code))
            sys.exit(1)
        except URLError as u:
            print("Something is wrong. Error Message: {}".format(u.reason))
            sys.exit(1)
        except Exception:
            print("Grrr! Something is really wrong")

        if response:
            # Extract words and links from the page in parallel
            p1 = Process(target=self.__processwords__, args=(response,))
            p2 = Process(target=self.__processlinks__, args=(response,))
            p1.start()
            p2.start()
            p1.join()
            p2.join()
            self.callback(self.url)

    def close(self):
        if self.db_connection:
            self.db_connection.close()

    def __initialize_db__(self):
        self.db_connection = sqlite3.connect("crawler.db")
        c = self.db_connection.cursor()
        c.execute("""
            create table if not exists links (
                link text not null primary key,
                last_parsed datetime default null);
            """)
        c.execute("""
            create table if not exists words (
                word text not null primary key,
                lang text not null)
            """)
        self.db_connection.commit()
        c.close()

    def __processwords__(self, response):
        words_list = self.wordregexp.findall(response)
        if words_list:
            c = self.db_connection.cursor()
            for word in sorted(words_list):
                try:
                    c.execute("insert into words values (?,?)", (word, self.lang))
                except sqlite3.IntegrityError:
                    # Word already stored (primary key violation); skip it
                    pass
            self.db_connection.commit()
            c.close()

    def __processlinks__(self, response):
        links_list = self.urlregexp.findall(response)
        if links_list:
            c = self.db_connection.cursor()
            for link in links_list:
                if link.startswith("#"):
                    # Fragment, not a URL
                    continue
                elif link.find(self.domain) == -1:
                    if not link.startswith("http://"):
                        # Relative link; make it absolute within this domain
                        link = "http://" + self.domain + link
                    else:
                        # External link; not interested, move on
                        continue
                try:
                    c.execute("insert into links (link) values (?)", (link,))
                except sqlite3.IntegrityError:
                    # Link already recorded
                    pass
            self.db_connection.commit()
            c.close()

    def __update_url(self):
        c = self.db_connection.cursor()
        # sqlite3 does not report rowcount for SELECT, so check fetchone()
        c.execute("select * from links where link = ?", (self.url,))
        if c.fetchone() is None:
            c.execute("insert into links values (?, date('now'))", (self.url,))
        else:
            c.execute("update links set last_parsed = date('now') where link = ?",
                      (self.url,))
        self.db_connection.commit()
        c.close()

    def __getdomain__(self):
        http_split = self.url.split("http://")
        domain = ""
        if len(http_split) == 1:
            # The given url doesn't begin with http://
            domain = self.url.split("/")[0]
            self.url = "http://" + self.url
        else:
            domain = http_split[1].split("/")[0]
        return domain
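
# The gist does not include a driver. A minimal usage sketch follows; the seed
# URL, callback name and language code are illustrative assumptions, not part
# of the original code.
if __name__ == "__main__":
    def crawl_finished(url):
        print("Finished crawling {}".format(url))

    crawler = Crawler(crawl_finished, "http://example.com/", "kn_IN")
    crawler.crawl()
    crawler.close()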