A simple web crawler for collecting words and links of a given language from the WWW
#!/usr/local/bin/python3

import re
import sys
import sqlite3
from urllib.request import urlopen, Request
from urllib.error import HTTPError, URLError
from multiprocessing import Process

# TODO Fill this table with other language regexps
lang_regexps = {
    "kn_IN": '([\u0c80-\u0cf2]+)[^\u0c80-\u0cf2]*',
    "ml_IN": '([\u0d00-\u0d7f]+)[^\u0d00-\u0d7f]*'
}
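
# A sketch of how the table above could be extended (this entry is an
# assumption, not part of the original gist): Devanagari occupies the
# Unicode range U+0900-U+097F, so a Hindi entry could be added as
#     "hi_IN": '([\u0900-\u097f]+)[^\u0900-\u097f]*'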

def get_regexp(language):
    """Return the word-matching regexp for the given locale, or None."""
    if language:
        return lang_regexps.get(language, None)
    return None

class Crawler(object):
    """Crawl a single URL, storing words of the given language and
    same-domain links in an SQLite database."""

    def __init__(self, finish_callback, url, lang):
        """
        finish_callback : function called with the URL once crawling
                          of that URL has finished
        """
        self.callback = finish_callback
        self.urlregexp = re.compile(r'<a\s*\S*href=[\'"](.*?)[\'"].*?>')
        # TODO create a regex look up for each indian language and get regex from there
        self.wordregexp = re.compile(get_regexp(lang))
        self.url = url
        self.db_connection = None
        self.lang = lang
        self.domain = self.__getdomain__()

    def crawl(self):
        if not self.db_connection:
            self.__initialize_db__()
        response = None
        self.__update_url()
        try:
            request = Request(url=self.url)
            request.add_header("User-Agent", "Python 3.0 Crawler")
            response = urlopen(request).read().decode("utf-8")
        except HTTPError as e:
            print("Something went wrong. I got error code {}".format(e.code))
            sys.exit(1)
        except URLError as u:
            print("Something is wrong. Error Message: {}".format(u.reason))
            sys.exit(1)
        except Exception:
            print("Grrr! Something is really wrong")

        if response:
            # Extract words and links from the page in parallel
            p1 = Process(target=self.__processwords__, args=(response,))
            p2 = Process(target=self.__processlinks__, args=(response,))
            p1.start()
            p2.start()
            p1.join()
            p2.join()
            self.callback(self.url)

    def close(self):
        if self.db_connection:
            self.db_connection.close()

    def __initialize_db__(self):
        self.db_connection = sqlite3.connect("crawler.db")
        c = self.db_connection.cursor()
        c.execute("""
            create table if not exists links (
                link text not null primary key,
                last_parsed datetime default null);
            """)
        c.execute("""
            create table if not exists words (
                word text not null primary key,
                lang text not null)
            """)
        self.db_connection.commit()
        c.close()

    def __processwords__(self, response):
        words_list = self.wordregexp.findall(response)
        if words_list:
            c = self.db_connection.cursor()
            for word in sorted(words_list):
                try:
                    c.execute("insert into words values (?,?)", (word, self.lang))
                except sqlite3.IntegrityError:
                    # Word already stored (primary key violation); skip it
                    pass
            self.db_connection.commit()
            c.close()

    def __processlinks__(self, response):
        links_list = self.urlregexp.findall(response)
        if links_list:
            c = self.db_connection.cursor()
            for link in links_list:
                if link.startswith("#"):
                    # Fragment, not a URL
                    continue
                elif link.find(self.domain) == -1:
                    if not link.startswith("http://"):
                        # Relative link; make it absolute within this domain
                        link = "http://" + self.domain + link
                    else:
                        # External link; not interested, move on
                        continue
                try:
                    c.execute("insert into links (link) values (?)", (link,))
                except sqlite3.IntegrityError:
                    # Link already recorded
                    pass
            self.db_connection.commit()
            c.close()

    def __update_url(self):
        c = self.db_connection.cursor()
        # sqlite3 does not report rowcount for SELECT, so check fetchone()
        c.execute("select * from links where link = ?", (self.url,))
        if c.fetchone() is None:
            c.execute("insert into links values (?, date('now'))", (self.url,))
        else:
            c.execute("update links set last_parsed = date('now') where link = ?",
                      (self.url,))
        self.db_connection.commit()
        c.close()

    def __getdomain__(self):
        http_split = self.url.split("http://")
        domain = ""
        if len(http_split) == 1:
            # The given url doesn't begin with http://
            domain = self.url.split("/")[0]
            self.url = "http://" + self.url
        else:
            domain = http_split[1].split("/")[0]
        return domain
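
# The gist does not include a driver. A minimal usage sketch follows; the seed
# URL, callback name and language code are illustrative assumptions, not part
# of the original code.
if __name__ == "__main__":
    def crawl_finished(url):
        print("Finished crawling {}".format(url))

    crawler = Crawler(crawl_finished, "http://example.com/", "kn_IN")
    crawler.crawl()
    crawler.close()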