@copyninja
Created October 19, 2010 11:38
A simple web crawler for collecting words and links of a given language from the WWW
#!/usr/local/bin/python3

import re
import sys
import sqlite3
from urllib.request import urlopen, Request
from urllib.error import HTTPError, URLError
from multiprocessing import Process

# TODO Fill this table with other language regexps
lang_regexps = {
    "kn_IN": '([\u0c80-\u0cf2]+)[^\u0c80-\u0cf2]*',
    "ml_IN": '([\u0d00-\u0d7f]+)[^\u0d00-\u0d7f]*'
}
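# For instance, Devanagari for hi_IN could be added like the entries above.
# The block range U+0900-U+097F is the standard Unicode Devanagari block,
# but treating that whole block as one hi_IN regexp is only an illustrative
# sketch, not part of the original gist:
#   "hi_IN": '([\u0900-\u097f]+)[^\u0900-\u097f]*'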
def get_regexp(language):
    """Return the word-matching regexp for the given language code, or None."""
    if language:
        return lang_regexps.get(language)
    return None
class Crawler(object):
    """
    A simple web crawler which extracts words of a given language and the
    links found on a page, storing both in a SQLite database (crawler.db).
    """

    def __init__(self, finish_callback, url, lang):
        """
        finish_callback : function called with the URL once crawling of
                          that URL has finished
        url             : URL of the page to crawl
        lang            : language code (a key of lang_regexps)
        """
        self.callback = finish_callback
        self.urlregexp = re.compile(r'<a\s+[^>]*?href=[\'"](.*?)[\'"]',
                                    re.IGNORECASE)
        # TODO: create a regexp lookup for each Indian language and get
        # the regexp from there
        word_pattern = get_regexp(lang)
        if not word_pattern:
            raise ValueError("No word regexp for language: {}".format(lang))
        self.wordregexp = re.compile(word_pattern)
        self.url = url
        self.db_connection = None
        self.lang = lang
        self.domain = self.__getdomain__()
    def crawl(self):
        if not self.db_connection:
            self.__initialize_db__()
        response = None
        self.__update_url()
        try:
            request = Request(url=self.url)
            request.add_header("User-Agent", "Python 3.0 Crawler")
            response = urlopen(request).read().decode("utf-8")
        except HTTPError as e:
            print("Something went wrong. I got error code {}".format(e.code))
            sys.exit(1)
        except URLError as u:
            print("Something is wrong. Error message: {}".format(u.reason))
            sys.exit(1)
        except Exception as ex:
            print("Grrr! Something is really wrong: {}".format(ex))
        if response:
            # Extract words and links in parallel. Each worker opens its
            # own SQLite connection, since connections cannot be shared
            # across processes.
            p1 = Process(target=self.__processwords__, args=(response,))
            p2 = Process(target=self.__processlinks__, args=(response,))
            p1.start()
            p2.start()
            p1.join()
            p2.join()
        self.callback(self.url)
    def close(self):
        if self.db_connection:
            self.db_connection.close()
    def __initialize_db__(self):
        self.db_connection = sqlite3.connect("crawler.db")
        c = self.db_connection.cursor()
        c.execute("""
            create table if not exists links (
                link text not null primary key,
                last_parsed datetime default null);
        """)
        c.execute("""
            create table if not exists words (
                word text not null primary key,
                lang text not null)
        """)
        self.db_connection.commit()
        c.close()
    def __processwords__(self, response):
        words_list = self.wordregexp.findall(response)
        if words_list:
            # Runs in a child process, so open a fresh connection instead
            # of reusing the parent's.
            connection = sqlite3.connect("crawler.db")
            c = connection.cursor()
            for word in sorted(words_list):
                try:
                    c.execute("insert into words values (?,?)",
                              (word, self.lang))
                except sqlite3.IntegrityError:
                    # Word already present (primary key violation); skip it
                    pass
            connection.commit()
            c.close()
            connection.close()
    def __processlinks__(self, response):
        links_list = self.urlregexp.findall(response)
        if links_list:
            # Runs in a child process, so open a fresh connection.
            connection = sqlite3.connect("crawler.db")
            c = connection.cursor()
            for link in links_list:
                if link.startswith("#"):
                    # Fragment, not a URL
                    continue
                elif link.find(self.domain) == -1:
                    if not link.startswith("http://"):
                        # Relative link on this site; make it absolute
                        if not link.startswith("/"):
                            link = "/" + link
                        link = "http://" + self.domain + link
                    else:
                        # External link, not interested; move on
                        continue
                try:
                    c.execute("insert into links (link) values (?)", (link,))
                except sqlite3.IntegrityError:
                    # Link already recorded
                    pass
            connection.commit()
            c.close()
            connection.close()
    def __update_url(self):
        c = self.db_connection.cursor()
        # sqlite3 does not set rowcount for SELECTs, so fetch a row instead
        c.execute("select * from links where link = ?", (self.url,))
        if c.fetchone() is None:
            c.execute("insert into links values (?, date('now'))",
                      (self.url,))
        else:
            c.execute("update links set last_parsed = date('now')"
                      " where link = ?", (self.url,))
        self.db_connection.commit()
        c.close()
    def __getdomain__(self):
        http_split = self.url.split("http://")
        domain = ""
        if len(http_split) == 1:
            # The given URL doesn't begin with http://
            domain = self.url.split("/")[0]
            self.url = "http://" + self.url
        else:
            domain = http_split[1].split("/")[0]
        return domain
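
# Example usage: a minimal sketch, not part of the original gist. It crawls
# the Kannada Wikipedia front page once; the crawl_done callback is an
# illustrative stand-in for whatever drives the crawl loop.
if __name__ == "__main__":
    def crawl_done(url):
        print("Finished crawling {}".format(url))

    crawler = Crawler(crawl_done, "http://kn.wikipedia.org", "kn_IN")
    crawler.crawl()
    crawler.close()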