Created
June 9, 2012 11:14
-
-
Save srikanthlogic/2900594 to your computer and use it in GitHub Desktop.
Tamil Lexicon Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Scraper to get words from the Tamil Lexicon
# By Srikanth Logic. ([email protected])
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from BeautifulSoup import BeautifulSoup | |
import html2text | |
import urllib2 | |
import codecs | |
import csv | |
def getResults(url):
    """Fetch one lexicon page and map each headword to its meaning.

    The page at ``url`` is downloaded and parsed with BeautifulSoup. Every
    ``<div2>`` element except the first is treated as one entry: the
    headword is the first child of the ``<span>`` nested inside the entry's
    first ``<span>``, and the meaning is the entry's first ``<p>`` converted
    to plain text with ``html2text``.

    Args:
        url: Address of the lexicon page to scrape.

    Returns:
        A dict mapping headword to meaning text. It always contains the
        seed pair ``'word': 'meaning'``, which callers can use as a header
        row. Entries without a headword are skipped; entries without a
        ``<p>`` get an empty meaning.

    Raises:
        urllib2.URLError: If the page cannot be fetched (``HTTPError`` for
            non-2xx responses is a subclass).
    """
    page = urllib2.urlopen(url)
    try:
        markup = page.read()
    finally:
        # Release the connection even when reading the body fails.
        page.close()

    soup = BeautifulSoup(markup)

    # Seed pair kept for backward compatibility with callers that write the
    # whole dict out as rows.
    word = {'word': 'meaning'}

    # The first <div2> is deliberately skipped, as the original code did.
    # NOTE(review): presumably it is a page header rather than an entry -
    # confirm against the live markup.
    for oneword in soup.findAll('div2')[1:]:
        outer_span = oneword.find('span')
        headword_span = outer_span.find('span') if outer_span is not None else None
        if headword_span is None or not headword_span.contents:
            # No usable headword: skip instead of raising AttributeError /
            # IndexError and losing the rest of the page.
            continue

        meaning_tag = oneword.find('p')
        if meaning_tag is None:
            # Avoid storing the literal text "None" as a meaning.
            meaning = ''
        else:
            meaning = html2text.html2text(meaning_tag.__str__())

        word[headword_span.contents[0].__str__()] = meaning
    return word
def main():
    """Scrape one Tamil Lexicon page and write its entries to ``words.csv``.

    The page is fetched first and the output file is opened only after the
    fetch succeeded, so a failed request (for example an HTTP 500) no longer
    truncates an existing ``words.csv``. Rows are written as tab-delimited,
    fully quoted ``(word, meaning)`` pairs, and a completion message is
    printed at the end.
    """
    url = 'http://dsal.uchicago.edu/cgi-bin/philologic/getobject.pl?' + 'p.0:4.tamillex'
    words_in_page = getResults(url)

    # The context manager closes the file even if writing a row raises.
    with codecs.open('words.csv', 'wb', 'utf-8') as fl:
        writer = csv.writer(fl, delimiter='\t', dialect='excel', quoting=csv.QUOTE_ALL)
        # NOTE(review): rows follow dict iteration order, so the
        # 'word'/'meaning' seed pair is not guaranteed to be the first row.
        for word, meaning in words_in_page.items():
            writer.writerow((word, meaning))

    # Single-argument call form prints the same text on Python 2 and 3.
    print('Completed Page' + str(url))
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Traceback (most recent call last):
File "tla.py", line 50, in
main()
File "tla.py", line 42, in main
words_in_page = getResults(url)
File "tla.py", line 24, in getResults
page = urllib2.urlopen(url)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 500: Internal Server Error
desk-05@desk-05:~/abu$