# -*- coding: utf-8 -*-
# Scraper to get words from Tamil Lexicon
# By Srikanth Logic. ([email protected])
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from BeautifulSoup import BeautifulSoup
import html2text
import urllib2
import codecs
import csv


def getResults(url):
    # Fetch one Tamil Lexicon page and return a dict mapping each headword to its meaning.
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read())
    # Every dictionary entry on the page is wrapped in a <div2> element.
    wordlist_in_page = soup.findAll('div2')
    word = {}
    # Skip the first <div2>, which holds the page header rather than an entry.
    for oneword in wordlist_in_page[1:]:
        # The headword sits in a nested <span>; its meaning is in the adjacent <p>.
        headword = oneword.find('span').find('span').contents[0].__str__()
        word[headword] = html2text.html2text(oneword.find('p').__str__())
    return word


def main():
    #sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
    fl = codecs.open('words.csv', 'wb', 'utf-8')
    url = 'http://dsal.uchicago.edu/cgi-bin/philologic/getobject.pl?' + 'p.0:4.tamillex'
    words_in_page = getResults(url)
    # Write the scraped entries as a tab-delimited, fully quoted CSV.
    writer = csv.writer(fl, delimiter='\t', dialect='excel', quoting=csv.QUOTE_ALL)
    for word in words_in_page.keys():
        writer.writerow((word, words_in_page[word]))
    print 'Completed Page ' + str(url)
    fl.close()


if __name__ == "__main__":
    main()
Traceback (most recent call last):
File "tla.py", line 50, in
main()
File "tla.py", line 42, in main
words_in_page = getResults(url)
File "tla.py", line 24, in getResults
page = urllib2.urlopen(url)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 500: Internal Server Error
desk-05@desk-05:~/abu$ python dl.py
Traceback (most recent call last):
File "dl.py", line 50, in
main()
File "dl.py", line 42, in main
words_in_page = getResults(url)
File "dl.py", line 25, in getResults
soup = BeautifulSoup(page.read())
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1522, in init
BeautifulStoneSoup.init(self, _args, *_kwargs)
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1147, in init
self._feed(isHTML=isHTML)
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1189, in feed
SGMLParser.feed(self, markup)
File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.7/sgmllib.py", line 143, in goahead
k = self.parse_endtag(i)
File "/usr/lib/python2.7/sgmllib.py", line 320, in parse_endtag
self.finish_endtag(tag)
File "/usr/lib/python2.7/sgmllib.py", line 358, in finish_endtag
method = getattr(self, 'end' + tag)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 10-25: ordinal not in range(128)
I am getting the above errors when executing this code.
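
The UnicodeEncodeError is raised inside sgmllib, which BeautifulSoup 3 uses as its parser and which cannot handle end tags containing non-ASCII characters. A possible workaround, not verified against this page, is to parse with BeautifulSoup 4 (the bs4 package), which does not go through sgmllib; the sketch below reworks getResults() under that assumption, keeping the same div2/span/p selectors as the original script.

# Hedged workaround sketch (assumes BeautifulSoup 4 / bs4 is installed).
# bs4 does not rely on sgmllib, so non-ASCII tag names in the page should not
# trigger the UnicodeEncodeError seen above. The div2/span/p layout of the
# Lexicon pages is assumed to be unchanged from the original script.
import urllib2
import html2text
from bs4 import BeautifulSoup

def getResults(url):
    page = urllib2.urlopen(url)
    # Decode explicitly so the parser receives unicode rather than raw bytes.
    markup = page.read().decode('utf-8', 'replace')
    soup = BeautifulSoup(markup, 'html.parser')
    words = {}
    for entry in soup.find_all('div2')[1:]:   # skip the page-header <div2>
        headword = entry.find('span').find('span').get_text()
        words[headword] = html2text.html2text(unicode(entry.find('p')))
    return words

The first traceback (HTTP Error 500) appears to come back from the dsal.uchicago.edu server itself, so no client-side change will make that request succeed; wrapping urllib2.urlopen() in a try/except urllib2.HTTPError to retry or skip the page is about all the script can do there.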