-
-
Save srikanthlogic/2900594 to your computer and use it in GitHub Desktop.
| # -*- coding: utf-8 -*- | |
| # Scraper to get words from the Tamil Lexicon | |
| # By Srikanth Logic. ([email protected]) | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| from BeautifulSoup import BeautifulSoup | |
| import html2text | |
| import urllib2 | |
| import codecs | |
| import csv | |
def getResults(url):
    """Fetch one lexicon page and extract its headwords and meanings.

    Args:
        url: Address of the page to download.

    Returns:
        A dict mapping each headword (taken from the first child of the
        nested <span> inside a <div2> element) to the html2text rendering
        of that element's first <p>. The dict also contains the seed pair
        'word' -> 'meaning'.

    Raises:
        urllib2.URLError: if the page cannot be fetched (this includes
            urllib2.HTTPError for non-success status codes).
    """
    page = urllib2.urlopen(url)
    try:
        markup = page.read()
    finally:
        # Release the connection even when reading the body fails.
        page.close()

    soup = BeautifulSoup(markup)
    word = {'word': 'meaning'}
    # The first <div2> is always skipped; presumably it is a page-level
    # wrapper rather than a dictionary entry -- TODO confirm against the site.
    for oneword in soup.findAll('div2')[1:]:
        outer_span = oneword.find('span')
        inner_span = outer_span.find('span') if outer_span is not None else None
        definition = oneword.find('p')
        if inner_span is None or definition is None or not inner_span.contents:
            # Entry lacks the expected markup: skip it instead of aborting
            # the whole page on an AttributeError/IndexError.
            continue
        headword = inner_span.contents[0].__str__()
        word[headword] = html2text.html2text(definition.__str__())
    return word
| def main(): | |
| #sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) | |
| fl = codecs.open('words.csv','wb','utf-8') | |
| url = 'http://dsal.uchicago.edu/cgi-bin/philologic/getobject.pl?' + 'p.0:4.tamillex' | |
| words_in_page = getResults(url) | |
| writer = csv.writer(fl,delimiter='\t',dialect='excel',quoting=csv.QUOTE_ALL) | |
| for word in words_in_page.keys(): | |
| writer.writerow((word,words_in_page[word])) | |
| print 'Completed Page' + str(url) | |
| fl.close() | |
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Traceback (most recent call last):
File "tla.py", line 50, in <module>
main()
File "tla.py", line 42, in main
words_in_page = getResults(url)
File "tla.py", line 24, in getResults
page = urllib2.urlopen(url)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 500: Internal Server Error
desk-05@desk-05:~/abu$
python dl.py
Traceback (most recent call last):
File "dl.py", line 50, in <module>
main()
File "dl.py", line 42, in main
words_in_page = getResults(url)
File "dl.py", line 25, in getResults
soup = BeautifulSoup(page.read())
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1522, in __init__
BeautifulStoneSoup.__init__(self, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1147, in __init__
self._feed(isHTML=isHTML)
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1189, in _feed
SGMLParser.feed(self, markup)
File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.7/sgmllib.py", line 143, in goahead
k = self.parse_endtag(i)
File "/usr/lib/python2.7/sgmllib.py", line 320, in parse_endtag
self.finish_endtag(tag)
File "/usr/lib/python2.7/sgmllib.py", line 358, in finish_endtag
method = getattr(self, 'end_' + tag)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 10-25: ordinal not in range(128)
I am getting the above error when executing this code.