Skip to content

Instantly share code, notes, and snippets.

@copyninja
Created November 14, 2011 11:04
Show Gist options
  • Save copyninja/1363736 to your computer and use it in GitHub Desktop.
Save copyninja/1363736 to your computer and use it in GitHub Desktop.
A meaning extractor for Kannada Wiktionary
#!/usr/bin/python
from BeautifulSoup import BeautifulSoup
import urllib2
import tablib
# Module-level list; nothing in the visible code reads from or appends to it.
wiktionary_words = []
def parse_word(html_data):
    """Extract the headword, its type and its meanings from an entry page.

    Parameters:
        html_data: HTML markup of a single entry page.

    Returns:
        A ``(word, word_type, def_list)`` tuple:
        - ``word``: string of the first ``<b>`` inside the first ``<p>`` of
          the ``mw-content-ltr`` content div;
        - ``word_type``: string of the first ``mw-headline`` span;
        - ``def_list``: one entry per ``<li>`` of the first ``<ol>``, taken
          from the item's first link, or from the item itself when it has
          no link. Empty when the page has no ``<ol>``.

    Raises:
        AttributeError: if the content div, the headword ``<p>``/``<b>``,
            or the headline span is missing (``find`` returned ``None``).
    """
    soup = BeautifulSoup(html_data)
    main_content = soup.find("div", {
        "class": "mw-content-ltr",
        "dir": "ltr",
    })

    word = main_content.find("p").find("b").string
    word_type = main_content.find("span", {
        "class": "mw-headline",
    }).string

    def_list = []
    meaning_list = main_content.find("ol")
    if meaning_list is not None:
        for li in meaning_list.findAll("li"):
            anchor = li.find("a")
            # Not every list item is guaranteed to wrap its text in a link.
            source = anchor if anchor is not None else li
            def_list.append(source.string)

    # Return the extracted strings, not the raw <ol> tag.
    return (word, word_type, def_list)
if __name__ == "__main__":
    # Parse the sample page 'abandonee.htm' from the working directory and
    # write the resulting single-row dataset to 'output.json'.
    ds = tablib.Dataset(headers=['word', 'type', 'meaning'])

    # Context managers make sure both file handles are closed.
    with open('abandonee.htm') as page:
        ds.append(parse_word(page.read()))

    with open('output.json', 'wb') as out:
        # Export the dataset built above (the original referenced an
        # undefined name here).
        out.write(ds.json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment