Created
September 28, 2012 18:57
-
-
Save turicas/3801534 to your computer and use it in GitHub Desktop.
Simple script that shows some encoding problems in NLTK's corpora
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# coding: utf-8 | |
import nltk | |
from nltk.corpus import stopwords, machado, shakespeare | |
def get_available_corpora():
    """Collect the corpus objects exposed as attributes of ``nltk.corpus``.

    Walks ``dir(nltk.corpus)``, skips every name that starts with an
    underscore, and keeps the attributes whose type, rendered with
    ``str(type(obj))``, starts with ``<class 'nltk.corpus.``.

    Returns:
        A list of ``(attribute_name, attribute_value)`` tuples, in the order
        produced by ``dir()``.
    """
    corpora = []
    for name in dir(nltk.corpus):
        if name.startswith('_'):
            continue
        # Look the attribute up once and reuse it for both the type check
        # and the result tuple.
        candidate = getattr(nltk.corpus, name)
        if str(type(candidate)).startswith("<class 'nltk.corpus."):
            corpora.append((name, candidate))
    return corpora
def get_types_of_words(corpus):
    """Return the distinct Python types of the words found in *corpus*.

    Args:
        corpus: An object providing ``fileids()`` and ``words(file_id)``.

    Returns:
        A list holding each distinct ``type(word)`` seen across all file ids.
        The list is built from a set, so its order is unspecified; it is
        empty when the corpus has no file ids or no words.
    """
    types = set()
    for file_id in corpus.fileids():
        types.update(type(word) for word in corpus.words(file_id))
    return list(types)
def main():
    """Report the word types and raw-text type of every available corpus.

    Prints the NLTK version, then one line per corpus returned by
    ``get_available_corpora()``: either the list of word types and the type
    of ``corpus.raw()``, or ``<name>: interface problem!`` when inspecting
    the corpus raises an exception.
    """
    # Each print() call takes a single pre-formatted string, so the output is
    # the same as the former Python 2 print statements (which inserted one
    # extra space between comma-separated items) and the code also parses
    # under Python 3.
    print('NLTK version: {}'.format(nltk.__version__))
    corpora = get_available_corpora()
    while corpora:
        # NOTE(review): pop(0) drops the list's reference to each corpus as
        # it is processed -- presumably so loaded corpora can be freed;
        # confirm before switching to a plain for loop.
        corpus_name, corpus = corpora.pop(0)
        try:
            types_of_words = get_types_of_words(corpus)
            raw_type = type(corpus.raw())
        except Exception:
            # Not a bare ``except:``, so KeyboardInterrupt and SystemExit
            # propagate instead of being reported as a corpus problem.
            print('{}: interface problem!'.format(corpus_name))
        else:
            print('{}:  words:  {}  raw:  {}'.format(
                corpus_name, types_of_words, raw_type))
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NLTK version: 2.0.1rc4 | |
abc: words: [<type 'str'>] raw: <type 'str'> | |
alpino: words: [<type 'str'>] raw: <type 'str'> | |
brown: words: [<type 'str'>] raw: <type 'str'> | |
cess_cat: words: [<type 'str'>] raw: <type 'str'> | |
cess_esp: words: [<type 'str'>] raw: <type 'str'> | |
cmudict: interface problem! | |
comtrans: words: [<type 'str'>] raw: <type 'str'> | |
conll2000: words: [<type 'str'>] raw: <type 'str'> | |
conll2002: words: [<type 'unicode'>] raw: <type 'unicode'> | |
conll2007: interface problem! | |
dependency_treebank: words: [<type 'str'>] raw: <type 'str'> | |
floresta: words: [<type 'str'>] raw: <type 'str'> | |
gazetteers: words: [<type 'str'>] raw: <type 'str'> | |
genesis: words: [<type 'unicode'>] raw: <type 'unicode'> | |
gutenberg: words: [<type 'str'>] raw: <type 'str'> | |
ieer: interface problem! | |
inaugural: words: [<type 'str'>] raw: <type 'str'> | |
indian: words: [<type 'str'>] raw: <type 'str'> | |
ipipan: interface problem! | |
jeita: words: [<type 'unicode'>] raw: <type 'unicode'> | |
knbc: words: [<type 'unicode'>] raw: <type 'unicode'> | |
mac_morpho: words: [<type 'unicode'>] raw: <type 'unicode'> | |
machado: words: [<type 'unicode'>] raw: <type 'unicode'> | |
movie_reviews: words: [<type 'str'>] raw: <type 'str'> | |
names: words: [<type 'str'>] raw: <type 'str'> | |
nombank: interface problem! | |
nps_chat: words: [<type 'str'>] raw: <type 'str'> | |
pl196x: words: [<type 'str'>] raw: <type 'str'> | |
ppattach: interface problem! | |
propbank: interface problem! | |
qc: interface problem! | |
reuters: words: [<type 'str'>] raw: <type 'str'> | |
rte: words: [<type 'unicode'>, <type 'str'>] raw: <type 'str'> | |
semcor: words: [<type 'str'>] raw: <type 'str'> | |
senseval: interface problem! | |
shakespeare: words: [<type 'str'>] raw: <type 'str'> | |
sinica_treebank: words: [<type 'str'>] raw: <type 'str'> | |
state_union: words: [<type 'str'>] raw: <type 'str'> | |
stopwords: words: [<type 'str'>] raw: <type 'str'> | |
swadesh: words: [<type 'str'>] raw: <type 'str'> | |
switchboard: interface problem! | |
timit: interface problem! | |
timit_tagged: words: [<type 'str'>] raw: <type 'str'> | |
toolbox: interface problem! | |
treebank: words: [<type 'str'>] raw: <type 'str'> | |
treebank_chunk: words: [<type 'str'>] raw: <type 'str'> | |
treebank_raw: words: [<type 'str'>] raw: <type 'str'> | |
udhr: interface problem! | |
verbnet: words: [<type 'str'>] raw: <type 'str'> | |
webtext: words: [<type 'str'>] raw: <type 'str'> | |
wordnet: interface problem! | |
wordnet_ic: interface problem! | |
words: words: [<type 'str'>] raw: <type 'str'> | |
ycoe: interface problem! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment