Forked from turicas/nltk_bug_unicode_corpora.py
Last active
December 10, 2015 14:18
-
-
Save kmike/4447068 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# coding: utf-8 | |
import nltk | |
def get_available_corpora():
    """Yield ``(name, corpus)`` pairs for the corpus objects in ``nltk.corpus``.

    Every public attribute of ``nltk.corpus`` (names not starting with an
    underscore) is inspected; it is yielded only when the string form of its
    type starts with ``<class 'nltk.corpus.``, i.e. when the object's class
    is defined somewhere under the ``nltk.corpus`` package.
    """
    corpus_class_prefix = "<class 'nltk.corpus."
    for attr_name in dir(nltk.corpus):
        if attr_name[0] != '_':
            candidate = getattr(nltk.corpus, attr_name)
            if str(type(candidate)).startswith(corpus_class_prefix):
                yield attr_name, candidate
def get_types_of_words(corpus):
    """Return the distinct Python types of the words found in ``corpus``.

    ``corpus`` must provide ``fileids()`` and ``words(file_id)``; every word
    of every file id is visited. The result is a list built from a set, so
    each type appears once and the order is unspecified. Any exception
    raised by ``corpus`` propagates to the caller.
    """
    distinct_types = set(
        type(token)
        for name in corpus.fileids()
        for token in corpus.words(name)
    )
    return list(distinct_types)
def main(): | |
print '='*30 | |
print 'NLTK version:', nltk.__version__ | |
print '='*30, "\n" | |
corpora = get_available_corpora() | |
for corpus_name, corpus in corpora: | |
print corpus_name | |
print '-'*30 | |
try: | |
types_of_words = get_types_of_words(corpus) | |
print 'words:', types_of_words | |
except Exception as e: | |
print 'words: ', e | |
try: | |
raw_type = type(corpus.raw()) | |
print 'raw:', raw_type | |
except Exception as e: | |
print 'raw: ', e | |
if hasattr(corpus, '_unload'): | |
corpus._unload() | |
print "" | |
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
============================== | |
NLTK version: 2.0.4 | |
============================== | |
abc | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
alpino | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
brown | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
cess_cat | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
cess_esp | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
cmudict | |
------------------------------ | |
words: words() takes exactly 1 argument (2 given) | |
raw: local variable 'fileids' referenced before assignment | |
comtrans | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
conll2000 | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
conll2002 | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
conll2007 | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
dependency_treebank | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
floresta | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
gazetteers | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
genesis | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
gutenberg | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
ieer | |
------------------------------ | |
words: 'IEERCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
inaugural | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
indian | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
ipipan | |
------------------------------ | |
words: | |
********************************************************************** | |
Resource u'corpora/ipipan' not found. Please use the NLTK | |
Downloader to obtain the resource: >>> nltk.download() | |
Searched in: | |
- '/Users/kmike/nltk_data' | |
- '/usr/share/nltk_data' | |
- '/usr/local/share/nltk_data' | |
- '/usr/lib/nltk_data' | |
- '/usr/local/lib/nltk_data' | |
********************************************************************** | |
raw: | |
********************************************************************** | |
Resource u'corpora/ipipan' not found. Please use the NLTK | |
Downloader to obtain the resource: >>> nltk.download() | |
Searched in: | |
- '/Users/kmike/nltk_data' | |
- '/usr/share/nltk_data' | |
- '/usr/local/share/nltk_data' | |
- '/usr/lib/nltk_data' | |
- '/usr/local/lib/nltk_data' | |
********************************************************************** | |
jeita | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
knbc | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
lin_thesaurus | |
------------------------------ | |
words: 'LinThesaurusCorpusReader' object has no attribute 'words' | |
raw: 'LinThesaurusCorpusReader' object has no attribute 'raw' | |
mac_morpho | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
machado | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
movie_reviews | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
names | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
nombank | |
------------------------------ | |
words: 'NombankCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
nombank_ptb | |
------------------------------ | |
words: 'NombankCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
nps_chat | |
------------------------------ | |
words: [<type 'unicode'>, <type 'str'>] | |
raw: <type 'unicode'> | |
pl196x | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
ppattach | |
------------------------------ | |
words: 'PPAttachmentCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
propbank | |
------------------------------ | |
words: 'PropbankCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
propbank_ptb | |
------------------------------ | |
words: 'PropbankCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
ptb | |
------------------------------ | |
words: [] | |
raw: concat() expects at least one object! | |
qc | |
------------------------------ | |
words: 'StringCategoryCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
reuters | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
rte | |
------------------------------ | |
words: [<type 'unicode'>, <type 'str'>] | |
raw: <type 'unicode'> | |
semcor | |
------------------------------ | |
words: [<type 'str'>] | |
raw: <type 'unicode'> | |
senseval | |
------------------------------ | |
words: 'SensevalCorpusReader' object has no attribute 'words' | |
raw: <type 'unicode'> | |
shakespeare | |
------------------------------ | |
words: [<type 'str'>] | |
raw: <type 'unicode'> | |
sinica_treebank | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
state_union | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
stopwords | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
swadesh | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
switchboard | |
------------------------------ | |
words: words() takes exactly 1 argument (2 given) | |
raw: 'SwitchboardCorpusReader' object has no attribute 'raw' | |
timit | |
------------------------------ | |
words: No such file or directory: u'/Users/kmike/nltk_data/corpora/timit/dr1-fvmh0/sa1.phn.wrd' | |
raw: 'TimitCorpusReader' object has no attribute 'raw' | |
timit_tagged | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
toolbox | |
------------------------------ | |
words: [] | |
raw: raw() takes exactly 2 arguments (1 given) | |
treebank | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
treebank_chunk | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
treebank_raw | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
udhr | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
verbnet | |
------------------------------ | |
words: [<type 'str'>] | |
raw: <type 'unicode'> | |
webtext | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
wordnet | |
------------------------------ | |
words: 'WordNetCorpusReader' object has no attribute 'words' | |
raw: 'WordNetCorpusReader' object has no attribute 'raw' | |
wordnet_ic | |
------------------------------ | |
words: 'WordNetICCorpusReader' object has no attribute 'words' | |
raw: 'WordNetICCorpusReader' object has no attribute 'raw' | |
words | |
------------------------------ | |
words: [<type 'unicode'>] | |
raw: <type 'unicode'> | |
ycoe | |
------------------------------ | |
words: No such file or directory: u'/Users/kmike/nltk_data/corpora/ycoe/psd' | |
raw: No such file or directory: u'/Users/kmike/nltk_data/corpora/ycoe/psd' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment