Last active
April 5, 2019 22:40
-
-
Save mayhewsw/3ced494825fa65378464cbf268325b58 to your computer and use it in GitHub Desktop.
Allennlp Vocabulary Tests.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from allennlp.data.vocabulary import Vocabulary\n", | |
"from allennlp.data.dataset_readers import Conll2003DatasetReader\n", | |
"from allennlp.data.token_indexers import SingleIdTokenIndexer\n", | |
"import random\n", | |
"\n", | |
"## Make the embeddings first.\n", | |
"def getwords(path, w):\n", | |
" with open(path) as f:\n", | |
" for line in f:\n", | |
" sline = line.split()\n", | |
" if len(sline) > 0:\n", | |
" w.add(sline[0])\n", | |
"\n", | |
"# these embeddings contain *every* word in the three datasets\n", | |
"words = set()\n", | |
"getwords(\"train.txt\", words)\n", | |
"getwords(\"dev.txt\", words)\n", | |
"getwords(\"test.txt\", words)\n", | |
"\n", | |
"dim = 3\n", | |
"with open(\"myembs.txt\", \"w\") as out:\n", | |
" out.write(\"{} {}\\n\".format(len(words), dim))\n", | |
" for word in words:\n", | |
" # random 3 dimensional embeddings.\n", | |
" out.write(word + \" \" + \" \".join([str(random.random())]*dim) + \"\\n\")\n", | |
" \n", | |
"\n", | |
"reader = Conll2003DatasetReader()\n", | |
"\n", | |
"# gather all data\n", | |
"train_dataset = reader.read(\"train.txt\")\n", | |
"dev_dataset = reader.read(\"dev.txt\")\n", | |
"test_dataset = reader.read(\"test.txt\")\n", | |
"\n", | |
"# I had thought that the pretrained file would extend the vocabulary, but this is not the case.\n", | |
"# this vocab has instances only from train and dev.\n", | |
"vocab1 = Vocabulary.from_instances(train_dataset + dev_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n", | |
"# this vocab has instances from train, dev, and test.\n", | |
"vocab2 = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n", | |
"\n", | |
"# this has 16 elements\n", | |
"print(vocab1)\n", | |
"# this has 23 elements\n", | |
"print(vocab2)\n", | |
"\n", | |
"# Build an indexer based on vocab1 and vocab2\n", | |
"indexer = SingleIdTokenIndexer()\n", | |
"\n", | |
"# vocab1 does not contain test data, so these will be all UNK\n", | |
"ind1 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab1, \"tmp\")\n", | |
"print([vocab1.get_token_from_index(i) for i in ind1[\"tmp\"]])\n", | |
"\n", | |
"# vocab2 does contain test data, so these will be all correct.\n", | |
"ind2 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab2, \"tmp\")\n", | |
"print([vocab2.get_token_from_index(i) for i in ind2[\"tmp\"]])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
philly O O I-MNT | |
is O O O | |
where O O O | |
harold O O I-MNT | |
johnson O O I-MNT | |
lives O O O | |
. O O O | |
same O O O | |
for O O O | |
bob O O I-MNT | |
jones O O I-MNT | |
. O O O |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
allennlp==0.8.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
jane O O I-MNT | |
null O O I-MNT | |
and O O O | |
nancy O O I-MNT | |
smith O O I-MNT | |
are O O O | |
nearby O O O | |
. O O O |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
harold O O I-MNT | |
johnson O O I-MNT | |
lives O O O | |
in O O O | |
philly O O I-MNT | |
. O O O | |
bob O O I-MNT | |
jones O O I-MNT | |
does O O O | |
too O O O | |
. O O O |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment