
@mayhewsw
Last active April 5, 2019 22:40
AllenNLP Vocabulary Tests.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from allennlp.data.vocabulary import Vocabulary\n",
"from allennlp.data.dataset_readers import Conll2003DatasetReader\n",
"from allennlp.data.token_indexers import SingleIdTokenIndexer\n",
"import random\n",
"\n",
"## Make the embeddings first.\n",
"def getwords(path, w):\n",
" with open(path) as f:\n",
" for line in f:\n",
" sline = line.split()\n",
" if len(sline) > 0:\n",
" w.add(sline[0])\n",
"\n",
"# these embeddings contain *every* word in the three datasets\n",
"words = set()\n",
"getwords(\"train.txt\", words)\n",
"getwords(\"dev.txt\", words)\n",
"getwords(\"test.txt\", words)\n",
"\n",
"dim = 3\n",
"with open(\"myembs.txt\", \"w\") as out:\n",
" out.write(\"{} {}\\n\".format(len(words), dim))\n",
" for word in words:\n",
" # random 3 dimensional embeddings.\n",
" out.write(word + \" \" + \" \".join([str(random.random())]*dim) + \"\\n\")\n",
" \n",
"\n",
"reader = Conll2003DatasetReader()\n",
"\n",
"# gather all data\n",
"train_dataset = reader.read(\"train.txt\")\n",
"dev_dataset = reader.read(\"dev.txt\")\n",
"test_dataset = reader.read(\"test.txt\")\n",
"\n",
"# I had thought that the pretrained file would extend the vocabulary, but this is not the case.\n",
"# this vocab has instances only from train and dev.\n",
"vocab1 = Vocabulary.from_instances(train_dataset + dev_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n",
"# this vocab has instances from train, dev, and test.\n",
"vocab2 = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n",
"\n",
"# this has 16 elements\n",
"print(vocab1)\n",
"# this has 23 elements\n",
"print(vocab2)\n",
"\n",
"# Build an indexer based on vocab1 and vocab2\n",
"indexer = SingleIdTokenIndexer()\n",
"\n",
"# vocab1 does not contain test data, so these will be all UNK\n",
"ind1 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab1, \"tmp\")\n",
"print([vocab1.get_token_from_index(i) for i in ind1[\"tmp\"]])\n",
"\n",
"# vocab2 does contain test data, so these will be all correct.\n",
"ind2 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab2, \"tmp\")\n",
"print([vocab2.get_token_from_index(i) for i in ind2[\"tmp\"]])"
]
}
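,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Follow-up sketch (not part of the original test).** Two ways to work with the behavior above, assuming this AllenNLP version's `from_instances` accepts `tokens_to_add` and `min_count` (it does in the 0.8.x line). `emb_words`, `vocab3`, and `vocab4` are illustrative names, not anything from the original gist."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch; assumes from_instances supports tokens_to_add and min_count (0.8.x API).\n",
"\n",
"# 1) To actually get every embedding word into the vocabulary, pass the words explicitly.\n",
"emb_words = []\n",
"with open(\"myembs.txt\") as f:\n",
"    next(f)  # skip the \"count dim\" header line written earlier\n",
"    for line in f:\n",
"        emb_words.append(line.split()[0])\n",
"\n",
"vocab3 = Vocabulary.from_instances(train_dataset + dev_dataset,\n",
"                                   tokens_to_add={\"tokens\": emb_words})\n",
"# vocab3 should cover the test-only words too, since myembs.txt holds every word.\n",
"print(vocab3)\n",
"\n",
"# 2) What pretrained_files actually does here: it interacts with min_count.\n",
"# A token seen fewer than min_count times in the instances is still kept\n",
"# if it appears in the pretrained file.\n",
"vocab4 = Vocabulary.from_instances(train_dataset + dev_dataset,\n",
"                                   min_count={\"tokens\": 2},\n",
"                                   pretrained_files={\"tokens\": \"myembs.txt\"})\n",
"print(vocab4)  # singletons survive the min_count filter because they are in myembs.txt"
]
}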
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
dev.txt
philly O O I-MNT
is O O O
where O O O
harold O O I-MNT
johnson O O I-MNT
lives O O O
. O O O

same O O O
for O O O
bob O O I-MNT
jones O O I-MNT
. O O O

test.txt
jane O O I-MNT
null O O I-MNT
and O O O
nancy O O I-MNT
smith O O I-MNT
are O O O
nearby O O O
. O O O

train.txt
harold O O I-MNT
johnson O O I-MNT
lives O O O
in O O O
philly O O I-MNT
. O O O

bob O O I-MNT
jones O O I-MNT
does O O O
too O O O
. O O O