{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Evaluate various classifiers for NP chunking with the ConLL-2000 corpus using Python NLTK and scikit-learn."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import datetime\n",
"import nltk\n",
"from nltk.corpus import conll2000\n",
"from sklearn.svm import LinearSVC, SVC\n",
"from nltk.classify.scikitlearn import SklearnClassifier"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
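{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the CoNLL-2000 corpus is not already installed, it can be fetched once with NLTK's downloader (assumes network access and the default NLTK data path)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# One-time setup: fetch the CoNLL-2000 chunking corpus.\n",
"nltk.download('conll2000')"
],
"language": "python",
"metadata": {},
"outputs": []
},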
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Get MEGAM file for MaxentClassifier.\n",
"nltk.config_megam('/home/dbshow1/megam/MEGAM/megam-64.opt')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[Found /home/dbshow1/megam/MEGAM/megam-64.opt: /home/dbshow1/megam/MEGAM/megam-64.opt]\n"
]
}
],
"prompt_number": 2
},
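{
"cell_type": "markdown",
"metadata": {},
"source": [
"MEGAM is an external binary; nltk.config_megam only records its path. If it is unavailable, MaxentClassifier can fall back to NLTK's pure-Python trainers, which are much slower (a sketch, not used below):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fallback without MEGAM (pure Python, much slower):\n",
"# nltk.MaxentClassifier.train(train_set, algorithm='iis', trace=0)"
],
"language": "python",
"metadata": {},
"outputs": []
},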
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Code for NP chunker based on http://www.nltk.org/book/ch07.html."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class NPChunker(nltk.ChunkParserI):\n",
" \"\"\"Chunk text into shallow NP trees using IOB parser.\"\"\"\n",
" def __init__(self, train_sents, tagger=None):\n",
" start_time = datetime.datetime.now()\n",
" print('Training started: {0}'.format(start_time))\n",
" tagged_sents = [[((w,t),c) for (w,t,c) in\n",
" nltk.chunk.tree2conlltags(sent)]\n",
" for sent in train_sents]\n",
" self.tagger = tagger(tagged_sents)\n",
" end_time = datetime.datetime.now()\n",
" print('Training complete: {0}'.format(end_time))\n",
" print('Time spent training: {0}'.format(end_time - start_time))\n",
"\n",
" def parse(self, sentence):\n",
" tagged_sents = self.tagger.tag(sentence)\n",
" conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]\n",
" return nltk.chunk.conlltags2tree(conlltags)\n",
" \n",
"\n",
"class BaseIOBChunkTagger(nltk.TaggerI): \n",
" \"\"\"Base class for IOB taggers. Defines tag method.\"\"\"\n",
" def tag(self, sentence):\n",
" history = []\n",
" for i, word in enumerate(sentence):\n",
" featureset = npchunk_features(sentence, i, history)\n",
" tag = self.classifier.classify(featureset)\n",
" history.append(tag)\n",
" return zip(sentence, history)\n",
" \n",
"\n",
"class MaxentIOBChunkTagger(BaseIOBChunkTagger): \n",
" \"\"\"IOB tagger using the Maxent classifier.\"\"\"\n",
" def __init__(self, train_sents):\n",
" train_set = extract_features(train_sents)\n",
" self.classifier = nltk.MaxentClassifier.train(\n",
" train_set, algorithm='megam', trace=0\n",
" )\n",
"\n",
" \n",
"class LinearSVCIOBChunkTagger(BaseIOBChunkTagger): \n",
" \"\"\"IOB tagger using the scikit-learn LinearSVC classifier.\"\"\"\n",
" def __init__(self, train_sents):\n",
" train_set = extract_features(train_sents)\n",
" self.classifier = SklearnClassifier(LinearSVC()).train(train_set)\n",
" \n",
" \n",
"class SVCIOBChunkTagger(BaseIOBChunkTagger): \n",
" \"\"\"IOB tagger using the scikit-learn SVC classifier.\"\"\"\n",
" def __init__(self, train_sents):\n",
" train_set = extract_features(train_sents)\n",
" self.classifier = SklearnClassifier(SVC()).train(train_set)\n",
"\n",
" \n",
"def extract_features(train_sents): \n",
" \"\"\"\"Extract features from training data.\"\"\"\n",
" train_set = []\n",
" for tagged_sent in train_sents:\n",
" untagged_sent = nltk.tag.untag(tagged_sent)\n",
" history = [] \n",
" for i, (word, tag) in enumerate(tagged_sent):\n",
" featureset = npchunk_features(untagged_sent, i, history) \n",
" train_set.append( (featureset, tag) )\n",
" history.append(tag)\n",
" return train_set\n",
"\n",
"\n",
"def npchunk_features(sentence, i, history):\n",
" \"\"\"Extract features for classification.\"\"\"\n",
" word, pos = sentence[i]\n",
" if i == 0:\n",
" prevword, prevpos = \"<START>\", \"<START>\"\n",
" previob = \"<START>\"\n",
" else:\n",
" prevword, prevpos = sentence[i - 1]\n",
" previob = history[i - 1]\n",
" if i == len(sentence) - 1:\n",
" nextword, nextpos = \"<END>\", \"<END>\"\n",
" else:\n",
" nextword, nextpos = sentence[i+1]\n",
" return {\"word\": word,\n",
" \"prevword\": prevword,\n",
" \"nextword\": nextword,\n",
" \"pos\": pos,\n",
" \"prevpos\": prevpos,\n",
" \"nextpos\": nextpos,\n",
" \"previob\": previob, \n",
" \"prevpos+pos\": \"%s+%s\" % (prevpos, pos),\n",
" \"pos+nextpos\": \"%s+%s\" % (pos, nextpos),\n",
" \"tags-since-dt\": tags_since_dt(sentence, i)}\n",
"\n",
"\n",
"def tags_since_dt(sentence, i):\n",
" \"\"\"Get all tags after a determiner as a\n",
" classification feature.\"\"\"\n",
" tags = set()\n",
" for word, pos in sentence[:i]:\n",
" if pos == 'DT':\n",
" tags = set()\n",
" else:\n",
" tags.add(pos)\n",
" return '+'.join(sorted(tags)) "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
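{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustration of the feature dictionary built for a single token (an unexecuted sketch; the sample sentence and IOB history below are made up):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical POS-tagged sentence and partial IOB history.\n",
"sample_sent = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'), ('sat', 'VBD')]\n",
"npchunk_features(sample_sent, 2, ['B-NP', 'I-NP'])"
],
"language": "python",
"metadata": {},
"outputs": []
},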
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Get training/testing sets from ConLL-2000 corpus."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])\n",
"train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
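{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each item is an nltk.Tree with its NP chunks marked; a quick look at the first training sentence (unexecuted sketch):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(train_sents[0])"
],
"language": "python",
"metadata": {},
"outputs": []
},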
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate Maxent classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"maxent_chunker = NPChunker(train_sents, tagger=MaxentIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:08:41.496115\n",
"Training complete: 2014-08-12 17:13:49.788300"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 0:05:08.292185\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(maxent_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 96.8%\n",
" Precision: 92.7%\n",
" Recall: 92.8%\n",
" F-Measure: 92.7%\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate LinearSVC classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"linearsvc_chunker = NPChunker(train_sents, tagger=LinearSVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:45:43.930457\n",
"Training complete: 2014-08-12 17:46:23.394735"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 0:00:39.464278\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(linearsvc_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 96.8%\n",
" Precision: 92.8%\n",
" Recall: 93.0%\n",
" F-Measure: 92.9%\n"
]
}
],
"prompt_number": 8
},
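{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once trained, a chunker can parse any new POS-tagged sentence (an unexecuted sketch; the sample sentence is made up):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# parse() expects a list of (word, pos) pairs and returns an nltk.Tree.\n",
"sent = [('He', 'PRP'), ('bought', 'VBD'), ('a', 'DT'), ('new', 'JJ'), ('car', 'NN')]\n",
"print(linearsvc_chunker.parse(sent))"
],
"language": "python",
"metadata": {},
"outputs": []
},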
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate SVC classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc_chunker = NPChunker(train_sents, tagger=SVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:47:07.243946\n",
"Training complete: 2014-08-12 23:18:49.701231"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 5:31:42.457285\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(svc_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 64.7%\n",
" Precision: 70.2%\n",
" Recall: 30.0%\n",
" F-Measure: 42.0%\n"
]
}
],
"prompt_number": 10
}
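,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary of the runs above: LinearSVC gives the best F-measure (92.9%) and trains in under a minute; Maxent with MEGAM is comparable (92.7%) but takes about five minutes; SVC with its default RBF kernel takes over five and a half hours and performs far worse (42.0% F), likely because the high-dimensional sparse features used here suit a linear model better."
]
}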
],
"metadata": {}
}
]
}