{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Evaluate various classifiers for NP chunking with the CoNLL-2000 corpus using Python NLTK and scikit-learn."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import datetime\n",
"import nltk\n",
"from nltk.corpus import conll2000\n",
"from sklearn.svm import LinearSVC, SVC\n",
"from nltk.classify.scikitlearn import SklearnClassifier"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Point NLTK at the MEGAM binary used by MaxentClassifier.\n",
"nltk.config_megam('/home/dbshow1/megam/MEGAM/megam-64.opt')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[Found /home/dbshow1/megam/MEGAM/megam-64.opt: /home/dbshow1/megam/MEGAM/megam-64.opt]\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Code for NP chunker based on http://www.nltk.org/book/ch07.html."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class NPChunker(nltk.ChunkParserI):\n",
"    \"\"\"Chunk text into shallow NP trees using an IOB tagger.\"\"\"\n",
"    def __init__(self, train_sents, tagger):\n",
"        start_time = datetime.datetime.now()\n",
"        print('Training started: {0}'.format(start_time))\n",
"        # Convert chunk trees into ((word, pos), iob-tag) training pairs.\n",
"        tagged_sents = [[((w, t), c) for (w, t, c) in\n",
"                         nltk.chunk.tree2conlltags(sent)]\n",
"                        for sent in train_sents]\n",
"        self.tagger = tagger(tagged_sents)\n",
"        end_time = datetime.datetime.now()\n",
"        print('Training complete: {0}'.format(end_time))\n",
"        print('Time spent training: {0}'.format(end_time - start_time))\n",
"\n",
"    def parse(self, sentence):\n",
"        tagged_sents = self.tagger.tag(sentence)\n",
"        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]\n",
"        return nltk.chunk.conlltags2tree(conlltags)\n",
"\n",
"\n",
"class BaseIOBChunkTagger(nltk.TaggerI):\n",
"    \"\"\"Base class for IOB taggers. Defines the tag method.\"\"\"\n",
"    def tag(self, sentence):\n",
"        # Tag left to right, feeding earlier predictions back in as\n",
"        # features via the history list.\n",
"        history = []\n",
"        for i, word in enumerate(sentence):\n",
"            featureset = npchunk_features(sentence, i, history)\n",
"            tag = self.classifier.classify(featureset)\n",
"            history.append(tag)\n",
"        return list(zip(sentence, history))\n",
"\n",
"\n",
"class MaxentIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using the NLTK Maxent classifier (MEGAM).\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = nltk.MaxentClassifier.train(\n",
"            train_set, algorithm='megam', trace=0\n",
"        )\n",
"\n",
"\n",
"class LinearSVCIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using the scikit-learn LinearSVC classifier.\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = SklearnClassifier(LinearSVC()).train(train_set)\n",
"\n",
"\n",
"class SVCIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using the scikit-learn SVC classifier.\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = SklearnClassifier(SVC()).train(train_set)\n",
"\n",
"\n",
"def extract_features(train_sents):\n",
"    \"\"\"Extract (featureset, iob-tag) training pairs from tagged sentences.\"\"\"\n",
"    train_set = []\n",
"    for tagged_sent in train_sents:\n",
"        untagged_sent = nltk.tag.untag(tagged_sent)\n",
"        history = []\n",
"        for i, (word, tag) in enumerate(tagged_sent):\n",
"            featureset = npchunk_features(untagged_sent, i, history)\n",
"            train_set.append((featureset, tag))\n",
"            history.append(tag)\n",
"    return train_set\n",
"\n",
"\n",
"def npchunk_features(sentence, i, history):\n",
"    \"\"\"Extract features for classifying the token at position i.\"\"\"\n",
"    word, pos = sentence[i]\n",
"    if i == 0:\n",
"        prevword, prevpos = \"<START>\", \"<START>\"\n",
"        previob = \"<START>\"\n",
"    else:\n",
"        prevword, prevpos = sentence[i - 1]\n",
"        previob = history[i - 1]\n",
"    if i == len(sentence) - 1:\n",
"        nextword, nextpos = \"<END>\", \"<END>\"\n",
"    else:\n",
"        nextword, nextpos = sentence[i + 1]\n",
"    return {\"word\": word,\n",
"            \"prevword\": prevword,\n",
"            \"nextword\": nextword,\n",
"            \"pos\": pos,\n",
"            \"prevpos\": prevpos,\n",
"            \"nextpos\": nextpos,\n",
"            \"previob\": previob,\n",
"            \"prevpos+pos\": \"%s+%s\" % (prevpos, pos),\n",
"            \"pos+nextpos\": \"%s+%s\" % (pos, nextpos),\n",
"            \"tags-since-dt\": tags_since_dt(sentence, i)}\n",
"\n",
"\n",
"def tags_since_dt(sentence, i):\n",
"    \"\"\"Join the POS tags seen since the last determiner into a\n",
"    single string, used as a classification feature.\"\"\"\n",
"    tags = set()\n",
"    for word, pos in sentence[:i]:\n",
"        if pos == 'DT':\n",
"            tags = set()\n",
"        else:\n",
"            tags.add(pos)\n",
"    return '+'.join(sorted(tags))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
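{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check, not part of the original run: inspect the feature dict that `npchunk_features` builds for one token of a toy POS-tagged sentence (the sentence and history values are invented for illustration)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical toy sentence; any POS-tagged sentence works here.\n",
"toy_sent = [('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')]\n",
"# Features for the token 'dog' (index 2), assuming the two earlier\n",
"# tokens were already tagged B-NP and I-NP.\n",
"npchunk_features(toy_sent, 2, ['B-NP', 'I-NP'])"
],
"language": "python",
"metadata": {},
"outputs": []
},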
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Get training/testing sets from the CoNLL-2000 corpus."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])\n",
"train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
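{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check, added for illustration: confirm the corpus loaded and see how much data each split holds (the standard CoNLL-2000 split has 8,936 training and 2,012 test sentences)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print('train: {0} sentences'.format(len(train_sents)))\n",
"print('test: {0} sentences'.format(len(test_sents)))"
],
"language": "python",
"metadata": {},
"outputs": []
},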
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate Maxent classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"maxent_chunker = NPChunker(train_sents, tagger=MaxentIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:08:41.496115\n",
"Training complete: 2014-08-12 17:13:49.788300"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 0:05:08.292185\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(maxent_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 96.8%\n",
" Precision: 92.7%\n",
" Recall: 92.8%\n",
" F-Measure: 92.7%\n"
]
}
],
"prompt_number": 6
},
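{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustrative usage, not part of the original run: parse a single POS-tagged sentence with the trained chunker (the example sentence is invented)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sentence = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'),\n",
"            ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'),\n",
"            ('the', 'DT'), ('cat', 'NN')]\n",
"print(maxent_chunker.parse(sentence))"
],
"language": "python",
"metadata": {},
"outputs": []
},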
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate LinearSVC classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"linearsvc_chunker = NPChunker(train_sents, tagger=LinearSVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:45:43.930457\n",
"Training complete: 2014-08-12 17:46:23.394735"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 0:00:39.464278\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(linearsvc_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 96.8%\n",
" Precision: 92.8%\n",
" Recall: 93.0%\n",
" F-Measure: 92.9%\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate SVC classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc_chunker = NPChunker(train_sents, tagger=SVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:47:07.243946\n",
"Training complete: 2014-08-12 23:18:49.701231"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 5:31:42.457285\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(svc_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 64.7%\n",
" Precision: 70.2%\n",
" Recall: 30.0%\n",
" F-Measure: 42.0%\n"
]
}
],
"prompt_number": 10
},
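{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note added in editing: the poor SVC numbers and the 5.5-hour runtime are most likely an artifact of the default RBF kernel and default hyperparameters on these high-dimensional, sparse one-hot features; LinearSVC handles that regime far better, which matches the scores above. A cheap experiment, sketched below under that assumption and using only standard scikit-learn parameters, would be to force a linear kernel:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch only (not run here): same tagger setup, but SVC with a linear kernel.\n",
"class LinearKernelSVCIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using SVC(kernel='linear') for comparison.\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = SklearnClassifier(SVC(kernel='linear')).train(train_set)\n",
"\n",
"# linear_kernel_chunker = NPChunker(train_sents, tagger=LinearKernelSVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": []
}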
],
"metadata": {}
}
]
} |