{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Evaluate various classifiers for NP chunking with the CoNLL-2000 corpus using Python NLTK and scikit-learn."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import datetime\n",
"import nltk\n",
"from nltk.corpus import conll2000\n",
"from sklearn.svm import LinearSVC, SVC\n",
"from nltk.classify.scikitlearn import SklearnClassifier"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Point NLTK at the MEGAM binary used by MaxentClassifier.\n",
"nltk.config_megam('/home/dbshow1/megam/MEGAM/megam-64.opt')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[Found /home/dbshow1/megam/MEGAM/megam-64.opt: /home/dbshow1/megam/MEGAM/megam-64.opt]\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Code for NP chunker based on http://www.nltk.org/book/ch07.html."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class NPChunker(nltk.ChunkParserI):\n",
"    \"\"\"Chunk text into shallow NP trees using an IOB tagger.\"\"\"\n",
"    def __init__(self, train_sents, tagger):\n",
"        start_time = datetime.datetime.now()\n",
"        print('Training started: {0}'.format(start_time))\n",
"        # Convert chunk trees into ((word, pos), iob-tag) training pairs.\n",
"        tagged_sents = [[((w, t), c) for (w, t, c) in\n",
"                         nltk.chunk.tree2conlltags(sent)]\n",
"                        for sent in train_sents]\n",
"        self.tagger = tagger(tagged_sents)\n",
"        end_time = datetime.datetime.now()\n",
"        print('Training complete: {0}'.format(end_time))\n",
"        print('Time spent training: {0}'.format(end_time - start_time))\n",
"\n",
"    def parse(self, sentence):\n",
"        tagged_sents = self.tagger.tag(sentence)\n",
"        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]\n",
"        return nltk.chunk.conlltags2tree(conlltags)\n",
"\n",
"\n",
"class BaseIOBChunkTagger(nltk.TaggerI):\n",
"    \"\"\"Base class for IOB taggers. Defines the tag method.\"\"\"\n",
"    def tag(self, sentence):\n",
"        # Tag left to right, feeding earlier predictions back in as\n",
"        # features via the history list.\n",
"        history = []\n",
"        for i, word in enumerate(sentence):\n",
"            featureset = npchunk_features(sentence, i, history)\n",
"            tag = self.classifier.classify(featureset)\n",
"            history.append(tag)\n",
"        return list(zip(sentence, history))\n",
"\n",
"\n",
"class MaxentIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using the NLTK Maxent classifier (MEGAM).\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = nltk.MaxentClassifier.train(\n",
"            train_set, algorithm='megam', trace=0\n",
"        )\n",
"\n",
"\n",
"class LinearSVCIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using the scikit-learn LinearSVC classifier.\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = SklearnClassifier(LinearSVC()).train(train_set)\n",
"\n",
"\n",
"class SVCIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using the scikit-learn SVC classifier.\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = SklearnClassifier(SVC()).train(train_set)\n",
"\n",
"\n",
"def extract_features(train_sents):\n",
"    \"\"\"Extract (featureset, iob-tag) training pairs from tagged sentences.\"\"\"\n",
"    train_set = []\n",
"    for tagged_sent in train_sents:\n",
"        untagged_sent = nltk.tag.untag(tagged_sent)\n",
"        history = []\n",
"        for i, (word, tag) in enumerate(tagged_sent):\n",
"            featureset = npchunk_features(untagged_sent, i, history)\n",
"            train_set.append((featureset, tag))\n",
"            history.append(tag)\n",
"    return train_set\n",
"\n",
"\n",
"def npchunk_features(sentence, i, history):\n",
"    \"\"\"Extract features for classifying the token at position i.\"\"\"\n",
"    word, pos = sentence[i]\n",
"    if i == 0:\n",
"        prevword, prevpos = \"<START>\", \"<START>\"\n",
"        previob = \"<START>\"\n",
"    else:\n",
"        prevword, prevpos = sentence[i - 1]\n",
"        previob = history[i - 1]\n",
"    if i == len(sentence) - 1:\n",
"        nextword, nextpos = \"<END>\", \"<END>\"\n",
"    else:\n",
"        nextword, nextpos = sentence[i + 1]\n",
"    return {\"word\": word,\n",
"            \"prevword\": prevword,\n",
"            \"nextword\": nextword,\n",
"            \"pos\": pos,\n",
"            \"prevpos\": prevpos,\n",
"            \"nextpos\": nextpos,\n",
"            \"previob\": previob,\n",
"            \"prevpos+pos\": \"%s+%s\" % (prevpos, pos),\n",
"            \"pos+nextpos\": \"%s+%s\" % (pos, nextpos),\n",
"            \"tags-since-dt\": tags_since_dt(sentence, i)}\n",
"\n",
"\n",
"def tags_since_dt(sentence, i):\n",
"    \"\"\"Join the POS tags seen since the last determiner into a\n",
"    single string, used as a classification feature.\"\"\"\n",
"    tags = set()\n",
"    for word, pos in sentence[:i]:\n",
"        if pos == 'DT':\n",
"            tags = set()\n",
"        else:\n",
"            tags.add(pos)\n",
"    return '+'.join(sorted(tags))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
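{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check, not part of the original run: inspect the feature dict that `npchunk_features` builds for one token of a toy POS-tagged sentence (the sentence and history values are invented for illustration)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical toy sentence; any POS-tagged sentence works here.\n",
"toy_sent = [('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')]\n",
"# Features for the token 'dog' (index 2), assuming the two earlier\n",
"# tokens were already tagged B-NP and I-NP.\n",
"npchunk_features(toy_sent, 2, ['B-NP', 'I-NP'])"
],
"language": "python",
"metadata": {},
"outputs": []
},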
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Get training/testing sets from the CoNLL-2000 corpus."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])\n",
"train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
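{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check, added for illustration: confirm the corpus loaded and see how much data each split holds (the standard CoNLL-2000 split has 8,936 training and 2,012 test sentences)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print('train: {0} sentences'.format(len(train_sents)))\n",
"print('test: {0} sentences'.format(len(test_sents)))"
],
"language": "python",
"metadata": {},
"outputs": []
},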
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate Maxent classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"maxent_chunker = NPChunker(train_sents, tagger=MaxentIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:08:41.496115\n",
"Training complete: 2014-08-12 17:13:49.788300"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 0:05:08.292185\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(maxent_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 96.8%\n",
" Precision: 92.7%\n",
" Recall: 92.8%\n",
" F-Measure: 92.7%\n"
]
}
],
"prompt_number": 6
},
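{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustrative usage, not part of the original run: parse a single POS-tagged sentence with the trained chunker (the example sentence is invented)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sentence = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'),\n",
"            ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'),\n",
"            ('the', 'DT'), ('cat', 'NN')]\n",
"print(maxent_chunker.parse(sentence))"
],
"language": "python",
"metadata": {},
"outputs": []
},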
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate LinearSVC classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"linearsvc_chunker = NPChunker(train_sents, tagger=LinearSVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:45:43.930457\n",
"Training complete: 2014-08-12 17:46:23.394735"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 0:00:39.464278\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(linearsvc_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 96.8%\n",
" Precision: 92.8%\n",
" Recall: 93.0%\n",
" F-Measure: 92.9%\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Evaluate SVC classifier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc_chunker = NPChunker(train_sents, tagger=SVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Training started: 2014-08-12 17:47:07.243946\n",
"Training complete: 2014-08-12 23:18:49.701231"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Time spent training: 5:31:42.457285\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(svc_chunker.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"ChunkParse score:\n",
" IOB Accuracy: 64.7%\n",
" Precision: 70.2%\n",
" Recall: 30.0%\n",
" F-Measure: 42.0%\n"
]
}
],
"prompt_number": 10
},
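{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note added in editing: the poor SVC numbers and the 5.5-hour runtime are most likely an artifact of the default RBF kernel and default hyperparameters on these high-dimensional, sparse one-hot features; LinearSVC handles that regime far better, which matches the scores above. A cheap experiment, sketched below under that assumption and using only standard scikit-learn parameters, would be to force a linear kernel:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sketch only (not run here): same tagger setup, but SVC with a linear kernel.\n",
"class LinearKernelSVCIOBChunkTagger(BaseIOBChunkTagger):\n",
"    \"\"\"IOB tagger using SVC(kernel='linear') for comparison.\"\"\"\n",
"    def __init__(self, train_sents):\n",
"        train_set = extract_features(train_sents)\n",
"        self.classifier = SklearnClassifier(SVC(kernel='linear')).train(train_set)\n",
"\n",
"# linear_kernel_chunker = NPChunker(train_sents, tagger=LinearKernelSVCIOBChunkTagger)"
],
"language": "python",
"metadata": {},
"outputs": []
}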
],
"metadata": {}
}
]
} |