Last active
October 17, 2015 01:42
-
-
Save jgc128/75964e86b05d760b8158 to your computer and use it in GitHub Desktop.
Code for homework 1 (2015 Fall NLP Class)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# Root directory holding the homework 1 data files.
# NOTE(review): machine-specific absolute path — adjust for your environment.
base_dir = '/data1/aromanov/study/2015_fall/nlp/homeworks/hw1/'

# Input files used by the notebook(s) that %run this module.
brit3_filename = os.path.join(base_dir, 'brit3-excerpt.txt')
brit3_marked_filename = os.path.join(base_dir, 'brit3-excerpt-marked.txt')
problem4_text_filename = os.path.join(base_dir, 'problem4.txt')
def load_documents_from_dir(directory):
    """Read every file in *directory* and return their full contents.

    Args:
        directory: Path to a directory; every entry listed by
            ``os.listdir`` is opened as a text file.

    Returns:
        A list of strings, one per file, in ``os.listdir`` order
        (arbitrary / filesystem-dependent).
    """
    files = [os.path.join(directory, f) for f in os.listdir(directory)]
    docs = []
    for fl in files:
        with open(fl, 'r') as f:
            docs.append(f.read())
    return docs
def load_file(filename):
    """Return the entire contents of *filename* as a single string."""
    with open(filename, 'r') as f:
        return f.read()
def load_file_lines(filename):
    """Return the lines of *filename* with surrounding newlines stripped.

    Mirrors ``load_file`` but splits into lines; each element has
    leading/trailing ``'\\n'`` characters removed (``str.strip('\\n')``),
    other whitespace is preserved.
    """
    with open(filename, 'r') as f:
        return [line.strip('\n') for line in f]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import string\n", | |
"from collections import defaultdict\n", | |
"\n", | |
"import nltk\n", | |
"import gensim" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"class NgramIterator:\n", | |
" def __init__(self, doc, n=2):\n", | |
" self.__doc = doc\n", | |
" self.__n = n\n", | |
"\n", | |
" self.__i = 0\n", | |
" self.__doc_len = len(doc)\n", | |
" self.__ngram_count = self.__doc_len - n + 1\n", | |
"\n", | |
" def __iter__(self):\n", | |
" return self\n", | |
"\n", | |
" def __next__(self):\n", | |
" if self.__i < self.__ngram_count:\n", | |
" i = self.__i\n", | |
" self.__i += 1\n", | |
" result = []\n", | |
" for j in range(self.__n):\n", | |
" result.append(self.__doc[i+j])\n", | |
" return result\n", | |
" else:\n", | |
" self.__i = 0\n", | |
" raise StopIteration()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%run common.py" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"doc = load_file(problem4_text_filename)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# tokens = [t.lower() for t in nltk.word_tokenize(doc)]\n", | |
"tokens = [t.lower() for t in doc.replace('\\n', ' ').split(' ')]\n", | |
"# tokens = [t for t in doc.replace('\\n', ' ').split(' ')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 89, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# clean_tokens = [t for t in tokens if t not in string.punctuation]\n", | |
"clean_tokens = tokens" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 90, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# clean_tokens.insert(0, '<start>')\n", | |
"# clean_tokens.append('<end>')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 91, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"226" | |
] | |
}, | |
"execution_count": 91, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(clean_tokens)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 92, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"149" | |
] | |
}, | |
"execution_count": 92, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab = set(clean_tokens)\n", | |
"len(vocab)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 93, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# count bigrams\n", | |
"bigrams = defaultdict(int)\n", | |
"for bigram in NgramIterator(clean_tokens):\n", | |
" key = '_'.join(bigram)\n", | |
" bigrams[key] += 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('of_war', 2),\n", | |
" ('that_could', 3),\n", | |
" ('which_could', 2),\n", | |
" ('in_a', 2),\n", | |
" (',_\"', 2),\n", | |
" ('._the', 3),\n", | |
" ('could_be', 2)]" | |
] | |
}, | |
"execution_count": 94, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"[(bk,bigrams[bk]) for bk in bigrams.keys() if bigrams[bk] > 1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_bigram_prob(wi_1, wi, bigrams, vocab):\n", | |
" numerator = bigrams[wi_1+'_'+wi]\n", | |
" denominator = sum([bigrams[wi_1+'_'+wj] for wj in vocab])\n", | |
" \n", | |
" if numerator == 0:\n", | |
" return 0\n", | |
" else:\n", | |
" return numerator/denominator" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_bigram_prob_with_smoothing(wi_1, wi, smoothing, bigrams, vocab):\n", | |
" numerator = smoothing + bigrams[wi_1+'_'+wi]\n", | |
" denominator = (len(vocab) * smoothing) + sum([bigrams[wi_1+'_'+wj] for wj in vocab])\n", | |
" \n", | |
" return numerator/denominator" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.75\n", | |
"0.3577331759149941\n" | |
] | |
} | |
], | |
"source": [ | |
"print(get_bigram_prob('that', 'could', bigrams, vocab))\n", | |
"print(get_bigram_prob_with_smoothing('that', 'could', 0.03, bigrams, vocab))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['we', 'seek', 'a', 'solution', 'that', 'could', 'be', 'accepted', 'by', 'both', 'sides', '.']\n" | |
] | |
} | |
], | |
"source": [ | |
"test_phrase = 'We seek a solution that could be accepted by both sides .'\n", | |
"test_phrase_tokens = [t.lower() for t in test_phrase.split(' ')]\n", | |
"# test_phrase_tokens = [t for t in test_phrase.split(' ')]\n", | |
"print(test_phrase_tokens)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.000390625\n" | |
] | |
} | |
], | |
"source": [ | |
"result_prob = 1\n", | |
"for bigram in NgramIterator(test_phrase_tokens):\n", | |
" result_prob *= get_bigram_prob(bigram[0], bigram[1], bigrams, vocab)\n", | |
"print(result_prob)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.860849507990039e-09\n" | |
] | |
} | |
], | |
"source": [ | |
"result_prob = 1\n", | |
"smoothing = 0.03\n", | |
"for bigram in NgramIterator(test_phrase_tokens):\n", | |
" result_prob *= get_bigram_prob_with_smoothing(bigram[0], bigram[1], smoothing, bigrams, vocab)\n", | |
"print(result_prob)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4.47" | |
] | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(vocab) * smoothing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment