Skip to content

Instantly share code, notes, and snippets.

@jgc128
Last active October 17, 2015 01:42
Show Gist options
  • Save jgc128/75964e86b05d760b8158 to your computer and use it in GitHub Desktop.
Code for homework 1 (2015 Fall NLP Class)
import os
# Root directory holding the corpus files for homework 1 (NLP class, Fall 2015).
base_dir = '/data1/aromanov/study/2015_fall/nlp/homeworks/hw1/'
# Excerpt of the Brown/Brit corpus used in the earlier problems.
brit3_filename = os.path.join(base_dir, 'brit3-excerpt.txt')
# Same excerpt with manual annotations/markings.
brit3_marked_filename = os.path.join(base_dir, 'brit3-excerpt-marked.txt')
# Text used by the bigram-model experiments in problem 4 (loaded by the notebook).
problem4_text_filename = os.path.join(base_dir, 'problem4.txt')
def load_documents_from_dir(directory):
    """Read every regular file in *directory* and return their contents.

    Args:
        directory: path to a directory of plain-text files.

    Returns:
        list of str — one element per file, in ``os.listdir`` order.
    """
    paths = [os.path.join(directory, name) for name in os.listdir(directory)]
    docs = []
    for path in paths:
        # os.listdir also yields subdirectories; open() on one raises
        # IsADirectoryError, so skip anything that is not a regular file.
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as f:
            docs.append(f.read())
    return docs
def load_file(filename):
    """Return the entire contents of *filename* as a single string."""
    with open(filename, 'r') as handle:
        return handle.read()
def load_file_lines(filename):
    """Return the lines of *filename* as a list, newline characters removed."""
    stripped = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            stripped.append(raw_line.strip('\n'))
    return stripped
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import string\n",
"from collections import defaultdict\n",
"\n",
"import nltk\n",
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class NgramIterator:\n",
" def __init__(self, doc, n=2):\n",
" self.__doc = doc\n",
" self.__n = n\n",
"\n",
" self.__i = 0\n",
" self.__doc_len = len(doc)\n",
" self.__ngram_count = self.__doc_len - n + 1\n",
"\n",
" def __iter__(self):\n",
" return self\n",
"\n",
" def __next__(self):\n",
" if self.__i < self.__ngram_count:\n",
" i = self.__i\n",
" self.__i += 1\n",
" result = []\n",
" for j in range(self.__n):\n",
" result.append(self.__doc[i+j])\n",
" return result\n",
" else:\n",
" self.__i = 0\n",
" raise StopIteration()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%run common.py"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"doc = load_file(problem4_text_filename)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# tokens = [t.lower() for t in nltk.word_tokenize(doc)]\n",
"tokens = [t.lower() for t in doc.replace('\\n', ' ').split(' ')]\n",
"# tokens = [t for t in doc.replace('\\n', ' ').split(' ')]"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# clean_tokens = [t for t in tokens if t not in string.punctuation]\n",
"clean_tokens = tokens"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# clean_tokens.insert(0, '<start>')\n",
"# clean_tokens.append('<end>')"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"226"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(clean_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"149"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab = set(clean_tokens)\n",
"len(vocab)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# count bigrams\n",
"bigrams = defaultdict(int)\n",
"for bigram in NgramIterator(clean_tokens):\n",
" key = '_'.join(bigram)\n",
" bigrams[key] += 1"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('of_war', 2),\n",
" ('that_could', 3),\n",
" ('which_could', 2),\n",
" ('in_a', 2),\n",
" (',_\"', 2),\n",
" ('._the', 3),\n",
" ('could_be', 2)]"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[(bk,bigrams[bk]) for bk in bigrams.keys() if bigrams[bk] > 1]"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_bigram_prob(wi_1, wi, bigrams, vocab):\n",
" numerator = bigrams[wi_1+'_'+wi]\n",
" denominator = sum([bigrams[wi_1+'_'+wj] for wj in vocab])\n",
" \n",
" if numerator == 0:\n",
" return 0\n",
" else:\n",
" return numerator/denominator"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_bigram_prob_with_smoothing(wi_1, wi, smoothing, bigrams, vocab):\n",
" numerator = smoothing + bigrams[wi_1+'_'+wi]\n",
" denominator = (len(vocab) * smoothing) + sum([bigrams[wi_1+'_'+wj] for wj in vocab])\n",
" \n",
" return numerator/denominator"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.75\n",
"0.3577331759149941\n"
]
}
],
"source": [
"print(get_bigram_prob('that', 'could', bigrams, vocab))\n",
"print(get_bigram_prob_with_smoothing('that', 'could', 0.03, bigrams, vocab))"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['we', 'seek', 'a', 'solution', 'that', 'could', 'be', 'accepted', 'by', 'both', 'sides', '.']\n"
]
}
],
"source": [
"test_phrase = 'We seek a solution that could be accepted by both sides .'\n",
"test_phrase_tokens = [t.lower() for t in test_phrase.split(' ')]\n",
"# test_phrase_tokens = [t for t in test_phrase.split(' ')]\n",
"print(test_phrase_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.000390625\n"
]
}
],
"source": [
"result_prob = 1\n",
"for bigram in NgramIterator(test_phrase_tokens):\n",
" result_prob *= get_bigram_prob(bigram[0], bigram[1], bigrams, vocab)\n",
"print(result_prob)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.860849507990039e-09\n"
]
}
],
"source": [
"result_prob = 1\n",
"smoothing = 0.03\n",
"for bigram in NgramIterator(test_phrase_tokens):\n",
" result_prob *= get_bigram_prob_with_smoothing(bigram[0], bigram[1], smoothing, bigrams, vocab)\n",
"print(result_prob)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4.47"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(vocab) * smoothing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment