p208p2002 · March 25, 2022 07:31
diff --git a/Pointwise Mutual Information (PMI).ipynb b/Pointwise Mutual Information (PMI).ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.corpus import brown\n",
    "from nltk import WordNetLemmatizer\n",
    "from math import log"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "wnl=WordNetLemmatizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FreqDist({'the': 6386, ',': 5188, '.': 4030, 'of': 2861, 'a': 2647, 'and': 2186, 'to': 2144, 'in': 2020, 'for': 969, 'that': 829, ...})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "_Fdist = nltk.FreqDist([wnl.lemmatize(w.lower()) for w in brown.words(categories='news')])\n",
    "_Fdist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the',\n",
       " 'fulton',\n",
       " 'county',\n",
       " 'grand',\n",
       " 'jury',\n",
       " 'said',\n",
       " 'friday',\n",
       " 'an',\n",
       " 'investigation',\n",
       " 'of',\n",
       " \"atlanta's\",\n",
       " 'recent',\n",
       " 'primary',\n",
       " 'election',\n",
       " 'produced',\n",
       " '``',\n",
       " 'no',\n",
       " 'evidence',\n",
       " \"''\",\n",
       " 'that',\n",
       " 'any',\n",
       " 'irregularity',\n",
       " 'took',\n",
       " 'place',\n",
       " '.']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "_Sents = [[wnl.lemmatize(j.lower()) for j in i] for i in brown.sents(categories='news')]\n",
    "_Sents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def p(x):\n",
    "    return _Fdist[x]/len(_Fdist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.011248107289638763"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def p_both(words:list):\n",
    "    count_both = 0\n",
    "    for s in _Sents:\n",
    "        s = \" \".join(s)\n",
    "        all_in_flag = True\n",
    "        for w in words:\n",
    "            if w not in s:\n",
    "                all_in_flag = False\n",
    "                break\n",
    "        if all_in_flag:\n",
    "            count_both += 1\n",
    "    \n",
    "    return count_both/len(_Sents)\n",
    "\n",
    "p_both([\"new\",\"york\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6.9879045123493855"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def pmi(words:list):\n",
    "    p_words = [p(word) for word in words]\n",
    "    p_words_cm = 1.0\n",
    "    for pw in p_words:\n",
    "        p_words_cm *= pw\n",
    "    \n",
    "    if p_words_cm == 1.0 or p_both(words) == 0.0:\n",
    "        return 0.0\n",
    "              \n",
    "    return log(p_both(words)/p_words_cm ,2)\n",
    "    \n",
    "\n",
    "pmi([\"new\",\"york\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import nltk\n",
	"from nltk.corpus import brown\n",
	"from nltk import WordNetLemmatizer\n",
	"from math import log"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"wnl=WordNetLemmatizer()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"FreqDist({'the': 6386, ',': 5188, '.': 4030, 'of': 2861, 'a': 2647, 'and': 2186, 'to': 2144, 'in': 2020, 'for': 969, 'that': 829, ...})"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"_Fdist = nltk.FreqDist([wnl.lemmatize(w.lower()) for w in brown.words(categories='news')])\n",
	"_Fdist"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['the',\n",
	" 'fulton',\n",
	" 'county',\n",
	" 'grand',\n",
	" 'jury',\n",
	" 'said',\n",
	" 'friday',\n",
	" 'an',\n",
	" 'investigation',\n",
	" 'of',\n",
	" \"atlanta's\",\n",
	" 'recent',\n",
	" 'primary',\n",
	" 'election',\n",
	" 'produced',\n",
	" '``',\n",
	" 'no',\n",
	" 'evidence',\n",
	" \"''\",\n",
	" 'that',\n",
	" 'any',\n",
	" 'irregularity',\n",
	" 'took',\n",
	" 'place',\n",
	" '.']"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"_Sents = [[wnl.lemmatize(j.lower()) for j in i] for i in brown.sents(categories='news')]\n",
	"_Sents[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"def p(x):\n",
	" return _Fdist[x]/len(_Fdist)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.011248107289638763"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def p_both(words:list):\n",
	" count_both = 0\n",
	" for s in _Sents:\n",
	" s = \" \".join(s)\n",
	" all_in_flag = True\n",
	" for w in words:\n",
	" if w not in s:\n",
	" all_in_flag = False\n",
	" break\n",
	" if all_in_flag:\n",
	" count_both += 1\n",
	" \n",
	" return count_both/len(_Sents)\n",
	"\n",
	"p_both([\"new\",\"york\"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"6.9879045123493855"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def pmi(words:list):\n",
	" p_words = [p(word) for word in words]\n",
	" p_words_cm = 1.0\n",
	" for pw in p_words:\n",
	" p_words_cm *= pw\n",
	" \n",
	" if p_words_cm == 1.0 or p_both(words) == 0.0:\n",
	" return 0.0\n",
	" \n",
	" return log(p_both(words)/p_words_cm ,2)\n",
	" \n",
	"\n",
	"pmi([\"new\",\"york\"])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# "
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}