Skip to content

Instantly share code, notes, and snippets.

@p208p2002
Last active March 25, 2022 07:31
Show Gist options
  • Save p208p2002/b9750b60d6b8702ef41f4ae3581ff483 to your computer and use it in GitHub Desktop.
Save p208p2002/b9750b60d6b8702ef41f4ae3581ff483 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.corpus import brown\n",
"from nltk import WordNetLemmatizer\n",
"from math import log"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Shared WordNet lemmatizer instance used to normalize tokens.\n",
"wnl = WordNetLemmatizer()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"FreqDist({'the': 6386, ',': 5188, '.': 4030, 'of': 2861, 'a': 2647, 'and': 2186, 'to': 2144, 'in': 2020, 'for': 969, 'that': 829, ...})"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count every lowercased, lemmatized token of the Brown 'news' category.\n",
"_Fdist = nltk.FreqDist(\n",
"    wnl.lemmatize(token.lower())\n",
"    for token in brown.words(categories='news')\n",
")\n",
"_Fdist"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['the',\n",
" 'fulton',\n",
" 'county',\n",
" 'grand',\n",
" 'jury',\n",
" 'said',\n",
" 'friday',\n",
" 'an',\n",
" 'investigation',\n",
" 'of',\n",
" \"atlanta's\",\n",
" 'recent',\n",
" 'primary',\n",
" 'election',\n",
" 'produced',\n",
" '``',\n",
" 'no',\n",
" 'evidence',\n",
" \"''\",\n",
" 'that',\n",
" 'any',\n",
" 'irregularity',\n",
" 'took',\n",
" 'place',\n",
" '.']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each sentence of the Brown 'news' category as a list of lowercased,\n",
"# lemmatized tokens.\n",
"_Sents = [\n",
"    [wnl.lemmatize(token.lower()) for token in sentence]\n",
"    for sentence in brown.sents(categories='news')\n",
"]\n",
"_Sents[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def p(x):\n",
"    \"\"\"Return the unigram probability of token ``x`` in ``_Fdist``.\n",
"\n",
"    The count of ``x`` is divided by the total number of counted tokens\n",
"    (``_Fdist.N()``). Dividing by ``len(_Fdist)`` would use the number of\n",
"    distinct tokens instead, which does not yield a probability.\n",
"\n",
"    Returns 0 for a token that never occurs or for an empty distribution.\n",
"    \"\"\"\n",
"    return _Fdist.freq(x)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.011248107289638763"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def p_both(words: list):\n",
"    \"\"\"Return the fraction of sentences in ``_Sents`` containing every word.\n",
"\n",
"    Args:\n",
"        words: tokens that must all occur in a sentence for it to count.\n",
"\n",
"    Returns:\n",
"        Number of matching sentences divided by ``len(_Sents)``, or 0.0\n",
"        when ``_Sents`` is empty.\n",
"    \"\"\"\n",
"    if not _Sents:\n",
"        return 0.0\n",
"\n",
"    # Compare whole tokens: a substring test on the joined sentence would\n",
"    # wrongly treat 'new' as present in a sentence that only has 'newspaper'.\n",
"    wanted = set(words)\n",
"    count_both = sum(1 for sent in _Sents if wanted.issubset(sent))\n",
"\n",
"    return count_both/len(_Sents)\n",
"\n",
"p_both([\"new\",\"york\"])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6.9879045123493855"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def pmi(words: list):\n",
"    \"\"\"Return the base-2 pointwise mutual information of ``words``.\n",
"\n",
"    Computed as ``log(p_both(words) / product of p(w) for w in words, 2)``.\n",
"\n",
"    Args:\n",
"        words: tokens whose association is measured.\n",
"\n",
"    Returns:\n",
"        The PMI value, or 0.0 when it is undefined: ``words`` is empty, the\n",
"        product of the word probabilities is zero, or the words never\n",
"        occur together in a sentence.\n",
"    \"\"\"\n",
"    if not words:\n",
"        return 0.0\n",
"\n",
"    # Product of the individual word probabilities.\n",
"    p_words_cm = 1.0\n",
"    for word in words:\n",
"        p_words_cm *= p(word)\n",
"\n",
"    # p_both scans every sentence, so evaluate it only once.\n",
"    joint = p_both(words)\n",
"\n",
"    # Guard the division (zero product) and the logarithm (zero joint).\n",
"    if p_words_cm == 0.0 or joint == 0.0:\n",
"        return 0.0\n",
"\n",
"    return log(joint/p_words_cm, 2)\n",
"\n",
"\n",
"pmi([\"new\",\"york\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment