Last active
March 25, 2022 07:31
-
-
Save p208p2002/b9750b60d6b8702ef41f4ae3581ff483 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import nltk\n", | |
"from nltk.corpus import brown\n", | |
"from nltk import WordNetLemmatizer\n", | |
"from math import log" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"wnl=WordNetLemmatizer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"FreqDist({'the': 6386, ',': 5188, '.': 4030, 'of': 2861, 'a': 2647, 'and': 2186, 'to': 2144, 'in': 2020, 'for': 969, 'that': 829, ...})" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"_Fdist = nltk.FreqDist([wnl.lemmatize(w.lower()) for w in brown.words(categories='news')])\n", | |
"_Fdist" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['the',\n", | |
" 'fulton',\n", | |
" 'county',\n", | |
" 'grand',\n", | |
" 'jury',\n", | |
" 'said',\n", | |
" 'friday',\n", | |
" 'an',\n", | |
" 'investigation',\n", | |
" 'of',\n", | |
" \"atlanta's\",\n", | |
" 'recent',\n", | |
" 'primary',\n", | |
" 'election',\n", | |
" 'produced',\n", | |
" '``',\n", | |
" 'no',\n", | |
" 'evidence',\n", | |
" \"''\",\n", | |
" 'that',\n", | |
" 'any',\n", | |
" 'irregularity',\n", | |
" 'took',\n", | |
" 'place',\n", | |
" '.']" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"_Sents = [[wnl.lemmatize(j.lower()) for j in i] for i in brown.sents(categories='news')]\n", | |
"_Sents[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def p(x):\n", | |
" return _Fdist[x]/len(_Fdist)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.011248107289638763" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def p_both(words:list):\n", | |
" count_both = 0\n", | |
" for s in _Sents:\n", | |
" s = \" \".join(s)\n", | |
" all_in_flag = True\n", | |
" for w in words:\n", | |
" if w not in s:\n", | |
" all_in_flag = False\n", | |
" break\n", | |
" if all_in_flag:\n", | |
" count_both += 1\n", | |
" \n", | |
" return count_both/len(_Sents)\n", | |
"\n", | |
"p_both([\"new\",\"york\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"6.9879045123493855" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def pmi(words:list):\n", | |
" p_words = [p(word) for word in words]\n", | |
" p_words_cm = 1.0\n", | |
" for pw in p_words:\n", | |
" p_words_cm *= pw\n", | |
" \n", | |
" if p_words_cm == 1.0 or p_both(words) == 0.0:\n", | |
" return 0.0\n", | |
" \n", | |
" return log(p_both(words)/p_words_cm ,2)\n", | |
" \n", | |
"\n", | |
"pmi([\"new\",\"york\"])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# " | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment