Last active
January 16, 2019 06:06
-
-
Save behitek/2c5429533d96a43406afc3722acba85e to your computer and use it in GitHub Desktop.
Ví dụ tính tf-idf với python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"docA = \"bây giờ mận mới hỏi đào\"\n", | |
"docB = \"vườn hồng có lối ai vào hay chưa\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"wordsA = docA.split()\n", | |
"wordsB = docB.split()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['bây', 'giờ', 'mận', 'mới', 'hỏi', 'đào']" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wordsA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['vườn', 'hồng', 'có', 'lối', 'ai', 'vào', 'hay', 'chưa']" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wordsB" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"wordSet = set(wordsA).union(set(wordsB))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'ai',\n", | |
" 'bây',\n", | |
" 'chưa',\n", | |
" 'có',\n", | |
" 'giờ',\n", | |
" 'hay',\n", | |
" 'hỏi',\n", | |
" 'hồng',\n", | |
" 'lối',\n", | |
" 'mận',\n", | |
" 'mới',\n", | |
" 'vào',\n", | |
" 'vườn',\n", | |
" 'đào'}" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wordSet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"wordDictA = dict.fromkeys(wordSet, 0) \n", | |
"wordDictB = dict.fromkeys(wordSet, 0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'bây': 0,\n", | |
" 'giờ': 0,\n", | |
" 'hồng': 0,\n", | |
" 'ai': 0,\n", | |
" 'lối': 0,\n", | |
" 'mới': 0,\n", | |
" 'vườn': 0,\n", | |
" 'hay': 0,\n", | |
" 'chưa': 0,\n", | |
" 'hỏi': 0,\n", | |
" 'có': 0,\n", | |
" 'vào': 0,\n", | |
" 'đào': 0,\n", | |
" 'mận': 0}" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wordDictA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for word in wordsA:\n", | |
" wordDictA[word]+=1\n", | |
" \n", | |
"for word in wordsB:\n", | |
" wordDictB[word]+=1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'bây': 1,\n", | |
" 'giờ': 1,\n", | |
" 'hồng': 0,\n", | |
" 'ai': 0,\n", | |
" 'lối': 0,\n", | |
" 'mới': 1,\n", | |
" 'vườn': 0,\n", | |
" 'hay': 0,\n", | |
" 'chưa': 0,\n", | |
" 'hỏi': 1,\n", | |
" 'có': 0,\n", | |
" 'vào': 0,\n", | |
" 'đào': 1,\n", | |
" 'mận': 1}" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wordDictA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ai</th>\n", | |
" <th>bây</th>\n", | |
" <th>chưa</th>\n", | |
" <th>có</th>\n", | |
" <th>giờ</th>\n", | |
" <th>hay</th>\n", | |
" <th>hỏi</th>\n", | |
" <th>hồng</th>\n", | |
" <th>lối</th>\n", | |
" <th>mận</th>\n", | |
" <th>mới</th>\n", | |
" <th>vào</th>\n", | |
" <th>vườn</th>\n", | |
" <th>đào</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ai bây chưa có giờ hay hỏi hồng lối mận mới vào vườn đào\n", | |
"0 0 1 0 0 1 0 1 0 0 1 1 0 0 1\n", | |
"1 1 0 1 1 0 1 0 1 1 0 0 1 1 0" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"pd.DataFrame([wordDictA, wordDictB])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def computeTF(wordDict, words):\n", | |
" tfDict = {}\n", | |
" wordsCount = len(words)\n", | |
" for word, count in wordDict.items():\n", | |
" tfDict[word] = count/float(wordsCount)\n", | |
" return tfDict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tfdocA = computeTF(wordDictA, wordsA)\n", | |
"tfdocB = computeTF(wordDictB, wordsB)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'bây': 0.16666666666666666,\n", | |
" 'giờ': 0.16666666666666666,\n", | |
" 'hồng': 0.0,\n", | |
" 'ai': 0.0,\n", | |
" 'lối': 0.0,\n", | |
" 'mới': 0.16666666666666666,\n", | |
" 'vườn': 0.0,\n", | |
" 'hay': 0.0,\n", | |
" 'chưa': 0.0,\n", | |
" 'hỏi': 0.16666666666666666,\n", | |
" 'có': 0.0,\n", | |
" 'vào': 0.0,\n", | |
" 'đào': 0.16666666666666666,\n", | |
" 'mận': 0.16666666666666666}" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tfdocA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'bây': 0.0,\n", | |
" 'giờ': 0.0,\n", | |
" 'hồng': 0.125,\n", | |
" 'ai': 0.125,\n", | |
" 'lối': 0.125,\n", | |
" 'mới': 0.0,\n", | |
" 'vườn': 0.125,\n", | |
" 'hay': 0.125,\n", | |
" 'chưa': 0.125,\n", | |
" 'hỏi': 0.0,\n", | |
" 'có': 0.125,\n", | |
" 'vào': 0.125,\n", | |
" 'đào': 0.0,\n", | |
" 'mận': 0.0}" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tfdocB" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def computeIDF(docList):\n", | |
" import math\n", | |
" idfDict = {}\n", | |
" N = len(docList)\n", | |
" \n", | |
" idfDict = dict.fromkeys(docList[0].keys(), 0)\n", | |
" for doc in docList:\n", | |
" for word, val in doc.items():\n", | |
" if val > 0:\n", | |
" idfDict[word] += 1\n", | |
" \n", | |
" for word, val in idfDict.items():\n", | |
" idfDict[word] = math.log10(N / float(val))\n", | |
" \n", | |
" return idfDict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"idfs = computeIDF([wordDictA, wordDictB])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'bây': 0.3010299956639812,\n", | |
" 'giờ': 0.3010299956639812,\n", | |
" 'hồng': 0.3010299956639812,\n", | |
" 'ai': 0.3010299956639812,\n", | |
" 'lối': 0.3010299956639812,\n", | |
" 'mới': 0.3010299956639812,\n", | |
" 'vườn': 0.3010299956639812,\n", | |
" 'hay': 0.3010299956639812,\n", | |
" 'chưa': 0.3010299956639812,\n", | |
" 'hỏi': 0.3010299956639812,\n", | |
" 'có': 0.3010299956639812,\n", | |
" 'vào': 0.3010299956639812,\n", | |
" 'đào': 0.3010299956639812,\n", | |
" 'mận': 0.3010299956639812}" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"idfs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def computeTFIDF(tfDocs, idfs):\n", | |
" tfidf = {}\n", | |
" for word, val in tfDocs.items():\n", | |
" tfidf[word] = val*idfs[word]\n", | |
" return tfidf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tfidfDocA = computeTFIDF(tfdocA, idfs)\n", | |
"tfidfDocB = computeTFIDF(tfdocB, idfs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ai</th>\n", | |
" <th>bây</th>\n", | |
" <th>chưa</th>\n", | |
" <th>có</th>\n", | |
" <th>giờ</th>\n", | |
" <th>hay</th>\n", | |
" <th>hỏi</th>\n", | |
" <th>hồng</th>\n", | |
" <th>lối</th>\n", | |
" <th>mận</th>\n", | |
" <th>mới</th>\n", | |
" <th>vào</th>\n", | |
" <th>vườn</th>\n", | |
" <th>đào</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.050172</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.050172</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.050172</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.050172</td>\n", | |
" <td>0.050172</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.050172</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.037629</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ai bây chưa có giờ hay hỏi \\\n", | |
"0 0.000000 0.050172 0.000000 0.000000 0.050172 0.000000 0.050172 \n", | |
"1 0.037629 0.000000 0.037629 0.037629 0.000000 0.037629 0.000000 \n", | |
"\n", | |
" hồng lối mận mới vào vườn đào \n", | |
"0 0.000000 0.000000 0.050172 0.050172 0.000000 0.000000 0.050172 \n", | |
"1 0.037629 0.037629 0.000000 0.000000 0.037629 0.037629 0.000000 " | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"pd.DataFrame([tfidfDocA, tfidfDocB])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment