Skip to content

Instantly share code, notes, and snippets.

@behitek
Last active January 16, 2019 06:06
Show Gist options
  • Save behitek/2c5429533d96a43406afc3722acba85e to your computer and use it in GitHub Desktop.
Save behitek/2c5429533d96a43406afc3722acba85e to your computer and use it in GitHub Desktop.
Ví dụ tính tf-idf với python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Toy corpus: two short Vietnamese sentences, one per document.\n",
"docA, docB = (\n",
"    \"bây giờ mận mới hỏi đào\",\n",
"    \"vườn hồng có lối ai vào hay chưa\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Tokenise both documents by splitting on whitespace.\n",
"wordsA, wordsB = (doc.split() for doc in (docA, docB))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bây', 'giờ', 'mận', 'mới', 'hỏi', 'đào']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordsA"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['vườn', 'hồng', 'có', 'lối', 'ai', 'vào', 'hay', 'chưa']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordsB"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Shared vocabulary: every distinct token that occurs in either document.\n",
"wordSet = set(wordsA) | set(wordsB)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ai',\n",
" 'bây',\n",
" 'chưa',\n",
" 'có',\n",
" 'giờ',\n",
" 'hay',\n",
" 'hỏi',\n",
" 'hồng',\n",
" 'lối',\n",
" 'mận',\n",
" 'mới',\n",
" 'vào',\n",
" 'vườn',\n",
" 'đào'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordSet"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# One zero-initialised counter per document, keyed by the shared vocabulary.\n",
"wordDictA = {word: 0 for word in wordSet}\n",
"wordDictB = {word: 0 for word in wordSet}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'bây': 0,\n",
" 'giờ': 0,\n",
" 'hồng': 0,\n",
" 'ai': 0,\n",
" 'lối': 0,\n",
" 'mới': 0,\n",
" 'vườn': 0,\n",
" 'hay': 0,\n",
" 'chưa': 0,\n",
" 'hỏi': 0,\n",
" 'có': 0,\n",
" 'vào': 0,\n",
" 'đào': 0,\n",
" 'mận': 0}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordDictA"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Rebuild the counters from zero before counting, so re-running this cell\n",
"# never double-counts (the cell is idempotent).\n",
"wordDictA = dict.fromkeys(wordSet, 0)\n",
"wordDictB = dict.fromkeys(wordSet, 0)\n",
"\n",
"for word in wordsA:\n",
"    wordDictA[word] += 1\n",
"\n",
"for word in wordsB:\n",
"    wordDictB[word] += 1"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'bây': 1,\n",
" 'giờ': 1,\n",
" 'hồng': 0,\n",
" 'ai': 0,\n",
" 'lối': 0,\n",
" 'mới': 1,\n",
" 'vườn': 0,\n",
" 'hay': 0,\n",
" 'chưa': 0,\n",
" 'hỏi': 1,\n",
" 'có': 0,\n",
" 'vào': 0,\n",
" 'đào': 1,\n",
" 'mận': 1}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordDictA"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ai</th>\n",
" <th>bây</th>\n",
" <th>chưa</th>\n",
" <th>có</th>\n",
" <th>giờ</th>\n",
" <th>hay</th>\n",
" <th>hỏi</th>\n",
" <th>hồng</th>\n",
" <th>lối</th>\n",
" <th>mận</th>\n",
" <th>mới</th>\n",
" <th>vào</th>\n",
" <th>vườn</th>\n",
" <th>đào</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ai bây chưa có giờ hay hỏi hồng lối mận mới vào vườn đào\n",
"0 0 1 0 0 1 0 1 0 0 1 1 0 0 1\n",
"1 1 0 1 1 0 1 0 1 1 0 0 1 1 0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# One row per document, one column per vocabulary word (raw counts).\n",
"countRows = [wordDictA, wordDictB]\n",
"pd.DataFrame(countRows)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def computeTF(wordDict, words):\n",
"    \"\"\"Compute the term frequency of every vocabulary word in one document.\n",
"\n",
"    Args:\n",
"        wordDict: dict mapping word -> raw count of that word in the document.\n",
"        words: the document's token list; only its length is used.\n",
"\n",
"    Returns:\n",
"        dict mapping each key of ``wordDict`` to ``count / len(words)``.\n",
"        An empty document yields 0.0 for every word instead of raising\n",
"        ZeroDivisionError.\n",
"    \"\"\"\n",
"    wordsCount = len(words)\n",
"    if wordsCount == 0:\n",
"        # No tokens at all: every frequency is zero.\n",
"        return dict.fromkeys(wordDict, 0.0)\n",
"    return {word: count / float(wordsCount) for word, count in wordDict.items()}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Term frequencies for each document, from its counter and its token list.\n",
"tfdocA, tfdocB = (\n",
"    computeTF(counts, tokens)\n",
"    for counts, tokens in ((wordDictA, wordsA), (wordDictB, wordsB))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'bây': 0.16666666666666666,\n",
" 'giờ': 0.16666666666666666,\n",
" 'hồng': 0.0,\n",
" 'ai': 0.0,\n",
" 'lối': 0.0,\n",
" 'mới': 0.16666666666666666,\n",
" 'vườn': 0.0,\n",
" 'hay': 0.0,\n",
" 'chưa': 0.0,\n",
" 'hỏi': 0.16666666666666666,\n",
" 'có': 0.0,\n",
" 'vào': 0.0,\n",
" 'đào': 0.16666666666666666,\n",
" 'mận': 0.16666666666666666}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfdocA"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'bây': 0.0,\n",
" 'giờ': 0.0,\n",
" 'hồng': 0.125,\n",
" 'ai': 0.125,\n",
" 'lối': 0.125,\n",
" 'mới': 0.0,\n",
" 'vườn': 0.125,\n",
" 'hay': 0.125,\n",
" 'chưa': 0.125,\n",
" 'hỏi': 0.0,\n",
" 'có': 0.125,\n",
" 'vào': 0.125,\n",
" 'đào': 0.0,\n",
" 'mận': 0.0}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfdocB"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def computeIDF(docList):\n",
"    \"\"\"Compute the inverse document frequency of every word in a corpus.\n",
"\n",
"    Args:\n",
"        docList: list of dicts, one per document, mapping word -> count.\n",
"\n",
"    Returns:\n",
"        dict mapping word -> log10(N / df), where N is the number of\n",
"        documents and df is the number of documents in which the word's\n",
"        count is greater than zero. A word with df == 0 gets 0.0, and an\n",
"        empty ``docList`` returns ``{}``.\n",
"    \"\"\"\n",
"    import math\n",
"\n",
"    N = len(docList)\n",
"\n",
"    # Count document frequency over the union of all vocabularies, so a word\n",
"    # that is absent from the first document does not raise KeyError.\n",
"    docFreq = {}\n",
"    for doc in docList:\n",
"        for word, val in doc.items():\n",
"            docFreq.setdefault(word, 0)\n",
"            if val > 0:\n",
"                docFreq[word] += 1\n",
"\n",
"    # df == 0 would divide by zero; such a word carries no weight, so use 0.0.\n",
"    return {\n",
"        word: math.log10(N / float(df)) if df > 0 else 0.0\n",
"        for word, df in docFreq.items()\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# IDF is computed once over the whole corpus (both count dictionaries).\n",
"idfs = computeIDF(docList=[wordDictA, wordDictB])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'bây': 0.3010299956639812,\n",
" 'giờ': 0.3010299956639812,\n",
" 'hồng': 0.3010299956639812,\n",
" 'ai': 0.3010299956639812,\n",
" 'lối': 0.3010299956639812,\n",
" 'mới': 0.3010299956639812,\n",
" 'vườn': 0.3010299956639812,\n",
" 'hay': 0.3010299956639812,\n",
" 'chưa': 0.3010299956639812,\n",
" 'hỏi': 0.3010299956639812,\n",
" 'có': 0.3010299956639812,\n",
" 'vào': 0.3010299956639812,\n",
" 'đào': 0.3010299956639812,\n",
" 'mận': 0.3010299956639812}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idfs"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def computeTFIDF(tfDocs, idfs):\n",
"    \"\"\"Weight each term frequency by the matching inverse document frequency.\n",
"\n",
"    Args:\n",
"        tfDocs: dict mapping word -> term frequency for one document.\n",
"        idfs: dict mapping word -> IDF; it must contain every key of\n",
"            ``tfDocs``, otherwise KeyError is raised.\n",
"\n",
"    Returns:\n",
"        dict mapping word -> tf * idf, with the keys of ``tfDocs``.\n",
"    \"\"\"\n",
"    return {term: freq * idfs[term] for term, freq in tfDocs.items()}"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Combine each document's term frequencies with the corpus-wide IDF values.\n",
"tfidfDocA, tfidfDocB = (computeTFIDF(tf, idfs) for tf in (tfdocA, tfdocB))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ai</th>\n",
" <th>bây</th>\n",
" <th>chưa</th>\n",
" <th>có</th>\n",
" <th>giờ</th>\n",
" <th>hay</th>\n",
" <th>hỏi</th>\n",
" <th>hồng</th>\n",
" <th>lối</th>\n",
" <th>mận</th>\n",
" <th>mới</th>\n",
" <th>vào</th>\n",
" <th>vườn</th>\n",
" <th>đào</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.000000</td>\n",
" <td>0.050172</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.050172</td>\n",
" <td>0.000000</td>\n",
" <td>0.050172</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.050172</td>\n",
" <td>0.050172</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.050172</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.037629</td>\n",
" <td>0.000000</td>\n",
" <td>0.037629</td>\n",
" <td>0.037629</td>\n",
" <td>0.000000</td>\n",
" <td>0.037629</td>\n",
" <td>0.000000</td>\n",
" <td>0.037629</td>\n",
" <td>0.037629</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.037629</td>\n",
" <td>0.037629</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ai bây chưa có giờ hay hỏi \\\n",
"0 0.000000 0.050172 0.000000 0.000000 0.050172 0.000000 0.050172 \n",
"1 0.037629 0.000000 0.037629 0.037629 0.000000 0.037629 0.000000 \n",
"\n",
" hồng lối mận mới vào vườn đào \n",
"0 0.000000 0.000000 0.050172 0.050172 0.000000 0.000000 0.050172 \n",
"1 0.037629 0.037629 0.000000 0.000000 0.037629 0.037629 0.000000 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# One row per document, one column per vocabulary word (TF-IDF weights).\n",
"tfidfRows = [tfidfDocA, tfidfDocB]\n",
"pd.DataFrame(tfidfRows)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment