Skip to content

Instantly share code, notes, and snippets.

@PandoraRiot
Created March 28, 2022 21:34
Show Gist options
  • Save PandoraRiot/b920a56a8e4bd646a90c21d23b660c84 to your computer and use it in GitHub Desktop.
Save PandoraRiot/b920a56a8e4bd646a90c21d23b660c84 to your computer and use it in GitHub Desktop.
03_00_Clasificacion_BW__TF_IDF.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "03_00_Clasificacion_BW__TF_IDF.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/PandoraRiot/b920a56a8e4bd646a90c21d23b660c84/03_00_clasificacion_bw__tf_idf.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4wCFBly4uu9c"
},
"source": [
"import math\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "355mRwx6uyki"
},
"source": [
"# Toy corpus: three short sentences reused by the later cells.\n",
"documentA, documentB, documentC = (\n",
"    'i love dogs',\n",
"    'i hate dogs and knitting',\n",
"    'knitting is my hobby and my passion',\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "GUlaDZXYvC6a"
},
"source": [
"# Tokenize each document by splitting on single spaces.\n",
"bagOfWordsA, bagOfWordsB, bagOfWordsC = (\n",
"    text.split(' ') for text in (documentA, documentB, documentC)\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bTuUh7Hlw84Z"
},
"source": [
"# The same corpus as a bag-of-words count matrix built with scikit-learn.\n",
"\n",
"# Corpus as an array of raw strings\n",
"text_data = np.array([documentA, documentB, documentC])\n",
"\n",
"# Build the document-term count matrix. CountVectorizer's default token\n",
"# pattern only keeps tokens of two or more characters, so 'i' gets no column.\n",
"count = CountVectorizer()\n",
"bag_of_words = count.fit_transform(text_data)\n",
"\n",
"# Column names. get_feature_names() was removed in scikit-learn 1.2, so it\n",
"# is only used on old releases that predate get_feature_names_out().\n",
"if hasattr(count, 'get_feature_names_out'):\n",
"    feature_names = list(count.get_feature_names_out())\n",
"else:\n",
"    feature_names = count.get_feature_names()\n",
"\n",
"# Dense DataFrame: one row per document, one column per term\n",
"df_bw = pd.DataFrame(bag_of_words.toarray(), columns=feature_names)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
""
],
"metadata": {
"id": "WfAmbfvrc5Lq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
""
],
"metadata": {
"id": "fIIinhRac5Ya"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ALHqkk54w_FC",
"outputId": "995bb76c-545b-4d7f-e91b-70c00f693486",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 141
}
},
"source": [
"df_bw"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>and</th>\n",
" <th>dogs</th>\n",
" <th>hate</th>\n",
" <th>hobby</th>\n",
" <th>is</th>\n",
" <th>knitting</th>\n",
" <th>love</th>\n",
" <th>my</th>\n",
" <th>passion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" and dogs hate hobby is knitting love my passion\n",
"0 0 1 0 0 0 0 1 0 0\n",
"1 1 1 1 0 0 1 0 0 0\n",
"2 1 0 0 1 1 1 0 2 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
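{
"cell_type": "markdown",
"metadata": {
"id": "7oDH-A3yNFQx"
},
"source": [
"**TF-IDF** (del inglés *Term Frequency – Inverse Document Frequency*, «frecuencia de término – frecuencia inversa de documento») es una medida numérica de la relevancia de una palabra para un documento dentro de una colección. Referencia: [Tf-idf en Wikipedia](https://es.wikipedia.org/wiki/Tf-idf)."
]
},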
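{
"cell_type": "markdown",
"metadata": {
"id": "BIF2ywCMMzSS"
},
"source": [
"**Frecuencia de término (TF):** número de veces que el término $t$ aparece en el documento $d$ ($f_{t,d}$), dividido entre el total de palabras de $d$.\n",
"\n",
"$$\\mathrm{tf}(t, d) = \\frac{f_{t,d}}{\\sum_{t' \\in d} f_{t',d}}$$"
]
},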
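{
"cell_type": "markdown",
"metadata": {
"id": "oZtyR1PzMzXD"
},
"source": [
"**Frecuencia inversa de documento (IDF):** logaritmo natural del número total de documentos $N$ dividido entre el número de documentos de la colección $D$ que contienen el término $t$.\n",
"\n",
"$$\\mathrm{idf}(t) = \\ln \\frac{N}{|\\{d \\in D : t \\in d\\}|}$$"
]
},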
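{
"cell_type": "markdown",
"metadata": {
"id": "5dbDF-n_Mzbl"
},
"source": [
"**TF-IDF:** producto de la frecuencia de término por la frecuencia inversa de documento.\n",
"\n",
"$$\\mathrm{tfidf}(t, d) = \\mathrm{tf}(t, d) \\cdot \\mathrm{idf}(t)$$"
]
},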
{
"cell_type": "code",
"metadata": {
"id": "6CD2HCqdvGSq"
},
"source": [
"# Vocabulary: every distinct token seen in any of the three documents.\n",
"uniqueWords = set().union(bagOfWordsA, bagOfWordsB, bagOfWordsC)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "YoZVtnH8vMNv",
"outputId": "7d187131-6230-42f9-9379-fd27d7dfd490",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 188
}
},
"source": [
"uniqueWords\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'and',\n",
" 'dogs',\n",
" 'hate',\n",
" 'hobby',\n",
" 'i',\n",
" 'is',\n",
" 'knitting',\n",
" 'love',\n",
" 'my',\n",
" 'passion'}"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "SzbfGGfYvM8_"
},
"source": [
"def countWords(bagOfWords, vocabulary):\n",
"    \"\"\"Count the occurrences of each vocabulary word in one document.\n",
"\n",
"    Args:\n",
"        bagOfWords: list of tokens of the document.\n",
"        vocabulary: iterable of all words to report; every one becomes a\n",
"            key, with 0 when it does not occur in bagOfWords.\n",
"\n",
"    Returns:\n",
"        dict mapping word -> number of occurrences in bagOfWords.\n",
"\n",
"    Raises:\n",
"        KeyError: if bagOfWords contains a token missing from vocabulary.\n",
"    \"\"\"\n",
"    counts = dict.fromkeys(vocabulary, 0)\n",
"    for word in bagOfWords:\n",
"        counts[word] += 1\n",
"    return counts\n",
"\n",
"\n",
"# Word-count dictionaries; all three share the keys of uniqueWords.\n",
"numOfWordsA = countWords(bagOfWordsA, uniqueWords)\n",
"numOfWordsB = countWords(bagOfWordsB, uniqueWords)\n",
"numOfWordsC = countWords(bagOfWordsC, uniqueWords)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Mrip3wk6vakQ",
"outputId": "c9afbbc7-8165-4ffc-9040-f65282fadee0",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 188
}
},
"source": [
"numOfWordsA"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'and': 0,\n",
" 'dogs': 1,\n",
" 'hate': 0,\n",
" 'hobby': 0,\n",
" 'i': 1,\n",
" 'is': 0,\n",
" 'knitting': 0,\n",
" 'love': 1,\n",
" 'my': 0,\n",
" 'passion': 0}"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "_WJ7qXK0vc2F",
"outputId": "33ceb23b-19e3-4b2b-a3ca-d62d43fd2cd2",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 188
}
},
"source": [
"numOfWordsB"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'and': 1,\n",
" 'dogs': 1,\n",
" 'hate': 1,\n",
" 'hobby': 0,\n",
" 'i': 1,\n",
" 'is': 0,\n",
" 'knitting': 1,\n",
" 'love': 0,\n",
" 'my': 0,\n",
" 'passion': 0}"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "jp0rnrh1ve4J",
"outputId": "e175570a-37e7-467f-ede2-76d7fea016cf",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 188
}
},
"source": [
"numOfWordsC"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'and': 1,\n",
" 'dogs': 0,\n",
" 'hate': 0,\n",
" 'hobby': 1,\n",
" 'i': 0,\n",
" 'is': 1,\n",
" 'knitting': 1,\n",
" 'love': 0,\n",
" 'my': 2,\n",
" 'passion': 1}"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "yRyW0iQZvg96"
},
"source": [
"def computeTF(wordDict, bagOfWords):\n",
"    \"\"\"Return the term frequency of every word in one document.\n",
"\n",
"    Args:\n",
"        wordDict: dict mapping word -> number of occurrences in the document.\n",
"        bagOfWords: the document's token list; its length is the denominator.\n",
"\n",
"    Returns:\n",
"        dict mapping each key of wordDict to count / len(bagOfWords).\n",
"        For an empty bagOfWords every frequency is 0.0 instead of raising\n",
"        ZeroDivisionError.\n",
"    \"\"\"\n",
"    bagOfWordsCount = len(bagOfWords)\n",
"    if bagOfWordsCount == 0:\n",
"        # A document without tokens has no term frequencies to speak of.\n",
"        return {word: 0.0 for word in wordDict}\n",
"    return {word: count / bagOfWordsCount for word, count in wordDict.items()}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "OmOXBkS0vmo0"
},
"source": [
"# Term frequencies of each document (counts paired with their token lists).\n",
"tfA, tfB, tfC = map(\n",
"    computeTF,\n",
"    (numOfWordsA, numOfWordsB, numOfWordsC),\n",
"    (bagOfWordsA, bagOfWordsB, bagOfWordsC),\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Y7CVv-FMvsc_",
"outputId": "d71eeabd-00c8-499b-8fe8-61568c60b7a7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 188
}
},
"source": [
"tfA"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'and': 0.0,\n",
" 'dogs': 0.3333333333333333,\n",
" 'hate': 0.0,\n",
" 'hobby': 0.0,\n",
" 'i': 0.3333333333333333,\n",
" 'is': 0.0,\n",
" 'knitting': 0.0,\n",
" 'love': 0.3333333333333333,\n",
" 'my': 0.0,\n",
" 'passion': 0.0}"
]
},
"metadata": {
"tags": []
},
"execution_count": 14
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "zK6VUFxSv0aG"
},
"source": [
"def computeIDF(documents):\n",
"    \"\"\"Compute the inverse document frequency of every word in a corpus.\n",
"\n",
"    Args:\n",
"        documents: list of dicts mapping word -> occurrence count, one dict\n",
"            per document.\n",
"\n",
"    Returns:\n",
"        dict mapping each word to log(N / number of documents containing it),\n",
"        where N is len(documents) and log is the natural logarithm. A word\n",
"        that occurs in no document gets 0.0 (the ratio would divide by\n",
"        zero), and an empty corpus gives {}.\n",
"    \"\"\"\n",
"    N = len(documents)  # number of documents\n",
"\n",
"    # Document frequency: in how many documents each word has a count > 0.\n",
"    # Keys are collected from every document, not only the first one, so\n",
"    # documents with differing vocabularies do not raise KeyError.\n",
"    docFreq = {}\n",
"    for document in documents:\n",
"        for word, val in document.items():\n",
"            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)\n",
"\n",
"    return {\n",
"        word: math.log(N / docCount) if docCount > 0 else 0.0\n",
"        for word, docCount in docFreq.items()\n",
"    }"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8tIwR-LTv0sx"
},
"source": [
"# IDF of every vocabulary word, computed over the three word-count dicts.\n",
"idfs = computeIDF([numOfWordsA, numOfWordsB,numOfWordsC])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "k5McMyILv4xw",
"outputId": "f5c48cc1-e8d4-468b-b54d-e410472ad93b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 188
}
},
"source": [
"idfs"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'and': 0.4054651081081644,\n",
" 'dogs': 0.4054651081081644,\n",
" 'hate': 1.0986122886681098,\n",
" 'hobby': 1.0986122886681098,\n",
" 'i': 0.4054651081081644,\n",
" 'is': 1.0986122886681098,\n",
" 'knitting': 0.4054651081081644,\n",
" 'love': 1.0986122886681098,\n",
" 'my': 1.0986122886681098,\n",
" 'passion': 1.0986122886681098}"
]
},
"metadata": {
"tags": []
},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "_pORwkSowC8M"
},
"source": [
"def computeTFIDF(tfBagOfWords, idfs):\n",
"    \"\"\"Weight one document's term frequencies by the corpus IDF values.\n",
"\n",
"    Args:\n",
"        tfBagOfWords: dict mapping word -> term frequency in the document.\n",
"        idfs: dict mapping word -> inverse document frequency.\n",
"\n",
"    Returns:\n",
"        dict with the keys of tfBagOfWords, each mapped to tf * idfs[word].\n",
"\n",
"    Raises:\n",
"        KeyError: if a word of tfBagOfWords has no entry in idfs.\n",
"    \"\"\"\n",
"    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "0dWBuQqKwPCT"
},
"source": [
"# TF-IDF weights of each document, then one DataFrame row per document.\n",
"tfidfA, tfidfB, tfidfC = (computeTFIDF(tf, idfs) for tf in (tfA, tfB, tfC))\n",
"\n",
"df = pd.DataFrame([tfidfA, tfidfB, tfidfC])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fKCLpD8cwSgn",
"outputId": "0e505d2e-ac3c-42c8-d019-2562ab263e9f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 141
}
},
"source": [
"df"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>knitting</th>\n",
" <th>passion</th>\n",
" <th>hate</th>\n",
" <th>hobby</th>\n",
" <th>dogs</th>\n",
" <th>and</th>\n",
" <th>love</th>\n",
" <th>my</th>\n",
" <th>i</th>\n",
" <th>is</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.135155</td>\n",
" <td>0.000000</td>\n",
" <td>0.366204</td>\n",
" <td>0.000000</td>\n",
" <td>0.135155</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.081093</td>\n",
" <td>0.000000</td>\n",
" <td>0.219722</td>\n",
" <td>0.000000</td>\n",
" <td>0.081093</td>\n",
" <td>0.081093</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.081093</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.057924</td>\n",
" <td>0.156945</td>\n",
" <td>0.000000</td>\n",
" <td>0.156945</td>\n",
" <td>0.000000</td>\n",
" <td>0.057924</td>\n",
" <td>0.000000</td>\n",
" <td>0.313889</td>\n",
" <td>0.000000</td>\n",
" <td>0.156945</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" knitting passion hate ... my i is\n",
"0 0.000000 0.000000 0.000000 ... 0.000000 0.135155 0.000000\n",
"1 0.081093 0.000000 0.219722 ... 0.000000 0.081093 0.000000\n",
"2 0.057924 0.156945 0.000000 ... 0.313889 0.000000 0.156945\n",
"\n",
"[3 rows x 10 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "MtJeY_QuwY32",
"outputId": "fd0a5a0e-c39c-439a-d958-d9c965bb60b8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 141
}
},
"source": [
"df_bw"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>and</th>\n",
" <th>dogs</th>\n",
" <th>hate</th>\n",
" <th>hobby</th>\n",
" <th>is</th>\n",
" <th>knitting</th>\n",
" <th>love</th>\n",
" <th>my</th>\n",
" <th>passion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" and dogs hate hobby is knitting love my passion\n",
"0 0 1 0 0 0 0 1 0 0\n",
"1 1 1 1 0 0 1 0 0 0\n",
"2 1 0 0 1 1 1 0 2 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 21
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment