Created
March 22, 2021 16:11
-
-
Save eduardodx/6b233950758f2b2ed8238c1b8d252829 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "nlp-lista-02-eduardo-souza.ipynb", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1000 | |
}, | |
"id": "41rLmp87qizR", | |
"outputId": "753adc94-59ae-43bc-bf2a-e824821ca7a4" | |
}, | |
"source": [ | |
"! pip install --upgrade pip spacy==3.0.5" | |
], | |
"execution_count": 45, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting pip\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)\n", | |
"\u001b[K |████████████████████████████████| 1.5MB 5.9MB/s \n", | |
"\u001b[?25hCollecting spacy==3.0.5\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/3a/70/a0b8bd0cb54d8739ba4d6fb3458785c3b9b812b7fbe93b0f10beb1a53ada/spacy-3.0.5-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)\n", | |
"\u001b[K |████████████████████████████████| 12.8MB 13.6MB/s \n", | |
"\u001b[?25hCollecting catalogue<2.1.0,>=2.0.1\n", | |
" Downloading https://files.pythonhosted.org/packages/48/5c/493a2f3bb0eac17b1d48129ecfd251f0520b6c89493e9fd0522f534a9e4a/catalogue-2.0.1-py3-none-any.whl\n", | |
"Collecting srsly<3.0.0,>=2.4.0\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/19/54/76982427ceb495dd19ff982c966708c624b85e03c45bf1912feaf60c7b2d/srsly-2.4.0-cp37-cp37m-manylinux2014_x86_64.whl (456kB)\n", | |
"\u001b[K |████████████████████████████████| 460kB 39.9MB/s \n", | |
"\u001b[?25hRequirement already satisfied, skipping upgrade: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (2.11.3)\n", | |
"Requirement already satisfied, skipping upgrade: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (4.41.1)\n", | |
"Requirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (1.0.5)\n", | |
"Requirement already satisfied, skipping upgrade: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (1.19.5)\n", | |
"Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (2.0.5)\n", | |
"Requirement already satisfied, skipping upgrade: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (20.9)\n", | |
"Requirement already satisfied, skipping upgrade: typing-extensions<4.0.0.0,>=3.7.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (3.7.4.3)\n", | |
"Collecting thinc<8.1.0,>=8.0.2\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/08/20e707519bcded1a0caa6fd024b767ac79e4e5d0fb92266bb7dcf735e338/thinc-8.0.2-cp37-cp37m-manylinux2014_x86_64.whl (1.1MB)\n", | |
"\u001b[K |████████████████████████████████| 1.1MB 52.1MB/s \n", | |
"\u001b[?25hRequirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (54.1.2)\n", | |
"Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (2.23.0)\n", | |
"Collecting pydantic<1.8.0,>=1.7.1\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/b3/0a/52ae1c659fc08f13dd7c0ae07b88e4f807ad83fb9954a59b0b0a3d1a8ab6/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl (9.1MB)\n", | |
"\u001b[K |████████████████████████████████| 9.1MB 52.1MB/s \n", | |
"\u001b[?25hCollecting spacy-legacy<3.1.0,>=3.0.0\n", | |
" Downloading https://files.pythonhosted.org/packages/65/d5/6c58fc97f3098775e46d8202bf248752e626a8096a0ae9d76aa7c485a09c/spacy_legacy-3.0.1-py2.py3-none-any.whl\n", | |
"Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (0.8.2)\n", | |
"Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (3.0.5)\n", | |
"Collecting typer<0.4.0,>=0.3.0\n", | |
" Downloading https://files.pythonhosted.org/packages/90/34/d138832f6945432c638f32137e6c79a3b682f06a63c488dcfaca6b166c64/typer-0.3.2-py3-none-any.whl\n", | |
"Requirement already satisfied, skipping upgrade: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (3.7.2)\n", | |
"Requirement already satisfied, skipping upgrade: blis<0.8.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy==3.0.5) (0.4.1)\n", | |
"Collecting pathy>=0.3.5\n", | |
" Downloading https://files.pythonhosted.org/packages/a2/53/97dc0197cca9357369b3b71bf300896cf2d3604fa60ffaaf5cbc277de7de/pathy-0.4.0-py3-none-any.whl\n", | |
"Requirement already satisfied, skipping upgrade: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy==3.0.5) (1.1.1)\n", | |
"Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy==3.0.5) (2.4.7)\n", | |
"Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (1.24.3)\n", | |
"Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (2.10)\n", | |
"Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (3.0.4)\n", | |
"Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy==3.0.5) (2020.12.5)\n", | |
"Requirement already satisfied, skipping upgrade: click<7.2.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.4.0,>=0.3.0->spacy==3.0.5) (7.1.2)\n", | |
"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->spacy==3.0.5) (3.4.1)\n", | |
"Collecting smart-open<4.0.0,>=2.2.0\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/11/9a/ba2d5f67f25e8d5bbf2fcec7a99b1e38428e83cb715f64dd179ca43a11bb/smart_open-3.0.0.tar.gz (113kB)\n", | |
"\u001b[K |████████████████████████████████| 122kB 42.9MB/s \n", | |
"\u001b[?25hBuilding wheels for collected packages: smart-open\n", | |
" Building wheel for smart-open (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for smart-open: filename=smart_open-3.0.0-cp37-none-any.whl size=107098 sha256=810002469863a684a53abc41921783aa00acdd642809ed483f51579a22fbca1f\n", | |
" Stored in directory: /root/.cache/pip/wheels/18/88/7c/f06dabd5e9cabe02d2269167bcacbbf9b47d0c0ff7d6ebcb78\n", | |
"Successfully built smart-open\n", | |
"Installing collected packages: pip, catalogue, srsly, pydantic, thinc, spacy-legacy, typer, smart-open, pathy, spacy\n", | |
" Found existing installation: pip 19.3.1\n", | |
" Uninstalling pip-19.3.1:\n", | |
" Successfully uninstalled pip-19.3.1\n", | |
" Found existing installation: catalogue 1.0.0\n", | |
" Uninstalling catalogue-1.0.0:\n", | |
" Successfully uninstalled catalogue-1.0.0\n", | |
" Found existing installation: srsly 1.0.5\n", | |
" Uninstalling srsly-1.0.5:\n", | |
" Successfully uninstalled srsly-1.0.5\n", | |
" Found existing installation: thinc 7.4.0\n", | |
" Uninstalling thinc-7.4.0:\n", | |
" Successfully uninstalled thinc-7.4.0\n", | |
" Found existing installation: smart-open 4.2.0\n", | |
" Uninstalling smart-open-4.2.0:\n", | |
" Successfully uninstalled smart-open-4.2.0\n", | |
" Found existing installation: spacy 2.2.4\n", | |
" Uninstalling spacy-2.2.4:\n", | |
" Successfully uninstalled spacy-2.2.4\n", | |
"Successfully installed catalogue-2.0.1 pathy-0.4.0 pip-21.0.1 pydantic-1.7.3 smart-open-3.0.0 spacy-3.0.5 spacy-legacy-3.0.1 srsly-2.4.0 thinc-8.0.2 typer-0.3.2\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.colab-display-data+json": { | |
"pip_warning": { | |
"packages": [ | |
"catalogue", | |
"spacy", | |
"srsly", | |
"thinc" | |
] | |
} | |
} | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Bm-k493ll3vp", | |
"outputId": "19cac8a1-eee3-44fa-cbea-ee741bfbdc16" | |
}, | |
"source": [ | |
"! python -m spacy download pt_core_news_lg" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"2021-03-22 16:02:06.454823: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n", | |
"Collecting pt-core-news-lg==3.0.0\n", | |
" Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.0.0/pt_core_news_lg-3.0.0-py3-none-any.whl (578.1 MB)\n", | |
"\u001b[K |████████████████████████████████| 578.1 MB 7.9 kB/s \n", | |
"\u001b[?25hRequirement already satisfied: spacy<3.1.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from pt-core-news-lg==3.0.0) (3.0.5)\n", | |
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.5)\n", | |
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (4.41.1)\n", | |
"Requirement already satisfied: pydantic<1.8.0,>=1.7.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.7.3)\n", | |
"Requirement already satisfied: typing-extensions<4.0.0.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.7.4.3)\n", | |
"Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (54.1.2)\n", | |
"Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.4.0)\n", | |
"Requirement already satisfied: thinc<8.1.0,>=8.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (8.0.2)\n", | |
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.0.5)\n", | |
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.0.5)\n", | |
"Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.19.5)\n", | |
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.11.3)\n", | |
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.1)\n", | |
"Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.7.2)\n", | |
"Requirement already satisfied: catalogue<2.1.0,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.0.1)\n", | |
"Requirement already satisfied: blis<0.8.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.4.1)\n", | |
"Requirement already satisfied: typer<0.4.0,>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.3.2)\n", | |
"Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (0.8.2)\n", | |
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.23.0)\n", | |
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (20.9)\n", | |
"Requirement already satisfied: srsly<3.0.0,>=2.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.4.0)\n", | |
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.4.1)\n", | |
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.4.7)\n", | |
"Requirement already satisfied: smart-open<4.0.0,>=2.2.0 in /usr/local/lib/python3.7/dist-packages (from pathy>=0.3.5->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.0)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2020.12.5)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (3.0.4)\n", | |
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (2.10)\n", | |
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.24.3)\n", | |
"Requirement already satisfied: click<7.2.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.4.0,>=0.3.0->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (7.1.2)\n", | |
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy<3.1.0,>=3.0.0->pt-core-news-lg==3.0.0) (1.1.1)\n", | |
"Installing collected packages: pt-core-news-lg\n", | |
"Successfully installed pt-core-news-lg-3.0.0\n", | |
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", | |
"You can now load the package via spacy.load('pt_core_news_lg')\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RvTOLOsbj2j6" | |
}, | |
"source": [ | |
"import pt_core_news_lg\n", | |
"import pandas as pd" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tXDyhwa6kPmE" | |
}, | |
"source": [ | |
"nlp = pt_core_news_lg.load()" | |
], | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "bS7T3Eu7kSBn", | |
"outputId": "c1e02b59-3436-4c34-8bf2-aac9ba5e7ea0" | |
}, | |
"source": [ | |
"# low_memory=False makes pandas infer each column's dtype from the whole file\n", | |
"# instead of chunk by chunk, which is what triggered the mixed-type\n", | |
"# DtypeWarning on column 2.\n", | |
"df = pd.read_csv(\n", | |
"    \"https://github.com/b2wdigital/b2w-reviews01/raw/master/B2W-Reviews01.csv\",\n", | |
"    delimiter=\";\",\n", | |
"    low_memory=False,\n", | |
")" | |
], | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 581 | |
}, | |
"id": "EuBDTgnukdJt", | |
"outputId": "bfe390b6-3f03-40ac-c5f7-0545556609eb" | |
}, | |
"source": [ | |
"df.head()" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>submission_date</th>\n", | |
" <th>reviewer_id</th>\n", | |
" <th>product_id</th>\n", | |
" <th>product_name</th>\n", | |
" <th>product_brand</th>\n", | |
" <th>site_category_lv1</th>\n", | |
" <th>site_category_lv2</th>\n", | |
" <th>review_title</th>\n", | |
" <th>overall_rating</th>\n", | |
" <th>recommend_to_a_friend</th>\n", | |
" <th>review_text</th>\n", | |
" <th>reviewer_birth_year</th>\n", | |
" <th>reviewer_gender</th>\n", | |
" <th>reviewer_state</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2018-01-01 00:11:28</td>\n", | |
" <td>d0fb1ca69422530334178f5c8624aa7a99da47907c44de...</td>\n", | |
" <td>132532965</td>\n", | |
" <td>Notebook Asus Vivobook Max X541NA-GO472T Intel...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Informática</td>\n", | |
" <td>Notebook</td>\n", | |
" <td>Bom</td>\n", | |
" <td>4</td>\n", | |
" <td>Yes</td>\n", | |
" <td>Estou contente com a compra entrega rápida o ú...</td>\n", | |
" <td>1958.0</td>\n", | |
" <td>F</td>\n", | |
" <td>RJ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2018-01-01 00:13:48</td>\n", | |
" <td>014d6dc5a10aed1ff1e6f349fb2b059a2d3de511c7538a...</td>\n", | |
" <td>22562178</td>\n", | |
" <td>Copo Acrílico Com Canudo 500ml Rocie</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Utilidades Domésticas</td>\n", | |
" <td>Copos, Taças e Canecas</td>\n", | |
" <td>Preço imbatível, ótima qualidade</td>\n", | |
" <td>4</td>\n", | |
" <td>Yes</td>\n", | |
" <td>Por apenas R$1994.20,eu consegui comprar esse ...</td>\n", | |
" <td>1996.0</td>\n", | |
" <td>M</td>\n", | |
" <td>SC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2018-01-01 00:26:02</td>\n", | |
" <td>44f2c8edd93471926fff601274b8b2b5c4824e386ae4f2...</td>\n", | |
" <td>113022329</td>\n", | |
" <td>Panela de Pressão Elétrica Philips Walita Dail...</td>\n", | |
" <td>philips walita</td>\n", | |
" <td>Eletroportáteis</td>\n", | |
" <td>Panela Elétrica</td>\n", | |
" <td>ATENDE TODAS AS EXPECTATIVA.</td>\n", | |
" <td>4</td>\n", | |
" <td>Yes</td>\n", | |
" <td>SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...</td>\n", | |
" <td>1984.0</td>\n", | |
" <td>M</td>\n", | |
" <td>SP</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2018-01-01 00:35:54</td>\n", | |
" <td>ce741665c1764ab2d77539e18d0e4f66dde6213c9f0863...</td>\n", | |
" <td>113851581</td>\n", | |
" <td>Betoneira Columbus - Roma Brinquedos</td>\n", | |
" <td>roma jensen</td>\n", | |
" <td>Brinquedos</td>\n", | |
" <td>Veículos de Brinquedo</td>\n", | |
" <td>presente mais que desejado</td>\n", | |
" <td>4</td>\n", | |
" <td>Yes</td>\n", | |
" <td>MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...</td>\n", | |
" <td>1985.0</td>\n", | |
" <td>F</td>\n", | |
" <td>SP</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2018-01-01 01:00:28</td>\n", | |
" <td>7d7b6b18dda804a897359276cef0ca252f9932bf4b5c8e...</td>\n", | |
" <td>131788803</td>\n", | |
" <td>Smart TV LED 43\" LG 43UJ6525 Ultra HD 4K com C...</td>\n", | |
" <td>lg</td>\n", | |
" <td>TV e Home Theater</td>\n", | |
" <td>TV</td>\n", | |
" <td>Sem duvidas, excelente</td>\n", | |
" <td>5</td>\n", | |
" <td>Yes</td>\n", | |
" <td>A entrega foi no prazo, as americanas estão de...</td>\n", | |
" <td>1994.0</td>\n", | |
" <td>M</td>\n", | |
" <td>MG</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" submission_date ... reviewer_state\n", | |
"0 2018-01-01 00:11:28 ... RJ\n", | |
"1 2018-01-01 00:13:48 ... SC\n", | |
"2 2018-01-01 00:26:02 ... SP\n", | |
"3 2018-01-01 00:35:54 ... SP\n", | |
"4 2018-01-01 01:00:28 ... MG\n", | |
"\n", | |
"[5 rows x 14 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 6 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "dF16h_hhlAIY" | |
}, | |
"source": [ | |
"## Text-processing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-gc5_NRalBx7" | |
}, | |
"source": [ | |
"original_docs = df.review_text" | |
], | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "b53ALLZ3lL-g" | |
}, | |
"source": [ | |
"def text_processing(text):\n", | |
"    \"\"\"Turn one review into a space-separated string of annotated lemmas.\n", | |
"\n", | |
"    Stop words, punctuation, number-like tokens and whitespace tokens are\n", | |
"    dropped. Each remaining token becomes `<lowercased lemma>-<POS>-<is_oov>`.\n", | |
"\n", | |
"    Args:\n", | |
"        text: Raw review text. Anything that is not a non-blank string\n", | |
"            (e.g. the NaN pandas stores for a missing review) yields an\n", | |
"            empty string instead of being passed to the pipeline.\n", | |
"\n", | |
"    Returns:\n", | |
"        The annotated tokens joined by single spaces; empty if none are kept.\n", | |
"    \"\"\"\n", | |
"    # Missing reviews are not strings, and the spaCy pipeline needs text.\n", | |
"    if not isinstance(text, str) or not text.strip():\n", | |
"        return \"\"\n", | |
"\n", | |
"    return \" \".join(\n", | |
"        f\"{token.lemma_.lower()}-{token.pos_}-{token.is_oov}\"\n", | |
"        for token in nlp(text)\n", | |
"        if not (token.is_stop or token.is_punct or token.like_num or token.is_space)\n", | |
"    )" | |
], | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "smJDfkjtlSTu" | |
}, | |
"source": [ | |
"docs = list(original_docs.map(text_processing))" | |
], | |
"execution_count": 18, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "nOsuG7NclXeK", | |
"outputId": "a979f31c-c290-4264-be5e-2ab0f629a42c" | |
}, | |
"source": [ | |
"docs" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['contentar-ADJ-False comprar-NOUN-False entregar-NOUN-False rápido-ADJ-False único-ADJ-False problema-NOUN-False americanas-PROPN-False haver-VERB-False trocar-NOUN-False devolução-NOUN-False produto-NOUN-False consumidor-NOUN-False problema-NOUN-False esperar-NOUN-False',\n", | |
" 'r$-SYM-False 1994.20,eu-NUM-True conseguir-VERB-False comprar-VERB-False lindar-ADJ-False copar-NOUN-False acrílico-NOUN-False',\n", | |
" 'supera-NOUN-False agilidade-NOUN-False praticidade-NOUN-False panelas-PROPN-False elétricas-PROPN-False costumo-PROPN-True panela-PROPN-False cozimento-PROPN-True arroz-PROPN-False japonesa-PROPN-False leva-VERB-False +-NOUN-False minutos-NOUN-False panela-NOUN-False rápido-NOUN-False exatamente-PROPN-False minutos-NOUN-False recomendo-DET-False',\n", | |
" 'filho-PROPN-False amou-PROPN-False verdade-PROPN-False tantos-PROPN-False detalhes-NOUN-False',\n", | |
" 'entregar-NOUN-False prazo-NOUN-False americano-NOUN-False parabém-NOUN-False smart-ADJ-False tv-NOUN-False navegação-NOUN-False internete-NOUN-False aplicativo-NOUN-False excelente-ADJ-False travar-VERB-False falar-VERB-False imagem-NOUN-False surpreender-VERB-False recomendar-VERB-False',\n", | |
" 'excelente-ADJ-False produto-NOUN-False material-NOUN-False acrílico-ADJ-False super-ADJ-False resistente-ADJ-False adamantio-NOUN-True milagre-NOUN-False bebido-NOUN-False sugiro-VERB-False aproveitar-VERB-False promoção-NOUN-False acabar-VERB-False',\n", | |
" 'produto-NOUN-False mto-ADV-False garrafa-NOUN-False vc-PROPN-False servir-VERB-False água-NOUN-False pro-ADP-False megazord-NOUN-False to-SCONJ-False pensar-VERB-False vender-VERB-False tv-NOUN-False pra-SCONJ-False comprar-VERB-False garrafa-NOUN-False recomendo-NOUN-False',\n", | |
" 'produto-NOUN-False excelente-ADJ-False qualidade-NOUN-False câmera-NOUN-False desenvolvimento-NOUN-False android-PROPN-False rapidez-NOUN-False',\n", | |
" 'barulhar-NOUN-False minimo-ADJ-False ventar-NOUN-False forte-ADJ-False velocidade-NOUN-False',\n", | |
" 'produto-PROPN-False nao-PROPN-False entregue-PROPN-False americanas-PROPN-False descontando-VERB-True fatura-NOUN-False cartão-NOUN-False']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "s3wPbWsElCws" | |
}, | |
"source": [ | |
"## Feature-extraction" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "qPsNuX96lD7A" | |
}, | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer" | |
], | |
"execution_count": 11, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fXgzeTVOrhlu" | |
}, | |
"source": [ | |
"# The documents are already tokenized into whitespace-separated\n", | |
"# \"lemma-POS-oov\" items. The default token_pattern treats \"-\" as a separator\n", | |
"# and would split each item into unrelated \"lemma\", \"pos\" and \"true\"/\"false\"\n", | |
"# features, so match whole whitespace-delimited items and keep their case.\n", | |
"vectorizer = TfidfVectorizer(token_pattern=r\"\\S+\", lowercase=False)" | |
], | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "pum_CErnrjV1" | |
}, | |
"source": [ | |
"X = vectorizer.fit_transform(docs)" | |
], | |
"execution_count": 13, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "pn33rA0proKY", | |
"outputId": "0d4d28bb-d22d-49ef-8c72-c8348cd33310" | |
}, | |
"source": [ | |
"# each row is a document and each column corresponds to a vocabulary term\n", | |
"print(X)" | |
], | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
" (0, 35)\t0.1347699687619662\n", | |
" (0, 21)\t0.1347699687619662\n", | |
" (0, 70)\t0.08002921702608817\n", | |
" (0, 31)\t0.1347699687619662\n", | |
" (0, 90)\t0.1347699687619662\n", | |
" (0, 97)\t0.06569483075589849\n", | |
" (0, 45)\t0.1347699687619662\n", | |
" (0, 72)\t0.07234832342842883\n", | |
" (0, 9)\t0.11456678602912203\n", | |
" (0, 69)\t0.2695399375239324\n", | |
" (0, 100)\t0.1347699687619662\n", | |
" (0, 78)\t0.11456678602912203\n", | |
" (0, 33)\t0.11456678602912203\n", | |
" (0, 59)\t0.44844461572533584\n", | |
" (0, 19)\t0.10023239975893235\n", | |
" (0, 40)\t0.6975805133505224\n", | |
" (0, 5)\t0.2170449702852865\n", | |
" (0, 22)\t0.1347699687619662\n", | |
" (1, 3)\t0.20979116088351724\n", | |
" (1, 23)\t0.24678660525242735\n", | |
" (1, 50)\t0.24678660525242735\n", | |
" (1, 20)\t0.24678660525242735\n", | |
" (1, 91)\t0.16318241313102977\n", | |
" (1, 60)\t0.24678660525242735\n", | |
" (1, 36)\t0.24678660525242735\n", | |
" :\t:\n", | |
" (7, 70)\t0.1457589675335856\n", | |
" (7, 72)\t0.13176958762792945\n", | |
" (7, 59)\t0.4537566724108096\n", | |
" (7, 40)\t0.6352593413751334\n", | |
" (7, 5)\t0.13176958762792945\n", | |
" (8, 94)\t0.3042860691298508\n", | |
" (8, 43)\t0.3042860691298508\n", | |
" (8, 96)\t0.3042860691298508\n", | |
" (8, 54)\t0.3042860691298508\n", | |
" (8, 16)\t0.3042860691298508\n", | |
" (8, 59)\t0.33750211711364186\n", | |
" (8, 40)\t0.5625035285227364\n", | |
" (8, 5)\t0.3266987021871967\n", | |
" (9, 18)\t0.24376622300595205\n", | |
" (9, 41)\t0.24376622300595205\n", | |
" (9, 27)\t0.24376622300595205\n", | |
" (9, 34)\t0.24376622300595205\n", | |
" (9, 57)\t0.24376622300595205\n", | |
" (9, 91)\t0.16118524937469939\n", | |
" (9, 70)\t0.1447534650618593\n", | |
" (9, 97)\t0.11882603306575845\n", | |
" (9, 72)\t0.5234423575213665\n", | |
" (9, 9)\t0.20722356003195608\n", | |
" (9, 59)\t0.1802505924344424\n", | |
" (9, 40)\t0.5407517773033272\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment