Last active
October 6, 2018 18:01
-
-
Save ClebsonDantasUchoa/a914c9c6606c06d9db843239ec08fb18 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Predizer a nota de matemática de um candidato do ENEM com base nas outras notas" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Importação das bibliotecas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"import math\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from sklearn import linear_model\n", | |
"from sklearn import metrics\n", | |
"from sklearn import preprocessing\n", | |
"from sklearn.metrics import mean_squared_error\n", | |
"import matplotlib.pyplot as plt\n", | |
"from sklearn import svm\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"from sklearn.ensemble import GradientBoostingRegressor" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Leitura do arquivo" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = pd.read_csv(\"train.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(13730, 167)" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Filtragem das colunas importantes para um novo dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dados = pd.DataFrame()\n", | |
"dados['NU_NOTA_CN'] = data['NU_NOTA_CN']\n", | |
"dados['NU_NOTA_CH'] = data['NU_NOTA_CH']\n", | |
"dados['NU_NOTA_LC'] = data['NU_NOTA_LC']\n", | |
"dados['NU_NOTA_REDACAO'] = data['NU_NOTA_REDACAO']\n", | |
"dados['NU_NOTA_MT'] = data['NU_NOTA_MT']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Remoção das linhas com elementos faltantes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"NU_NOTA_CN 3389\n", | |
"NU_NOTA_CH 3389\n", | |
"NU_NOTA_LC 3597\n", | |
"NU_NOTA_REDACAO 3597\n", | |
"NU_NOTA_MT 3597\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dados.isnull().sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dados = dados.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(10097, 5)" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dados.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"NU_NOTA_CN 0\n", | |
"NU_NOTA_CH 0\n", | |
"NU_NOTA_LC 0\n", | |
"NU_NOTA_REDACAO 0\n", | |
"NU_NOTA_MT 0\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dados.isnull().sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>NU_NOTA_CN</th>\n", | |
" <th>NU_NOTA_CH</th>\n", | |
" <th>NU_NOTA_LC</th>\n", | |
" <th>NU_NOTA_REDACAO</th>\n", | |
" <th>NU_NOTA_MT</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>10097.000000</td>\n", | |
" <td>10097.000000</td>\n", | |
" <td>10097.000000</td>\n", | |
" <td>10097.000000</td>\n", | |
" <td>10097.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>473.912509</td>\n", | |
" <td>530.346123</td>\n", | |
" <td>516.665059</td>\n", | |
" <td>529.452907</td>\n", | |
" <td>482.648638</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>71.098468</td>\n", | |
" <td>73.528309</td>\n", | |
" <td>68.442602</td>\n", | |
" <td>154.001881</td>\n", | |
" <td>99.685820</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>420.100000</td>\n", | |
" <td>481.000000</td>\n", | |
" <td>468.600000</td>\n", | |
" <td>440.000000</td>\n", | |
" <td>409.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>460.400000</td>\n", | |
" <td>532.600000</td>\n", | |
" <td>521.100000</td>\n", | |
" <td>540.000000</td>\n", | |
" <td>461.300000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>515.100000</td>\n", | |
" <td>581.900000</td>\n", | |
" <td>564.900000</td>\n", | |
" <td>600.000000</td>\n", | |
" <td>537.700000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>806.400000</td>\n", | |
" <td>807.000000</td>\n", | |
" <td>763.600000</td>\n", | |
" <td>1000.000000</td>\n", | |
" <td>952.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" NU_NOTA_CN NU_NOTA_CH NU_NOTA_LC NU_NOTA_REDACAO NU_NOTA_MT\n", | |
"count 10097.000000 10097.000000 10097.000000 10097.000000 10097.000000\n", | |
"mean 473.912509 530.346123 516.665059 529.452907 482.648638\n", | |
"std 71.098468 73.528309 68.442602 154.001881 99.685820\n", | |
"min 0.000000 0.000000 0.000000 0.000000 0.000000\n", | |
"25% 420.100000 481.000000 468.600000 440.000000 409.000000\n", | |
"50% 460.400000 532.600000 521.100000 540.000000 461.300000\n", | |
"75% 515.100000 581.900000 564.900000 600.000000 537.700000\n", | |
"max 806.400000 807.000000 763.600000 1000.000000 952.000000" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dados.describe()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Criação dos datasets de treino e testes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"values = dados.values\n", | |
"#values = dados.drop('NU_NOTA_MT', axis=1).values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#scaler = preprocessing.StandardScaler().fit(values)\n", | |
"#newdf = pd.DataFrame(scaler.transform(values))\n", | |
"#newdf['NU_NOTA_MT'] = dados['NU_NOTA_MT'].values\n", | |
"#newdf.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#values = newdf.values\n", | |
"np.random.seed(2)\n", | |
"np.random.shuffle(values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"7067\n" | |
] | |
} | |
], | |
"source": [ | |
"n = 4\n", | |
"linhas = int(dados.shape[0]*0.7)\n", | |
"x_train = values[:linhas, 0:n]\n", | |
"y_train = values[:linhas, n]\n", | |
"x_test = values[linhas: , 0:n]\n", | |
"y_test = values[linhas:, n]\n", | |
"print(linhas)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Desvio padrão do label no conjunto de teste:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"99.2996703459975" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.std(y_test)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Criação do modelo de regressão linear, treinamento e predição" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"modelo = linear_model.LinearRegression()\n", | |
"modelo.fit(x_train, y_train)\n", | |
"predicao = modelo.predict(x_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Real</th>\n", | |
" <th>Predicao</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>431.5</td>\n", | |
" <td>474.918928</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>559.5</td>\n", | |
" <td>514.217154</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>450.7</td>\n", | |
" <td>539.906238</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>548.9</td>\n", | |
" <td>543.481335</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>396.5</td>\n", | |
" <td>437.443556</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Real Predicao\n", | |
"0 431.5 474.918928\n", | |
"1 559.5 514.217154\n", | |
"2 450.7 539.906238\n", | |
"3 548.9 543.481335\n", | |
"4 396.5 437.443556" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"real_predicao = pd.DataFrame()\n", | |
"real_predicao['Real'] = y_test\n", | |
"real_predicao['Predicao'] = predicao\n", | |
"real_predicao.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Criação do modelo GradientBoostingRegressor" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"modeloGBR = GradientBoostingRegressor()\n", | |
"modeloGBR.fit(x_train, y_train)\n", | |
"predicaoGBR = modeloGBR.predict(x_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Real</th>\n", | |
" <th>Predicao</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>431.5</td>\n", | |
" <td>474.918928</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>559.5</td>\n", | |
" <td>514.217154</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>450.7</td>\n", | |
" <td>539.906238</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>548.9</td>\n", | |
" <td>543.481335</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>396.5</td>\n", | |
" <td>437.443556</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Real Predicao\n", | |
"0 431.5 474.918928\n", | |
"1 559.5 514.217154\n", | |
"2 450.7 539.906238\n", | |
"3 548.9 543.481335\n", | |
"4 396.5 437.443556" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"real_predicaoGBR = pd.DataFrame()\n", | |
"real_predicaoGBR['Real'] = y_test\n", | |
"real_predicaoGBR['Predicao'] = predicao\n", | |
"real_predicao.head(5)\n", | |
"real_predicaoGBR['Predicao']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Métricas de avaliação" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# NOTE(review): `grafico` is not referenced by any other cell visible in this notebook;\n", | |
"# presumably a placeholder for a plot that was never written. Fill it in or remove it.\n", | |
"grafico = pd.DataFrame()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"RMSE linearRegression: \n", | |
"77.84089186651498\n", | |
"RMSE GradientBoostingRegressor: \n", | |
"74.56662051253947\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"RMSE linearRegression: \")\n", | |
"print(math.sqrt(metrics.mean_squared_error(y_test, predicao)))\n", | |
"print(\"RMSE GradientBoostingRegressor: \")\n", | |
"print(math.sqrt(metrics.mean_squared_error(y_test, predicaoGBR)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"MAE linearRegression: \n", | |
"61.666621587672786\n", | |
"MAE GradientBoostingRegressor: \n", | |
"59.14682258347615\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"MAE linearRegression: \")\n", | |
"print(metrics.mean_absolute_error(y_test, predicao))\n", | |
"print(\"MAE GradientBoostingRegressor: \")\n", | |
"print(metrics.mean_absolute_error(y_test, predicaoGBR))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
regispires
commented
Oct 4, 2018
•
- não use os dados de teste sem label.
- divida seus dados de treino em treino (70%) e teste (30%). ou seja: ambos terão label.
- treine o modelo com os dados de treino.
- calcule a métrica RMSE para os dados de teste.
- calcule e mostre o desvio padrão (std) do label no conj de teste.
- atualize o GIST com isso e me avise para analisarmos novamente.
- criar um modelo também com o GradientBoostingRegressor e acrescentá-lo à comparação.
- vc não calculou o desvio padrão do label no conj. de teste: np.std(y_test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment