Skip to content

Instantly share code, notes, and snippets.

@ClebsonDantasUchoa
Last active October 6, 2018 18:01
Show Gist options
  • Save ClebsonDantasUchoa/a914c9c6606c06d9db843239ec08fb18 to your computer and use it in GitHub Desktop.
Save ClebsonDantasUchoa/a914c9c6606c06d9db843239ec08fb18 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predizer a nota de matemática de um candidato do ENEM com base nas outras notas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Importação das bibliotecas"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import math\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import linear_model\n",
"from sklearn import metrics\n",
"from sklearn import preprocessing\n",
"from sklearn.metrics import mean_squared_error\n",
"import matplotlib.pyplot as plt\n",
"from sklearn import svm\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import GradientBoostingRegressor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Leitura do arquivo"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the raw ENEM training table; the path is relative to the notebook's working directory.\n",
"data = pd.read_csv(filepath_or_buffer=\"train.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13730, 167)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Filtragem das colunas importantes para um novo dataset"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the four predictor scores plus the math score (the target) from the raw table.\n",
"colunas = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO', 'NU_NOTA_MT']\n",
"dados = data[colunas].copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Remoção das linhas com elementos faltantes"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NU_NOTA_CN 3389\n",
"NU_NOTA_CH 3389\n",
"NU_NOTA_LC 3597\n",
"NU_NOTA_REDACAO 3597\n",
"NU_NOTA_MT 3597\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the missing values in each selected column.\n",
"dados.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Discard every row in which at least one of the selected scores is missing.\n",
"dados = dados.dropna(axis=0, how=\"any\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10097, 5)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dados.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NU_NOTA_CN 0\n",
"NU_NOTA_CH 0\n",
"NU_NOTA_LC 0\n",
"NU_NOTA_REDACAO 0\n",
"NU_NOTA_MT 0\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: no missing values should remain after dropna().\n",
"dados.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NU_NOTA_CN</th>\n",
" <th>NU_NOTA_CH</th>\n",
" <th>NU_NOTA_LC</th>\n",
" <th>NU_NOTA_REDACAO</th>\n",
" <th>NU_NOTA_MT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>10097.000000</td>\n",
" <td>10097.000000</td>\n",
" <td>10097.000000</td>\n",
" <td>10097.000000</td>\n",
" <td>10097.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>473.912509</td>\n",
" <td>530.346123</td>\n",
" <td>516.665059</td>\n",
" <td>529.452907</td>\n",
" <td>482.648638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>71.098468</td>\n",
" <td>73.528309</td>\n",
" <td>68.442602</td>\n",
" <td>154.001881</td>\n",
" <td>99.685820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>420.100000</td>\n",
" <td>481.000000</td>\n",
" <td>468.600000</td>\n",
" <td>440.000000</td>\n",
" <td>409.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>460.400000</td>\n",
" <td>532.600000</td>\n",
" <td>521.100000</td>\n",
" <td>540.000000</td>\n",
" <td>461.300000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>515.100000</td>\n",
" <td>581.900000</td>\n",
" <td>564.900000</td>\n",
" <td>600.000000</td>\n",
" <td>537.700000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>806.400000</td>\n",
" <td>807.000000</td>\n",
" <td>763.600000</td>\n",
" <td>1000.000000</td>\n",
" <td>952.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NU_NOTA_CN NU_NOTA_CH NU_NOTA_LC NU_NOTA_REDACAO NU_NOTA_MT\n",
"count 10097.000000 10097.000000 10097.000000 10097.000000 10097.000000\n",
"mean 473.912509 530.346123 516.665059 529.452907 482.648638\n",
"std 71.098468 73.528309 68.442602 154.001881 99.685820\n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000\n",
"25% 420.100000 481.000000 468.600000 440.000000 409.000000\n",
"50% 460.400000 532.600000 521.100000 540.000000 461.300000\n",
"75% 515.100000 581.900000 564.900000 600.000000 537.700000\n",
"max 806.400000 807.000000 763.600000 1000.000000 952.000000"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dados.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Criação dos datasets de treino e testes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Work on an independent NumPy copy: `values` is shuffled in place further down,\n",
"# and `dados.values` may share memory with the DataFrame, so shuffling it directly\n",
"# could silently reorder `dados` as well.\n",
"values = dados.values.copy()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#scaler = preprocessing.StandardScaler().fit(values)\n",
"#newdf = pd.DataFrame(scaler.transform(values))\n",
"#newdf['NU_NOTA_MT'] = dados['NU_NOTA_MT'].values\n",
"#newdf.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Re-derive `values` from `dados` on every run so this cell is idempotent:\n",
"# shuffling the already-shuffled array a second time would silently change the\n",
"# train/test split. Copying also keeps `dados` itself untouched.\n",
"values = dados.values.copy()\n",
"np.random.seed(2)\n",
"np.random.shuffle(values)  # shuffles the rows in place"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7067\n"
]
}
],
"source": [
"n = 4  # number of feature columns; column index n holds the target (NU_NOTA_MT)\n",
"linhas = int(0.7 * dados.shape[0])  # first 70% of the shuffled rows go to training\n",
"treino, teste = values[:linhas], values[linhas:]\n",
"x_train, y_train = treino[:, :n], treino[:, n]\n",
"x_test, y_test = teste[:, :n], teste[:, n]\n",
"print(linhas)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Desvio padrão do label no conjunto de teste:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"99.2996703459975"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Population standard deviation (ddof=0) of the target on the test rows.\n",
"y_test.std()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Criação do modelo de regressão linear, treinamento e predição"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Ordinary least squares on the predictor scores; fit() returns the estimator itself.\n",
"modelo = linear_model.LinearRegression().fit(x_train, y_train)\n",
"predicao = modelo.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Real</th>\n",
" <th>Predicao</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>431.5</td>\n",
" <td>474.918928</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>559.5</td>\n",
" <td>514.217154</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>450.7</td>\n",
" <td>539.906238</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>548.9</td>\n",
" <td>543.481335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>396.5</td>\n",
" <td>437.443556</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Real Predicao\n",
"0 431.5 474.918928\n",
"1 559.5 514.217154\n",
"2 450.7 539.906238\n",
"3 548.9 543.481335\n",
"4 396.5 437.443556"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Side-by-side view of the true math scores and the linear-regression predictions.\n",
"real_predicao = pd.DataFrame({'Real': y_test, 'Predicao': predicao}, columns=['Real', 'Predicao'])\n",
"real_predicao.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Criação do modelo GradientBoostingRegressor"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Gradient-boosted regression trees with scikit-learn's default hyper-parameters,\n",
"# trained and evaluated on the same split as the linear model.\n",
"modeloGBR = GradientBoostingRegressor().fit(x_train, y_train)\n",
"predicaoGBR = modeloGBR.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Real</th>\n",
" <th>Predicao</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>431.5</td>\n",
" <td>474.918928</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>559.5</td>\n",
" <td>514.217154</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>450.7</td>\n",
" <td>539.906238</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>548.9</td>\n",
" <td>543.481335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>396.5</td>\n",
" <td>437.443556</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Real Predicao\n",
"0 431.5 474.918928\n",
"1 559.5 514.217154\n",
"2 450.7 539.906238\n",
"3 548.9 543.481335\n",
"4 396.5 437.443556"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Side-by-side view of the true math scores and the GradientBoostingRegressor\n",
"# predictions. The column must come from `predicaoGBR`, not from the linear\n",
"# model's `predicao`, and the preview must be taken from this frame.\n",
"real_predicaoGBR = pd.DataFrame()\n",
"real_predicaoGBR['Real'] = y_test\n",
"real_predicaoGBR['Predicao'] = predicaoGBR\n",
"real_predicaoGBR.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Métricas de avaliação"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): `grafico` is created empty and no later cell in this notebook reads it;\n",
"# candidate for removal unless a plot is still planned.\n",
"grafico = pd.DataFrame(data=None)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE linearRegression: \n",
"77.84089186651498\n",
"RMSE GradientBoostingRegressor: \n",
"74.56662051253947\n"
]
}
],
"source": [
"# Root-mean-squared error on the test rows, reported for one model at a time.\n",
"for rotulo, previsto in (\n",
"    (\"RMSE linearRegression: \", predicao),\n",
"    (\"RMSE GradientBoostingRegressor: \", predicaoGBR),\n",
"):\n",
"    print(rotulo)\n",
"    print(math.sqrt(metrics.mean_squared_error(y_test, previsto)))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAE linearRegression: \n",
"61.666621587672786\n",
"MAE GradientBoostingRegressor: \n",
"59.14682258347615\n"
]
}
],
"source": [
"# Mean absolute error on the test rows, reported for one model at a time.\n",
"for rotulo, previsto in (\n",
"    (\"MAE linearRegression: \", predicao),\n",
"    (\"MAE GradientBoostingRegressor: \", predicaoGBR),\n",
"):\n",
"    print(rotulo)\n",
"    print(metrics.mean_absolute_error(y_test, previsto))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@regispires
Copy link

regispires commented Oct 4, 2018

  • não use os dados de teste sem label.
  • divida seus dados de treino em treino (70%) e teste (30%). ou seja: ambos terão label.
  • treine o modelo com os dados de treino.
  • calcule a métrica RMSE para os dados de teste.
  • calcule e mostre o desvio padrão (std) do label no conj de teste.
  • atualize o GIST com isso e me avise para analisarmos novamente.

@regispires
Copy link

regispires commented Oct 6, 2018

  • criar um modelo também com o GradientBoostingRegressor e acrescentá-lo à comparação.
  • vc não calculou o desvio padrão do label no conj. de teste: np.std(y_test)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment