ClebsonDantasUchoa · October 6, 2018 18:01 · regispires · Oct 4, 2018 · regispires · Oct 6, 2018
diff --git a/enem-2.ipynb b/enem-2.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predizer a nota de matemática de um candidato do ENEM com base nas outras notas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Importação das bibliotecas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import math\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import linear_model\n",
    "from sklearn import metrics\n",
    "from sklearn import preprocessing\n",
    "from sklearn.metrics import mean_squared_error\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import svm\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.ensemble import GradientBoostingRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Leitura do arquivo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13730, 167)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filtragem das colunas importantes para um novo dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the score columns used below: the four predictor scores and the\n",
    "# math score (NU_NOTA_MT, the last column). Selecting them in one step keeps the\n",
    "# original row index; .copy() makes `dados` independent of `data`.\n",
    "colunas = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO', 'NU_NOTA_MT']\n",
    "dados = data[colunas].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Remoção das linhas com elementos faltantes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NU_NOTA_CN         3389\n",
       "NU_NOTA_CH         3389\n",
       "NU_NOTA_LC         3597\n",
       "NU_NOTA_REDACAO    3597\n",
       "NU_NOTA_MT         3597\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dados.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "dados = dados.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10097, 5)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dados.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NU_NOTA_CN         0\n",
       "NU_NOTA_CH         0\n",
       "NU_NOTA_LC         0\n",
       "NU_NOTA_REDACAO    0\n",
       "NU_NOTA_MT         0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dados.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NU_NOTA_CN</th>\n",
       "      <th>NU_NOTA_CH</th>\n",
       "      <th>NU_NOTA_LC</th>\n",
       "      <th>NU_NOTA_REDACAO</th>\n",
       "      <th>NU_NOTA_MT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>10097.000000</td>\n",
       "      <td>10097.000000</td>\n",
       "      <td>10097.000000</td>\n",
       "      <td>10097.000000</td>\n",
       "      <td>10097.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>473.912509</td>\n",
       "      <td>530.346123</td>\n",
       "      <td>516.665059</td>\n",
       "      <td>529.452907</td>\n",
       "      <td>482.648638</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>71.098468</td>\n",
       "      <td>73.528309</td>\n",
       "      <td>68.442602</td>\n",
       "      <td>154.001881</td>\n",
       "      <td>99.685820</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>420.100000</td>\n",
       "      <td>481.000000</td>\n",
       "      <td>468.600000</td>\n",
       "      <td>440.000000</td>\n",
       "      <td>409.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>460.400000</td>\n",
       "      <td>532.600000</td>\n",
       "      <td>521.100000</td>\n",
       "      <td>540.000000</td>\n",
       "      <td>461.300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>515.100000</td>\n",
       "      <td>581.900000</td>\n",
       "      <td>564.900000</td>\n",
       "      <td>600.000000</td>\n",
       "      <td>537.700000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>806.400000</td>\n",
       "      <td>807.000000</td>\n",
       "      <td>763.600000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>952.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         NU_NOTA_CN    NU_NOTA_CH    NU_NOTA_LC  NU_NOTA_REDACAO    NU_NOTA_MT\n",
       "count  10097.000000  10097.000000  10097.000000     10097.000000  10097.000000\n",
       "mean     473.912509    530.346123    516.665059       529.452907    482.648638\n",
       "std       71.098468     73.528309     68.442602       154.001881     99.685820\n",
       "min        0.000000      0.000000      0.000000         0.000000      0.000000\n",
       "25%      420.100000    481.000000    468.600000       440.000000    409.000000\n",
       "50%      460.400000    532.600000    521.100000       540.000000    461.300000\n",
       "75%      515.100000    581.900000    564.900000       600.000000    537.700000\n",
       "max      806.400000    807.000000    763.600000      1000.000000    952.000000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dados.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Criação dos datasets de treino e testes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = dados.values\n",
    "#values = dados.drop('NU_NOTA_MT', axis=1).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#scaler = preprocessing.StandardScaler().fit(values)\n",
    "#newdf = pd.DataFrame(scaler.transform(values))\n",
    "#newdf['NU_NOTA_MT'] = dados['NU_NOTA_MT'].values\n",
    "#newdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle a private copy of the rows: `dados.values` may share memory with `dados`,\n",
    "# and shuffling the old `values` array in place reshuffled it again on every re-run.\n",
    "# Rebuilding the copy here keeps the cell idempotent (same seed -> same row order).\n",
    "# (Use newdf.values.copy() instead if the commented-out scaling cell is enabled.)\n",
    "np.random.seed(2)\n",
    "values = dados.values.copy()\n",
    "np.random.shuffle(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7067\n"
     ]
    }
   ],
   "source": [
    "n = 4\n",
    "linhas = int(dados.shape[0]*0.7)\n",
    "x_train = values[:linhas, 0:n]\n",
    "y_train = values[:linhas, n]\n",
    "x_test = values[linhas: , 0:n]\n",
    "y_test = values[linhas:, n]\n",
    "print(linhas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Desvio padrão do label no conjunto de teste:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "99.2996703459975"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.std(y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Criação do modelo de regressão linear, treinamento e predição"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "modelo = linear_model.LinearRegression()\n",
    "modelo.fit(x_train, y_train)\n",
    "predicao = modelo.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Real</th>\n",
       "      <th>Predicao</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>431.5</td>\n",
       "      <td>474.918928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>559.5</td>\n",
       "      <td>514.217154</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>450.7</td>\n",
       "      <td>539.906238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>548.9</td>\n",
       "      <td>543.481335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>396.5</td>\n",
       "      <td>437.443556</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Real    Predicao\n",
       "0  431.5  474.918928\n",
       "1  559.5  514.217154\n",
       "2  450.7  539.906238\n",
       "3  548.9  543.481335\n",
       "4  396.5  437.443556"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "real_predicao = pd.DataFrame()\n",
    "real_predicao['Real'] = y_test\n",
    "real_predicao['Predicao'] = predicao\n",
    "real_predicao.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Criação do modelo GradientBoostingRegressor, treinamento e predição"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "modeloGBR = GradientBoostingRegressor()\n",
    "modeloGBR.fit(x_train, y_train)\n",
    "predicaoGBR = modeloGBR.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Real</th>\n",
       "      <th>Predicao</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>431.5</td>\n",
       "      <td>474.918928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>559.5</td>\n",
       "      <td>514.217154</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>450.7</td>\n",
       "      <td>539.906238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>548.9</td>\n",
       "      <td>543.481335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>396.5</td>\n",
       "      <td>437.443556</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Real    Predicao\n",
       "0  431.5  474.918928\n",
       "1  559.5  514.217154\n",
       "2  450.7  539.906238\n",
       "3  548.9  543.481335\n",
       "4  396.5  437.443556"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Side-by-side view of the held-out targets (y_test) and the\n",
    "# GradientBoostingRegressor predictions (predicaoGBR, not the linear-regression\n",
    "# output `predicao`). Only the first rows are displayed.\n",
    "real_predicaoGBR = pd.DataFrame()\n",
    "real_predicaoGBR['Real'] = y_test\n",
    "real_predicaoGBR['Predicao'] = predicaoGBR\n",
    "real_predicaoGBR.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Métricas de avaliação"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grafico = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE linearRegression: \n",
      "77.84089186651498\n",
      "RMSE GradientBoostingRegressor: \n",
      "74.56662051253947\n"
     ]
    }
   ],
   "source": [
    "print(\"RMSE linearRegression: \")\n",
    "print(math.sqrt(metrics.mean_squared_error(y_test, predicao)))\n",
    "print(\"RMSE GradientBoostingRegressor: \")\n",
    "print(math.sqrt(metrics.mean_squared_error(y_test, predicaoGBR)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MAE linearRegression: \n",
      "61.666621587672786\n",
      "MAE GradientBoostingRegressor: \n",
      "59.14682258347615\n"
     ]
    }
   ],
   "source": [
    "print(\"MAE linearRegression: \")\n",
    "print(metrics.mean_absolute_error(y_test, predicao))\n",
    "print(\"MAE GradientBoostingRegressor: \")\n",
    "print(metrics.mean_absolute_error(y_test, predicaoGBR))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Predizer a nota de matemática de um candidato do ENEM com base nas outras notas"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Importação das bibliotecas"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"%matplotlib inline\n",
	"import math\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"from sklearn import linear_model\n",
	"from sklearn import metrics\n",
	"from sklearn import preprocessing\n",
	"from sklearn.metrics import mean_squared_error\n",
	"import matplotlib.pyplot as plt\n",
	"from sklearn import svm\n",
	"from sklearn.preprocessing import StandardScaler\n",
	"from sklearn.ensemble import GradientBoostingRegressor"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Leitura do arquivo"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = pd.read_csv(\"train.csv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(13730, 167)"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data.shape"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Filtragem das colunas importantes para um novo dataset"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep only the score columns used below: the four predictor scores and the\n",
	"# math score (NU_NOTA_MT, the last column). Selecting them in one step keeps the\n",
	"# original row index; .copy() makes `dados` independent of `data`.\n",
	"colunas = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO', 'NU_NOTA_MT']\n",
	"dados = data[colunas].copy()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Remoção das linhas com elementos faltantes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"NU_NOTA_CN 3389\n",
	"NU_NOTA_CH 3389\n",
	"NU_NOTA_LC 3597\n",
	"NU_NOTA_REDACAO 3597\n",
	"NU_NOTA_MT 3597\n",
	"dtype: int64"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dados.isnull().sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"dados = dados.dropna()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(10097, 5)"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dados.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"NU_NOTA_CN 0\n",
	"NU_NOTA_CH 0\n",
	"NU_NOTA_LC 0\n",
	"NU_NOTA_REDACAO 0\n",
	"NU_NOTA_MT 0\n",
	"dtype: int64"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dados.isnull().sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>NU_NOTA_CN</th>\n",
	" <th>NU_NOTA_CH</th>\n",
	" <th>NU_NOTA_LC</th>\n",
	" <th>NU_NOTA_REDACAO</th>\n",
	" <th>NU_NOTA_MT</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>count</th>\n",
	" <td>10097.000000</td>\n",
	" <td>10097.000000</td>\n",
	" <td>10097.000000</td>\n",
	" <td>10097.000000</td>\n",
	" <td>10097.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>mean</th>\n",
	" <td>473.912509</td>\n",
	" <td>530.346123</td>\n",
	" <td>516.665059</td>\n",
	" <td>529.452907</td>\n",
	" <td>482.648638</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>std</th>\n",
	" <td>71.098468</td>\n",
	" <td>73.528309</td>\n",
	" <td>68.442602</td>\n",
	" <td>154.001881</td>\n",
	" <td>99.685820</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>min</th>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" <td>0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>25%</th>\n",
	" <td>420.100000</td>\n",
	" <td>481.000000</td>\n",
	" <td>468.600000</td>\n",
	" <td>440.000000</td>\n",
	" <td>409.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>50%</th>\n",
	" <td>460.400000</td>\n",
	" <td>532.600000</td>\n",
	" <td>521.100000</td>\n",
	" <td>540.000000</td>\n",
	" <td>461.300000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>75%</th>\n",
	" <td>515.100000</td>\n",
	" <td>581.900000</td>\n",
	" <td>564.900000</td>\n",
	" <td>600.000000</td>\n",
	" <td>537.700000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>max</th>\n",
	" <td>806.400000</td>\n",
	" <td>807.000000</td>\n",
	" <td>763.600000</td>\n",
	" <td>1000.000000</td>\n",
	" <td>952.000000</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" NU_NOTA_CN NU_NOTA_CH NU_NOTA_LC NU_NOTA_REDACAO NU_NOTA_MT\n",
	"count 10097.000000 10097.000000 10097.000000 10097.000000 10097.000000\n",
	"mean 473.912509 530.346123 516.665059 529.452907 482.648638\n",
	"std 71.098468 73.528309 68.442602 154.001881 99.685820\n",
	"min 0.000000 0.000000 0.000000 0.000000 0.000000\n",
	"25% 420.100000 481.000000 468.600000 440.000000 409.000000\n",
	"50% 460.400000 532.600000 521.100000 540.000000 461.300000\n",
	"75% 515.100000 581.900000 564.900000 600.000000 537.700000\n",
	"max 806.400000 807.000000 763.600000 1000.000000 952.000000"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dados.describe()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Criação dos datasets de treino e testes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"values = dados.values\n",
	"#values = dados.drop('NU_NOTA_MT', axis=1).values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"#scaler = preprocessing.StandardScaler().fit(values)\n",
	"#newdf = pd.DataFrame(scaler.transform(values))\n",
	"#newdf['NU_NOTA_MT'] = dados['NU_NOTA_MT'].values\n",
	"#newdf.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Shuffle a private copy of the rows: `dados.values` may share memory with `dados`,\n",
	"# and shuffling the old `values` array in place reshuffled it again on every re-run.\n",
	"# Rebuilding the copy here keeps the cell idempotent (same seed -> same row order).\n",
	"# (Use newdf.values.copy() instead if the commented-out scaling cell is enabled.)\n",
	"np.random.seed(2)\n",
	"values = dados.values.copy()\n",
	"np.random.shuffle(values)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"7067\n"
	]
	}
	],
	"source": [
	"n = 4\n",
	"linhas = int(dados.shape[0]*0.7)\n",
	"x_train = values[:linhas, 0:n]\n",
	"y_train = values[:linhas, n]\n",
	"x_test = values[linhas: , 0:n]\n",
	"y_test = values[linhas:, n]\n",
	"print(linhas)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Desvio padrão do label no conjunto de teste:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"99.2996703459975"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"np.std(y_test)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Criação do modelo de regressão linear, treinamento e predição"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [],
	"source": [
	"modelo = linear_model.LinearRegression()\n",
	"modelo.fit(x_train, y_train)\n",
	"predicao = modelo.predict(x_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Real</th>\n",
	" <th>Predicao</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>431.5</td>\n",
	" <td>474.918928</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>559.5</td>\n",
	" <td>514.217154</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>450.7</td>\n",
	" <td>539.906238</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>548.9</td>\n",
	" <td>543.481335</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>396.5</td>\n",
	" <td>437.443556</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Real Predicao\n",
	"0 431.5 474.918928\n",
	"1 559.5 514.217154\n",
	"2 450.7 539.906238\n",
	"3 548.9 543.481335\n",
	"4 396.5 437.443556"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"real_predicao = pd.DataFrame()\n",
	"real_predicao['Real'] = y_test\n",
	"real_predicao['Predicao'] = predicao\n",
	"real_predicao.head(5)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Criação do modelo GradientBoostingRegressor, treinamento e predição"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"modeloGBR = GradientBoostingRegressor()\n",
	"modeloGBR.fit(x_train, y_train)\n",
	"predicaoGBR = modeloGBR.predict(x_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Real</th>\n",
	" <th>Predicao</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>431.5</td>\n",
	" <td>474.918928</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>559.5</td>\n",
	" <td>514.217154</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>450.7</td>\n",
	" <td>539.906238</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>548.9</td>\n",
	" <td>543.481335</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>396.5</td>\n",
	" <td>437.443556</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Real Predicao\n",
	"0 431.5 474.918928\n",
	"1 559.5 514.217154\n",
	"2 450.7 539.906238\n",
	"3 548.9 543.481335\n",
	"4 396.5 437.443556"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Side-by-side view of the held-out targets (y_test) and the\n",
	"# GradientBoostingRegressor predictions (predicaoGBR, not the linear-regression\n",
	"# output `predicao`). Only the first rows are displayed.\n",
	"real_predicaoGBR = pd.DataFrame()\n",
	"real_predicaoGBR['Real'] = y_test\n",
	"real_predicaoGBR['Predicao'] = predicaoGBR\n",
	"real_predicaoGBR.head(5)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Métricas de avaliação"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"grafico = pd.DataFrame()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"RMSE linearRegression: \n",
	"77.84089186651498\n",
	"RMSE GradientBoostingRegressor: \n",
	"74.56662051253947\n"
	]
	}
	],
	"source": [
	"print(\"RMSE linearRegression: \")\n",
	"print(math.sqrt(metrics.mean_squared_error(y_test, predicao)))\n",
	"print(\"RMSE GradientBoostingRegressor: \")\n",
	"print(math.sqrt(metrics.mean_squared_error(y_test, predicaoGBR)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"MAE linearRegression: \n",
	"61.666621587672786\n",
	"MAE GradientBoostingRegressor: \n",
	"59.14682258347615\n"
	]
	}
	],
	"source": [
	"print(\"MAE linearRegression: \")\n",
	"print(metrics.mean_absolute_error(y_test, predicao))\n",
	"print(\"MAE GradientBoostingRegressor: \")\n",
	"print(metrics.mean_absolute_error(y_test, predicaoGBR))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}