Created
February 2, 2021 05:48
-
-
Save dianachua/c69484e1af018a870e2c797ae58f1780 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "Requirement already satisfied: scikit-learn in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (0.23.1)\nRequirement already satisfied: numpy>=1.13.3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (1.18.5)\nRequirement already satisfied: joblib>=0.11 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (0.16.0)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (2.1.0)\nRequirement already satisfied: scipy>=0.19.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (1.5.0)\nRequirement already up-to-date: pixiedust in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (1.1.19)\nRequirement already satisfied, skipping upgrade: markdown in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (3.1.1)\nRequirement already satisfied, skipping upgrade: requests in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (2.24.0)\nRequirement already satisfied, skipping upgrade: geojson in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (2.5.0)\nRequirement already satisfied, skipping upgrade: colour in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (0.1.5)\nRequirement already satisfied, skipping upgrade: matplotlib in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (3.2.2)\nRequirement already satisfied, skipping upgrade: pandas in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (1.0.5)\nRequirement already satisfied, skipping upgrade: astunparse in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (1.6.3)\nRequirement already satisfied, skipping upgrade: setuptools>=36 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from markdown->pixiedust) (47.3.1.post20200622)\nRequirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (2020.12.5)\nRequirement already satisfied, skipping upgrade: idna<3,>=2.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (2.9)\nRequirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (3.0.4)\nRequirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (1.25.9)\nRequirement already satisfied, skipping upgrade: cycler>=0.10 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (0.10.0)\nRequirement already satisfied, skipping upgrade: python-dateutil>=2.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (2.8.1)\nRequirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (1.2.0)\nRequirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (2.4.7)\nRequirement already satisfied, skipping upgrade: numpy>=1.11 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (1.18.5)\nRequirement already satisfied, skipping upgrade: pytz>=2017.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->pixiedust) (2020.1)\nRequirement already satisfied, skipping upgrade: six<2.0,>=1.6.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from astunparse->pixiedust) (1.15.0)\nRequirement already satisfied, skipping upgrade: wheel<1.0,>=0.23.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from astunparse->pixiedust) (0.34.2)\n" | |
} | |
], | |
"source": "!pip install scikit-learn\n!pip install --upgrade pixiedust" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "Requirement already satisfied: brunel in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (2.6.2)\nRequirement already satisfied: jinja2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (2.11.2)\nRequirement already satisfied: jupyter-pip in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (0.3.1)\nRequirement already satisfied: Py4J in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (0.10.9.1)\nRequirement already satisfied: pandas in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (1.0.5)\nRequirement already satisfied: ipython in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (7.15.0)\nRequirement already satisfied: MarkupSafe>=0.23 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from jinja2->brunel) (1.1.1)\nRequirement already satisfied: python-dateutil>=2.6.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->brunel) (2.8.1)\nRequirement already satisfied: pytz>=2017.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->brunel) (2020.1)\nRequirement already satisfied: numpy>=1.13.3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->brunel) (1.18.5)\nRequirement already satisfied: backcall in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (0.2.0)\nRequirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (3.0.5)\nRequirement already satisfied: pexpect; sys_platform != \"win32\" in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (4.8.0)\nRequirement already satisfied: pickleshare in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (0.7.5)\nRequirement already satisfied: jedi>=0.10 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (0.17.1)\nRequirement already satisfied: decorator in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (4.4.2)\nRequirement already satisfied: traitlets>=4.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (4.3.3)\nRequirement already satisfied: pygments in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (2.6.1)\nRequirement already satisfied: setuptools>=18.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (47.3.1.post20200622)\nRequirement already satisfied: six>=1.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas->brunel) (1.15.0)\nRequirement already satisfied: wcwidth in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->brunel) (0.2.4)\nRequirement already satisfied: ptyprocess>=0.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pexpect; sys_platform != \"win32\"->ipython->brunel) (0.6.0)\nRequirement already satisfied: parso<0.8.0,>=0.7.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from jedi>=0.10->ipython->brunel) (0.7.0)\nRequirement already satisfied: ipython-genutils in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from traitlets>=4.2->ipython->brunel) (0.2.0)\nRequirement already satisfied: ibm_watson_machine_learning in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (1.0.45)\nRequirement already satisfied: ibm-cos-sdk==2.7.* in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (2.7.0)\nRequirement already satisfied: urllib3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (1.25.9)\nRequirement already satisfied: tabulate in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (0.8.3)\nRequirement already satisfied: certifi in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (2020.12.5)\nRequirement already satisfied: pandas<=1.0.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (1.0.5)\nRequirement already satisfied: lomond in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (0.3.3)\nRequirement already satisfied: requests in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (2.24.0)\nRequirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (0.9.4)\nRequirement already satisfied: ibm-cos-sdk-s3transfer==2.7.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (2.7.0)\nRequirement already satisfied: ibm-cos-sdk-core==2.7.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (2.7.0)\nRequirement already satisfied: numpy>=1.13.3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas<=1.0.5->ibm_watson_machine_learning) (1.18.5)\nRequirement already satisfied: pytz>=2017.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas<=1.0.5->ibm_watson_machine_learning) (2020.1)\nRequirement already satisfied: python-dateutil>=2.6.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas<=1.0.5->ibm_watson_machine_learning) (2.8.1)\nRequirement already satisfied: six>=1.10.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from lomond->ibm_watson_machine_learning) (1.15.0)\nRequirement already satisfied: idna<3,>=2.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->ibm_watson_machine_learning) (2.9)\nRequirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->ibm_watson_machine_learning) (3.0.4)\nRequirement already satisfied: docutils<0.16,>=0.10 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk-core==2.7.0->ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (0.15.2)\nPixiedust database opened successfully\n" | |
}, | |
{ | |
"data": { | |
"text/html": "\n <div style=\"margin:10px\">\n <a href=\"https://github.com/ibm-watson-data-lab/pixiedust\" target=\"_new\">\n <img src=\"https://github.com/ibm-watson-data-lab/pixiedust/raw/master/docs/_static/pd_icon32.png\" style=\"float:left;margin-right:10px\"/>\n </a>\n <span>Pixiedust version 1.1.19</span>\n </div>\n ", | |
"text/plain": "<IPython.core.display.HTML object>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": "/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n return f(*args, **kwds)\n/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n return f(*args, **kwds)\n/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n return f(*args, **kwds)\n/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n return f(*args, **kwds)\n2021-02-02 03:53:51,446 - matplotlib.font_manager - WARNING - findfont: Font family ['serif'] not found. Falling back to DejaVu Sans.\n" | |
} | |
], | |
"source": "!pip install brunel\n!pip install ibm_watson_machine_learning\nimport pixiedust\nimport sklearn\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom scipy.io import arff\nimport brunel\nfrom ibm_watson_machine_learning import APIClient" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "Downloading 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv' from https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv\nDownloaded 463947 bytes\nCreating pandas DataFrame for 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv'. Please wait...\nLoading file using 'pandas'\nSuccessfully created pandas DataFrame for 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv'\n" | |
} | |
], | |
"source": "raw_df=pixiedust.sampleData('https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv')" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"pixiedust": { | |
"displayParams": { | |
"chartsize": "99", | |
"handlerId": "mapView", | |
"keyFields": "LATITUDE,LONGITUDE", | |
"kind": "choropleth-cluster", | |
"mapboxtoken": "pk.eyJ1IjoiYXBpc2NoZG8iLCJhIjoiY2o2cXkxMjUxMDMyaTJ3bGEyajFsZ3Y4cSJ9.mL2PT0XH2vrNiDozb7gO0w", | |
"orientation": "vertical", | |
"rendererId": "mapbox", | |
"rowCount": "100", | |
"sortby": "Keys ASC", | |
"title": "Predicting fraud in auto insurance claim", | |
"valueFields": "POLICE_REPORT" | |
} | |
}, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": "<style type=\"text/css\">.pd_warning{display:none;}</style><div class=\"pd_warning\"><em>Hey, there's something awesome here! To see it, open this notebook outside GitHub, in a viewer like Jupyter</em></div>", | |
"text/plain": "<IPython.core.display.HTML object>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": "display(raw_df)" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Claim within 15 days of policy expiry (date of loss - insurance_policy.expiry)\nraw_df[\"EXPIRY_DATE\"] = pd.to_datetime(raw_df[\"EXPIRY_DATE\"])\nraw_df[\"LOSS_EVENT_TIME\"] = pd.to_datetime(raw_df[\"LOSS_EVENT_TIME\"])\nraw_df[\"DAYS_FROM_LOSS\"] = raw_df[\"LOSS_EVENT_TIME\"] - raw_df[\"EXPIRY_DATE\"]\nraw_df[\"DAYS_FROM_LOSS\"] = abs(raw_df.DAYS_FROM_LOSS.dt.days)\nraw_df.loc[raw_df['DAYS_FROM_LOSS'] >= 15, 'SUSPICIOUS_CLAIM_TIME'] = 1\nraw_df.loc[raw_df['DAYS_FROM_LOSS'] < 15, 'SUSPICIOUS_CLAIM_TIME'] = 0" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "1.0 915\n0.0 60\nName: SUSPICIOUS_CLAIM_TIME, dtype: int64" | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "raw_df[\"SUSPICIOUS_CLAIM_TIME\"].value_counts()" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Expired drivers license (if date of loss > insurance_driver.drivers_license_expiry)\nraw_df[\"DRIVERS_LICENSE_EXPIRY\"] = pd.to_datetime(raw_df[\"DRIVERS_LICENSE_EXPIRY\"])\nraw_df[\"DAYS_FROM_L_EXPIRY\"] = raw_df[\"DRIVERS_LICENSE_EXPIRY\"] - raw_df[\"LOSS_EVENT_TIME\"]\nraw_df[\"DAYS_FROM_L_EXPIRY\"] = raw_df.DAYS_FROM_L_EXPIRY.dt.days\nraw_df.loc[raw_df['DAYS_FROM_L_EXPIRY'] >= 0, 'EXPIRED_LICENSE'] = 0\nraw_df.loc[raw_df['DAYS_FROM_L_EXPIRY'] < 0, 'EXPIRED_LICENSE'] = 1" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Days living at current address (date of loss - insurance_driver.date_at_current_address)\nraw_df[\"DATE_AT_CURRENT_ADDRESS\"] = pd.to_datetime(raw_df[\"DATE_AT_CURRENT_ADDRESS\"])\nraw_df[\"DAYS_AT_ADDRESS\"] = raw_df[\"LOSS_EVENT_TIME\"] - raw_df[\"DATE_AT_CURRENT_ADDRESS\"]\nraw_df[\"DAYS_AT_ADDRESS\"] = abs(raw_df.DAYS_AT_ADDRESS.dt.days)\nraw_df.loc[raw_df['DAYS_AT_ADDRESS'] >= 15, 'SUSPICIOUS_LIVING'] = 1\nraw_df.loc[raw_df['DAYS_AT_ADDRESS'] < 15, 'SUSPICIOUS_LIVING'] = 0" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "1.0 973\n0.0 2\nName: SUSPICIOUS_LIVING, dtype: int64" | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "raw_df[\"SUSPICIOUS_LIVING\"].value_counts()" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "145432.118234 1\n234953.253233 1\n55515.574648 1\n123525.401515 1\n198472.566993 1\n ..\n98941.082789 1\n100299.475188 1\n61050.847328 1\n54548.889011 1\n170432.357027 1\nName: MILES/YEAR, Length: 975, dtype: int64" | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "#7500/year\nraw_df[\"START_DATE\"] = pd.to_datetime(raw_df[\"START_DATE\"])\n#find number of days between policy creation and accident\nraw_df[\"LENGTH_OF_POLICY\"]=(raw_df[\"LOSS_EVENT_TIME\"] - raw_df[\"START_DATE\"]).dt.days\n#convert to years\nraw_df[\"LENGTH_OF_POLICY\"]=raw_df[\"LENGTH_OF_POLICY\"]/365\n#divide Odometer at loss by years\nraw_df[\"MILES/YEAR\"] = raw_df[\"ODOMETER_AT_LOSS\"]/raw_df[\"LENGTH_OF_POLICY\"]\nraw_df[\"MILES/YEAR\"].value_counts()" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# Conflict on whether a policyholder with a low mileage discount experienced a loss with high mileage at the point of loss\nraw_df.loc[raw_df[\"MILES/YEAR\"] <7500, 'LOW_MILEAGE_AT_LOSS'] = 1\nraw_df.loc[raw_df[\"MILES/YEAR\"] >=7500, 'LOW_MILEAGE_AT_LOSS'] = 0" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": "raw_df.loc[raw_df[\"LOW_MILEAGE_USE\"]==raw_df[\"LOW_MILEAGE_AT_LOSS\"], 'SUSPICIOUS_MILEAGE'] = 0\nraw_df.loc[raw_df[\"LOW_MILEAGE_USE\"]!=raw_df[\"LOW_MILEAGE_AT_LOSS\"], 'SUSPICIOUS_MILEAGE'] = 1" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": "raw_df.loc[raw_df[\"CLAIM_AMOUNT\"] <3000, 'EXCESSIVE_CLAIM_AMOUNT'] = 0\nraw_df.loc[raw_df[\"CLAIM_AMOUNT\"] >=3000, 'EXCESSIVE_CLAIM_AMOUNT'] = 1" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": "# dataframes for certain features\nfeatures = ['FLAG_FOR_FRAUD_INV',\n'SUSPICIOUS_MILEAGE',\n'EXPIRED_LICENSE',\n'SUSPICIOUS_CLAIM_TIME',\n'SUSPICIOUS_LIVING',\n'EXCESSIVE_CLAIM_AMOUNT']" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": "df_model = raw_df[features]" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": "#ensure all relevant features are integers\ndf_model[\"SUSPICIOUS_LIVING\"] = df_model[\"SUSPICIOUS_LIVING\"].astype(int)\ndf_model[\"EXPIRED_LICENSE\"] = df_model[\"EXPIRED_LICENSE\"].astype(int)\ndf_model[\"SUSPICIOUS_CLAIM_TIME\"] = df_model[\"SUSPICIOUS_CLAIM_TIME\"].astype(int)\ndf_model[\"SUSPICIOUS_MILEAGE\"] = df_model[\"SUSPICIOUS_MILEAGE\"].astype(int)\ndf_model[\"EXCESSIVE_CLAIM_AMOUNT\"] = df_model[\"EXCESSIVE_CLAIM_AMOUNT\"].astype(int)" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>FLAG_FOR_FRAUD_INV</th>\n <th>INCIDENT_CAUSE</th>\n <th>DESCRIPTION</th>\n <th>CLAIM_STATUS</th>\n <th>ODOMETER_AT_LOSS</th>\n <th>POLICE_REPORT</th>\n <th>CLAIMS_AT_LOSS_DATE</th>\n <th>LOSS_LOCATION_LAT</th>\n <th>LOSS_LOCATION_LONG</th>\n <th>CLAIM_AMOUNT</th>\n <th>...</th>\n <th>SUSPICIOUS_CLAIM_TIME</th>\n <th>DAYS_FROM_L_EXPIRY</th>\n <th>EXPIRED_LICENSE</th>\n <th>DAYS_AT_ADDRESS</th>\n <th>SUSPICIOUS_LIVING</th>\n <th>LENGTH_OF_POLICY</th>\n <th>MILES/YEAR</th>\n <th>LOW_MILEAGE_AT_LOSS</th>\n <th>SUSPICIOUS_MILEAGE</th>\n <th>EXCESSIVE_CLAIM_AMOUNT</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>2.926186</td>\n <td>NaN</td>\n <td>2.977153</td>\n <td>180783.846046</td>\n <td>0.281195</td>\n <td>1.492091</td>\n <td>41.857895</td>\n <td>-87.675014</td>\n <td>2521.147627</td>\n <td>...</td>\n <td>0.933216</td>\n <td>894.546573</td>\n <td>0.042179</td>\n <td>3487.956063</td>\n <td>0.996485</td>\n <td>1.070159</td>\n <td>205170.356701</td>\n <td>0.0</td>\n <td>0.094903</td>\n <td>0.086116</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>3.041872</td>\n <td>NaN</td>\n <td>1.103448</td>\n <td>175504.046305</td>\n <td>0.637931</td>\n <td>1.758621</td>\n <td>41.860133</td>\n <td>-87.679617</td>\n <td>26429.480296</td>\n <td>...</td>\n <td>0.945813</td>\n <td>917.960591</td>\n <td>0.036946</td>\n <td>3252.413793</td>\n <td>1.000000</td>\n <td>0.960490</td>\n <td>230687.578997</td>\n <td>0.0</td>\n <td>0.088670</td>\n <td>0.933498</td>\n </tr>\n </tbody>\n</table>\n<p>2 rows \u00d7 32 columns</p>\n</div>", | |
"text/plain": " FLAG_FOR_FRAUD_INV INCIDENT_CAUSE DESCRIPTION CLAIM_STATUS \\\n0 0 2.926186 NaN 2.977153 \n1 1 3.041872 NaN 1.103448 \n\n ODOMETER_AT_LOSS POLICE_REPORT CLAIMS_AT_LOSS_DATE LOSS_LOCATION_LAT \\\n0 180783.846046 0.281195 1.492091 41.857895 \n1 175504.046305 0.637931 1.758621 41.860133 \n\n LOSS_LOCATION_LONG CLAIM_AMOUNT ... SUSPICIOUS_CLAIM_TIME \\\n0 -87.675014 2521.147627 ... 0.933216 \n1 -87.679617 26429.480296 ... 0.945813 \n\n DAYS_FROM_L_EXPIRY EXPIRED_LICENSE DAYS_AT_ADDRESS SUSPICIOUS_LIVING \\\n0 894.546573 0.042179 3487.956063 0.996485 \n1 917.960591 0.036946 3252.413793 1.000000 \n\n LENGTH_OF_POLICY MILES/YEAR LOW_MILEAGE_AT_LOSS SUSPICIOUS_MILEAGE \\\n0 1.070159 205170.356701 0.0 0.094903 \n1 0.960490 230687.578997 0.0 0.088670 \n\n EXCESSIVE_CLAIM_AMOUNT \n0 0.086116 \n1 0.933498 \n\n[2 rows x 32 columns]" | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "raw_df.groupby(\"FLAG_FOR_FRAUD_INV\", as_index=False).mean()" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": "#split data into x and y variables\nxVar = df_model[[\"EXPIRED_LICENSE\",\"SUSPICIOUS_CLAIM_TIME\",\"SUSPICIOUS_LIVING\",\"SUSPICIOUS_MILEAGE\",\"EXCESSIVE_CLAIM_AMOUNT\"]]\nyVar = df_model[\"FLAG_FOR_FRAUD_INV\"]" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>EXPIRED_LICENSE</th>\n <th>SUSPICIOUS_CLAIM_TIME</th>\n <th>SUSPICIOUS_LIVING</th>\n <th>SUSPICIOUS_MILEAGE</th>\n <th>EXCESSIVE_CLAIM_AMOUNT</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"text/plain": " EXPIRED_LICENSE SUSPICIOUS_CLAIM_TIME SUSPICIOUS_LIVING \\\n0 0 1 1 \n1 1 1 1 \n2 0 1 1 \n3 0 1 1 \n4 0 1 1 \n\n SUSPICIOUS_MILEAGE EXCESSIVE_CLAIM_AMOUNT \n0 0 1 \n1 0 0 \n2 1 1 \n3 0 1 \n4 0 0 " | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "xVar.head()" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "(780, 5) (780,)\n(195, 5) (195,)\n" | |
} | |
], | |
"source": "#split into a test/train set\nX_train, X_test, y_train, y_test = train_test_split(xVar, yVar, test_size=0.2, random_state=42)\nprint (X_train.shape, y_train.shape)\nprint (X_test.shape, y_test.shape)" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "RandomForestClassifier(n_jobs=2, random_state=0)" | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "#train model\nclf = RandomForestClassifier(n_jobs=2, random_state=0)\nclf.fit(X_train, y_train)" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Predicted Result</th>\n <th>0</th>\n <th>1</th>\n </tr>\n <tr>\n <th>Actual Result</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>109</td>\n <td>10</td>\n </tr>\n <tr>\n <th>1</th>\n <td>5</td>\n <td>71</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"text/plain": "Predicted Result 0 1\nActual Result \n0 109 10\n1 5 71" | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "#create confusion matrix to gut check model\npreds = clf.predict(X_test)\npd.crosstab(y_test, preds, rownames=['Actual Result'], colnames=['Predicted Result'])" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "0.9234764579342365" | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": "#create F1 score\nfrom sklearn.metrics import f1_score\nyhat=clf.predict(X_test)\nf1_score(y_test, yhat, average='weighted')" | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": "" | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.7", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment