dianachua · February 2, 2021 05:48
diff --git a/Predicting auto insurance fraud 2.ipynb b/Predicting auto insurance fraud 2.ipynb
 {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": 1,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": "Requirement already satisfied: scikit-learn in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (0.23.1)\nRequirement already satisfied: numpy>=1.13.3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (1.18.5)\nRequirement already satisfied: joblib>=0.11 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (0.16.0)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (2.1.0)\nRequirement already satisfied: scipy>=0.19.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from scikit-learn) (1.5.0)\nRequirement already up-to-date: pixiedust in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (1.1.19)\nRequirement already satisfied, skipping upgrade: markdown in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (3.1.1)\nRequirement already satisfied, skipping upgrade: requests in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (2.24.0)\nRequirement already satisfied, skipping upgrade: geojson in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (2.5.0)\nRequirement already satisfied, skipping upgrade: colour in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (0.1.5)\nRequirement already satisfied, skipping upgrade: matplotlib in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (3.2.2)\nRequirement already satisfied, skipping upgrade: pandas in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (1.0.5)\nRequirement already satisfied, skipping upgrade: astunparse in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pixiedust) (1.6.3)\nRequirement already satisfied, skipping upgrade: setuptools>=36 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from markdown->pixiedust) (47.3.1.post20200622)\nRequirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (2020.12.5)\nRequirement already satisfied, skipping upgrade: idna<3,>=2.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (2.9)\nRequirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (3.0.4)\nRequirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->pixiedust) (1.25.9)\nRequirement already satisfied, skipping upgrade: cycler>=0.10 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (0.10.0)\nRequirement already satisfied, skipping upgrade: python-dateutil>=2.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (2.8.1)\nRequirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (1.2.0)\nRequirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (2.4.7)\nRequirement already satisfied, skipping upgrade: numpy>=1.11 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from matplotlib->pixiedust) (1.18.5)\nRequirement already satisfied, skipping upgrade: pytz>=2017.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->pixiedust) (2020.1)\nRequirement already satisfied, skipping upgrade: six<2.0,>=1.6.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from astunparse->pixiedust) (1.15.0)\nRequirement already satisfied, skipping upgrade: wheel<1.0,>=0.23.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from astunparse->pixiedust) (0.34.2)\n"
                }
            ],
            "source": "!pip install scikit-learn\n!pip install --upgrade pixiedust"
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": "Requirement already satisfied: brunel in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (2.6.2)\nRequirement already satisfied: jinja2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (2.11.2)\nRequirement already satisfied: jupyter-pip in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (0.3.1)\nRequirement already satisfied: Py4J in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (0.10.9.1)\nRequirement already satisfied: pandas in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (1.0.5)\nRequirement already satisfied: ipython in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from brunel) (7.15.0)\nRequirement already satisfied: MarkupSafe>=0.23 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from jinja2->brunel) (1.1.1)\nRequirement already satisfied: python-dateutil>=2.6.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->brunel) (2.8.1)\nRequirement already satisfied: pytz>=2017.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->brunel) (2020.1)\nRequirement already satisfied: numpy>=1.13.3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas->brunel) (1.18.5)\nRequirement already satisfied: backcall in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (0.2.0)\nRequirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (3.0.5)\nRequirement already satisfied: pexpect; sys_platform != \"win32\" in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (4.8.0)\nRequirement already satisfied: pickleshare in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (0.7.5)\nRequirement already satisfied: jedi>=0.10 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (0.17.1)\nRequirement already satisfied: decorator in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (4.4.2)\nRequirement already satisfied: traitlets>=4.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (4.3.3)\nRequirement already satisfied: pygments in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (2.6.1)\nRequirement already satisfied: setuptools>=18.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ipython->brunel) (47.3.1.post20200622)\nRequirement already satisfied: six>=1.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas->brunel) (1.15.0)\nRequirement already satisfied: wcwidth in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->brunel) (0.2.4)\nRequirement already satisfied: ptyprocess>=0.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pexpect; sys_platform != \"win32\"->ipython->brunel) (0.6.0)\nRequirement already satisfied: parso<0.8.0,>=0.7.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from jedi>=0.10->ipython->brunel) (0.7.0)\nRequirement already satisfied: ipython-genutils in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from traitlets>=4.2->ipython->brunel) (0.2.0)\nRequirement already satisfied: ibm_watson_machine_learning in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (1.0.45)\nRequirement already satisfied: ibm-cos-sdk==2.7.* in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (2.7.0)\nRequirement already satisfied: urllib3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (1.25.9)\nRequirement already satisfied: tabulate in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (0.8.3)\nRequirement already satisfied: certifi in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (2020.12.5)\nRequirement already satisfied: pandas<=1.0.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (1.0.5)\nRequirement already satisfied: lomond in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (0.3.3)\nRequirement already satisfied: requests in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm_watson_machine_learning) (2.24.0)\nRequirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (0.9.4)\nRequirement already satisfied: ibm-cos-sdk-s3transfer==2.7.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (2.7.0)\nRequirement already satisfied: ibm-cos-sdk-core==2.7.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (2.7.0)\nRequirement already satisfied: numpy>=1.13.3 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas<=1.0.5->ibm_watson_machine_learning) (1.18.5)\nRequirement already satisfied: pytz>=2017.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas<=1.0.5->ibm_watson_machine_learning) (2020.1)\nRequirement already satisfied: python-dateutil>=2.6.1 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from pandas<=1.0.5->ibm_watson_machine_learning) (2.8.1)\nRequirement already satisfied: six>=1.10.0 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from lomond->ibm_watson_machine_learning) (1.15.0)\nRequirement already satisfied: idna<3,>=2.5 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->ibm_watson_machine_learning) (2.9)\nRequirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from requests->ibm_watson_machine_learning) (3.0.4)\nRequirement already satisfied: docutils<0.16,>=0.10 in /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages (from ibm-cos-sdk-core==2.7.0->ibm-cos-sdk==2.7.*->ibm_watson_machine_learning) (0.15.2)\nPixiedust database opened successfully\n"
                },
                {
                    "data": {
                        "text/html": "\n        <div style=\"margin:10px\">\n            <a href=\"https://github.com/ibm-watson-data-lab/pixiedust\" target=\"_new\">\n                <img src=\"https://github.com/ibm-watson-data-lab/pixiedust/raw/master/docs/_static/pd_icon32.png\" style=\"float:left;margin-right:10px\"/>\n            </a>\n            <span>Pixiedust version 1.1.19</span>\n        </div>\n        ",
                        "text/plain": "<IPython.core.display.HTML object>"
                    },
                    "metadata": {},
                    "output_type": "display_data"
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": "/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n  return f(*args, **kwds)\n/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n  return f(*args, **kwds)\n/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n  return f(*args, **kwds)\n/opt/conda/envs/Python-3.7-main/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n  return f(*args, **kwds)\n2021-02-02 03:53:51,446 - matplotlib.font_manager - WARNING - findfont: Font family ['serif'] not found. Falling back to DejaVu Sans.\n"
                }
            ],
            "source": "!pip install brunel\n!pip install ibm_watson_machine_learning\nimport pixiedust\nimport sklearn\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom scipy.io import arff\nimport brunel\nfrom ibm_watson_machine_learning import APIClient"
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": "Downloading 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv' from https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv\nDownloaded 463947 bytes\nCreating pandas DataFrame for 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv'. Please wait...\nLoading file using 'pandas'\nSuccessfully created pandas DataFrame for 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv'\n"
                }
            ],
            "source": "raw_df=pixiedust.sampleData('https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv')"
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "metadata": {
                "pixiedust": {
                    "displayParams": {
                        "chartsize": "99",
                        "handlerId": "mapView",
                        "keyFields": "LATITUDE,LONGITUDE",
                        "kind": "choropleth-cluster",
                        "mapboxtoken": "pk.eyJ1IjoiYXBpc2NoZG8iLCJhIjoiY2o2cXkxMjUxMDMyaTJ3bGEyajFsZ3Y4cSJ9.mL2PT0XH2vrNiDozb7gO0w",
                        "orientation": "vertical",
                        "rendererId": "mapbox",
                        "rowCount": "100",
                        "sortby": "Keys ASC",
                        "title": "Predicting fraud in auto insurance claim",
                        "valueFields": "POLICE_REPORT"
                    }
                },
                "scrolled": true
            },
            "outputs": [
                {
                    "data": {
                        "text/html": "<style type=\"text/css\">.pd_warning{display:none;}</style><div class=\"pd_warning\"><em>Hey, there's something awesome here! To see it, open this notebook outside GitHub, in a viewer like Jupyter</em></div>",
                        "text/plain": "<IPython.core.display.HTML object>"
                    },
                    "metadata": {},
                    "output_type": "display_data"
                }
            ],
            "source": "display(raw_df)"
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "metadata": {},
            "outputs": [],
            "source": "# Claim within 15 days of policy expiry (date of loss - insurance_policy.expiry)\nraw_df[\"EXPIRY_DATE\"] = pd.to_datetime(raw_df[\"EXPIRY_DATE\"])\nraw_df[\"LOSS_EVENT_TIME\"] = pd.to_datetime(raw_df[\"LOSS_EVENT_TIME\"])\nraw_df[\"DAYS_FROM_LOSS\"] = raw_df[\"LOSS_EVENT_TIME\"] - raw_df[\"EXPIRY_DATE\"]\nraw_df[\"DAYS_FROM_LOSS\"] = abs(raw_df.DAYS_FROM_LOSS.dt.days)\nraw_df.loc[raw_df['DAYS_FROM_LOSS'] >= 15, 'SUSPICIOUS_CLAIM_TIME'] = 1\nraw_df.loc[raw_df['DAYS_FROM_LOSS'] < 15, 'SUSPICIOUS_CLAIM_TIME'] = 0"
        },
        {
            "cell_type": "code",
            "execution_count": 7,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": "1.0    915\n0.0     60\nName: SUSPICIOUS_CLAIM_TIME, dtype: int64"
                    },
                    "execution_count": 7,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "raw_df[\"SUSPICIOUS_CLAIM_TIME\"].value_counts()"
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "metadata": {},
            "outputs": [],
            "source": "# Expired drivers license (if date of loss > insurance_driver.drivers_license_expiry)\nraw_df[\"DRIVERS_LICENSE_EXPIRY\"] = pd.to_datetime(raw_df[\"DRIVERS_LICENSE_EXPIRY\"])\nraw_df[\"DAYS_FROM_L_EXPIRY\"] = raw_df[\"DRIVERS_LICENSE_EXPIRY\"] - raw_df[\"LOSS_EVENT_TIME\"]\nraw_df[\"DAYS_FROM_L_EXPIRY\"] = raw_df.DAYS_FROM_L_EXPIRY.dt.days\nraw_df.loc[raw_df['DAYS_FROM_L_EXPIRY'] >= 0, 'EXPIRED_LICENSE'] = 0\nraw_df.loc[raw_df['DAYS_FROM_L_EXPIRY'] < 0, 'EXPIRED_LICENSE'] = 1"
        },
        {
            "cell_type": "code",
            "execution_count": 9,
            "metadata": {},
            "outputs": [],
            "source": "# Days living at current address (date of loss - insurance_driver.date_at_current_address)\nraw_df[\"DATE_AT_CURRENT_ADDRESS\"] = pd.to_datetime(raw_df[\"DATE_AT_CURRENT_ADDRESS\"])\nraw_df[\"DAYS_AT_ADDRESS\"] = raw_df[\"LOSS_EVENT_TIME\"] - raw_df[\"DATE_AT_CURRENT_ADDRESS\"]\nraw_df[\"DAYS_AT_ADDRESS\"] = abs(raw_df.DAYS_AT_ADDRESS.dt.days)\nraw_df.loc[raw_df['DAYS_AT_ADDRESS'] >= 15, 'SUSPICIOUS_LIVING'] = 1\nraw_df.loc[raw_df['DAYS_AT_ADDRESS'] < 15, 'SUSPICIOUS_LIVING'] = 0"
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": "1.0    973\n0.0      2\nName: SUSPICIOUS_LIVING, dtype: int64"
                    },
                    "execution_count": 10,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "raw_df[\"SUSPICIOUS_LIVING\"].value_counts()"
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": "145432.118234    1\n234953.253233    1\n55515.574648     1\n123525.401515    1\n198472.566993    1\n                ..\n98941.082789     1\n100299.475188    1\n61050.847328     1\n54548.889011     1\n170432.357027    1\nName: MILES/YEAR, Length: 975, dtype: int64"
                    },
                    "execution_count": 11,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "#7500/year\nraw_df[\"START_DATE\"] = pd.to_datetime(raw_df[\"START_DATE\"])\n#find number of days between policy creation and accident\nraw_df[\"LENGTH_OF_POLICY\"]=(raw_df[\"LOSS_EVENT_TIME\"] - raw_df[\"START_DATE\"]).dt.days\n#convert to years\nraw_df[\"LENGTH_OF_POLICY\"]=raw_df[\"LENGTH_OF_POLICY\"]/365\n#divide Odometer at loss by years\nraw_df[\"MILES/YEAR\"] = raw_df[\"ODOMETER_AT_LOSS\"]/raw_df[\"LENGTH_OF_POLICY\"]\nraw_df[\"MILES/YEAR\"].value_counts()"
        },
        {
            "cell_type": "code",
            "execution_count": 13,
            "metadata": {},
            "outputs": [],
            "source": "# Conflict on whether a policyholder with a low mileage discount experienced a loss with high mileage at the point of loss\nraw_df.loc[raw_df[\"MILES/YEAR\"] <7500, 'LOW_MILEAGE_AT_LOSS'] = 1\nraw_df.loc[raw_df[\"MILES/YEAR\"] >=7500, 'LOW_MILEAGE_AT_LOSS'] = 0"
        },
        {
            "cell_type": "code",
            "execution_count": 14,
            "metadata": {},
            "outputs": [],
            "source": "raw_df.loc[raw_df[\"LOW_MILEAGE_USE\"]==raw_df[\"LOW_MILEAGE_AT_LOSS\"], 'SUSPICIOUS_MILEAGE'] = 0\nraw_df.loc[raw_df[\"LOW_MILEAGE_USE\"]!=raw_df[\"LOW_MILEAGE_AT_LOSS\"], 'SUSPICIOUS_MILEAGE'] = 1"
        },
        {
            "cell_type": "code",
            "execution_count": 15,
            "metadata": {},
            "outputs": [],
            "source": "raw_df.loc[raw_df[\"CLAIM_AMOUNT\"] <3000, 'EXCESSIVE_CLAIM_AMOUNT'] = 0\nraw_df.loc[raw_df[\"CLAIM_AMOUNT\"] >=3000, 'EXCESSIVE_CLAIM_AMOUNT'] = 1"
        },
        {
            "cell_type": "code",
            "execution_count": 16,
            "metadata": {},
            "outputs": [],
            "source": "# dataframes for certain features\nfeatures = ['FLAG_FOR_FRAUD_INV',\n'SUSPICIOUS_MILEAGE',\n'EXPIRED_LICENSE',\n'SUSPICIOUS_CLAIM_TIME',\n'SUSPICIOUS_LIVING',\n'EXCESSIVE_CLAIM_AMOUNT']"
        },
        {
            "cell_type": "code",
            "execution_count": 17,
            "metadata": {},
            "outputs": [],
            "source": "df_model = raw_df[features]"
        },
        {
            "cell_type": "code",
            "execution_count": 18,
            "metadata": {},
            "outputs": [],
            "source": "#ensure all relevant features are integers\ndf_model[\"SUSPICIOUS_LIVING\"] = df_model[\"SUSPICIOUS_LIVING\"].astype(int)\ndf_model[\"EXPIRED_LICENSE\"] = df_model[\"EXPIRED_LICENSE\"].astype(int)\ndf_model[\"SUSPICIOUS_CLAIM_TIME\"] = df_model[\"SUSPICIOUS_CLAIM_TIME\"].astype(int)\ndf_model[\"SUSPICIOUS_MILEAGE\"] = df_model[\"SUSPICIOUS_MILEAGE\"].astype(int)\ndf_model[\"EXCESSIVE_CLAIM_AMOUNT\"] = df_model[\"EXCESSIVE_CLAIM_AMOUNT\"].astype(int)"
        },
        {
            "cell_type": "code",
            "execution_count": 19,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>FLAG_FOR_FRAUD_INV</th>\n      <th>INCIDENT_CAUSE</th>\n      <th>DESCRIPTION</th>\n      <th>CLAIM_STATUS</th>\n      <th>ODOMETER_AT_LOSS</th>\n      <th>POLICE_REPORT</th>\n      <th>CLAIMS_AT_LOSS_DATE</th>\n      <th>LOSS_LOCATION_LAT</th>\n      <th>LOSS_LOCATION_LONG</th>\n      <th>CLAIM_AMOUNT</th>\n      <th>...</th>\n      <th>SUSPICIOUS_CLAIM_TIME</th>\n      <th>DAYS_FROM_L_EXPIRY</th>\n      <th>EXPIRED_LICENSE</th>\n      <th>DAYS_AT_ADDRESS</th>\n      <th>SUSPICIOUS_LIVING</th>\n      <th>LENGTH_OF_POLICY</th>\n      <th>MILES/YEAR</th>\n      <th>LOW_MILEAGE_AT_LOSS</th>\n      <th>SUSPICIOUS_MILEAGE</th>\n      <th>EXCESSIVE_CLAIM_AMOUNT</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>2.926186</td>\n      <td>NaN</td>\n      <td>2.977153</td>\n      <td>180783.846046</td>\n      <td>0.281195</td>\n      <td>1.492091</td>\n      <td>41.857895</td>\n      <td>-87.675014</td>\n      <td>2521.147627</td>\n      <td>...</td>\n      <td>0.933216</td>\n      <td>894.546573</td>\n      <td>0.042179</td>\n      <td>3487.956063</td>\n      <td>0.996485</td>\n      <td>1.070159</td>\n      <td>205170.356701</td>\n      <td>0.0</td>\n      <td>0.094903</td>\n      <td>0.086116</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>3.041872</td>\n      <td>NaN</td>\n      <td>1.103448</td>\n      <td>175504.046305</td>\n      <td>0.637931</td>\n      <td>1.758621</td>\n      <td>41.860133</td>\n      <td>-87.679617</td>\n      <td>26429.480296</td>\n      <td>...</td>\n      <td>0.945813</td>\n      <td>917.960591</td>\n      <td>0.036946</td>\n      <td>3252.413793</td>\n      <td>1.000000</td>\n      <td>0.960490</td>\n      <td>230687.578997</td>\n      <td>0.0</td>\n      <td>0.088670</td>\n      <td>0.933498</td>\n    </tr>\n  </tbody>\n</table>\n<p>2 rows \u00d7 32 columns</p>\n</div>",
                        "text/plain": "   FLAG_FOR_FRAUD_INV  INCIDENT_CAUSE  DESCRIPTION  CLAIM_STATUS  \\\n0                   0        2.926186          NaN      2.977153   \n1                   1        3.041872          NaN      1.103448   \n\n   ODOMETER_AT_LOSS  POLICE_REPORT  CLAIMS_AT_LOSS_DATE  LOSS_LOCATION_LAT  \\\n0     180783.846046       0.281195             1.492091          41.857895   \n1     175504.046305       0.637931             1.758621          41.860133   \n\n   LOSS_LOCATION_LONG  CLAIM_AMOUNT  ...  SUSPICIOUS_CLAIM_TIME  \\\n0          -87.675014   2521.147627  ...               0.933216   \n1          -87.679617  26429.480296  ...               0.945813   \n\n   DAYS_FROM_L_EXPIRY  EXPIRED_LICENSE  DAYS_AT_ADDRESS  SUSPICIOUS_LIVING  \\\n0          894.546573         0.042179      3487.956063           0.996485   \n1          917.960591         0.036946      3252.413793           1.000000   \n\n   LENGTH_OF_POLICY     MILES/YEAR  LOW_MILEAGE_AT_LOSS  SUSPICIOUS_MILEAGE  \\\n0          1.070159  205170.356701                  0.0            0.094903   \n1          0.960490  230687.578997                  0.0            0.088670   \n\n   EXCESSIVE_CLAIM_AMOUNT  \n0                0.086116  \n1                0.933498  \n\n[2 rows x 32 columns]"
                    },
                    "execution_count": 19,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "raw_df.groupby(\"FLAG_FOR_FRAUD_INV\", as_index=False).mean()"
        },
        {
            "cell_type": "code",
            "execution_count": 22,
            "metadata": {},
            "outputs": [],
            "source": "#split data into x and y variables\nxVar = df_model[[\"EXPIRED_LICENSE\",\"SUSPICIOUS_CLAIM_TIME\",\"SUSPICIOUS_LIVING\",\"SUSPICIOUS_MILEAGE\",\"EXCESSIVE_CLAIM_AMOUNT\"]]\nyVar = df_model[\"FLAG_FOR_FRAUD_INV\"]"
        },
        {
            "cell_type": "code",
            "execution_count": 23,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>EXPIRED_LICENSE</th>\n      <th>SUSPICIOUS_CLAIM_TIME</th>\n      <th>SUSPICIOUS_LIVING</th>\n      <th>SUSPICIOUS_MILEAGE</th>\n      <th>EXCESSIVE_CLAIM_AMOUNT</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
                        "text/plain": "   EXPIRED_LICENSE  SUSPICIOUS_CLAIM_TIME  SUSPICIOUS_LIVING  \\\n0                0                      1                  1   \n1                1                      1                  1   \n2                0                      1                  1   \n3                0                      1                  1   \n4                0                      1                  1   \n\n   SUSPICIOUS_MILEAGE  EXCESSIVE_CLAIM_AMOUNT  \n0                   0                       1  \n1                   0                       0  \n2                   1                       1  \n3                   0                       1  \n4                   0                       0  "
                    },
                    "execution_count": 23,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "xVar.head()"
        },
        {
            "cell_type": "code",
            "execution_count": 37,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": "(780, 5) (780,)\n(195, 5) (195,)\n"
                }
            ],
            "source": "#split into a test/train set\nX_train, X_test, y_train, y_test = train_test_split(xVar, yVar, test_size=0.2, random_state=42)\nprint (X_train.shape, y_train.shape)\nprint (X_test.shape, y_test.shape)"
        },
        {
            "cell_type": "code",
            "execution_count": 39,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": "RandomForestClassifier(n_jobs=2, random_state=0)"
                    },
                    "execution_count": 39,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "#train model\nclf = RandomForestClassifier(n_jobs=2, random_state=0)\nclf.fit(X_train, y_train)"
        },
        {
            "cell_type": "code",
            "execution_count": 40,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th>Predicted Result</th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n    <tr>\n      <th>Actual Result</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>109</td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>5</td>\n      <td>71</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
                        "text/plain": "Predicted Result    0   1\nActual Result            \n0                 109  10\n1                   5  71"
                    },
                    "execution_count": 40,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "#create confusion matrix to gut check model\npreds = clf.predict(X_test)\npd.crosstab(y_test, preds, rownames=['Actual Result'], colnames=['Predicted Result'])"
        },
        {
            "cell_type": "code",
            "execution_count": 48,
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": "0.9234764579342365"
                    },
                    "execution_count": 48,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": "#create F1 score\nfrom sklearn.metrics import f1_score\nyhat=clf.predict(X_test)\nf1_score(y_test, yhat, average='weighted')"
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": ""
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3.7",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.7.9"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 1
 }