Skip to content

Instantly share code, notes, and snippets.

@yongbin
Created June 4, 2020 08:15
Show Gist options
  • Save yongbin/f46262f49f95bfd84eb1d9c6dc8a49f1 to your computer and use it in GitHub Desktop.
Save yongbin/f46262f49f95bfd84eb1d9c6dc8a49f1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 패키지 불러오기"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import datetime\n",
"from xgboost import XGBClassifier\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 데이터 불러오기"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Read the four CSV files from the working directory, in this fixed order.\n",
"x_train, y_train, x_test, y_test = map(\n",
"    pd.read_csv,\n",
"    ('X_train.csv', 'Y_train.csv', 'X_test.csv', 'Y_test.csv'),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 평가 데이터 제작"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the 'gender' column of each label table as the target Series.\n",
"Y_train, Y_test = (labels['gender'] for labels in (y_train, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 데이터 합치기"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>date_time</th>\n",
" <th>store</th>\n",
" <th>product</th>\n",
" <th>brand</th>\n",
" <th>corner</th>\n",
" <th>pc</th>\n",
" <th>part</th>\n",
" <th>imported</th>\n",
" <th>amount</th>\n",
" <th>discount</th>\n",
" <th>installment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2000-06-25 12:12</td>\n",
" <td>무역점</td>\n",
" <td>2116050008000</td>\n",
" <td>에스티로더</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>90000</td>\n",
" <td>9000</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>2000-06-25 12:42</td>\n",
" <td>무역점</td>\n",
" <td>4125440008000</td>\n",
" <td>시슬리</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>39000</td>\n",
" <td>3900</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2000-08-26 18:10</td>\n",
" <td>본점</td>\n",
" <td>2116052008000</td>\n",
" <td>크리니크</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>175000</td>\n",
" <td>17500</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>2000-08-26 18:30</td>\n",
" <td>본점</td>\n",
" <td>4106430119900</td>\n",
" <td>듀퐁</td>\n",
" <td>수입의류</td>\n",
" <td>명품토탈</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>455000</td>\n",
" <td>45500</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>2000-09-03 18:02</td>\n",
" <td>무역점</td>\n",
" <td>2139141008000</td>\n",
" <td>랑콤</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>10000</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid date_time store product brand corner pc part \\\n",
"0 0 2000-06-25 12:12 무역점 2116050008000 에스티로더 수입종합화장품 화장품 명품잡화 \n",
"1 0 2000-06-25 12:42 무역점 4125440008000 시슬리 수입종합화장품 화장품 명품잡화 \n",
"2 0 2000-08-26 18:10 본점 2116052008000 크리니크 수입종합화장품 화장품 잡화파트 \n",
"3 0 2000-08-26 18:30 본점 4106430119900 듀퐁 수입의류 명품토탈 잡화파트 \n",
"4 0 2000-09-03 18:02 무역점 2139141008000 랑콤 수입종합화장품 화장품 명품잡화 \n",
"\n",
" imported amount discount installment \n",
"0 1 90000 9000 3 \n",
"1 1 39000 3900 1 \n",
"2 1 175000 17500 3 \n",
"3 1 455000 45500 3 \n",
"4 0 100000 10000 3 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack the train and test rows into one frame with a fresh 0..n-1 index,\n",
"# so the following cells operate on both sets at once.\n",
"train_test = pd.concat((x_train, x_test), ignore_index=True)\n",
"train_test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 데이터 변환"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Split 'date_time' (e.g. '2000-06-25 12:12') into the date text (first 10\n",
"# characters) and the hour as an integer (characters 11-12), then remove the\n",
"# original column in place.\n",
"train_test['date'] = train_test['date_time'].str.slice(0, 10)\n",
"train_test['time'] = train_test['date_time'].str.slice(11, 13).astype('int')\n",
"train_test.drop(columns='date_time', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>store</th>\n",
" <th>product</th>\n",
" <th>brand</th>\n",
" <th>corner</th>\n",
" <th>pc</th>\n",
" <th>part</th>\n",
" <th>imported</th>\n",
" <th>amount</th>\n",
" <th>discount</th>\n",
" <th>installment</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>2116050008000</td>\n",
" <td>에스티로더</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>90000</td>\n",
" <td>9000</td>\n",
" <td>3</td>\n",
" <td>2000-06-25</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>4125440008000</td>\n",
" <td>시슬리</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>39000</td>\n",
" <td>3900</td>\n",
" <td>1</td>\n",
" <td>2000-06-25</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>본점</td>\n",
" <td>2116052008000</td>\n",
" <td>크리니크</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>175000</td>\n",
" <td>17500</td>\n",
" <td>3</td>\n",
" <td>2000-08-26</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>본점</td>\n",
" <td>4106430119900</td>\n",
" <td>듀퐁</td>\n",
" <td>수입의류</td>\n",
" <td>명품토탈</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>455000</td>\n",
" <td>45500</td>\n",
" <td>3</td>\n",
" <td>2000-08-26</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>2139141008000</td>\n",
" <td>랑콤</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>10000</td>\n",
" <td>3</td>\n",
" <td>2000-09-03</td>\n",
" <td>18</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid store product brand corner pc part imported amount \\\n",
"0 0 무역점 2116050008000 에스티로더 수입종합화장품 화장품 명품잡화 1 90000 \n",
"1 0 무역점 4125440008000 시슬리 수입종합화장품 화장품 명품잡화 1 39000 \n",
"2 0 본점 2116052008000 크리니크 수입종합화장품 화장품 잡화파트 1 175000 \n",
"3 0 본점 4106430119900 듀퐁 수입의류 명품토탈 잡화파트 1 455000 \n",
"4 0 무역점 2139141008000 랑콤 수입종합화장품 화장품 명품잡화 0 100000 \n",
"\n",
" discount installment date time \n",
"0 9000 3 2000-06-25 12 \n",
"1 3900 1 2000-06-25 12 \n",
"2 17500 3 2000-08-26 18 \n",
"3 45500 3 2000-08-26 18 \n",
"4 10000 3 2000-09-03 18 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_test.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def ndiscount(x):\n",
" if x != 0:\n",
" return 1\n",
" else:\n",
" return 0"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Flag rows whose discount is non-zero (1) versus zero (0). The vectorized\n",
"# comparison yields the same values as calling ndiscount() on every row,\n",
"# without a Python-level function call per element.\n",
"train_test['ndiscount'] = train_test['discount'].ne(0).astype('int64')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>store</th>\n",
" <th>product</th>\n",
" <th>brand</th>\n",
" <th>corner</th>\n",
" <th>pc</th>\n",
" <th>part</th>\n",
" <th>imported</th>\n",
" <th>amount</th>\n",
" <th>discount</th>\n",
" <th>installment</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" <th>ndiscount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>2116050008000</td>\n",
" <td>에스티로더</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>90000</td>\n",
" <td>9000</td>\n",
" <td>3</td>\n",
" <td>2000-06-25</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>4125440008000</td>\n",
" <td>시슬리</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>39000</td>\n",
" <td>3900</td>\n",
" <td>1</td>\n",
" <td>2000-06-25</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>본점</td>\n",
" <td>2116052008000</td>\n",
" <td>크리니크</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>175000</td>\n",
" <td>17500</td>\n",
" <td>3</td>\n",
" <td>2000-08-26</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>본점</td>\n",
" <td>4106430119900</td>\n",
" <td>듀퐁</td>\n",
" <td>수입의류</td>\n",
" <td>명품토탈</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>455000</td>\n",
" <td>45500</td>\n",
" <td>3</td>\n",
" <td>2000-08-26</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>2139141008000</td>\n",
" <td>랑콤</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>10000</td>\n",
" <td>3</td>\n",
" <td>2000-09-03</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid store product brand corner pc part imported amount \\\n",
"0 0 무역점 2116050008000 에스티로더 수입종합화장품 화장품 명품잡화 1 90000 \n",
"1 0 무역점 4125440008000 시슬리 수입종합화장품 화장품 명품잡화 1 39000 \n",
"2 0 본점 2116052008000 크리니크 수입종합화장품 화장품 잡화파트 1 175000 \n",
"3 0 본점 4106430119900 듀퐁 수입의류 명품토탈 잡화파트 1 455000 \n",
"4 0 무역점 2139141008000 랑콤 수입종합화장품 화장품 명품잡화 0 100000 \n",
"\n",
" discount installment date time ndiscount \n",
"0 9000 3 2000-06-25 12 1 \n",
"1 3900 1 2000-06-25 12 1 \n",
"2 17500 3 2000-08-26 18 1 \n",
"3 45500 3 2000-08-26 18 1 \n",
"4 10000 3 2000-09-03 18 1 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 데이터 분할"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"x = []\n",
"y = []"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Collect the row labels of refunds (negative amount) in x and of all other\n",
"# rows in y. A boolean mask replaces the per-row Python loop, and because x\n",
"# and y are reassigned instead of appended to, re-running this cell cannot\n",
"# accumulate duplicate labels.\n",
"refund_mask = train_test['amount'] < 0\n",
"x = train_test.index[refund_mask].tolist()\n",
"y = train_test.index[~refund_mask].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# x and y hold row labels, so drop by label directly. Going through\n",
"# train_test.index[...] would treat the labels as positions, which is only\n",
"# correct while the index happens to be 0..n-1.\n",
"train_test_purchase = train_test.drop(index=x)  # everything except refunds\n",
"train_test_refund = train_test.drop(index=y)  # refunds only"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>store</th>\n",
" <th>product</th>\n",
" <th>brand</th>\n",
" <th>corner</th>\n",
" <th>pc</th>\n",
" <th>part</th>\n",
" <th>imported</th>\n",
" <th>amount</th>\n",
" <th>discount</th>\n",
" <th>installment</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" <th>ndiscount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>2116050008000</td>\n",
" <td>에스티로더</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>90000</td>\n",
" <td>9000</td>\n",
" <td>3</td>\n",
" <td>2000-06-25</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>4125440008000</td>\n",
" <td>시슬리</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>1</td>\n",
" <td>39000</td>\n",
" <td>3900</td>\n",
" <td>1</td>\n",
" <td>2000-06-25</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>본점</td>\n",
" <td>2116052008000</td>\n",
" <td>크리니크</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>175000</td>\n",
" <td>17500</td>\n",
" <td>3</td>\n",
" <td>2000-08-26</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>본점</td>\n",
" <td>4106430119900</td>\n",
" <td>듀퐁</td>\n",
" <td>수입의류</td>\n",
" <td>명품토탈</td>\n",
" <td>잡화파트</td>\n",
" <td>1</td>\n",
" <td>455000</td>\n",
" <td>45500</td>\n",
" <td>3</td>\n",
" <td>2000-08-26</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>무역점</td>\n",
" <td>2139141008000</td>\n",
" <td>랑콤</td>\n",
" <td>수입종합화장품</td>\n",
" <td>화장품</td>\n",
" <td>명품잡화</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>10000</td>\n",
" <td>3</td>\n",
" <td>2000-09-03</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid store product brand corner pc part imported amount \\\n",
"0 0 무역점 2116050008000 에스티로더 수입종합화장품 화장품 명품잡화 1 90000 \n",
"1 0 무역점 4125440008000 시슬리 수입종합화장품 화장품 명품잡화 1 39000 \n",
"2 0 본점 2116052008000 크리니크 수입종합화장품 화장품 잡화파트 1 175000 \n",
"3 0 본점 4106430119900 듀퐁 수입의류 명품토탈 잡화파트 1 455000 \n",
"4 0 무역점 2139141008000 랑콤 수입종합화장품 화장품 명품잡화 0 100000 \n",
"\n",
" discount installment date time ndiscount \n",
"0 9000 3 2000-06-25 12 1 \n",
"1 3900 1 2000-06-25 12 1 \n",
"2 17500 3 2000-08-26 18 1 \n",
"3 45500 3 2000-08-26 18 1 \n",
"4 10000 3 2000-09-03 18 1 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_test_purchase.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>store</th>\n",
" <th>product</th>\n",
" <th>brand</th>\n",
" <th>corner</th>\n",
" <th>pc</th>\n",
" <th>part</th>\n",
" <th>imported</th>\n",
" <th>amount</th>\n",
" <th>discount</th>\n",
" <th>installment</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" <th>ndiscount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1</td>\n",
" <td>본점</td>\n",
" <td>4234190015074</td>\n",
" <td>바바라</td>\n",
" <td>란제리</td>\n",
" <td>내의란제리</td>\n",
" <td>케주얼,구두,아동</td>\n",
" <td>1</td>\n",
" <td>-35000</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2000-06-17</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1</td>\n",
" <td>본점</td>\n",
" <td>4229811011200</td>\n",
" <td>시슬리</td>\n",
" <td>영트랜드</td>\n",
" <td>영트렌디</td>\n",
" <td>케주얼,구두,아동</td>\n",
" <td>0</td>\n",
" <td>-73000</td>\n",
" <td>-3650</td>\n",
" <td>3</td>\n",
" <td>2000-06-30</td>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>2</td>\n",
" <td>무역점</td>\n",
" <td>4301000017000</td>\n",
" <td>노티카</td>\n",
" <td>트래디셔널</td>\n",
" <td>트래디셔널</td>\n",
" <td>골프/유니캐쥬얼</td>\n",
" <td>0</td>\n",
" <td>-434500</td>\n",
" <td>-43450</td>\n",
" <td>1</td>\n",
" <td>2000-08-27</td>\n",
" <td>19</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>2</td>\n",
" <td>무역점</td>\n",
" <td>4502161930200</td>\n",
" <td>삼성</td>\n",
" <td>가전특정</td>\n",
" <td>가전</td>\n",
" <td>가정용품</td>\n",
" <td>0</td>\n",
" <td>-1416000</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2001-01-03</td>\n",
" <td>10</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>3</td>\n",
" <td>천호점</td>\n",
" <td>4405620111000</td>\n",
" <td>지오다노</td>\n",
" <td>영캐쥬얼</td>\n",
" <td>영트랜디</td>\n",
" <td>영라이브</td>\n",
" <td>0</td>\n",
" <td>-74600</td>\n",
" <td>-3730</td>\n",
" <td>3</td>\n",
" <td>2000-08-27</td>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid store product brand corner pc part imported \\\n",
"18 1 본점 4234190015074 바바라 란제리 내의란제리 케주얼,구두,아동 1 \n",
"21 1 본점 4229811011200 시슬리 영트랜드 영트렌디 케주얼,구두,아동 0 \n",
"39 2 무역점 4301000017000 노티카 트래디셔널 트래디셔널 골프/유니캐쥬얼 0 \n",
"46 2 무역점 4502161930200 삼성 가전특정 가전 가정용품 0 \n",
"60 3 천호점 4405620111000 지오다노 영캐쥬얼 영트랜디 영라이브 0 \n",
"\n",
" amount discount installment date time ndiscount \n",
"18 -35000 0 1 2000-06-17 12 0 \n",
"21 -73000 -3650 3 2000-06-30 11 1 \n",
"39 -434500 -43450 1 2000-08-27 19 1 \n",
"46 -1416000 0 6 2001-01-03 10 0 \n",
"60 -74600 -3730 3 2000-08-27 11 1 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_test_refund.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# features 제작"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"features = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1.총구매액"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>총구매액</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1742000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2880100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>5601350</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2996100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>1045000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 총구매액\n",
"0 0 1742000\n",
"1 1 2880100\n",
"2 2 5601350\n",
"3 3 2996100\n",
"4 4 1045000"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-custid sum of 'amount' over train_test_purchase, with custid as a column.\n",
"f = (\n",
"    train_test_purchase\n",
"    .groupby('custid')['amount']\n",
"    .agg([('총구매액', 'sum')])\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.구매건수"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>구매건수</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 구매건수\n",
"0 0 11\n",
"1 1 24\n",
"2 2 9\n",
"3 3 28\n",
"4 4 4"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-custid number of rows in train_test_purchase, with custid as a column.\n",
"f = (\n",
"    train_test_purchase\n",
"    .groupby('custid')['amount']\n",
"    .agg([('구매건수', 'count')])\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3.환불건수"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>환불건수</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 환불건수\n",
"0 1 2\n",
"1 2 2\n",
"2 3 2\n",
"3 6 2\n",
"4 8 2"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-custid number of rows in train_test_refund (cast to int), with custid\n",
"# as a column. Customers without any refund row do not appear here.\n",
"f = (\n",
"    train_test_refund\n",
"    .groupby('custid')['amount']\n",
"    .agg([('환불건수', 'count')])\n",
"    .astype('int')\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4.평균 구매가격"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>평균구매가격</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>158363.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>120004.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>622372.222222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>107003.571429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>261250.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 평균구매가격\n",
"0 0 158363.636364\n",
"1 1 120004.166667\n",
"2 2 622372.222222\n",
"3 3 107003.571429\n",
"4 4 261250.000000"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-custid mean of 'amount' over train_test_purchase, with custid as a column.\n",
"f = (\n",
"    train_test_purchase\n",
"    .groupby('custid')['amount']\n",
"    .agg([('평균구매가격', 'mean')])\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.평균 할부개월"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>평균할부개월</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2.818182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>3.444444</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2.571429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4.500000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 평균할부개월\n",
"0 0 2.818182\n",
"1 1 2.500000\n",
"2 2 3.444444\n",
"3 3 2.571429\n",
"4 4 4.500000"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-custid mean of 'installment' over train_test_purchase, with custid as a column.\n",
"f = (\n",
"    train_test_purchase\n",
"    .groupby('custid')['installment']\n",
"    .agg([('평균할부개월', 'mean')])\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6.브랜드 다양성"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1900"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n_brand = train_test_purchase['brand'].nunique()\n",
"n_brand"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>브랜드다양성</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0.003684</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0.010000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0.003684</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0.011053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0.002105</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 브랜드다양성\n",
"0 0 0.003684\n",
"1 1 0.010000\n",
"2 2 0.003684\n",
"3 3 0.011053\n",
"4 4 0.002105"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def brand(x):\n",
"    \"\"\"Return the number of distinct values in ``x`` divided by the global ``n_brand``.\"\"\"\n",
"    distinct = x.nunique()\n",
"    return distinct / n_brand\n",
"\n",
"\n",
"# One row per custid holding that customer's brand-diversity ratio.\n",
"f = (\n",
"    train_test_purchase\n",
"    .groupby('custid')['brand']\n",
"    .agg([('브랜드다양성', brand)])\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7.내점 일수"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>내점일수</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 내점일수\n",
"0 0 7\n",
"1 1 16\n",
"2 2 7\n",
"3 3 12\n",
"4 4 2"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-custid count of distinct 'date' values in train_test_purchase, with\n",
"# custid as a column.\n",
"f = (\n",
"    train_test_purchase\n",
"    .groupby('custid')['date']\n",
"    .agg([('내점일수', 'nunique')])\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8.평균 할인금액"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>평균할인금액</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>15836.363636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2511.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>33171.111111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>4515.714286</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>5450.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 평균할인금액\n",
"0 0 15836.363636\n",
"1 1 2511.666667\n",
"2 2 33171.111111\n",
"3 3 4515.714286\n",
"4 4 5450.000000"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-custid mean of 'discount' over train_test_purchase, with custid as a column.\n",
"f = (\n",
"    train_test_purchase\n",
"    .groupby('custid')['discount']\n",
"    .agg([('평균할인금액', 'mean')])\n",
"    .reset_index()\n",
")\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9.파트별 구매건수"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>part</th>\n",
" <th>custid</th>\n",
" <th>가정용품</th>\n",
" <th>가정용품파트</th>\n",
" <th>골프/유니캐쥬얼</th>\n",
" <th>공산품</th>\n",
" <th>공산품파트</th>\n",
" <th>남성의류</th>\n",
" <th>남성정장스포츠</th>\n",
" <th>로얄부띠끄</th>\n",
" <th>로얄부틱</th>\n",
" <th>...</th>\n",
" <th>여성캐쥬얼</th>\n",
" <th>영라이브</th>\n",
" <th>영어덜트캐쥬얼</th>\n",
" <th>영캐릭터</th>\n",
" <th>영플라자</th>\n",
" <th>인터넷백화점</th>\n",
" <th>잡화</th>\n",
" <th>잡화파트</th>\n",
" <th>케주얼,구두,아동</th>\n",
" <th>패션잡화</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.090909</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.090909</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.545455</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0.041667</td>\n",
" <td>0.041667</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.041667</td>\n",
" <td>0.25</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.208333</td>\n",
" <td>0.25</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0.333333</td>\n",
" <td>0.000000</td>\n",
" <td>0.222222</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.111111</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.111111</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.111111</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0.071429</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.107143</td>\n",
" <td>0.0</td>\n",
" <td>0.107143</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.214286</td>\n",
" <td>0.071429</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 32 columns</p>\n",
"</div>"
],
"text/plain": [
"part custid 가정용품 가정용품파트 골프/유니캐쥬얼 공산품 공산품파트 남성의류 \\\n",
"0 0 0.000000 0.000000 0.090909 0.000000 0.0 0.090909 \n",
"1 1 0.041667 0.041667 0.000000 0.000000 0.0 0.000000 \n",
"2 2 0.333333 0.000000 0.222222 0.000000 0.0 0.111111 \n",
"3 3 0.071429 0.000000 0.000000 0.107143 0.0 0.107143 \n",
"4 4 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 \n",
"\n",
"part 남성정장스포츠 로얄부띠끄 로얄부틱 ... 여성캐쥬얼 영라이브 영어덜트캐쥬얼 영캐릭터 영플라자 \\\n",
"0 0.000000 0.00 0.0 ... 0.0 0.000000 0.000000 0.0 0.0 \n",
"1 0.041667 0.25 0.0 ... 0.0 0.000000 0.000000 0.0 0.0 \n",
"2 0.000000 0.00 0.0 ... 0.0 0.000000 0.111111 0.0 0.0 \n",
"3 0.000000 0.00 0.0 ... 0.0 0.214286 0.071429 0.0 0.0 \n",
"4 0.000000 0.00 0.0 ... 0.0 0.000000 0.000000 0.0 0.0 \n",
"\n",
"part 인터넷백화점 잡화 잡화파트 케주얼,구두,아동 패션잡화 \n",
"0 0.0 0.000000 0.545455 0.00 0.0 \n",
"1 0.0 0.000000 0.208333 0.25 0.0 \n",
"2 0.0 0.111111 0.000000 0.00 0.0 \n",
"3 0.0 0.000000 0.000000 0.00 0.0 \n",
"4 0.0 0.000000 0.000000 0.00 0.0 \n",
"\n",
"[5 rows x 32 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count non-null `amount` rows per (custid, part); missing combinations become 0.\n",
"part_counts = pd.pivot_table(\n",
"    train_test_purchase,\n",
"    index='custid',\n",
"    columns='part',\n",
"    values='amount',\n",
"    aggfunc='count',\n",
"    fill_value=0,\n",
")\n",
"\n",
"# Divide each row by its own total so the part columns become per-customer shares;\n",
"# fillna(0) covers rows whose total is 0 (0 / 0 would otherwise be NaN).\n",
"row_totals = part_counts.sum(axis=1)\n",
"f = part_counts.div(row_totals, axis=0).fillna(0).reset_index()\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Queue the per-part share frame for the final merge. Re-running appends a duplicate.\n",
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 10.오전/오후 구매비율"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"def time(x):\n",
"    \"\"\"Return 1 when `x` is 12 or greater, otherwise 0.\"\"\"\n",
"    return 1 if x >= 12 else 0\n",
"\n",
"\n",
"# NOTE: the 'time' column is overwritten with the 0/1 flag, so running this cell a\n",
"# second time maps every row to 0 (both flag values are below 12).\n",
"# presumably 'time' holds the purchase hour before this cell runs -- TODO confirm\n",
"train_test_purchase['time'] = train_test_purchase['time'].map(time)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>오전</th>\n",
" <th>오후</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0.125000</td>\n",
" <td>0.875000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0.222222</td>\n",
" <td>0.777778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0.071429</td>\n",
" <td>0.928571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 오전 오후\n",
"0 0 0.000000 1.000000\n",
"1 1 0.125000 0.875000\n",
"2 2 0.222222 0.777778\n",
"3 3 0.071429 0.928571\n",
"4 4 0.000000 1.000000"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-customer purchase counts split by the 0/1 `time` flag.\n",
"half_day_counts = pd.pivot_table(\n",
"    train_test_purchase,\n",
"    index='custid',\n",
"    columns='time',\n",
"    values='amount',\n",
"    aggfunc='count',\n",
"    fill_value=0,\n",
")\n",
"\n",
"# pivot_table sorts its columns, so flag 0 (before 12) comes first and flag 1 second.\n",
"half_day_counts.columns = ['오전', '오후']\n",
"\n",
"# Turn the counts into each customer's share of purchases per half of the day.\n",
"row_totals = half_day_counts.sum(axis=1)\n",
"f = half_day_counts.div(row_totals, axis=0).fillna(0).reset_index()\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Queue the 오전/오후 share frame for the final merge. Re-running appends a duplicate.\n",
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 11.할인건수"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>할인건수</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 할인건수\n",
"0 0 11\n",
"1 1 9\n",
"2 2 7\n",
"3 3 18\n",
"4 4 2"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-customer sum of the `ndiscount` column, flattened to (custid, 할인건수).\n",
"discount_total = train_test_purchase.groupby('custid')['ndiscount'].sum()\n",
"\n",
"f = discount_total.rename('할인건수').reset_index()\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# Queue the 할인건수 frame for the final merge. Re-running appends a duplicate.\n",
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 12.할부건수"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def isins(x):\n",
"    \"\"\"Return 1 when `x` is greater than 0, otherwise 0.\"\"\"\n",
"    return 1 if x > 0 else 0\n",
"\n",
"\n",
"# Flag every purchase row whose 'installment' value is positive.\n",
"train_test_purchase['ins'] = train_test_purchase['installment'].map(isins)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>할부건수</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 할부건수\n",
"0 0 11\n",
"1 1 24\n",
"2 2 9\n",
"3 3 28\n",
"4 4 4"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-customer sum of the 0/1 `ins` flag, i.e. the number of flagged rows.\n",
"installment_total = train_test_purchase.groupby('custid')['ins'].sum()\n",
"\n",
"f = installment_total.rename('할부건수').reset_index()\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Queue the 할부건수 frame for the final merge. Re-running appends a duplicate.\n",
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 13.계절별 구매비율"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Parse with pd.to_datetime instead of slicing strings: `.str[...]` raises once 'date'\n",
"# has been converted to datetime64 further down, which made this cell impossible to re-run.\n",
"purchase_dates = pd.to_datetime(train_test_purchase['date'])\n",
"train_test_purchase['year'] = purchase_dates.dt.year.astype('int')\n",
"train_test_purchase['month'] = purchase_dates.dt.month.astype('int')\n",
"train_test_purchase['day'] = purchase_dates.dt.day.astype('int')\n",
"\n",
"\n",
"def ismonth(x):\n",
"    \"\"\"Map a month number to a season label.\n",
"\n",
"    3-4 -> '봄', 5-8 -> '여름', 9-10 -> '가을', anything else -> '겨울'.\n",
"    NOTE(review): these ranges differ from the usual 3-5 / 6-8 / 9-11 split -- confirm\n",
"    they are intended.\n",
"    \"\"\"\n",
"    if 3 <= x <= 4:\n",
"        return '봄'\n",
"    if 5 <= x <= 8:\n",
"        return '여름'\n",
"    if 9 <= x <= 10:\n",
"        return '가을'\n",
"    return '겨울'\n",
"\n",
"\n",
"train_test_purchase['season'] = train_test_purchase['month'].apply(ismonth)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>season</th>\n",
" <th>custid</th>\n",
" <th>가을</th>\n",
" <th>겨울</th>\n",
" <th>봄</th>\n",
" <th>여름</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0.272727</td>\n",
" <td>0.272727</td>\n",
" <td>0.090909</td>\n",
" <td>0.363636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0.083333</td>\n",
" <td>0.208333</td>\n",
" <td>0.208333</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0.444444</td>\n",
" <td>0.333333</td>\n",
" <td>0.000000</td>\n",
" <td>0.222222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0.107143</td>\n",
" <td>0.214286</td>\n",
" <td>0.214286</td>\n",
" <td>0.464286</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"season custid 가을 겨울 봄 여름\n",
"0 0 0.272727 0.272727 0.090909 0.363636\n",
"1 1 0.083333 0.208333 0.208333 0.500000\n",
"2 2 0.444444 0.333333 0.000000 0.222222\n",
"3 3 0.107143 0.214286 0.214286 0.464286\n",
"4 4 0.000000 0.000000 0.000000 1.000000"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count non-null `amount` rows per (custid, season); missing combinations become 0.\n",
"season_counts = pd.pivot_table(\n",
"    train_test_purchase,\n",
"    index='custid',\n",
"    columns='season',\n",
"    values='amount',\n",
"    aggfunc='count',\n",
"    fill_value=0,\n",
")\n",
"\n",
"# Normalise each row by its total so the season columns become per-customer shares.\n",
"row_totals = season_counts.sum(axis=1)\n",
"f = season_counts.div(row_totals, axis=0).fillna(0).reset_index()\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Queue the per-season share frame for the final merge. Re-running appends a duplicate.\n",
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 14.주말별 구매비율"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Convert 'date' to datetime values so each entry exposes `.dayofweek` in the next cell.\n",
"train_test_purchase['date'] = pd.to_datetime(train_test_purchase['date'])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"def week(x):\n",
"    \"\"\"Return the `dayofweek` attribute of a single timestamp (pandas: Monday=0 ... Sunday=6).\"\"\"\n",
"    return x.dayofweek\n",
"\n",
"\n",
"# Vectorised equivalent of applying `week` row by row; needs 'date' to be datetime64,\n",
"# which the preceding cell ensures. `week` is kept for any other caller.\n",
"train_test_purchase['week'] = train_test_purchase['date'].dt.dayofweek"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def weekend(x):\n",
"    \"\"\"Return 1 when the day-of-week index `x` is 5 or greater (Sat/Sun in pandas), else 0.\"\"\"\n",
"    return 1 if x >= 5 else 0\n",
"\n",
"\n",
"# Derive the flag from 'date' instead of the current 'week' column: the cell used to read\n",
"# and overwrite 'week', so a second run flagged every row as 0. Needs 'date' as datetime64.\n",
"train_test_purchase['week'] = train_test_purchase['date'].dt.dayofweek.apply(weekend)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>주말</th>\n",
" <th>평일</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0.363636</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0.666667</td>\n",
" <td>0.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0.642857</td>\n",
" <td>0.357143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0.750000</td>\n",
" <td>0.250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" custid 주말 평일\n",
"0 0 0.363636 0.636364\n",
"1 1 0.500000 0.500000\n",
"2 2 0.666667 0.333333\n",
"3 3 0.642857 0.357143\n",
"4 4 0.750000 0.250000"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-customer purchase counts split by the 0/1 weekend flag stored in 'week'.\n",
"f = pd.pivot_table(train_test_purchase, index='custid', columns='week', values='amount', aggfunc='count', fill_value=0)\n",
"\n",
"# The flag is 1 for Sat/Sun and 0 otherwise, and pivot_table sorts its columns, so column 0\n",
"# is the weekday count and column 1 the weekend count. The labels used to be assigned the\n",
"# other way round, which swapped the two features. reindex pins the order and keeps both\n",
"# columns even if one flag value never occurs.\n",
"f = f.reindex(columns=[0, 1], fill_value=0)\n",
"f.columns = ['평일', '주말']\n",
"\n",
"# Convert counts to each customer's weekday / weekend purchase share.\n",
"f = f.div(f.sum(axis=1), axis=0).fillna(0)\n",
"\n",
"f = f.reset_index()\n",
"f.head()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# Queue the weekday/weekend share frame for the final merge. Re-running appends a duplicate.\n",
"features.append(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# features 합치기"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>총구매액</th>\n",
" <th>구매건수</th>\n",
" <th>환불건수</th>\n",
" <th>평균구매가격</th>\n",
" <th>평균할부개월</th>\n",
" <th>브랜드다양성</th>\n",
" <th>내점일수</th>\n",
" <th>평균할인금액</th>\n",
" <th>가정용품</th>\n",
" <th>...</th>\n",
" <th>오전</th>\n",
" <th>오후</th>\n",
" <th>할인건수</th>\n",
" <th>할부건수</th>\n",
" <th>가을</th>\n",
" <th>겨울</th>\n",
" <th>봄</th>\n",
" <th>여름</th>\n",
" <th>주말</th>\n",
" <th>평일</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1742000</td>\n",
" <td>11</td>\n",
" <td>0.0</td>\n",
" <td>158363.636364</td>\n",
" <td>2.818182</td>\n",
" <td>0.003684</td>\n",
" <td>7</td>\n",
" <td>15836.363636</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>11</td>\n",
" <td>11</td>\n",
" <td>0.272727</td>\n",
" <td>0.272727</td>\n",
" <td>0.090909</td>\n",
" <td>0.363636</td>\n",
" <td>0.363636</td>\n",
" <td>0.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2880100</td>\n",
" <td>24</td>\n",
" <td>2.0</td>\n",
" <td>120004.166667</td>\n",
" <td>2.500000</td>\n",
" <td>0.010000</td>\n",
" <td>16</td>\n",
" <td>2511.666667</td>\n",
" <td>0.041667</td>\n",
" <td>...</td>\n",
" <td>0.125000</td>\n",
" <td>0.875000</td>\n",
" <td>9</td>\n",
" <td>24</td>\n",
" <td>0.083333</td>\n",
" <td>0.208333</td>\n",
" <td>0.208333</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>5601350</td>\n",
" <td>9</td>\n",
" <td>2.0</td>\n",
" <td>622372.222222</td>\n",
" <td>3.444444</td>\n",
" <td>0.003684</td>\n",
" <td>7</td>\n",
" <td>33171.111111</td>\n",
" <td>0.333333</td>\n",
" <td>...</td>\n",
" <td>0.222222</td>\n",
" <td>0.777778</td>\n",
" <td>7</td>\n",
" <td>9</td>\n",
" <td>0.444444</td>\n",
" <td>0.333333</td>\n",
" <td>0.000000</td>\n",
" <td>0.222222</td>\n",
" <td>0.666667</td>\n",
" <td>0.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2996100</td>\n",
" <td>28</td>\n",
" <td>2.0</td>\n",
" <td>107003.571429</td>\n",
" <td>2.571429</td>\n",
" <td>0.011053</td>\n",
" <td>12</td>\n",
" <td>4515.714286</td>\n",
" <td>0.071429</td>\n",
" <td>...</td>\n",
" <td>0.071429</td>\n",
" <td>0.928571</td>\n",
" <td>18</td>\n",
" <td>28</td>\n",
" <td>0.107143</td>\n",
" <td>0.214286</td>\n",
" <td>0.214286</td>\n",
" <td>0.464286</td>\n",
" <td>0.642857</td>\n",
" <td>0.357143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>1045000</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>261250.000000</td>\n",
" <td>4.500000</td>\n",
" <td>0.002105</td>\n",
" <td>2</td>\n",
" <td>5450.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.750000</td>\n",
" <td>0.250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 50 columns</p>\n",
"</div>"
],
"text/plain": [
" custid 총구매액 구매건수 환불건수 평균구매가격 평균할부개월 브랜드다양성 내점일수 \\\n",
"0 0 1742000 11 0.0 158363.636364 2.818182 0.003684 7 \n",
"1 1 2880100 24 2.0 120004.166667 2.500000 0.010000 16 \n",
"2 2 5601350 9 2.0 622372.222222 3.444444 0.003684 7 \n",
"3 3 2996100 28 2.0 107003.571429 2.571429 0.011053 12 \n",
"4 4 1045000 4 0.0 261250.000000 4.500000 0.002105 2 \n",
"\n",
" 평균할인금액 가정용품 ... 오전 오후 할인건수 할부건수 가을 \\\n",
"0 15836.363636 0.000000 ... 0.000000 1.000000 11 11 0.272727 \n",
"1 2511.666667 0.041667 ... 0.125000 0.875000 9 24 0.083333 \n",
"2 33171.111111 0.333333 ... 0.222222 0.777778 7 9 0.444444 \n",
"3 4515.714286 0.071429 ... 0.071429 0.928571 18 28 0.107143 \n",
"4 5450.000000 0.000000 ... 0.000000 1.000000 2 4 0.000000 \n",
"\n",
" 겨울 봄 여름 주말 평일 \n",
"0 0.272727 0.090909 0.363636 0.363636 0.636364 \n",
"1 0.208333 0.208333 0.500000 0.500000 0.500000 \n",
"2 0.333333 0.000000 0.222222 0.666667 0.333333 \n",
"3 0.214286 0.214286 0.464286 0.642857 0.357143 \n",
"4 0.000000 0.000000 1.000000 0.750000 0.250000 \n",
"\n",
"[5 rows x 50 columns]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Start from one row per distinct customer id and attach every queued feature frame.\n",
"tr = pd.DataFrame({'custid': train_test['custid'].unique()})\n",
"\n",
"for feature_frame in features:\n",
"    # Join on the customer key explicitly: without `on`, merge joins on every column name\n",
"    # the two frames share, so an accidental name clash would silently change the join.\n",
"    # Customers missing from a feature frame get 0 for its columns.\n",
"    tr = pd.merge(tr, feature_frame, on='custid', how='left').fillna(0)\n",
"\n",
"tr.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 21.할인비율"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"# 할인건수 divided by 구매건수. The former `.round(2)` bound to the denominator only (not the\n",
"# quotient), which reads as if the ratio were rounded; dropped to match 할부비율 below.\n",
"# assumes 구매건수 is a whole-number count, so this leaves the values unchanged -- TODO confirm\n",
"tr['할인비율'] = tr['할인건수'] / tr['구매건수']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 22.평균할인율"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# 평균할인금액 divided by 평균구매가격. The former `.round(2)` applied to the denominator only,\n",
"# slightly perturbing the ratio instead of rounding it; the exact quotient is kept now.\n",
"tr['평균할인율'] = tr['평균할인금액'] / tr['평균구매가격']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 23.환불비율"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# 환불건수 divided by 구매건수. The former `.round(2)` bound to the denominator only (not the\n",
"# quotient), which reads as if the ratio were rounded; dropped to match 할부비율 below.\n",
"# assumes 구매건수 is a whole-number count, so this leaves the values unchanged -- TODO confirm\n",
"tr['환불비율'] = tr['환불건수'] / tr['구매건수']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 24.할부비율"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# 할부건수 divided element-wise by 구매건수.\n",
"tr['할부비율'] = tr['할부건수'].div(tr['구매건수'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 25.하루평균구매금액"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# 총구매액 divided element-wise by 내점일수.\n",
"tr['하루구매금액'] = tr['총구매액'].div(tr['내점일수'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 잘못된 피쳐 삭제"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"# Drop 총구매액; it is already folded into 하루구매금액 above.\n",
"tr = tr.drop(columns='총구매액')"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# Drop 평균구매가격; it is already folded into 평균할인율 above.\n",
"tr = tr.drop(columns='평균구매가격')"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Drop 평균할인금액; it is already folded into 평균할인율 above.\n",
"tr = tr.drop(columns='평균할인금액')"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# Drop 할부건수; it is already folded into 할부비율 above.\n",
"tr = tr.drop(columns='할부건수')"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Drop 할인건수; it is already folded into 할인비율 above.\n",
"tr = tr.drop(columns='할인건수')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# features 나누기"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# Select training rows by membership in the raw training ids rather than by an id-range\n",
"# cut-off, so the split stays correct even if train and test ids interleave. Row order\n",
"# follows `tr`, as before. drop() returns a new frame instead of deleting from a slice.\n",
"X_train = tr[tr['custid'].isin(x_train['custid'])].drop(columns='custid')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Select test rows by membership in the raw test ids rather than by an id-range cut-off,\n",
"# so the split stays correct even if train and test ids interleave. Row order follows\n",
"# `tr`, as before. drop() returns a new frame instead of deleting from a slice.\n",
"X_test = tr[tr['custid'].isin(x_test['custid'])].drop(columns='custid')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 모델 제작"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[10:41:28] WARNING: C:\\Users\\Administrator\\workspace\\xgboost-win64_release_1.1.0\\src\\learner.cc:480: \n",
"Parameters: { n_extimators } might not be used.\n",
"\n",
" This may not be accurate due to some parameters are only used in language bindings but\n",
" passed down to XGBoost core. Or some parameters are not used but slip through this\n",
" verification. Please open an issue if you find above cases.\n",
"\n",
"\n"
]
}
],
"source": [
"# `n_estimators` was misspelled as `n_extimators`; XGBoost reported the unknown parameter\n",
"# as 'might not be used', so the 500-round setting never took effect.\n",
"xgb = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=4)\n",
"xgb.fit(X_train, Y_train)\n",
"\n",
"# Predicted labels for the held-out customers, scored in the evaluation cell below.\n",
"xgb_pred = xgb.predict(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 평가"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"71.52\n",
"0.7151787946986746\n"
]
}
],
"source": [
"# Score once and reuse the value: first as a percentage rounded to two decimals,\n",
"# then as the raw fraction.\n",
"accuracy = accuracy_score(Y_test, xgb_pred)\n",
"print(accuracy.round(4) * 100)\n",
"print(accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>custid</th>\n",
" <th>구매건수</th>\n",
" <th>환불건수</th>\n",
" <th>평균할부개월</th>\n",
" <th>브랜드다양성</th>\n",
" <th>내점일수</th>\n",
" <th>가정용품</th>\n",
" <th>가정용품파트</th>\n",
" <th>골프/유니캐쥬얼</th>\n",
" <th>공산품</th>\n",
" <th>...</th>\n",
" <th>겨울</th>\n",
" <th>봄</th>\n",
" <th>여름</th>\n",
" <th>주말</th>\n",
" <th>평일</th>\n",
" <th>할인비율</th>\n",
" <th>평균할인율</th>\n",
" <th>환불비율</th>\n",
" <th>할부비율</th>\n",
" <th>하루구매금액</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" <td>0.0</td>\n",
" <td>2.818182</td>\n",
" <td>0.003684</td>\n",
" <td>7</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.090909</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.272727</td>\n",
" <td>0.090909</td>\n",
" <td>0.363636</td>\n",
" <td>0.363636</td>\n",
" <td>0.636364</td>\n",
" <td>1.000000</td>\n",
" <td>0.100000</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>248857.142857</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>24</td>\n",
" <td>2.0</td>\n",
" <td>2.500000</td>\n",
" <td>0.010000</td>\n",
" <td>16</td>\n",
" <td>0.041667</td>\n",
" <td>0.041667</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.208333</td>\n",
" <td>0.208333</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" <td>0.500000</td>\n",
" <td>0.375000</td>\n",
" <td>0.020930</td>\n",
" <td>0.083333</td>\n",
" <td>1.0</td>\n",
" <td>180006.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>2.0</td>\n",
" <td>3.444444</td>\n",
" <td>0.003684</td>\n",
" <td>7</td>\n",
" <td>0.333333</td>\n",
" <td>0.000000</td>\n",
" <td>0.222222</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.333333</td>\n",
" <td>0.000000</td>\n",
" <td>0.222222</td>\n",
" <td>0.666667</td>\n",
" <td>0.333333</td>\n",
" <td>0.777778</td>\n",
" <td>0.053298</td>\n",
" <td>0.222222</td>\n",
" <td>1.0</td>\n",
" <td>800192.857143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" <td>2.0</td>\n",
" <td>2.571429</td>\n",
" <td>0.011053</td>\n",
" <td>12</td>\n",
" <td>0.071429</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.107143</td>\n",
" <td>...</td>\n",
" <td>0.214286</td>\n",
" <td>0.214286</td>\n",
" <td>0.464286</td>\n",
" <td>0.642857</td>\n",
" <td>0.357143</td>\n",
" <td>0.642857</td>\n",
" <td>0.042202</td>\n",
" <td>0.071429</td>\n",
" <td>1.0</td>\n",
" <td>249675.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>4.500000</td>\n",
" <td>0.002105</td>\n",
" <td>2</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.750000</td>\n",
" <td>0.250000</td>\n",
" <td>0.500000</td>\n",
" <td>0.020861</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>522500.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 50 columns</p>\n",
"</div>"
],
"text/plain": [
" custid 구매건수 환불건수 평균할부개월 브랜드다양성 내점일수 가정용품 가정용품파트 골프/유니캐쥬얼 \\\n",
"0 0 11 0.0 2.818182 0.003684 7 0.000000 0.000000 0.090909 \n",
"1 1 24 2.0 2.500000 0.010000 16 0.041667 0.041667 0.000000 \n",
"2 2 9 2.0 3.444444 0.003684 7 0.333333 0.000000 0.222222 \n",
"3 3 28 2.0 2.571429 0.011053 12 0.071429 0.000000 0.000000 \n",
"4 4 4 0.0 4.500000 0.002105 2 0.000000 0.000000 0.000000 \n",
"\n",
" 공산품 ... 겨울 봄 여름 주말 평일 할인비율 \\\n",
"0 0.000000 ... 0.272727 0.090909 0.363636 0.363636 0.636364 1.000000 \n",
"1 0.000000 ... 0.208333 0.208333 0.500000 0.500000 0.500000 0.375000 \n",
"2 0.000000 ... 0.333333 0.000000 0.222222 0.666667 0.333333 0.777778 \n",
"3 0.107143 ... 0.214286 0.214286 0.464286 0.642857 0.357143 0.642857 \n",
"4 0.000000 ... 0.000000 0.000000 1.000000 0.750000 0.250000 0.500000 \n",
"\n",
" 평균할인율 환불비율 할부비율 하루구매금액 \n",
"0 0.100000 0.000000 1.0 248857.142857 \n",
"1 0.020930 0.083333 1.0 180006.250000 \n",
"2 0.053298 0.222222 1.0 800192.857143 \n",
"3 0.042202 0.071429 1.0 249675.000000 \n",
"4 0.020861 0.000000 1.0 522500.000000 \n",
"\n",
"[5 rows x 50 columns]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the final feature table after the ratio columns were added and their inputs dropped.\n",
"tr.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment