Skip to content

Instantly share code, notes, and snippets.

@inoccu
Last active June 26, 2023 00:47
Show Gist options
  • Save inoccu/ead662b0d3e22b3857333c95d41c5668 to your computer and use it in GitHub Desktop.
Save inoccu/ead662b0d3e22b3857333c95d41c5668 to your computer and use it in GitHub Desktop.

Colaboratoryの基礎

Notebookの基礎

import this

requestsパッケージのインストール

!pip install requests

requestsでデータダウンロード

import requests

# Download the raw "imports-85" automobile data file and store the bytes unchanged.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being saved as if it were data.
response = requests.get(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data',
    timeout=30,
)
response.raise_for_status()
with open('imports-85.data', mode='wb') as f:
  f.write(response.content)

タイトル行の追加

# Header line naming the 26 columns; it is prepended to the downloaded file so that
# pandas can read the result as a CSV with column names.
title = 'symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price'

# Read the whole downloaded file as text.
with open('imports-85.data', mode='r') as raw_file:
  raw_rows = raw_file.read()

# Header first, then the original content.
data = title + '\n' + raw_rows

with open('automobile.csv', mode='w') as csv_file:
  csv_file.write(data)

Pandasによるデータの前処理

Pandas(1)データの読み込みと表示

import pandas as pd
pd.set_option('display.max_columns', 30)

df = pd.read_csv('automobile.csv')
df.head()

Pandas(2)データの形状と列名の取得

print('shape:', df.shape)
print('columns:', df.columns)

Pandas(3)列名を指定したデータの取得

df[['make', 'price']]

Pandas(4)条件を指定したデータの取得

df[['make', 'width', 'price']].query('width > 70')
df[df['width'] > 70][['make', 'width', 'price']]

Pandas(5)indexを指定したデータの取得

df[['make', 'price']].loc[10:15]

Pandas(6)グルーピングと統計計算

df[['make', 'width']].groupby(['make']).mean()

Pandas(7)基本統計量を見る

df.describe()

Pandas(8)型の確認

print(df.dtypes)

Pandas(9)特定文字列のNaNへの置換と型変換

import numpy as np

# Columns in which the literal string '?' is swapped for NaN and which are then
# cast to float64.
numeric_columns = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']

# Step 1: turn the '?' placeholder into NaN, column by column.
for column_name in numeric_columns:
    df[column_name] = df[column_name].replace('?', np.nan)

# Step 2: cast all of those columns to float64 in a single astype call.
df = df.astype({column_name: 'float64' for column_name in numeric_columns})

Pandas(10)量的データに絞り込んで基本統計量を見る

df[['width', 'length', 'horsepower', 'price']].describe()

分散

df[['horsepower', 'price']].var()

Pandas(11)カテゴリデータ(質的データ)の値の種類を見る

print(df['make'].unique())
print(type(df['make']))

Pandas(12)欠損値の確認

df.isnull().sum()

Pandas(13)欠損値のある行の削除

print('df count:', df.count())
df_a = df.dropna()
print('df_a count:', df_a.count())

Pandas(14)欠損値の補完

# Fill the NaN values of each listed column with that column's own mean.
# The result goes to a new frame (df_b); df itself is left as it was.
mean_fill_columns = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']
column_means = {column_name: df[column_name].mean() for column_name in mean_fill_columns}
df_b = df.fillna(column_means)

# Per-column count of remaining missing values.
df_b.isnull().sum()

データの可視化

%matplotlib inline

from matplotlib import pyplot as plt
import seaborn as sns

価格のヒストグラム

df_b['price'].hist()

メーカーの棒グラフ

# Count the rows per manufacturer and draw the counts as a bar chart.
# The column is selected as a Series (df['make']) rather than as a one-column
# DataFrame (df[['make']]): DataFrame.value_counts() returns a MultiIndex of
# 1-tuples, which makes the bar labels render as "(toyota,)" instead of "toyota".
make_count = df['make'].value_counts()
make_count
make_count.plot.bar()

馬力と価格の相関係数を求める

df_b[['horsepower', 'price']].corr()

相関行列の作成(1)

# Heat map of the pairwise correlations between the columns of df_b.
# df_b still contains string columns (make, fuel-type, ...) at this point. Since
# pandas 2.0 DataFrame.corr() no longer drops non-numeric columns on its own and
# raises instead, so the numeric restriction is requested explicitly.
plt.figure(figsize=(9, 9))
sns.heatmap(df_b.corr(numeric_only=True), annot=True)

カテゴリ変数のダミー変数化

df2 = pd.get_dummies(df_b[['make']])
df2

DataFrameの結合

df3 = pd.concat([df_b.drop(['make'], axis=1), df2], axis=1)
df3

相関行列の作成(2)

# Heat map of the correlations in df3 (df_b with 'make' replaced by dummy columns).
# Only 'make' was dummy-encoded, so the other categorical columns are still strings;
# numeric_only=True keeps DataFrame.corr() from raising on them under pandas >= 2.0
# while still including the int/float/bool columns.
plt.figure(figsize=(12, 12))
sns.heatmap(df3.corr(numeric_only=True), annot=True)

LabelEncoderの使用

from sklearn.preprocessing import LabelEncoder

# Categorical columns of df_b that are overwritten in place with integer labels.
categorical_columns = [
    'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
    'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
    'fuel-system',
]

# A fresh encoder is fitted for every column; fit_transform learns the labels and
# converts the column in one call.
for category in categorical_columns:
    le = LabelEncoder()
    df_b[category] = le.fit_transform(df_b[category])

df_b
print(df_b.dtypes)

相関行列の作成(3)

plt.figure(figsize=(15, 15))
sns.heatmap(df_b.corr(), annot=True)

散布図行列を描く

# Columns shown in the scatter-plot matrix.
scatter_columns = [
    'drive-wheels', 'wheel-base', 'length', 'width', 'curb-weight',
    'engine-size', 'fuel-system', 'bore', 'horsepower', 'city-mpg',
    'highway-mpg', 'price',
]

# Draw every pairwise scatter plot of the selected columns on one 15x15 figure.
pd.plotting.scatter_matrix(
    df_b[scatter_columns],
    figsize=(15, 15),
    range_padding=0.2,
)
plt.show()

加工したデータフレームをCSVファイルとして保存

df_b.to_csv('automobile_converted.csv', index=False)

scikit-learnによるモデル作成

データの読み込み

import pandas as pd

df = pd.read_csv('automobile_converted.csv')
df.head()

目的変数と説明変数

# Objective (target) variable: the 'price' column, kept both as a Series and as
# a plain NumPy array.
y_var = df['price']
y_array = y_var.values

# Explanatory variables: every column except 'price', again as a DataFrame and
# as a NumPy array.
X_var = df.drop(columns=['price'])
X_array = X_var.values

訓練データとテストデータ

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, train_size=0.8, random_state=0)

線形回帰で機械学習

from sklearn import linear_model

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

傾きと切片

print('傾き: %s' % model.coef_)
print('切片: %s' % model.intercept_)

学習済みモデルの評価

print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

学習済みモデルで予測する

model.predict([X_test[0]])
y_test[0]

決定木でのモデル作成

from sklearn.tree import DecisionTreeRegressor

# Decision-tree regression, limited to a depth of 3.
dtr = DecisionTreeRegressor(max_depth=3)
dtr.fit(X_train, y_train)

# Print the model's score on the training split first, then on the test split.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(dtr.score(features, targets))

決定木を描く

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree

# Draw the fitted decision tree.
# The tree was trained on X_var (df without 'price'), so the node labels must come
# from X_var.columns. df.columns also contains the target 'price', which is not a
# feature and only lined up before because it happens to be the last column.
plt.figure(figsize=(20, 10))
plot_tree(
    dtr,
    label='all', # all, none
    max_depth=3, 
    filled=True,
    feature_names=list(X_var.columns),
    fontsize=12, 
)
plt.show()

ランダムフォレストでのモデル作成

from sklearn.ensemble import RandomForestRegressor

# Random-forest regression: 50 trees, each limited to a depth of 3.
# random_state is fixed so the printed scores are reproducible from run to run,
# matching the fixed random_state=0 used for the train/test split.
rfr = RandomForestRegressor(
    n_estimators=50,
    max_depth=3,
    random_state=0,
)
rfr.fit(X_train, y_train)
# Score on the training split, then on the held-out test split.
print(rfr.score(X_train, y_train))
print(rfr.score(X_test, y_test))

XGBoostでのモデル作成

from xgboost import XGBRegressor

# XGBoost regression with 50 boosting rounds.
# The former arguments use_label_encoder=False and eval_metric='logloss' are
# classifier settings: use_label_encoder only ever applied to XGBClassifier (and
# is rejected/ignored with a warning by recent xgboost releases), and log-loss is
# a classification metric that is not meaningful for a continuous price target.
# RMSE is used as the evaluation metric instead.
xgb = XGBRegressor(
    n_estimators=50,
    eval_metric='rmse',
)
xgb.fit(X_train, y_train)
# Score on the training split, then on the held-out test split.
print(xgb.score(X_train, y_train))
print(xgb.score(X_test, y_test))

Pycaretによるモデル作成の自動化

Pycaretのインストール

!pip install pycaret

データの読み込みと分割

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed data written earlier in the notebook.
df = pd.read_csv('automobile_converted.csv')

# Randomly pick 80 % of the rows for training (fixed seed); the rows that were
# not picked form the test set.
train = df.sample(frac=0.8, random_state=111)
test = df.drop(index=train.index)

# Renumber both frames from 0 and discard the old index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print('train: ' + str(train.shape))
print('test: ' + str(test.shape))

Pycaretにデータをセットアップ

from pycaret.regression import *
reg01 = setup(data=train, target='price')

アルゴリズムの選択

best_model = compare_models()

モデルの作成

model = create_model('ridge')

ハイパーパラメータのチューニング

tuned = tune_model(model)

モデルの評価

evaluate_model(tuned)

モデルのファイナライズ

final = finalize_model(tuned)
print(final)

予測の実行と精度評価

test_predictions = predict_model(final, data=test)
test_predictions.head()

モデルの保存と使用

save_model(final, 'automobile_final')
from pycaret.regression import *
import pandas as pd

# Load the pipeline that was saved as 'automobile_final'.
model = load_model('automobile_final')

# Feature names, in the same order as the values of the sample row below.
columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg']

# One sample car, already in encoded/numeric form.
data = [[2.0, 164.0, 1.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 99.4, 176.6, 66.4, 54.3, 2824.0, 3.0, 1.0, 136.0, 5.0, 3.19, 3.4, 8.0, 115.0, 5500.0, 18.0, 22.0]]

# Pair each value with its column name and build a one-row DataFrame from it.
sample_row = dict(zip(columns, data[0]))
df = pd.DataFrame([sample_row], columns=columns)

predictions = predict_model(model, df)

# The prediction is read from the 'prediction_label' column of the first row.
predicted_price = predictions['prediction_label'].to_numpy()[0]
print('predicted price: %i' % predicted_price)

作成したモデルのデプロイ

FlaskとNgrokのインストール

!pip install flask flask-ngrok pyngrok
!ngrok authtoken <token>

Flaskアプリの作成

from pycaret.regression import *
import pandas as pd
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

# Feature columns expected in every input row, in order (all dataset columns
# except the target 'price').
_FEATURE_COLUMNS = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg']

# Holds the deserialized pipeline so it is read from disk once, not per request.
_loaded_models = {}


def _get_model():
  """Return the saved 'automobile_final' pipeline, loading it on first use."""
  if 'automobile_final' not in _loaded_models:
    _loaded_models['automobile_final'] = load_model('automobile_final')
  return _loaded_models['automobile_final']


@app.route('/predict', methods=['POST'])
def predict():
  """Predict a car price from a JSON request.

  Expects a JSON body of the form ``{"data": [[v1, ..., v25]]}`` whose inner
  list follows the order of ``_FEATURE_COLUMNS``.

  Returns:
    A JSON object ``{"price": <float>}`` holding the prediction for the first
    row, or ``{"error": <message>}`` with HTTP status 400 when the body is
    missing, is not the expected JSON shape, or contains no rows.
  """
  # silent=True yields None instead of raising when the body is not valid JSON,
  # so a bad request is answered with an explicit 400 below.
  payload = request.get_json(silent=True)
  if not isinstance(payload, dict) or 'data' not in payload:
    return jsonify({'error': "request body must be JSON with a 'data' key"}), 400

  try:
    df = pd.DataFrame(payload['data'], columns=_FEATURE_COLUMNS)
  except (ValueError, TypeError) as exc:
    # Raised e.g. when a row does not have exactly len(_FEATURE_COLUMNS) values.
    return jsonify({'error': 'invalid data: %s' % exc}), 400
  if df.empty:
    return jsonify({'error': "'data' must contain at least one row"}), 400

  predictions = predict_model(_get_model(), df)
  return jsonify({
      'price': float(predictions['prediction_label'].iloc[0])
  })

app.run()

APIの実行

!pip install requests
import json
import requests

# Call the /predict endpoint with one sample row.
# json= lets requests serialize the payload and set the Content-Type header
# itself, replacing the manual json.dumps + headers combination.
# NOTE(review): the ngrok host name is hard-coded and presumably belongs to a past
# tunnel session - replace it with the URL of the currently running app.
response = requests.post(
    'http://ce59a77d4613.ngrok.io/predict',
    json={'data': [[2.0, 164.0, 1.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 99.4, 176.6, 66.4, 54.3, 2824.0, 3.0, 1.0, 136.0, 5.0, 3.19, 3.4, 8.0, 115.0, 5500.0, 18.0, 22.0]]},
    timeout=30,
)
# Fail loudly on a 4xx/5xx answer instead of trying to parse an error page as JSON.
response.raise_for_status()
result = response.json()
result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment