https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
# apply: derive a new column by running a function on each value of an existing column
import pandas as pd
from sklearn.datasets import load_iris
# Load the iris dataset and wrap its measurements in a DataFrame.
iris = load_iris()
# Column names come from the loaded dataset object (`iris`), not from `df`,
# which does not exist yet while this line is being evaluated.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Binary flag: 1 when the sepal length is strictly greater than 5, else 0.
df['HasBigSepal'] = df['sepal length (cm)'].apply(lambda x: 1 if x > 5 else 0)
# Preview the first rows (only displays output in an interactive session).
df.head()
sepal length (cm) sepal width (cm) ... petal width (cm) HasBigSepal
0 5.1 3.5 ... 0.2 1
1 4.9 3.0 ... 0.2 0
2 4.7 3.2 ... 0.2 0
3 4.6 3.1 ... 0.2 0
4 5.0 3.6 ... 0.2 0
[5 rows x 5 columns]
# qcut: bin a numeric column into quantile-based intervals (5 bins here); each value becomes an Interval
In [55]: pd.qcut(df['sepal length (cm)'], 5).head()
Out[55]:
0 (5.0, 5.6]
1 (4.2989999999999995, 5.0]
2 (4.2989999999999995, 5.0]
3 (4.2989999999999995, 5.0]
4 (4.2989999999999995, 5.0]
Name: sepal length (cm), dtype: category
Categories (5, interval[float64]): [(4.2989999999999995, 5.0] < (5.0, 5.6] < (5.6, 6.1] < (6.1, 6.52] < (6.52, 7.9]]
In [56]: pd.qcut(df['sepal length (cm)'], 5)[0]
Out[56]: Interval(5.0, 5.6, closed='right')
In [57]: 1 in pd.qcut(df['sepal length (cm)'], 5)[0]
Out[57]: False
In [58]: 5.4 in pd.qcut(df['sepal length (cm)'], 5)[0]
Out[58]: True
In [65]: df['HasBigSepal'].head()
Out[65]:
0 1
1 0
2 0
3 0
4 0
Name: HasBigSepal, dtype: int64
In [66]: df['HasBigSepal'].map({0: 'small', 1: 'big'}).head()
Out[66]:
0 big
1 small
2 small
3 small
4 small
Name: HasBigSepal, dtype: object
from sklearn.linear_model import SGDClassifier
# Build an SGDClassifier with default settings and fit it in one step.
# NOTE: X and y are not defined in this snippet; they must already exist
# in the session before this runs.
clf = SGDClassifier().fit(X, y)
# Score on the very same X, y used for fitting, so this is a training-set
# score rather than a held-out estimate.
clf.score(X, y)