Skip to content

Instantly share code, notes, and snippets.

class DecisionTreeRegressor:
    """Thin estimator-style wrapper around a tree of ``Node`` objects.

    ``fit`` builds the root ``Node`` (defined elsewhere in this file) and
    stores it on ``self.dtree``; ``predict`` delegates to that root node.
    """

    def fit(self, X, y, min_leaf=5):
        """Build the tree from features ``X`` and targets ``y``.

        Args:
            X: Feature table handed to ``Node`` unchanged.
            y: Target values; ``len(y)`` defines how many rows the root covers.
            min_leaf: Minimum-rows-per-side setting forwarded to ``Node``.

        Returns:
            ``self``, so calls can be chained (``Estimator().fit(X, y)``).
        """
        # The root node covers every row position 0 .. len(y) - 1.
        self.dtree = Node(X, y, np.arange(len(y)), min_leaf)
        return self

    def predict(self, X):
        """Return the root node's predictions for every row of ``X``.

        Args:
            X: Either an object exposing ``.values`` (e.g. a pandas
                DataFrame) or anything ``np.asarray`` can turn into a
                2-D array of rows.
        """
        # DataFrames are unwrapped exactly as before; plain arrays and
        # lists are accepted as well instead of failing on ``.values``.
        rows = X.values if hasattr(X, "values") else np.asarray(X)
        return self.dtree.predict(rows)
class Node:
    """One node of a binary regression tree; the subtree is grown on construction.

    A node covers the rows of ``x``/``y`` at the positions listed in ``idxs``.
    Its prediction ``val`` is the mean target over those rows.  If a valid
    split exists, ``var_idx``/``split`` describe it and ``lhs``/``rhs`` hold
    the child nodes; otherwise the node is a leaf.

    Args:
        x: Feature table exposing ``.values`` and ``.shape`` (e.g. a pandas
            DataFrame).
        y: Target values, indexed positionally.
        idxs: Integer array of the row positions this node covers.
        min_leaf: Minimum number of rows each side of a split must keep.
    """

    def __init__(self, x, y, idxs, min_leaf=5):
        self.x = x
        self.y = y
        self.idxs = idxs
        self.min_leaf = min_leaf
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        # Positional indexing, consistent with ``x.values[idxs, ...]`` below;
        # a label-indexed Series would otherwise be looked up by label.
        self.val = np.mean(np.asarray(y)[idxs])
        # ``inf`` means "no valid split found yet"; ``is_leaf`` relies on it.
        self.score = float('inf')
        # Grow the subtree immediately: nothing else ever triggers the search,
        # and ``find_varsplit`` builds children through this constructor.
        self.find_varsplit()

    def find_varsplit(self):
        """Search every column for the best split and create the children."""
        for c in range(self.col_count):
            self.find_better_split(c)
        if self.is_leaf:
            return
        x = self.split_col
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(self.x, self.y, self.idxs[lhs], self.min_leaf)
        self.rhs = Node(self.x, self.y, self.idxs[rhs], self.min_leaf)

    @property
    def split_col(self):
        """Values of the chosen split column for this node's rows."""
        return self.x.values[self.idxs, self.var_idx]

    @property
    def is_leaf(self):
        """True when no split improved on the initial infinite score."""
        return self.score == float('inf')

    def find_better_split(self, var_idx):
        """Try every row value of column ``var_idx`` as a threshold.

        Keeps the split (``var_idx``, ``split``, ``score``) with the lowest
        score seen so far across all columns.
        """
        x = self.x.values[self.idxs, var_idx]
        # Never accept a split that leaves one side empty, even if
        # ``min_leaf`` was passed as 0.
        smallest_side = max(self.min_leaf, 1)
        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]
            if rhs.sum() < smallest_side or lhs.sum() < smallest_side:
                continue
            curr_score = self.find_score(lhs, rhs)
            if curr_score < self.score:
                # Record the best split so far; these are the attributes that
                # ``split_col``, ``find_varsplit`` and ``predict_row`` read.
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]

    def find_score(self, lhs, rhs):
        """Score a candidate split; lower is better.

        NOTE(review): the original scoring method was missing from this file.
        This uses the standard deviation of the targets on each side weighted
        by the side's row count -- confirm this is the intended criterion.

        Args:
            lhs: Boolean mask over this node's rows going left.
            rhs: Boolean mask over this node's rows going right.
        """
        y = np.asarray(self.y)[self.idxs]
        return y[lhs].std() * lhs.sum() + y[rhs].std() * rhs.sum()

    def predict(self, x):
        """Return an array with one prediction per row of ``x``."""
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        """Walk down the tree for a single row and return the leaf's ``val``."""
        if self.is_leaf:
            return self.val
        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)
# Fit the tree on X / y (both defined earlier, outside this excerpt) and
# predict on the same X.
regressor = DecisionTreeRegressor()
regressor.fit(X, y)  # fit() returns the estimator itself
preds = regressor.predict(X)

# Predict on three feature columns selected from df_test.
X_test = df_test[['OverallQual', 'GrLivArea', 'GarageCars']]
pred_test = regressor.predict(X_test)

# Pair each test Id with its predicted SalePrice and write the CSV
# without the DataFrame index column.
submission = pd.DataFrame(
    {
        'Id': df_test.Id,
        'SalePrice': pred_test,
    }
)
submission.to_csv('submission.csv', index=False)
# IPython/Jupyter shell escape (leading "!"): this line runs `pip` in a shell
# and is only valid inside a notebook cell, not in a plain Python module.
# It installs the pinned 2.0.0-alpha0 build of the tensorflow-gpu package.
# NOTE(review): nothing in the code visible above uses TensorFlow -- this
# looks like the start of a separate notebook cell; confirm it belongs here.
!pip install tensorflow-gpu==2.0.0-alpha0