Skip to content

Instantly share code, notes, and snippets.

@annanay25
Created August 30, 2016 18:46
Show Gist options
  • Save annanay25/b6013e72a469cb250f26e042f924a622 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from sklearn import preprocessing
class decisionnode:
    """One node of a CART-style decision tree.

    A leaf carries a non-None ``results`` dict (label -> count); an
    interior node carries the split criterion (``col``/``value``) and the
    two child branches (``tb``/``fb``).
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # col: column index tested at this node (-1 means "unset").
        # value: threshold (numeric) or category (nominal) compared against.
        # results: leaf-only dict of outcome counts; None on interior nodes.
        # tb / fb: branch taken when the test is true / false.
        self.col, self.value = col, value
        self.results = results
        self.tb, self.fb = tb, fb
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Split rows into (matching, non-matching) on one column.

    Numeric values split by ``>= value``; any other type splits by
    equality. Returns a 2-tuple of row lists.
    """
    if isinstance(value, (int, float)):
        def matches(row):
            # Numeric criterion: true branch is "at or above the threshold".
            return row[column] >= value
    else:
        def matches(row):
            # Nominal criterion: true branch is an exact category match.
            return row[column] == value

    true_rows = [row for row in rows if matches(row)]
    false_rows = [row for row in rows if not matches(row)]
    return (true_rows, false_rows)
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    """Tally outcome labels: the last entry of each row is the label.

    Returns a dict mapping label -> occurrence count.
    """
    counts = {}
    for row in rows:
        label = row[-1]  # outcome label lives in the last column
        counts[label] = counts.get(label, 0) + 1
    return counts
def get_t(train_x, colInstance):
    """Return the integer midpoint of a column's [min, max] range.

    ``train_x`` is indexed as ``train_x[:, colInstance]``, so it must
    support 2-D slicing (e.g. a numpy array); the midpoint is truncated
    toward zero by ``int``.
    """
    column = train_x[:, colInstance]
    lo, hi = min(column), max(column)
    return int((hi + lo) / 2)
def get_above_below(train_x, colInstance):
    """Return the fraction of rows at/below and above the column midpoint.

    Uses ``get_t`` for the threshold, so ``train_x`` must support 2-D
    numpy-style indexing. Returns ``(p_below, p_above)`` which sum to 1.
    """
    threshold = get_t(train_x, colInstance)
    n = len(train_x)
    # Rows whose value is <= threshold count toward the "below" mass.
    flags = [train_x[i, colInstance] <= threshold for i in range(n)]
    below = sum(flags)
    above = n - below
    return below / n, above / n
def printtree(tree, indent=''):
    """Recursively print the tree to stdout.

    Leaves (``results`` is not None) print their count dict; interior
    nodes print ``col:value? `` then both branches prefixed ``T->``/``F->``,
    indenting one extra space per level.
    """
    if tree.results is not None:
        # Leaf node: just show the outcome counts.
        print(str(tree.results))
        return
    # Interior node: print the split criterion...
    print(f"{tree.col}:{tree.value}? ")
    # ...then recurse into the true and false branches.
    print(indent + 'T->', end=' ')
    printtree(tree.tb, indent + ' ')
    print(indent + 'F->', end=' ')
    printtree(tree.fb, indent + ' ')
def buildtree(rows):
    """Build a decision tree from training rows.

    NOTE(review): this function is incomplete work-in-progress. It never
    constructs or returns a tree (implicitly returns None except for the
    empty-input case), and the inner loop references names that are not
    defined anywhere in this scope — see inline notes. The caller in
    ``main`` also passes a plain list (``Train.values.tolist()``) while
    this body indexes ``rows`` like a pandas DataFrame; confirm the
    intended input type before relying on this.
    """
    if len(rows)==0: return decisionnode()
    # Set up some variables to track the best criteria
    # NOTE(review): DataFrame-style access on ``rows`` — fails for the
    # list the current caller supplies.
    YTrain = rows["Target"]
    XTrain = rows.drop("Target",1)
    for colInstance in range(len(XTrain[0])):
        # Midpoint threshold and below/above probability mass for this column.
        t = get_t(XTrain, colInstance)
        a , b = get_above_below(XTrain, colInstance)
        # Now that we have the threshold values, we need to compare with the summation and see if it is lesser.
        # NOTE(review): ``train_x``, ``threshold``, ``below_t``, ``above_t``
        # and ``train_y`` are never defined here (presumably meant to be
        # XTrain, t, and per-class counters) — this loop raises NameError
        # as written.
        for i in range(len(train_x)):
            if XTrain[i, colInstance] <= threshold: below_t[train_y[i]] += 1
            else: above_t[train_y[i]] += 1
    # For each class row, we need to check the splitting class and replace entropy with the given condition (function named above)
    # Also, instead of the Information Gain factor, we will have the probabilistic method given, which has been explained in pseudocode.
    # if below_t[0] > below_t[1]: below_t = below_t[1]
    # else: below_t = below_t[0]
    # if above_t[0] > above_t[1]: above_t = above_t[1]
    # else: above_t = above_t[0]
    # if below*below_t + above*above_t < split_class_val:
    #     split_class_val = below*below_t + above*above_t
    #     split_class = curr_col
    # return split_class
    # This should work for split class identifier.
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
    new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    encoded = df.copy()  # never mutate the caller's frame
    # Unique labels in first-appearance order; index in this array is the code.
    names = encoded[target_column].unique()
    lookup = dict(zip(names, range(len(names))))
    encoded["Target"] = encoded[target_column].replace(lookup)
    return (encoded, names)
def main():
    """Load Training.csv, clean and integer-encode it, build a tree on the
    first 140 rows, and print the top few nodes.

    Side effects: reads 'Training.csv' from the working directory and
    writes to stdout.
    """
    # header=None reads the first file row as data; it is promoted to
    # column headers manually just below.
    data = pd.read_csv('Training.csv', sep=',', header=None)
    # Shuffle data.
    # data = data.iloc[np.random.permutation(len(data))]
    # Eliminate invalid data.
    headers = data.iloc[0]
    data = data[1:]
    data = data.rename(columns=headers)
    # Numeric column: coerce to float and fill missing values with the mean.
    # Assign the result instead of inplace fillna on a column slice — the
    # chained inplace form is deprecated in modern pandas.
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].astype(float)
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].fillna(
        data['Smoking_Pack-Years'].mean())
    data['Pathological_grade'] = data['Pathological_grade'].astype(str)
    # data = data.replace(['?'], [np.nan])
    # Label-encode every column to integers (a fresh fit per column).
    prep = preprocessing.LabelEncoder()
    # print( data.dtypes)
    for cols in data.columns:
        # print (cols)
        data[cols] = prep.fit_transform(data[cols])
    # Map the survival-censor labels to a 0..k-1 "Target" column, then drop
    # the original. Keyword axis= — the positional form was removed in
    # pandas 2.0.
    (data, targets) = encode_target(data, "KM_Overall_survival_censor")
    # print(data)
    # Now we remove the column from the initial table.
    data = data.drop("KM_Overall_survival_censor", axis=1)
    # print(data)
    Train = data[:140]
    Test = data[140:]
    Train = Train.sample(frac=1)
    # Convert the numpy data structure.
    # NOTE(review): buildtree indexes its argument like a DataFrame
    # (rows["Target"], rows.drop) — passing a plain list here will fail;
    # confirm the intended input type.
    tree = buildtree(Train.values.tolist())
    print(tree.col)
    print(tree.value)
    print(tree.results)
    print("")
    print(tree.tb.col)
    print(tree.tb.value)
    print(tree.tb.results)
    print("")
    print(tree.tb.tb.col)
    print(tree.tb.tb.value)
    print(tree.tb.tb.results)
    print("")
    print(tree.tb.fb.col)
    print(tree.tb.fb.value)
    print(tree.tb.fb.results)


# Guard the entry point so importing this module does not run the script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment