Created
August 30, 2016 18:46
-
-
Save annanay25/b6013e72a469cb250f26e042f924a622 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn import preprocessing | |
class decisionnode:
    """One node of a decision tree.

    A leaf node carries a non-None ``results`` dict (outcome -> count);
    an internal node carries the split criterion (``col``, ``value``) and
    its two child subtrees (``tb``, ``fb``).
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # col: index of the column this node tests (-1 means "no split").
        # value: value the column is compared against at this node.
        # results: outcome counts for a leaf node; None for internal nodes.
        # tb: subtree for rows that satisfy the criterion ("true" branch).
        # fb: subtree for rows that fail the criterion ("false" branch).
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
# Divides a set on a specific column. Can handle numeric | |
# or nominal values | |
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Split ``rows`` into two lists based on the value in ``column``.

    Numeric split values use ``>=`` (threshold split); any other value
    uses equality (nominal/categorical split).

    Returns a tuple ``(set1, set2)``: the rows that satisfy the
    criterion and the rows that do not.
    """
    # Choose the predicate once, outside the comprehensions.
    # NOTE: bool is a subclass of int, so boolean values take the
    # numeric (>=) path — same as the original behavior.
    if isinstance(value, (int, float)):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value
    # Divide the rows into two sets and return them.
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)
# Create counts of possible results (the last column of | |
# each row is the result) | |
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    """Count occurrences of each outcome in ``rows``.

    The outcome/class label is taken from the last column of each row.
    Returns a dict mapping outcome -> count (empty dict for no rows).
    """
    results = {}
    for row in rows:
        r = row[-1]  # the class label lives in the last column
        results[r] = results.get(r, 0) + 1
    return results
def get_t(train_x, colInstance):
    """Midpoint threshold for one column of ``train_x``.

    Computes ``int((max + min) / 2)`` over column ``colInstance``.
    Assumes ``train_x`` supports 2-D indexing (e.g. a numpy array) —
    TODO confirm the caller's type.
    """
    column = train_x[:, colInstance]
    return int((max(column) + min(column)) / 2)
def get_above_below(train_x, colInstance):
    """Fraction of rows at-or-below / above the column's midpoint threshold.

    Uses ``get_t`` for the threshold, then counts rows with a single
    vectorized comparison instead of a Python-level loop over indices.

    Returns ``(p_below, p_above)`` — two fractions that sum to 1.
    Assumes ``train_x`` is a non-empty numpy array — TODO confirm.
    """
    threshold = get_t(train_x, colInstance)
    n = len(train_x)
    # Vectorized count: one C-level pass instead of a list comprehension
    # over range(len(train_x)).
    below = int((train_x[:, colInstance] <= threshold).sum())
    return below / n, (n - below) / n
def printtree(tree, indent=''):
    """Pretty-print a decision tree, indenting branches by depth.

    Leaf nodes (non-None ``results``) print their outcome counts;
    internal nodes print their split criterion followed by the true
    and false branches.
    """
    # Is this a leaf node?  Use identity comparison with None (PEP 8),
    # not `!=`.
    if tree.results is not None:
        print(str(tree.results))
    else:
        # Print the criteria
        print(str(tree.col) + ':' + str(tree.value) + '? ')
        # Print the branches, one indent level deeper.
        print(indent + 'T->', end=' ')
        printtree(tree.tb, indent + ' ')
        print(indent + 'F->', end=' ')
        printtree(tree.fb, indent + ' ')
def buildtree(rows):
    # NOTE(review): this function is an unfinished work-in-progress.
    # The inner loop references names that are never defined anywhere in
    # this file (`train_x`, `train_y`, `threshold`, `below_t`, `above_t`),
    # so it raises NameError if reached. It also indexes `rows` like a
    # pandas DataFrame (`rows["Target"]`, `.drop`), while main() passes a
    # plain list (`Train.values.tolist()`) — confirm the intended input
    # type before fixing. It currently returns None on the non-empty path.
    if len(rows)==0: return decisionnode()
    # Set up some variables to track the best criteria
    YTrain = rows["Target"]
    XTrain = rows.drop("Target",1)
    # Presumably iterates over feature columns; `XTrain[0]` selects a
    # column label, not a row, if XTrain is a DataFrame — verify.
    for colInstance in range(len(XTrain[0])):
        t = get_t(XTrain, colInstance)
        a , b = get_above_below(XTrain, colInstance)
        # Now that we have the threshold values, we need to compare with
        # the summation and see if it is lesser.
        # NOTE(review): `train_x`/`train_y`/`threshold`/`below_t`/`above_t`
        # below are undefined — probably meant XTrain/YTrain/t plus
        # per-class counters initialized before this loop.
        for i in range(len(train_x)):
            if XTrain[i, colInstance] <= threshold: below_t[train_y[i]] += 1
            else: above_t[train_y[i]] += 1
    # For each class row, we need to check the splitting class and replace
    # entropy with the given condition (function named above).
    # Also, instead of the Information Gain factor, we will have the
    # probabilistic method given, which has been explained in pseudocode.
    # if below_t[0] > below_t[1]: below_t = below_t[1]
    # else: below_t = below_t[0]
    # if above_t[0] > above_t[1]: above_t = above_t[1]
    # else: above_t = above_t[0]
    # if below*below_t + above*above_t < split_class_val:
    #     split_class_val = below*below_t + above*above_t
    #     split_class = curr_col
    # return split_class
    # This should work for split class identifier.
def encode_target(df, target_column):
    """Return a copy of *df* with an integer-coded "Target" column.

    Each distinct value of ``target_column`` gets an integer code in
    order of first appearance; the original column is left in place.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing new Target column.

    Returns
    -------
    df_mod -- modified copy of the DataFrame.
    targets -- array of distinct target names (index == integer code).
    """
    encoded = df.copy()
    targets = encoded[target_column].unique()
    # Map each label to its position in order of first appearance.
    codes = dict(zip(targets, range(len(targets))))
    encoded["Target"] = encoded[target_column].replace(codes)
    return (encoded, targets)
def main():
    """Load Training.csv, preprocess it, build a tree and print its top levels."""
    # The CSV's first row holds the column names, but we read with
    # header=None and promote that row to headers ourselves below.
    data = pd.read_csv('Training.csv', sep=',', header=None)
    # Shuffle data.
    # data = data.iloc[np.random.permutation(len(data))]
    # Eliminate invalid data.
    headers = data.iloc[0]
    data = data[1:]
    data = data.rename(columns=headers)
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].astype(float)
    # Impute missing pack-years with the column mean. Plain assignment
    # instead of fillna(..., inplace=True) avoids pandas
    # chained-assignment pitfalls.
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].fillna(
        data['Smoking_Pack-Years'].mean())
    data['Pathological_grade'] = data['Pathological_grade'].astype(str)
    # data = data.replace(['?'], [np.nan])
    prep = preprocessing.LabelEncoder()
    # print(data.dtypes)
    # Label-encode every column so the tree code only sees integers.
    for cols in data.columns:
        # print(cols)
        data[cols] = prep.fit_transform(data[cols])
    (data, targets) = encode_target(data, "KM_Overall_survival_censor")
    # print(data)
    # Now we remove the original survival column; "Target" replaces it.
    # (axis must be a keyword: positional axis was removed in pandas 2.0.)
    data = data.drop("KM_Overall_survival_censor", axis=1)
    # print(data)
    Train = data[:140]
    Test = data[140:]
    Train = Train.sample(frac=1)  # shuffle the training rows
    # Convert the numpy data structure to a plain list of rows.
    tree = buildtree(Train.values.tolist())
    print(tree.col)
    print(tree.value)
    print(tree.results)
    print("")
    print(tree.tb.col)
    print(tree.tb.value)
    print(tree.tb.results)
    print("")
    print(tree.tb.tb.col)
    print(tree.tb.tb.value)
    print(tree.tb.tb.results)
    print("")
    print(tree.tb.fb.col)
    print(tree.tb.fb.value)
    print(tree.tb.fb.results)


# Guard the entry point so importing this module doesn't run the script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment