Created
August 30, 2016 18:46
-
-
Save annanay25/b6013e72a469cb250f26e042f924a622 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn import preprocessing | |
class decisionnode:
    """One node of a decision tree.

    A leaf node carries a non-None ``results`` dict (outcome -> count);
    an internal node carries the split criterion (``col``, ``value``) and
    its two child subtrees (``tb``, ``fb``).
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # col: index of the column this node tests (-1 means "no split").
        # value: value the column is compared against at this node.
        # results: outcome counts for a leaf node; None for internal nodes.
        # tb: subtree for rows that satisfy the criterion ("true" branch).
        # fb: subtree for rows that fail the criterion ("false" branch).
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
# Divides a set on a specific column. Can handle numeric | |
# or nominal values | |
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Split ``rows`` into two lists based on the value in ``column``.

    Numeric split values use ``>=`` (threshold split); any other value
    uses equality (nominal/categorical split).

    Returns a tuple ``(set1, set2)``: the rows that satisfy the
    criterion and the rows that do not.
    """
    # Choose the predicate once, outside the comprehensions.
    # NOTE: bool is a subclass of int, so boolean values take the
    # numeric (>=) path — same as the original behavior.
    if isinstance(value, (int, float)):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value
    # Divide the rows into two sets and return them.
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)
# Create counts of possible results (the last column of | |
# each row is the result) | |
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    """Count occurrences of each outcome in ``rows``.

    The outcome/class label is taken from the last column of each row.
    Returns a dict mapping outcome -> count (empty dict for no rows).
    """
    results = {}
    for row in rows:
        r = row[-1]  # the class label lives in the last column
        results[r] = results.get(r, 0) + 1
    return results
def get_t(train_x, colInstance):
    """Midpoint threshold for one column of ``train_x``.

    Computes ``int((max + min) / 2)`` over column ``colInstance``.
    Assumes ``train_x`` supports 2-D indexing (e.g. a numpy array) —
    TODO confirm the caller's type.
    """
    column = train_x[:, colInstance]
    return int((max(column) + min(column)) / 2)
def get_above_below(train_x, colInstance):
    """Fraction of rows at-or-below / above the column's midpoint threshold.

    Uses ``get_t`` for the threshold, then counts rows with a single
    vectorized comparison instead of a Python-level loop over indices.

    Returns ``(p_below, p_above)`` — two fractions that sum to 1.
    Assumes ``train_x`` is a non-empty numpy array — TODO confirm.
    """
    threshold = get_t(train_x, colInstance)
    n = len(train_x)
    # Vectorized count: one C-level pass instead of a list comprehension
    # over range(len(train_x)).
    below = int((train_x[:, colInstance] <= threshold).sum())
    return below / n, (n - below) / n
def printtree(tree, indent=''):
    """Pretty-print a decision tree, indenting branches by depth.

    Leaf nodes (non-None ``results``) print their outcome counts;
    internal nodes print their split criterion followed by the true
    and false branches.
    """
    # Is this a leaf node?  Use identity comparison with None (PEP 8),
    # not `!=`.
    if tree.results is not None:
        print(str(tree.results))
    else:
        # Print the criteria
        print(str(tree.col) + ':' + str(tree.value) + '? ')
        # Print the branches, one indent level deeper.
        print(indent + 'T->', end=' ')
        printtree(tree.tb, indent + ' ')
        print(indent + 'F->', end=' ')
        printtree(tree.fb, indent + ' ')
def buildtree(rows):
    # NOTE(review): this function is an unfinished work-in-progress.
    # The inner loop references names that are never defined anywhere in
    # this file (`train_x`, `train_y`, `threshold`, `below_t`, `above_t`),
    # so it raises NameError if reached. It also indexes `rows` like a
    # pandas DataFrame (`rows["Target"]`, `.drop`), while main() passes a
    # plain list (`Train.values.tolist()`) — confirm the intended input
    # type before fixing. It currently returns None on the non-empty path.
    if len(rows)==0: return decisionnode()
    # Set up some variables to track the best criteria
    YTrain = rows["Target"]
    XTrain = rows.drop("Target",1)
    # Presumably iterates over feature columns; `XTrain[0]` selects a
    # column label, not a row, if XTrain is a DataFrame — verify.
    for colInstance in range(len(XTrain[0])):
        t = get_t(XTrain, colInstance)
        a , b = get_above_below(XTrain, colInstance)
        # Now that we have the threshold values, we need to compare with
        # the summation and see if it is lesser.
        # NOTE(review): `train_x`/`train_y`/`threshold`/`below_t`/`above_t`
        # below are undefined — probably meant XTrain/YTrain/t plus
        # per-class counters initialized before this loop.
        for i in range(len(train_x)):
            if XTrain[i, colInstance] <= threshold: below_t[train_y[i]] += 1
            else: above_t[train_y[i]] += 1
    # For each class row, we need to check the splitting class and replace
    # entropy with the given condition (function named above).
    # Also, instead of the Information Gain factor, we will have the
    # probabilistic method given, which has been explained in pseudocode.
    # if below_t[0] > below_t[1]: below_t = below_t[1]
    # else: below_t = below_t[0]
    # if above_t[0] > above_t[1]: above_t = above_t[1]
    # else: above_t = above_t[0]
    # if below*below_t + above*above_t < split_class_val:
    #     split_class_val = below*below_t + above*above_t
    #     split_class = curr_col
    # return split_class
    # This should work for split class identifier.
def encode_target(df, target_column):
    """Return a copy of *df* with an integer-coded "Target" column.

    Each distinct value of ``target_column`` gets an integer code in
    order of first appearance; the original column is left in place.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing new Target column.

    Returns
    -------
    df_mod -- modified copy of the DataFrame.
    targets -- array of distinct target names (index == integer code).
    """
    encoded = df.copy()
    targets = encoded[target_column].unique()
    # Map each label to its position in order of first appearance.
    codes = dict(zip(targets, range(len(targets))))
    encoded["Target"] = encoded[target_column].replace(codes)
    return (encoded, targets)
def main():
    """Load Training.csv, preprocess it, build a tree and print its top levels."""
    # The CSV's first row holds the column names, but we read with
    # header=None and promote that row to headers ourselves below.
    data = pd.read_csv('Training.csv', sep=',', header=None)
    # Shuffle data.
    # data = data.iloc[np.random.permutation(len(data))]
    # Eliminate invalid data.
    headers = data.iloc[0]
    data = data[1:]
    data = data.rename(columns=headers)
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].astype(float)
    # Impute missing pack-years with the column mean. Plain assignment
    # instead of fillna(..., inplace=True) avoids pandas
    # chained-assignment pitfalls.
    data['Smoking_Pack-Years'] = data['Smoking_Pack-Years'].fillna(
        data['Smoking_Pack-Years'].mean())
    data['Pathological_grade'] = data['Pathological_grade'].astype(str)
    # data = data.replace(['?'], [np.nan])
    prep = preprocessing.LabelEncoder()
    # print(data.dtypes)
    # Label-encode every column so the tree code only sees integers.
    for cols in data.columns:
        # print(cols)
        data[cols] = prep.fit_transform(data[cols])
    (data, targets) = encode_target(data, "KM_Overall_survival_censor")
    # print(data)
    # Now we remove the original survival column; "Target" replaces it.
    # (axis must be a keyword: positional axis was removed in pandas 2.0.)
    data = data.drop("KM_Overall_survival_censor", axis=1)
    # print(data)
    Train = data[:140]
    Test = data[140:]
    Train = Train.sample(frac=1)  # shuffle the training rows
    # Convert the numpy data structure to a plain list of rows.
    tree = buildtree(Train.values.tolist())
    print(tree.col)
    print(tree.value)
    print(tree.results)
    print("")
    print(tree.tb.col)
    print(tree.tb.value)
    print(tree.tb.results)
    print("")
    print(tree.tb.tb.col)
    print(tree.tb.tb.value)
    print(tree.tb.tb.results)
    print("")
    print(tree.tb.fb.col)
    print(tree.tb.fb.value)
    print(tree.tb.fb.results)


# Guard the entry point so importing this module doesn't run the script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment