Created
August 20, 2018 10:49
-
-
Save Madhivarman/e9e0ab7f5ef03b7f93ffbd62f6afb848 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
# Build a synthetic RFM (Recency / Frequency / Monetary) dataset of
# 10,000 customers.  Each metric is an integer drawn uniformly from [1, 10).
user_id = list(range(10000))
recency = np.random.randint(low=1, high=10, size=10000)
monetary = np.random.randint(low=1, high=10, size=10000)
frequency = np.random.randint(low=1, high=10, size=10000)

# Assemble the raw columns into a single DataFrame.
dummy_data = pd.DataFrame({
    'user_id': user_id,
    'Recency': recency,
    'Monetary': monetary,
    'Frequency': frequency,
})
dummy_data.shape
dummy_data.head()

# Simple RFM score: element-wise sum of the three metrics (range 3..27).
total_value = recency + monetary + frequency
total_value
dummy_data["total_value"] = total_value
dummy_data.head()
# Map each customer's total RFM score onto a named segment bucket.
def _bucket(score):
    """Return the segment name for a single total_value score."""
    if score < 5:
        return "lost"
    if score < 10:
        return "abouttosleep"
    if score < 17:
        return "recentcustomer"
    if score < 25:
        return "loyalcustomer"
    return "champions"

segment = [_bucket(v) for v in total_value]
dummy_data["segment"] = segment
dummy_data.head()

# Encode the segment names as ordinal floats for the clustering step.
# NOTE(review): "recentcustomer" (4.0) outranks "loyalcustomer" (3.0), so
# the codes are not monotone in total_value — confirm this is intentional.
_segment_codes = {
    "recentcustomer": 4.0,
    "champions": 5.0,
    "abouttosleep": 2.0,
    "lost": 1.0,
    "loyalcustomer": 3.0,
}
dummy_data["segment"] = dummy_data["segment"].apply(_segment_codes.get)
# In[11]:
dummy_data.head()
# In[12]:
# Cast the three RFM feature columns to float for TensorFlow consumption.
_feature_cols = ["Frequency", "Monetary", "Recency"]
dummy_data[_feature_cols] = dummy_data[_feature_cols].astype(float)
dummy_data.head()
# In[13]:
# Features are {Frequency, Monetary, Recency}; the target is the encoded
# segment code produced above.
input_data = dummy_data[_feature_cols]
output_data = dummy_data["segment"]
input_data.head()
# In[14]:
output_data.head()
# ## Split the dataset into Train and Test dataset
# In[15]:
# First 7,000 rows train, remaining 3,000 test (no shuffling).
train_X, train_Y = input_data.iloc[:7000], output_data.iloc[:7000]
test_X, test_Y = input_data.iloc[7000:], output_data.iloc[7000:]
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
# Work with raw NumPy arrays from here on.
train_X, train_Y = train_X.values, train_Y.values
test_X, test_Y = test_X.values, test_Y.values
import tensorflow as tf | |
# In[26]: | |
from tensorflow.contrib.factorization import KMeans | |
# --- Graph inputs -------------------------------------------------------
with tf.name_scope("input_variables"):
    # Placeholder for the three float RFM features, one row per customer.
    data = tf.placeholder(tf.float32, shape=[None, 3],name="data")
    # Placeholder for the encoded segment code — a single value per row,
    # NOT a one-hot vector (shape [None, 1]).
    target = tf.placeholder(tf.float32, shape=[None, 1],name="target")
# Training hyper-parameters.
epochs = 500
num_classes = 5    # five segment codes (1.0 .. 5.0)
num_clusters = 5
batch_size = 64    # NOTE(review): defined but never used below — confirm
# Define the mini-batch K-Means op with cosine distance.
with tf.name_scope("KMeans_Architecture"):
    Kmeans = KMeans(inputs = data,
                    num_clusters = num_clusters,
                    distance_metric = 'cosine',
                    use_mini_batch = True)
# Build the training graph.  Depending on the TensorFlow version,
# training_graph() returns 6 or 7 elements (newer versions insert the
# cluster-centers variable), so unpack accordingly.
training_graph = Kmeans.training_graph()
if len(training_graph) > 6:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     cluster_center_var, init_op, train_op) = training_graph
else:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     init_op, train_op) = training_graph
# cluster_idx comes back wrapped in a tuple; keep only the index tensor.
cluster_idx = cluster_idx[0]
# Mean distance of every point to its assigned centroid (progress metric).
avg_distance = tf.reduce_mean(scores)
# Initialize all graph variables.
init_vars = tf.global_variables_initializer()
# Start a tensorflow session.
sess = tf.Session()
# Run the initializers (the K-Means init op needs the data placeholder fed).
sess.run(init_vars, feed_dict={data: train_X})
sess.run(init_op, feed_dict={data: train_X})
# Add ops to save the tensorflow model.
saver = tf.train.Saver()
# --- Training loop ------------------------------------------------------
# Run the K-Means update `epochs` times over the full training set,
# logging the mean point-to-centroid distance every 100 steps.
for i in range(1, epochs+1):
    _, d, idx = sess.run([train_op, avg_distance, cluster_idx],
                         feed_dict={data: train_X})
    if i % 100 == 0 or i == 1:
        print("step:{}, Avg-Distance:{}".format(i, d))

# Persist the trained centroids.
save_path = saver.save(sess, "customer_segmentation_saved_model/model.ckpt")
print("Model saved in the path:{dir}".format(dir=save_path))

# --- Assign a class label to each centroid ------------------------------
# counts[c, k] = number of training rows assigned to cluster c whose true
# segment code is k+1 (codes are the floats 1.0 .. 5.0).
# BUGFIX: the original did `counts[idx[i]] += train_Y[i]`, which broadcast
# the label *value* across the entire row, so every column of a row was
# equal and np.argmax always returned 0.  Count occurrences per
# (cluster, class) cell instead.
counts = np.zeros(shape=(num_clusters, num_classes))
for i in range(len(idx)):
    counts[idx[i], int(train_Y[i]) - 1] += 1

# Most frequent class index (0-based) per centroid.
lables_map = [np.argmax(c) for c in counts]
lables_map = tf.convert_to_tensor(lables_map)

# Lookup: cluster id -> predicted 0-based class index.
cluster_label = tf.nn.embedding_lookup(lables_map, cluster_idx)

# --- Accuracy op --------------------------------------------------------
# BUGFIX: `target` holds the raw segment code (1..5) in a single column,
# not a one-hot row, so the original `tf.argmax(target, 1)` was always 0.
# Compare the predicted class index against the 0-based true code instead.
true_class = tf.cast(tf.reshape(target, [-1]), tf.int32) - 1
correct_prediction = tf.equal(cluster_label, true_class)
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(correct_prediction)
# In[27]:
# Reshape the test labels into the [None, 1] column shape that the
# `target` placeholder expects.
dup = test_Y.reshape(-1,1)
dup
# In[28]:
# Report clustering accuracy on the held-out 3,000 rows.
print("Test Accuracy:{}".format(sess.run(accuracy_op, feed_dict={data:test_X, target:test_Y.reshape(-1,1)})))
# ## Inspect what all variables are stored in the check point ##
# In[30]:
# import the inspect_checkpoint library
from tensorflow.python.tools import inspect_checkpoint as chkp
# Print all tensors stored in the checkpoint file.
chkp.print_tensors_in_checkpoint_file("customer_segmentation_saved_model/model.ckpt", tensor_name='', all_tensors=True)
# In[31]:
# Print only the tensor named 'data' (if present) from the checkpoint.
chkp.print_tensors_in_checkpoint_file("customer_segmentation_saved_model/model.ckpt", tensor_name='data', all_tensors=False)
# ## Do prediction on fresh, unlabeled data ##
# Generate 500 random RFM rows to score.
test_recency = np.random.randint(low=1, high=10, size=500)
test_monetary = np.random.randint(low=1, high=10, size=500)
test_frequency = np.random.randint(low=1, high=10, size=500)
test_df = pd.DataFrame({'Recency':test_recency, 'Monetary':test_monetary, 'Frequency':test_frequency})
test_df.head()
# In[33]:
test_df.values
# In[37]:
## Restore the saved model into a fresh session.
t_sess = tf.Session()
saver = tf.train.Saver()
saver.restore(t_sess, 'customer_segmentation_saved_model/model.ckpt')

# BUGFIX 1: the original fetched tensors by the fragile scope names
# "input_variables_3/data:0" / "...target:0" — the "_3" suffix depends on
# how many times the graph was rebuilt in the notebook.  Since the graph
# objects are still in scope here, reuse the `data` placeholder directly.
# BUGFIX 2: the original ran `accuracy_op`, which requires the `target`
# placeholder — impossible to supply for unlabeled data and would raise.
# Run `cluster_label` to get the predicted segment index per row instead.
# BUGFIX 3: feed the feature columns in the SAME order used at training
# time (Frequency, Monetary, Recency); test_df's natural column order is
# (Recency, Monetary, Frequency), which would silently swap features.
feed_dict_testing = {data: test_df[["Frequency", "Monetary", "Recency"]].values}
result = t_sess.run(cluster_label, feed_dict=feed_dict_testing)
print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment