spyhi · October 24, 2017 05:49 · spyhi · Oct 24, 2017 · nikolai3d · Oct 24, 2017
diff --git a/BCclusterAnalysis.py b/BCclusterAnalysis.py
 import pandas as pd #Using Pandas for DataFrame
 import numpy as np
 import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D #Create 3D plot
 from sklearn.cluster import KMeans #Import learning algorithm

 # Simple KMeans cluster analysis on breast cancer data using Python, SKLearn, Numpy, and Pandas
 # Created for ICS 491 (Big Data) at University of Hawaii at Manoa, Fall 2017
 # Questions? Tweet me at https://twitter.com/spyhi

 # Load the breast cancer data with Pandas. The relative path is resolved against
 # the current working directory, so run the script from the folder holding the CSV.
 # Total about 570 samples.
 # I got my data from Kaggle at the following URL:
 # https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
 data = pd.read_csv('breast_cancer_data.csv')

 # Was originally going to use all these feature columns
 # but decided to test on a small amount to visualize and ended up being happy with results.
 # Kept for reference only: the feature array built below uses feat_cols_sm.
 # (The third entry used to be a second "texture_mean"; "perimeter_mean" is the
 # column that sits between texture and area in the Kaggle CSV.)
 feat_cols = [
     "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
     "smoothness_mean", "compactness_mean", "concavity_mean",
     "symmetry_mean", "fractal_dimension_mean",
 ]

 # Actual features used, based on previous knowledge of breast cancer diagnosis factors
 feat_cols_sm = ["radius_mean", "concavity_mean", "symmetry_mean"]

 # Select the chosen columns and convert them to a Numpy array of feature vectors
 # (one row per sample, one column per entry in feat_cols_sm).
 features = np.array(data[feat_cols_sm])

 # Build a two-cluster KMeans model (hoping the clusters line up with malignant
 # vs benign) and fit it to the selected features in one step; fit() hands back
 # the fitted estimator.
 clusters = KMeans(max_iter=300, n_clusters=2).fit(features)

 # Keep the fitted results in variables: one label per sample, one centroid per cluster.
 labels = clusters.labels_
 centroids = clusters.cluster_centers_

 # Sanity check
 print(centroids)

 # Create new MatPlotLib figure
 fig = plt.figure()
 # Add a single subplot that uses the 3D projection
 ax = fig.add_subplot(111, projection='3d')
 # This means "red" and "blue"; the cluster label (0 or 1) indexes into this list.
 colors = ["r", "b"]

 # Plot every sample in ONE scatter call, colored by its cluster label.
 # Calling scatter once per point creates a separate artist for each sample,
 # which makes the figure slow to draw and to rotate.
 point_colors = [colors[label] for label in labels]
 ax.scatter(xs=features[:, 0], ys=features[:, 1], zs=features[:, 2],
            c=point_colors, zdir='z')

 # Plot centroids, though you can't really see them.
 ax.scatter(xs=centroids[:, 0], ys=centroids[:, 1], zs=centroids[:, 2],
            marker="x", s=150, c="c")

 # Encode the ground-truth diagnosis as integers so it can be compared with the
 # cluster labels: "B" -> 0, "M" -> 1. Any other value becomes -1, which never
 # equals a cluster label, so such rows never count as a match.
 diagnosis_codes = {"B": 0, "M": 1}
 diag = np.array([diagnosis_codes.get(d, -1) for d in data['diagnosis']])

 # KMeans numbers its clusters arbitrarily, so cluster 0 is not guaranteed to be
 # the benign group. Count agreement under both possible label assignments and
 # keep the better one; comparing only one way can report a good clustering as
 # its complement (e.g. ~10% instead of ~90%) depending on the run.
 direct = sum(1 for truth, label in zip(diag, labels) if truth == label)
 swapped = sum(1 for truth, label in zip(diag, labels) if truth == 1 - label)
 matches = max(direct, swapped)

 #Calculate percentage matches and print.
 percentMatch = (matches / len(diag)) * 100
 print("Percent matched between benign and malignant ", percentMatch)

 # Label the three axes after the features plotted on them, then open the
 # window with the 3D scatter plot of the data and clusters.
 for set_label, caption in ((ax.set_xlabel, "Radius Mean"),
                            (ax.set_ylabel, "Concavity Mean"),
                            (ax.set_zlabel, "Symmetry Mean")):
     set_label(caption)
 plt.show()

 #Finis
	import pandas as pd #Using Pandas for DataFrame
	import numpy as np
	import matplotlib.pyplot as plt
	from mpl_toolkits.mplot3d import Axes3D #Create 3D plot
	from sklearn.cluster import KMeans #Import learning algorithm

	# Simple KMeans cluster analysis on breast cancer data using Python, SKLearn, Numpy, and Pandas
	# Created for ICS 491 (Big Data) at University of Hawaii at Manoa, Fall 2017
	# Questions? Tweet me at https://twitter.com/spyhi

	# Load the breast cancer data with Pandas. The relative path is resolved against
	# the current working directory, so run the script from the folder holding the CSV.
	# Total about 570 samples.
	# I got my data from Kaggle at the following URL:
	# https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
	data = pd.read_csv('breast_cancer_data.csv')

	# Was originally going to use all these feature columns
	# but decided to test on a small amount to visualize and ended up being happy with results.
	# Kept for reference only: the feature array built below uses feat_cols_sm.
	# (The third entry used to be a second "texture_mean"; "perimeter_mean" is the
	# column that sits between texture and area in the Kaggle CSV.)
	feat_cols = [
	    "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
	    "smoothness_mean", "compactness_mean", "concavity_mean",
	    "symmetry_mean", "fractal_dimension_mean",
	]

	# Actual features used, based on previous knowledge of breast cancer diagnosis factors
	feat_cols_sm = ["radius_mean", "concavity_mean", "symmetry_mean"]

	# Select the chosen columns and convert them to a Numpy array of feature vectors
	# (one row per sample, one column per entry in feat_cols_sm).
	features = np.array(data[feat_cols_sm])

	# Build a two-cluster KMeans model (hoping the clusters line up with malignant
	# vs benign) and fit it to the selected features in one step; fit() hands back
	# the fitted estimator.
	clusters = KMeans(max_iter=300, n_clusters=2).fit(features)

	# Keep the fitted results in variables: one label per sample, one centroid per cluster.
	labels = clusters.labels_
	centroids = clusters.cluster_centers_

	# Sanity check
	print(centroids)

	# Create new MatPlotLib figure
	fig = plt.figure()
	# Add a single subplot that uses the 3D projection
	ax = fig.add_subplot(111, projection='3d')
	# This means "red" and "blue"; the cluster label (0 or 1) indexes into this list.
	colors = ["r", "b"]

	# Plot every sample in ONE scatter call, colored by its cluster label.
	# Calling scatter once per point creates a separate artist for each sample,
	# which makes the figure slow to draw and to rotate.
	point_colors = [colors[label] for label in labels]
	ax.scatter(xs=features[:, 0], ys=features[:, 1], zs=features[:, 2],
	           c=point_colors, zdir='z')

	# Plot centroids, though you can't really see them.
	ax.scatter(xs=centroids[:, 0], ys=centroids[:, 1], zs=centroids[:, 2],
	           marker="x", s=150, c="c")

	# Encode the ground-truth diagnosis as integers so it can be compared with the
	# cluster labels: "B" -> 0, "M" -> 1. Any other value becomes -1, which never
	# equals a cluster label, so such rows never count as a match.
	diagnosis_codes = {"B": 0, "M": 1}
	diag = np.array([diagnosis_codes.get(d, -1) for d in data['diagnosis']])

	# KMeans numbers its clusters arbitrarily, so cluster 0 is not guaranteed to be
	# the benign group. Count agreement under both possible label assignments and
	# keep the better one; comparing only one way can report a good clustering as
	# its complement (e.g. ~10% instead of ~90%) depending on the run.
	direct = sum(1 for truth, label in zip(diag, labels) if truth == label)
	swapped = sum(1 for truth, label in zip(diag, labels) if truth == 1 - label)
	matches = max(direct, swapped)

	#Calculate percentage matches and print.
	percentMatch = (matches / len(diag)) * 100
	print("Percent matched between benign and malignant ", percentMatch)

	# Label the three axes after the features plotted on them, then open the
	# window with the 3D scatter plot of the data and clusters.
	for set_label, caption in ((ax.set_xlabel, "Radius Mean"),
	                           (ax.set_ylabel, "Concavity Mean"),
	                           (ax.set_zlabel, "Symmetry Mean")):
	    set_label(caption)
	plt.show()

	#Finis