Created
October 24, 2017 05:49
-
-
Save spyhi/ec8e60419d90aefc8537eb557ef35826 to your computer and use it in GitHub Desktop.
Python SKLearn KMeans Cluster Analysis on UW Breast Cancer Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd #Using Pandas for DataFrame | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from mpl_toolkits.mplot3d import Axes3D #Create 3D plot | |
from sklearn.cluster import KMeans #Import learning algorithm | |
# Simple KMeans cluster analysis on breast cancer data using Python, SKLearn, Numpy, and Pandas | |
# Created for ICS 491 (Big Data) at University of Hawaii at Manoa, Fall 2017 | |
# Questions? Tweet me at https://twitter.com/spyhi | |
# Import breast cancer data using Pandas. Should load if it's in the same folder as Python script.
# Total about 570 samples.
# I got my data from Kaggle at the following URL:
# https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
data = pd.read_csv('breast_cancer_data.csv')

# Was originally going to use all these feature columns
# but decided to test on a small amount to visualize and ended up being happy with results.
# NOTE: this list is not used further down; it is kept for reference only.
# The second entry used to be a duplicated "texture_mean"; following the
# dataset's column order it is "perimeter_mean".
feat_cols = [
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
    "smoothness_mean", "compactness_mean", "concavity_mean",
    "symmetry_mean", "fractal_dimension_mean",
]

# Actual features used, based on previous knowledge of breast cancer diagnosis factors
feat_cols_sm = ["radius_mean", "concavity_mean", "symmetry_mean"]

# Select the chosen columns from the DataFrame and convert them to a Numpy
# array with one row per sample and one column per selected feature.
features = np.array(data[feat_cols_sm])
# Build the KMeans model and fit it to the selected features in one step
# (fit() returns the estimator itself). Two clusters are requested in the
# hope that they line up with malignant vs benign.
clusters = KMeans(max_iter=300, n_clusters=2).fit(features)

# Keep the fitted cluster centres and the per-sample cluster assignment.
centroids, labels = clusters.cluster_centers_, clusters.labels_

# Sanity check: show where the two centres ended up.
print(centroids)
# Create new MatPlotLib figure
fig = plt.figure()
# Add 3rd dimension to figure
ax = fig.add_subplot(111, projection='3d')

# This means "red" and "blue"; the list is indexed by cluster label (0 or 1).
colors = ["r", "b"]

# Plot all the samples in a single scatter call, colouring each point by its
# cluster label. One call for the whole array replaces the former
# one-call-per-sample loop, which created a separate artist for every point.
ax.scatter(features[:, 0], features[:, 1], features[:, 2],
           c=[colors[label] for label in labels], zdir='z')

# Plot centroids, though you can't really see them.
ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2],
           marker="x", s=150, c="c")
# Encode the diagnosis column as integers so it can be compared with the
# cluster labels: "M" (malignant) -> 1, anything else (i.e. "B", benign) -> 0.
# The result has one entry per sample, the same length as labels.
diag = (np.array(data['diagnosis']) == "M").astype(int)

# Count the samples whose cluster label equals the encoded diagnosis.
matches = int(np.sum(diag == labels))

# KMeans numbers its clusters arbitrarily, so "cluster 0" may just as well be
# the malignant group. With two clusters the opposite mapping matches exactly
# the samples the direct one misses, so score whichever mapping agrees better;
# otherwise the reported figure can flip to its complement between runs.
matches = max(matches, len(diag) - matches)

# Calculate percentage matches and print.
percentMatch = (matches / len(diag)) * 100
print("Percent matched between benign and malignant ", percentMatch)
# Name the three axes after the features plotted on them, then open the
# window with the 3D scatter plot of the samples and clusters.
ax.set(
    xlabel="Radius Mean",
    ylabel="Concavity Mean",
    zlabel="Symmetry Mean",
)
plt.show()
# Finis
👏
Sir, your program's accuracy looks good, but the plot is not being displayed on the console. Any suggestions?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The output should look roughly like this:

Last clocked at about 85.6% accuracy when compared to benign vs malignant diagnosis.