Created
October 27, 2015 19:49
-
-
Save askmeegs/7e94dfc1ed97cc359d84 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//KMeans_Clustering.java | |
//Megan O'Keefe | |
//Completed: 10/25/15 | |
//CS313, Wellesley College (Assignment #5) | |
//USES EXISTING CLUSTERING CLASSES, NOT INCLUDED | |
/* | |
Overview of KMeans algorithm: | |
- randomly assign each Gene to a cluster (Math.random... for range = # desired clusters) | |
- Then until you have no shuffling changes in the iteration... | |
- Calculate the mean of each of the "k" clusters | |
- Assign each gene to the cluster with the closest mean | |
*/ | |
import java.util.*; // Needed for Scanner class and for Vector class | |
import java.io.*; // Needed for File class | |
public class KMeans_Clustering extends Clustering { | |
int k; | |
/*A set of genes and experiments are determined from the specified String representing the name of a file. Genes and experiments are read-in from the tab-delimited file. Initially, the constructed KMeans_Clustering consists of the specified number of clusters, but each cluster has not yet been assigned any genes.*/ | |
public KMeans_Clustering(String fileName, int numClusters) { | |
super(fileName); //instantiate a Clustering object using the read-in file | |
k = numClusters; | |
kMeans(); //perform the kMeans clustering. | |
} | |
public void kMeans() { | |
initializeAllClusters(); | |
randomlyAssignGenesToClusters(); //to start, randomly assign to k clusters. | |
Vector<Vector<Double>> means = getMeansOfAllClusters(); | |
boolean done = assignGenesToClusters(means); //check for small likelihood no clustering is needed. | |
while(!done) { //until we reach convergence... keep re-clustering. | |
means = getMeansOfAllClusters(); | |
done = assignGenesToClusters(means); | |
} | |
System.out.println("...clustering complete.); | |
} | |
//instantiates the clusters vector | |
public void initializeAllClusters() { | |
for(int i=0; i<k; i++) { | |
this.clusters.add(new Cluster()); | |
} | |
} | |
public void randomlyAssignGenesToClusters() { | |
int range = k; //random generator's range | |
for(int i=0; i<genes.size(); i++) { | |
Gene g = genes.get(i); | |
int where = (int) (Math.random()*range); | |
clusters.get(where).addGene(g); //add Gene to a random cluster. | |
} | |
} | |
//Return a collection of the mean (average) expression vectors for all of the clusters. | |
public Vector<Vector<Double>> getMeansOfAllClusters() { | |
Vector<Vector<Double>> output = new Vector<Vector<Double>>(); | |
for(int i=0; i<clusters.size(); i++) { //for each cluster, calculate means, add to output. | |
Cluster curCluster = clusters.get(i); | |
Vector<Double> temp = curCluster.getClusterMean(); | |
output.add(temp); | |
} | |
return output; | |
} | |
/*Assigns each gene to the cluster whose mean expression vector is closest to the gene. | |
The parameter means corresponds to a collection of Vectors (analagous to a 2D array). Each entry of means corresponds to the set of average expression values for a particular cluster. The method returns true if the cluster assignments are not improving, i.e., we have achieved a locally optimal clustering. The method returns false if the cluster assignments are an improvement over previous cluster assignments, i.e., better clusters than previously have been found. The measure used for comparing one clustering to another clustering is the sum of the distance of each gene from its cluster's mean.*/ | |
public boolean assignGenesToClusters(Vector<Vector<Double>> means) { | |
//calculate the old sum for later use | |
double oldSum = 0.0; | |
for(int a=0; a<genes.size(); a++) { | |
Gene mygene = genes.get(a); | |
//where's this gene? | |
int at = 0; | |
for(int t = 0; t<clusters.size(); t++) { | |
if(clusters.get(t).isGeneInCluster(mygene)) at = t; | |
} | |
Vector<Double> oldMeans = clusters.get(at).getClusterMean(); | |
double mydist = mygene.distanceToExpressionVector(oldMeans); | |
oldSum = oldSum + mydist; | |
} | |
//Assign each gene to the right cluster. | |
for(int i=0; i<genes.size(); i++) { | |
Gene curGene = genes.get(i); | |
//where is this gene? | |
int oldIndex = 0; | |
for(int q = 0; q<clusters.size(); q++) { | |
if(clusters.get(q).isGeneInCluster(curGene)) oldIndex = q; | |
} | |
//is there a closer star? | |
int newIndex = 0; | |
double mindist = 10000000; | |
for(int j=0; j<clusters.size(); j++) { | |
if(clusters.get(j).getSizeOfCluster() != 0) { //as long as cluster isn't empty... | |
Vector<Double> curMeans = clusters.get(j).getClusterMean(); | |
double dist = curGene.distanceToExpressionVector(curMeans); | |
//if we found a new minimum distance... | |
if(dist<mindist) { | |
newIndex = j; | |
mindist = dist; | |
} | |
} | |
} | |
//add this gene to new cluster, and remove it from the old one. | |
if(newIndex != oldIndex) { | |
clusters.get(newIndex).addGene(curGene); | |
clusters.get(oldIndex).removeGene(curGene); | |
} | |
} | |
populateEmptyClusters(); //reshuffle if this resulted in any clusters being empty. | |
//now we compare the old arrangement and the new arrangment.... | |
double newSum = 0.0; | |
for(int v=0; v<genes.size(); v++) { | |
Gene mygene = genes.get(v); | |
//where's this gene? | |
int at = 0; | |
for(int x = 0; x<clusters.size(); x++) { | |
if(clusters.get(x).isGeneInCluster(mygene)) at = x; | |
} | |
Vector<Double> newMeans = clusters.get(at).getClusterMean(); | |
double mydist = mygene.distanceToExpressionVector(newMeans); | |
newSum = newSum + mydist; | |
} | |
if(oldSum<=newSum) return true; | |
else return false; | |
} | |
/*If any clusters are empty (contain zero genes), then genes are moved from clusters containing multiple genes. | |
Assumes all genes have been assigned to a cluster. For any empty clusters, a cluster with mulitple genes is found randomly and one gene is removed from the multiple gene cluster and added to the empty cluster. When the method completes, no cluster contains zero genes.*/ | |
public void populateEmptyClusters() { | |
Vector<Integer> empties = new Vector<Integer>(); | |
for(int i=0; i<clusters.size(); i++) { | |
Cluster curCluster = clusters.get(i); | |
if(curCluster.getSizeOfCluster()==0) empties.add(i); | |
} | |
for(int j=0; j<empties.size();j++) { | |
//determine our options - could change as we go, so must keep updated. | |
Vector<Integer> multiples = new Vector<Integer>(); | |
for(int m=0; m<clusters.size(); m++) { | |
int size = clusters.get(m).getSizeOfCluster(); | |
if(size>1) | |
multiples.add(m); | |
} | |
//get a random cluster from eligible candidates | |
int rand = (int) (Math.random() * multiples.size()); | |
//which cluster did we get? | |
int takeIndex = multiples.get(rand); | |
Cluster from = clusters.get(takeIndex); | |
//which gene should we grab from this random cluster? | |
int atIndex = (int) (Math.random() * from.getSizeOfCluster()); | |
Gene g = from.getGene(atIndex); | |
//prepare to add a new cluster | |
Cluster temp = new Cluster(); | |
temp.addGene(g); | |
clusters.get(takeIndex).removeGene(g); //remove gene from old cluster | |
clusters.remove(empties.get(j)); //remove the empty cluster | |
clusters.add(temp); //add the new, non-empty cluster. | |
} | |
} | |
public static void main(String[] args) { | |
System.out.println("\n*********** KMEANS CLUSTERING ****************"); | |
KMeans_Clustering K = new KMeans_Clustering(args[0], Integer.parseInt(args[1])); | |
System.out.println(K); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment