askmeegs · October 27, 2015 19:49
diff --git a/KMeans_Clustering.java b/KMeans_Clustering.java
 //KMeans_Clustering.java
 //Megan O'Keefe
 //Completed: 10/25/15
 //CS313, Wellesley College (Assignment #5)
 //USES EXISTING CLUSTERING CLASSES, NOT INCLUDED

 /* 
 Overview of KMeans algorithm:
 	- randomly assign each Gene to a cluster (Math.random... for range = # desired clusters)
 	- Then until you have no shuffling changes in the iteration...
 		- Calculate the mean of each of the "k" clusters 
 		- Assign each gene to the cluster with the closest mean 
 */

 import java.util.*;  // Needed for Scanner class and for Vector class
 import java.io.*;  // Needed for File class


 public class KMeans_Clustering extends Clustering {
 	
 	int k;
 	
 	
 	/*A set of genes and experiments are determined from the specified String representing the name of a file. Genes and experiments are read-in from the tab-delimited file. Initially, the constructed KMeans_Clustering consists of the specified number of clusters, but each cluster has not yet been assigned any genes.*/
 	public KMeans_Clustering(String fileName, int numClusters) {
 		super(fileName);  //instantiate a Clustering object using the read-in file
 		k = numClusters;
 		kMeans(); //perform the kMeans clustering.
 	}
 	
 	
 	public void kMeans() {
 		initializeAllClusters(); 
 		randomlyAssignGenesToClusters(); //to start, randomly assign to k clusters.
 		Vector<Vector<Double>> means = getMeansOfAllClusters();
 		boolean done = assignGenesToClusters(means); //check for small likelihood no clustering is needed.
 		while(!done) { //until we reach convergence... keep re-clustering.
 			means = getMeansOfAllClusters();
 			done = assignGenesToClusters(means); 
 		}
 		System.out.println("...clustering complete.); 
 	}
 	
 	
 	//instantiates the clusters vector 
 	public void initializeAllClusters() {
 		for(int i=0; i<k; i++) {
 			this.clusters.add(new Cluster()); 
 		}
 	}
 	
 	public void randomlyAssignGenesToClusters() {
 		int range = k; //random generator's range 
 		for(int i=0; i<genes.size(); i++) {
 			Gene g = genes.get(i); 
 			int where = (int) (Math.random()*range); 
 			clusters.get(where).addGene(g); //add Gene to a random cluster. 
 		}
 	}
 	
 	//Return a collection of the mean (average) expression vectors for all of the clusters.
 	public Vector<Vector<Double>> getMeansOfAllClusters() {
 		Vector<Vector<Double>> output = new Vector<Vector<Double>>();
 		for(int i=0; i<clusters.size(); i++) { //for each cluster, calculate means, add to output.
 			Cluster curCluster = clusters.get(i); 
 			Vector<Double> temp = curCluster.getClusterMean();
 			output.add(temp); 
 		}
 		return output; 
 	}
 	
 	
 /*Assigns each gene to the cluster whose mean expression vector is closest to the gene.
 The parameter means corresponds to a collection of Vectors (analagous to a 2D array). Each entry of means corresponds to the set of average expression values for a particular cluster. The method returns true if the cluster assignments are not improving, i.e., we have achieved a locally optimal clustering. The method returns false if the cluster assignments are an improvement over previous cluster assignments, i.e., better clusters than previously have been found. The measure used for comparing one clustering to another clustering is the sum of the distance of each gene from its cluster's mean.*/
 	public boolean assignGenesToClusters(Vector<Vector<Double>> means) {
 		//calculate the old sum for later use
 		double oldSum = 0.0;
 		for(int a=0; a<genes.size(); a++) {
 			Gene mygene = genes.get(a); 
 			//where's this gene?
 			int at = 0;
 			for(int t = 0; t<clusters.size(); t++) {
 				if(clusters.get(t).isGeneInCluster(mygene)) at = t;
 			}
 			Vector<Double> oldMeans = clusters.get(at).getClusterMean(); 
 			double mydist = mygene.distanceToExpressionVector(oldMeans);
 			oldSum = oldSum + mydist;
 		}

 	//Assign each gene to the right cluster. 
 		for(int i=0; i<genes.size(); i++) {
 			Gene curGene = genes.get(i); 
 			//where is this gene?
 			int oldIndex = 0;
 			for(int q = 0; q<clusters.size(); q++) {
 				if(clusters.get(q).isGeneInCluster(curGene)) oldIndex = q;
 			}	
 			//is there a closer star?
 			int newIndex = 0;
 			double mindist = 10000000; 
 			for(int j=0; j<clusters.size(); j++) { 
 				if(clusters.get(j).getSizeOfCluster() != 0) { //as long as cluster isn't empty...
 					Vector<Double> curMeans = clusters.get(j).getClusterMean(); 
 					double dist = curGene.distanceToExpressionVector(curMeans); 
 					//if we found a new minimum distance...
 					if(dist<mindist) {
 						newIndex = j;
 						mindist = dist; 
 					}	
 				}
 			}
 		 //add this gene to new cluster, and remove it from the old one.
 		 if(newIndex != oldIndex) {
 		  	clusters.get(newIndex).addGene(curGene);
 		  	clusters.get(oldIndex).removeGene(curGene); 
 		  	}
 		  	
 		}
 		populateEmptyClusters(); //reshuffle if this resulted in any clusters being empty.
 		
 		
 		//now we compare the old arrangement and the new arrangment....
 		double newSum = 0.0;
 		for(int v=0; v<genes.size(); v++) {
 			Gene mygene = genes.get(v); 
 			//where's this gene?
 			int at = 0;
 			for(int x = 0; x<clusters.size(); x++) {
 				if(clusters.get(x).isGeneInCluster(mygene)) at = x;
 			}
 			Vector<Double> newMeans = clusters.get(at).getClusterMean(); 
 			double mydist = mygene.distanceToExpressionVector(newMeans);
 			newSum = newSum + mydist;
 		}

 		if(oldSum<=newSum) return true;
 			else return false; 
 		
 	}
 	
 /*If any clusters are empty (contain zero genes), then genes are moved from clusters containing multiple genes.
 Assumes all genes have been assigned to a cluster. For any empty clusters, a cluster with mulitple genes is found randomly and one gene is removed from the multiple gene cluster and added to the empty cluster. When the method completes, no cluster contains zero genes.*/
 	public void populateEmptyClusters() {
 		Vector<Integer> empties = new Vector<Integer>(); 
 		for(int i=0; i<clusters.size(); i++) {
 			Cluster curCluster = clusters.get(i);
 			if(curCluster.getSizeOfCluster()==0) empties.add(i);
 		}
 		for(int j=0; j<empties.size();j++) {
 			//determine our options - could change as we go, so must keep updated.
 			Vector<Integer> multiples = new Vector<Integer>();
 			for(int m=0; m<clusters.size(); m++) {
 				int size = clusters.get(m).getSizeOfCluster();
 				if(size>1)
 					multiples.add(m);
 			}
 			//get a random cluster from eligible candidates 
 			int rand = (int) (Math.random() * multiples.size());
 			//which cluster did we get?
 			int takeIndex = multiples.get(rand);
 			Cluster from = clusters.get(takeIndex);
 			//which gene should we grab from this random cluster? 
 			int atIndex = (int) (Math.random() * from.getSizeOfCluster());
 			Gene g = from.getGene(atIndex); 
 			//prepare to add a new cluster 
 			Cluster temp = new Cluster();
 			temp.addGene(g);
 			clusters.get(takeIndex).removeGene(g); //remove gene from old cluster
 			clusters.remove(empties.get(j)); //remove the empty cluster
 			clusters.add(temp); //add the new, non-empty cluster.
 		}
 		
 	}

 	public static void main(String[] args) {
 		System.out.println("\n*********** KMEANS CLUSTERING ****************");
 		KMeans_Clustering K = new KMeans_Clustering(args[0], Integer.parseInt(args[1]));
 		System.out.println(K);
 		
 	}
 }
	//KMeans_Clustering.java
	//Megan O'Keefe
	//Completed: 10/25/15
	//CS313, Wellesley College (Assignment #5)
	//USES EXISTING CLUSTERING CLASSES, NOT INCLUDED

	/*
	Overview of KMeans algorithm:
	- randomly assign each Gene to a cluster (Math.random... for range = # desired clusters)
	- Then until you have no shuffling changes in the iteration...
	- Calculate the mean of each of the "k" clusters
	- Assign each gene to the cluster with the closest mean
	*/

	import java.util.*; // Needed for Scanner class and for Vector class
	import java.io.*; // Needed for File class


	public class KMeans_Clustering extends Clustering {

	int k;


	/A set of genes and experiments are determined from the specified String representing the name of a file. Genes and experiments are read-in from the tab-delimited file. Initially, the constructed KMeans_Clustering consists of the specified number of clusters, but each cluster has not yet been assigned any genes./
	public KMeans_Clustering(String fileName, int numClusters) {
	super(fileName); //instantiate a Clustering object using the read-in file
	k = numClusters;
	kMeans(); //perform the kMeans clustering.
	}


	public void kMeans() {
	initializeAllClusters();
	randomlyAssignGenesToClusters(); //to start, randomly assign to k clusters.
	Vector<Vector<Double>> means = getMeansOfAllClusters();
	boolean done = assignGenesToClusters(means); //check for small likelihood no clustering is needed.
	while(!done) { //until we reach convergence... keep re-clustering.
	means = getMeansOfAllClusters();
	done = assignGenesToClusters(means);
	}
	System.out.println("...clustering complete.);
	}


	//instantiates the clusters vector
	public void initializeAllClusters() {
	for(int i=0; i<k; i++) {
	this.clusters.add(new Cluster());
	}
	}

	public void randomlyAssignGenesToClusters() {
	int range = k; //random generator's range
	for(int i=0; i<genes.size(); i++) {
	Gene g = genes.get(i);
	int where = (int) (Math.random()*range);
	clusters.get(where).addGene(g); //add Gene to a random cluster.
	}
	}

	//Return a collection of the mean (average) expression vectors for all of the clusters.
	public Vector<Vector<Double>> getMeansOfAllClusters() {
	Vector<Vector<Double>> output = new Vector<Vector<Double>>();
	for(int i=0; i<clusters.size(); i++) { //for each cluster, calculate means, add to output.
	Cluster curCluster = clusters.get(i);
	Vector<Double> temp = curCluster.getClusterMean();
	output.add(temp);
	}
	return output;
	}


	/*Assigns each gene to the cluster whose mean expression vector is closest to the gene.
	The parameter means corresponds to a collection of Vectors (analagous to a 2D array). Each entry of means corresponds to the set of average expression values for a particular cluster. The method returns true if the cluster assignments are not improving, i.e., we have achieved a locally optimal clustering. The method returns false if the cluster assignments are an improvement over previous cluster assignments, i.e., better clusters than previously have been found. The measure used for comparing one clustering to another clustering is the sum of the distance of each gene from its cluster's mean.*/
	public boolean assignGenesToClusters(Vector<Vector<Double>> means) {
	//calculate the old sum for later use
	double oldSum = 0.0;
	for(int a=0; a<genes.size(); a++) {
	Gene mygene = genes.get(a);
	//where's this gene?
	int at = 0;
	for(int t = 0; t<clusters.size(); t++) {
	if(clusters.get(t).isGeneInCluster(mygene)) at = t;
	}
	Vector<Double> oldMeans = clusters.get(at).getClusterMean();
	double mydist = mygene.distanceToExpressionVector(oldMeans);
	oldSum = oldSum + mydist;
	}

	//Assign each gene to the right cluster.
	for(int i=0; i<genes.size(); i++) {
	Gene curGene = genes.get(i);
	//where is this gene?
	int oldIndex = 0;
	for(int q = 0; q<clusters.size(); q++) {
	if(clusters.get(q).isGeneInCluster(curGene)) oldIndex = q;
	}
	//is there a closer star?
	int newIndex = 0;
	double mindist = 10000000;
	for(int j=0; j<clusters.size(); j++) {
	if(clusters.get(j).getSizeOfCluster() != 0) { //as long as cluster isn't empty...
	Vector<Double> curMeans = clusters.get(j).getClusterMean();
	double dist = curGene.distanceToExpressionVector(curMeans);
	//if we found a new minimum distance...
	if(dist<mindist) {
	newIndex = j;
	mindist = dist;
	}
	}
	}
	//add this gene to new cluster, and remove it from the old one.
	if(newIndex != oldIndex) {
	clusters.get(newIndex).addGene(curGene);
	clusters.get(oldIndex).removeGene(curGene);
	}

	}
	populateEmptyClusters(); //reshuffle if this resulted in any clusters being empty.


	//now we compare the old arrangement and the new arrangment....
	double newSum = 0.0;
	for(int v=0; v<genes.size(); v++) {
	Gene mygene = genes.get(v);
	//where's this gene?
	int at = 0;
	for(int x = 0; x<clusters.size(); x++) {
	if(clusters.get(x).isGeneInCluster(mygene)) at = x;
	}
	Vector<Double> newMeans = clusters.get(at).getClusterMean();
	double mydist = mygene.distanceToExpressionVector(newMeans);
	newSum = newSum + mydist;
	}

	if(oldSum<=newSum) return true;
	else return false;

	}

	/*If any clusters are empty (contain zero genes), then genes are moved from clusters containing multiple genes.
	Assumes all genes have been assigned to a cluster. For any empty clusters, a cluster with mulitple genes is found randomly and one gene is removed from the multiple gene cluster and added to the empty cluster. When the method completes, no cluster contains zero genes.*/
	public void populateEmptyClusters() {
	Vector<Integer> empties = new Vector<Integer>();
	for(int i=0; i<clusters.size(); i++) {
	Cluster curCluster = clusters.get(i);
	if(curCluster.getSizeOfCluster()==0) empties.add(i);
	}
	for(int j=0; j<empties.size();j++) {
	//determine our options - could change as we go, so must keep updated.
	Vector<Integer> multiples = new Vector<Integer>();
	for(int m=0; m<clusters.size(); m++) {
	int size = clusters.get(m).getSizeOfCluster();
	if(size>1)
	multiples.add(m);
	}
	//get a random cluster from eligible candidates
	int rand = (int) (Math.random() * multiples.size());
	//which cluster did we get?
	int takeIndex = multiples.get(rand);
	Cluster from = clusters.get(takeIndex);
	//which gene should we grab from this random cluster?
	int atIndex = (int) (Math.random() * from.getSizeOfCluster());
	Gene g = from.getGene(atIndex);
	//prepare to add a new cluster
	Cluster temp = new Cluster();
	temp.addGene(g);
	clusters.get(takeIndex).removeGene(g); //remove gene from old cluster
	clusters.remove(empties.get(j)); //remove the empty cluster
	clusters.add(temp); //add the new, non-empty cluster.
	}

	}

	public static void main(String[] args) {
	System.out.println("\n********* KMEANS CLUSTERING **************");
	KMeans_Clustering K = new KMeans_Clustering(args[0], Integer.parseInt(args[1]));
	System.out.println(K);

	}
	}