|
#!/usr/bin/python3.4 |
|
|
|
# Dependencies |
|
import csv |
|
import json |
|
import sys |
|
|
|
import matplotlib.pyplot as plot |
|
import numpy as np |
|
|
|
from datetime import datetime |
|
from math import factorial |
|
from os import makedirs |
|
from scipy import linalg |
|
|
|
# Read csv data |
|
data = [] |
|
with open('data_breast_cancer.csv', 'rU') as csvfile: |
|
spamreader = csv.reader(csvfile) |
|
for row in spamreader: |
|
data.append(row) |
|
|
|
# Organize labels |
|
genes = [d[0] for d in data[1:-1]] |
|
people = data[0][1:] |
|
cancers = list(set(data[-1][1:])) |
|
|
|
# Organize data |
|
peopleMap = {} |
|
for p in range(len(people)): |
|
person = people[p] |
|
peopleMap[person] = {} |
|
for cancer in cancers: |
|
peopleMap[person][cancer] = cancer == data[-1][p+1] |
|
for g in range(len(genes)): |
|
gene = genes[g] |
|
for p in range(len(people)): |
|
person = people[p] |
|
peopleMap[person][gene] = float(data[g+1][p+1]) |
|
data = None # free memory |
|
|
|
# Create run work directory |
|
directory = "out/" + str(datetime.now()) |
|
makedirs(directory, exist_ok=True) |
|
|
|
# A method to be called for each combination cancer-cancer-gene-gene |
|
def doMSE(cancerA, cancerB, geneA, geneB): |
|
# Create Sn data when n is the amount of people with cancerA and cancerB |
|
# and the each item has the structure ((geneA, geneB), cancer) |
|
Sn = [] |
|
for person, data in peopleMap.items(): |
|
if (data[cancerA] or data[cancerB]): |
|
Xi = [data[geneA], data[geneB]] |
|
Sn.append([Xi, cancerA if data[cancerA] else cancerB]) |
|
|
|
# Create the X matrix and y vector |
|
X = np.empty((len(Sn), len(Sn[0][0])+1)) |
|
y = np.empty((len(Sn))) |
|
for i in range(len(Sn)): |
|
X[i][0] = 1 |
|
y[i] = 1.0 if Sn[i][1] is cancerA else -1.0 |
|
xi = Sn[i][0] |
|
for j in range(len(xi)): |
|
X[i][j+1] = xi[j] |
|
|
|
# Get a vector a by MSE equation a = (X^t * X)^-1 * X^t*y |
|
Xt = X.T |
|
XtX = Xt.dot(X) |
|
XtXi = linalg.inv(XtX) |
|
Xty = Xt.dot(y) |
|
a = XtXi.dot(Xty) |
|
|
|
# check if there is a point misclassified. |
|
hasError = False |
|
for i in range(len(Sn)): |
|
yi = 1.0 if Sn[i][1] is cancerA else -1.0 |
|
xi = Sn[i][0] |
|
if ((yi * (a[0] + np.dot(a[1:], xi))) < 0): |
|
hasError = True |
|
break |
|
|
|
# If it is linearly separable draw the plot |
|
if (not hasError): |
|
# Create current combination directory |
|
job = cancerA + " x " + cancerB + " to " + geneA + " x " + geneB |
|
workingDirectory = directory + "/" + job |
|
makedirs(workingDirectory, exist_ok=True) |
|
|
|
# Plot the data |
|
minX = minY = maxX = maxY = 0.0 |
|
cancerAData = [[], []] |
|
cancerBData = [[], []] |
|
for S in Sn: |
|
Xi = S[0] |
|
yi = S[1] |
|
x = Xi[0] |
|
y = Xi[1] |
|
minX = min(minX, x) |
|
minY = min(minY, y) |
|
maxX = max(maxX, x) |
|
maxY = max(maxY, y) |
|
if yi is cancerA: |
|
cancerAData[0].append(x) |
|
cancerAData[1].append(y) |
|
else: |
|
cancerBData[0].append(x) |
|
cancerBData[1].append(y) |
|
minX = minX * 0.9 # Add space to min X be completely visible |
|
minY = minY * 0.9 # Add space to min Y be completely visible |
|
maxX = maxX * 1.1 # Add space to max X be completely visible |
|
maxY = maxY * 1.2 # Add space to max Y be completely visible and space to legend |
|
|
|
figure = plot.figure() |
|
plot.title(job) |
|
plot.xlabel(geneA) |
|
plot.ylabel(geneB) |
|
plot.axis([minX, maxX, minY, maxY]) |
|
cancerAPlot, = plot.plot(cancerAData[0], cancerAData[1], "rx", label=cancerA) |
|
cancerBPlot, = plot.plot(cancerBData[0], cancerBData[1], "gx", label=cancerB) |
|
plot.legend([cancerAPlot, cancerBPlot], [cancerA, cancerB], loc="upper right") |
|
getY = lambda x : -(a[1]/a[2])*x - a[0]/a[2] |
|
plot.plot([minX, maxX], [getY(minX), getY(maxX)], color="b", linestyle="-") |
|
plot.savefig(workingDirectory + "/plot.svg", format="svg") |
|
plot.close(figure) |
|
with open(workingDirectory + "/result.json", "w") as outfile: |
|
json.dump({"b":a[0],"w":list(a[1:]),"Sn":Sn}, outfile) |
|
|
|
# Pick two cancers and two genes |
|
cancersLen = len(cancers) |
|
genesLen = len(genes) |
|
totalCombinations = int((factorial(genesLen) / (factorial(2) * factorial(genesLen-2))) \ |
|
* (factorial(cancersLen) / (factorial(2) * factorial(cancersLen-2)))) |
|
runnedCombinations = 0 |
|
print("Starting the", totalCombinations, "combinations cancerXcancer geneXgene, it will take a time, go to drink a coffe.") |
|
for cA in range(cancersLen): |
|
cancerA = cancers[cA] |
|
for cB in range(cA + 1, cancersLen): |
|
cancerB = cancers[cB] |
|
for gA in range(genesLen): |
|
geneA = genes[gA] |
|
for gB in range(gA + 1, genesLen): |
|
geneB = genes[gB] |
|
doMSE(cancerA, cancerB, geneA, geneB) |
|
runnedCombinations = runnedCombinations + 1 |
|
# Informe user the percentage of job done |
|
progress = runnedCombinations / totalCombinations * 100 |
|
bar = '#'*int(progress) + ' '*(100-int(progress)) |
|
sys.stdout.write("\r[{0}] {1}".format(bar, "{0:.3f}%".format(progress))) |
|
sys.stdout.flush() |