Skip to content

Instantly share code, notes, and snippets.

@MishaelRosenthal
Created August 16, 2016 17:13
Show Gist options
  • Select an option

  • Save MishaelRosenthal/5a75ffe65836db92f50f9c6ae82f779e to your computer and use it in GitHub Desktop.

Select an option

Save MishaelRosenthal/5a75ffe65836db92f50f9c6ae82f779e to your computer and use it in GitHub Desktop.
included:
promoterDiscovery.py - trains a classifier, and predicts if a DNA sequence is a promoter or enhancer
testPromoterDiscovery - includes unit tests for the promoterDiscovery class
this README
usage:
promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n"
-h show this help message
-c run 10 fold cross validation
-t FILE train model and save into FILE
-p SEQUENCE train model and predict label for SEQUENCE
-p SEQUENCE -f FILE load model from FILE and predict label for SEQUENCE
Notes about the code:
- Following the sample code I recieved, the fasta files and training folder have to be placed in the same folder as the
python code.
Due to time constraints, and since this is my first time coding in python I opted for a simple solution.
I used python's built-in option to extract BOW features from the DNA sequence, and a random forest classifier with
the default parameters.
I chose random forest since in many cases it has the best performance (see Fernandez-Delgado et. al. 2014). I also
compared it to naive bayes and logistic regression, and it showed better results.
There are two main directions in which to improve classification performance:
1) More sophisticated features. I believe performance can be improved by adding patterns with some flexibility.
For example, many promoters and enhancers contain transcription factor binding sites, which have a consensus sequence
but allow variability. Therefore, a classifier could benefit from features that aim to look for these sequences,
requiring a more flexible representation than the BOW which requires to find the exact sequence.
2) Classifier parameter tuning. Random forest has many parameters to tune. By splitting the training data into 2 parts,
it is possible to tune the parameters, and improve performance
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn import linear_model
#from sklearn.naive_bayes import GaussianNB
#from sklearn.neighbors import KNeighborsClassifier
from pysam import FastaFile
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_predict
from os import listdir
from os.path import join
from sklearn.externals import joblib
import sys, getopt
def iter_peaks_and_labels():
my_path = "train_data"
for f in listdir("train_data"):
if f.endswith(".bed"):
with open(join(my_path, f)) as fp:
for line in fp:
data = line.split()
yield (data[0], int(data[1]), int(data[2])), data[3]
return
def extract_features(corpus):
vectorizer = CountVectorizer(min_df=1, analyzer='char',ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)
features = X.toarray()
return features, vectorizer
def train(classifier):
corpus, labels = extract_dna()
features,vectorizer = extract_features(corpus)
classifier.fit(features,labels)
return classifier, vectorizer
def load(filename):
vectorizer = joblib.load(filename + "_vectorizer")
classifier = joblib.load(filename + "_model")
return classifier, vectorizer
def predict(dna_seq, classifier, filename=None):
if filename is None:
classifier, vectorizer = train(classifier)
else:
classifier, vectorizer = load(filename)
features = vectorizer.transform(dna_seq)
return classifier.predict(features.toarray())
def extract_dna():
min_region_size = 1000
genome = FastaFile("GRCh38.genome.fa")
corpus = []
labels = []
iter = iter_peaks_and_labels()
for region, label in iter:
# create a new region exactly min_region_size basepairs long centered on
# region
expanded_start = region[1] + (region[2] - region[1])/2 - min_region_size/2
expanded_stop = expanded_start + min_region_size
region = (region[0], expanded_start, expanded_stop)
dna = genome.fetch(*region)
corpus.append(dna)
labels.append(label)
return corpus, labels
def estimate(classifier):
corpus, labels = extract_dna()
features,vectorizer = extract_features(corpus)
print "done extracting"
predicted = cross_val_predict(classifier, features, labels, cv=10)
print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(labels, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(labels, predicted))
return
def print_usage():
print "usage: promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n" \
"-h\t\t\t\t\tshow this help message\n" \
"-c\t\t\t\t\trun 10 fold cross validation\n" \
"-t FILE\t\t\t\ttrain model and save into FILE\n" \
"-p SEQUENCE\t\t\ttrain model and predict label for SEQUENCE\n" \
"-p SEQUENCE -f FILE\tload model from FILE and predict label for SEQUENCE\n"
def main(argv):
classifier = RandomForestClassifier()
#classifier = linear_model.LogisticRegression(C=1e5)
filename = None
pred = False
try:
opts, args = getopt.getopt(argv,"hcp:f:t:")
except getopt.GetoptError:
print_usage()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print_usage()
sys.exit()
elif opt == '-c':
estimate(classifier)
sys.exit()
elif opt == '-f':
filename = arg
elif opt == '-t':
filename = arg
classifier, vectorizer = train(classifier)
joblib.dump(vectorizer, filename + "_vectorizer")
joblib.dump(classifier, filename + "_model")
elif opt == '-p':
pred=True
sequence=arg
prediction = None
if pred:
prediction = predict([sequence],classifier,filename)
print prediction
return prediction
if __name__ == '__main__':
main(sys.argv[1:])
import unittest
import os.path
from os import listdir
from promoterDiscovery import extract_features,main
class TestCode(unittest.TestCase):
def test_feature_extraction(self):
corpus = ['ACG','GC']
features, vectorizer = extract_features(corpus)
feature_names = vectorizer.get_feature_names()
for i in range(len(feature_names)):
if feature_names[i]=="a":
self.assertEqual(features[0,i], 1)
self.assertEqual(features[1,i], 0)
if feature_names[i]=="c":
self.assertEqual(features[0,i], 1)
self.assertEqual(features[1,i], 1)
if feature_names[i]=="ac":
self.assertEqual(features[0,i], 1)
self.assertEqual(features[1,i], 0)
if feature_names[i]=="gc":
self.assertEqual(features[0,i], 0)
self.assertEqual(features[1,i], 1)
def test_train(self):
main(["-ttest"])
self.assertTrue(os.path.exists("./test_model"))
self.assertTrue(os.path.exists("./test_vectorizer"))
def test_predict(self):
prediction = main(["-pCATCATTCTGAATTATGACCATGCTTTCATCTTCACAACTCAAGATCTATTTTCA"])
self.assertIn(prediction[0], ["enhancer","promoter"])
if __name__ == '__main__':
unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment