MishaelRosenthal · August 16, 2016 17:13
diff --git a/README b/README
 included:
 promoterDiscovery.py  - trains a classifier, and predicts if a DNA sequence is a promoter or enhancer
 testPromoterDiscovery - includes unit tests for the promoterDiscovery class
 this README

 usage:
 promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n"
          -h					show this help message
          -c					run 10 fold cross validation
          -t FILE	   			train model and save into FILE
          -p SEQUENCE 			train model and predict label for SEQUENCE
          -p SEQUENCE -f FILE 	load model from FILE and predict label for SEQUENCE


 Notes about the code:

 - Following the sample code I recieved, the fasta files and training folder have to be placed in the same folder as the
  python code.


 Due to time constraints, and since this is my first time coding in python I opted for a simple solution.
 I used python's built-in option to extract BOW features from the DNA sequence, and a random forest classifier with
 the default parameters.
 I chose random forest since in many cases it has the best performance (see Fernandez-Delgado et. al. 2014). I also
 compared it to naive bayes and logistic regression, and it showed better results.
 There are two main directions in which to improve classification performance:
 1) More sophisticated features. I believe performance can be improved by adding patterns with some flexibility.
   For example, many promoters and enhancers contain transcription factor binding sites, which have a consensus sequence
   but allow variability. Therefore, a classifier could benefit from features that aim to look for these sequences,
   requiring a more flexible representation than the BOW which requires to find the exact sequence.
 2) Classifier parameter tuning. Random forest has many parameters to tune. By splitting the training data into 2 parts,
   it is possible to tune the parameters, and improve performance
diff --git a/promoterDiscovery.py b/promoterDiscovery.py
 from sklearn.feature_extraction.text import CountVectorizer
 #from sklearn import linear_model
 #from sklearn.naive_bayes import GaussianNB
 #from sklearn.neighbors import KNeighborsClassifier
 from pysam import FastaFile
 from sklearn import metrics
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.cross_validation import cross_val_predict
 from os import listdir
 from os.path import join
 from sklearn.externals import joblib
 import sys, getopt


 def iter_peaks_and_labels():
    my_path = "train_data"
    for f in listdir("train_data"):
        if f.endswith(".bed"):
            with open(join(my_path, f)) as fp:
                for line in fp:
                    data = line.split()
                    yield (data[0], int(data[1]), int(data[2])), data[3]
    return


 def extract_features(corpus):
    vectorizer = CountVectorizer(min_df=1, analyzer='char',ngram_range=(1, 3))
    X = vectorizer.fit_transform(corpus)
    features = X.toarray()
    return features, vectorizer


 def train(classifier):
    corpus, labels = extract_dna()
    features,vectorizer = extract_features(corpus)
    classifier.fit(features,labels)
    return classifier, vectorizer


 def load(filename):
    vectorizer = joblib.load(filename + "_vectorizer")
    classifier = joblib.load(filename + "_model")
    return classifier, vectorizer


 def predict(dna_seq, classifier, filename=None):
    if filename is None:
        classifier, vectorizer = train(classifier)
    else:
        classifier, vectorizer = load(filename)
    features = vectorizer.transform(dna_seq)
    return classifier.predict(features.toarray())


 def extract_dna():
    min_region_size = 1000
    genome = FastaFile("GRCh38.genome.fa")
    corpus = []
    labels = []
    iter = iter_peaks_and_labels()
    for region, label in iter:
        # create a new region exactly min_region_size basepairs long centered on
        # region
        expanded_start = region[1] + (region[2] - region[1])/2 - min_region_size/2
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        dna = genome.fetch(*region)
        corpus.append(dna)
        labels.append(label)
    return corpus, labels


 def estimate(classifier):
    corpus, labels = extract_dna()
    features,vectorizer = extract_features(corpus)
    print "done extracting"
    predicted = cross_val_predict(classifier, features, labels, cv=10)
    print("Classification report for classifier %s:\n%s\n"
          % (classifier, metrics.classification_report(labels, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(labels, predicted))
    return


 def print_usage():
    print "usage: promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n" \
          "-h\t\t\t\t\tshow this help message\n" \
          "-c\t\t\t\t\trun 10 fold cross validation\n" \
          "-t FILE\t\t\t\ttrain model and save into FILE\n" \
          "-p SEQUENCE\t\t\ttrain model and predict label for SEQUENCE\n" \
          "-p SEQUENCE -f FILE\tload model from FILE and predict label for SEQUENCE\n"

 def main(argv):
    classifier = RandomForestClassifier()
    #classifier = linear_model.LogisticRegression(C=1e5)
    filename = None
    pred = False
    try:
        opts, args = getopt.getopt(argv,"hcp:f:t:")
    except getopt.GetoptError:
        print_usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print_usage()
            sys.exit()
        elif opt == '-c':
            estimate(classifier)
            sys.exit()
        elif opt == '-f':
            filename = arg
        elif opt == '-t':
            filename = arg
            classifier, vectorizer = train(classifier)
            joblib.dump(vectorizer, filename + "_vectorizer")
            joblib.dump(classifier, filename + "_model")
        elif opt == '-p':
            pred=True
            sequence=arg
    prediction = None
    if pred:
        prediction = predict([sequence],classifier,filename)
        print prediction
    return prediction

 if __name__ == '__main__':
    main(sys.argv[1:])
diff --git a/testPromoterDiscovery.py b/testPromoterDiscovery.py
 import unittest
 import os.path
 from os import listdir
 from promoterDiscovery import extract_features,main

 class TestCode(unittest.TestCase):

    def test_feature_extraction(self):
        corpus = ['ACG','GC']
        features, vectorizer = extract_features(corpus)
        feature_names = vectorizer.get_feature_names()
        for i in range(len(feature_names)):
            if feature_names[i]=="a":
                self.assertEqual(features[0,i], 1)
                self.assertEqual(features[1,i], 0)
            if feature_names[i]=="c":
                self.assertEqual(features[0,i], 1)
                self.assertEqual(features[1,i], 1)
            if feature_names[i]=="ac":
                self.assertEqual(features[0,i], 1)
                self.assertEqual(features[1,i], 0)
            if feature_names[i]=="gc":
                self.assertEqual(features[0,i], 0)
                self.assertEqual(features[1,i], 1)

    def test_train(self):
        main(["-ttest"])
        self.assertTrue(os.path.exists("./test_model"))
        self.assertTrue(os.path.exists("./test_vectorizer"))


    def test_predict(self):
        prediction = main(["-pCATCATTCTGAATTATGACCATGCTTTCATCTTCACAACTCAAGATCTATTTTCA"])
        self.assertIn(prediction[0], ["enhancer","promoter"])




 if __name__ == '__main__':
    unittest.main()
	included:
	promoterDiscovery.py - trains a classifier, and predicts if a DNA sequence is a promoter or enhancer
	testPromoterDiscovery - includes unit tests for the promoterDiscovery class
	this README

	usage:
	promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n"
	-h show this help message
	-c run 10 fold cross validation
	-t FILE train model and save into FILE
	-p SEQUENCE train model and predict label for SEQUENCE
	-p SEQUENCE -f FILE load model from FILE and predict label for SEQUENCE


	Notes about the code:

	- Following the sample code I recieved, the fasta files and training folder have to be placed in the same folder as the
	python code.


	Due to time constraints, and since this is my first time coding in python I opted for a simple solution.
	I used python's built-in option to extract BOW features from the DNA sequence, and a random forest classifier with
	the default parameters.
	I chose random forest since in many cases it has the best performance (see Fernandez-Delgado et. al. 2014). I also
	compared it to naive bayes and logistic regression, and it showed better results.
	There are two main directions in which to improve classification performance:
	1) More sophisticated features. I believe performance can be improved by adding patterns with some flexibility.
	For example, many promoters and enhancers contain transcription factor binding sites, which have a consensus sequence
	but allow variability. Therefore, a classifier could benefit from features that aim to look for these sequences,
	requiring a more flexible representation than the BOW which requires to find the exact sequence.
	2) Classifier parameter tuning. Random forest has many parameters to tune. By splitting the training data into 2 parts,
	it is possible to tune the parameters, and improve performance
	from sklearn.feature_extraction.text import CountVectorizer
	#from sklearn import linear_model
	#from sklearn.naive_bayes import GaussianNB
	#from sklearn.neighbors import KNeighborsClassifier
	from pysam import FastaFile
	from sklearn import metrics
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.cross_validation import cross_val_predict
	from os import listdir
	from os.path import join
	from sklearn.externals import joblib
	import sys, getopt


	def iter_peaks_and_labels():
	my_path = "train_data"
	for f in listdir("train_data"):
	if f.endswith(".bed"):
	with open(join(my_path, f)) as fp:
	for line in fp:
	data = line.split()
	yield (data[0], int(data[1]), int(data[2])), data[3]
	return


	def extract_features(corpus):
	vectorizer = CountVectorizer(min_df=1, analyzer='char',ngram_range=(1, 3))
	X = vectorizer.fit_transform(corpus)
	features = X.toarray()
	return features, vectorizer


	def train(classifier):
	corpus, labels = extract_dna()
	features,vectorizer = extract_features(corpus)
	classifier.fit(features,labels)
	return classifier, vectorizer


	def load(filename):
	vectorizer = joblib.load(filename + "_vectorizer")
	classifier = joblib.load(filename + "_model")
	return classifier, vectorizer


	def predict(dna_seq, classifier, filename=None):
	if filename is None:
	classifier, vectorizer = train(classifier)
	else:
	classifier, vectorizer = load(filename)
	features = vectorizer.transform(dna_seq)
	return classifier.predict(features.toarray())


	def extract_dna():
	min_region_size = 1000
	genome = FastaFile("GRCh38.genome.fa")
	corpus = []
	labels = []
	iter = iter_peaks_and_labels()
	for region, label in iter:
	# create a new region exactly min_region_size basepairs long centered on
	# region
	expanded_start = region[1] + (region[2] - region[1])/2 - min_region_size/2
	expanded_stop = expanded_start + min_region_size
	region = (region[0], expanded_start, expanded_stop)
	dna = genome.fetch(*region)
	corpus.append(dna)
	labels.append(label)
	return corpus, labels


	def estimate(classifier):
	corpus, labels = extract_dna()
	features,vectorizer = extract_features(corpus)
	print "done extracting"
	predicted = cross_val_predict(classifier, features, labels, cv=10)
	print("Classification report for classifier %s:\n%s\n"
	% (classifier, metrics.classification_report(labels, predicted)))
	print("Confusion matrix:\n%s" % metrics.confusion_matrix(labels, predicted))
	return


	def print_usage():
	print "usage: promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n" \
	"-h\t\t\t\t\tshow this help message\n" \
	"-c\t\t\t\t\trun 10 fold cross validation\n" \
	"-t FILE\t\t\t\ttrain model and save into FILE\n" \
	"-p SEQUENCE\t\t\ttrain model and predict label for SEQUENCE\n" \
	"-p SEQUENCE -f FILE\tload model from FILE and predict label for SEQUENCE\n"

	def main(argv):
	classifier = RandomForestClassifier()
	#classifier = linear_model.LogisticRegression(C=1e5)
	filename = None
	pred = False
	try:
	opts, args = getopt.getopt(argv,"hcp:f:t:")
	except getopt.GetoptError:
	print_usage()
	sys.exit(2)
	for opt, arg in opts:
	if opt == '-h':
	print_usage()
	sys.exit()
	elif opt == '-c':
	estimate(classifier)
	sys.exit()
	elif opt == '-f':
	filename = arg
	elif opt == '-t':
	filename = arg
	classifier, vectorizer = train(classifier)
	joblib.dump(vectorizer, filename + "_vectorizer")
	joblib.dump(classifier, filename + "_model")
	elif opt == '-p':
	pred=True
	sequence=arg
	prediction = None
	if pred:
	prediction = predict([sequence],classifier,filename)
	print prediction
	return prediction

	if __name__ == '__main__':
	main(sys.argv[1:])
	import unittest
	import os.path
	from os import listdir
	from promoterDiscovery import extract_features,main

	class TestCode(unittest.TestCase):

	def test_feature_extraction(self):
	corpus = ['ACG','GC']
	features, vectorizer = extract_features(corpus)
	feature_names = vectorizer.get_feature_names()
	for i in range(len(feature_names)):
	if feature_names[i]=="a":
	self.assertEqual(features[0,i], 1)
	self.assertEqual(features[1,i], 0)
	if feature_names[i]=="c":
	self.assertEqual(features[0,i], 1)
	self.assertEqual(features[1,i], 1)
	if feature_names[i]=="ac":
	self.assertEqual(features[0,i], 1)
	self.assertEqual(features[1,i], 0)
	if feature_names[i]=="gc":
	self.assertEqual(features[0,i], 0)
	self.assertEqual(features[1,i], 1)

	def test_train(self):
	main(["-ttest"])
	self.assertTrue(os.path.exists("./test_model"))
	self.assertTrue(os.path.exists("./test_vectorizer"))


	def test_predict(self):
	prediction = main(["-pCATCATTCTGAATTATGACCATGCTTTCATCTTCACAACTCAAGATCTATTTTCA"])
	self.assertIn(prediction[0], ["enhancer","promoter"])




	if __name__ == '__main__':
	unittest.main()