Created
August 16, 2016 17:13
-
-
Save MishaelRosenthal/5a75ffe65836db92f50f9c6ae82f779e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| included: | |
| promoterDiscovery.py - trains a classifier, and predicts if a DNA sequence is a promoter or enhancer | |
| testPromoterDiscovery - includes unit tests for the promoterDiscovery class | |
| this README | |
| usage: | |
| promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n" | |
| -h show this help message | |
| -c run 10 fold cross validation | |
| -t FILE train model and save into FILE | |
| -p SEQUENCE train model and predict label for SEQUENCE | |
| -p SEQUENCE -f FILE load model from FILE and predict label for SEQUENCE | |
| Notes about the code: | |
| - Following the sample code I recieved, the fasta files and training folder have to be placed in the same folder as the | |
| python code. | |
| Due to time constraints, and since this is my first time coding in python I opted for a simple solution. | |
| I used python's built-in option to extract BOW features from the DNA sequence, and a random forest classifier with | |
| the default parameters. | |
| I chose random forest since in many cases it has the best performance (see Fernandez-Delgado et. al. 2014). I also | |
| compared it to naive bayes and logistic regression, and it showed better results. | |
| There are two main directions in which to improve classification performance: | |
| 1) More sophisticated features. I believe performance can be improved by adding patterns with some flexibility. | |
| For example, many promoters and enhancers contain transcription factor binding sites, which have a consensus sequence | |
| but allow variability. Therefore, a classifier could benefit from features that aim to look for these sequences, | |
| requiring a more flexible representation than the BOW which requires to find the exact sequence. | |
| 2) Classifier parameter tuning. Random forest has many parameters to tune. By splitting the training data into 2 parts, | |
| it is possible to tune the parameters, and improve performance |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.feature_extraction.text import CountVectorizer | |
| #from sklearn import linear_model | |
| #from sklearn.naive_bayes import GaussianNB | |
| #from sklearn.neighbors import KNeighborsClassifier | |
| from pysam import FastaFile | |
| from sklearn import metrics | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.cross_validation import cross_val_predict | |
| from os import listdir | |
| from os.path import join | |
| from sklearn.externals import joblib | |
| import sys, getopt | |
| def iter_peaks_and_labels(): | |
| my_path = "train_data" | |
| for f in listdir("train_data"): | |
| if f.endswith(".bed"): | |
| with open(join(my_path, f)) as fp: | |
| for line in fp: | |
| data = line.split() | |
| yield (data[0], int(data[1]), int(data[2])), data[3] | |
| return | |
| def extract_features(corpus): | |
| vectorizer = CountVectorizer(min_df=1, analyzer='char',ngram_range=(1, 3)) | |
| X = vectorizer.fit_transform(corpus) | |
| features = X.toarray() | |
| return features, vectorizer | |
| def train(classifier): | |
| corpus, labels = extract_dna() | |
| features,vectorizer = extract_features(corpus) | |
| classifier.fit(features,labels) | |
| return classifier, vectorizer | |
| def load(filename): | |
| vectorizer = joblib.load(filename + "_vectorizer") | |
| classifier = joblib.load(filename + "_model") | |
| return classifier, vectorizer | |
| def predict(dna_seq, classifier, filename=None): | |
| if filename is None: | |
| classifier, vectorizer = train(classifier) | |
| else: | |
| classifier, vectorizer = load(filename) | |
| features = vectorizer.transform(dna_seq) | |
| return classifier.predict(features.toarray()) | |
| def extract_dna(): | |
| min_region_size = 1000 | |
| genome = FastaFile("GRCh38.genome.fa") | |
| corpus = [] | |
| labels = [] | |
| iter = iter_peaks_and_labels() | |
| for region, label in iter: | |
| # create a new region exactly min_region_size basepairs long centered on | |
| # region | |
| expanded_start = region[1] + (region[2] - region[1])/2 - min_region_size/2 | |
| expanded_stop = expanded_start + min_region_size | |
| region = (region[0], expanded_start, expanded_stop) | |
| dna = genome.fetch(*region) | |
| corpus.append(dna) | |
| labels.append(label) | |
| return corpus, labels | |
| def estimate(classifier): | |
| corpus, labels = extract_dna() | |
| features,vectorizer = extract_features(corpus) | |
| print "done extracting" | |
| predicted = cross_val_predict(classifier, features, labels, cv=10) | |
| print("Classification report for classifier %s:\n%s\n" | |
| % (classifier, metrics.classification_report(labels, predicted))) | |
| print("Confusion matrix:\n%s" % metrics.confusion_matrix(labels, predicted)) | |
| return | |
| def print_usage(): | |
| print "usage: promoterDiscovery.py [-h] [-c] [-p SEQUENCE] [-f FILE] [-t FILE]\n" \ | |
| "-h\t\t\t\t\tshow this help message\n" \ | |
| "-c\t\t\t\t\trun 10 fold cross validation\n" \ | |
| "-t FILE\t\t\t\ttrain model and save into FILE\n" \ | |
| "-p SEQUENCE\t\t\ttrain model and predict label for SEQUENCE\n" \ | |
| "-p SEQUENCE -f FILE\tload model from FILE and predict label for SEQUENCE\n" | |
| def main(argv): | |
| classifier = RandomForestClassifier() | |
| #classifier = linear_model.LogisticRegression(C=1e5) | |
| filename = None | |
| pred = False | |
| try: | |
| opts, args = getopt.getopt(argv,"hcp:f:t:") | |
| except getopt.GetoptError: | |
| print_usage() | |
| sys.exit(2) | |
| for opt, arg in opts: | |
| if opt == '-h': | |
| print_usage() | |
| sys.exit() | |
| elif opt == '-c': | |
| estimate(classifier) | |
| sys.exit() | |
| elif opt == '-f': | |
| filename = arg | |
| elif opt == '-t': | |
| filename = arg | |
| classifier, vectorizer = train(classifier) | |
| joblib.dump(vectorizer, filename + "_vectorizer") | |
| joblib.dump(classifier, filename + "_model") | |
| elif opt == '-p': | |
| pred=True | |
| sequence=arg | |
| prediction = None | |
| if pred: | |
| prediction = predict([sequence],classifier,filename) | |
| print prediction | |
| return prediction | |
| if __name__ == '__main__': | |
| main(sys.argv[1:]) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import unittest | |
| import os.path | |
| from os import listdir | |
| from promoterDiscovery import extract_features,main | |
| class TestCode(unittest.TestCase): | |
| def test_feature_extraction(self): | |
| corpus = ['ACG','GC'] | |
| features, vectorizer = extract_features(corpus) | |
| feature_names = vectorizer.get_feature_names() | |
| for i in range(len(feature_names)): | |
| if feature_names[i]=="a": | |
| self.assertEqual(features[0,i], 1) | |
| self.assertEqual(features[1,i], 0) | |
| if feature_names[i]=="c": | |
| self.assertEqual(features[0,i], 1) | |
| self.assertEqual(features[1,i], 1) | |
| if feature_names[i]=="ac": | |
| self.assertEqual(features[0,i], 1) | |
| self.assertEqual(features[1,i], 0) | |
| if feature_names[i]=="gc": | |
| self.assertEqual(features[0,i], 0) | |
| self.assertEqual(features[1,i], 1) | |
| def test_train(self): | |
| main(["-ttest"]) | |
| self.assertTrue(os.path.exists("./test_model")) | |
| self.assertTrue(os.path.exists("./test_vectorizer")) | |
| def test_predict(self): | |
| prediction = main(["-pCATCATTCTGAATTATGACCATGCTTTCATCTTCACAACTCAAGATCTATTTTCA"]) | |
| self.assertIn(prediction[0], ["enhancer","promoter"]) | |
| if __name__ == '__main__': | |
| unittest.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment