Last active
December 30, 2015 16:19
-
-
Save zfz/7854350 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #-*- coding: utf-8 -*- | |
| import sys | |
| import urllib | |
| import urllib2 | |
| import re | |
| import xml.parsers.expat | |
| import argparse | |
| import traceback | |
class DatabaseFinder(object):
    """Fetch a record from the NCBI efetch endpoint over HTTP.

    Attributes:
        ncbi_url: Base efetch URL; the url-encoded query is appended to it.
        query_params: Class-level template of the query parameters. Each
            instance works on its own copy with 'db' and 'id' filled in.
    """

    ncbi_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    # NOTE(review): 'retypr': 'fast' looks like a typo for the efetch
    # parameter 'rettype': 'fasta'. It is left untouched because changing it
    # would alter the response the rest of the script parses -- confirm
    # against the E-utilities documentation before fixing.
    query_params = {'db': '', 'id': '', 'retypr': 'fast', 'retmode': 'xml'}

    def __init__(self, db, db_id):
        """Remember which database and record id to query.

        Args:
            db: Value sent as the 'db' query parameter.
            db_id: Value sent as the 'id' query parameter.
        """
        # Build a per-instance copy: writing into the class-level dict would
        # make every instance share (and overwrite) the same 'db' and 'id'.
        self.query_params = dict(self.query_params, db=db, id=db_id)
        print('Querying database.')

    def query_xml_data(self):
        """Run the query and return the raw response body.

        Returns:
            Whatever ``read()`` on the opened URL returns, or None when
            building or performing the request fails.
        """
        try:
            query_url = self.ncbi_url + urllib.urlencode(self.query_params)
            response = urllib2.urlopen(query_url)
            try:
                return response.read()
            finally:
                # Release the connection even if read() raises.
                response.close()
        except Exception:
            # Best effort: any failure is reported as "no result" rather than
            # propagated, but KeyboardInterrupt/SystemExit are not swallowed.
            print('No results queried.')
            return None
class XmlParser(object):
    """Collect the character data of an XML document as a list of lines.

    Attributes:
        r: The XML text handed to the constructor.
        textBuff: Every non-blank character-data chunk seen so far, each
            followed by a newline.
        char_list: ``textBuff`` split on newlines; set by ``parse_xml``.
    """

    # Class-level default so the attribute exists even when parsing fails;
    # parse_xml() rebinds it on the instance instead of mutating this list.
    char_list = []

    def __init__(self, r):
        """Store the XML text ``r`` and start with an empty buffer."""
        self.r = r
        self.textBuff = ""
        print("Parsing xml.")

    def char_data(self, data):
        """Character-data handler: append one stripped chunk to ``textBuff``.

        Args:
            data: Text chunk delivered by the expat parser.
        """
        data = data.strip()
        if not data:
            return
        # Replace non-ASCII characters with '?' instead of raising, so a
        # single stray character no longer aborts the whole parse.
        encoded = data.encode('ascii', 'replace')
        if not isinstance(encoded, str):
            # Python 3: encode() produced bytes; convert back to text.
            encoded = encoded.decode('ascii')
        self.textBuff += encoded + "\n"

    def parse_xml(self):
        """Parse ``self.r`` and store the collected lines in ``char_list``.

        On failure a message is printed and ``char_list`` is left unchanged.
        """
        try:
            p = xml.parsers.expat.ParserCreate()
            p.CharacterDataHandler = self.char_data
            # The whole document is passed in one call, so mark it as final
            # to let expat flush its buffers and report truncated input.
            p.Parse(self.r, True)
            self.char_list = self.textBuff.split('\n')
        except Exception:
            print("Parse xml error occurs.")
class ReParser(object):
    """Keep the lines that consist only of the allowed characters.

    Attributes:
        l: Iterable of text lines to filter.
        regex: Compiled pattern ``^[<re_input>]*$``.
        target_list: Non-empty matches, in input order.
    """

    # Kept so that ReParser.target_list still exists for outside readers;
    # every instance gets its own list in __init__.
    target_list = []

    def __init__(self, xml_text, re_input):
        """Compile the filter pattern.

        Args:
            xml_text: Iterable of lines to test.
            re_input: Characters placed inside a regex character class; note
                that separators such as '|' therefore become allowed
                characters too.
        """
        self.l = xml_text
        self.regex = re.compile("^[" + re_input + "]*$")
        # Per-instance result list: appending to the class-level list would
        # leak matches from one parser into every other one.
        self.target_list = []
        print('Parsing re.')

    def parse_re(self):
        """Append every non-empty match of each stripped line to target_list."""
        for item in self.l:
            try:
                hits = self.regex.findall(item.strip())
            except (AttributeError, TypeError):
                # Item is not a string the pattern can be applied to; report
                # it and carry on with the remaining lines.
                print('Parse re error occurs.')
                continue
            # The pattern also matches the empty string; drop those hits.
            self.target_list.extend(s for s in hits if s != '')
# Output the data parsed.
def output_file(l, file_name):
    """Write each item of ``l`` on its own line to ``./<file_name>``.

    Args:
        l: Iterable of strings to write.
        file_name: Name of the output file, created in (or relative to) the
            current working directory; an existing file is overwritten.
    """
    # The context manager closes the file even when a write fails.
    with open('./' + file_name, 'w') as fp:
        for item in l:
            fp.write(item)
            fp.write('\n')
    print("Finish writring output file.")
# Format the re pattern.
def format_reinput(s):
    """Extract the text between parentheses and add its lowercase variant.

    Args:
        s: Pattern string such as ``'(A|C|G|T)'``.

    Returns:
        ``inner + '|' + inner.lower()`` where ``inner`` is the text between
        the first '(' and the last ')', so both uppercase and lowercase
        work. Returns None (after printing a message) when ``s`` contains
        no parenthesised part or is not a string.
    """
    try:
        found = re.search(r"\((.*)\)", s)
    except TypeError:
        # s is not something a regex can be run against (e.g. None).
        found = None
    if found is None:
        print("Bad re input.")
        return None
    inner = found.group(1)
    return inner + '|' + inner.lower()
def run_project(db, db_id, file_name, re_input):
    """Query a record, filter its text lines by the pattern and save them.

    Args:
        db: Database name passed to DatabaseFinder.
        db_id: Record id passed to DatabaseFinder.
        file_name: Name of the file the matching lines are written to.
        re_input: Parenthesised pattern handed to format_reinput.

    Nothing is written when the query returns no data or the pattern cannot
    be formatted.
    """
    fetched = DatabaseFinder(db=db, db_id=db_id).query_xml_data()
    if not fetched:
        return

    char_class = format_reinput(re_input)
    if not char_class:
        return

    # Flatten the XML into lines, then keep only the lines that match.
    xml_reader = XmlParser(r=fetched)
    xml_reader.parse_xml()
    line_filter = ReParser(xml_reader.char_list, char_class)
    line_filter.parse_re()
    output_file(line_filter.target_list, file_name)
if __name__ == '__main__':
    # Exactly four arguments are expected after the script name:
    # database, id, output file and re pattern.
    if len(sys.argv) != 5:
        print("Params should follow the order: database, id, output_file and re pattern.")
        print("Example: python sf_project.py 'nucleotide' '30271926' 'out.txt' '(A|C|G|T)'")
        # Stop here: continuing would call run_project with undefined names
        # and report the usage mistake as a runtime error.
        sys.exit(1)

    db, db_id, file_name, re_input = sys.argv[1:]
    # Echo the parameters that are about to be used.
    for v in sys.argv[1:]:
        print(v)

    try:
        run_project(db, db_id, file_name, re_input)
    except Exception:
        print('Error occurs when running the script.')
        # Show the cause instead of hiding it behind the generic message.
        traceback.print_exc()
        sys.exit(1)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To run the script: python sf_project.py 'nucleotide' '30271926' 'out.txt' '(A|C|G|T)'