Skip to content

Instantly share code, notes, and snippets.

@zfz
Last active December 30, 2015 16:19
Show Gist options
  • Select an option

  • Save zfz/7854350 to your computer and use it in GitHub Desktop.

Select an option

Save zfz/7854350 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import sys
import urllib
import urllib2
import re
import xml.parsers.expat
import argparse
import traceback
class DatabaseFinder(object):
    """Download one record from the NCBI E-utilities ``efetch`` endpoint.

    ``db`` and ``db_id`` are sent as the ``db`` and ``id`` query parameters;
    the remaining parameters come from the ``query_params`` template below.
    """

    ncbi_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    # Template for the query string. Instances never modify it; they work on
    # their own copy (see __init__).
    # The retrieval-type entry used to be spelled 'retypr': 'fast'. efetch's
    # parameter is named ``rettype`` and the FASTA view is ``fasta``, so the
    # misspelled pair was not a valid retrieval-type request.
    query_params = {'db': '', 'id': '', 'rettype': 'fasta', 'retmode': 'xml'}

    def __init__(self, db, db_id):
        """Remember which database and record id to query."""
        # Build a per-instance dict. Writing into the class-level dict would
        # make every DatabaseFinder share (and overwrite) the same db/id.
        self.query_params = dict(self.query_params, db=db, id=db_id)
        print('Querying database.')

    def query_xml_data(self):
        """Return the raw response body, or ``None`` if the request failed.

        Failures are reported with a printed message rather than raised, so
        callers only need to test the return value.
        """
        # Resolve the helpers locally so the method works on both Python 3
        # (urllib.parse / urllib.request) and Python 2 (urllib / urllib2).
        try:
            from urllib.parse import urlencode
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlencode
            from urllib2 import urlopen

        try:
            query_url = self.ncbi_url + urlencode(self.query_params)
            response = urlopen(query_url)
            try:
                return response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
        except Exception:
            # Best-effort boundary: network, HTTP and URL errors all end up
            # here. KeyboardInterrupt/SystemExit are deliberately not caught.
            print('No results queried.')
            return None
class XmlParser(object):
    """Extract the character data of an XML document as a list of lines.

    After ``parse_xml()`` succeeds, ``char_list`` holds every non-blank text
    chunk reported by expat, stripped of surrounding whitespace. The last
    element is always an empty string because the internal buffer ends with
    a newline before it is split.
    """

    # Class-level default kept for code that reads ``XmlParser.char_list``;
    # each instance gets its own list in __init__.
    char_list = []

    def __init__(self, r):
        """Store the XML text ``r`` to be parsed later by ``parse_xml``."""
        self.r = r
        self.textBuff = ""
        self.char_list = []
        print("Parsing xml.")

    def char_data(self, data):
        """expat character-data callback: buffer one stripped text chunk."""
        data = data.strip()
        if not data:
            return
        if not isinstance(data, str):
            # Python 2 delivers ``unicode``; fold it to a native string.
            # 'replace' substitutes '?' for non-ASCII characters instead of
            # raising, which used to abort the whole parse.
            data = data.encode('ascii', 'replace')
        self.textBuff += data + "\n"

    def parse_xml(self):
        """Parse ``self.r`` and fill ``self.char_list``.

        On malformed or truncated input a message is printed and
        ``char_list`` is left unchanged.
        """
        parser = xml.parsers.expat.ParserCreate()
        parser.CharacterDataHandler = self.char_data
        try:
            # The whole document is passed in one call, so flag it as final;
            # otherwise expat cannot report an unterminated document.
            parser.Parse(self.r, True)
        except (xml.parsers.expat.ExpatError, TypeError, ValueError):
            print("Parse xml error occurs.")
            return
        self.char_list = self.textBuff.split('\n')
class ReParser(object):
    """Keep the text lines made up only of the characters in ``re_input``.

    ``re_input`` is placed inside a regex character class (``^[...]*$``), so
    every character it contains -- including any ``|`` separators -- is
    accepted literally.
    """

    # Class-level default kept for code that reads ``ReParser.target_list``;
    # each instance gets its own list in __init__.
    target_list = []

    def __init__(self, xml_text, re_input):
        """Store the lines to scan (``xml_text``) and compile the pattern."""
        self.l = xml_text
        self.regex = re.compile("^[" + re_input + "]*$")
        # Per-instance result list. Appending to the shared class attribute
        # made matches from one ReParser show up in every other one.
        self.target_list = []
        print('Parsing re.')

    def parse_re(self):
        """Append every non-empty matching line to ``self.target_list``."""
        for item in self.l:
            try:
                matches = self.regex.findall(item.strip())
            except (AttributeError, TypeError):
                # Item is not a string the pattern can be applied to;
                # report it and carry on with the remaining lines.
                print('Parse re error occurs.')
                continue
            # The pattern also matches the empty string; skip those hits.
            self.target_list.extend(m for m in matches if m != '')
def output_file(l, file_name):
    """Write each string in ``l`` on its own line to ``./<file_name>``.

    The file is created (or truncated) relative to the current working
    directory. A confirmation message is printed once writing is done.
    """
    # ``with`` closes the handle even if one of the writes raises.
    with open('./' + file_name, 'w') as fp:
        for item in l:
            fp.write(item)
            fp.write('\n')
    print("Finish writring output file.")
def format_reinput(s):
    """Turn a pattern such as ``'(A|C|G|T)'`` into a case-insensitive one.

    The text between the first ``(`` and the last ``)`` is extracted and
    joined with its lower-case and upper-case variants using ``|``, e.g.
    ``'(A|C|G|T)'`` -> ``'A|C|G|T|a|c|g|t'``. Variants identical to one
    already present are not repeated. The variants come from plain
    ``str.lower()`` / ``str.upper()`` applied to the whole text.

    Returns ``None`` (after printing a message) when ``s`` is not a string,
    has no parenthesised part, or the parentheses are empty.
    """
    try:
        found = re.search(r"\((.*)\)", s)
    except TypeError:
        # ``s`` is not a string (e.g. None).
        found = None

    inner = found.group(1) if found else ''
    if not inner:
        print("Bad re input.")
        return None

    # Add both case variants: only adding lower() meant a lower-case input
    # never matched upper-case data.
    variants = [inner]
    for variant in (inner.lower(), inner.upper()):
        if variant not in variants:
            variants.append(variant)
    return '|'.join(variants)
def run_project(db, db_id, file_name, re_input):
    """Run the whole pipeline: fetch, parse, filter and write.

    1. Query ``db`` for record ``db_id``.
    2. Normalise ``re_input`` with ``format_reinput``.
    3. Split the XML response into text lines.
    4. Keep the lines accepted by the pattern and write them to
       ``file_name``.

    The function stops quietly after step 1 or 2 when the query returns
    nothing or the pattern is rejected.
    """
    finder = DatabaseFinder(db=db, db_id=db_id)
    raw_xml = finder.query_xml_data()
    if not raw_xml:
        return

    pattern = format_reinput(re_input)
    if not pattern:
        return

    xml_lines = XmlParser(r=raw_xml)
    xml_lines.parse_xml()

    matcher = ReParser(xml_lines.char_list, pattern)
    matcher.parse_re()

    output_file(matcher.target_list, file_name)
if __name__ == '__main__':
    # Command line: <database> <id> <output_file> <re pattern>
    if len(sys.argv) != 5:
        print("Params should follow the order: database, id, output_file and re pattern.")
        print("Example: python sf_project.py 'nucleotide' '30271926' 'out.txt' '(A|C|G|T)'")
        # Stop here: going on would call run_project with undefined
        # arguments and report a misleading generic error.
        sys.exit(1)

    db, db_id, file_name, re_input = sys.argv[1:]
    # Echo the parameters back so the user can see what is being run.
    for v in sys.argv[1:]:
        print(v)

    try:
        run_project(db, db_id, file_name, re_input)
    except Exception:
        # Top-level boundary: keep the friendly message on stdout, but put
        # the traceback on stderr so the failure can be diagnosed.
        sys.stderr.write(traceback.format_exc())
        print('Error occurs when running the script.')
        sys.exit(1)
@zfz
Copy link
Author

zfz commented Dec 8, 2013

To run the script: python sf_project.py 'nucleotide' '30271926' 'out.txt' '(A|C|G|T)'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment