Created
May 1, 2015 08:39
-
-
Save knmkr/03fdabf3d4af2fb69e67 to your computer and use it in GitHub Desktop.
[bioinfo] Get FASTA sequence of rs ID
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import argparse | |
import gzip | |
import re | |
import sys | |
from Bio import SeqIO | |
from Bio.Alphabet import IUPAC | |
def _main(): | |
parser = argparse.ArgumentParser(description='') | |
parser.add_argument('multi_fasta_path', help='Multi-fasta file downloaded from ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta') | |
parser.add_argument('query_path', help='Text file, each line contains one rs ID for query.') | |
args = parser.parse_args() | |
rs_regexp = re.compile('.*(rs(\d+)).*') | |
seq_split_regexp = re.compile('([^atgc]+)', re.IGNORECASE) | |
# | |
query_rs = set() | |
with open(args.query_path, 'r') as qfin: | |
for line in qfin: | |
query_rs.update([line.strip()]) | |
print >>sys.stderr, '[INFO] {}'.format(query_rs) | |
# | |
with gzip.open(args.multi_fasta_path, 'r') as fin: | |
for i, record in enumerate(SeqIO.parse(fin, 'fasta', alphabet=IUPAC.ambiguous_dna)): | |
rs_match = rs_regexp.match(record.id) | |
rs_id = rs_match.group(1) if rs_match else '' | |
if rs_id in query_rs: | |
seqs = seq_split_regexp.split(str(record.seq)) | |
# FIXME | |
if not len(seqs) == 3: | |
print >>sys.stderr, '[WARN] ambigous dna found more than one.' | |
print >>sys.stderr, '[WARN] {}'.format(rs_id) | |
else: | |
seq_5, seq_ambigous, seq_3 = seqs | |
print ','.join([rs_id, seq_5, seq_ambigous, seq_3]) | |
if __name__ == '__main__': | |
_main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Download all multi-FASTA files from FTP for current dbSNP build:
Create a query text file, then run posted python script:
Results: