Last active
December 17, 2015 16:18
-
-
Save danielecook/5637393 to your computer and use it in GitHub Desktop.
A set of Python functions for fetching SNP records from the Entrez database and parsing them into a dictionary. Requires Biopython (pip install biopython).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pprint import pprint as pp | |
from Bio import Entrez | |
Entrez.email = "[email protected]" | |
def pull_line(var_set, line):
    """
    Extract named values from one flat-file line that is already split into fields.

    Each entry of ``var_set`` maps an output key to a selector, and the type
    of the selector decides how the value is located in ``line``:

    1.) ``str`` - a prefix such as "variablename=": the first field starting
        with that prefix is used, with the leading prefix removed.
    2.) ``int`` - a position: the field at that index is used unchanged.
    3.) anything else (e.g. a list) - a limited set of possible values: the
        first field that is a member of the collection is used
        [categorical variable].

    Selectors that match nothing are skipped, so the result only contains
    the keys that were actually found on the line.

    :param var_set: dict mapping output key -> selector (str, int or collection).
    :param line: list of field strings (a line split by ' | ').
    :returns: dict mapping each matched key to its extracted value.
    """
    line_set = {}
    for key, selector in var_set.items():
        if isinstance(selector, str):
            # Slice the prefix off instead of using str.replace(), which would
            # also delete any later occurrence of the prefix inside the value.
            value = next(
                (field[len(selector):] for field in line if field.startswith(selector)),
                None,
            )
        elif isinstance(selector, int):
            try:
                value = line[selector]
            except IndexError:
                # The line has fewer fields than the requested position.
                continue
        else:
            try:
                value = next((field for field in line if field in selector), None)
            except TypeError:
                # The selector does not support membership tests, so no field
                # can match it; skip the key like any other miss.
                continue
        if value is not None:
            line_set[key] = value
    return line_set
def pull_vars(var_set, line_start, line, multi=False):
    """
    Select the lines of a record that begin with ``line_start`` and parse them.

    Delegates and compiles data from entrez flat files dependent on whether
    the data being pulled is contained in unique vs. non-unique lines.
    For example - the first line of the flat file is always something like this:

        rs12009 | Homo Sapiens | 9606 | etc.

    This line is unique (refers to the RefSnp identifier) and only occurs once
    in each flat file. On the other hand, lines beginning with "ss[number]"
    refer to 'submitted snp' numbers and can appear multiple times.

    :param var_set: selector dict handed unchanged to pull_line().
    :param line_start: prefix identifying the lines of interest (e.g. "rs").
    :param line: list of the record's text lines (a list, despite the
        singular name); each matching line is split on ' | '.
    :param multi: when true, every matching line is parsed and a list of
        dicts is returned; otherwise only the first matching line is parsed.
    :returns: None when no line starts with ``line_start``; otherwise a list
        of dicts (``multi``) or a single dict.
    """
    matching = [raw.split(' | ') for raw in line if raw.startswith(line_start)]
    if not matching:
        return None
    if multi:
        # The same line type occurs several times - collect one dict per line.
        return [pull_line(var_set, fields) for fields in matching]
    # The line type is unique - only the first match is relevant.
    return pull_line(var_set, matching[0])
# Selector tables for each line type of an rs flat-file record, as
# (line prefix, selectors, line may repeat). The prefix doubles as the key
# under which the parsed data is stored in the result.
# Details of rs flat files are available here:
# ftp://ftp.ncbi.nlm.nih.gov/snp/specs/00readme.txt
_SNP_LINE_SPECS = (
    # rs line variables
    ("rs", {"organism": 1,
            "taxId": 2,
            "snpClass": 3,
            "genotype": "genotype=",
            "rsLinkout": "submitterlink=",
            "date": "updated "}, False),
    # ss line variables
    ("ss", {"ssId": 0,
            "handle": 1,
            "locSnpId": 2,
            "orient": "orient=",
            "exemplar": "ss_pick="}, True),
    # SNP line variables
    ("SNP", {"observed": "alleles=",
             "value": "het=",
             "stdError": "se(het)=",
             "validated": "validated=",
             "validProbMin": "min_prob=",
             "validProbMax": "max_prob=",
             "validation": "suspect=",
             "AlleleOrigin": ['unknown',
                              'germline',
                              'somatic',
                              'inherited',
                              'paternal',
                              'maternal',
                              'de-novo',
                              'bipaternal',
                              'unipaternal',
                              'not-tested',
                              'tested-inconclusive'],
             # NOTE(review): 'ambiguous-location;' carries a trailing ';' -
             # confirm against the flat-file spec whether that is intended.
             "snpType": ['notwithdrawn',
                         'artifact',
                         'gene-duplication',
                         'duplicate-submission',
                         'notspecified',
                         'ambiguous-location;',
                         'low-map-quality']}, False),
    # CLINSIG line variables
    ("CLINSIG", {"ClinicalSignificance": ['probable-pathogenic',
                                          'pathogenic',
                                          'other']}, False),
    # GMAF line variables
    ("GMAF", {"allele": "allele=",
              "sampleSize": "count=",
              "freq": "MAF="}, False),
    # CTG line variables
    ("CTG", {"groupLabel": "assembly=",
             "chromosome": "chr=",
             "physmapInt": "chr-pos=",
             "asnFrom": "ctg-start=",
             "asnTo": "ctg-end=",
             "loctype": "loctype=",
             "orient": "orient="}, True),
    # LOC line variables
    ("LOC", {"symbol": 1,
             "geneId": "locus_id=",
             "fxnClass": "fxn-class=",
             "allele": "allele=",
             "readingFrame": "frame=",
             "residue": "residue=",
             "aaPosition": "aa_position="}, True),
    # SEQ line variables
    ("SEQ", {"gi": 1,
             "source": "source-db=",
             "asnFrom": "seq-pos=",
             "orient": "orient="}, True),
)


def _parse_snp_record(snp_lines):
    """Parse the text lines of one record into a dict keyed by line type."""
    parsed = {}
    for prefix, selectors, repeats in _SNP_LINE_SPECS:
        parsed[prefix] = pull_vars(selectors, prefix, snp_lines, repeats)
    return parsed


def get_snp(q):
    """
    Takes as input an array of snp identifiers and returns
    a parsed dictionary of their data from Entrez.

    The identifiers are joined with ',' into a single efetch request against
    the 'SNP' database. The response text is split into records on blank
    lines, and each record is echoed to stdout before being parsed.

    :param q: list of snp identifier strings; an empty list returns an empty
        dict without contacting Entrez.
    :returns: dict keyed by the first ' | '-separated field of each record's
        first line (the rs identifier). Each value is a dict with the keys
        'rs', 'ss', 'SNP', 'CLINSIG', 'GMAF', 'CTG', 'LOC' and 'SEQ', holding
        whatever pull_vars() returns for that line type (None when the
        record has no such line).
    """
    if not q:
        return {}
    handle = Entrez.efetch(db='SNP', id=','.join(q), rettype='flt', retmode='flt')
    try:
        response = handle.read()
    finally:
        # Release the connection even if reading the response fails.
        handle.close()
    r = {}  # Return dictionary variable
    # Parse flat file response
    for snp_info in response.split('\n\n'):
        # Runs of blank lines leave whitespace-only chunks or chunks with a
        # leading newline; strip them so they cannot yield an '' record id.
        snp_info = snp_info.strip()
        if not snp_info:
            continue
        # Echo the raw record text to stdout before parsing.
        print(snp_info)
        snp = snp_info.split('\n')
        # The first field of the first ('rs') line identifies the record.
        rsId = snp[0].split(" | ")[0]
        r[rsId] = _parse_snp_record(snp)
    return r
# Example run: fetch and display the parsed record for a single snp id.
snp = get_snp(["12009"])
# pprint() writes to stdout itself and returns None, so wrapping it in a
# print would emit a stray "None" after the data.
pp(snp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment