Created
January 15, 2015 23:02
-
-
Save pontikos/b994990cd7cb2482fd88 to your computer and use it in GitHub Desktop.
splits ALT alleles over multiple lines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys,gzip | |
| vcf = gzip.open(sys.argv[1],"r") | |
| #these 9 column headers are standard to all VCF files | |
| STD_HEADERS=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT'] | |
| #the samples headers depend on the number of samples in the file | |
| #which we find out once we read the #CHROM line | |
| SAMPLE_HEADERS=[] | |
| for line in vcf: | |
| line=line.strip() | |
| #remove beginning | |
| #lines which start with '###' | |
| #are not tab separated | |
| if line.startswith('##'): | |
| #print(line) | |
| continue | |
| #this is tab separated line | |
| s=line.split("\t") | |
| #header line yay!: #CHROM ... | |
| if line.startswith('#'): | |
| print(line) | |
| headers=s | |
| headers[0]=headers[0].strip('#') | |
| #the first 9 names in the header are standard (see above) | |
| if (headers[0:len(STD_HEADERS)] != STD_HEADERS): raise 'hell' | |
| #everything else in the header is a sample name | |
| SAMPLE_HEADERS=headers[len(STD_HEADERS):] | |
| continue | |
| #you can now access elements by their header name | |
| s=dict(zip(headers,s)) | |
| #QUAL=., FILTER=., INFO=., FORMAT=GT | |
| #just set everything except GT to . | |
| #@pontikos: doesn't VEP use the other fields though? | |
| for h in ['QUAL','FILTER','INFO']: s[h]='.' | |
| s['FORMAT']='GT' | |
| #split alternate alleles | |
| alternative_alleles=s['ALT'].split(",") | |
| format=s['FORMAT'].split(':') | |
| #extract the genotypes of the samples | |
| #genotype is always first element of array | |
| for h in SAMPLE_HEADERS: s[h]=dict(zip(format,s[h].split(":")))['GT'] | |
| #if single alternate allele | |
| #then just print out as normal | |
| if len(alternative_alleles)<=1: | |
| print('\t'.join([s[h] for h in headers])) | |
| continue | |
| #otherwise split over as many lines as there are alternative alleles | |
| #each genotype then takes a 1 if matches the alternative or a 0 otherwise | |
| for i in range(len(alternative_alleles)): | |
| s['ALT']=alternative_alleles[i] | |
| print( '\t'.join( [s[h] for h in STD_HEADERS] + ['/'.join([['0','1'][int(int(g)==i)] for g in s[h].split('/')]) for h in SAMPLE_HEADERS] ) ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment