- 
      
- 
        Save PoisonAlien/be1af2a53d5d7bbe2c7a to your computer and use it in GitHub Desktop. 
| __author__ = "Anand M" | |
| ''' | |
| Takes output file generated by VarScan2 somatic programme and converts the formats. | |
| ''' | |
| import argparse, math, re | |
| parser = argparse.ArgumentParser( | |
| description="Converts VarScan2 somatic vcf to native format and vice-versa.\nInput is automatically detected") | |
| parser.add_argument('input', help='Input file generated by VarScan2 somatic') | |
| # parser.add_argument('output', help='output file name') | |
| args = parser.parse_args() | |
| # Function to print header line | |
| def printNativeHeader(): | |
| """ | |
| :rtype : Null | |
| """ | |
| print( | |
| "chrom\tposition\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\tnormal_gt\ttumor_reads1\ttumor_reads\ttumor_var_freq\ttumor_gt\tsomatic_status\tvariant_p_value\tsomatic_p_value\ttumor_reads1_plus\ttumor_reads1_minus\ttumor_reads2_plus\ttumor_reads2_minus\tnormal_reads1_plus\tnormal_reads1_minus\tnormal_reads2_plus\tnormal_reads2_minus") | |
| # Function to print vcf header | |
| def printVcfHeader(): | |
| print("##fileformat=VCFv4.1\n" | |
| "##source=VarScan2\n" | |
| "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total depth of quality bases\">\n" | |
| "##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Indicates if record is a somatic mutation\">\n" | |
| "##INFO=<ID=SS,Number=1,Type=String,Description=\"Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)\">\n" | |
| "##INFO=<ID=SSC,Number=1,Type=String,Description=\"Somatic score in Phred scale (0-255) derived from somatic p-value\">\n" | |
| "##INFO=<ID=GPV,Number=1,Type=Float,Description=\"Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls\">\n" | |
| "##INFO=<ID=SPV,Number=1,Type=Float,Description=\"Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls\">\n" | |
| "##FILTER=<ID=str10,Description=\"Less than 10% or more than 90% of variant supporting reads on one strand\">\n" | |
| "##FILTER=<ID=indelError,Description=\"Likely artifact due to indel reads at this position\">\n" | |
| "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" | |
| "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n" | |
| "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n" | |
| "##FORMAT=<ID=RD,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases (reads1)\">\n" | |
| "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases (reads2)\">\n" | |
| "##FORMAT=<ID=FREQ,Number=1,Type=String,Description=\"Variant allele frequency\">\n" | |
| "##FORMAT=<ID=DP4,Number=1,Type=String,Description=\"Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev\">\n" | |
| "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR") | |
| # Function to convert vcf record to NativeFormat record | |
| def makeNativeRec(vcfIp): | |
| """ | |
| :rtype : Null | |
| :type nativeIp: basestring | |
| """ | |
| nativeLine = vcfIp.split("\t") | |
| somaticDict = {'0': 'Reference', '1': 'Germline', '2': 'Somatic', '3': 'LOH', '5': 'Unknown'} | |
| chrom = nativeLine[0] | |
| position = nativeLine[1] | |
| ref = nativeLine[3] | |
| var = nativeLine[4] | |
| normalInfo = nativeLine[9] | |
| tumorInfo = nativeLine[10] | |
| normal_reads1 = normalInfo.split(":")[3] | |
| normal_reads2 = normalInfo.split(":")[4] | |
| normal_var_freq = normalInfo.split(":")[5] | |
| normal_gt = normalInfo.split(":")[0] | |
| normal_dp4 = normalInfo.split(":")[6] | |
| normal_reads1_plus = normal_dp4.split(",")[0] | |
| normal_reads1_minus = normal_dp4.split(",")[1] | |
| normal_reads2_plus = normal_dp4.split(",")[2] | |
| normal_reads2_minus = normal_dp4.split(",")[3] | |
| tumor_reads1 = tumorInfo.split(":")[3] | |
| tumor_reads2 = tumorInfo.split(":")[4] | |
| tumor_var_freq = tumorInfo.split(":")[5] | |
| tumor_gt = tumorInfo.split(":")[0] | |
| tumor_dp4 = tumorInfo.split(":")[6] | |
| tumor_reads1_plus = tumor_dp4.split(",")[0] | |
| tumor_reads1_minus = tumor_dp4.split(",")[1] | |
| tumor_reads2_plus = tumor_dp4.split(",")[2] | |
| tumor_reads2_minus = tumor_dp4.split(",")[3] | |
| info = nativeLine[7] | |
| infoDict = {} | |
| infoSpl = info.split(";") | |
| for rec in infoSpl: | |
| recSpl = rec.split("=") | |
| if not len(recSpl) == 1: | |
| infoDict[recSpl[0]] = recSpl[1] | |
| somatic_status = somaticDict[infoDict['SS']] | |
| variant_p_value = infoDict['GPV'] | |
| somatic_p_value = infoDict['SPV'] | |
| print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % | |
| (chrom, position, ref, var, normal_reads1, normal_reads2, normal_var_freq, normal_gt, tumor_reads1, | |
| tumor_reads2, tumor_var_freq, tumor_gt, somatic_status, variant_p_value, somatic_p_value, tumor_reads1_plus, | |
| tumor_reads1_minus, tumor_reads2_plus, tumor_reads2_minus, normal_reads1_plus, normal_reads1_minus, | |
| normal_reads2_plus, normal_reads2_minus)) | |
| ##### | |
| # Function to convert Native to VCF record | |
| def makeVcfRecord(nativeIp): | |
| """ | |
| :rtype : Null | |
| """ | |
| somaticDict = {'Reference': '0', 'Germline': '1', 'Somatic': '2', 'LOH': '3', 'Unknown': '5'} | |
| nIp = nativeIp.split("\t") | |
| chrom = nIp[0] | |
| pos = nIp[1] | |
| id = '.' | |
| ref = nIp[2] | |
| alt = nIp[3] | |
| qual = '.' | |
| filter = 'PASS' | |
| dp = int(nIp[4]) + int(nIp[5]) + int(nIp[8]) + int(nIp[9]) | |
| ss = somaticDict[nIp[12]] | |
| ssc = -10 * math.log10(float(nIp[14])) | |
| gpv = nIp[13] | |
| spv = nIp[14] | |
| if ss == '2': | |
| info = "DP=" + str(dp) + ";SOMATIC;" + "SS=" + ss + ";" + "SSC=" + str( | |
| int(ssc)) + ";" + "GPV=" + gpv + ";" + "SPV=" + spv | |
| else: | |
| info = "DP=" + str(dp) + ";" + "SS=" + ss + ";" + "SSC=" + str( | |
| int(ssc)) + ";" + "GPV=" + gpv + ";" + "SPV=" + spv | |
| vcf_format = "GT:GQ:DP:RD:AD:FREQ:DP4" | |
| normal_var_freq = float(re.sub("%", "", nIp[6])) | |
| if normal_var_freq > 10 and normal_var_freq < 75: | |
| gt = '0/1' | |
| elif normal_var_freq > 75: | |
| gt = '1/1' | |
| else: | |
| gt = "0/0" | |
| tumor_var_freq = float(re.sub("%", "", nIp[10])) | |
| if tumor_var_freq > 10 and tumor_var_freq < 75: | |
| gt2 = '0/1' | |
| elif tumor_var_freq > 75: | |
| gt2 = '1/1' | |
| else: | |
| gt2 = "0/0" | |
| gq = '.' | |
| dp2 = int(nIp[4]) + int(nIp[5]) | |
| rd = nIp[4] | |
| ad = nIp[5] | |
| freq = nIp[6] | |
| dp4 = nIp[19] + ',' + nIp[20] + ',' + nIp[21] + ',' + nIp[22] | |
| normal_format = gt + ':' + gq + ":" + str(dp2) + ':' + rd + ':' + ad + ':' + freq + ':' + dp4 | |
| dp3 = int(nIp[8]) + int(nIp[9]) | |
| rd2 = nIp[8] | |
| ad2 = nIp[9] | |
| freq2 = nIp[10] | |
| dp42 = nIp[15] + ',' + nIp[16] + ',' + nIp[17] + ',' + nIp[19] | |
| tumor_format = gt2 + ':' + gq + ":" + str(dp3) + ':' + rd2 + ':' + ad2 + ':' + freq2 + ':' + dp42 | |
| print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % | |
| (chrom, pos, id, ref, alt, qual, filter, info, vcf_format, normal_format, tumor_format)) | |
| #### | |
| def NativeToVcf(inputFile): | |
| printVcfHeader() | |
| vs = open(inputFile, 'r') | |
| for line in vs.readlines(): | |
| if not line.startswith("chrom"): | |
| makeVcfRecord(line.strip()) | |
| vs.close() | |
| ### | |
| def vcfToNative(inputFile): | |
| vs = open(inputFile, 'r') | |
| printNativeHeader() | |
| for line in vs.readlines(): | |
| if not line.startswith("#"): | |
| makeNativeRec(line.strip()) | |
| vs.close() | |
| #### | |
| vsIp = open(args.input, 'r') | |
| firstLine = vsIp.readline().strip() | |
| if firstLine.startswith("##fileformat="): | |
| vcfToNative(args.input) | |
| else: | |
| NativeToVcf(args.input) | |
| vsIp.close() | 
Sorry just saw your question.
Just download the script. Invoke from terminal :
python VarScan2_format_converter.py -h for usage
eg : python VarScan2_format_converter.py foo.vcf  - for converting vcf files to native
python VarScan2_format_converter.py foo.snp for converting native snp into vcf
Thanks.
$ python VarScan2_format_converter.py LUC8_strigent.basename.snp.Somatic
I get this error.
Traceback (most recent call last):
File "VarScan2_format_converter.py", line 192, in 
vsIp = file(args.input, 'r')
NameError: name 'file' is not defined
Any suggestions?
Ah ! I see.. You're using python > 3. file has been deprecated in higher version. I have changed it.. Should be working now.
Can I use this script both for indel and snp ?
If so, you have just saved my life.
Thanks
Hi,
I have created a repo for this kind of conversions. You can find them here:
https://github.com/PoisonAlien/varscan_accessories
some of my conversion get this error:
Traceback (most recent call last):
File "./VarScan2_format_converter.py", line 199, in 
NativeToVcf(args.input)
File "./VarScan2_format_converter.py", line 178, in NativeToVcf
makeVcfRecord(line.strip())
File "./VarScan2_format_converter.py", line 125, in makeVcfRecord
ssc = -10 * math.log10(float(nIp[14]))
ValueError: math domain error
Thanks for the helpful scripts!
A minor suggestion.
In line 45
"#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NORMAL  TUMOR")
is using space instead of tab, which will lead to an error message below in annovar with option -vcfinput:
"Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NORMAL  TUMOR>"
this code is amazing.
Thanks a lot! Very useful!
I've corrected some (little) bugs that cause some errors related with tabs (@masashi0924) and double alleles --like this: "A/G"-- in alt column (they shold be separated by "," instead "/" according to vcf specifications)... they caused problems in downstream applications.
Modifications here, I hope it helps!
Thank you for your code !
But I found my data went error on treating varScan output record below:
chrM 5021 T C 5500 3 0.05% T 3973 1079 21.36% Y Somatic 1.0 0.0 1741 2232 479 600 2548 2952 2 1
Traceback
(most
recent call last):
File "../VarScan2_format_converter.py", line 199, in
NativeToVcf(args.input)
File "../VarScan2_format_converter.py", line 178, in NativeToVcf
makeVcfRecord(line.strip())
File "../VarScan2_format_converter.py", line 125, in makeVcfRecord
ssc = -10 * math.log10(float(nIp[14]))
ValueError: math domain error
Line 125 does not seem to consider the case where the somatic_p_value is 0. Since this number might not influence later judgement, may I just keep nIp[14] rather put it into math.log10() ?
line 125:
ssc = float(nIp[14])
Sorry if this is a very basic issue. It's my first somatic mutation data.
I download your new script and I get this error:
Traceback (most recent call last):
File "VarScan2_format_converter.py", line 212, in 
NativeToVcf(args.input)
File "VarScan2_format_converter.py", line 191, in NativeToVcf
makeVcfRecord(line.strip())
File "VarScan2_format_converter.py", line 150, in makeVcfRecord
normal_var_freq = float(re.sub("%", "", nIp[6]))
ValueError: invalid literal for float(): 63,64
Embarrassingly naive question but I'm new to this. Can you give an example of how to use the script?