Last active
January 8, 2016 21:29
-
-
Save MikeDacre/5f194cf3f561881c747b to your computer and use it in GitHub Desktop.
Convert simplified bed file(s) plus a reference genome into a vcf file. Formats in source.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Distributed under terms of the MIT license | |
""" | |
================================================================================ | |
FILE: bed2vcf (python 3) | |
AUTHOR: Michael D Dacre, [email protected] | |
ORGANIZATION: Stanford University | |
LICENSE: MIT License, Property of Stanford, Use as you wish | |
VERSION: 0.4 | |
CREATED: 2014-07-09 11:23 | |
Last modified: 2016-01-08 13:29 | |
DESCRIPTION: Convert simplified bed file(s) plus a reference genome into a | |
vcf file. Formats in source. | |
Very memory intensive. Each bed file takes 2GB, each chromosome | |
takes 12GB. Runs one chromosome file at a time, so save memory | |
by spliting large genomes into multiple files. | |
Max memory usage I have observed: 22GB | |
Runtime on mouse, one individual, all chromosomes: 15 minutes | |
Should be python2/3 compatible. | |
================================================================================= | |
""" | |
import sys | |
# Import logging functions from | |
# https://github.com/MikeDacre/fraser-tools/blob/master/mike.py | |
from mike import logme | |
################## | |
# VCF Creation # | |
################## | |
def make_vcf(individuals, chromosomes, phased=0, outfile='', logfile=''): | |
"""Take a dictionary of individuals and an array of chromosome files | |
and make VCF file. | |
Only loads one chromosome file into memory at a time. | |
Individuals is a dictonary of individual_name->list of tuples | |
('Chr', 'location', 'Alleles'). Alleles is a tuple of two alleles. | |
Chromosomes is a list of fasta file names.""" | |
if phased: | |
character = '|' | |
else: | |
character = '/' | |
final_vcf_output = {} | |
all_individuals = set() | |
for i in chromosomes: | |
# Create dictionary from chromosome file | |
logme("Processing chromosome file " + i + "...", logfile, print_level=2) | |
chr = parse_fasta(i) | |
logme("Done", logfile, print_level=2) | |
# Loop through each individual and compare to this chromosome | |
for chr_name, chr_locations in chr.items(): | |
logme("Getting varient calls for chromosome " + chr_name, logfile, print_level=2) | |
ind_snps = {} | |
for ind_name, ind_info_list in individuals.items(): | |
ind_snps[ind_name] = {} | |
for ind_info in ind_info_list: | |
if ind_info[0] == chr_name: | |
ref = chr_locations[ind_info[1]].upper() | |
alt = ref | |
if ind_info[2][0].upper() == ref: | |
allele_1 = '0' | |
else: | |
allele_1 = '1' | |
alt = ind_info[2][0] | |
if ind_info[2][1].upper() == ref: | |
allele_2 = '0' | |
else: | |
allele_2 = '1' | |
alt = ind_info[2][1] | |
ind_snps[ind_name][ind_info[1]] = (ref, alt, (allele_1, allele_2)) | |
print(ref, alt) | |
print(ind_info[1], allele_1, allele_2) | |
logme("Done", logfile, print_level=2) | |
# Dictonary to hold a data structure that will become the vcf file | |
# for this chromosome | |
# location -> (ref, alt, {name1->(a1,a2), ..., name_n}) | |
vcf_output = {} | |
# Loop through individuals and create VCF output | |
logme("Creating VCF output for chromosome " + chr_name, logfile, print_level=2) | |
for name, snp_info in ind_snps.items(): | |
all_individuals.add(name) | |
for location, varient in snp_info.items(): | |
if location in vcf_output: | |
# Check alternate allele is the same | |
if varient[0] == vcf_output[location][0] or varient[0] == vcf_output[location][1]: | |
# Check alternate allele is the same | |
if varient[1] == vcf_output[location][0] or varient[1] == vcf_output[location][1]: | |
# Save varient call | |
vcf_output[location][2][name] = varient[2] | |
else: | |
# If ref and alt are the same, change alt | |
if vcf_output[location][1] == vcf_output[location][0]: | |
vcf_output[location] = (vcf_output[location][0], | |
varient[1], | |
vcf_output[location][2]) | |
# Save varient call | |
vcf_output[location][2][name] = varient[2] | |
else: | |
sys.stderr.write(vcf_output[location] + '\n') | |
sys.stderr.write(varient + '\n') | |
sys.stderr.write( | |
("Individual {} at location {} had a " + | |
"different ALT allele\h").format( | |
name, | |
location)) | |
sys.stderr.write("{}\n{}\n".format( | |
vcf_output[location][0], | |
vcf_output[location][1])) | |
raise Exception | |
else: | |
# If ref and alt are the same, change alt | |
if vcf_output[location][1] == vcf_output[location][0]: | |
vcf_output[location] = (vcf_output[location][0], | |
varient[0], | |
vcf_output[location][2]) | |
# Save varient call | |
vcf_output[location][2][name] = varient[2] | |
else: | |
sys.stderr.write(( | |
"Individual {} at location " + | |
"{} had a different ALT allele\n" + | |
"{}\n{}\n").format(name, location, | |
varient[0], | |
vcf_output[location][0])) | |
raise Exception | |
else: | |
vcf_output[location] = (varient[0], varient[1], {name: varient[2]}) | |
# Add chromosome info to final output dictionary | |
final_vcf_output[chr_name] = vcf_output | |
logme("Done", logfile, print_level=2) | |
# Set outfile | |
logme("\n\nAll processing complete\nPrinting output\n", logfile, print_level=2) | |
if outfile: | |
outfile = open(outfile, 'w') | |
else: | |
outfile = sys.stdout | |
# Print the thing | |
outfile.write("##fileformat=VCFv4.0\n") | |
outfile.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n') | |
outfile.write(("#CHROM\tPOS\tID\tREF\tALT\tQUAL\t" + | |
"FILTER\tINFO\tFORMAT\t{}\n").format( | |
'\t'.join(all_individuals))) | |
all_individuals = frozenset(all_individuals) | |
for chr, v in final_vcf_output.items(): | |
for location, info in v.items(): | |
print_string = (chr, str(location), info[0], info[1]) | |
for individual in all_individuals: | |
if individual in info[2]: | |
print_string = print_string + (character.join(info[2][individual]),) | |
else: | |
print_string = print_string + "." | |
print_string = '\t'.join([print_string[0], print_string[1], | |
'.', print_string[2], print_string[3], | |
".\t.\t.\tGT", | |
'\t'.join(print_string[4:])]) | |
outfile.write(print_string + '\n') | |
logme("Operation completed successfully", logfile, print_level=2) | |
################## | |
# File parsing # | |
################## | |
def parse_bedfile(bedfile): | |
"""Load a bed file into memory as list of tuples | |
('Chr', 'location', 'Alleles')""" | |
alleles = [] | |
with open(bedfile) as infile: | |
for line in infile: | |
fields = line.rstrip().split('\t') | |
if len(fields[3]) == 3: | |
alt = (fields[3][0], fields[3][2]) | |
alleles.append((fields[0], int(fields[1]), alt)) | |
return alleles | |
def parse_fasta(genome_file): | |
"""Parse a genome into a tuple of bases. Requires approximately 12GB | |
of memory per chromosome in mouse. | |
Base 0.""" | |
from re import sub | |
current_chr = '' | |
chromosomes = {} | |
c = [] | |
with open(genome_file) as infile: | |
for line in infile: | |
if line.startswith('>'): | |
if c: | |
chromosomes[current_chr] = tuple(c) | |
current_chr = sub(r'^>', '', line.rstrip()) | |
chromosomes[current_chr] = () | |
else: | |
for i in line.rstrip(): | |
c.append(i) | |
if c: | |
chromosomes[current_chr] = tuple(c) | |
return chromosomes | |
################## | |
# File formats # | |
################## | |
vcf_format = """##fileformat=VCFv4.0 | |
##INFO=<ID=REF_COUNT,Number=1,Type=String,Description="# of mapped reads with reference allele"> | |
##INFO=<ID=ALT_COUNT,Number=1,Type=String,Description="# of mapped reads with alernative allele"> | |
##INFO=<ID=JunD,Number=0,Type=Flag,Description="ChIP sequencing, JunD peak"> | |
##INFO=<ID=Max,Number=0,Type=Flag,Description="ChIP sequencing, JMax peak"> | |
##INFO=<ID=NfkB,Number=0,Type=Flag,Description="ChIP sequencing, NfkB peak"> | |
##INFO=<ID=Pol_II,Number=0,Type=Flag,Description="ChIP sequencing, Pol_II peak"> | |
##INFO=<ID=Pol_III,Number=0,Type=Flag,Description="ChIP sequencing, Pol_III peak"> | |
##INFO=<ID=cFos,Number=0,Type=Flag,Description="ChIP sequencing, cFox peak"> | |
##INFO=<ID=cMyc,Number=0,Type=Flag,Description="ChIP sequencing, cMyc peak"> | |
##INFO=<ID=RNAseq,Number=0,Type=Flag,Description="RNA sequencing"> | |
##reference=NCBI36/hg18 | |
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 | |
1 37928930 . G A . . cFos;REF_COUNT=19;ALT_COUNT=4 GT 0/1 | |
1 37929014 . T C . . cFos;REF_COUNT=3;ALT_COUNT=15 GT 0/1 | |
1 52941430 . T C . . cFos;REF_COUNT=2;ALT_COUNT=13 GT 1|0 | |
1 111544364 . A G . . cFos;REF_COUNT=2;ALT_COUNT=38 GT 1|0 | |
2 66318134 . C A . . cFos;REF_COUNT=0;ALT_COUNT=40 GT 0/1 | |
2 154600656 . G T . . cFos;REF_COUNT=1;ALT_COUNT=57 GT 0|1 | |
4 48785218 . T C . . cFos;REF_COUNT=1;ALT_COUNT=129 GT 0|1 | |
4 185466260 . C T . . cFos;REF_COUNT=8;ALT_COUNT=0 GT 0|1 | |
5 171780279 . T C . . cFos;REF_COUNT=1;ALT_COUNT=14 GT 1|0 | |
5 178919238 . C A . . cFos;REF_COUNT=0;ALT_COUNT=12 GT 0|1 | |
6 31432983 . G A . . cFos;REF_COUNT=42;ALT_COUNT=21 GT 0|1 | |
6 31433069 . C T . . cFos;REF_COUNT=28;ALT_COUNT=9 GT 0|1 | |
6 32665754 . C T . . cFos;REF_COUNT=2;ALT_COUNT=14 GT 0/1 | |
6 32703852 . C T . . cFos;REF_COUNT=13;ALT_COUNT=2 GT 1|0 | |
7 137337589 . T G . . cFos;REF_COUNT=47;ALT_COUNT=24 GT 1|0 | |
9 126021036 . A G . . cFos;REF_COUNT=1;ALT_COUNT=10 GT 0/1 | |
9 134895878 . C T . . cFos;REF_COUNT=43;ALT_COUNT=12 GT 0|1 | |
""" | |
bed_format = """chr1 3000714 3000715 T|G | |
chr1 3001065 3001066 G|T | |
chr1 3001110 3001111 G|C | |
chr1 3001131 3001132 G|A | |
chr1 3001273 3001274 T|C | |
chr1 3001291 3001292 T|C | |
chr1 3002158 3002159 C|A | |
chr1 3002205 3002206 T|A | |
chr1 3002386 3002387 G|T | |
chr1 3002642 3002643 A|G | |
chr1 3002673 3002674 G|A | |
chr1 3002800 3002801 A|T | |
chr1 3003516 3003517 G|T | |
chr1 3003697 3003698 T|G | |
chr1 3003790 3003791 T|C | |
chr1 3004508 3004509 T|G | |
chr1 3004699 3004700 G|A | |
chr1 3004757 3004758 C|A | |
chr1 3005791 3005792 C|A | |
chr1 3009094 3009095 G|T""" | |
################################################################################ | |
# Run as a script # | |
################################################################################ | |
def _get_args(): | |
""" Command Line Argument Parsing """ | |
import argparse | |
parser = argparse.ArgumentParser( | |
description=__doc__, | |
formatter_class=argparse.RawDescriptionHelpFormatter) | |
# Optional Arguments | |
parser.add_argument('--bed_format', action='store_true', | |
help="Print bed file format to STDOUT") | |
parser.add_argument('--vcf_format', action='store_true', | |
help="Print vcf file format to STDOUT") | |
parser.add_argument('--phased', action='store_true', | |
help="Are the genotypes in the bed files phased?") | |
# Bed Files | |
parser.add_argument('bedfiles', nargs='+', | |
help="BED File(s)") | |
# Genome file(s) | |
parser.add_argument('--chr', nargs='+', | |
help="Reference Genome, multiple files fine") | |
# Names | |
parser.add_argument('--names', nargs='+', | |
help="Names of individuals in bed files. Position is essential") | |
# Optional Files | |
parser.add_argument('-o', '--vcf', nargs='?', default='', | |
help="VCF output file, Default STDOUT") | |
parser.add_argument('-l', '--logfile', nargs='?', default='', | |
help="Log File, Default STDERR (append mode)") | |
return parser | |
def main(): | |
"""Run directly""" | |
from re import sub | |
# Get commandline arguments | |
parser = _get_args() | |
args = parser.parse_args() | |
# Print format help | |
if args.bed_format: | |
print("BED files should be for single individuals and should have the format:") | |
print("Columns:\n", | |
"1) chr\n", | |
"2) start base (zero based)\n", | |
"3) end base\n", | |
"4) reference base|indiviual's base)") | |
print("Note: The only columns of import are column 1, column 2, and the alt allele in column 4") | |
print("Example file:") | |
print(bed_format) | |
sys.exit(0) | |
if args.vcf_format: | |
print("VCF file will follow standard VCF format with the following columns:") | |
print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tIND1[\tIND2\t...\tINDn]") | |
print("Example file:") | |
print(vcf_format) | |
sys.exit(0) | |
# Make sure that there are reference files | |
if not args.chr: | |
parser.print_help() | |
print("Reference genome required ('--chr')") | |
sys.exit(1) | |
# Set Names | |
named_files = {} | |
if args.names: | |
if not len(args.names) == len(args.bedfiles): | |
print("Number of names does not equal number of bedfiles") | |
sys.exit(2) | |
count = len(args.bedfiles) | |
while count: | |
named_files[args.names[count - 1]] = args.bedfiles[count - 1] | |
count = count - 1 | |
else: | |
for i in args.bedfiles: | |
named_files[sub(r'\.bed$', '', i)] = i | |
# Load bed files into memory | |
individuals = {} | |
for k, v in named_files.items(): | |
logme("Processing individual " + k + "'s bed file...", args.logfile, print_level=2) | |
individuals[k] = parse_bedfile(v) | |
# Travel through the chromosomes and create the VCF file | |
make_vcf(individuals=individuals, chromosomes=args.chr, phased=args.phased, outfile=args.vcf, logfile=args.logfile) | |
# The end | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment