Skip to content

Instantly share code, notes, and snippets.

@MikeDacre
Last active January 8, 2016 21:29
Show Gist options
  • Save MikeDacre/5f194cf3f561881c747b to your computer and use it in GitHub Desktop.
Save MikeDacre/5f194cf3f561881c747b to your computer and use it in GitHub Desktop.
Convert simplified bed file(s) plus a reference genome into a vcf file. Formats in source.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Distributed under terms of the MIT license
"""
================================================================================
FILE: bed2vcf (python 3)
AUTHOR: Michael D Dacre, [email protected]
ORGANIZATION: Stanford University
LICENSE: MIT License, Property of Stanford, Use as you wish
VERSION: 0.4
CREATED: 2014-07-09 11:23
Last modified: 2016-01-08 13:29
DESCRIPTION: Convert simplified bed file(s) plus a reference genome into a
vcf file. Formats in source.
Very memory intensive. Each bed file takes 2GB, each chromosome
takes 12GB. Runs one chromosome file at a time, so save memory
by spliting large genomes into multiple files.
Max memory usage I have observed: 22GB
Runtime on mouse, one individual, all chromosomes: 15 minutes
Should be python2/3 compatible.
=================================================================================
"""
import sys
# Import logging functions from
# https://github.com/MikeDacre/fraser-tools/blob/master/mike.py
from mike import logme
##################
# VCF Creation #
##################
def make_vcf(individuals, chromosomes, phased=0, outfile='', logfile=''):
"""Take a dictionary of individuals and an array of chromosome files
and make VCF file.
Only loads one chromosome file into memory at a time.
Individuals is a dictonary of individual_name->list of tuples
('Chr', 'location', 'Alleles'). Alleles is a tuple of two alleles.
Chromosomes is a list of fasta file names."""
if phased:
character = '|'
else:
character = '/'
final_vcf_output = {}
all_individuals = set()
for i in chromosomes:
# Create dictionary from chromosome file
logme("Processing chromosome file " + i + "...", logfile, print_level=2)
chr = parse_fasta(i)
logme("Done", logfile, print_level=2)
# Loop through each individual and compare to this chromosome
for chr_name, chr_locations in chr.items():
logme("Getting varient calls for chromosome " + chr_name, logfile, print_level=2)
ind_snps = {}
for ind_name, ind_info_list in individuals.items():
ind_snps[ind_name] = {}
for ind_info in ind_info_list:
if ind_info[0] == chr_name:
ref = chr_locations[ind_info[1]].upper()
alt = ref
if ind_info[2][0].upper() == ref:
allele_1 = '0'
else:
allele_1 = '1'
alt = ind_info[2][0]
if ind_info[2][1].upper() == ref:
allele_2 = '0'
else:
allele_2 = '1'
alt = ind_info[2][1]
ind_snps[ind_name][ind_info[1]] = (ref, alt, (allele_1, allele_2))
print(ref, alt)
print(ind_info[1], allele_1, allele_2)
logme("Done", logfile, print_level=2)
# Dictonary to hold a data structure that will become the vcf file
# for this chromosome
# location -> (ref, alt, {name1->(a1,a2), ..., name_n})
vcf_output = {}
# Loop through individuals and create VCF output
logme("Creating VCF output for chromosome " + chr_name, logfile, print_level=2)
for name, snp_info in ind_snps.items():
all_individuals.add(name)
for location, varient in snp_info.items():
if location in vcf_output:
# Check alternate allele is the same
if varient[0] == vcf_output[location][0] or varient[0] == vcf_output[location][1]:
# Check alternate allele is the same
if varient[1] == vcf_output[location][0] or varient[1] == vcf_output[location][1]:
# Save varient call
vcf_output[location][2][name] = varient[2]
else:
# If ref and alt are the same, change alt
if vcf_output[location][1] == vcf_output[location][0]:
vcf_output[location] = (vcf_output[location][0],
varient[1],
vcf_output[location][2])
# Save varient call
vcf_output[location][2][name] = varient[2]
else:
sys.stderr.write(vcf_output[location] + '\n')
sys.stderr.write(varient + '\n')
sys.stderr.write(
("Individual {} at location {} had a " +
"different ALT allele\h").format(
name,
location))
sys.stderr.write("{}\n{}\n".format(
vcf_output[location][0],
vcf_output[location][1]))
raise Exception
else:
# If ref and alt are the same, change alt
if vcf_output[location][1] == vcf_output[location][0]:
vcf_output[location] = (vcf_output[location][0],
varient[0],
vcf_output[location][2])
# Save varient call
vcf_output[location][2][name] = varient[2]
else:
sys.stderr.write((
"Individual {} at location " +
"{} had a different ALT allele\n" +
"{}\n{}\n").format(name, location,
varient[0],
vcf_output[location][0]))
raise Exception
else:
vcf_output[location] = (varient[0], varient[1], {name: varient[2]})
# Add chromosome info to final output dictionary
final_vcf_output[chr_name] = vcf_output
logme("Done", logfile, print_level=2)
# Set outfile
logme("\n\nAll processing complete\nPrinting output\n", logfile, print_level=2)
if outfile:
outfile = open(outfile, 'w')
else:
outfile = sys.stdout
# Print the thing
outfile.write("##fileformat=VCFv4.0\n")
outfile.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
outfile.write(("#CHROM\tPOS\tID\tREF\tALT\tQUAL\t" +
"FILTER\tINFO\tFORMAT\t{}\n").format(
'\t'.join(all_individuals)))
all_individuals = frozenset(all_individuals)
for chr, v in final_vcf_output.items():
for location, info in v.items():
print_string = (chr, str(location), info[0], info[1])
for individual in all_individuals:
if individual in info[2]:
print_string = print_string + (character.join(info[2][individual]),)
else:
print_string = print_string + "."
print_string = '\t'.join([print_string[0], print_string[1],
'.', print_string[2], print_string[3],
".\t.\t.\tGT",
'\t'.join(print_string[4:])])
outfile.write(print_string + '\n')
logme("Operation completed successfully", logfile, print_level=2)
##################
# File parsing #
##################
def parse_bedfile(bedfile):
"""Load a bed file into memory as list of tuples
('Chr', 'location', 'Alleles')"""
alleles = []
with open(bedfile) as infile:
for line in infile:
fields = line.rstrip().split('\t')
if len(fields[3]) == 3:
alt = (fields[3][0], fields[3][2])
alleles.append((fields[0], int(fields[1]), alt))
return alleles
def parse_fasta(genome_file):
"""Parse a genome into a tuple of bases. Requires approximately 12GB
of memory per chromosome in mouse.
Base 0."""
from re import sub
current_chr = ''
chromosomes = {}
c = []
with open(genome_file) as infile:
for line in infile:
if line.startswith('>'):
if c:
chromosomes[current_chr] = tuple(c)
current_chr = sub(r'^>', '', line.rstrip())
chromosomes[current_chr] = ()
else:
for i in line.rstrip():
c.append(i)
if c:
chromosomes[current_chr] = tuple(c)
return chromosomes
##################
# File formats #
##################
vcf_format = """##fileformat=VCFv4.0
##INFO=<ID=REF_COUNT,Number=1,Type=String,Description="# of mapped reads with reference allele">
##INFO=<ID=ALT_COUNT,Number=1,Type=String,Description="# of mapped reads with alernative allele">
##INFO=<ID=JunD,Number=0,Type=Flag,Description="ChIP sequencing, JunD peak">
##INFO=<ID=Max,Number=0,Type=Flag,Description="ChIP sequencing, JMax peak">
##INFO=<ID=NfkB,Number=0,Type=Flag,Description="ChIP sequencing, NfkB peak">
##INFO=<ID=Pol_II,Number=0,Type=Flag,Description="ChIP sequencing, Pol_II peak">
##INFO=<ID=Pol_III,Number=0,Type=Flag,Description="ChIP sequencing, Pol_III peak">
##INFO=<ID=cFos,Number=0,Type=Flag,Description="ChIP sequencing, cFox peak">
##INFO=<ID=cMyc,Number=0,Type=Flag,Description="ChIP sequencing, cMyc peak">
##INFO=<ID=RNAseq,Number=0,Type=Flag,Description="RNA sequencing">
##reference=NCBI36/hg18
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
1 37928930 . G A . . cFos;REF_COUNT=19;ALT_COUNT=4 GT 0/1
1 37929014 . T C . . cFos;REF_COUNT=3;ALT_COUNT=15 GT 0/1
1 52941430 . T C . . cFos;REF_COUNT=2;ALT_COUNT=13 GT 1|0
1 111544364 . A G . . cFos;REF_COUNT=2;ALT_COUNT=38 GT 1|0
2 66318134 . C A . . cFos;REF_COUNT=0;ALT_COUNT=40 GT 0/1
2 154600656 . G T . . cFos;REF_COUNT=1;ALT_COUNT=57 GT 0|1
4 48785218 . T C . . cFos;REF_COUNT=1;ALT_COUNT=129 GT 0|1
4 185466260 . C T . . cFos;REF_COUNT=8;ALT_COUNT=0 GT 0|1
5 171780279 . T C . . cFos;REF_COUNT=1;ALT_COUNT=14 GT 1|0
5 178919238 . C A . . cFos;REF_COUNT=0;ALT_COUNT=12 GT 0|1
6 31432983 . G A . . cFos;REF_COUNT=42;ALT_COUNT=21 GT 0|1
6 31433069 . C T . . cFos;REF_COUNT=28;ALT_COUNT=9 GT 0|1
6 32665754 . C T . . cFos;REF_COUNT=2;ALT_COUNT=14 GT 0/1
6 32703852 . C T . . cFos;REF_COUNT=13;ALT_COUNT=2 GT 1|0
7 137337589 . T G . . cFos;REF_COUNT=47;ALT_COUNT=24 GT 1|0
9 126021036 . A G . . cFos;REF_COUNT=1;ALT_COUNT=10 GT 0/1
9 134895878 . C T . . cFos;REF_COUNT=43;ALT_COUNT=12 GT 0|1
"""
bed_format = """chr1 3000714 3000715 T|G
chr1 3001065 3001066 G|T
chr1 3001110 3001111 G|C
chr1 3001131 3001132 G|A
chr1 3001273 3001274 T|C
chr1 3001291 3001292 T|C
chr1 3002158 3002159 C|A
chr1 3002205 3002206 T|A
chr1 3002386 3002387 G|T
chr1 3002642 3002643 A|G
chr1 3002673 3002674 G|A
chr1 3002800 3002801 A|T
chr1 3003516 3003517 G|T
chr1 3003697 3003698 T|G
chr1 3003790 3003791 T|C
chr1 3004508 3004509 T|G
chr1 3004699 3004700 G|A
chr1 3004757 3004758 C|A
chr1 3005791 3005792 C|A
chr1 3009094 3009095 G|T"""
################################################################################
# Run as a script #
################################################################################
def _get_args():
""" Command Line Argument Parsing """
import argparse
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
# Optional Arguments
parser.add_argument('--bed_format', action='store_true',
help="Print bed file format to STDOUT")
parser.add_argument('--vcf_format', action='store_true',
help="Print vcf file format to STDOUT")
parser.add_argument('--phased', action='store_true',
help="Are the genotypes in the bed files phased?")
# Bed Files
parser.add_argument('bedfiles', nargs='+',
help="BED File(s)")
# Genome file(s)
parser.add_argument('--chr', nargs='+',
help="Reference Genome, multiple files fine")
# Names
parser.add_argument('--names', nargs='+',
help="Names of individuals in bed files. Position is essential")
# Optional Files
parser.add_argument('-o', '--vcf', nargs='?', default='',
help="VCF output file, Default STDOUT")
parser.add_argument('-l', '--logfile', nargs='?', default='',
help="Log File, Default STDERR (append mode)")
return parser
def main():
"""Run directly"""
from re import sub
# Get commandline arguments
parser = _get_args()
args = parser.parse_args()
# Print format help
if args.bed_format:
print("BED files should be for single individuals and should have the format:")
print("Columns:\n",
"1) chr\n",
"2) start base (zero based)\n",
"3) end base\n",
"4) reference base|indiviual's base)")
print("Note: The only columns of import are column 1, column 2, and the alt allele in column 4")
print("Example file:")
print(bed_format)
sys.exit(0)
if args.vcf_format:
print("VCF file will follow standard VCF format with the following columns:")
print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tIND1[\tIND2\t...\tINDn]")
print("Example file:")
print(vcf_format)
sys.exit(0)
# Make sure that there are reference files
if not args.chr:
parser.print_help()
print("Reference genome required ('--chr')")
sys.exit(1)
# Set Names
named_files = {}
if args.names:
if not len(args.names) == len(args.bedfiles):
print("Number of names does not equal number of bedfiles")
sys.exit(2)
count = len(args.bedfiles)
while count:
named_files[args.names[count - 1]] = args.bedfiles[count - 1]
count = count - 1
else:
for i in args.bedfiles:
named_files[sub(r'\.bed$', '', i)] = i
# Load bed files into memory
individuals = {}
for k, v in named_files.items():
logme("Processing individual " + k + "'s bed file...", args.logfile, print_level=2)
individuals[k] = parse_bedfile(v)
# Travel through the chromosomes and create the VCF file
make_vcf(individuals=individuals, chromosomes=args.chr, phased=args.phased, outfile=args.vcf, logfile=args.logfile)
# The end
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment