MikeDacre · January 8, 2016 21:29
diff --git a/bed2vcf.py b/bed2vcf.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Distributed under terms of the MIT license
 """
 ================================================================================

          FILE: bed2vcf (python 3)
        AUTHOR: Michael D Dacre, [email protected]
  ORGANIZATION: Stanford University
       LICENSE: MIT License, Property of Stanford, Use as you wish
       VERSION: 0.4
       CREATED: 2014-07-09 11:23
 Last modified: 2016-01-08 13:29

   DESCRIPTION: Convert simplified bed file(s) plus a reference genome into a
                vcf file. Formats in source.

                Very memory intensive. Each bed file takes 2GB, each chromosome
                takes 12GB. Runs one chromosome file at a time, so save memory
                by spliting large genomes into multiple files.
                Max memory usage I have observed: 22GB

                Runtime on mouse, one individual, all chromosomes: 15 minutes

                Should be python2/3 compatible.

 =================================================================================
 """
 import sys
 # Import logging functions from
 # https://github.com/MikeDacre/fraser-tools/blob/master/mike.py
 from mike import logme

 ##################
 #  VCF Creation  #
 ##################


 def make_vcf(individuals, chromosomes, phased=0, outfile='', logfile=''):
    """Take a dictionary of individuals and an array of chromosome files
       and make VCF file.
       Only loads one chromosome file into memory at a time.

       Individuals is a dictonary of individual_name->list of tuples
       ('Chr', 'location', 'Alleles'). Alleles is a tuple of two alleles.

       Chromosomes is a list of fasta file names."""

    if phased:
        character = '|'
    else:
        character = '/'

    final_vcf_output = {}
    all_individuals = set()

    for i in chromosomes:
        # Create dictionary from chromosome file
        logme("Processing chromosome file " + i + "...", logfile, print_level=2)
        chr = parse_fasta(i)
        logme("Done", logfile, print_level=2)

        # Loop through each individual and compare to this chromosome
        for chr_name, chr_locations in chr.items():
            logme("Getting varient calls for chromosome " + chr_name, logfile, print_level=2)
            ind_snps = {}
            for ind_name, ind_info_list in individuals.items():
                ind_snps[ind_name] = {}
                for ind_info in ind_info_list:
                    if ind_info[0] == chr_name:
                        ref = chr_locations[ind_info[1]].upper()
                        alt = ref
                        if ind_info[2][0].upper() == ref:
                            allele_1 = '0'
                        else:
                            allele_1 = '1'
                            alt = ind_info[2][0]

                        if ind_info[2][1].upper() == ref:
                            allele_2 = '0'
                        else:
                            allele_2 = '1'
                            alt = ind_info[2][1]

                        ind_snps[ind_name][ind_info[1]] = (ref, alt, (allele_1, allele_2))
                        print(ref, alt)
                        print(ind_info[1], allele_1, allele_2)
            logme("Done", logfile, print_level=2)

            # Dictonary to hold a data structure that will become the vcf file
            # for this chromosome
            # location -> (ref, alt, {name1->(a1,a2), ..., name_n})
            vcf_output = {}

            # Loop through individuals and create VCF output
            logme("Creating VCF output for chromosome " + chr_name, logfile, print_level=2)
            for name, snp_info in ind_snps.items():
                all_individuals.add(name)
                for location, varient in snp_info.items():
                    if location in vcf_output:
                        # Check alternate allele is the same
                        if varient[0] == vcf_output[location][0] or varient[0] == vcf_output[location][1]:
                            # Check alternate allele is the same
                            if varient[1] == vcf_output[location][0] or varient[1] == vcf_output[location][1]:
                                # Save varient call
                                vcf_output[location][2][name] = varient[2]
                            else:
                                # If ref and alt are the same, change alt
                                if vcf_output[location][1] == vcf_output[location][0]:
                                    vcf_output[location] = (vcf_output[location][0],
                                                            varient[1],
                                                            vcf_output[location][2])
                                    # Save varient call
                                    vcf_output[location][2][name] = varient[2]
                                else:
                                    sys.stderr.write(vcf_output[location] + '\n')
                                    sys.stderr.write(varient + '\n')
                                    sys.stderr.write(
                                        ("Individual {} at location {} had a " +
                                         "different ALT allele\h").format(
                                             name,
                                             location))
                                    sys.stderr.write("{}\n{}\n".format(
                                        vcf_output[location][0],
                                        vcf_output[location][1]))
                                    raise Exception
                        else:
                            # If ref and alt are the same, change alt
                            if vcf_output[location][1] == vcf_output[location][0]:
                                vcf_output[location] = (vcf_output[location][0],
                                                        varient[0],
                                                        vcf_output[location][2])
                                # Save varient call
                                vcf_output[location][2][name] = varient[2]
                            else:
                                sys.stderr.write((
                                    "Individual {} at location " +
                                    "{} had a different ALT allele\n" +
                                    "{}\n{}\n").format(name, location,
                                                       varient[0],
                                                       vcf_output[location][0]))
                                raise Exception
                    else:
                        vcf_output[location] = (varient[0], varient[1], {name: varient[2]})

            # Add chromosome info to final output dictionary
            final_vcf_output[chr_name] = vcf_output
            logme("Done", logfile, print_level=2)

    # Set outfile
    logme("\n\nAll processing complete\nPrinting output\n", logfile, print_level=2)
    if outfile:
        outfile = open(outfile, 'w')
    else:
        outfile = sys.stdout
    # Print the thing
    outfile.write("##fileformat=VCFv4.0\n")
    outfile.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
    outfile.write(("#CHROM\tPOS\tID\tREF\tALT\tQUAL\t" +
                   "FILTER\tINFO\tFORMAT\t{}\n").format(
                       '\t'.join(all_individuals)))
    all_individuals = frozenset(all_individuals)
    for chr, v in final_vcf_output.items():
        for location, info in v.items():
            print_string = (chr, str(location), info[0], info[1])
            for individual in all_individuals:
                if individual in info[2]:
                    print_string = print_string + (character.join(info[2][individual]),)
                else:
                    print_string = print_string + "."

            print_string = '\t'.join([print_string[0], print_string[1],
                                      '.', print_string[2], print_string[3],
                                      ".\t.\t.\tGT",
                                      '\t'.join(print_string[4:])])
            outfile.write(print_string + '\n')

    logme("Operation completed successfully", logfile, print_level=2)

 ##################
 #  File parsing  #
 ##################


 def parse_bedfile(bedfile):
    """Load a bed file into memory as list of tuples
       ('Chr', 'location', 'Alleles')"""

    alleles = []
    with open(bedfile) as infile:
        for line in infile:
            fields = line.rstrip().split('\t')
            if len(fields[3]) == 3:
                alt = (fields[3][0], fields[3][2])

            alleles.append((fields[0], int(fields[1]), alt))

    return alleles


 def parse_fasta(genome_file):
    """Parse a genome into a tuple of bases. Requires approximately 12GB
       of memory per chromosome in mouse.

       Base 0."""
    from re import sub

    current_chr = ''
    chromosomes = {}
    c = []
    with open(genome_file) as infile:
        for line in infile:
            if line.startswith('>'):
                if c:
                    chromosomes[current_chr] = tuple(c)
                current_chr = sub(r'^>', '', line.rstrip())
                chromosomes[current_chr] = ()
            else:
                for i in line.rstrip():
                    c.append(i)
        if c:
            chromosomes[current_chr] = tuple(c)
    return chromosomes

 ##################
 #  File formats  #
 ##################

 vcf_format = """##fileformat=VCFv4.0
 ##INFO=<ID=REF_COUNT,Number=1,Type=String,Description="# of mapped reads with reference allele">
 ##INFO=<ID=ALT_COUNT,Number=1,Type=String,Description="# of mapped reads with alernative allele">
 ##INFO=<ID=JunD,Number=0,Type=Flag,Description="ChIP sequencing, JunD peak">
 ##INFO=<ID=Max,Number=0,Type=Flag,Description="ChIP sequencing, JMax peak">
 ##INFO=<ID=NfkB,Number=0,Type=Flag,Description="ChIP sequencing, NfkB peak">
 ##INFO=<ID=Pol_II,Number=0,Type=Flag,Description="ChIP sequencing, Pol_II peak">
 ##INFO=<ID=Pol_III,Number=0,Type=Flag,Description="ChIP sequencing, Pol_III peak">
 ##INFO=<ID=cFos,Number=0,Type=Flag,Description="ChIP sequencing, cFox peak">
 ##INFO=<ID=cMyc,Number=0,Type=Flag,Description="ChIP sequencing, cMyc peak">
 ##INFO=<ID=RNAseq,Number=0,Type=Flag,Description="RNA sequencing">
 ##reference=NCBI36/hg18
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878
 1	37928930	.	G	A	.	.	cFos;REF_COUNT=19;ALT_COUNT=4	GT	0/1
 1	37929014	.	T	C	.	.	cFos;REF_COUNT=3;ALT_COUNT=15	GT	0/1
 1	52941430	.	T	C	.	.	cFos;REF_COUNT=2;ALT_COUNT=13	GT	1|0
 1	111544364	.	A	G	.	.	cFos;REF_COUNT=2;ALT_COUNT=38	GT	1|0
 2	66318134	.	C	A	.	.	cFos;REF_COUNT=0;ALT_COUNT=40	GT	0/1
 2	154600656	.	G	T	.	.	cFos;REF_COUNT=1;ALT_COUNT=57	GT	0|1
 4	48785218	.	T	C	.	.	cFos;REF_COUNT=1;ALT_COUNT=129	GT	0|1
 4	185466260	.	C	T	.	.	cFos;REF_COUNT=8;ALT_COUNT=0	GT	0|1
 5	171780279	.	T	C	.	.	cFos;REF_COUNT=1;ALT_COUNT=14	GT	1|0
 5	178919238	.	C	A	.	.	cFos;REF_COUNT=0;ALT_COUNT=12	GT	0|1
 6	31432983	.	G	A	.	.	cFos;REF_COUNT=42;ALT_COUNT=21	GT	0|1
 6	31433069	.	C	T	.	.	cFos;REF_COUNT=28;ALT_COUNT=9	GT	0|1
 6	32665754	.	C	T	.	.	cFos;REF_COUNT=2;ALT_COUNT=14	GT	0/1
 6	32703852	.	C	T	.	.	cFos;REF_COUNT=13;ALT_COUNT=2	GT	1|0
 7	137337589	.	T	G	.	.	cFos;REF_COUNT=47;ALT_COUNT=24	GT	1|0
 9	126021036	.	A	G	.	.	cFos;REF_COUNT=1;ALT_COUNT=10	GT	0/1
 9	134895878	.	C	T	.	.	cFos;REF_COUNT=43;ALT_COUNT=12	GT	0|1
 """

 bed_format = """chr1	3000714	3000715	T|G
 chr1	3001065	3001066	G|T
 chr1	3001110	3001111	G|C
 chr1	3001131	3001132	G|A
 chr1	3001273	3001274	T|C
 chr1	3001291	3001292	T|C
 chr1	3002158	3002159	C|A
 chr1	3002205	3002206	T|A
 chr1	3002386	3002387	G|T
 chr1	3002642	3002643	A|G
 chr1	3002673	3002674	G|A
 chr1	3002800	3002801	A|T
 chr1	3003516	3003517	G|T
 chr1	3003697	3003698	T|G
 chr1	3003790	3003791	T|C
 chr1	3004508	3004509	T|G
 chr1	3004699	3004700	G|A
 chr1	3004757	3004758	C|A
 chr1	3005791	3005792	C|A
 chr1	3009094	3009095	G|T"""

 ################################################################################
 #                               Run as a script                                #
 ################################################################################


 def _get_args():
    """ Command Line Argument Parsing """
    import argparse

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Optional Arguments
    parser.add_argument('--bed_format', action='store_true',
                        help="Print bed file format to STDOUT")
    parser.add_argument('--vcf_format', action='store_true',
                        help="Print vcf file format to STDOUT")
    parser.add_argument('--phased', action='store_true',
                        help="Are the genotypes in the bed files phased?")

    # Bed Files
    parser.add_argument('bedfiles', nargs='+',
                        help="BED File(s)")

    # Genome file(s)
    parser.add_argument('--chr', nargs='+',
                        help="Reference Genome, multiple files fine")

    # Names
    parser.add_argument('--names', nargs='+',
                        help="Names of individuals in bed files. Position is essential")

    # Optional Files
    parser.add_argument('-o', '--vcf', nargs='?', default='',
                        help="VCF output file, Default STDOUT")
    parser.add_argument('-l', '--logfile', nargs='?', default='',
                        help="Log File, Default STDERR (append mode)")

    return parser


 def main():
    """Run directly"""
    from re import sub

    # Get commandline arguments
    parser = _get_args()
    args = parser.parse_args()

    # Print format help
    if args.bed_format:
        print("BED files should be for single individuals and should have the format:")
        print("Columns:\n",
              "1) chr\n",
              "2) start base (zero based)\n",
              "3) end base\n",
              "4) reference base|indiviual's base)")
        print("Note: The only columns of import are column 1, column 2, and the alt allele in column 4")
        print("Example file:")
        print(bed_format)
        sys.exit(0)

    if args.vcf_format:
        print("VCF file will follow standard VCF format with the following columns:")
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tIND1[\tIND2\t...\tINDn]")
        print("Example file:")
        print(vcf_format)
        sys.exit(0)

    # Make sure that there are reference files
    if not args.chr:
        parser.print_help()
        print("Reference genome required ('--chr')")
        sys.exit(1)

    # Set Names
    named_files = {}
    if args.names:
        if not len(args.names) == len(args.bedfiles):
            print("Number of names does not equal number of bedfiles")
            sys.exit(2)
        count = len(args.bedfiles)
        while count:
            named_files[args.names[count - 1]] = args.bedfiles[count - 1]
            count = count - 1
    else:
        for i in args.bedfiles:
            named_files[sub(r'\.bed$', '', i)] = i

    # Load bed files into memory
    individuals = {}
    for k, v in named_files.items():
        logme("Processing individual " + k + "'s bed file...", args.logfile, print_level=2)
        individuals[k] = parse_bedfile(v)

    # Travel through the chromosomes and create the VCF file
    make_vcf(individuals=individuals, chromosomes=args.chr, phased=args.phased, outfile=args.vcf, logfile=args.logfile)

 # The end
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
	# -- coding: utf-8 --
	# Distributed under terms of the MIT license
	"""
	================================================================================

	FILE: bed2vcf (python 3)
	AUTHOR: Michael D Dacre, [email protected]
	ORGANIZATION: Stanford University
	LICENSE: MIT License, Property of Stanford, Use as you wish
	VERSION: 0.4
	CREATED: 2014-07-09 11:23
	Last modified: 2016-01-08 13:29

	DESCRIPTION: Convert simplified bed file(s) plus a reference genome into a
	vcf file. Formats in source.

	Very memory intensive. Each bed file takes 2GB, each chromosome
	takes 12GB. Runs one chromosome file at a time, so save memory
	by spliting large genomes into multiple files.
	Max memory usage I have observed: 22GB

	Runtime on mouse, one individual, all chromosomes: 15 minutes

	Should be python2/3 compatible.

	=================================================================================
	"""
	import sys
	# Import logging functions from
	# https://github.com/MikeDacre/fraser-tools/blob/master/mike.py
	from mike import logme

	##################
	# VCF Creation #
	##################


	def make_vcf(individuals, chromosomes, phased=0, outfile='', logfile=''):
	"""Take a dictionary of individuals and an array of chromosome files
	and make VCF file.
	Only loads one chromosome file into memory at a time.

	Individuals is a dictonary of individual_name->list of tuples
	('Chr', 'location', 'Alleles'). Alleles is a tuple of two alleles.

	Chromosomes is a list of fasta file names."""

	if phased:
	character = '\|'
	else:
	character = '/'

	final_vcf_output = {}
	all_individuals = set()

	for i in chromosomes:
	# Create dictionary from chromosome file
	logme("Processing chromosome file " + i + "...", logfile, print_level=2)
	chr = parse_fasta(i)
	logme("Done", logfile, print_level=2)

	# Loop through each individual and compare to this chromosome
	for chr_name, chr_locations in chr.items():
	logme("Getting varient calls for chromosome " + chr_name, logfile, print_level=2)
	ind_snps = {}
	for ind_name, ind_info_list in individuals.items():
	ind_snps[ind_name] = {}
	for ind_info in ind_info_list:
	if ind_info[0] == chr_name:
	ref = chr_locations[ind_info[1]].upper()
	alt = ref
	if ind_info[2][0].upper() == ref:
	allele_1 = '0'
	else:
	allele_1 = '1'
	alt = ind_info[2][0]

	if ind_info[2][1].upper() == ref:
	allele_2 = '0'
	else:
	allele_2 = '1'
	alt = ind_info[2][1]

	ind_snps[ind_name][ind_info[1]] = (ref, alt, (allele_1, allele_2))
	print(ref, alt)
	print(ind_info[1], allele_1, allele_2)
	logme("Done", logfile, print_level=2)

	# Dictonary to hold a data structure that will become the vcf file
	# for this chromosome
	# location -> (ref, alt, {name1->(a1,a2), ..., name_n})
	vcf_output = {}

	# Loop through individuals and create VCF output
	logme("Creating VCF output for chromosome " + chr_name, logfile, print_level=2)
	for name, snp_info in ind_snps.items():
	all_individuals.add(name)
	for location, varient in snp_info.items():
	if location in vcf_output:
	# Check alternate allele is the same
	if varient[0] == vcf_output[location][0] or varient[0] == vcf_output[location][1]:
	# Check alternate allele is the same
	if varient[1] == vcf_output[location][0] or varient[1] == vcf_output[location][1]:
	# Save varient call
	vcf_output[location][2][name] = varient[2]
	else:
	# If ref and alt are the same, change alt
	if vcf_output[location][1] == vcf_output[location][0]:
	vcf_output[location] = (vcf_output[location][0],
	varient[1],
	vcf_output[location][2])
	# Save varient call
	vcf_output[location][2][name] = varient[2]
	else:
	sys.stderr.write(vcf_output[location] + '\n')
	sys.stderr.write(varient + '\n')
	sys.stderr.write(
	("Individual {} at location {} had a " +
	"different ALT allele\h").format(
	name,
	location))
	sys.stderr.write("{}\n{}\n".format(
	vcf_output[location][0],
	vcf_output[location][1]))
	raise Exception
	else:
	# If ref and alt are the same, change alt
	if vcf_output[location][1] == vcf_output[location][0]:
	vcf_output[location] = (vcf_output[location][0],
	varient[0],
	vcf_output[location][2])
	# Save varient call
	vcf_output[location][2][name] = varient[2]
	else:
	sys.stderr.write((
	"Individual {} at location " +
	"{} had a different ALT allele\n" +
	"{}\n{}\n").format(name, location,
	varient[0],
	vcf_output[location][0]))
	raise Exception
	else:
	vcf_output[location] = (varient[0], varient[1], {name: varient[2]})

	# Add chromosome info to final output dictionary
	final_vcf_output[chr_name] = vcf_output
	logme("Done", logfile, print_level=2)

	# Set outfile
	logme("\n\nAll processing complete\nPrinting output\n", logfile, print_level=2)
	if outfile:
	outfile = open(outfile, 'w')
	else:
	outfile = sys.stdout
	# Print the thing
	outfile.write("##fileformat=VCFv4.0\n")
	outfile.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
	outfile.write(("#CHROM\tPOS\tID\tREF\tALT\tQUAL\t" +
	"FILTER\tINFO\tFORMAT\t{}\n").format(
	'\t'.join(all_individuals)))
	all_individuals = frozenset(all_individuals)
	for chr, v in final_vcf_output.items():
	for location, info in v.items():
	print_string = (chr, str(location), info[0], info[1])
	for individual in all_individuals:
	if individual in info[2]:
	print_string = print_string + (character.join(info[2][individual]),)
	else:
	print_string = print_string + "."

	print_string = '\t'.join([print_string[0], print_string[1],
	'.', print_string[2], print_string[3],
	".\t.\t.\tGT",
	'\t'.join(print_string[4:])])
	outfile.write(print_string + '\n')

	logme("Operation completed successfully", logfile, print_level=2)

	##################
	# File parsing #
	##################


	def parse_bedfile(bedfile):
	"""Load a bed file into memory as list of tuples
	('Chr', 'location', 'Alleles')"""

	alleles = []
	with open(bedfile) as infile:
	for line in infile:
	fields = line.rstrip().split('\t')
	if len(fields[3]) == 3:
	alt = (fields[3][0], fields[3][2])

	alleles.append((fields[0], int(fields[1]), alt))

	return alleles


	def parse_fasta(genome_file):
	"""Parse a genome into a tuple of bases. Requires approximately 12GB
	of memory per chromosome in mouse.

	Base 0."""
	from re import sub

	current_chr = ''
	chromosomes = {}
	c = []
	with open(genome_file) as infile:
	for line in infile:
	if line.startswith('>'):
	if c:
	chromosomes[current_chr] = tuple(c)
	current_chr = sub(r'^>', '', line.rstrip())
	chromosomes[current_chr] = ()
	else:
	for i in line.rstrip():
	c.append(i)
	if c:
	chromosomes[current_chr] = tuple(c)
	return chromosomes

	##################
	# File formats #
	##################

	vcf_format = """##fileformat=VCFv4.0
	##INFO=<ID=REF_COUNT,Number=1,Type=String,Description="# of mapped reads with reference allele">
	##INFO=<ID=ALT_COUNT,Number=1,Type=String,Description="# of mapped reads with alernative allele">
	##INFO=<ID=JunD,Number=0,Type=Flag,Description="ChIP sequencing, JunD peak">
	##INFO=<ID=Max,Number=0,Type=Flag,Description="ChIP sequencing, JMax peak">
	##INFO=<ID=NfkB,Number=0,Type=Flag,Description="ChIP sequencing, NfkB peak">
	##INFO=<ID=Pol_II,Number=0,Type=Flag,Description="ChIP sequencing, Pol_II peak">
	##INFO=<ID=Pol_III,Number=0,Type=Flag,Description="ChIP sequencing, Pol_III peak">
	##INFO=<ID=cFos,Number=0,Type=Flag,Description="ChIP sequencing, cFox peak">
	##INFO=<ID=cMyc,Number=0,Type=Flag,Description="ChIP sequencing, cMyc peak">
	##INFO=<ID=RNAseq,Number=0,Type=Flag,Description="RNA sequencing">
	##reference=NCBI36/hg18
	##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
	#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
	1 37928930 . G A . . cFos;REF_COUNT=19;ALT_COUNT=4 GT 0/1
	1 37929014 . T C . . cFos;REF_COUNT=3;ALT_COUNT=15 GT 0/1
	1 52941430 . T C . . cFos;REF_COUNT=2;ALT_COUNT=13 GT 1\|0
	1 111544364 . A G . . cFos;REF_COUNT=2;ALT_COUNT=38 GT 1\|0
	2 66318134 . C A . . cFos;REF_COUNT=0;ALT_COUNT=40 GT 0/1
	2 154600656 . G T . . cFos;REF_COUNT=1;ALT_COUNT=57 GT 0\|1
	4 48785218 . T C . . cFos;REF_COUNT=1;ALT_COUNT=129 GT 0\|1
	4 185466260 . C T . . cFos;REF_COUNT=8;ALT_COUNT=0 GT 0\|1
	5 171780279 . T C . . cFos;REF_COUNT=1;ALT_COUNT=14 GT 1\|0
	5 178919238 . C A . . cFos;REF_COUNT=0;ALT_COUNT=12 GT 0\|1
	6 31432983 . G A . . cFos;REF_COUNT=42;ALT_COUNT=21 GT 0\|1
	6 31433069 . C T . . cFos;REF_COUNT=28;ALT_COUNT=9 GT 0\|1
	6 32665754 . C T . . cFos;REF_COUNT=2;ALT_COUNT=14 GT 0/1
	6 32703852 . C T . . cFos;REF_COUNT=13;ALT_COUNT=2 GT 1\|0
	7 137337589 . T G . . cFos;REF_COUNT=47;ALT_COUNT=24 GT 1\|0
	9 126021036 . A G . . cFos;REF_COUNT=1;ALT_COUNT=10 GT 0/1
	9 134895878 . C T . . cFos;REF_COUNT=43;ALT_COUNT=12 GT 0\|1
	"""

	bed_format = """chr1 3000714 3000715 T\|G
	chr1 3001065 3001066 G\|T
	chr1 3001110 3001111 G\|C
	chr1 3001131 3001132 G\|A
	chr1 3001273 3001274 T\|C
	chr1 3001291 3001292 T\|C
	chr1 3002158 3002159 C\|A
	chr1 3002205 3002206 T\|A
	chr1 3002386 3002387 G\|T
	chr1 3002642 3002643 A\|G
	chr1 3002673 3002674 G\|A
	chr1 3002800 3002801 A\|T
	chr1 3003516 3003517 G\|T
	chr1 3003697 3003698 T\|G
	chr1 3003790 3003791 T\|C
	chr1 3004508 3004509 T\|G
	chr1 3004699 3004700 G\|A
	chr1 3004757 3004758 C\|A
	chr1 3005791 3005792 C\|A
	chr1 3009094 3009095 G\|T"""

	################################################################################
	# Run as a script #
	################################################################################


	def _get_args():
	""" Command Line Argument Parsing """
	import argparse

	parser = argparse.ArgumentParser(
	description=__doc__,
	formatter_class=argparse.RawDescriptionHelpFormatter)

	# Optional Arguments
	parser.add_argument('--bed_format', action='store_true',
	help="Print bed file format to STDOUT")
	parser.add_argument('--vcf_format', action='store_true',
	help="Print vcf file format to STDOUT")
	parser.add_argument('--phased', action='store_true',
	help="Are the genotypes in the bed files phased?")

	# Bed Files
	parser.add_argument('bedfiles', nargs='+',
	help="BED File(s)")

	# Genome file(s)
	parser.add_argument('--chr', nargs='+',
	help="Reference Genome, multiple files fine")

	# Names
	parser.add_argument('--names', nargs='+',
	help="Names of individuals in bed files. Position is essential")

	# Optional Files
	parser.add_argument('-o', '--vcf', nargs='?', default='',
	help="VCF output file, Default STDOUT")
	parser.add_argument('-l', '--logfile', nargs='?', default='',
	help="Log File, Default STDERR (append mode)")

	return parser


	def main():
	"""Run directly"""
	from re import sub

	# Get commandline arguments
	parser = _get_args()
	args = parser.parse_args()

	# Print format help
	if args.bed_format:
	print("BED files should be for single individuals and should have the format:")
	print("Columns:\n",
	"1) chr\n",
	"2) start base (zero based)\n",
	"3) end base\n",
	"4) reference base\|indiviual's base)")
	print("Note: The only columns of import are column 1, column 2, and the alt allele in column 4")
	print("Example file:")
	print(bed_format)
	sys.exit(0)

	if args.vcf_format:
	print("VCF file will follow standard VCF format with the following columns:")
	print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tIND1[\tIND2\t...\tINDn]")
	print("Example file:")
	print(vcf_format)
	sys.exit(0)

	# Make sure that there are reference files
	if not args.chr:
	parser.print_help()
	print("Reference genome required ('--chr')")
	sys.exit(1)

	# Set Names
	named_files = {}
	if args.names:
	if not len(args.names) == len(args.bedfiles):
	print("Number of names does not equal number of bedfiles")
	sys.exit(2)
	count = len(args.bedfiles)
	while count:
	named_files[args.names[count - 1]] = args.bedfiles[count - 1]
	count = count - 1
	else:
	for i in args.bedfiles:
	named_files[sub(r'\.bed$', '', i)] = i

	# Load bed files into memory
	individuals = {}
	for k, v in named_files.items():
	logme("Processing individual " + k + "'s bed file...", args.logfile, print_level=2)
	individuals[k] = parse_bedfile(v)

	# Travel through the chromosomes and create the VCF file
	make_vcf(individuals=individuals, chromosomes=args.chr, phased=args.phased, outfile=args.vcf, logfile=args.logfile)

	# The end
	if __name__ == '__main__':
	main()