suqingdong · July 11, 2019 05:40
diff --git a/genome_stat.py b/genome_stat.py
 #!/usr/bin/env python
 #-*- encoding: utf8 -*-
 '''\033[1;32m
    参考基因组每条染色体总长度和N的数量统计
 \033[0m'''
 import os
 import sys
 import json
 import textwrap
 import logging
 from collections import defaultdict


 __author__ = 'suqingdong'
 __author_email__ = '[email protected]'


 logging.basicConfig(
    format='[%(asctime)s %(funcName)s %(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG)
 logger = logging.getLogger(__name__)


 def sort_chrom(item):

    chrom = item[0].strip('chr')

    if chrom.isdigit():
        chrom = int(chrom)
    elif chrom == 'X':
        chrom = 23
    elif chrom == 'Y':
        chrom = 24
    elif chrom in 'MT':
        chrom = 25
    else:
        chrom = 999
    return chrom



 def main():

    infile = args['infile']

    logger.debug('opening file: {}'.format(infile))

    stat = defaultdict(dict)

    chrom_list = [str(i) for i in range(1, 23)] + ['X', 'Y', 'M', 'MT']

    with open(infile) as f:
        for line in f:
            if line.startswith('>'):
                chrom = line.strip().split()[0][1:]
                logger.info('dealing with chrom: {}'.format(chrom))
                continue
            if chrom.strip('chr') in chrom_list:
                length = len(line.strip())
                ncounts = line.upper().count('N')
                if 'length' not in stat[chrom]:
                    stat[chrom]['length'] = length
                    stat[chrom]['ncounts'] = ncounts
                else:
                    stat[chrom]['length'] += length
                    stat[chrom]['ncounts'] += ncounts

    print json.dumps(stat, indent=2)
    out_prefix = args['out'] if args['out'] else os.path.basename(args['infile'])

    with open(out_prefix + '.bed', 'w') as out_bed, open(out_prefix + '.nblocks', 'w') as out_nb:
        for chrom, info in sorted(stat.iteritems(), key=sort_chrom):
            out_bed.write('{}\t0\t{}\n'.format(chrom, info['length']))
            out_nb.write('{}\t{}\n'.format(chrom, info['ncounts']))


 if __name__ == "__main__":

    epilog = textwrap.dedent('''
        \033[36mexamples:
            %(prog)s hg19.fa
            %(prog)s hg19.fa -o hg19

        \033[33mcontact: {__author__} <{__author_email__}>
    ''').format(**globals())

    import argparse

    parser = argparse.ArgumentParser(
        prog='genome_stat',
        description=__doc__,
        epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('infile', help='the input reference fasta file', nargs='?')
    parser.add_argument('-o', '--out', help='the output prefix')

    args = vars(parser.parse_args())

    if not args['infile']:
        parser.print_help()
        exit()

    main()
	#!/usr/bin/env python
	#-- encoding: utf8 --
	'''\033[1;32m
	参考基因组每条染色体总长度和N的数量统计
	\033[0m'''
	import os
	import sys
	import json
	import textwrap
	import logging
	from collections import defaultdict


	__author__ = 'suqingdong'
	__author_email__ = '[email protected]'


	logging.basicConfig(
	format='[%(asctime)s %(funcName)s %(levelname)s] %(message)s',
	datefmt='%Y-%m-%d %H:%M:%S',
	level=logging.DEBUG)
	logger = logging.getLogger(__name__)


	def sort_chrom(item):

	chrom = item[0].strip('chr')

	if chrom.isdigit():
	chrom = int(chrom)
	elif chrom == 'X':
	chrom = 23
	elif chrom == 'Y':
	chrom = 24
	elif chrom in 'MT':
	chrom = 25
	else:
	chrom = 999
	return chrom



	def main():

	infile = args['infile']

	logger.debug('opening file: {}'.format(infile))

	stat = defaultdict(dict)

	chrom_list = [str(i) for i in range(1, 23)] + ['X', 'Y', 'M', 'MT']

	with open(infile) as f:
	for line in f:
	if line.startswith('>'):
	chrom = line.strip().split()[0][1:]
	logger.info('dealing with chrom: {}'.format(chrom))
	continue
	if chrom.strip('chr') in chrom_list:
	length = len(line.strip())
	ncounts = line.upper().count('N')
	if 'length' not in stat[chrom]:
	stat[chrom]['length'] = length
	stat[chrom]['ncounts'] = ncounts
	else:
	stat[chrom]['length'] += length
	stat[chrom]['ncounts'] += ncounts

	print json.dumps(stat, indent=2)
	out_prefix = args['out'] if args['out'] else os.path.basename(args['infile'])

	with open(out_prefix + '.bed', 'w') as out_bed, open(out_prefix + '.nblocks', 'w') as out_nb:
	for chrom, info in sorted(stat.iteritems(), key=sort_chrom):
	out_bed.write('{}\t0\t{}\n'.format(chrom, info['length']))
	out_nb.write('{}\t{}\n'.format(chrom, info['ncounts']))


	if __name__ == "__main__":

	epilog = textwrap.dedent('''
	\033[36mexamples:
	%(prog)s hg19.fa
	%(prog)s hg19.fa -o hg19

	\033[33mcontact: {__author__} <{__author_email__}>
	''').format(**globals())

	import argparse

	parser = argparse.ArgumentParser(
	prog='genome_stat',
	description=__doc__,
	epilog=epilog,
	formatter_class=argparse.RawTextHelpFormatter)

	parser.add_argument('infile', help='the input reference fasta file', nargs='?')
	parser.add_argument('-o', '--out', help='the output prefix')

	args = vars(parser.parse_args())

	if not args['infile']:
	parser.print_help()
	exit()

	main()