Skip to content

Instantly share code, notes, and snippets.

@suqingdong
Last active July 11, 2019 05:40
Show Gist options
  • Save suqingdong/e217208561195503fb62e1f03c4ba15b to your computer and use it in GitHub Desktop.
Save suqingdong/e217208561195503fb62e1f03c4ba15b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#-*- encoding: utf8 -*-
'''\033[1;32m
参考基因组每条染色体总长度和N的数量统计
\033[0m'''
import os
import sys
import json
import textwrap
import logging
from collections import defaultdict
__author__ = 'suqingdong'
__author_email__ = '[email protected]'
logging.basicConfig(
format='[%(asctime)s %(funcName)s %(levelname)s] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.DEBUG)
logger = logging.getLogger(__name__)
def sort_chrom(item):
chrom = item[0].strip('chr')
if chrom.isdigit():
chrom = int(chrom)
elif chrom == 'X':
chrom = 23
elif chrom == 'Y':
chrom = 24
elif chrom in 'MT':
chrom = 25
else:
chrom = 999
return chrom
def main():
infile = args['infile']
logger.debug('opening file: {}'.format(infile))
stat = defaultdict(dict)
chrom_list = [str(i) for i in range(1, 23)] + ['X', 'Y', 'M', 'MT']
with open(infile) as f:
for line in f:
if line.startswith('>'):
chrom = line.strip().split()[0][1:]
logger.info('dealing with chrom: {}'.format(chrom))
continue
if chrom.strip('chr') in chrom_list:
length = len(line.strip())
ncounts = line.upper().count('N')
if 'length' not in stat[chrom]:
stat[chrom]['length'] = length
stat[chrom]['ncounts'] = ncounts
else:
stat[chrom]['length'] += length
stat[chrom]['ncounts'] += ncounts
print json.dumps(stat, indent=2)
out_prefix = args['out'] if args['out'] else os.path.basename(args['infile'])
with open(out_prefix + '.bed', 'w') as out_bed, open(out_prefix + '.nblocks', 'w') as out_nb:
for chrom, info in sorted(stat.iteritems(), key=sort_chrom):
out_bed.write('{}\t0\t{}\n'.format(chrom, info['length']))
out_nb.write('{}\t{}\n'.format(chrom, info['ncounts']))
if __name__ == "__main__":
epilog = textwrap.dedent('''
\033[36mexamples:
%(prog)s hg19.fa
%(prog)s hg19.fa -o hg19
\033[33mcontact: {__author__} <{__author_email__}>
''').format(**globals())
import argparse
parser = argparse.ArgumentParser(
prog='genome_stat',
description=__doc__,
epilog=epilog,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('infile', help='the input reference fasta file', nargs='?')
parser.add_argument('-o', '--out', help='the output prefix')
args = vars(parser.parse_args())
if not args['infile']:
parser.print_help()
exit()
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment