Skip to content

Instantly share code, notes, and snippets.

@MikeDacre
Last active August 29, 2015 14:04
Show Gist options
  • Save MikeDacre/697714ebfad646b1be85 to your computer and use it in GitHub Desktop.
Save MikeDacre/697714ebfad646b1be85 to your computer and use it in GitHub Desktop.
Take a list of FastA files (or just one file) and count the length of the chromosome
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fenc=utf-8 tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# Copyright © Mike Dacre <[email protected]>
#
# Distributed under terms of the MIT license
"""
#====================================================================================
#
# FILE: get_chromosome_length (python 3)
# AUTHOR: Michael D Dacre, [email protected]
# ORGANIZATION: Stanford University
# LICENSE: MIT License, Property of Stanford, Use as you wish
# VERSION: 0.1
# CREATED: 2014-07-21 13:26
# Last modified: 2014-07-21 13:49
#
# DESCRIPTION: Take a FastA file and count the length of the genome
#
#====================================================================================
"""
from re import sub
from collections import OrderedDict
def count_fasta(file_list):
""" File list is an array """
current_chr = ''
chromosomes = {}
c = 0
for genome_file in file_list:
with open(genome_file) as infile:
for line in infile:
if line.startswith('>'):
if c:
chromosomes[current_chr] = str(c)
c = 0
current_chr = sub(r'^>', '', line.rstrip())
else:
c = c + len(line.rstrip())
if c:
chromosomes[current_chr] = str(c)
return(chromosomes)
def _get_args():
"""Command Line Argument Parsing"""
import argparse, sys
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
# Required Files
parser.add_argument('fasta_files', nargs='+',
help="FastA files")
return parser
# Main function for direct running
def main():
"""Run directly"""
# Get commandline arguments
parser = _get_args()
args = parser.parse_args()
chromosomes = OrderedDict(sorted(count_fasta(args.fasta_files).items()))
for k, v in chromosomes.items():
print(k, v, sep='\t')
# The end
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment