Last active
August 29, 2015 14:04
-
-
Save MikeDacre/697714ebfad646b1be85 to your computer and use it in GitHub Desktop.
Take a list of FastA files (or just one file) and count the length of the chromosome
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# vim:fenc=utf-8 tabstop=4 expandtab shiftwidth=4 softtabstop=4 | |
# | |
# Copyright © Mike Dacre <[email protected]> | |
# | |
# Distributed under terms of the MIT license | |
""" | |
#==================================================================================== | |
# | |
# FILE: get_chromosome_length (python 3) | |
# AUTHOR: Michael D Dacre, [email protected] | |
# ORGANIZATION: Stanford University | |
# LICENSE: MIT License, Property of Stanford, Use as you wish | |
# VERSION: 0.1 | |
# CREATED: 2014-07-21 13:26 | |
# Last modified: 2014-07-21 13:49 | |
# | |
# DESCRIPTION: Take a FastA file and count the length of the genome | |
# | |
#==================================================================================== | |
""" | |
from re import sub | |
from collections import OrderedDict | |
def count_fasta(file_list): | |
""" File list is an array """ | |
current_chr = '' | |
chromosomes = {} | |
c = 0 | |
for genome_file in file_list: | |
with open(genome_file) as infile: | |
for line in infile: | |
if line.startswith('>'): | |
if c: | |
chromosomes[current_chr] = str(c) | |
c = 0 | |
current_chr = sub(r'^>', '', line.rstrip()) | |
else: | |
c = c + len(line.rstrip()) | |
if c: | |
chromosomes[current_chr] = str(c) | |
return(chromosomes) | |
def _get_args(): | |
"""Command Line Argument Parsing""" | |
import argparse, sys | |
parser = argparse.ArgumentParser( | |
description=__doc__, | |
formatter_class=argparse.RawDescriptionHelpFormatter) | |
# Required Files | |
parser.add_argument('fasta_files', nargs='+', | |
help="FastA files") | |
return parser | |
# Main function for direct running | |
def main(): | |
"""Run directly""" | |
# Get commandline arguments | |
parser = _get_args() | |
args = parser.parse_args() | |
chromosomes = OrderedDict(sorted(count_fasta(args.fasta_files).items())) | |
for k, v in chromosomes.items(): | |
print(k, v, sep='\t') | |
# The end | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment