Last active
May 24, 2017 20:15
-
-
Save mojaveazure/cfed05bd593034a2fd2d4916e7b7a217 to your computer and use it in GitHub Desktop.
Translate Ensembl codes into gene names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Translate Ensembl codes into gene names""" | |
import sys | |
# Refuse to run on interpreters older than Python 3.5. Comparing the version
# tuple as a whole handles every major/minor combination; checking the major
# and minor numbers separately let versions such as 3.4 slip through.
if sys.version_info < (3, 5):
    sys.exit("Please use Python 3.5 or higher for this program")
import argparse | |
import os.path as path | |
from typing import Dict, Any | |
from collections import Counter | |
def _make_argument_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this program

    The parser has two argument groups:
        'codes arguments'           the lookup table ('-c/--codes-file',
                                    required) and whether it starts with a
                                    header line ('--header')
        'input/output arguments'    the file to translate
                                    ('-t/--to-translate', required), the
                                    output separator ('-s/--separator',
                                    tab by default) and the mutually
                                    exclusive '--uppercase'/'--lowercase'
                                    flags, which store 'upper' or 'lower'
                                    in 'case' ('asis' when neither is given)
    Returns the configured argparse.ArgumentParser"""
    cli = argparse.ArgumentParser(add_help=True)

    # Options describing the Ensembl lookup table
    lookup_options = cli.add_argument_group(
        title='codes arguments',
        description='Provide details about the codes lookup table',
    )
    lookup_options.add_argument(
        '-c', '--codes-file',
        dest='codes',
        metavar='CODES FILE',
        type=str,
        default=None,
        required=True,
        help="Table of codes from Ensembl",
    )
    lookup_options.add_argument(
        '--header',
        dest='header',
        action='store_true',
        required=False,
        help="Does the codes file have a header line?",
    )

    # Options describing what is read and how the result is written
    io_options = cli.add_argument_group(
        title='input/output arguments',
        description='Set options for input and output files',
    )
    io_options.add_argument(
        '-t', '--to-translate',
        dest='translate',
        metavar='TO TRANSLATE',
        type=str,
        default=None,
        required=True,
        help="File to translate",
    )
    io_options.add_argument(
        '-s', '--separator',
        dest='separator',
        metavar='SEPARATOR',
        type=str,
        default='\t',
        required=False,
        help="Separation character(s) for the output file, defaults to tab",
    )

    # Both case flags write to the same destination, so only one may be given
    case_options = io_options.add_mutually_exclusive_group()
    case_flags = (
        (
            '--uppercase',
            'upper',
            "Set all values to uppercase? Mutually exclusive with '--lowercase'",
        ),
        (
            '--lowercase',
            'lower',
            "Set all values to lowercase? Mutually exclusive with '--uppercase'",
        ),
    )
    for flag, stored_value, description in case_flags:
        case_options.add_argument(
            flag,
            dest='case',
            action='store_const',
            const=stored_value,
            default='asis',
            help=description,
        )
    return cli
def translate(translation_dictionary: Dict[str, str], key: str, case: str='asis') -> str:
    """Translate a string given values in a translation dictionary

    Returns either the translated value or, when 'key' has no entry in the
    dictionary, the original string
        'translation_dictionary'    a dict of strings
        'key'                       a string to be translated
        'case'                      is a way to determine the case,
                                    choose from 'upper', 'lower' or 'asis';
                                    any other value leaves the case as it is
    Raises a ValueError when the dictionary entry for 'key' holds the
    marker value 'duplicate'"""
    if key in translation_dictionary:
        ret = translation_dictionary[key]  # type: str
        if ret == 'duplicate':
            # If our return value is a duplicate, raise
            raise ValueError("Entry has been marked as duplicate")
    else:
        # Untranslated words pass through untouched; only values that come
        # from the dictionary can carry the marker, so a plain input word
        # that happens to read 'duplicate' must not raise
        ret = key
    # Figure out the case that we're supposed to return in
    if case == 'upper':
        ret = ret.upper()
    elif case == 'lower':
        ret = ret.lower()
    return ret
def main(args: Dict[str, Any]) -> None:
    """Run the program

    Reads the lookup table from args['codes'] (skipping its first line when
    args['header'] is true) and the whitespace-delimited words of
    args['translate'], then writes every translated line, joined with
    args['separator'], to '<basename>_translated<extension>'. Codes whose
    gene name is shared by more than one code are marked as duplicates;
    input lines that contain such a code are not written to the output but
    collected in '<basename>_duplicates.log' instead.

    Exits the program if either input file cannot be found or if a
    non-blank line of the codes file does not hold exactly two fields"""
    # Stuff for holding
    codes = dict() # type: Dict[str, str]
    to_translate = list() # Every input line as a list of words
    log = list() # Input lines that hit a duplicated gene name
    # Set output names
    basename, extension = path.splitext(args['translate'])
    filename = basename + '_translated' + extension
    logname = basename + '_duplicates.log'
    try:
        # Load the codes
        with open(args['codes'], 'r') as codesfile:
            print("Reading in", args['codes'], "as codes file", file=sys.stderr)
            # Skip any potential header lines
            first_line = 1
            if args['header']:
                codesfile.readline()
                first_line = 2
            # For every line in the codes file:
            #   collect the code and gene from Ensembl and
            #   add to our dictionary of codes
            for number, line in enumerate(codesfile, start=first_line):
                fields = line.split()
                if not fields:
                    # Blank lines (such as a trailing empty line) carry no
                    # mapping and would otherwise break the unpacking below
                    continue
                if len(fields) != 2:
                    sys.exit(
                        "Malformed line %d in %s: expected a code and a gene name, found %d fields"
                        % (number, args['codes'], len(fields))
                    )
                code, gene = fields
                codes[code] = gene
        # Load the data to translate
        with open(args['translate'], 'r') as transfile:
            print("Reading in", args['translate'], "as file to translate", file=sys.stderr)
            # For every line in our file to translate
            #   add the line, as a list of strings, to our holding list
            for line in transfile:
                to_translate.append(line.split())
    except FileNotFoundError as error:
        sys.exit("Failed to find file: %s" % error.filename)
    # Modify the dictionary for any duplicates
    gene_counts = Counter(codes.values())
    duplicates = {gene for gene, count in gene_counts.items() if count > 1}
    for code, gene in codes.items():
        # Only values are replaced, so the dictionary keeps its size and
        # can safely be updated while we iterate over it
        if gene in duplicates:
            codes[code] = 'duplicate'
    with open(filename, 'w') as outfile:
        print("Writing translated lines to", filename, file=sys.stderr)
        # For every line we need to translate
        #   translate it and write it out using the separator defined;
        #   the file is flushed when the 'with' block closes it
        for line in to_translate:
            try:
                translated = [translate(translation_dictionary=codes, key=word, case=args['case']) for word in line]
            except ValueError:
                # Catch any duplicate lines and log them
                log.append(args['separator'].join(line))
            else:
                outfile.write(args['separator'].join(translated))
                outfile.write('\n')
    # Write any duplicate lines to a log file
    if log:
        with open(logname, 'w') as logfile:
            print("Writing duplicate lines to", logname, file=sys.stderr)
            for line in log:
                logfile.write(line)
                logfile.write('\n')
if __name__ == '__main__':
    # Build the command-line interface first so the help text is available
    PARSER = _make_argument_parser() # type: argparse.ArgumentParser
    # Called without any arguments: show the help text and stop
    if len(sys.argv) < 2:
        PARSER.print_help()
        sys.exit()
    # Hand the parsed options to main() as a plain dictionary
    ARGS = vars(PARSER.parse_args())
    main(args=ARGS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment