Skip to content

Instantly share code, notes, and snippets.

@mojaveazure
Last active May 24, 2017 20:15
Show Gist options
  • Save mojaveazure/cfed05bd593034a2fd2d4916e7b7a217 to your computer and use it in GitHub Desktop.
Save mojaveazure/cfed05bd593034a2fd2d4916e7b7a217 to your computer and use it in GitHub Desktop.
Translate Ensembl codes into gene names
#!/usr/bin/env python3
"""Translate Ensembl codes into gene names"""
import sys
# Refuse to run on interpreters older than 3.5 before importing anything that
# needs it (the 'typing' module imported below only exists from 3.5 onwards).
# Comparing against a tuple checks major and minor together; the previous
# "major is not 3 and minor < 5" test never triggered on Python 3.0-3.4.
if sys.version_info < (3, 5):
    sys.exit("Please use Python 3.5 or higher for this program")
import argparse
import os.path as path
from typing import Dict, Any
from collections import Counter
def _make_argument_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this program.

    Two argument groups are created: one describing the codes lookup table
    ('--codes-file', '--header') and one for input/output handling
    ('--to-translate', '--separator' and the mutually exclusive
    '--uppercase' / '--lowercase' flags, both stored under 'case').
    """
    parser = argparse.ArgumentParser(add_help=True)

    # Options describing the Ensembl lookup table
    codes_group = parser.add_argument_group(
        'codes arguments',
        'Provide details about the codes lookup table'
    )
    codes_group.add_argument(
        '-c', '--codes-file',
        dest='codes',
        metavar='CODES FILE',
        type=str,
        required=True,
        help="Table of codes from Ensembl"
    )
    codes_group.add_argument(
        '--header',
        dest='header',
        action='store_true',
        help="Does the codes file have a header line?"
    )

    # Options describing what to read and how to write the result
    io_group = parser.add_argument_group(
        'input/output arguments',
        'Set options for input and output files'
    )
    io_group.add_argument(
        '-t', '--to-translate',
        dest='translate',
        metavar='TO TRANSLATE',
        type=str,
        required=True,
        help="File to translate"
    )
    io_group.add_argument(
        '-s', '--separator',
        dest='separator',
        metavar='SEPARATOR',
        type=str,
        default='\t',
        help="Separation character(s) for the output file, defaults to tab"
    )

    # Both flags write to 'case'; when neither is given it stays 'asis'
    case_group = io_group.add_mutually_exclusive_group()
    case_group.add_argument(
        '--uppercase',
        dest='case',
        action='store_const',
        const='upper',
        default='asis',
        help="Set all values to uppercase? Mutually exclusive with '--lowercase'"
    )
    case_group.add_argument(
        '--lowercase',
        dest='case',
        action='store_const',
        const='lower',
        default='asis',
        help="Set all values to lowercase? Mutually exclusive with '--uppercase'"
    )
    return parser
def translate(translation_dictionary: Dict[str, str], key: str, case: str='asis') -> str:
    """Translate a string given values in a translation dictionary.

    Returns the translated value if 'key' is in the dictionary, otherwise
    the original string, optionally converted to upper or lower case.

    'translation_dictionary' a dict of strings
    'key' a string to be translated
    'case' is a way to determine the case of the returned string,
        choose from 'upper', 'lower' or 'asis'; any other value
        leaves the string unchanged

    Raises ValueError if 'key' is present in the dictionary and its
    value is the marker string 'duplicate'.
    """
    if key in translation_dictionary:
        ret = translation_dictionary[key] # type: str
        # Only a dictionary *value* can mark a duplicate; an untranslated
        # input word that happens to read 'duplicate' must pass through
        if ret == 'duplicate':
            raise ValueError("Entry has been marked as duplicate")
    else:
        ret = key
    # Figure out the case that we're supposed to return in
    if case == 'upper':
        return ret.upper()
    if case == 'lower':
        return ret.lower()
    return ret
def main(args: Dict[str, Any]) -> None:
    """Run the program

    'args' is the dictionary of parsed command-line options; the keys read
    here are 'codes', 'header', 'translate', 'separator' and 'case'.

    Reads the codes table (two whitespace-separated columns: code, gene)
    and the file to translate, then writes '<basename>_translated<ext>'
    with every word passed through translate(). Lines containing a code
    whose gene is shared by more than one code are not written; they are
    collected in '<basename>_duplicates.log' instead.

    Exits via sys.exit() if an input file cannot be found or if the codes
    file contains a non-blank line that does not have exactly two columns.
    """
    # Stuff for holding
    codes = dict() # type: Dict[str, str]
    to_translate = list() # type: List[List[str]]
    log = list() # type: List[str]
    # Set output names
    basename, extension = path.splitext(args['translate']) # type: str, str
    filename = basename + '_translated' + extension # type: str
    logname = basename + '_duplicates.log' # type: str
    try:
        # Load the codes
        with open(args['codes'], 'r') as codesfile:
            print("Reading in", args['codes'], "as codes file", file=sys.stderr)
            # Skip any potential header lines
            first_line = 1 # type: int
            if args['header']:
                codesfile.readline()
                first_line = 2
            # For every line in the codes file:
            # collect the code and gene from Ensembl and
            # add to our dictionary of codes
            for number, line in enumerate(codesfile, start=first_line):
                fields = line.split() # type: List[str]
                if not fields:
                    # Blank lines (e.g. a trailing newline) carry no mapping
                    continue
                if len(fields) != 2:
                    # Report where the table is malformed rather than
                    # dying with an unpacking traceback
                    sys.exit(
                        "Malformed line %d in %s: expected 2 columns, found %d"
                        % (number, args['codes'], len(fields))
                    )
                code, gene = fields
                codes[code] = gene
        # Load the data to translate
        with open(args['translate'], 'r') as transfile:
            print("Reading in", args['translate'], "as file to translate", file=sys.stderr)
            # Every line in our file to translate is held as a list of strings
            to_translate = [line.split() for line in transfile]
    except FileNotFoundError as error:
        sys.exit("Failed to find file: %s" % error.filename)
    # Mark every code whose gene is shared with another code as 'duplicate';
    # translate() raises ValueError when it meets this marker
    gene_counts = Counter(codes.values()) # type: Counter
    codes = {
        code: ('duplicate' if gene_counts[gene] > 1 else gene)
        for code, gene in codes.items()
    }
    with open(filename, 'w') as outfile:
        print("Writing translated lines to", filename, file=sys.stderr)
        # For every line we need to translate
        # translate it and write it out using the separator defined;
        # the 'with' block flushes and closes the handle for us
        for line in to_translate: # type: List[str]
            try:
                translated = [translate(translation_dictionary=codes, key=word, case=args['case']) for word in line] # type: List[str]
            except ValueError:
                # Catch any duplicate lines and log them
                log.append(args['separator'].join(line))
            else:
                outfile.write(args['separator'].join(translated) + '\n')
    # Write any duplicate lines to a log file
    if log:
        with open(logname, 'w') as logfile:
            print("Writing duplicate lines to", logname, file=sys.stderr)
            logfile.writelines(entry + '\n' for entry in log)
if __name__ == '__main__':
    PARSER = _make_argument_parser() # type: argparse.ArgumentParser
    # Called without any command-line arguments: show the help text and
    # stop. print_help() returns None, so the exit status is the default one.
    if len(sys.argv) <= 1:
        PARSER.print_help()
        sys.exit(None)
    # Hand the parsed options to main() as a plain dictionary
    ARGS = vars(PARSER.parse_args()) # type: Dict[str, Union[str, bool]]
    main(args=ARGS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment