Last active
November 13, 2017 16:11
-
-
Save SamStudio8/f507ae1759f6c1bb16157532b9a894e5 to your computer and use it in GitHub Desktop.
amplyfyer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A scrappy esoteric glue program for Ben because he only really likes databases. | |
Given a tab delimited file where: | |
fields[0] ORF id | |
fields[1] 1-pos Start | |
fields[2] 1-pos End | |
fields[-1] Structural Prediction Code | |
Output a cast dataframe with a line per ORF, describing all structural predictions | |
for that ORF as a single string. Additionally enumerating each viable code's proportion | |
of that string as a percentage. You may alter CODES to add/remove prediction codes | |
as needed. | |
It is anticipated that the input is sorted by ORF and start position. | |
Things will go wrong if that is not the case... | |
Output is printed to stdout, warnings to stderr. | |
Usage: python amplyfy.py <data.tab> | |
Author: Sam Nicholls <[email protected]> | |
Version: 0.1.1 | |
""" | |
import sys | |
def print_orf(orf_id, orf_codelist): | |
"""Given a list of ORF code substrings, output the necessary data row.""" | |
orf = "".join(orf_codelist) | |
code_pc = [] | |
for c in CODES: | |
code_pc.append( orf.count(c) / float(len(orf)) ) | |
print("\t".join([orf_id, orf] + ["%.3f" % (x * 100) for x in code_pc])) | |
INPUT_F = open(sys.argv[1]) | |
CODES = ['H', 'C', 'T', 'E'] # valid codes, Ben: add new codes to this if desired. | |
# Print header | |
print("\t".join(["ORF_id", "secondary_structure"] + ["%s_pc" % c for c in CODES])) | |
observed_orf_ids = set([]) | |
orf_codes = [] | |
last_end = 1 | |
current_orf_id = None | |
for line in INPUT_F: | |
fields = line.strip().split("\t") | |
if not current_orf_id: | |
# Assign first ORF | |
current_orf_id = fields[0] | |
elif fields[0] != current_orf_id: | |
# Complete ORF and reset variables | |
print_orf(current_orf_id, orf_codes) | |
observed_orf_ids.add(current_orf_id) | |
current_orf_id = fields[0] | |
orf_codes = [] | |
last_end = 1 | |
start = int(fields[1]) | |
end = int(fields[2]) | |
code = fields[-1] | |
if current_orf_id in observed_orf_ids: | |
sys.stderr.write("ORF '%s' has been processed already, was your input file sorted?\n" % current_orf_id) | |
if last_end + 1 != start and last_end != 1: | |
sys.stderr.write("Next start for '%s' at Pos %d does not follow from the last end.\n" % (current_orf_id, start)) | |
if code not in CODES: | |
sys.stderr.write("Prediction Code '%s' not in the list of valid codes...\n" % code) | |
# Add code characters to ongoing ORF list | |
orf_codes.append ( (end - (start-1)) * code ) | |
last_end = end | |
# Don't forget last ORF! | |
print_orf(current_orf_id, orf_codes) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment