Skip to content

Instantly share code, notes, and snippets.

@paul-english
Created August 16, 2021 19:25
Show Gist options
  • Save paul-english/3ca4ac92f91733cbf42529b70e07725c to your computer and use it in GitHub Desktop.
Save paul-english/3ca4ac92f91733cbf42529b70e07725c to your computer and use it in GitHub Desktop.
Convert uniprot_trembl to massive csv
from tqdm import tqdm
import csv
# Two-character line codes of a UniProtKB flat-file entry, in the sequence
# handle_line() relies on.  The leading "" matches the parser's initial
# current_code, "  " is the code carried by sequence-data lines (they start
# with blanks), and "//" terminates an entry.
order = (
    [""]
    + "ID AC DT DE GN OS OG OC OX OH".split()
    + "RN RP RC RX RG RA RT RL".split()
    + "CC DR PE KW FT SQ".split()
    + ["  ", "//"]
)
# Header row of the generated CSV.  Each name is paired here with the
# flat-file line code it is meant to hold, so the list stays parallel to
# `order` without its "" and "//" sentinel entries.
columns = [
    name
    for _code, name in (
        ("ID", "identification"),
        ("AC", "accession"),
        ("DT", "date"),
        ("DE", "description"),
        ("GN", "gene_names"),
        ("OS", "organism_species"),
        ("OG", "organelle"),
        ("OC", "organism_classification"),
        ("OX", "taxonomy_cross_reference"),
        ("OH", "organism_host"),
        ("RN", "reference_number"),
        ("RP", "reference_position"),
        ("RC", "reference_comments"),
        ("RX", "reference_cross_references"),
        ("RG", "reference_group"),
        ("RA", "reference_authors"),
        ("RT", "reference_title"),
        ("RL", "reference_location"),
        ("CC", "comments_or_notes"),
        ("DR", "database_cross_references"),
        ("PE", "protein_existence"),
        ("KW", "keywords"),
        ("FT", "feature_table_data"),
        ("SQ", "sequence_header"),
        ("  ", "sequence"),
    )
]
# Parser state shared between successive handle_line() calls.
current_code = ""   # line code seen on the previous data line
column_string = []  # text lines of the field currently being extended
seq_buffer = []     # one list of text lines per field of the open entry


def handle_line(line, writer):
    """Consume one line of a UniProtKB flat file and emit finished entries.

    Text is accumulated per line code in module-level state.  When the
    entry terminator ``//`` arrives, one row is written whose cells follow
    ``order[1:-1]`` (``order`` minus its ``""`` and ``//`` sentinels), so
    each cell lines up with the matching header in ``columns``.  Fields
    absent from an entry become empty cells; every line of a field -
    including repeats that are not adjacent, such as the RN lines of
    several reference blocks - is joined with ``"\\n"`` in the order read.

    Args:
        line: One raw line of the flat file, with or without its newline.
        writer: Object with a ``writerow(sequence)`` method, e.g. the
            result of ``csv.writer``.

    Raises:
        ValueError: If a non-blank line starts with a two-character code
            that does not appear in ``order``.
    """
    global current_code, column_string, seq_buffer

    # Blank lines carry no field; skipping them also keeps a whitespace-only
    # line from being mistaken for sequence data (code "  ").
    if not line.strip():
        return

    code = line[:2]

    if code == "//":
        # End of entry: flush the row (unless nothing was collected) and
        # reset the state for the next entry.
        if seq_buffer:
            writer.writerow(["\n".join(parts) for parts in seq_buffer])
        current_code = code
        column_string = []
        seq_buffer = []
        return

    if code != current_code:
        # Only look the slot up when the code changes; consecutive lines
        # with the same code keep appending to the list already selected.
        field_codes = order[1:-1]
        try:
            slot = field_codes.index(code)
        except ValueError:
            raise ValueError(
                "unrecognised UniProt line code {!r}".format(code)
            ) from None
        if not seq_buffer:
            seq_buffer = [[] for _ in field_codes]
        column_string = seq_buffer[slot]
        current_code = code

    # Everything after the code is payload; strip() drops the padding
    # blanks and the trailing newline.
    column_string.append(line[4:].strip())
# Convert the TrEMBL flat file into a single CSV, one row per entry.
# newline="" hands line-ending control to the csv module: cells contain
# embedded "\n" characters, and without it platforms that translate newlines
# would write "\r\r\n" row terminators.
with open("/home/paul/data/uniprot_trembl.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(columns)
    with open("/home/paul/data/uniprot_trembl.dat", "r") as infile:
        # Stream line by line; tqdm only reports progress.
        for line in tqdm(infile):
            handle_line(line, csvwriter)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment