Skip to content

Instantly share code, notes, and snippets.

@AnnaTSW0609
Last active June 16, 2018 14:59
Show Gist options
  • Save AnnaTSW0609/e0f32d10f98ab3cd0bbc687c9ea5da41 to your computer and use it in GitHub Desktop.
Save AnnaTSW0609/e0f32d10f98ab3cd0bbc687c9ea5da41 to your computer and use it in GitHub Desktop.
The first draft of the PantherDB Parser+test datafile
import re

# First-draft parser: build one dictionary per input line and collect them.
Data = []  # list of per-line records (one reference gene + one ortholog each)

# The file is only read, so open it read-only; "r+" would needlessly require
# write permission on the data file.
with open("testfile_PantherDB.txt", "r") as testfile:
    for line in testfile:
        if not line.strip():
            # A blank line (e.g. trailing newline at end of file) has no fields.
            continue

        # Split on "|", space, tab and newline. The code below reads fields
        # 1, 2 (reference gene), 4, 5 (ortholog), 6 (type) and 8 (family).
        split_list = re.split("[| \t \n]", line)

        # Reference gene: 1a = name of the species-specific database,
        # 1b = the gene's ID in that database.
        # Use the last "="-separated piece so that a value of the form
        # DB=PREFIX=123 yields "123" rather than "PREFIX".
        ESID_title_break = split_list[1].split("=")
        ESID_title_1a = ESID_title_break[0]
        ESID_ID_1b = ESID_title_break[-1]

        # Reference gene: UniProt ID.
        UP_2b = split_list[2].split("=")[1]

        # Ortholog: species-specific database name and ID.
        ortholog_ss_break = split_list[4].split("=")
        ortholog_ss_title_4a = ortholog_ss_break[0]
        ortholog_ss_ID_4b = ortholog_ss_break[-1]

        # Ortholog: UniProt ID, ortholog type and PantherDB family.
        ortholog_UP_5b = split_list[5].split("=")[1]
        ortholog_type = split_list[6]
        PanDB_family = split_list[8]

        # Add the whole entry to the list.
        Data.append({
            "id_": UP_2b,
            ESID_title_1a: ESID_ID_1b,
            "UniProt_ID:": UP_2b,
            "Ortholog": [{
                ortholog_ss_title_4a: ortholog_ss_ID_4b,
                "UniProt_ID:": ortholog_UP_5b,
                "Ortholog_type": ortholog_type,
                "PantherDB_family": PanDB_family,
            }],
        })

# Print once, after every line has been parsed.
print(Data)
#New_Data = [] # Create another new list
#for Data [x] in Data: # iterate dictionaries in list
#if y in Data [x] == z in Data [x + 1]: # if the value of the first key of two dictionaries in the list are the same
# Append the dictionary of ortholog information (the value of the "Ortholog" key) of the second dictionary
# to the list of ortholog info of the first dictionary.
import re
def search(uniprot_id, filename):
    """Yield the merged record for one reference gene.

    Every line of ``filename`` whose reference-gene UniProt ID equals
    ``uniprot_id`` contributes one entry to the record's "Ortholog" list.

    Args:
        uniprot_id: UniProt ID of the reference gene to look for.
        filename: Path of the ortholog file to scan.

    Yields:
        A single dictionary with the keys "id_", the reference gene's
        database name, "UniProt_ID:" and "Ortholog" (a list with one
        dictionary per matching line). Nothing is yielded when no line
        matches.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
        IndexError: If a non-blank line has fewer fields than expected.
    """
    data_output = None  # stays None until the first matching line is seen
    with open(filename, "r") as testfile:
        for line in testfile:
            if not line.strip():
                continue

            # Split on "|", space, tab and newline; fields 1-2 describe the
            # reference gene, 4-5 the ortholog, 6 the type, 8 the family.
            split_list = re.split("[| \t \n]", line)

            # Compare the parsed reference-gene ID exactly. A substring test
            # against the whole line would also match the ortholog column.
            UP_2b = split_list[2].split("=")[1]
            if UP_2b != uniprot_id:
                continue

            # Last "="-separated piece, so DB=PREFIX=123 yields "123".
            ortholog_ss_break = split_list[4].split("=")
            ortholog = {
                ortholog_ss_break[0]: ortholog_ss_break[-1],
                "UniProt_ID:": split_list[5].split("=")[1],
                "Ortholog_type": split_list[6],
                "PantherDB_family": split_list[8],
            }

            if data_output is None:
                # First line for this gene: create the record.
                ESID_title_break = split_list[1].split("=")
                data_output = {
                    "id_": UP_2b,
                    ESID_title_break[0]: ESID_title_break[-1],
                    "UniProt_ID:": UP_2b,
                    "Ortholog": [ortholog],
                }
            else:
                # Record exists: extend the list instead of replacing it.
                data_output["Ortholog"].append(ortholog)

    if data_output is not None:
        yield data_output


if __name__ == "__main__":
    for record in search("O60524", "testfile_PantherDB.txt"):
        print(record)
import re
# Group consecutive lines that share a reference gene and print one
# dictionary per gene.
dict_list = None  # record of the gene currently being accumulated

# The file is only read, so read-only mode is sufficient.
with open("testfile_PantherDB.txt", "r") as testfile:
    for line in testfile:
        if not line.strip():
            continue

        # Split on "|", space, tab and newline; fields 1-2 describe the
        # reference gene, 4-5 the ortholog, 6 the type, 8 the family.
        split_list = re.split("[| \t \n]", line)

        # Reference gene: database name/ID and UniProt ID.
        # Last "="-separated piece, so DB=PREFIX=123 yields "123".
        DB_break = split_list[1].split("=")
        DB_name = DB_break[0]
        DB_ID = DB_break[-1]
        UP_2b = split_list[2].split("=")[1]

        # Ortholog: species-specific DB name/ID, UniProt ID, type, family.
        ortholog_ss_break = split_list[4].split("=")
        ortholog_ss_title = ortholog_ss_break[0]
        ortholog_ss_ID = ortholog_ss_break[-1]
        ortholog_UP_ID = split_list[5].split("=")[1]
        ortholog_type = split_list[6]
        PanDB_family = split_list[8]

        entry = {
            ortholog_ss_title: ortholog_ss_ID,
            "UniProtKB": ortholog_UP_ID,
            "ortholog_type": ortholog_type,
            "panther_family": PanDB_family,
        }

        # Compare parsed IDs exactly; a substring test on the raw line would
        # also match the ID when it appears in the ortholog column.
        if dict_list is not None and dict_list["id"] == UP_2b:
            # Same gene as the previous line: trace it and extend the list.
            print(UP_2b, ortholog_UP_ID, ortholog_type, PanDB_family)
            dict_list["ortholog"].append(entry)
        else:
            # A new gene starts: output the finished one (if any) first.
            if dict_list is not None:
                print(dict_list)
            dict_list = {
                "id": UP_2b,
                "pantherdb": {
                    DB_name: DB_ID,
                    "uniprot_id": UP_2b,
                },
                "ortholog": [entry],
            }

# The last gene has no following line to trigger its output.
if dict_list is not None:
    print(dict_list)
import re

# Group consecutive lines that share a reference gene; print the running
# ortholog list after each line and the finished record once per gene.
buffer = None  # record of the gene currently being accumulated
ortholog = None  # ortholog entries collected for that gene

# The file is only read, so read-only mode is sufficient.
with open("testfile_PantherDB.txt", "r") as testfile:
    for line in testfile:
        if not line.strip():
            continue

        # Split on "|", space, tab and newline; fields 1-2 describe the
        # reference gene, 4-5 the ortholog, 6 the type, 8 the family.
        split_list = re.split("[| \t \n]", line)

        # Reference gene: database name/ID and UniProt ID.
        # Last "="-separated piece, so DB=PREFIX=123 yields "123".
        DB_break = split_list[1].split("=")
        DB_name = DB_break[0]
        DB_ID = DB_break[-1]
        UP_2b = split_list[2].split("=")[1]

        # Ortholog: species-specific DB name/ID, UniProt ID, type, family.
        ortholog_ss_break = split_list[4].split("=")
        entry = {
            ortholog_ss_break[0]: ortholog_ss_break[-1],
            "UniProtKB": split_list[5].split("=")[1],
            "ortholog_type": split_list[6],
            "panther_family": split_list[8],
        }

        if buffer is not None and buffer["id"] != UP_2b:
            # The reference gene changed: attach the list only now, print the
            # finished record, and start over for the new gene.
            buffer["ortholog"] = ortholog
            print(buffer)
            buffer = None
            ortholog = None

        if buffer is None:
            # No record yet for this gene: create one.
            buffer = {
                "id": UP_2b,
                "pantherdb": {
                    DB_name: DB_ID,
                    "uniprot_id": UP_2b,
                },
            }
            ortholog = [entry]
        else:
            # Record already exists: just add this line's ortholog.
            ortholog.append(entry)
        print(ortholog)  # running trace of the list built so far

# The last gene has no following line to trigger its output.
if buffer is not None:
    buffer["ortholog"] = ortholog
    print(buffer)
import re
# this empty dictionary is for storing the final output
d = {}
# this empty list is for storing the orthologs of the same reference gene
o = []
# this empty list stores the common Uniprot_ID temporarily for comparison
e = []


class Parser():
    """Split one line of the ortholog file into named fields.

    The line is split on "|", space, tab and newline. Fields 1 and 2
    describe the reference gene, fields 4 and 5 the ortholog, field 6 is the
    ortholog type and field 8 the PantherDB family.

    Attributes whose names start with ``ref_gene`` describe the reference
    gene; those starting with ``ortholog`` describe the ortholog. The
    single-letter attributes are intermediate split results.

    Raises:
        IndexError: If the line has fewer fields than expected.
    """

    def __init__(self, x):
        self.x = x
        # Raw string: "\|" in a normal string literal is an invalid escape.
        # Inside a character class "|" needs no escaping at all.
        self.y = re.split(r"[| \t\n]", self.x)
        self.z = self.y[2]
        self.a = self.z.split("=")
        self.b = self.y[1]
        self.c = self.b.split("=")
        self.d = self.y[4]
        self.g = self.d.split("=")
        self.h = self.y[5]
        self.i = self.h.split("=")
        # Final products of the split.
        self.ref_gene_uniprot_ID = self.a[1]
        self.ref_gene_db_name = self.c[0]
        # Last piece, like ortholog_db_id below, so that a value of the form
        # DB=PREFIX=123 yields "123" rather than "PREFIX".
        self.ref_gene_db_id = self.c[-1]
        self.ortholog_db_name = self.g[0]
        self.ortholog_db_id = self.g[-1]
        self.ortholog_Uniprot_ID = self.i[1]
        self.ortholog_type = self.y[6]
        self.ortholog_pdb_family = self.y[8]
# Group consecutive lines by reference gene and print one record per gene.
# Uses the module-level d (current record), o (its orthologs) and e (a list
# holding the current reference gene's UniProt ID, empty before the first line).
with open("testfile_PantherDB.txt", "r") as f:  # read-only: nothing is written
    for raw_line in f:
        if not raw_line.strip():
            continue
        # Keep the raw text and the parsed object under different names.
        line = Parser(raw_line)
        new = {
            line.ortholog_db_name: line.ortholog_db_id,
            "UniProtKB": line.ortholog_Uniprot_ID,
            "Ortholog_type": line.ortholog_type,
            "panther_family": line.ortholog_pdb_family,
        }

        if line.ref_gene_uniprot_ID in e:
            # Same reference gene as the previous line: append the ortholog.
            o.append(new)
            continue

        if e:
            # A different reference gene starts: output the finished record.
            d["ortholog: "] = o
            print(d)
            e.clear()

        # Start a fresh record (first line of the file or of a new gene).
        e.append(line.ref_gene_uniprot_ID)
        d = {
            "id": line.ref_gene_uniprot_ID,
            "pantherdb": {
                line.ref_gene_db_name: line.ref_gene_db_id,
                "UniProtKB": line.ref_gene_uniprot_ID,
            },
        }
        o = [new]

# The last gene has no following line to trigger its output.
if e:
    d["ortholog: "] = o
    print(d)
import re
# Group consecutive lines by reference gene and print one record per gene.
# d: record of the reference gene currently being accumulated
d = {}
# o: orthologs collected so far for that gene
o = []
# e: UniProt ID of that gene; None until the first line has been read
e = None

with open("testfile_PantherDB.txt", "r") as f:  # read-only: nothing is written
    for line in f:
        if not line.strip():
            continue

        # Raw string: "\|" in a normal string literal is an invalid escape.
        # The class matches "|", space, tab and newline.
        y = re.split(r"[| \t\n]", line)
        # Intermediate "="-splits of the four KEY=VALUE fields.
        z = y[2].split("=")
        a = y[1].split("=")
        b = y[4].split("=")
        c = y[5].split("=")
        # The values that end up in the output.
        ref_gene_uniprot_id = z[1]
        ref_gene_db_name = a[0]
        ref_gene_db_id = a[-1]
        ortholog_db_name = b[0]
        ortholog_db_id = b[-1]
        ortholog_uniprot_id = c[1]
        ortholog_type = y[6]
        ortholog_family = y[8]

        new = {
            ortholog_db_name: ortholog_db_id,
            "UniProtKB": ortholog_uniprot_id,
            "Ortholog_type": ortholog_type,
            "panther_family": ortholog_family,
        }

        if ref_gene_uniprot_id == e:
            # Same reference gene: just append the ortholog.
            o.append(new)
            continue

        if e is not None:
            # Reached a different reference gene: output the finished record.
            d["ortholog: "] = o
            print(d)

        # Start the record for the new (or very first) reference gene.
        e = ref_gene_uniprot_id
        d = {
            "id": ref_gene_uniprot_id,
            "pantherdb": {
                ref_gene_db_name: ref_gene_db_id,
                "UniProtKB": ref_gene_uniprot_id,
            },
        }
        o = [new]

# The last gene has no following line to trigger its output.
if o:
    d["ortholog: "] = o
    print(d)
import re
import os.path
def load_data(data_folder):
    """Yield one document per reference gene from the PantherDB test file.

    Reads ``testfile_PantherDB.txt`` inside ``data_folder``. Each non-blank
    line is split on "|", space, tab and newline; fields 1-2 describe the
    reference gene, 4-5 the ortholog, 6 the ortholog type and 8 the PantherDB
    family. Consecutive lines with the same reference-gene UniProt ID are
    merged into a single document.

    Args:
        data_folder: Directory that contains ``testfile_PantherDB.txt``.

    Yields:
        Dictionaries of the form::

            {"id": <ref UniProt ID>,
             "pantherdb": {<ref DB name>: <ref DB ID>,
                           "uniprot_kb": <ref UniProt ID>,
                           "orthologs": [{<DB name>: <DB ID>,
                                          "uniprot_kb": ...,
                                          "ortholog_type": ...,
                                          "panther_family": ...}, ...]}}

        Nothing is yielded for a file without data lines.

    Raises:
        OSError: If the data file cannot be opened for reading.
        IndexError: If a non-blank line has fewer fields than expected.
    """
    data_file = os.path.join(data_folder, "testfile_PantherDB.txt")
    current_id = None  # UniProt ID of the gene being accumulated
    doc = None  # document of that gene; None before the first data line

    # The file is only read, so read-only mode is sufficient.
    with open(data_file, "r") as f:
        for line in f:
            if not line.strip():
                continue

            # Raw string: "\|" in a normal string literal is an invalid
            # escape. Inside a character class "|" needs no escaping.
            fields = re.split(r"[| \t\n]", line)
            ref_db = fields[1].split("=")
            ortholog_db = fields[4].split("=")
            ref_gene_uniprot_id = fields[2].split("=")[1]

            if ref_gene_uniprot_id != current_id:
                # A new reference gene starts. Emit the finished document
                # with ITS OWN id and database info, then start a fresh dict
                # so the one already handed to the caller is never mutated.
                if doc is not None:
                    yield doc
                current_id = ref_gene_uniprot_id
                doc = {
                    "id": ref_gene_uniprot_id,
                    "pantherdb": {
                        # [-1] so that DB=PREFIX=123 yields "123".
                        ref_db[0]: ref_db[-1],
                        "uniprot_kb": ref_gene_uniprot_id,
                        "orthologs": [],
                    },
                }

            doc["pantherdb"]["orthologs"].append({
                ortholog_db[0]: ortholog_db[-1],
                "uniprot_kb": fields[5].split("=")[1],
                "ortholog_type": fields[6],
                "panther_family": fields[8],
            })

    # The last gene has no following line to trigger its output.
    if doc is not None:
        yield doc
# The code below is what I used to check that the generator works.
# I created a folder named Test_folder containing two files: my test file and
# a "fake data file" with the same structure but a different name, to check
# that the parser picks the right file from the folder.
# The function is then given the path of that test folder, which contains
# both the right file and the fake file.
# Compare the module's __name__ variable, not the string literal "__name__",
# which can never equal "__main__".
if __name__ == "__main__":
    import sys

    # Folder that holds testfile_PantherDB.txt: first command-line argument,
    # otherwise the test folder described in the comments above.
    data_folder = sys.argv[1] if len(sys.argv) > 1 else "Test_folder"
    for i in load_data(data_folder):
        print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment