Last active
June 16, 2018 14:59
-
-
Save AnnaTSW0609/e0f32d10f98ab3cd0bbc687c9ea5da41 to your computer and use it in GitHub Desktop.
The first draft of the PantherDB Parser+test datafile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Parsed records, one dict per non-blank input line. Lines that share a
# reference gene are NOT merged yet (see the TODO at the end of this script).
Data = []

# The file is only read, so plain "r" is used; "r+" would additionally demand
# write permission for no benefit.
with open("testfile_PantherDB.txt", "r") as testfile:
    for line in testfile:
        if not line.strip():
            # A blank line has no fields; the indexing below would raise
            # IndexError on it.
            continue

        # Break the line on "|", space, tab and newline. The indices used
        # below rely on this layout (fields 0, 3 and 7 are not read here):
        #   0 | 1: db=id | 2: label=uniprot  3 | 4: db=id | 5: label=uniprot
        #   6: ortholog type   7   8: family
        split_list = re.split(r"[| \t\n]", line)

        # Reference gene: field 1 is "<database name>=<database id>".
        ESID_title_break = split_list[1].split("=")
        ESID_title_1a = ESID_title_break[0]
        # Take the last piece so an id that itself contains "=" is not cut
        # short to its prefix.
        ESID_ID_1b = ESID_title_break[-1]
        # Reference gene UniProt ID: field 2 is "<label>=<id>".
        UP_2b = split_list[2].split("=")[1]

        # Ortholog: fields 4 and 5 mirror fields 1 and 2.
        ortholog_ss_break = split_list[4].split("=")
        ortholog_ss_title_4a = ortholog_ss_break[0]
        ortholog_ss_ID_4b = ortholog_ss_break[-1]
        ortholog_UP_5b = split_list[5].split("=")[1]
        ortholog_type = split_list[6]
        PanDB_family = split_list[8]

        # Add the whole entry to the list.
        Data.append({
            "id_": UP_2b,
            ESID_title_1a: ESID_ID_1b,
            "UniProt_ID:": UP_2b,
            "Ortholog": [{
                ortholog_ss_title_4a: ortholog_ss_ID_4b,
                "UniProt_ID:": ortholog_UP_5b,
                "Ortholog_type": ortholog_type,
                "PantherDB_family": PanDB_family,
            }],
        })

print(Data)

# TODO: merge consecutive records in Data that share the same "id_" by
# appending the later record's "Ortholog" entries to the first record's list,
# collecting the merged records in a new list.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Second draft: walk the input file line by line, split each line into its
# reference-gene and ortholog fields, and accumulate the result in
# `data_output`.
# NOTE(review): this snapshot cannot run as pasted. Indentation has been lost
# and code lines end in a stray "| |" left over from page extraction. On top
# of that: `yield` is used although no enclosing function is visible, `search`
# is not defined anywhere in the visible source, and the file opened here
# ("testfile_PartnerDB.txt") differs from the name passed to `search` on the
# next line ("testfile_PantherDB.txt") -- possibly a typo, confirm.
with open("testfile_PartnerDB.txt", "r+") as testfile: | |
for line in testfile: | |
print(next(search("O60524", "testfile_PantherDB.txt"))) | |
# Reference gene fields.
# split_list[1] is split on "=" into 1a (name of the species-specific
# database) and 1b (the ID of the gene in that database).
# The separator class matches "|", space, tab and newline.
split_list = re.split("[| \t \n]", line) | |
ESID= split_list [1] | |
ESID_title_break = re.split("[=]", ESID) | |
ESID_title_1a = ESID_title_break [0] | |
ESID_ID_1b = ESID_title_break [1] | |
# UniProt ID of the reference gene (second "="-separated piece of field 2).
UP = split_list [2] | |
UP_break= re.split("[=]", UP) | |
UP_2b = UP_break [1] | |
# UP_2b is the keyword meant for finding lines with the same reference gene.
# Ortholog fields.
# Species-specific database name (4a) and ID (4b) of the ortholog.
ortholog_ss = split_list [4] | |
ortholog_ss_break= re.split("[=]", ortholog_ss) | |
ortholog_ss_title_4a= ortholog_ss_break[0] | |
ortholog_ss_ID_4b = ortholog_ss_break[1] | |
# UniProt ID of the ortholog.
ortholog_UP = split_list [5] | |
ortholog_UP_break= re.split("[=]", ortholog_UP) | |
ortholog_UP_5b = ortholog_UP_break [1] | |
# Ortholog type.
ortholog_type = split_list [6] | |
# PantherDB family.
PanDB_family = split_list [8] | |
# Build or extend the final data structure.
# NOTE(review): UP_2b was cut out of this very `line`, so the test below is
# always true for the line it came from.
if UP_2b in line: | |
try: | |
# Evaluating the bare name raises NameError when `data_output` has not
# been assigned yet; that is how "first record" is detected.
data_output | |
# If data_output already exists only the ortholog needs to be added;
# otherwise a new entry is created in the except branch.
except NameError: | |
data_output= { | |
"id_" : UP_2b, | |
# NOTE(review): `EID_title_1a` is never assigned in the visible source;
# `ESID_title_1a` is -- looks like a typo.
EID_title_1a : ESID_ID_1b, | |
"UniProt_ID:": UP_2b, | |
"Ortholog" : [ | |
{ortholog_ss_title_4a : ortholog_ss_ID_4b , | |
"UniProt_ID:" : ortholog_UP_5b, | |
"Ortholog_type": ortholog_type, | |
"PantherDB_family" : PanDB_family | |
}] | |
} | |
else: | |
# NOTE(review): dict.update() replaces the existing "Ortholog" list with this
# one-element list instead of appending to it, and the call is never closed
# with ")".
data_output.update({ | |
"Ortholog" : [ | |
{ortholog_ss_title_4a : ortholog_ss_ID_4b , | |
"UniProt_ID:" : ortholog_UP_5b, | |
"Ortholog_type": ortholog_type, | |
"PantherDB_family" : PanDB_family | |
}] | |
} | |
else: # no further line with the same reference gene: emit data_output
yield data_output | |
continue | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Third draft: read one line, then scan the following lines and combine the
# ortholog of a following line with the first line's ortholog when both lines
# share the same reference-gene UniProt ID.
# NOTE(review): not runnable as pasted -- indentation has been lost and code
# lines end in a stray "| |" left over from page extraction. The nesting
# described in the notes below is therefore an assumption to be confirmed.
with open("testfile_PantherDB.txt", "r+") as testfile: | |
# 1. Get the first line.
for line in testfile: | |
# 2. Get the gene and the ortholog. The separator class matches "|", space,
# tab and newline.
split_list = re.split("[| \t \n]", line) | |
DB = split_list [1] | |
DB_break = re.split("=", DB) | |
DB_name = DB_break [0] | |
DB_ID = DB_break [1] | |
UP = split_list [2] | |
UP_break= re.split("[=]", UP) | |
UP_2b = UP_break [1] | |
# Species-specific (ss) database name and ID of the ortholog.
ortholog_ss = split_list [4] | |
ortholog_ss_break= re.split("[=]", ortholog_ss) | |
ortholog_ss_title= ortholog_ss_break[0] | |
ortholog_ss_ID = ortholog_ss_break[1] | |
# UniProt ID of the ortholog.
ortholog_UP = split_list [5] | |
ortholog_UP_break= re.split("[=]", ortholog_UP) | |
ortholog_UP_ID = ortholog_UP_break [1] | |
# Ortholog type.
ortholog_type = split_list [6] | |
# PantherDB family.
PanDB_family = split_list [8] | |
# 3. Get the next line.
# NOTE(review): this loop iterates the same file object as the loop above and
# rebinds the same name `line`, so the lines it consumes are not seen by the
# outer loop.
for line in testfile: | |
# 4. If the two lines have the same gene, append.
if UP_2b in line: | |
# NOTE(review): DB_next is read from `split_list` (the first line's fields),
# not from `split_next`, which is only created a few lines further down; the
# DB_next_* values are not used afterwards in the visible code.
DB_next = split_list [1] | |
DB_next_break = re.split("=", DB_next) | |
DB_next_name = DB_next_break [0] | |
DB_next_ID = DB_next_break [1] | |
split_next = re.split("[| \t \n]", line) | |
ESID_next= split_next [1] | |
ESID_title_next_break = re.split("[=]", ESID_next) | |
ESID_title_next = ESID_title_next_break [0] | |
ESID_ID_next = ESID_title_next_break [1] | |
# UniProt ID of the reference gene on the next line.
UP_next = split_next [2] | |
UP_next_break= re.split("[=]", UP_next) | |
UP_next_2b = UP_next_break [1] | |
# Ortholog fields of the next line.
# Species-specific database name and ID of the ortholog.
ortholog_ss_next = split_next [4] | |
ortholog_ss_break_next= re.split("[=]", ortholog_ss_next) | |
ortholog_ss_title_next= ortholog_ss_break_next[0] | |
ortholog_ss_ID_next = ortholog_ss_break_next[1] | |
# UniProt ID of the ortholog.
ortholog_UP_next = split_next [5] | |
ortholog_UP_break_next= re.split("[=]", ortholog_UP_next) | |
ortholog_UP_next_ID = ortholog_UP_break_next [1] | |
# Ortholog type.
ortholog_type_next = split_next [6] | |
# PantherDB family.
PanDB_family_next = split_next [8] | |
# Debug output of the fields parsed from the next line.
print(UP_next_2b, ortholog_UP_next_ID, ortholog_type_next,PanDB_family_next) | |
# Compare the reference-gene IDs explicitly: the substring test above also
# succeeds when UP_2b only appears as the ortholog on the next line.
if UP_2b == UP_next_2b: | |
dict_list = { | |
"id" : UP_2b, | |
"pantherdb" : { | |
DB_name : DB_ID, | |
"uniprot_id": UP_2b, | |
} | |
} | |
# NOTE(review): `ortholog` is rebuilt from scratch here, so after the append
# below it holds exactly two entries (first line + current line); entries
# from earlier matching lines are not carried over.
ortholog = [{ | |
ortholog_ss_title: ortholog_ss_ID, | |
"UniProtKB" : ortholog_UP_ID, | |
"ortholog_type" : ortholog_type, | |
"panther_family" : PanDB_family, | |
}] | |
ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next}) | |
dict_list["ortholog"] = ortholog | |
else: | |
# 5. If not, output the record (the draft prints instead of yielding).
# NOTE(review): `dict_list` is only assigned in the matching branch above, so
# this may run before it exists -- confirm which `if` this `else` belongs to.
print (dict_list) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fourth draft: a single pass over the file that is meant to collect the
# orthologs of one reference gene in `ortholog` and attach them to a record
# when the reference gene changes.
# NOTE(review): not runnable as pasted -- indentation has been lost and code
# lines end in a stray "| |" left over from page extraction. The nesting
# described in the notes below is therefore an assumption to be confirmed.
buffer = None # record being accumulated; None until the first match
ortholog = None # ortholog list being accumulated; None until the first match
import re | |
with open("testfile_PantherDB.txt", "r+") as testfile: | |
for line in testfile: | |
# The separator class matches "|", space, tab and newline.
split_list = re.split("[| \t \n]", line) | |
DB = split_list [1] | |
DB_break = re.split("=", DB) | |
DB_name = DB_break [0] | |
DB_ID = DB_break [1] | |
UP = split_list [2] | |
UP_break= re.split("[=]", UP) | |
UP_2b = UP_break [1] | |
# Species-specific (ss) database name and ID of the ortholog.
ortholog_ss = split_list [4] | |
ortholog_ss_break= re.split("[=]", ortholog_ss) | |
ortholog_ss_title= ortholog_ss_break[0] | |
ortholog_ss_ID = ortholog_ss_break[1] | |
# UniProt ID of the ortholog.
ortholog_UP = split_list [5] | |
ortholog_UP_break= re.split("[=]", ortholog_UP) | |
ortholog_UP_ID = ortholog_UP_break [1] | |
# Ortholog type.
ortholog_type = split_list [6] | |
# PantherDB family.
PanDB_family = split_list [8] | |
# NOTE(review): UP_2b was cut out of this very `line`, so this test is always
# true.
if UP_2b in line: | |
DB_next = split_list [1] | |
DB_next_break = re.split("=", DB_next) | |
DB_next_name = DB_next_break [0] | |
DB_next_ID = DB_next_break [1] | |
# NOTE(review): no second line is read; `split_next` is a second split of the
# same `line` that produced `split_list`, so every *_next value below equals
# its counterpart above.
split_next = re.split("[| \t \n]", line) | |
ESID_next= split_next [1] | |
ESID_title_next_break = re.split("[=]", ESID_next) | |
ESID_title_next = ESID_title_next_break [0] | |
ESID_ID_next = ESID_title_next_break [1] | |
# UniProt ID of the reference gene.
UP_next = split_next [2] | |
UP_next_break= re.split("[=]", UP_next) | |
UP_next_2b = UP_next_break [1] | |
# Ortholog fields.
# Species-specific database name and ID of the ortholog.
ortholog_ss_next = split_next [4] | |
ortholog_ss_break_next= re.split("[=]", ortholog_ss_next) | |
ortholog_ss_title_next= ortholog_ss_break_next[0] | |
ortholog_ss_ID_next = ortholog_ss_break_next[1] | |
# UniProt ID of the ortholog.
ortholog_UP_next = split_next [5] | |
ortholog_UP_break_next= re.split("[=]", ortholog_UP_next) | |
ortholog_UP_next_ID = ortholog_UP_break_next [1] | |
# Ortholog type.
ortholog_type_next = split_next [6] | |
# PantherDB family.
PanDB_family_next = split_next [8] | |
# Intended as a check that the reference genes of two lines match.
# NOTE(review): UP and UP_next are both field 2 of the same line, so this
# comparison is always true.
if UP == UP_next: | |
d = { | |
"id" : UP_2b, | |
"pantherdb" : { | |
DB_name : DB_ID, | |
"uniprot_id": UP_2b, | |
} | |
} | |
if buffer == None and ortholog == None: # nothing accumulated yet: start a record
buffer = d | |
# NOTE(review): the first line's ortholog is added twice here, once in the
# literal and once through the identical *_next values appended below.
ortholog = [{ | |
ortholog_ss_title: ortholog_ss_ID, | |
"UniProtKB" : ortholog_UP_ID, | |
"ortholog_type" : ortholog_type, | |
"panther_family" : PanDB_family, | |
}] | |
ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next}) | |
else: # a record already exists: only add the ortholog
ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next}) | |
print(ortholog) | |
# NOTE(review): whichever of the two always-true `if` tests above this `else`
# belongs to, it looks unreachable, so the record is never completed or
# printed -- confirm against the original indentation.
else: | |
d["ortholog"] = ortholog # attach the list only once, at the end
print(d) | |
break | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Output record for the reference gene currently being accumulated.
d = {}
# Orthologs collected so far for that reference gene.
o = []
# Holds at most one item: the UniProt ID of the reference gene currently being
# accumulated. Empty until the first line has been read.
e = []
class Parser():
    """Split one line of the ortholog file into named fields.

    The line is broken on "|", space, tab and newline; the pieces at
    positions 1, 2, 4, 5, 6 and 8 are then read. Pieces 1, 2, 4 and 5 are
    further split on "=".

    Attributes with the ``ref_gene_`` prefix describe the reference gene,
    attributes with the ``ortholog_`` prefix describe its ortholog. The
    single-letter attributes (``x``, ``y``, ``z``, ``a``, ``b``, ``c``,
    ``d``, ``g``, ``h``, ``i``) are only intermediate results of the
    splitting and are kept for compatibility.

    Raises:
        IndexError: if the line has fewer pieces than the positions read.
    """

    def __init__(self, x):
        self.x = x
        # Raw string: in a normal string literal "\|" is an invalid escape
        # sequence. The character class is unchanged: "|", space, tab, newline.
        self.y = re.split(r"[| \t\n]", self.x)
        self.z = self.y[2]
        self.a = self.z.split("=")
        self.b = self.y[1]
        self.c = self.b.split("=")
        self.d = self.y[4]
        self.g = self.d.split("=")
        self.h = self.y[5]
        self.i = self.h.split("=")

        # Final products of the splitting.
        self.ref_gene_uniprot_ID = self.a[1]
        self.ref_gene_db_name = self.c[0]
        # Last piece (not index 1), consistent with ortholog_db_id below, so a
        # database id that itself contains "=" is not cut short to its prefix.
        self.ref_gene_db_id = self.c[-1]
        self.ortholog_db_name = self.g[0]
        self.ortholog_db_id = self.g[-1]
        self.ortholog_Uniprot_ID = self.i[1]
        self.ortholog_type = self.y[6]
        self.ortholog_pdb_family = self.y[8]
# Group consecutive lines that share a reference gene and print one record
# per gene. Relies on the module-level accumulators d (record), o (orthologs)
# and e (UniProt ID of the gene being accumulated) and on the Parser class.
# The file is only read, so "r" replaces "r+".
with open("testfile_PantherDB.txt", "r") as f:
    for raw_line in f:
        if not raw_line.strip():
            # Blank lines carry no fields; Parser would raise IndexError.
            continue
        line = Parser(raw_line)
        new = {
            line.ortholog_db_name: line.ortholog_db_id,
            "UniProtKB": line.ortholog_Uniprot_ID,
            "Ortholog_type": line.ortholog_type,
            "panther_family": line.ortholog_pdb_family,
        }

        if line.ref_gene_uniprot_ID in e:
            # Same reference gene as the previous line: only add the ortholog.
            o.append(new)
            continue

        if e:
            # A different reference gene starts here: output the finished
            # record before starting the next one.
            d["ortholog: "] = o
            print(d)
            e.clear()

        # Start a new record (this is also the path taken by the first line).
        e.append(line.ref_gene_uniprot_ID)
        d = {
            "id": line.ref_gene_uniprot_ID,
            "pantherdb": {
                line.ref_gene_db_name: line.ref_gene_db_id,
                "UniProtKB": line.ref_gene_uniprot_ID,
            },
        }
        o = [new]

# The last group has no following line to trigger its output, so it is
# printed here once the end of the file has been reached.
if e:
    d["ortholog: "] = o
    print(d)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Output record for the reference gene currently being accumulated.
d = {}
# Orthologs collected so far for that reference gene.
o = []
# UniProt ID of the reference gene currently being accumulated; None until
# the first data line has been read.
e = None

# Group consecutive lines that share a reference gene and print one record
# per gene. The file is only read, so "r" replaces "r+".
with open("testfile_PantherDB.txt", "r") as f:
    for line in f:
        if not line.strip():
            # Blank lines carry no fields; the indexing below would raise
            # IndexError on them.
            continue

        # Break the line on "|", space, tab and newline (raw string: "\|" is
        # an invalid escape in a normal string literal).
        y = re.split(r"[| \t\n]", line)
        # Intermediates: "="-separated pieces of fields 2, 1, 4 and 5.
        z = y[2].split("=")
        a = y[1].split("=")
        b = y[4].split("=")
        c = y[5].split("=")

        # The values that end up in the output.
        ref_gene_uniprot_id = z[1]
        ref_gene_db_name = a[0]
        ref_gene_db_id = a[-1]
        ortholog_db_name = b[0]
        ortholog_db_id = b[-1]
        ortholog_uniprot_id = c[1]
        ortholog_type = y[6]
        ortholog_family = y[8]

        new = {
            ortholog_db_name: ortholog_db_id,
            "UniProtKB": ortholog_uniprot_id,
            "Ortholog_type": ortholog_type,
            "panther_family": ortholog_family,
        }

        if ref_gene_uniprot_id == e:
            # Same reference gene as the previous line: only add the ortholog.
            o.append(new)
            continue

        if e is not None:
            # Read up to a different reference gene: output the finished one.
            d["ortholog: "] = o
            print(d)

        # Start a new record (this is also the path taken by the first line).
        e = ref_gene_uniprot_id
        d = {
            "id": ref_gene_uniprot_id,
            "pantherdb": {
                ref_gene_db_name: ref_gene_db_id,
                "UniProtKB": ref_gene_uniprot_id,
            },
        }
        o = [new]

# The last group has no following line to trigger its output, so it is
# printed here.
if o:
    d["ortholog: "] = o
    print(d)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os.path | |
def load_data(data_folder):
    """Yield one record per reference gene from the ortholog file.

    Reads ``testfile_PantherDB.txt`` inside ``data_folder``. Each line is
    broken on "|", space, tab and newline; consecutive lines with the same
    reference-gene UniProt ID are grouped into a single record of the form::

        {"id": <ref UniProt ID>,
         "pantherdb": {<ref db name>: <ref db id>,
                       "uniprot_kb": <ref UniProt ID>,
                       "orthologs": [{<db name>: <db id>,
                                      "uniprot_kb": <UniProt ID>,
                                      "ortholog_type": <type>,
                                      "panther_family": <family>}, ...]}}

    Only adjacent lines are grouped; a reference gene that reappears later in
    the file starts a new record.

    Args:
        data_folder: directory that contains ``testfile_PantherDB.txt``.

    Yields:
        dict: one record per run of lines sharing a reference gene. Each
        yielded dict is a fresh object that is not modified afterwards.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-blank line has fewer fields than expected.
    """
    data_file = os.path.join(data_folder, "testfile_PantherDB.txt")
    # UniProt ID of the reference gene whose record is being accumulated.
    current_id = None
    # Record being accumulated; None until the first data line is read.
    record = None

    # The file is only read, so "r" is used instead of "r+".
    with open(data_file, "r") as f:
        for line in f:
            if not line.strip():
                # Blank lines carry no fields.
                continue

            y = re.split(r"[| \t\n]", line)
            # Intermediates: "="-separated pieces of fields 2, 1, 4 and 5.
            z = y[2].split("=")
            a = y[1].split("=")
            b = y[4].split("=")
            c = y[5].split("=")

            ref_gene_uniprot_id = z[1]
            ref_gene_db_name = a[0]
            ref_gene_db_id = a[-1]
            ortholog_db_name = b[0]
            ortholog_db_id = b[-1]
            ortholog_uniprot_id = c[1]
            ortholog_type = y[6]
            ortholog_family = y[8]

            if ref_gene_uniprot_id != current_id:
                # A new reference gene starts here. Emit the finished record
                # first: it was built from the PREVIOUS gene's fields, so its
                # id and its orthologs belong together.
                if record is not None:
                    yield record
                current_id = ref_gene_uniprot_id
                # A brand-new dict per gene; the one just yielded is never
                # touched again, so consumers may keep references to it.
                record = {
                    "id": ref_gene_uniprot_id,
                    "pantherdb": {
                        ref_gene_db_name: ref_gene_db_id,
                        "uniprot_kb": ref_gene_uniprot_id,
                        "orthologs": [],
                    },
                }

            record["pantherdb"]["orthologs"].append({
                ortholog_db_name: ortholog_db_id,
                "uniprot_kb": ortholog_uniprot_id,
                "ortholog_type": ortholog_type,
                "panther_family": ortholog_family,
            })

    # The last group has no following line to trigger its output.
    if record is not None:
        yield record
# Manual smoke test for the load_data generator.
# It was tried against a folder named Test_folder holding two files: the real
# test file and a "fake" data file with the same structure, to confirm that
# the parser picks the right file out of the folder.
# The guard compares the module variable __name__ (not the string literal
# "__name__", which can never equal "__main__"), so the loop really runs when
# this file is executed as a script.
if __name__ == "__main__":
    import sys

    # Folder to parse: first command-line argument, falling back to the
    # Test_folder directory described above.
    data_folder = sys.argv[1] if len(sys.argv) > 1 else "Test_folder"
    for i in load_data(data_folder):
        print(i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment