AnnaTSW0609 · June 16, 2018 14:59
diff --git a/PantherDB_Parser_draft_1.py b/PantherDB_Parser_draft_1.py
 import re

 # One record (a dict) is appended per input line, so this is a list of
 # dictionaries rather than a dictionary.
 Data = []

 # The file is only read, so open it read-only instead of "r+".
 with open("testfile_PantherDB.txt", "r") as testfile:
     for line in testfile:
         # A blank line (for example a trailing newline at the end of the
         # file) has no fields and would raise IndexError below.
         if not line.strip():
             continue

         # Fields are separated by "|", space, tab or newline.
         # NOTE(review): the indices used below assume the layout
         #   SPECIES|DB=ID|UniProtKB=ACC <tab> SPECIES|DB=ID|UniProtKB=ACC
         #   <tab> ortholog type <tab> (unused) <tab> family
         # -- confirm against the real data file.
         split_list = re.split("[| \t \n]", line)

         # Reference gene, field 1: "<species-specific database>=<gene ID>"
         # (referred to as 1a and 1b).
         ESID = split_list[1]
         ESID_title_break = re.split("[=]", ESID)
         ESID_title_1a = ESID_title_break[0]
         ESID_ID_1b = ESID_title_break[1]

         # Reference gene, field 2: the part after "=" is kept as the
         # UniProt ID.
         UP = split_list[2]
         UP_break = re.split("[=]", UP)
         UP_2b = UP_break[1]

         # Ortholog, field 4: "<species-specific database>=<gene ID>".
         ortholog_ss = split_list[4]
         ortholog_ss_break = re.split("[=]", ortholog_ss)
         ortholog_ss_title_4a = ortholog_ss_break[0]
         ortholog_ss_ID_4b = ortholog_ss_break[1]

         # Ortholog, field 5: the part after "=" is kept as the UniProt ID.
         ortholog_UP = split_list[5]
         ortholog_UP_break = re.split("[=]", ortholog_UP)
         ortholog_UP_5b = ortholog_UP_break[1]

         # Field 6 is stored as the ortholog type, field 8 as the family.
         ortholog_type = split_list[6]
         PanDB_family = split_list[8]

         # One record per line; the ortholog is wrapped in a list so that
         # further orthologs of the same reference gene could be added later.
         key_value_pair = {
             "id_": UP_2b,
             ESID_title_1a: ESID_ID_1b,
             "UniProt_ID:": UP_2b,
             "Ortholog": [
                 {
                     ortholog_ss_title_4a: ortholog_ss_ID_4b,
                     "UniProt_ID:": ortholog_UP_5b,
                     "Ortholog_type": ortholog_type,
                     "PantherDB_family": PanDB_family,
                 }
             ],
         }
         # The dict is freshly built on every iteration, so no copy is needed.
         Data.append(key_value_pair)
 print(Data)

 #New_Data = [] # Create another new list 
 #for Data [x] in Data: # iterate dictionaries in list
    #if y in Data [x] == z in Data [x + 1]: # if the value of the first key of two dictionaries in the list are the same
       # Append the dictionary of ortholog information (the value of the "Ortholog" key) of the second dictionary
       # to the list of ortholog info of the first dictionary.
    
        
diff --git a/PantherDB_Parser_draft_2.py b/PantherDB_Parser_draft_2.py
 import re
 # NOTE(review): this draft does not run as written. Problems visible in this
 # block:
 #   * the `print(next(search(...)))` line sits at the same indentation as the
 #     `for` statement, so the loop has no body, and `search` is not defined
 #     anywhere in this draft;
 #   * the file opened here is "testfile_PartnerDB.txt" while the `search`
 #     call names "testfile_PantherDB.txt" -- possibly a typo, confirm;
 #   * `EID_title_1a` is used as a dictionary key but only `ESID_title_1a`
 #     is ever assigned;
 #   * the `data_output.update({` call is never closed with `)`;
 #   * `yield` is used at module level, outside any function.
 with open("testfile_PartnerDB.txt", "r+") as testfile: 
    for line in testfile:
    print(next(search("O60524", "testfile_PantherDB.txt")))
        # Creating the items for the reference gene
         # The four lines below create 1a (name of the species-specific database)
         # and 1b (the ID for that gene in the species-specific database)
         split_list = re.split("[| \t \n]", line)
         ESID= split_list [1]
         ESID_title_break = re.split("[=]", ESID)
         ESID_title_1a = ESID_title_break [0]
         ESID_ID_1b = ESID_title_break [1]
         # This one is for the UniProt ID
         UP = split_list [2]
         UP_break= re.split("[=]", UP)
         UP_2b = UP_break [1]
         # The UniProt ID would be used for searching lines with a common reference gene
         # Create the items for the ortholog
         # The species-specific database name and ID of the ortholog
         ortholog_ss = split_list [4]
         ortholog_ss_break= re.split("[=]", ortholog_ss)
         ortholog_ss_title_4a= ortholog_ss_break[0]
         ortholog_ss_ID_4b = ortholog_ss_break[1]
         # The UniProt ID of the ortholog
         ortholog_UP = split_list [5]
         ortholog_UP_break= re.split("[=]", ortholog_UP)
         ortholog_UP_5b = ortholog_UP_break [1]
         # The ortholog type
         ortholog_type = split_list [6]
         # The PantherDB family
         PanDB_family = split_list [8]
         # The following is meant to be the generator that yields the final data structure
         # NOTE(review): UP_2b was extracted from this same line, so the test
         # below is always true and the `else` branch at the bottom never runs.
         if UP_2b in line:
              try:
                 data_output
              # Each output (final structure) is stored in data_output;
              # if it already exists, only the ortholog needs to be added,
              # otherwise a new data entry needs to be created.
              except NameError:
                   data_output= { 
                   "id_" : UP_2b, 
                   EID_title_1a : ESID_ID_1b, 
                   "UniProt_ID:": UP_2b, 
                   "Ortholog" : [ 
                              {ortholog_ss_title_4a : ortholog_ss_ID_4b , 
                              "UniProt_ID:" : ortholog_UP_5b, 
                              "Ortholog_type": ortholog_type,
                              "PantherDB_family" : PanDB_family
                              }]
                    }
              else:
                    # NOTE(review): dict.update replaces the existing
                    # "Ortholog" list with a new one-element list; it does
                    # not append to it.
                    data_output.update({
                    "Ortholog" : [ 
                                {ortholog_ss_title_4a : ortholog_ss_ID_4b , 
                                "UniProt_ID:" : ortholog_UP_5b, 
                                "Ortholog_type": ortholog_type,
                                "PantherDB_family" : PanDB_family
                                }]
                    }
                      
         else: # if no more items with the common reference gene are found, yield data_output
              yield data_output
              continue
         
        
diff --git a/PantherDB_Parser_draft_3.py b/PantherDB_Parser_draft_3.py
 import re
 # NOTE(review): known problems in this draft, all visible below:
 #   * the inner `for line in testfile` iterates the same file object as the
 #     outer loop, so it consumes every remaining line and the outer loop body
 #     effectively runs once, for the first line only;
 #   * `DB_next` is read from `split_list` (the first line's fields) before
 #     `split_next` has been computed for the current line;
 #   * `dict_list` and `ortholog` are rebuilt on every matching line, so only
 #     the first line's ortholog plus the latest match are kept;
 #   * the final `print (dict_list)` runs for every non-matching line and
 #     raises NameError if no matching line has assigned `dict_list` yet.
 with open("testfile_PantherDB.txt", "r+") as testfile: 
    # 1. Get the first line 
    for line in testfile:
        # 2. Get the gene and the ortholog 
        split_list = re.split("[| \t \n]", line)
        DB = split_list [1]
        DB_break = re.split("=", DB)
        DB_name = DB_break [0]
        DB_ID = DB_break [1]
        UP = split_list [2]
        UP_break= re.split("[=]", UP)
        UP_2b = UP_break [1]
        # Species-specific (ss) database name and ID of the ortholog
        ortholog_ss = split_list [4]
        ortholog_ss_break= re.split("[=]", ortholog_ss)
        ortholog_ss_title= ortholog_ss_break[0]
        ortholog_ss_ID = ortholog_ss_break[1]
        # The UniProt ID of the ortholog
        ortholog_UP = split_list [5]
        ortholog_UP_break= re.split("[=]", ortholog_UP)
        ortholog_UP_ID = ortholog_UP_break [1]
        # The ortholog type 
        ortholog_type = split_list [6]
        # The PantherDB family
        PanDB_family = split_list [8]
        # 3. Get the next line           
        for line in testfile: 
            # 4. If the two lines have the same gene, append.
            # This is a substring test against the whole line, so it can also
            # match the ortholog column; the equality check further down
            # guards against that.
            if UP_2b in line:     
                DB_next = split_list [1]
                DB_next_break = re.split("=", DB_next)
                DB_next_name = DB_next_break [0]
                DB_next_ID = DB_next_break [1]
                split_next = re.split("[| \t \n]", line)
                ESID_next= split_next [1]
                ESID_title_next_break = re.split("[=]", ESID_next)
                ESID_title_next = ESID_title_next_break [0]
                ESID_ID_next = ESID_title_next_break [1]
                # This one is for the UniProt ID
                UP_next = split_next [2]
                UP_next_break= re.split("[=]", UP_next)
                UP_next_2b = UP_next_break [1]
                # The UniProt ID would be used for searching lines with a common reference gene
                # Create the items for the ortholog
                # The species-specific database name and ID of the ortholog
                ortholog_ss_next = split_next [4]
                ortholog_ss_break_next= re.split("[=]", ortholog_ss_next)
                ortholog_ss_title_next= ortholog_ss_break_next[0]
                ortholog_ss_ID_next = ortholog_ss_break_next[1]
                # The UniProt ID of the ortholog
                ortholog_UP_next = split_next [5]
                ortholog_UP_break_next= re.split("[=]", ortholog_UP_next)
                ortholog_UP_next_ID = ortholog_UP_break_next [1]
                # The ortholog type 
                ortholog_type_next = split_next [6]
                # The PantherDB family
                PanDB_family_next = split_next [8]
                print(UP_next_2b, ortholog_UP_next_ID, ortholog_type_next,PanDB_family_next)
                # Check that the reference-gene IDs really match, in case UP_2b
                # only matched the ortholog information in the next line
                if UP_2b == UP_next_2b: 
                         dict_list = {
                              "id" : UP_2b,
                              "pantherdb" : {
                                    DB_name : DB_ID,
                                    "uniprot_id": UP_2b,
                               }
                         }
                         ortholog =  [{
                              ortholog_ss_title: ortholog_ss_ID,
                              "UniProtKB" : ortholog_UP_ID,
                              "ortholog_type" : ortholog_type,
                              "panther_family" : PanDB_family,
                         }]
                         ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next})

                         dict_list["ortholog"] = ortholog            
            # create a dictionary and append
            else:
                 # 5. If not, output the record (printed here, not yielded)
                 print (dict_list)
                 
                 
                
diff --git a/PantherDB_Parser_draft_4.py b/PantherDB_Parser_draft_4.py
 # NOTE(review): known problems in this draft, all visible below:
 #   * `split_next` is produced by splitting the very same `line` as
 #     `split_list`, so every "*_next" value equals its counterpart;
 #   * consequently `UP_2b in line` and `UP == UP_next` are always true and
 #     the `else` branch that prints `d` and breaks is never reached;
 #   * the first line's ortholog is therefore added twice (once as the initial
 #     entry, once as the "next" entry) and every later line is appended
 #     regardless of its reference gene.
 buffer = None # empty buffer
 ortholog = None # empty ortholog list for later use
 import re
 with open("testfile_PantherDB.txt", "r+") as testfile: 
        for line in testfile:
                split_list = re.split("[| \t \n]", line)
                DB = split_list [1]
                DB_break = re.split("=", DB)
                DB_name = DB_break [0]
                DB_ID = DB_break [1]
                UP = split_list [2]
                UP_break= re.split("[=]", UP)
                UP_2b = UP_break [1]
                # Species-specific (ss) database name and ID of the ortholog
                ortholog_ss = split_list [4]
                ortholog_ss_break= re.split("[=]", ortholog_ss)
                ortholog_ss_title= ortholog_ss_break[0]
                ortholog_ss_ID = ortholog_ss_break[1]
                # The UniProt ID of the ortholog
                ortholog_UP = split_list [5]
                ortholog_UP_break= re.split("[=]", ortholog_UP)
                ortholog_UP_ID = ortholog_UP_break [1]
                # The ortholog type 
                ortholog_type = split_list [6]
                # The PantherDB family
                PanDB_family = split_list [8]          
                if UP_2b in line:     
                        DB_next = split_list [1]
                        DB_next_break = re.split("=", DB_next)
                        DB_next_name = DB_next_break [0]
                        DB_next_ID = DB_next_break [1]
                        split_next = re.split("[| \t \n]", line)
                        ESID_next= split_next [1]
                        ESID_title_next_break = re.split("[=]", ESID_next)
                        ESID_title_next = ESID_title_next_break [0]
                        ESID_ID_next = ESID_title_next_break [1]
                        # This one is for the UniProt ID
                        UP_next = split_next [2]
                        UP_next_break= re.split("[=]", UP_next)
                        UP_next_2b = UP_next_break [1]
                        # The UniProt ID would be used for searching lines with a common reference gene
                        # Create the items for the ortholog
                        # The species-specific database name and ID of the ortholog
                        ortholog_ss_next = split_next [4]
                        ortholog_ss_break_next= re.split("[=]", ortholog_ss_next)
                        ortholog_ss_title_next= ortholog_ss_break_next[0]
                        ortholog_ss_ID_next = ortholog_ss_break_next[1]
                        # The UniProt ID of the ortholog
                        ortholog_UP_next = split_next [5]
                        ortholog_UP_break_next= re.split("[=]", ortholog_UP_next)
                        ortholog_UP_next_ID = ortholog_UP_break_next [1]
                        # The ortholog type 
                        ortholog_type_next = split_next [6]
                        # The PantherDB family
                        PanDB_family_next = split_next [8]
                        # Intended to check that the reference genes match, in case UP_2b
                        # only matched the ortholog information in the next line
                        if UP == UP_next: 
                                         d = {
                                                  "id" : UP_2b,
                                                  "pantherdb" : {
                                                                DB_name : DB_ID,
                                                                "uniprot_id": UP_2b,
                                                   }
                                         }
                                         if buffer == None and ortholog == None: # nothing buffered yet: start a record
                                                        buffer = d
                                                        ortholog =  [{
                                                          ortholog_ss_title: ortholog_ss_ID,
                                                          "UniProtKB" : ortholog_UP_ID,
                                                          "ortholog_type" : ortholog_type,
                                                          "panther_family" : PanDB_family,
                                                        }]
                                                        ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next})
                                         else: # a record is already buffered: add this ortholog to it
                                                        ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next})
                                                        print(ortholog)
                        else:
                                d["ortholog"] = ortholog # attach the list only once the group is complete
                                print(d)
                                break
          
          
                  
diff --git a/PantherDB_Parser_draft_5.py b/PantherDB_Parser_draft_5.py
 import re

 # Record for the reference gene currently being processed; the reading loop
 # below rebinds it to a new dict whenever a new reference gene starts.
 d = {}
 # Orthologs collected so far for the current reference gene.
 o = []
 # Holds the UniProt ID of the current reference gene so that it can be
 # compared with the next line; empty until the first line has been read.
 e = []


 class Parser():
     """Split one line of the ortholog file into named fields.

     NOTE(review): the field indices assume a line of the form
     ``SPECIES|DB=ID|UniProtKB=ACC<TAB>SPECIES|DB=ID|UniProtKB=ACC<TAB>type<TAB>...<TAB>family``
     -- confirm against the real data file.

     Parameters
     ----------
     x : str
         One raw text line.

     Raises
     ------
     IndexError
         If the line has fewer fields than expected, or a ``NAME=VALUE``
         field contains no ``=``.
     """

     def __init__(self, x):
         self.x = x
         # Raw string: "\|" in a normal string literal is an invalid escape
         # sequence. Splits on "|", space, tab and newline, one character at
         # a time.
         self.y = re.split(r"[| \t\n]", self.x)
         # Intermediate "NAME=VALUE" fields and their split forms. They are
         # kept as attributes because they were public on the original class.
         self.z = self.y[2]           # reference gene, UniProt field
         self.a = self.z.split("=")
         self.b = self.y[1]           # reference gene, database field
         self.c = self.b.split("=")
         self.d = self.y[4]           # ortholog, database field
         self.g = self.d.split("=")
         self.h = self.y[5]           # ortholog, UniProt field
         self.i = self.h.split("=")

         # Final products. The ``ref_gene`` prefix marks the reference gene
         # shared by consecutive lines; the ``ortholog`` prefix marks the
         # ortholog described by this line.
         self.ref_gene_uniprot_ID = self.a[1]
         self.ref_gene_db_name = self.c[0]
         # Take the last "="-separated part, exactly as for the ortholog ID
         # below, so that a value such as "MGI=MGI=97490" yields "97490"
         # rather than "MGI".
         self.ref_gene_db_id = self.c[-1]
         self.ortholog_db_name = self.g[0]
         self.ortholog_db_id = self.g[-1]
         self.ortholog_Uniprot_ID = self.i[1]
         self.ortholog_type = self.y[6]
         self.ortholog_pdb_family = self.y[8]
                    
 # Group consecutive lines that share a reference gene and print one record per
 # reference gene. Relies on the module-level `d`, `o`, `e` and on `Parser`.
 # The file is only read, so it is opened read-only instead of "r+".
 with open("testfile_PantherDB.txt", "r") as f:
     for line in f:
         line = Parser(line)
         # The ortholog entry has the same shape whichever branch is taken.
         new = {line.ortholog_db_name: line.ortholog_db_id,
                "UniProtKB": line.ortholog_Uniprot_ID,
                "Ortholog_type": line.ortholog_type,
                "panther_family": line.ortholog_pdb_family
                }

         if line.ref_gene_uniprot_ID in e:
             # Same reference gene as the previous line: just collect it.
             o.append(new)
             continue

         if e:
             # A different reference gene starts here, so the record built
             # so far is complete: attach its orthologs and output it.
             d["ortholog: "] = o
             print(d)
             e.clear()

         # Start a new record (this also covers the very first line).
         e.append(line.ref_gene_uniprot_ID)
         d = {"id": line.ref_gene_uniprot_ID,
              "pantherdb": {
                  line.ref_gene_db_name: line.ref_gene_db_id,
                  "UniProtKB": line.ref_gene_uniprot_ID,
              }
              }
         o = [new]

     # The last reference gene has no following line to trigger its output,
     # so emit it once the file is exhausted.
     if e:
         d["ortholog: "] = o
         print(d)
diff --git a/PantherDB_Parser_draft_6.py b/PantherDB_Parser_draft_6.py
 import re

 # Record for the reference gene currently being accumulated.
 d = {}
 # Orthologs collected so far for that reference gene.
 o = []
 # UniProt ID of the reference gene currently being accumulated; None until
 # the first line has been read.
 e = None

 # Group consecutive lines that share a reference gene and print one record per
 # reference gene. The file is only read, so open it read-only.
 with open("testfile_PantherDB.txt", "r") as f:
     for line in f:
         # A blank line has no fields and would raise IndexError below.
         if not line.strip():
             continue

         # Raw string: "\|" in a normal string literal is an invalid escape
         # sequence. Splits on "|", space, tab and newline.
         y = re.split(r"[| \t\n]", line)
         # Intermediates: each of these fields has the form "NAME=VALUE".
         z = re.split("=", y[2])
         a = re.split("=", y[1])
         b = re.split("=", y[4])
         c = re.split("=", y[5])
         # The values that end up in the output.
         ref_gene_uniprot_id = z[1]
         ref_gene_db_name = a[0]
         ref_gene_db_id = a[-1]   # last part, so "X=X=123" yields "123"
         ortholog_db_name = b[0]
         ortholog_db_id = b[-1]
         ortholog_uniprot_id = c[1]
         ortholog_type = y[6]
         ortholog_family = y[8]

         # The ortholog entry has the same shape whichever branch is taken.
         new = {ortholog_db_name: ortholog_db_id,
                "UniProtKB": ortholog_uniprot_id,
                "Ortholog_type": ortholog_type,
                "panther_family": ortholog_family
                }

         if ref_gene_uniprot_id == e:
             # Same reference gene as the previous line: just collect it.
             o.append(new)
             continue

         if e is not None:
             # A different reference gene starts here, so the record built
             # so far is complete: attach its orthologs and output it.
             d["ortholog: "] = o
             print(d)

         # Start a new record (this also covers the very first line).
         e = ref_gene_uniprot_id
         d = {"id": ref_gene_uniprot_id,
              "pantherdb": {
                  ref_gene_db_name: ref_gene_db_id,
                  "UniProtKB": ref_gene_uniprot_id,
              }
              }
         o = [new]

     # The last reference gene has no following line to trigger its output.
     if o:
         d["ortholog: "] = o
         print(d)
       
diff --git a/PantherDB_Parser_draft_7.py b/PantherDB_Parser_draft_7.py
 import re
 import os.path 

 def load_data(data_folder):
     """Yield one record per reference gene from ``testfile_PantherDB.txt``.

     Consecutive lines that share the same reference-gene UniProt ID are
     merged into a single record of the form::

         {"id": <ref UniProt ID>,
          "pantherdb": {<ref db name>: <ref db id>,
                        "uniprot_kb": <ref UniProt ID>,
                        "orthologs": [{<db name>: <db id>,
                                       "uniprot_kb": ...,
                                       "ortholog_type": ...,
                                       "panther_family": ...}, ...]}}

     NOTE(review): the field indices assume each line looks like
     ``SPECIES|DB=ID|UniProtKB=ACC<TAB>SPECIES|DB=ID|UniProtKB=ACC<TAB>type<TAB>...<TAB>family``
     and that lines for one reference gene are adjacent -- confirm against
     the real data file.

     Parameters
     ----------
     data_folder : str
         Directory that contains ``testfile_PantherDB.txt``.

     Yields
     ------
     dict
         One record per run of lines with the same reference gene. Each
         yielded dict is a fresh object that is not modified afterwards.

     Raises
     ------
     IndexError
         If a non-blank line has fewer fields than expected.
     """
     data_file = os.path.join(data_folder, "testfile_PantherDB.txt")

     # UniProt ID of the reference gene whose record is being built.
     current_id = None
     # The record being built, and the ortholog list stored inside it.
     record = None
     orthologs = []

     # The file is only read, so open it read-only instead of "r+".
     with open(data_file, "r") as f:
         for line in f:
             # A blank line has no fields and would raise IndexError below.
             if not line.strip():
                 continue

             # Raw string: "\|" in a normal string literal is an invalid
             # escape sequence. Splits on "|", space, tab and newline.
             fields = re.split(r"[| \t\n]", line)
             ref_db = fields[1].split("=")
             ortholog_db = fields[4].split("=")

             ref_gene_uniprot_id = fields[2].split("=")[1]
             ref_gene_db_name = ref_db[0]
             ref_gene_db_id = ref_db[-1]    # last part: "X=X=123" -> "123"
             ortholog_db_name = ortholog_db[0]
             ortholog_db_id = ortholog_db[-1]
             ortholog_uniprot_id = fields[5].split("=")[1]
             ortholog_type = fields[6]
             ortholog_family = fields[8]

             if ref_gene_uniprot_id != current_id:
                 # A new reference gene starts. The finished record is
                 # yielded with the id and orthologs it was built from;
                 # the original paired the *new* gene's id with the
                 # *previous* gene's orthologs and then cleared the
                 # yielded dict.
                 if record is not None:
                     yield record
                 current_id = ref_gene_uniprot_id
                 orthologs = []
                 record = {
                     "id": ref_gene_uniprot_id,
                     "pantherdb": {
                         ref_gene_db_name: ref_gene_db_id,
                         "uniprot_kb": ref_gene_uniprot_id,
                         "orthologs": orthologs,
                     },
                 }

             orthologs.append({
                 ortholog_db_name: ortholog_db_id,
                 "uniprot_kb": ortholog_uniprot_id,
                 "ortholog_type": ortholog_type,
                 "panther_family": ortholog_family,
             })

     # The last reference gene has no following line to trigger its output.
     if record is not None:
         yield record

 # The below code is what I used for testing whether my generator is working 
 # I opened the file named Test_folder containing 2 files, one is my test file
 # and the other is a "fake data file" that contains data with the same structure
 # Just to test if my parser can get the right file from the folder

 # Then, I feed the function with the directory to the test folder that contains 
 # both the right file and the fake file        
 # Manual smoke test: print every record produced by load_data.
 # The original compared the string literal "__name__" with "__main__", which
 # is never equal, so this body could not run.
 if __name__ == "__main__":
     import sys

     # `data_folder` was never defined here. Take it from the command line and
     # fall back to the folder name mentioned in the comments above.
     # NOTE(review): "Test_folder" is taken from that comment -- confirm.
     data_folder = sys.argv[1] if len(sys.argv) > 1 else "Test_folder"
     for i in load_data(data_folder):
         print(i)
	Data = [] # Create an empty dictionary
	import re
	with open("testfile_PantherDB.txt", "r+") as testfile:
	for line in testfile:
	# Creating the items for the reference gene
	# The four lines below creates 1a (name of the species specific database)
	# and 1b (the ID for that gene in the ss database
	split_list = re.split("[\| \t \n]", line)
	ESID= split_list [1]
	ESID_title_break = re.split("[=]", ESID)
	ESID_title_1a = ESID_title_break [0]
	ESID_ID_1b = ESID_title_break [1]
	# This one is for the uniprot ID
	UP = split_list [2]
	UP_break= re.split("[=]", UP)
	UP_2b = UP_break [1]
	# Create the items for the ortholog
	# The ss name and id of the ortholog
	ortholog_ss = split_list [4]
	ortholog_ss_break= re.split("[=]", ortholog_ss)
	ortholog_ss_title_4a= ortholog_ss_break[0]
	ortholog_ss_ID_4b = ortholog_ss_break[1]
	# The UniprotID of the ortholog
	ortholog_UP = split_list [5]
	ortholog_UP_break= re.split("[=]", ortholog_UP)
	ortholog_UP_5b = ortholog_UP_break [1]
	# The ortholog type
	ortholog_type = split_list [6]
	# The PantherDB Family
	PanDB_family = split_list [8]
	# Add the whole entry into the dictionary
	key_value_pair = {
	"id_" : UP_2b,
	ESID_title_1a : ESID_ID_1b,
	"UniProt_ID:": UP_2b,
	"Ortholog" : [
	{ortholog_ss_title_4a : ortholog_ss_ID_4b ,
	"UniProt_ID:" : ortholog_UP_5b,
	"Ortholog_type": ortholog_type,
	"PantherDB_family" : PanDB_family
	}]
	}
	Data.append(dict(key_value_pair))
	print(Data)

	#New_Data = [] # Create another new list
	#for Data [x] in Data: # iterate dictionaries in list
	#if y in Data [x] == z in Data [x + 1]: # if the value of the first key of two dictionaries in the list are the same
	# Append the dictionary of ortholog information (the value of the "Ortholog" key) of the second dictionary
	# to the list of ortholog info of the first dictionary.
	import re
	with open("testfile_PartnerDB.txt", "r+") as testfile:
	for line in testfile:
	print(next(search("O60524", "testfile_PantherDB.txt")))
	# Creating the items for the reference gene
	# The four lines below creates 1a (name of the species specific database)
	# and 1b (the ID for that gene in the ss database
	split_list = re.split("[\| \t \n]", line)
	ESID= split_list [1]
	ESID_title_break = re.split("[=]", ESID)
	ESID_title_1a = ESID_title_break [0]
	ESID_ID_1b = ESID_title_break [1]
	# This one is for the uniprot ID
	UP = split_list [2]
	UP_break= re.split("[=]", UP)
	UP_2b = UP_break [1]
	# The UPID keyword would be used for searching lines with common ref. gene
	# Create the items for the ortholog
	# The ss name and id of the ortholog
	ortholog_ss = split_list [4]
	ortholog_ss_break= re.split("[=]", ortholog_ss)
	ortholog_ss_title_4a= ortholog_ss_break[0]
	ortholog_ss_ID_4b = ortholog_ss_break[1]
	# The UniprotID of the ortholog
	ortholog_UP = split_list [5]
	ortholog_UP_break= re.split("[=]", ortholog_UP)
	ortholog_UP_5b = ortholog_UP_break [1]
	# The ortholog type
	ortholog_type = split_list [6]
	# The PantherDB Family
	PanDB_family = split_list [8]
	# The following is the generator that yield the final data structure
	if UP_2b in line:
	try:
	data_output
	# each output (final structure) would be stored in data_output
	# thus if this exist, only the ortholog needs to be added
	# else, a new data entry needs to be created.
	except NameError:
	data_output= {
	"id_" : UP_2b,
	EID_title_1a : ESID_ID_1b,
	"UniProt_ID:": UP_2b,
	"Ortholog" : [
	{ortholog_ss_title_4a : ortholog_ss_ID_4b ,
	"UniProt_ID:" : ortholog_UP_5b,
	"Ortholog_type": ortholog_type,
	"PantherDB_family" : PanDB_family
	}]
	}
	else:
	data_output.update({
	"Ortholog" : [
	{ortholog_ss_title_4a : ortholog_ss_ID_4b ,
	"UniProt_ID:" : ortholog_UP_5b,
	"Ortholog_type": ortholog_type,
	"PantherDB_family" : PanDB_family
	}]
	}

	else: # if no more item with common gene ref is found, yield the data_output and stop
	yield data_output
	continue
	import re
	with open("testfile_PantherDB.txt", "r+") as testfile:
	# 1. Get the first line
	for line in testfile:
	# 2. Get the gene and the ortholog
	split_list = re.split("[\| \t \n]", line)
	DB = split_list [1]
	DB_break = re.split("=", DB)
	DB_name = DB_break [0]
	DB_ID = DB_break [1]
	UP = split_list [2]
	UP_break= re.split("[=]", UP)
	UP_2b = UP_break [1]
	# Species specific (ss) DB name and ID of ortholog
	ortholog_ss = split_list [4]
	ortholog_ss_break= re.split("[=]", ortholog_ss)
	ortholog_ss_title= ortholog_ss_break[0]
	ortholog_ss_ID = ortholog_ss_break[1]
	# The UniprotID of the ortholog
	ortholog_UP = split_list [5]
	ortholog_UP_break= re.split("[=]", ortholog_UP)
	ortholog_UP_ID = ortholog_UP_break [1]
	# The ortholog type
	ortholog_type = split_list [6]
	# The PantherDB Family
	PanDB_family = split_list [8]
	# 3. Get the next line
	for line in testfile:
	# 4. if the two lines have the same gene, append
	if UP_2b in line:
	DB_next = split_list [1]
	DB_next_break = re.split("=", DB_next)
	DB_next_name = DB_next_break [0]
	DB_next_ID = DB_next_break [1]
	split_next = re.split("[\| \t \n]", line)
	ESID_next= split_next [1]
	ESID_title_next_break = re.split("[=]", ESID_next)
	ESID_title_next = ESID_title_next_break [0]
	ESID_ID_next = ESID_title_next_break [1]
	# This one is for the uniprot ID
	UP_next = split_next [2]
	UP_next_break= re.split("[=]", UP_next)
	UP_next_2b = UP_next_break [1]
	# The UPID keyword would be used for searching lines with common ref. gene
	# Create the items for the ortholog
	# The ss name and id of the ortholog
	ortholog_ss_next = split_next [4]
	ortholog_ss_break_next= re.split("[=]", ortholog_ss_next)
	ortholog_ss_title_next= ortholog_ss_break_next[0]
	ortholog_ss_ID_next = ortholog_ss_break_next[1]
	# The UniprotID of the ortholog
	ortholog_UP_next = split_next [5]
	ortholog_UP_break_next= re.split("[=]", ortholog_UP_next)
	ortholog_UP_next_ID = ortholog_UP_break_next [1]
	# The ortholog type
	ortholog_type_next = split_next [6]
	# The PantherDB Family
	PanDB_family_next = split_next [8]
	print(UP_next_2b, ortholog_UP_next_ID, ortholog_type_next,PanDB_family_next)
	# This is for checking if the gene info match, in case the UP_2b matches with the
	# ortholog info in the next line
	if UP_2b == UP_next_2b:
	dict_list = {
	"id" : UP_2b,
	"pantherdb" : {
	DB_name : DB_ID,
	"uniprot_id": UP_2b,
	}
	}
	ortholog = [{
	ortholog_ss_title: ortholog_ss_ID,
	"UniProtKB" : ortholog_UP_ID,
	"ortholog_type" : ortholog_type,
	"panther_family" : PanDB_family,
	}]
	ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next})

	dict_list["ortholog"] = ortholog
	# create a dictionary and append
	else:
	# 5. if not, yield
	print (dict_list)
	# Draft parser: reads "testfile_PantherDB.txt" line by line, splits each line into
	# reference-gene and ortholog fields, and collects ortholog dicts in `ortholog`.
	# NOTE(review): the block nesting of this draft cannot be recovered from the text as
	# stored here (every line sits at the same indent), so the notes below describe only
	# what the individual statements do.
	buffer = None # empty buffer
	ortholog = None # empty ortholog for later usage
	import re
	with open("testfile_PantherDB.txt", "r+") as testfile:
	for line in testfile:
	# Split the line on "|", space, tab and newline.
	# NOTE(review): "\|" inside a non-raw string literal is an unrecognised escape
	# sequence; Python keeps the backslash but newer versions emit a warning for it.
	split_list = re.split("[\| \t \n]", line)
	# Field 1 is the reference gene's "name=id" database entry.
	DB = split_list [1]
	DB_break = re.split("=", DB)
	DB_name = DB_break [0]
	DB_ID = DB_break [1]
	# Field 2 is the reference gene's UniProt entry; the part after "=" is kept.
	UP = split_list [2]
	UP_break= re.split("[=]", UP)
	UP_2b = UP_break [1]
	# Species specific (ss) DB name and ID of ortholog
	ortholog_ss = split_list [4]
	ortholog_ss_break= re.split("[=]", ortholog_ss)
	ortholog_ss_title= ortholog_ss_break[0]
	ortholog_ss_ID = ortholog_ss_break[1]
	# The UniprotID of the ortholog
	ortholog_UP = split_list [5]
	ortholog_UP_break= re.split("[=]", ortholog_UP)
	ortholog_UP_ID = ortholog_UP_break [1]
	# The ortholog type
	ortholog_type = split_list [6]
	# The PantherDB Family
	PanDB_family = split_list [8]
	# NOTE(review): UP_2b was cut out of this same `line` above, so this substring
	# test cannot be false when it is reached.
	if UP_2b in line:
	# NOTE(review): DB_next* and ESID_*_next are assigned but never read in this draft.
	DB_next = split_list [1]
	DB_next_break = re.split("=", DB_next)
	DB_next_name = DB_next_break [0]
	DB_next_ID = DB_next_break [1]
	# NOTE(review): this re-splits the current `line`; no following line is read, so
	# every *_next value below equals its counterpart parsed above.
	split_next = re.split("[\| \t \n]", line)
	ESID_next= split_next [1]
	ESID_title_next_break = re.split("[=]", ESID_next)
	ESID_title_next = ESID_title_next_break [0]
	ESID_ID_next = ESID_title_next_break [1]
	# This one is for the uniprot ID
	UP_next = split_next [2]
	UP_next_break= re.split("[=]", UP_next)
	UP_next_2b = UP_next_break [1]
	# The UPID keyword would be used for searching lines with common ref. gene
	# Create the items for the ortholog
	# The ss name and id of the ortholog
	ortholog_ss_next = split_next [4]
	ortholog_ss_break_next= re.split("[=]", ortholog_ss_next)
	ortholog_ss_title_next= ortholog_ss_break_next[0]
	ortholog_ss_ID_next = ortholog_ss_break_next[1]
	# The UniprotID of the ortholog
	ortholog_UP_next = split_next [5]
	ortholog_UP_break_next= re.split("[=]", ortholog_UP_next)
	ortholog_UP_next_ID = ortholog_UP_break_next [1]
	# The ortholog type
	ortholog_type_next = split_next [6]
	# The PantherDB Family
	PanDB_family_next = split_next [8]
	# This is for checking if the gene info match, in case the UP_2b matches with the
	# ortholog info in the next line
	# NOTE(review): UP and UP_next are both field 2 of the same line, so this
	# comparison always holds.
	if UP == UP_next:
	# Skeleton of the output record for the reference gene (no orthologs attached yet).
	d = {
	"id" : UP_2b,
	"pantherdb" : {
	DB_name : DB_ID,
	"uniprot_id": UP_2b,
	}
	}
	if buffer == None and ortholog == None: # if no dict, set one
	buffer = d
	ortholog = [{
	ortholog_ss_title: ortholog_ss_ID,
	"UniProtKB" : ortholog_UP_ID,
	"ortholog_type" : ortholog_type,
	"panther_family" : PanDB_family,
	}]
	# NOTE(review): the *_next values come from the same line, so this appends a second
	# copy of the entry that was just created.
	ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next})
	else: # if already have d, add item
	ortholog.append({ortholog_ss_title_next: ortholog_ss_ID_next,"UniProtKB" : ortholog_UP_next_ID,"ortholog_type" : ortholog_type_next,"panther_family" : PanDB_family_next})
	print(ortholog)
	# NOTE(review): presumably meant to run once the reference gene changes; with both
	# conditions above always true this branch looks unreachable. Confirm against the
	# intended nesting.
	else:
	d["ortholog"] = ortholog # only add the list at the last
	print(d)
	# NOTE(review): which block this `break` belongs to cannot be told from here.
	break
	import re

	# Module-level state shared with the grouping loop further down:
	#   d - the output record (dict) currently being assembled
	#   o - orthologs collected so far for the current reference gene
	#   e - holds the UniProt ID of the current reference gene, used for comparison
	d, o, e = dict(), list(), list()


	class Parser():
	    """Split one PantherDB ortholog line into reference-gene and ortholog fields.

	    The line is split on "|", space, tab and newline. From the resulting tokens:
	    token 1 is the reference gene's ``name=id`` database entry, token 2 its
	    UniProt entry, token 4 the ortholog's ``name=id`` database entry, token 5
	    the ortholog's UniProt entry, token 6 the ortholog type and token 8 the
	    PantherDB family.

	    Attributes with single-letter names (``x``, ``y``, ``z``, ``a``, ``b``,
	    ``c``, ``d``, ``g``, ``h``, ``i``) are intermediate split results; they are
	    kept so existing code reading them keeps working. The ``ref_gene_*`` and
	    ``ortholog_*`` attributes are the values meant for use.

	    Raises:
	        IndexError: If the line has fewer than nine tokens, or a UniProt
	            entry contains no "=".
	    """

	    # Separator set for a line: "|", space, tab, newline. Compiled once
	    # because a Parser is built for every line of the input file.
	    _SEPARATORS = re.compile("[\\| \t \n]")

	    def __init__(self, x):
	        self.x = x
	        self.y = self._SEPARATORS.split(self.x)
	        self.z = self.y[2]
	        self.a = self.z.split("=")
	        self.b = self.y[1]
	        self.c = self.b.split("=")
	        self.d = self.y[4]
	        self.g = self.d.split("=")
	        self.h = self.y[5]
	        self.i = self.h.split("=")
	        # Final products. The ref_gene_ prefix marks data of the common
	        # reference gene, the ortholog_ prefix marks data of its ortholog.
	        self.ref_gene_uniprot_ID = self.a[1]
	        self.ref_gene_db_name = self.c[0]
	        # Take the LAST piece, exactly like ortholog_db_id below: when the
	        # entry has the form DB=DB=ID, index 1 would return the repeated
	        # database name instead of the identifier.
	        self.ref_gene_db_id = self.c[-1]
	        self.ortholog_db_name = self.g[0]
	        self.ortholog_db_id = self.g[-1]
	        self.ortholog_Uniprot_ID = self.i[1]
	        self.ortholog_type = self.y[6]
	        self.ortholog_pdb_family = self.y[8]

	# Walk the file once, grouping consecutive lines that share a reference gene
	# (compared by UniProt ID) and printing one record per gene.
	# Opened with "r" rather than "r+": the file is only read, so write access
	# is not required.
	with open("testfile_PantherDB.txt", "r") as f:
	    for line in f:
	        if not line.strip():
	            continue  # a blank line has no fields for Parser to index
	        line = Parser(line)
	        # Ortholog described by this line; built once and used by every branch.
	        new = {
	            line.ortholog_db_name: line.ortholog_db_id,
	            "UniProtKB": line.ortholog_Uniprot_ID,
	            "Ortholog_type": line.ortholog_type,
	            "panther_family": line.ortholog_pdb_family,
	        }
	        if line.ref_gene_uniprot_ID in e:
	            # Same reference gene as the previous line: just collect the ortholog.
	            o.append(new)
	            continue
	        if e:
	            # The reference gene changed, so the previous record is complete.
	            d["ortholog: "] = o
	            print(d)
	            e.clear()
	        # Start a fresh record for the gene first seen on this line
	        # (this also covers the very first line, when e is still empty).
	        e.append(line.ref_gene_uniprot_ID)
	        d = {
	            "id": line.ref_gene_uniprot_ID,
	            "pantherdb": {
	                line.ref_gene_db_name: line.ref_gene_db_id,
	                "UniProtKB": line.ref_gene_uniprot_ID,
	            },
	        }
	        o = [new]

	# The last gene has no following line to trigger the output above, so its
	# record is printed here once the file is exhausted.
	if e:
	    d["ortholog: "] = o
	    print(d)
	import re
	import os.path

	def load_data(data_folder):
	    """Yield one record per reference gene parsed from the PantherDB test file.

	    Reads ``testfile_PantherDB.txt`` inside *data_folder*. Each line is split
	    on "|", space, tab and newline. Consecutive lines whose reference-gene
	    UniProt ID is the same are merged into a single record, so lines of one
	    gene must be adjacent in the file to end up together.

	    Args:
	        data_folder: Directory that contains ``testfile_PantherDB.txt``.

	    Yields:
	        dict: ``{"id": <ref UniProt ID>, "pantherdb": {<ref db name>: <ref db
	        id>, "uniprot_kb": <ref UniProt ID>, "orthologs": [...]}}``. Every
	        entry of ``"orthologs"`` is a dict with the keys ``<ortholog db
	        name>``, ``"uniprot_kb"``, ``"ortholog_type"`` and
	        ``"panther_family"``. Each yielded dict is a fresh object that is not
	        touched again after it has been handed to the caller.

	    Raises:
	        OSError: If the file cannot be opened.
	        IndexError: If a non-blank line has fewer than nine tokens, or a
	            UniProt entry contains no "=".
	    """
	    data_file = os.path.join(data_folder, "testfile_PantherDB.txt")
	    # Same separator set as before ("|", space, tab, newline), compiled once
	    # instead of on every line.
	    separators = re.compile(r"[| \t\n]")

	    # Record of the reference gene currently being collected; None until the
	    # first data line has been read.
	    record = None

	    # Opened read-only: nothing is written back to the file.
	    with open(data_file, "r") as f:
	        for line in f:
	            if not line.strip():
	                continue  # e.g. a trailing empty line; it carries no fields
	            y = separators.split(line)
	            z = y[2].split("=")
	            a = y[1].split("=")
	            b = y[4].split("=")
	            c = y[5].split("=")
	            # The above are only intermediates; the values below are used.
	            ref_gene_uniprot_id = z[1]
	            ref_gene_db_name = a[0]
	            ref_gene_db_id = a[-1]  # last piece: the id may be "DB=DB=ID"
	            ortholog = {
	                b[0]: b[-1],
	                "uniprot_kb": c[1],
	                "ortholog_type": y[6],
	                "panther_family": y[8],
	            }

	            if record is not None and record["id"] == ref_gene_uniprot_id:
	                # Same reference gene as the previous line: extend its list.
	                record["pantherdb"]["orthologs"].append(ortholog)
	                continue

	            if record is not None:
	                # A different reference gene starts here. Emit the finished
	                # record, which still carries the PREVIOUS gene's id and
	                # database entry together with its own orthologs.
	                yield record

	            # Begin a new record; a new dict is created so the one already
	            # yielded is never cleared or modified.
	            record = {
	                "id": ref_gene_uniprot_id,
	                "pantherdb": {
	                    ref_gene_db_name: ref_gene_db_id,
	                    "uniprot_kb": ref_gene_uniprot_id,
	                    "orthologs": [ortholog],
	                },
	            }

	    # The last gene has no following line to trigger the yield above.
	    if record is not None:
	        yield record

	# Manual check that the generator works.
	# It was tried against a folder named Test_folder holding two files: the real
	# test file and a "fake data file" with the same structure, to confirm that
	# the parser picks the right file out of the folder.
	#
	# The folder to read is taken from the first command-line argument and falls
	# back to the current directory when none is given.
	if __name__ == "__main__":  # compare the module variable, not the string "__name__"
	    import sys

	    data_folder = sys.argv[1] if len(sys.argv) > 1 else "."
	    for i in load_data(data_folder):
	        print(i)