AnnaTSW0609 · February 9, 2019 07:23
diff --git a/first_prototype_grammar_checker.py b/first_prototype_grammar_checker.py
 import re 
 import os.path
 import time 

 # start = time. time()


 def MeSH_DB(data_folder):
     """Yield every MeSH term listed in ``MeSH_keywords.txt`` inside *data_folder*.

     Each data line is read as "<tree number> <term words...>". The title
     line (starting with ``TREE_NUMBER``), the ``----`` ruler line and blank
     lines are ignored.

     Source of the data (presumably what MeSH_keywords.txt was saved from):
     https://www.nlm.nih.gov/mesh/2019/download/2019New_Mesh_Tree_Hierarchy.txt
     TODO: can we fetch this through an API instead of a local file?

     Args:
         data_folder: Directory that contains ``MeSH_keywords.txt``.

     Yields:
         str: The term of each entry, with the leading tree number removed,
         runs of whitespace collapsed to single spaces and no trailing
         newline.

     Raises:
         OSError: If the keyword file cannot be opened.
     """
     data_file = os.path.join(data_folder, "MeSH_keywords.txt")

     # The file is only read, so plain "r" is enough; "r+" would also demand
     # write permission and fail on read-only files.
     with open(data_file, "r") as f:
         for line in f:
             # Skip the title line, the ----- ruler and empty lines. This must
             # be `continue`: calling next(f) here would also throw away the
             # entry that follows, and would raise StopIteration at end of file.
             if line.startswith(("TREE_NUMBER", "----")) or not line.strip():
                 continue

             # split() without arguments drops empty fields and the trailing
             # newline; the first field is the tree number, the rest the term.
             term_words = line.split()[1:]
             if not term_words:
                 # A tree number with no term: yielding "" would make every
                 # later substring search match, so skip the line instead.
                 continue

             yield " ".join(term_words)
               
                      
          # print (MeSH_list) # takes around 1.5s slower than the generator function 
                      
 # Lazily stream the MeSH terms from the keyword file in the project folder.
 generator = MeSH_DB("/Users/annatswater/Desktop/ChallengeCup_2019")

 # Materialise the terms into a list: the search further down walks them once
 # per input line, which a one-shot generator could not support.
 x = [*generator]


 # end = time. time()
 # print(end - start)


 # Multiline input in GUI: https://stackoverflow.com/questions/9661854/how-to-create-a-multiline-entry-with-tkinter
 # Iteration through list and file: https://stackoverflow.com/questions/51297805/in-python-searching-a-text-file-for-multiple-words-and-printing-the-correspondi


 # The next function needs to accept a chunk of text from a textbox
 # (set the textbox size to be large).
 # If a word matches the "MeSH_term" value in the dictionary (a list of dictionaries),
 # change its color to red,
 # or at least count the number of times the term occurs.

 # Print every line of the input text that contains at least one MeSH term.
 # The file is only scanned, so it is opened read-only; "r+" would also
 # require write permission and fail on a read-only file.
 with open("/Users/annatswater/Desktop/ChallengeCup_2019/new_text_file.txt", "r") as input_file:
     for line in input_file:
         # Plain substring test against each term in x; any() stops at the first hit.
         if any(word in line for word in x):
             # Only the line can be printed: `word` exists solely inside the
             # generator expression above and is undefined here.
             print(line)
        
         
          # or highlight each word, then count the number of highlighted words and do cluster counts
          # but how can I do this now that I have ruined the formatting?
         
         
 # take each line as a string 
 # find the keyword in each line

 #for line in fruit_list:
    #if any(word in line for word in search_words):
        #print(line)


     
            
 # https://stackoverflow.com/questions/18366554/how-to-search-for-word-in-text-file-and-print-part-of-line-with-python
	import re
	import os.path
	import time

	# start = time. time()


	def MeSH_DB(data_folder):
	    """Yield every MeSH term listed in ``MeSH_keywords.txt`` inside *data_folder*.

	    Each data line is read as "<tree number> <term words...>". The title
	    line (starting with ``TREE_NUMBER``), the ``----`` ruler line and blank
	    lines are ignored.

	    Source of the data (presumably what MeSH_keywords.txt was saved from):
	    https://www.nlm.nih.gov/mesh/2019/download/2019New_Mesh_Tree_Hierarchy.txt
	    TODO: can we fetch this through an API instead of a local file?

	    Args:
	        data_folder: Directory that contains ``MeSH_keywords.txt``.

	    Yields:
	        str: The term of each entry, with the leading tree number removed,
	        runs of whitespace collapsed to single spaces and no trailing
	        newline.

	    Raises:
	        OSError: If the keyword file cannot be opened.
	    """
	    data_file = os.path.join(data_folder, "MeSH_keywords.txt")

	    # The file is only read, so plain "r" is enough; "r+" would also demand
	    # write permission and fail on read-only files.
	    with open(data_file, "r") as f:
	        for line in f:
	            # Skip the title line, the ----- ruler and empty lines. This must
	            # be `continue`: calling next(f) here would also throw away the
	            # entry that follows, and would raise StopIteration at end of file.
	            if line.startswith(("TREE_NUMBER", "----")) or not line.strip():
	                continue

	            # split() without arguments drops empty fields and the trailing
	            # newline; the first field is the tree number, the rest the term.
	            term_words = line.split()[1:]
	            if not term_words:
	                # A tree number with no term: yielding "" would make every
	                # later substring search match, so skip the line instead.
	                continue

	            yield " ".join(term_words)


	# print (MeSH_list) # takes around 1.5s slower than the generator function

	# Lazily stream the MeSH terms from the keyword file in the project folder.
	generator = MeSH_DB("/Users/annatswater/Desktop/ChallengeCup_2019")

	# Materialise the terms into a list: the search further down walks them once
	# per input line, which a one-shot generator could not support.
	x = [*generator]


	# end = time. time()
	# print(end - start)


	# Multiline input in GUI: https://stackoverflow.com/questions/9661854/how-to-create-a-multiline-entry-with-tkinter
	# Iteration through list and file: https://stackoverflow.com/questions/51297805/in-python-searching-a-text-file-for-multiple-words-and-printing-the-correspondi


	# The next function needs to accept a chunk of text from a textbox
	# (set the textbox size to be large).
	# If a word matches the "MeSH_term" value in the dictionary (a list of dictionaries),
	# change its color to red,
	# or at least count the number of times the term occurs.

	# Print every line of the input text that contains at least one MeSH term.
	# The file is only scanned, so it is opened read-only; "r+" would also
	# require write permission and fail on a read-only file.
	with open("/Users/annatswater/Desktop/ChallengeCup_2019/new_text_file.txt", "r") as input_file:
	    for line in input_file:
	        # Plain substring test against each term in x; any() stops at the first hit.
	        if any(word in line for word in x):
	            # Only the line can be printed: `word` exists solely inside the
	            # generator expression above and is undefined here.
	            print(line)


	# or highlight each word, then count the number of highlighted words and do cluster counts
	# but how can I do this now that I have ruined the formatting?


	# take each line as a string
	# find the keyword in each line

	#for line in fruit_list:
	#if any(word in line for word in search_words):
	#print(line)




	# https://stackoverflow.com/questions/18366554/how-to-search-for-word-in-text-file-and-print-part-of-line-with-python