Last active
February 9, 2019 07:23
-
-
Save AnnaTSW0609/4deeb7d2245c2c00a955eba95e880132 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os.path | |
import time | |
# start = time. time() | |
def MeSH_DB(data_folder):
    """Yield every MeSH heading listed in ``MeSH_keywords.txt``.

    The file is expected to follow the layout of the NLM tree-hierarchy
    download
    (https://www.nlm.nih.gov/mesh/2019/download/2019New_Mesh_Tree_Hierarchy.txt):
    a title line starting with ``TREE_NUMBER``, a ruler line starting with
    ``----``, then one entry per line made of a tree number followed by the
    heading text.

    Args:
        data_folder: Directory that contains ``MeSH_keywords.txt``.

    Yields:
        The heading of each entry as a string: the leading tree number is
        dropped, runs of whitespace are collapsed to single spaces, and
        there is no trailing newline.

    Raises:
        OSError: If the keyword file cannot be opened for reading.
    """
    # TODO: check whether the headings can be fetched through an API instead
    # of a local copy of the download.
    data_file = os.path.join(data_folder, "MeSH_keywords.txt")

    # Read-only mode: nothing is written back, so write permission on the
    # file must not be required.
    with open(data_file, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines, the title line and the ----- ruler.
            # `continue` (not `next(f)`) is used because the for loop already
            # advances the file; calling next() here would silently drop the
            # following entry and blow up if the skipped line were the last.
            if not line.strip() or line.startswith(("TREE_NUMBER", "----")):
                continue

            # split() with no argument splits on any whitespace run, which
            # also removes the trailing newline that would otherwise stay
            # attached to the last word of the heading.
            _tree_number, *term_words = line.split()
            if not term_words:
                # A tree number with no heading carries nothing to search for.
                continue

            yield " ".join(term_words)
# print (MeSH_list) # takes around 1.5s slower than the generator function | |
# Load every MeSH heading from the challenge data folder.
generator = MeSH_DB("/Users/annatswater/Desktop/ChallengeCup_2019")
# Materialise into a list: the terms are re-scanned for every input line
# below, and a generator can only be consumed once.
x = list(generator)
# end = time. time()
# print(end - start)
# Multiline input in GUI: https://stackoverflow.com/questions/9661854/how-to-create-a-multiline-entry-with-tkinter
# Iteration through list and file: https://stackoverflow.com/questions/51297805/in-python-searching-a-text-file-for-multiple-words-and-printing-the-correspondi
# TODO: the next step should accept a chunk of text from a (large) textbox;
# when part of it matches a MeSH term, colour the match red, or at least
# count how many times the term occurs.

# Print every line of the input text that contains at least one MeSH term.
# The file is only read, so it is opened read-only rather than "r+".
with open("/Users/annatswater/Desktop/ChallengeCup_2019/new_text_file.txt", "r") as input_file:
    for line in input_file:
        # `word` lives only inside the generator expression, so the matching
        # term itself is not available here; only the whole line is printed.
        if any(word in line for word in x):
            print(line)
# or highlight each word, and then count numbers of highlighted words and then do cluster counts | |
# but how can i do this now that I have ruined the formatting | |
# take each line as a string | |
# find the keyword in each line | |
#for line in fruit_list: | |
#if any(word in line for word in search_words): | |
#print(line) | |
# https://stackoverflow.com/questions/18366554/how-to-search-for-word-in-text-file-and-print-part-of-line-with-python |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment