Skip to content

Instantly share code, notes, and snippets.

@AnnaTSW0609
Last active February 9, 2019 07:23
Show Gist options
  • Save AnnaTSW0609/4deeb7d2245c2c00a955eba95e880132 to your computer and use it in GitHub Desktop.
Save AnnaTSW0609/4deeb7d2245c2c00a955eba95e880132 to your computer and use it in GitHub Desktop.
import re
import os.path
import time
# start = time. time()
def MeSH_DB(data_folder):
    """Yield each MeSH term found in ``MeSH_keywords.txt`` inside *data_folder*.

    The file is assumed to follow the layout of the NLM tree-hierarchy dump
    (https://www.nlm.nih.gov/mesh/2019/download/2019New_Mesh_Tree_Hierarchy.txt):
    a ``TREE_NUMBER`` title line, a ``----`` rule, then one entry per line
    consisting of a tree number followed by the term.
    # TODO: can we fetch this through an API instead of a local file?

    Args:
        data_folder: Directory that contains ``MeSH_keywords.txt``.

    Yields:
        The term of each entry as a single string: words joined by one space,
        without the leading tree number and without a trailing newline.
        Title, rule and blank lines are skipped, as are lines that carry a
        tree number but no term.

    Raises:
        OSError: If the keyword file cannot be opened for reading.
    """
    data_file = os.path.join(data_folder, "MeSH_keywords.txt")
    # Read-only access is enough; "r+" would needlessly require write permission.
    with open(data_file, "r", encoding="utf-8") as f:
        for line in f:
            # Skip the title line and the "-----" rule. `continue` (rather than
            # next(f)) makes sure the line that follows is still processed.
            if line.startswith(("TREE_NUMBER", "----")):
                continue
            # split() drops the trailing newline and collapses runs of whitespace.
            fields = line.split()
            # Blank lines give no fields; a lone tree number has no term to yield.
            if len(fields) < 2:
                continue
            # fields[0] is the tree number; everything after it is the term.
            yield " ".join(fields[1:])
# print (MeSH_list) # takes around 1.5s slower than the generator function
# Read every MeSH term into memory once. The terms are kept in a list because
# the search further down walks through them again for each input line; a bare
# generator would already be used up after the first line.
generator = MeSH_DB("/Users/annatswater/Desktop/ChallengeCup_2019")
x = [*generator]
# end = time. time()
# print(end - start)
# Multiline input in GUI: https://stackoverflow.com/questions/9661854/how-to-create-a-multiline-entry-with-tkinter
# Iteration through list and file: https://stackoverflow.com/questions/51297805/in-python-searching-a-text-file-for-multiple-words-and-printing-the-correspondi
# The next function needs to accept a chunk of text from a textbox
# (set the textbox size to be big).
# If a word matches the value of "MeSH_term" in an entry (a list of dictionaries),
# change its color to red,
# or at least count how many times the term occurs.
# Scan the input text and echo every line that contains at least one MeSH term.
# The file is only read, so it is opened read-only; "r+" would fail on files
# that are not writable even though nothing is ever written.
with open("/Users/annatswater/Desktop/ChallengeCup_2019/new_text_file.txt", "r") as input_file:
    for line in input_file:
        # Plain substring test against every term in x.
        if any(word in line for word in x):
            # `word` is local to the generator expression above, so the term
            # that matched is not available here; only the whole line is printed.
            print(line)
# or highlight each word, then count the number of highlighted words and do cluster counts
# but how can I do this now that I have ruined the formatting?
# take each line as a string
# find the keyword in each line
#for line in fruit_list:
#if any(word in line for word in search_words):
#print(line)
# https://stackoverflow.com/questions/18366554/how-to-search-for-word-in-text-file-and-print-part-of-line-with-python
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment