Last active
January 8, 2022 10:39
-
-
Save deshwalmahesh/69d9d3a5fdf2192f2b280eca4b56d8f1 to your computer and use it in GitHub Desktop.
Clean the LaTeX. Use a dict with the tokenization method for faster execution.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def process(sentence, use_lemmetization: bool = False, use_stemming: bool = False,
            add_pos: bool = False, remove_length: int = 0):
    """
    Clean a raw (possibly MathPix/LaTeX flavoured) sentence and return a
    space-joined string of lower-cased alphabetic tokens.

    Args:
        sentence: input text; anything that is not a str (e.g. np.nan) yields ''.
        use_lemmetization: lemmatize each token (uses the module-level `lemmetizer`).
        use_stemming: stem each token (uses the module-level `stemmer`).
        add_pos: append "_<POS>" to every token via nltk.pos_tag.
        remove_length: when truthy, drop tokens whose length is <= this value.

    Returns:
        The cleaned sentence, or '' for non-string input / empty results.
    """
    # BUG FIX: the original signature read `use_lemmetization: False` etc. —
    # `False` was an *annotation*, not a default value, so all the flags were
    # required positional arguments. They are now real keyword defaults.
    if not isinstance(sentence, str):
        return ''
    a = sentence.lower()                      # case-fold first
    a = re.sub(r"\n+", " ", a)                # newlines -> single space
    # Drop MathPix math delimited by \( ... \) but keep the words inside \text{...}
    a = re.sub(r'\s*\\+\((.*?)\\+\)',
               lambda x: " ".join(re.findall(r'\\text\s*{([^{}]*)}', x.group(1))), a)
    # Strip any remaining \command tokens (old MathPix API leftovers)
    a = re.sub(r"(\\[^ ]+)", ' ', a)
    a = re.sub(r'[^a-zA-Z]', ' ', a)          # keep letters only
    a = re.sub(r"\s+", " ", a).strip()        # squeeze and trim whitespace
    if not a:
        return ''
    tokens = a.split(' ')
    if use_lemmetization:
        tokens = [lemmetizer.lemmatize(token) for token in tokens]
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    if add_pos:
        tokens = [token + "_" + tag for token, tag in nltk.pos_tag(tokens)]
    if remove_length:
        tokens = [x for x in tokens if len(x) > remove_length]
    return ' '.join(tokens)
def convert_numbers(x):
    """Mask digit runs with '#' markers: 5-or-more digits -> '#####',
    4 -> '####', 3 -> '###', 2 -> '##'; single digits are left untouched."""
    if re.search(r'\d', x) is None:           # nothing numeric -> unchanged
        return x
    # Longest runs must be masked first so shorter patterns cannot split them.
    for pattern, mask in (('[0-9]{5,}', '#####'),
                          ('[0-9]{4}', '####'),
                          ('[0-9]{3}', '###'),
                          ('[0-9]{2}', '##')):
        x = re.sub(pattern, mask, x)
    return x
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Convert the LaTeX parts of a string to plain/unicode text,
    e.g. ``\\frac {3 x } {2 }`` -> ``3 x / 2``; non-LaTeX characters pass through.

    Args:
        input_str: text mixing LaTeX and plain characters; non-str input yields ''.
        to_unicode: run the string through pylatexenc's LatexNodes2Text.
        remove_numbers: strip all digits (they add little signal for
            classification).

    Returns:
        The converted string, lower-cased and with all spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    # BUG FIX: the original pattern was r'\triangle' — the regex engine reads
    # \t as a TAB escape there, so it matched "<tab>riangle" instead of the
    # LaTeX \triangle command. The backslash must be doubled.
    input_str = re.sub(r'\\triangle', chr(9651), input_str)   # △
    input_str = re.sub(r'\\frac', '/', input_str)
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # Map control characters back to their raw-string escapes so the LaTeX
        # parser does not choke on them; protect real newlines with a sentinel.
        raw_map = {8: r'\b', 7: r'\a', 12: r'\f', 10: r'\n', 13: r'\r', 9: r'\t', 11: r'\v'}
        input_str = input_str.replace("\n", " !#! ")
        input_str = LatexNodes2Text().latex_to_text(
            ''.join(i if ord(i) > 32 else raw_map.get(ord(i), i) for i in input_str))
        input_str = input_str.replace(" !#! ", "\n")
    return input_str.replace(' ', '').lower()
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Map LaTeX commands in `input_str` to single unicode stand-in characters,
    then flatten what remains with pylatexenc. Non-str input (plain ints, or
    np.nan from number-only cells) yields ''. The result is lower-cased with
    all newlines and spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # (pattern, stand-in) pairs; ORDER MATTERS — e.g. \sinh must be
        # replaced before \sin so the shorter command never clobbers it.
        replacements = (
            (r'\\bigodot', chr(416)),         # distinctive O-like glyph
            (r'\\hline', chr(713)),           # horizontal line
            (r'\\overline', chr(727)),        # overbar
            (r'\\underline', chr(717)),
            (r'\\overbrace', chr(752)),       # upward pointer
            (r'\\underbrace', chr(751)),      # downward pointer
            (r'\\overrightarrow', chr(754)),  # rightward pointer
            (r'\\longdiv', chr(10188)),       # long division
            (r'\\jmath', chr(567)),           # dotless j
            (r'\\imath', chr(305)),           # dotless i
            (r'\\sqrt', chr(8730)),
            (r'\\Re', chr(344)),              # R-like symbol
            (r'\\triangle', chr(9651)),
            (r'\\frac', '/'),
            (r'\\widetilde', chr(771)),       # plain \tilde would also be 771
            (r'\\widehat', chr(770)),         # plain \hat would also be 770
            (r'\\Varangle', chr(8736)),       # angle
            (r'\\neg', chr(172)),             # negation sign
            (r'\\begin', chr(705)),           # arbitrary begin marker
            (r'\\end', chr(704)),             # arbitrary end marker
            (r'\\min', chr(707)),
            (r'\\max', chr(706)),
            (r'\\exp', chr(281)),             # exponential
            (r'\\lg', chr(315)),              # binary logarithm
            (r'\\ln', chr(317)),              # natural logarithm
            (r'\\log', chr(319)),             # base-10 logarithm
            (r'\\lim', chr(321)),             # limit
            (r'\\arg', chr(478)),             # arbitrary arg marker
            (r'\\S$', chr(167)),              # section sign; $-anchored so it
                                              # cannot eat the start of \Sigma
            # trigonometric functions (h-variants first!)
            (r'\\sinh', chr(525)),
            (r'\\sin', chr(524)),
            (r'\\cosh', chr(527)),
            (r'\\cos', chr(526)),
            (r'\\tanh', chr(555)),
            (r'\\tan', chr(554)),
            (r'\\cot', chr(556)),
            (r'\\sec', chr(557)),
            (r'\\csc', chr(558)),
            (r'\\arcsin', chr(559)),
            (r'\\arccos', chr(560)),
            (r'\\arctan', chr(561)),
        )
        for pattern, stand_in in replacements:
            input_str = re.sub(pattern, stand_in, input_str)
        input_str = LatexNodes2Text().latex_to_text(input_str)
    return input_str.replace('\n', '').replace(' ', '').lower()
def top_n(pipeline, x_test, y_test, n = 5):
    """Top-n accuracy: the fraction of samples whose true label index appears
    among the `n` highest-probability classes predicted by `pipeline`."""
    probabilities = pipeline.predict_proba(x_test)
    top_indices = np.argsort(probabilities, axis=1)[:, -n:]
    hits = [1 if y_test[row] in top_indices[row] else 0
            for row in range(len(top_indices))]
    return np.mean(np.array(hits))
def get_coefs(word, *arr):
    """Split one embedding-file row into (word, float32 numpy vector)."""
    return word, np.asarray(arr, dtype='float32')
def load_embedding_dict(path: str, skip_first_line: bool = False):
    '''
    Load word embeddings from a text file into a dictionary.

    args:
        path: path to the embedding file; each line is "<word> <v1> <v2> ...".
        skip_first_line: some formats (e.g. word2vec) start with a
            "<vocab_size> <dimension>" header line that must be skipped.
    out:
        dict mapping each word to its float32 numpy vector,
        e.g. {'word': array([0.3, 0.1, ...], dtype=float32)}.
    '''
    # Stream the file line by line — the original readlines() materialised the
    # whole (often multi-GB) embedding file in memory before parsing it.
    embeddings = {}
    with open(path) as f:
        if skip_first_line:
            next(f, None)                     # drop the header row
        for line in f:
            parts = line.rstrip('\n').split(" ")
            if len(parts) < 2:                # skip blank / malformed lines
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings
def sentence_to_mean_vect(sentence: str):
    '''
    Mean-pool the word vectors of every space-separated token in `sentence`.

    Unknown words contribute a zero vector of dimension 300 — assumes the
    loaded embeddings are 300-dimensional; TODO confirm against the
    module-level `embeddings_dict` this function reads.
    '''
    feat = []
    for word in sentence.split(' '):
        vect = embeddings_dict.get(word)
        # The original abused a conditional *expression* for its side effect
        # (`feat.append(a) if cond else feat.append(b)`); pick the value
        # first, then append once.
        feat.append(vect if vect is not None else np.zeros(300))
    return np.mean(feat, axis=0)
def cross_entropy(predictions, targets):
    """Mean categorical cross-entropy: -(1/N) * sum(targets * log(predictions)),
    where N is the number of rows (samples) in `predictions`."""
    n_samples = predictions.shape[0]
    return -np.sum(targets * np.log(predictions)) / n_samples
import pandas as pd | |
import re | |
import numpy as np | |
def fetch_old_data(path, subject):
    """Read the legacy CSV at `path`, keep only rows for `subject`, rename the
    old column names to the new schema (text/_id/CHAPTER/SUBJECT) and drop the
    two trailing columns."""
    frame = pd.read_csv(path)
    frame = frame[frame['Subject'] == subject]
    frame = frame.rename(columns={'eng': 'text', 'q_id': '_id',
                                  'chapter': 'CHAPTER', 'Subject': 'SUBJECT'})
    return frame.iloc[:, :-2]
def new_data_merge(path, sheet_num):
    # Load one sheet of the new-format Excel dump and normalise it to the
    # common schema (text / _id / CHAPTER / SUBJECT / question_url).
    # NOTE(review): depends on the module-level `ref` mapping (chapter label ->
    # NCERT chapter name) and on pandas — confirm both are in scope at call time.
    bio = pd.read_excel(path, sheet_num)
    bio.rename(columns = {'chapter':'CHAPTER', 'Subject':'SUBJECT','crop_url':'question_url'}, inplace = True)
    if '_id' not in bio.columns:
        # Derive the question id from the URL (second-to-last path segment).
        bio['_id'] = bio['question_url'].apply(lambda x: x.split('/')[-2])
    bio.drop_duplicates(subset=['text'],inplace=True)
    bio = bio[~((bio['text'].isna())&(bio['latex'].isna()))] # Drop rows where both text and latex are empty
    bio = bio[~bio['Done on'].isna()] # Keep only rows already annotated ('Done on' filled in)
    bio = bio[bio['Problems? notPCMB, noText, can\'tPredictChapter'].isna()] # Keep only problem-free rows
    bio['CHAPTER'] = bio['CHAPTER'].apply(lambda x: ref[x]) # Map to NCERT chapter names; raises KeyError on unknown chapters
    # Manual corrections override the mapped values. The trailing space in
    # 'Correct Chapter ' is real in the sheet — do not "fix" it here.
    for index in bio.index:
        if not pd.isna(bio.loc[index,'Correct Chapter ']):
            bio.loc[index,'CHAPTER'] = bio.loc[index,'Correct Chapter ']
        if not pd.isna(bio.loc[index,'Correct Subject']):
            bio.loc[index,'SUBJECT'] = bio.loc[index,'Correct Subject']
    return bio
def process(sentence, use_lemmetization: bool = False, use_stemming: bool = False,
            add_pos: bool = False, remove_length: int = 0):
    """
    Clean a raw (possibly MathPix/LaTeX flavoured) sentence and return a
    space-joined string of lower-cased alphabetic tokens.

    Args:
        sentence: input text; anything that is not a str (e.g. np.nan) yields ''.
        use_lemmetization: lemmatize each token (uses the module-level `lemmetizer`).
        use_stemming: stem each token (uses the module-level `stemmer`).
        add_pos: append "_<POS>" to every token via nltk.pos_tag.
        remove_length: when truthy, drop tokens whose length is <= this value.

    Returns:
        The cleaned sentence, or '' for non-string input / empty results.
    """
    # BUG FIX: the original signature read `use_lemmetization: False` etc. —
    # `False` was an *annotation*, not a default value, so all the flags were
    # required positional arguments. They are now real keyword defaults.
    if not isinstance(sentence, str):
        return ''
    a = sentence.lower()                      # case-fold first
    a = re.sub(r"\n+", " ", a)                # newlines -> single space
    # Drop MathPix math delimited by \( ... \) but keep the words inside \text{...}
    a = re.sub(r'\s*\\+\((.*?)\\+\)',
               lambda x: " ".join(re.findall(r'\\text\s*{([^{}]*)}', x.group(1))), a)
    # Strip any remaining \command tokens (old MathPix API leftovers)
    a = re.sub(r"(\\[^ ]+)", ' ', a)
    a = re.sub(r'[^a-zA-Z]', ' ', a)          # keep letters only
    a = re.sub(r"\s+", " ", a).strip()        # squeeze and trim whitespace
    if not a:
        return ''
    tokens = a.split(' ')
    if use_lemmetization:
        tokens = [lemmetizer.lemmatize(token) for token in tokens]
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    if add_pos:
        tokens = [token + "_" + tag for token, tag in nltk.pos_tag(tokens)]
    if remove_length:
        tokens = [x for x in tokens if len(x) > remove_length]
    return ' '.join(tokens)
def convert_numbers(x):
    """Mask digit runs with '#' markers: 5-or-more digits -> '#####',
    4 -> '####', 3 -> '###', 2 -> '##'; single digits are left untouched."""
    if re.search(r'\d', x) is None:           # nothing numeric -> unchanged
        return x
    # Longest runs must be masked first so shorter patterns cannot split them.
    for pattern, mask in (('[0-9]{5,}', '#####'),
                          ('[0-9]{4}', '####'),
                          ('[0-9]{3}', '###'),
                          ('[0-9]{2}', '##')):
        x = re.sub(pattern, mask, x)
    return x
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Convert the LaTeX parts of a string to plain/unicode text,
    e.g. ``\\frac {3 x } {2 }`` -> ``3 x / 2``; non-LaTeX characters pass through.

    Args:
        input_str: text mixing LaTeX and plain characters; non-str input yields ''.
        to_unicode: run the string through pylatexenc's LatexNodes2Text.
        remove_numbers: strip all digits (they add little signal for
            classification).

    Returns:
        The converted string, lower-cased and with all spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    # BUG FIX: the original pattern was r'\triangle' — the regex engine reads
    # \t as a TAB escape there, so it matched "<tab>riangle" instead of the
    # LaTeX \triangle command. The backslash must be doubled.
    input_str = re.sub(r'\\triangle', chr(9651), input_str)   # △
    input_str = re.sub(r'\\frac', '/', input_str)
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # Map control characters back to their raw-string escapes so the LaTeX
        # parser does not choke on them; protect real newlines with a sentinel.
        raw_map = {8: r'\b', 7: r'\a', 12: r'\f', 10: r'\n', 13: r'\r', 9: r'\t', 11: r'\v'}
        input_str = input_str.replace("\n", " !#! ")
        input_str = LatexNodes2Text().latex_to_text(
            ''.join(i if ord(i) > 32 else raw_map.get(ord(i), i) for i in input_str))
        input_str = input_str.replace(" !#! ", "\n")
    return input_str.replace(' ', '').lower()
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Map LaTeX commands in `input_str` to single unicode stand-in characters,
    then flatten what remains with pylatexenc. Non-str input (plain ints, or
    np.nan from number-only cells) yields ''. The result is lower-cased with
    all newlines and spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # (pattern, stand-in) pairs; ORDER MATTERS — e.g. \sinh must be
        # replaced before \sin so the shorter command never clobbers it.
        replacements = (
            (r'\\bigodot', chr(416)),         # distinctive O-like glyph
            (r'\\hline', chr(713)),           # horizontal line
            (r'\\overline', chr(727)),        # overbar
            (r'\\underline', chr(717)),
            (r'\\overbrace', chr(752)),       # upward pointer
            (r'\\underbrace', chr(751)),      # downward pointer
            (r'\\overrightarrow', chr(754)),  # rightward pointer
            (r'\\longdiv', chr(10188)),       # long division
            (r'\\jmath', chr(567)),           # dotless j
            (r'\\imath', chr(305)),           # dotless i
            (r'\\sqrt', chr(8730)),
            (r'\\Re', chr(344)),              # R-like symbol
            (r'\\triangle', chr(9651)),
            (r'\\frac', '/'),
            (r'\\widetilde', chr(771)),       # plain \tilde would also be 771
            (r'\\widehat', chr(770)),         # plain \hat would also be 770
            (r'\\Varangle', chr(8736)),       # angle
            (r'\\neg', chr(172)),             # negation sign
            (r'\\begin', chr(705)),           # arbitrary begin marker
            (r'\\end', chr(704)),             # arbitrary end marker
            (r'\\min', chr(707)),
            (r'\\max', chr(706)),
            (r'\\exp', chr(281)),             # exponential
            (r'\\lg', chr(315)),              # binary logarithm
            (r'\\ln', chr(317)),              # natural logarithm
            (r'\\log', chr(319)),             # base-10 logarithm
            (r'\\lim', chr(321)),             # limit
            (r'\\arg', chr(478)),             # arbitrary arg marker
            (r'\\S$', chr(167)),              # section sign; $-anchored so it
                                              # cannot eat the start of \Sigma
            # trigonometric functions (h-variants first!)
            (r'\\sinh', chr(525)),
            (r'\\sin', chr(524)),
            (r'\\cosh', chr(527)),
            (r'\\cos', chr(526)),
            (r'\\tanh', chr(555)),
            (r'\\tan', chr(554)),
            (r'\\cot', chr(556)),
            (r'\\sec', chr(557)),
            (r'\\csc', chr(558)),
            (r'\\arcsin', chr(559)),
            (r'\\arccos', chr(560)),
            (r'\\arctan', chr(561)),
        )
        for pattern, stand_in in replacements:
            input_str = re.sub(pattern, stand_in, input_str)
        input_str = LatexNodes2Text().latex_to_text(input_str)
    return input_str.replace('\n', '').replace(' ', '').lower()
def top_n(pipeline, x_test, y_test, n = 5):
    """Top-n accuracy: the fraction of samples whose true label index appears
    among the `n` highest-probability classes predicted by `pipeline`."""
    probabilities = pipeline.predict_proba(x_test)
    top_indices = np.argsort(probabilities, axis=1)[:, -n:]
    hits = [1 if y_test[row] in top_indices[row] else 0
            for row in range(len(top_indices))]
    return np.mean(np.array(hits))
def get_coefs(word, *arr):
    """Split one embedding-file row into (word, float32 numpy vector)."""
    return word, np.asarray(arr, dtype='float32')
def load_embedding_dict(path: str, skip_first_line: bool = False):
    '''
    Load word embeddings from a text file into a dictionary.

    args:
        path: path to the embedding file; each line is "<word> <v1> <v2> ...".
        skip_first_line: some formats (e.g. word2vec) start with a
            "<vocab_size> <dimension>" header line that must be skipped.
    out:
        dict mapping each word to its float32 numpy vector,
        e.g. {'word': array([0.3, 0.1, ...], dtype=float32)}.
    '''
    # Stream the file line by line — the original readlines() materialised the
    # whole (often multi-GB) embedding file in memory before parsing it.
    embeddings = {}
    with open(path) as f:
        if skip_first_line:
            next(f, None)                     # drop the header row
        for line in f:
            parts = line.rstrip('\n').split(" ")
            if len(parts) < 2:                # skip blank / malformed lines
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings
def sentence_to_mean_vect(sentence: str):
    '''
    Mean-pool the word vectors of every space-separated token in `sentence`.

    Unknown words contribute a zero vector of dimension 300 — assumes the
    loaded embeddings are 300-dimensional; TODO confirm against the
    module-level `embeddings_dict` this function reads.
    '''
    feat = []
    for word in sentence.split(' '):
        vect = embeddings_dict.get(word)
        # The original abused a conditional *expression* for its side effect
        # (`feat.append(a) if cond else feat.append(b)`); pick the value
        # first, then append once.
        feat.append(vect if vect is not None else np.zeros(300))
    return np.mean(feat, axis=0)
def cross_entropy(predictions, targets):
    """Mean categorical cross-entropy: -(1/N) * sum(targets * log(predictions)),
    where N is the number of rows (samples) in `predictions`."""
    n_samples = predictions.shape[0]
    return -np.sum(targets * np.log(predictions)) / n_samples
def get_coefs(word, *arr):
    """Turn one whitespace-split embedding row into (word, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')
# Contraction/misspelling expansion table used by replace_contractions().
# BUG FIX: "we'll" previously mapped to " will" (dropping the pronoun); it now
# correctly expands to "we will". The "'re" -> " are" suffix entry keeps its
# deliberate leading space.
mispell_dict = {
    "aren't": "are not", "can't": "cannot", "couldn't": "could not",
    "couldnt": "could not", "didn't": "did not", "doesn't": "does not",
    "doesnt": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "havent": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "i'd": "I would", "i'll": "I will", "i'm": "I am", "isn't": "is not",
    "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
    "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
    "she'd": "she would", "she'll": "she will", "she's": "she is",
    "shouldn't": "should not", "shouldnt": "should not", "that's": "that is",
    "thats": "that is", "there's": "there is", "theres": "there is",
    "they'd": "they would", "they'll": "they will", "they're": "they are",
    "theyre": "they are", "they've": "they have", "we'd": "we would",
    "we're": "we are", "weren't": "were not", "we've": "we have",
    "what'll": "what will", "what're": "what are", "what's": "what is",
    "what've": "what have", "where's": "where is", "who'd": "who would",
    "who'll": "who will", "who're": "who are", "who's": "who is",
    "who've": "who have", "won't": "will not", "wouldn't": "would not",
    "you'd": "you would", "you'll": "you will", "you're": "you are",
    "you've": "you have", "'re": " are", "wasn't": "was not",
    "we'll": "we will", "tryin'": "trying",
}
def _get_mispell(mispell_dict): | |
mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys())) | |
return mispell_dict, mispell_re | |
def replace_contractions(text):
    """Expand English contractions in `text` using the module-level
    `mispell_dict` lookup table (see _get_mispell)."""
    mapping, pattern = _get_mispell(mispell_dict)

    def _expand(match):
        # Each regex alternative is a dict key, so the lookup cannot miss.
        return mapping[match.group(0)]

    return pattern.sub(_expand, text)
def insert_space_fun(sentence):
    '''
    Add a space around special characters, so "x+y +-=y \\latex" becomes
    "x + y  + - = y  \\latex". Characters from several non-Latin scripts
    (used in fill-in-the-blank questions) are replaced by a single space;
    spaces, dots and backslashes pass through untouched.
    '''
    # Unicode codepoint ranges for scripts that should be blanked out.
    hindi = range(2304, 2425)
    gurumukhi = range(2561, 2679)
    arabic = range(1542, 1792)
    mandarian = range(19968, 40944)
    bangla = range(2432, 2556)
    pieces = []
    for ch in sentence:
        code = ord(ch)
        if (code in hindi or code in gurumukhi or code in arabic
                or code in mandarian or code in bangla):
            pieces.append(' ')                 # drop non-Latin script chars
        elif ch == '\\':
            pieces.append(' ' + ch)            # keep LaTeX commands attached
        elif (not ch.isalnum()) and ch not in (' ', '\\', '.'):
            pieces.append(' ' + ch + ' ')      # pad every other special char
        else:
            pieces.append(ch)
    # PERF: O(n) join instead of the original quadratic `string += ...` loop.
    return ''.join(pieces)
def insert_space_re(sentence):
    """Regex variant of insert_space_fun: inject a space at each zero-width
    boundary between a special character and its neighbours. Dots are left
    alone so abbreviations survive (at the cost of sentence separators)."""
    before_special = re.compile(r'(?<! )(?![.a-zA-Z \\])')
    after_special = re.compile(r'(?<!^)(?<![.a-zA-Z \\])(?! )')
    sentence = before_special.sub(' ', sentence)
    return after_special.sub(' ', sentence)
def clean_text_latex(string, remove_stop = False, stop_words = None, remove_single_length = False, remove_special = False, special_replacement = ' SPL ', number_replacement = ' NUM '):
    """
    Full cleaning pipeline for strings mixing plain text and LaTeX.

    Args:
        string: input text; non-str input (np.nan etc.) yields ''.
        remove_stop: drop tokens present in `stop_words`.
        stop_words: container of stopwords (must be given when remove_stop is True).
        remove_single_length: drop single-letter purely-alphabetic tokens.
        remove_special: if True replace every char outside [a-zA-Z\\ ] with
            `special_replacement`; otherwise keep a whitelist of math symbols.
        special_replacement: token substituted for special characters.
        number_replacement: token substituted for (decimal) numbers.

    Returns:
        The cleaned string.
    """
    if not isinstance(string, str):
        return ''
    string = re.sub(r'[\n\r\t\u200b\x96]',' ', string)  # whitespace-ish / control chars -> space
    # Collapse dotted abbreviations (M.I.T -> MIT, I.I.T. -> IIT); any other
    # lone dot becomes a space.
    string = re.sub(r'\b((?:[A-Z]\.)+)\.?|\.', lambda x: x.group(1).replace('.', '') if x.group(1) else ' ', string)
    string = string.lower()
    # BUG FIX: the number pattern was a plain (non-raw) string, so '\d' was an
    # invalid escape sequence (DeprecationWarning today, SyntaxError later).
    string = re.sub(r'\d+(?:\.\d+)?', number_replacement, string)  # ints and decimals
    string = replace_contractions(string)  # expand contractions (after lower-casing)
    # Special characters: replace them all, or keep the math-symbol whitelist.
    string = re.sub(r"[^a-zA-Z\\ ]", special_replacement, string) if remove_special else re.sub(r"[^a-zA-Z \\.\^+~:/'|%><₹*ા]",' ', string)
    string = re.sub(r'\\\s+', ' ', string)  # drop dangling backslashes before spaces
    string = insert_space_fun(string)       # pad the remaining special chars
    string = re.sub(r'\s+', ' ',string)     # squeeze repeated whitespace
    if remove_stop or remove_single_length:
        temp = []
        for word in string.split(' '):
            if remove_stop and (word in stop_words):
                continue
            elif (remove_single_length) and (len(word) < 2) and (word.isalpha()):
                continue
            else:
                temp.append(word)
        string = ' '.join(temp)
    return string
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment