Last active
January 8, 2022 10:39
-
-
Save deshwalmahesh/69d9d3a5fdf2192f2b280eca4b56d8f1 to your computer and use it in GitHub Desktop.
Clean the LaTeX. Use a dict with the tokenization method for faster execution.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def process(sentence, use_lemmetization: bool = False, use_stemming: bool = False,
            add_pos: bool = False, remove_length: int = 0):
    """
    Clean a raw (possibly MathPix/LaTeX flavoured) sentence and return a
    space-joined string of lower-cased alphabetic tokens.

    Args:
        sentence: input text; anything that is not a str (e.g. np.nan) yields ''.
        use_lemmetization: lemmatize each token (uses the module-level `lemmetizer`).
        use_stemming: stem each token (uses the module-level `stemmer`).
        add_pos: append "_<POS>" to every token via nltk.pos_tag.
        remove_length: when truthy, drop tokens whose length is <= this value.

    Returns:
        The cleaned sentence, or '' for non-string input / empty results.
    """
    # BUG FIX: the original signature read `use_lemmetization: False` etc. —
    # `False` was an *annotation*, not a default value, so all the flags were
    # required positional arguments. They are now real keyword defaults.
    if not isinstance(sentence, str):
        return ''
    a = sentence.lower()                      # case-fold first
    a = re.sub(r"\n+", " ", a)                # newlines -> single space
    # Drop MathPix math delimited by \( ... \) but keep the words inside \text{...}
    a = re.sub(r'\s*\\+\((.*?)\\+\)',
               lambda x: " ".join(re.findall(r'\\text\s*{([^{}]*)}', x.group(1))), a)
    # Strip any remaining \command tokens (old MathPix API leftovers)
    a = re.sub(r"(\\[^ ]+)", ' ', a)
    a = re.sub(r'[^a-zA-Z]', ' ', a)          # keep letters only
    a = re.sub(r"\s+", " ", a).strip()        # squeeze and trim whitespace
    if not a:
        return ''
    tokens = a.split(' ')
    if use_lemmetization:
        tokens = [lemmetizer.lemmatize(token) for token in tokens]
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    if add_pos:
        tokens = [token + "_" + tag for token, tag in nltk.pos_tag(tokens)]
    if remove_length:
        tokens = [x for x in tokens if len(x) > remove_length]
    return ' '.join(tokens)
def convert_numbers(x):
    """Mask digit runs with '#' markers: 5-or-more digits -> '#####',
    4 -> '####', 3 -> '###', 2 -> '##'; single digits are left untouched."""
    if re.search(r'\d', x) is None:           # nothing numeric -> unchanged
        return x
    # Longest runs must be masked first so shorter patterns cannot split them.
    for pattern, mask in (('[0-9]{5,}', '#####'),
                          ('[0-9]{4}', '####'),
                          ('[0-9]{3}', '###'),
                          ('[0-9]{2}', '##')):
        x = re.sub(pattern, mask, x)
    return x
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Convert the LaTeX parts of a string to plain/unicode text,
    e.g. ``\\frac {3 x } {2 }`` -> ``3 x / 2``; non-LaTeX characters pass through.

    Args:
        input_str: text mixing LaTeX and plain characters; non-str input yields ''.
        to_unicode: run the string through pylatexenc's LatexNodes2Text.
        remove_numbers: strip all digits (they add little signal for
            classification).

    Returns:
        The converted string, lower-cased and with all spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    # BUG FIX: the original pattern was r'\triangle' — the regex engine reads
    # \t as a TAB escape there, so it matched "<tab>riangle" instead of the
    # LaTeX \triangle command. The backslash must be doubled.
    input_str = re.sub(r'\\triangle', chr(9651), input_str)   # △
    input_str = re.sub(r'\\frac', '/', input_str)
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # Map control characters back to their raw-string escapes so the LaTeX
        # parser does not choke on them; protect real newlines with a sentinel.
        raw_map = {8: r'\b', 7: r'\a', 12: r'\f', 10: r'\n', 13: r'\r', 9: r'\t', 11: r'\v'}
        input_str = input_str.replace("\n", " !#! ")
        input_str = LatexNodes2Text().latex_to_text(
            ''.join(i if ord(i) > 32 else raw_map.get(ord(i), i) for i in input_str))
        input_str = input_str.replace(" !#! ", "\n")
    return input_str.replace(' ', '').lower()
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Map LaTeX commands in `input_str` to single unicode stand-in characters,
    then flatten what remains with pylatexenc. Non-str input (plain ints, or
    np.nan from number-only cells) yields ''. The result is lower-cased with
    all newlines and spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # (pattern, stand-in) pairs; ORDER MATTERS — e.g. \sinh must be
        # replaced before \sin so the shorter command never clobbers it.
        replacements = (
            (r'\\bigodot', chr(416)),         # distinctive O-like glyph
            (r'\\hline', chr(713)),           # horizontal line
            (r'\\overline', chr(727)),        # overbar
            (r'\\underline', chr(717)),
            (r'\\overbrace', chr(752)),       # upward pointer
            (r'\\underbrace', chr(751)),      # downward pointer
            (r'\\overrightarrow', chr(754)),  # rightward pointer
            (r'\\longdiv', chr(10188)),       # long division
            (r'\\jmath', chr(567)),           # dotless j
            (r'\\imath', chr(305)),           # dotless i
            (r'\\sqrt', chr(8730)),
            (r'\\Re', chr(344)),              # R-like symbol
            (r'\\triangle', chr(9651)),
            (r'\\frac', '/'),
            (r'\\widetilde', chr(771)),       # plain \tilde would also be 771
            (r'\\widehat', chr(770)),         # plain \hat would also be 770
            (r'\\Varangle', chr(8736)),       # angle
            (r'\\neg', chr(172)),             # negation sign
            (r'\\begin', chr(705)),           # arbitrary begin marker
            (r'\\end', chr(704)),             # arbitrary end marker
            (r'\\min', chr(707)),
            (r'\\max', chr(706)),
            (r'\\exp', chr(281)),             # exponential
            (r'\\lg', chr(315)),              # binary logarithm
            (r'\\ln', chr(317)),              # natural logarithm
            (r'\\log', chr(319)),             # base-10 logarithm
            (r'\\lim', chr(321)),             # limit
            (r'\\arg', chr(478)),             # arbitrary arg marker
            (r'\\S$', chr(167)),              # section sign; $-anchored so it
                                              # cannot eat the start of \Sigma
            # trigonometric functions (h-variants first!)
            (r'\\sinh', chr(525)),
            (r'\\sin', chr(524)),
            (r'\\cosh', chr(527)),
            (r'\\cos', chr(526)),
            (r'\\tanh', chr(555)),
            (r'\\tan', chr(554)),
            (r'\\cot', chr(556)),
            (r'\\sec', chr(557)),
            (r'\\csc', chr(558)),
            (r'\\arcsin', chr(559)),
            (r'\\arccos', chr(560)),
            (r'\\arctan', chr(561)),
        )
        for pattern, stand_in in replacements:
            input_str = re.sub(pattern, stand_in, input_str)
        input_str = LatexNodes2Text().latex_to_text(input_str)
    return input_str.replace('\n', '').replace(' ', '').lower()
def top_n(pipeline, x_test, y_test, n = 5):
    """Top-n accuracy: the fraction of samples whose true label index appears
    among the `n` highest-probability classes predicted by `pipeline`."""
    probabilities = pipeline.predict_proba(x_test)
    top_indices = np.argsort(probabilities, axis=1)[:, -n:]
    hits = [1 if y_test[row] in top_indices[row] else 0
            for row in range(len(top_indices))]
    return np.mean(np.array(hits))
def get_coefs(word, *arr):
    """Split one embedding-file row into (word, float32 numpy vector)."""
    return word, np.asarray(arr, dtype='float32')
def load_embedding_dict(path: str, skip_first_line: bool = False):
    '''
    Load word embeddings from a text file into a dictionary.

    args:
        path: path to the embedding file; each line is "<word> <v1> <v2> ...".
        skip_first_line: some formats (e.g. word2vec) start with a
            "<vocab_size> <dimension>" header line that must be skipped.
    out:
        dict mapping each word to its float32 numpy vector,
        e.g. {'word': array([0.3, 0.1, ...], dtype=float32)}.
    '''
    # Stream the file line by line — the original readlines() materialised the
    # whole (often multi-GB) embedding file in memory before parsing it.
    embeddings = {}
    with open(path) as f:
        if skip_first_line:
            next(f, None)                     # drop the header row
        for line in f:
            parts = line.rstrip('\n').split(" ")
            if len(parts) < 2:                # skip blank / malformed lines
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings
def sentence_to_mean_vect(sentence: str):
    '''
    Mean-pool the word vectors of every space-separated token in `sentence`.

    Unknown words contribute a zero vector of dimension 300 — assumes the
    loaded embeddings are 300-dimensional; TODO confirm against the
    module-level `embeddings_dict` this function reads.
    '''
    feat = []
    for word in sentence.split(' '):
        vect = embeddings_dict.get(word)
        # The original abused a conditional *expression* for its side effect
        # (`feat.append(a) if cond else feat.append(b)`); pick the value
        # first, then append once.
        feat.append(vect if vect is not None else np.zeros(300))
    return np.mean(feat, axis=0)
def cross_entropy(predictions, targets):
    """Mean categorical cross-entropy: -(1/N) * sum(targets * log(predictions)),
    where N is the number of rows (samples) in `predictions`."""
    n_samples = predictions.shape[0]
    return -np.sum(targets * np.log(predictions)) / n_samples
import pandas as pd | |
import re | |
import numpy as np | |
def fetch_old_data(path, subject):
    """Read the legacy CSV at `path`, keep only rows for `subject`, rename the
    old column names to the new schema (text/_id/CHAPTER/SUBJECT) and drop the
    two trailing columns."""
    frame = pd.read_csv(path)
    frame = frame[frame['Subject'] == subject]
    frame = frame.rename(columns={'eng': 'text', 'q_id': '_id',
                                  'chapter': 'CHAPTER', 'Subject': 'SUBJECT'})
    return frame.iloc[:, :-2]
def new_data_merge(path, sheet_num):
    # Load one sheet of the new-format Excel dump and normalise it to the
    # common schema (text / _id / CHAPTER / SUBJECT / question_url).
    # NOTE(review): depends on the module-level `ref` mapping (chapter label ->
    # NCERT chapter name) and on pandas — confirm both are in scope at call time.
    bio = pd.read_excel(path, sheet_num)
    bio.rename(columns = {'chapter':'CHAPTER', 'Subject':'SUBJECT','crop_url':'question_url'}, inplace = True)
    if '_id' not in bio.columns:
        # Derive the question id from the URL (second-to-last path segment).
        bio['_id'] = bio['question_url'].apply(lambda x: x.split('/')[-2])
    bio.drop_duplicates(subset=['text'],inplace=True)
    bio = bio[~((bio['text'].isna())&(bio['latex'].isna()))] # Drop rows where both text and latex are empty
    bio = bio[~bio['Done on'].isna()] # Keep only rows already annotated ('Done on' filled in)
    bio = bio[bio['Problems? notPCMB, noText, can\'tPredictChapter'].isna()] # Keep only problem-free rows
    bio['CHAPTER'] = bio['CHAPTER'].apply(lambda x: ref[x]) # Map to NCERT chapter names; raises KeyError on unknown chapters
    # Manual corrections override the mapped values. The trailing space in
    # 'Correct Chapter ' is real in the sheet — do not "fix" it here.
    for index in bio.index:
        if not pd.isna(bio.loc[index,'Correct Chapter ']):
            bio.loc[index,'CHAPTER'] = bio.loc[index,'Correct Chapter ']
        if not pd.isna(bio.loc[index,'Correct Subject']):
            bio.loc[index,'SUBJECT'] = bio.loc[index,'Correct Subject']
    return bio
def process(sentence, use_lemmetization: bool = False, use_stemming: bool = False,
            add_pos: bool = False, remove_length: int = 0):
    """
    Clean a raw (possibly MathPix/LaTeX flavoured) sentence and return a
    space-joined string of lower-cased alphabetic tokens.

    Args:
        sentence: input text; anything that is not a str (e.g. np.nan) yields ''.
        use_lemmetization: lemmatize each token (uses the module-level `lemmetizer`).
        use_stemming: stem each token (uses the module-level `stemmer`).
        add_pos: append "_<POS>" to every token via nltk.pos_tag.
        remove_length: when truthy, drop tokens whose length is <= this value.

    Returns:
        The cleaned sentence, or '' for non-string input / empty results.
    """
    # BUG FIX: the original signature read `use_lemmetization: False` etc. —
    # `False` was an *annotation*, not a default value, so all the flags were
    # required positional arguments. They are now real keyword defaults.
    if not isinstance(sentence, str):
        return ''
    a = sentence.lower()                      # case-fold first
    a = re.sub(r"\n+", " ", a)                # newlines -> single space
    # Drop MathPix math delimited by \( ... \) but keep the words inside \text{...}
    a = re.sub(r'\s*\\+\((.*?)\\+\)',
               lambda x: " ".join(re.findall(r'\\text\s*{([^{}]*)}', x.group(1))), a)
    # Strip any remaining \command tokens (old MathPix API leftovers)
    a = re.sub(r"(\\[^ ]+)", ' ', a)
    a = re.sub(r'[^a-zA-Z]', ' ', a)          # keep letters only
    a = re.sub(r"\s+", " ", a).strip()        # squeeze and trim whitespace
    if not a:
        return ''
    tokens = a.split(' ')
    if use_lemmetization:
        tokens = [lemmetizer.lemmatize(token) for token in tokens]
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    if add_pos:
        tokens = [token + "_" + tag for token, tag in nltk.pos_tag(tokens)]
    if remove_length:
        tokens = [x for x in tokens if len(x) > remove_length]
    return ' '.join(tokens)
def convert_numbers(x):
    """Mask digit runs with '#' markers: 5-or-more digits -> '#####',
    4 -> '####', 3 -> '###', 2 -> '##'; single digits are left untouched."""
    if re.search(r'\d', x) is None:           # nothing numeric -> unchanged
        return x
    # Longest runs must be masked first so shorter patterns cannot split them.
    for pattern, mask in (('[0-9]{5,}', '#####'),
                          ('[0-9]{4}', '####'),
                          ('[0-9]{3}', '###'),
                          ('[0-9]{2}', '##')):
        x = re.sub(pattern, mask, x)
    return x
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Convert the LaTeX parts of a string to plain/unicode text,
    e.g. ``\\frac {3 x } {2 }`` -> ``3 x / 2``; non-LaTeX characters pass through.

    Args:
        input_str: text mixing LaTeX and plain characters; non-str input yields ''.
        to_unicode: run the string through pylatexenc's LatexNodes2Text.
        remove_numbers: strip all digits (they add little signal for
            classification).

    Returns:
        The converted string, lower-cased and with all spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    # BUG FIX: the original pattern was r'\triangle' — the regex engine reads
    # \t as a TAB escape there, so it matched "<tab>riangle" instead of the
    # LaTeX \triangle command. The backslash must be doubled.
    input_str = re.sub(r'\\triangle', chr(9651), input_str)   # △
    input_str = re.sub(r'\\frac', '/', input_str)
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # Map control characters back to their raw-string escapes so the LaTeX
        # parser does not choke on them; protect real newlines with a sentinel.
        raw_map = {8: r'\b', 7: r'\a', 12: r'\f', 10: r'\n', 13: r'\r', 9: r'\t', 11: r'\v'}
        input_str = input_str.replace("\n", " !#! ")
        input_str = LatexNodes2Text().latex_to_text(
            ''.join(i if ord(i) > 32 else raw_map.get(ord(i), i) for i in input_str))
        input_str = input_str.replace(" !#! ", "\n")
    return input_str.replace(' ', '').lower()
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Map LaTeX commands in `input_str` to single unicode stand-in characters,
    then flatten what remains with pylatexenc. Non-str input (plain ints, or
    np.nan from number-only cells) yields ''. The result is lower-cased with
    all newlines and spaces removed.
    """
    if not isinstance(input_str, str):
        return ''
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...}
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # (pattern, stand-in) pairs; ORDER MATTERS — e.g. \sinh must be
        # replaced before \sin so the shorter command never clobbers it.
        replacements = (
            (r'\\bigodot', chr(416)),         # distinctive O-like glyph
            (r'\\hline', chr(713)),           # horizontal line
            (r'\\overline', chr(727)),        # overbar
            (r'\\underline', chr(717)),
            (r'\\overbrace', chr(752)),       # upward pointer
            (r'\\underbrace', chr(751)),      # downward pointer
            (r'\\overrightarrow', chr(754)),  # rightward pointer
            (r'\\longdiv', chr(10188)),       # long division
            (r'\\jmath', chr(567)),           # dotless j
            (r'\\imath', chr(305)),           # dotless i
            (r'\\sqrt', chr(8730)),
            (r'\\Re', chr(344)),              # R-like symbol
            (r'\\triangle', chr(9651)),
            (r'\\frac', '/'),
            (r'\\widetilde', chr(771)),       # plain \tilde would also be 771
            (r'\\widehat', chr(770)),         # plain \hat would also be 770
            (r'\\Varangle', chr(8736)),       # angle
            (r'\\neg', chr(172)),             # negation sign
            (r'\\begin', chr(705)),           # arbitrary begin marker
            (r'\\end', chr(704)),             # arbitrary end marker
            (r'\\min', chr(707)),
            (r'\\max', chr(706)),
            (r'\\exp', chr(281)),             # exponential
            (r'\\lg', chr(315)),              # binary logarithm
            (r'\\ln', chr(317)),              # natural logarithm
            (r'\\log', chr(319)),             # base-10 logarithm
            (r'\\lim', chr(321)),             # limit
            (r'\\arg', chr(478)),             # arbitrary arg marker
            (r'\\S$', chr(167)),              # section sign; $-anchored so it
                                              # cannot eat the start of \Sigma
            # trigonometric functions (h-variants first!)
            (r'\\sinh', chr(525)),
            (r'\\sin', chr(524)),
            (r'\\cosh', chr(527)),
            (r'\\cos', chr(526)),
            (r'\\tanh', chr(555)),
            (r'\\tan', chr(554)),
            (r'\\cot', chr(556)),
            (r'\\sec', chr(557)),
            (r'\\csc', chr(558)),
            (r'\\arcsin', chr(559)),
            (r'\\arccos', chr(560)),
            (r'\\arctan', chr(561)),
        )
        for pattern, stand_in in replacements:
            input_str = re.sub(pattern, stand_in, input_str)
        input_str = LatexNodes2Text().latex_to_text(input_str)
    return input_str.replace('\n', '').replace(' ', '').lower()
def top_n(pipeline, x_test, y_test, n = 5):
    """Top-n accuracy: the fraction of samples whose true label index appears
    among the `n` highest-probability classes predicted by `pipeline`."""
    probabilities = pipeline.predict_proba(x_test)
    top_indices = np.argsort(probabilities, axis=1)[:, -n:]
    hits = [1 if y_test[row] in top_indices[row] else 0
            for row in range(len(top_indices))]
    return np.mean(np.array(hits))
def get_coefs(word, *arr):
    """Split one embedding-file row into (word, float32 numpy vector)."""
    return word, np.asarray(arr, dtype='float32')
def load_embedding_dict(path: str, skip_first_line: bool = False):
    '''
    Load word embeddings from a text file into a dictionary.

    args:
        path: path to the embedding file; each line is "<word> <v1> <v2> ...".
        skip_first_line: some formats (e.g. word2vec) start with a
            "<vocab_size> <dimension>" header line that must be skipped.
    out:
        dict mapping each word to its float32 numpy vector,
        e.g. {'word': array([0.3, 0.1, ...], dtype=float32)}.
    '''
    # Stream the file line by line — the original readlines() materialised the
    # whole (often multi-GB) embedding file in memory before parsing it.
    embeddings = {}
    with open(path) as f:
        if skip_first_line:
            next(f, None)                     # drop the header row
        for line in f:
            parts = line.rstrip('\n').split(" ")
            if len(parts) < 2:                # skip blank / malformed lines
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings
def sentence_to_mean_vect(sentence: str):
    '''
    Mean-pool the word vectors of every space-separated token in `sentence`.

    Unknown words contribute a zero vector of dimension 300 — assumes the
    loaded embeddings are 300-dimensional; TODO confirm against the
    module-level `embeddings_dict` this function reads.
    '''
    feat = []
    for word in sentence.split(' '):
        vect = embeddings_dict.get(word)
        # The original abused a conditional *expression* for its side effect
        # (`feat.append(a) if cond else feat.append(b)`); pick the value
        # first, then append once.
        feat.append(vect if vect is not None else np.zeros(300))
    return np.mean(feat, axis=0)
def cross_entropy(predictions, targets):
    """Mean categorical cross-entropy: -(1/N) * sum(targets * log(predictions)),
    where N is the number of rows (samples) in `predictions`."""
    n_samples = predictions.shape[0]
    return -np.sum(targets * np.log(predictions)) / n_samples
def get_coefs(word, *arr):
    """Turn one whitespace-split embedding row into (word, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')
# Contraction/misspelling expansion table used by replace_contractions().
# BUG FIX: "we'll" previously mapped to " will" (dropping the pronoun); it now
# correctly expands to "we will". The "'re" -> " are" suffix entry keeps its
# deliberate leading space.
mispell_dict = {
    "aren't": "are not", "can't": "cannot", "couldn't": "could not",
    "couldnt": "could not", "didn't": "did not", "doesn't": "does not",
    "doesnt": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "havent": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "i'd": "I would", "i'll": "I will", "i'm": "I am", "isn't": "is not",
    "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
    "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
    "she'd": "she would", "she'll": "she will", "she's": "she is",
    "shouldn't": "should not", "shouldnt": "should not", "that's": "that is",
    "thats": "that is", "there's": "there is", "theres": "there is",
    "they'd": "they would", "they'll": "they will", "they're": "they are",
    "theyre": "they are", "they've": "they have", "we'd": "we would",
    "we're": "we are", "weren't": "were not", "we've": "we have",
    "what'll": "what will", "what're": "what are", "what's": "what is",
    "what've": "what have", "where's": "where is", "who'd": "who would",
    "who'll": "who will", "who're": "who are", "who's": "who is",
    "who've": "who have", "won't": "will not", "wouldn't": "would not",
    "you'd": "you would", "you'll": "you will", "you're": "you are",
    "you've": "you have", "'re": " are", "wasn't": "was not",
    "we'll": "we will", "tryin'": "trying",
}
def _get_mispell(mispell_dict): | |
mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys())) | |
return mispell_dict, mispell_re | |
def replace_contractions(text):
    """Expand English contractions in `text` using the module-level
    `mispell_dict` lookup table (see _get_mispell)."""
    mapping, pattern = _get_mispell(mispell_dict)

    def _expand(match):
        # Each regex alternative is a dict key, so the lookup cannot miss.
        return mapping[match.group(0)]

    return pattern.sub(_expand, text)
def insert_space_fun(sentence):
    '''
    Add a space around special characters, so "x+y +-=y \\latex" becomes
    "x + y  + - = y  \\latex". Characters from several non-Latin scripts
    (used in fill-in-the-blank questions) are replaced by a single space;
    spaces, dots and backslashes pass through untouched.
    '''
    # Unicode codepoint ranges for scripts that should be blanked out.
    hindi = range(2304, 2425)
    gurumukhi = range(2561, 2679)
    arabic = range(1542, 1792)
    mandarian = range(19968, 40944)
    bangla = range(2432, 2556)
    pieces = []
    for ch in sentence:
        code = ord(ch)
        if (code in hindi or code in gurumukhi or code in arabic
                or code in mandarian or code in bangla):
            pieces.append(' ')                 # drop non-Latin script chars
        elif ch == '\\':
            pieces.append(' ' + ch)            # keep LaTeX commands attached
        elif (not ch.isalnum()) and ch not in (' ', '\\', '.'):
            pieces.append(' ' + ch + ' ')      # pad every other special char
        else:
            pieces.append(ch)
    # PERF: O(n) join instead of the original quadratic `string += ...` loop.
    return ''.join(pieces)
def insert_space_re(sentence):
    """Regex variant of insert_space_fun: inject a space at each zero-width
    boundary between a special character and its neighbours. Dots are left
    alone so abbreviations survive (at the cost of sentence separators)."""
    before_special = re.compile(r'(?<! )(?![.a-zA-Z \\])')
    after_special = re.compile(r'(?<!^)(?<![.a-zA-Z \\])(?! )')
    sentence = before_special.sub(' ', sentence)
    return after_special.sub(' ', sentence)
def clean_text_latex(string, remove_stop = False, stop_words = None, remove_single_length = False, remove_special = False, special_replacement = ' SPL ', number_replacement = ' NUM '):
    """
    Full cleaning pipeline for strings mixing plain text and LaTeX.

    Args:
        string: input text; non-str input (np.nan etc.) yields ''.
        remove_stop: drop tokens present in `stop_words`.
        stop_words: container of stopwords (must be given when remove_stop is True).
        remove_single_length: drop single-letter purely-alphabetic tokens.
        remove_special: if True replace every char outside [a-zA-Z\\ ] with
            `special_replacement`; otherwise keep a whitelist of math symbols.
        special_replacement: token substituted for special characters.
        number_replacement: token substituted for (decimal) numbers.

    Returns:
        The cleaned string.
    """
    if not isinstance(string, str):
        return ''
    string = re.sub(r'[\n\r\t\u200b\x96]',' ', string)  # whitespace-ish / control chars -> space
    # Collapse dotted abbreviations (M.I.T -> MIT, I.I.T. -> IIT); any other
    # lone dot becomes a space.
    string = re.sub(r'\b((?:[A-Z]\.)+)\.?|\.', lambda x: x.group(1).replace('.', '') if x.group(1) else ' ', string)
    string = string.lower()
    # BUG FIX: the number pattern was a plain (non-raw) string, so '\d' was an
    # invalid escape sequence (DeprecationWarning today, SyntaxError later).
    string = re.sub(r'\d+(?:\.\d+)?', number_replacement, string)  # ints and decimals
    string = replace_contractions(string)  # expand contractions (after lower-casing)
    # Special characters: replace them all, or keep the math-symbol whitelist.
    string = re.sub(r"[^a-zA-Z\\ ]", special_replacement, string) if remove_special else re.sub(r"[^a-zA-Z \\.\^+~:/'|%><₹*ા]",' ', string)
    string = re.sub(r'\\\s+', ' ', string)  # drop dangling backslashes before spaces
    string = insert_space_fun(string)       # pad the remaining special chars
    string = re.sub(r'\s+', ' ',string)     # squeeze repeated whitespace
    if remove_stop or remove_single_length:
        temp = []
        for word in string.split(' '):
            if remove_stop and (word in stop_words):
                continue
            elif (remove_single_length) and (len(word) < 2) and (word.isalpha()):
                continue
            else:
                temp.append(word)
        string = ' '.join(temp)
    return string
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment