This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def ngrams(string, n=3): | |
| string = fix_text(string) # fix text encoding issues | |
| string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars | |
| string = string.lower() #make lower case | |
| chars_to_remove = [")","(",".","|","[","]","{","}","'"] | |
| rx = '[' + re.escape(''.join(chars_to_remove)) + ']' | |
| string = re.sub(rx, '', string) #remove the list of chars defined above | |
| string = string.replace('&', 'and') | |
| string = string.replace(',', ' ') | |
| string = string.replace('-', ' ') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import IPython | |
| tkn = tfidf.build_tokenizer() | |
| sent = df.questionText.values[236178].lower() | |
| sent = tkn(sent) | |
| html='' | |
| for wrd in sent: | |
| try: | |
| weight = (tfidf.idf_[tfidf.vocabulary_[wrd]])*10 | |
| print(weight/10) | |
| except: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
import IPython

# Ask spaCy to use the GPU when it can; the boolean result is kept so the
# torch default below is only changed when the GPU is actually in use.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Make newly created torch tensors default to CUDA float tensors.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

# Load the pipeline by its installed package name.
nlp = spacy.load("en_trf_bertbaseuncased_lg")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary (English stop words removed) on every question.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df.questionText.values)

# Count cluster sizes once instead of recomputing value_counts() on every
# iteration; value_counts() sorts by frequency, so the first ten rows are the
# ten largest clusters.
top_clusters = df.cluster.value_counts().iloc[:10]

totals = 0
for cluster, count in top_clusters.items():
    # Concatenate all questions of the cluster into one pseudo-document and
    # project it into the fitted TF-IDF space.
    stg = " ".join(df.loc[df.cluster == cluster].questionText.values)
    response = vectorizer.transform([stg])
    # Running total of the questions covered by the clusters seen so far.
    totals += count
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nlp = spacy.load("en_core_web_sm")
tok_text = []  # OUTPUT for our tokenised corpus

# Lower-case the raw column, then repair encoding problems per document.
text = df.text.str.lower().values
text = [fix_text(str(i)) for i in text]

# Tokenising using SpaCy:
# - `n_threads` is no longer passed: it has been a deprecated no-op since
#   spaCy 2.1 and is not accepted by later versions of `nlp.pipe`.
# - `total=` lets tqdm show a real progress bar, since the generator returned
#   by `nlp.pipe` has no length of its own.
docs = nlp.pipe(text, disable=["tagger", "parser", "ner"])
for doc in tqdm(docs, total=len(text)):
    # Keep only ASCII tokens that are neither punctuation nor whitespace.
    tok = [
        t.text
        for t in doc
        if t.is_ascii and not t.is_punct and not t.is_space
    ]
    tok_text.append(tok)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from gensim.models.fasttext import FastText | |
| ft_model = FastText( | |
| sg=1, # use skip-gram: usually gives better results | |
| size=100, # embedding dimension (default) | |
| window=10, # window size: 10 tokens before and 10 tokens after to get wider context | |
| min_count=5, # only consider tokens with at least n occurrences in the corpus | |
| negative=15, # negative subsampling: bigger than default to sample negative examples more | |
| min_n=2, # min character n-gram | |
| max_n=5 # max character n-gram |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| weighted_doc_vects = [] | |
| for i,doc in tqdm(enumerate(tok_text)): | |
| doc_vector = [] | |
| for word in doc: | |
| vector = ft_model[word] | |
| weight = (bm25.idf[word] * ((bm25.k1 + 1.0)*bm25.doc_freqs[i][word])) | |
| / | |
| (bm25.k1 * (1.0 - bm25.b + bm25.b *(bm25.doc_len[i]/bm25.avgdl))+bm25.doc_freqs[i][word]) | |
| weighted_vector = vector * weight |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nmslib

# create a matrix from our document vectors (one row per document)
data = np.vstack(weighted_doc_vects)

# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(data)

# Build the graph. NOTE(review): 'post' is presumably the HNSW
# post-processing level -- confirm against the nmslib parameter docs.
index.createIndex({'post': 2}, print_progress=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tokenise the free-text query the same simple way: lower-case, split on
# whitespace. (Named `query_tokens` rather than `input` so the builtin
# `input()` is not shadowed.)
query_tokens = 'flood defences'.lower().split()

# Look vectors up through `.wv`: indexing the model object directly is the
# deprecated form and is not supported by newer gensim releases.
query = [ft_model.wv[token] for token in query_tokens]
# Average the word vectors into a single query vector.
query = np.mean(query, axis=0)

# perf_counter() is a monotonic, high-resolution clock intended for timing.
t0 = time.perf_counter()
ids, distances = index.knnQuery(query, k=10)
t1 = time.perf_counter()
print(f'Searched {df.shape[0]} records in {round(t1-t0,4) } seconds \n')
| for i,j in zip(ids,distances): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import floor

import numpy as np
from scipy.optimize import LinearConstraint, basinhopping, minimize

# Setting up the pricing amounts for each supplier.
# The three lists appear to be parallel: position i in each one describes
# supplier i, and n_suppliers is taken from the length of the price list.
supplierPrice = [10.5, 11, 10]
# NOTE(review): presumably a fractional discount (0.1 == 10%) -- confirm.
supplierDiscountAmount = [0.1, 0.35, 0.05]
# NOTE(review): presumably the order size at which the discount applies --
# confirm against the objective function.
supplierDiscountThreshold = [100, 260, 300]
n_suppliers = len(supplierPrice)
| #Our minimum order amount |
OlderNewer