joshua-taylor’s gists

joshua-taylor / Ngrams.py

Created June 30, 2019 15:35

Ngrams.py

	def ngrams(string, n=3):
		"""Normalise *string* ahead of character n-gram generation.

		The text is passed through ``fix_text``, reduced to lower-case ASCII,
		stripped of a fixed set of bracket/punctuation characters, and has
		``&``, ``,`` and ``-`` rewritten as shown below.

		Args:
			string: Raw text to clean.
			n: N-gram length. Not used by the steps visible here.

		NOTE: only the opening lines of this gist appear in the listing; the
		n-gram construction and the return statement are not shown, so as
		written this function returns ``None``.
		"""
		string = fix_text(string)  # fix text encoding issues
		# Drop every character that cannot be represented in ASCII.
		string = string.encode("ascii", errors="ignore").decode()
		string = string.lower()
		# The backslash is listed explicitly: the earlier "\|" literal was an
		# invalid escape sequence that only worked because Python fell back to
		# a literal backslash followed by "|". The removed set is unchanged.
		chars_to_remove = [")", "(", ".", "\\", "|", "[", "]", "{", "}", "'"]
		rx = "[" + re.escape("".join(chars_to_remove)) + "]"
		string = re.sub(rx, "", string)  # remove the characters listed above
		string = string.replace("&", "and")
		string = string.replace(",", " ")
		string = string.replace("-", " ")

joshua-taylor / TF-IDF vector.py

Created November 24, 2019 13:05

	import IPython
	# Print the IDF weight of every token in one question.
	# NOTE: the listing flattens indentation and stops at `except:`, so the
	# nesting of the loop body and the exception handler are not visible here.
	# Tokenizer obtained from the `tfidf` object.
	tkn = tfidf.build_tokenizer()
	# Lower-cased question text stored at position 236178.
	sent = df.questionText.values[236178].lower()
	sent = tkn(sent)
	# Initialised empty; nothing in the visible lines appends to it.
	html=''
	for wrd in sent:
	try:
	# IDF of the token, found through its vocabulary index and scaled by 10 ...
	weight = (tfidf.idf_[tfidf.vocabulary_[wrd]])*10
	# ... then printed with the scaling undone.
	print(weight/10)
	# NOTE(review): a bare `except` catches everything; presumably only a token
	# missing from the vocabulary (KeyError) is expected — confirm and narrow.
	except:

joshua-taylor / BERT vectors.py

Created November 24, 2019 13:16

BERT vectors and TFIDF

	import spacy
	import torch
	from sklearn.feature_extraction.text import TfidfVectorizer
	import IPython

	# Ask spaCy to use a GPU if it can; the returned flag records the outcome.
	is_using_gpu = spacy.prefer_gpu()
	if is_using_gpu:
		# Only switch torch's default tensor type to CUDA floats when the
		# GPU was actually activated above.
		torch.set_default_tensor_type("torch.cuda.FloatTensor")

	# The model is loaded whether or not a GPU is in use.
	nlp = spacy.load("en_trf_bertbaseuncased_lg")

joshua-taylor / Cluster labels.py

Last active November 24, 2019 13:48

Cluster labels

	from sklearn.feature_extraction.text import TfidfVectorizer
	vectorizer = TfidfVectorizer(stop_words='english')
	tfidf = vectorizer.fit_transform(df.questionText.values)

	# Count cluster sizes once instead of on every iteration. value_counts()
	# orders the result largest first, so the first ten rows are the ten
	# biggest clusters.
	cluster_counts = df.cluster.value_counts()
	totals = 0
	for cluster, count in cluster_counts.head(10).items():
		# All questions of this cluster joined into one document.
		stg = " ".join(df.loc[df.cluster == cluster].questionText.values)
		# TF-IDF row for the joined document; not consumed by the lines
		# visible in this listing.
		response = vectorizer.transform([stg])
		totals += count

joshua-taylor / sPacy tokenize.py

Created October 10, 2020 09:45

spaCy tokenize

	nlp = spacy.load("en_core_web_sm")
	tok_text = []  # OUTPUT for our tokenised corpus
	text = df.text.str.lower().values
	text = [fix_text(str(i)) for i in text]

	# Tokenising using spaCy.
	# `n_threads` is no longer passed: it has been deprecated since spaCy 2.1
	# and is rejected by spaCy 3. `total` gives tqdm a length, which it cannot
	# derive from the generator returned by `nlp.pipe`.
	for doc in tqdm(
		nlp.pipe(text, disable=["tagger", "parser", "ner"]),
		total=len(text),
	):
		# Keep only ASCII tokens that are neither punctuation nor whitespace.
		tok = [t.text for t in doc if (t.is_ascii and not t.is_punct and not t.is_space)]
		tok_text.append(tok)

joshua-taylor / fastText.py

Last active March 8, 2022 18:56

	from gensim.models.fasttext import FastText

	# Configure a FastText model (imported from gensim.models.fasttext).
	# NOTE: the listing stops before the closing parenthesis, so any further
	# arguments and the training calls are not visible here.
	# NOTE(review): gensim 4.x renamed `size` to `vector_size` — confirm which
	# gensim version this snippet targets.
	ft_model = FastText(
	sg=1, # use skip-gram: usually gives better results
	size=100, # embedding dimension (default)
	window=10, # window size: 10 tokens before and 10 tokens after to get wider context
	min_count=5, # only consider tokens with at least n occurrences in the corpus
	negative=15, # negative subsampling: bigger than default to sample negative examples more
	min_n=2, # min character n-gram
	max_n=5 # max character n-gram

joshua-taylor / BM25 for wordvectors.py

Created October 10, 2020 11:20

	weighted_doc_vects = []

	# NOTE: the listing ends after `weighted_vector` is computed; the lines
	# that collect it into `doc_vector` / `weighted_doc_vects` are not shown.
	for i, doc in tqdm(enumerate(tok_text)):
		doc_vector = []
		for word in doc:
			# `.wv[...]` is the vector lookup accepted by both gensim 3.x and
			# 4.x; indexing the model object directly was removed in 4.x.
			vector = ft_model.wv[word]
			# BM25 term weight:
			#   idf * tf * (k1 + 1) / (k1 * (1 - b + b * doc_len / avgdl) + tf)
			# The division previously sat alone on its own line, which is a
			# syntax error; the expression is now one parenthesised statement.
			term_freq = bm25.doc_freqs[i][word]
			length_norm = 1.0 - bm25.b + bm25.b * (bm25.doc_len[i] / bm25.avgdl)
			weight = (bm25.idf[word] * ((bm25.k1 + 1.0) * term_freq)) / (
				bm25.k1 * length_norm + term_freq
			)
			weighted_vector = vector * weight

joshua-taylor / NMSLIB.py

Last active October 10, 2020 15:20

	import nmslib

	# Stack the per-document vectors row-wise into a single matrix.
	data = np.vstack(weighted_doc_vects)

	# Set up an HNSW index that compares points by cosine similarity,
	# load the matrix into it, then build the index with progress output.
	# NOTE(review): `post` is presumably an HNSW post-processing setting —
	# confirm against the nmslib documentation.
	index = nmslib.init(space='cosinesimil', method='hnsw')
	index.addDataPointBatch(data)
	index.createIndex({'post': 2}, print_progress=True)

joshua-taylor / Query.py

Created October 10, 2020 15:50

	# Embed a free-text query and look up its 10 nearest neighbours in `index`.
	# NOTE: the listing ends at the `for` header below; the loop body is not shown.
	# NOTE(review): `input` shadows the built-in function of the same name.
	input = 'flood defences'.lower().split()

	# One vector per query token, then averaged into a single query vector.
	query = [ft_model[vec] for vec in input]
	query = np.mean(query,axis=0)

	# Time only the nearest-neighbour lookup.
	t0 = time.time()
	ids, distances = index.knnQuery(query, k=10)
	t1 = time.time()
	print(f'Searched {df.shape[0]} records in {round(t1-t0,4) } seconds \n')
	for i,j in zip(ids,distances):

joshua-taylor / supplyChainModel.py

Last active May 27, 2021 15:16

	from scipy.optimize import minimize, LinearConstraint, basinhopping
	from math import floor
	import numpy as np

	# Pricing inputs, one entry per supplier in each list.
	supplierPrice = [10.5, 11, 10]
	supplierDiscountAmount = [0.1, 0.35, 0.05]
	supplierDiscountThreshold = [100, 260, 300]
	# Supplier count, taken from the length of the price list.
	n_suppliers = len(supplierPrice)
	#Our minimum order amount