import re

def clean_text(x):
    # Remove every character that is not an ASCII letter, digit, or whitespace.
    # Note the capital Z: 'a-zA-z' would also match the punctuation characters
    # that sit between 'Z' and 'a' in the ASCII table.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', x)
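A quick check of the function above: punctuation is dropped while word spacing survives.

print(clean_text("What's the fastest way to learn NLP?"))
# -> 'Whats the fastest way to learn NLP'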
def clean_numbers(x):
    # Replace digit runs with '#' placeholders of matching length, so that
    # e.g. years become '####' -- the same placeholder tokens that appear in
    # pre-trained embedding vocabularies such as GoogleNews.
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x
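A quick check of the digit masking:

print(clean_numbers("He was born in 1995 and scored 42"))
# -> 'He was born in #### and scored ##'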
# This comes from CPMP's script in the Quora Question Pairs similarity challenge.
import re
from collections import Counter
import gensim
import heapq
from operator import itemgetter
from multiprocessing import Pool

# Load the pre-trained GoogleNews word2vec vectors (3M words, 300 dimensions).
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
    binary=True)
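The loaded KeyedVectors keep the vocabulary ordered by corpus frequency, which the CPMP-style spell checker uses to rank candidate corrections. A minimal sketch of that ranking step (w_rank is an illustrative name; index2word is the gensim 3.x attribute):

## Map each word to its frequency rank: a lower rank means a more common word.
words = model.index2word
w_rank = {word: i for i, word in enumerate(words)}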
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite',
                'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization',
                'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
                'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are',
                'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many',
                'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best',
                'howdoes': 'how does', 'mastrubation': 'masturbation',
                'mastrubate': 'masturbate', 'mastrubating': 'masturbating',
                'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist',
                'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota',
                'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', 'whst': 'what',
                'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

# The source snippet is cut off at "def _g"; the conventional completion
# compiles the dictionary keys into a single alternation regex:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re
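A sketch of the lookup-and-substitute step that typically follows (the name replace_typical_misspell is an assumption, mirroring the helper above):

mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    ## Replace every matched misspelling with its correction in one regex pass.
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

print(replace_typical_misspell("Whta is the sallary of a narcisist?"))
# -> 'What is the salary of a narcissist?'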
contraction_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot",
                    "'cause": "because", "could've": "could have", "couldn't": "could not",
                    "didn't": "did not", "doesn't": "does not", "don't": "do not",
                    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                    "he'd": "he would", "he'll": "he will", "he's": "he is",
                    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                    "how's": "how is", "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                    "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
                    "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                    "it's": "it is", "let's": "let us", "ma'am": "madam",
                    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
                    "mightn't've": "might not have", "must've": "must have",
                    "mustn't": "must not", "mustn't've": "must not have",
                    "needn't": "need not"}  # the source is cut off here; later entries are lost
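The dict is applied the same way as the misspelling dict (a sketch; the function names are assumptions following the pattern above):

def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

contractions, contractions_re = _get_contractions(contraction_dict)

def replace_contractions(text):
    ## Expand each contraction in a single regex pass.
    def replace(match):
        return contractions[match.group(0)]
    return contractions_re.sub(replace, text)

print(replace_contractions("I can't believe it's true"))
# -> 'I cannot believe it is true'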
# Signature:
Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
          lower=True, split=' ', char_level=False, oov_token=None,
          document_count=0, **kwargs)
from keras.preprocessing.text import Tokenizer

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X) + list(test_X))
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)
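A toy run (illustrative sentences) makes the indexing concrete: words are ranked by frequency, and shared words get shared indices.

toy = ["How do I learn NLP?", "How do I learn Python?"]
toy_tok = Tokenizer()
toy_tok.fit_on_texts(toy)
print(toy_tok.texts_to_sequences(toy))
# -> [[1, 2, 3, 4, 5], [1, 2, 3, 4, 6]]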
from keras.preprocessing.sequence import pad_sequences

## Pad the sequences to a fixed length
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)
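By default pad_sequences pads and truncates at the front (padding='pre'), producing a dense array of shape (n_samples, maxlen):

print(pad_sequences([[1, 2, 3]], maxlen=5))
# -> [[0 0 1 2 3]]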
import numpy as np

def load_glove_index():
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    return embeddings_index

glove_embedding_index = load_glove_index()
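A spot-check of the loaded index (assuming 'the' is in the vocabulary, which holds for GloVe): each entry maps a token to a 300-dimensional vector.

print(glove_embedding_index['the'].shape)
# -> (300,)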
import numpy as np
from tqdm import tqdm

def create_glove(word_index, embeddings_index):
    # Precomputed mean and std of the GloVe vectors, used to initialise
    # words that have no pre-trained embedding.
    emb_mean, emb_std = -0.005838499, 0.48782197
    all_embs = np.stack(embeddings_index.values())
    embed_size = all_embs.shape[1]
    nb_words = min(max_features, len(word_index))
    ## Start every row as random noise drawn from the embedding distribution.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    count_found = nb_words
    for word, i in tqdm(word_index.items()):
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)