import re

def clean_text(x):
    # Remove every character that is not an ASCII letter, digit, or whitespace.
    # Note the capital Z: 'a-zA-z' would also match the punctuation characters
    # that sit between 'Z' and 'a' in the ASCII table.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', x)
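A quick check of the function above: punctuation is dropped while word spacing survives.

print(clean_text("What's the fastest way to learn NLP?"))
# -> 'Whats the fastest way to learn NLP'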
def clean_numbers(x):
    # Replace digit runs with '#' placeholders of matching length, so that
    # e.g. years become '####' -- the same placeholder tokens that appear in
    # pre-trained embedding vocabularies such as GoogleNews.
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x
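A quick check of the digit masking:

print(clean_numbers("He was born in 1995 and scored 42"))
# -> 'He was born in #### and scored ##'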
# This comes from CPMP's script in the Quora Question Pairs similarity challenge.
import re
from collections import Counter
import gensim
import heapq
from operator import itemgetter
from multiprocessing import Pool

# Load the pre-trained GoogleNews word2vec vectors (3M words, 300 dimensions).
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
    binary=True)
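The loaded KeyedVectors keep the vocabulary ordered by corpus frequency, which the CPMP-style spell checker uses to rank candidate corrections. A minimal sketch of that ranking step (w_rank is an illustrative name; index2word is the gensim 3.x attribute):

## Map each word to its frequency rank: a lower rank means a more common word.
words = model.index2word
w_rank = {word: i for i, word in enumerate(words)}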
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite',
                'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization',
                'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
                'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are',
                'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many',
                'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best',
                'howdoes': 'how does', 'mastrubation': 'masturbation',
                'mastrubate': 'masturbate', 'mastrubating': 'masturbating',
                'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist',
                'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota',
                'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', 'whst': 'what',
                'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

# The source snippet is cut off at "def _g"; the conventional completion
# compiles the dictionary keys into a single alternation regex:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re
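A sketch of the lookup-and-substitute step that typically follows (the name replace_typical_misspell is an assumption, mirroring the helper above):

mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    ## Replace every matched misspelling with its correction in one regex pass.
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

print(replace_typical_misspell("Whta is the sallary of a narcisist?"))
# -> 'What is the salary of a narcissist?'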
contraction_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot",
                    "'cause": "because", "could've": "could have", "couldn't": "could not",
                    "didn't": "did not", "doesn't": "does not", "don't": "do not",
                    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                    "he'd": "he would", "he'll": "he will", "he's": "he is",
                    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                    "how's": "how is", "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                    "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
                    "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                    "it's": "it is", "let's": "let us", "ma'am": "madam",
                    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
                    "mightn't've": "might not have", "must've": "must have",
                    "mustn't": "must not", "mustn't've": "must not have",
                    "needn't": "need not"}  # the source is cut off here; later entries are lost
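The dict is applied the same way as the misspelling dict (a sketch; the function names are assumptions following the pattern above):

def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

contractions, contractions_re = _get_contractions(contraction_dict)

def replace_contractions(text):
    ## Expand each contraction in a single regex pass.
    def replace(match):
        return contractions[match.group(0)]
    return contractions_re.sub(replace, text)

print(replace_contractions("I can't believe it's true"))
# -> 'I cannot believe it is true'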
# Signature:
Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
          lower=True, split=' ', char_level=False, oov_token=None,
          document_count=0, **kwargs)
from keras.preprocessing.text import Tokenizer

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X) + list(test_X))
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)
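A toy run (illustrative sentences) makes the indexing concrete: words are ranked by frequency, and shared words get shared indices.

toy = ["How do I learn NLP?", "How do I learn Python?"]
toy_tok = Tokenizer()
toy_tok.fit_on_texts(toy)
print(toy_tok.texts_to_sequences(toy))
# -> [[1, 2, 3, 4, 5], [1, 2, 3, 4, 6]]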
from keras.preprocessing.sequence import pad_sequences

## Pad the sequences to a fixed length
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)
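By default pad_sequences pads and truncates at the front (padding='pre'), producing a dense array of shape (n_samples, maxlen):

print(pad_sequences([[1, 2, 3]], maxlen=5))
# -> [[0 0 1 2 3]]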
import numpy as np

def load_glove_index():
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    return embeddings_index

glove_embedding_index = load_glove_index()
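A spot-check of the loaded index (assuming 'the' is in the vocabulary, which holds for GloVe): each entry maps a token to a 300-dimensional vector.

print(glove_embedding_index['the'].shape)
# -> (300,)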
import numpy as np
from tqdm import tqdm

def create_glove(word_index, embeddings_index):
    # Precomputed mean and std of the GloVe vectors, used to initialise
    # words that have no pre-trained embedding.
    emb_mean, emb_std = -0.005838499, 0.48782197
    all_embs = np.stack(embeddings_index.values())
    embed_size = all_embs.shape[1]
    nb_words = min(max_features, len(word_index))
    ## Start every row as random noise drawn from the embedding distribution.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    count_found = nb_words
    for word, i in tqdm(word_index.items()):
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)