def create_edges(line):
    # Parse a space-separated adjacency-list line and emit an edge in both directions
    # for every pair of node ids on the line.
    a = [int(x) for x in line.split(" ")]
    edges_list = []
    for i in range(0, len(a) - 1):
        for j in range(i + 1, len(a)):
            edges_list.append((a[i], a[j]))
            edges_list.append((a[j], a[i]))
    return edges_list
from graphframes import *

def vertices(line):
    # Parse all node ids on a space-separated adjacency-list line.
    vert = [int(x) for x in line.split(" ")]
    return vert

# Collect the distinct node ids and wrap them in a single-column DataFrame
# named "id", which is the column GraphFrames expects for vertices.
vertices = adjacency_list.flatMap(lambda x: vertices(x)).distinct().collect()
vertices = sqlContext.createDataFrame([[x] for x in vertices], ["id"])
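With the vertices DataFrame in place, a natural next step is to build the edges DataFrame with create_edges and wrap both in a GraphFrame. The sketch below is only an illustration: it assumes adjacency_list is an RDD of adjacency-list lines, that sc and sqlContext already exist, and the checkpoint path is arbitrary; running connected components is just one example of what GraphFrames can do with the graph.

# Sketch under the assumptions above: build edges, construct the graph, run connected components.
edges = adjacency_list.flatMap(create_edges).distinct().collect()
edges = sqlContext.createDataFrame(edges, ["src", "dst"])

g = GraphFrame(vertices, edges)

# connectedComponents() requires a Spark checkpoint directory to be set; the path here is arbitrary.
sc.setCheckpointDir("/tmp/graphframes_checkpoints")
cc = g.connectedComponents()
cc.show()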
from nltk.stem import SnowballStemmer
from nltk.tokenize.toktok import ToktokTokenizer

def stem_text(text):
    # Tokenize with Toktok, then reduce every token to its Snowball (English) stem.
    tokenizer = ToktokTokenizer()
    stemmer = SnowballStemmer('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemma_text(text):
    # Tokenize with Toktok, then lemmatize every token with the WordNet lemmatizer.
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)
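A quick side-by-side call shows the difference between the two normalizers; the exact output can vary with the NLTK version and data files, so the strings in the comments are only indicative.

sample = "the cats were running faster"
print(stem_text(sample))    # roughly: "the cat were run faster"  (stems need not be real words)
print(lemma_text(sample))   # roughly: "the cat were running faster"  (default noun lemmatization leaves verb forms alone)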
def clean_sentence(x):
    # Full cleaning pipeline; clean_text, clean_numbers, replace_typical_misspell,
    # remove_stopwords and replace_contractions are helper functions defined elsewhere.
    x = x.lower()
    x = clean_text(x)
    x = clean_numbers(x)
    x = replace_typical_misspell(x)
    x = remove_stopwords(x)
    x = replace_contractions(x)
    x = lemma_text(x)
    x = x.replace("'", "")
    return x
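A minimal sketch of how clean_sentence could produce the cleaned_text column used by the vectorizers below; the raw column name "text" is an assumption, not something taken from the original code.

# Hypothetical usage; "text" as the raw column name is an assumption.
train_df['cleaned_text'] = train_df['text'].astype(str).apply(clean_sentence)
test_df['cleaned_text'] = test_df['text'].astype(str).apply(clean_sentence)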
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

cnt_vectorizer = CountVectorizer(dtype=np.float32,
                                 strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                 ngram_range=(1, 3), min_df=3)

# We fit the count vectorizer on both train and test data so it sees every n-gram.
cnt_vectorizer.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain_cntv = cnt_vectorizer.transform(train_df.cleaned_text.values)
xtest_cntv = cnt_vectorizer.transform(test_df.cleaned_text.values)
# Always start with these features. They work (almost) every time!
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(dtype=np.float32, min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words='english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning).
tfv.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain_tfv = tfv.transform(train_df.cleaned_text.values)
xvalid_tfv = tfv.transform(test_df.cleaned_text.values)
# Always start with these features. They work (almost) every time!
from sklearn.feature_extraction.text import HashingVectorizer

hv = HashingVectorizer(dtype=np.float32,
                       strip_accents='unicode', analyzer='word',
                       ngram_range=(1, 4), n_features=2**12,
                       alternate_sign=False)  # replaces non_negative=True, which was removed in newer scikit-learn

# Fitting the hashing vectorizer to both training and test sets (it is stateless, so fit learns nothing).
hv.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain_hv = hv.transform(train_df.cleaned_text.values)
xvalid_hv = hv.transform(test_df.cleaned_text.values)
y_train = train_df.target.values
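Any of these sparse matrices can feed a standard scikit-learn estimator. The sketch below uses logistic regression on the TF-IDF features purely as an illustrative baseline; it is not necessarily the model used in the original notebook, and the hyperparameters are arbitrary.

from sklearn.linear_model import LogisticRegression

# Illustrative baseline on the TF-IDF features; C and solver are arbitrary choices.
clf = LogisticRegression(C=1.0, solver='liblinear')
clf.fit(xtrain_tfv, y_train)
# Probability of the positive class; assumes a binary target.
test_preds = clf.predict_proba(xvalid_tfv)[:, 1]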
# load the GloVe vectors in a dictionary:
def load_glove_index():
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    return embeddings_index

embeddings_index = load_glove_index()
print('Found %s word vectors.' % len(embeddings_index))
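From here, a common follow-up is to turn embeddings_index into an embedding matrix for a neural network. The helper below is only a sketch: word_index (a token-to-integer mapping, for example from a Keras Tokenizer) and the max_features / embed_size defaults are assumptions, not part of the original snippet.

import numpy as np

def build_embedding_matrix(embeddings_index, word_index, max_features=100000, embed_size=300):
    # Hypothetical helper: rows line up with the tokenizer's integer ids; unseen words stay all-zero.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size), dtype='float32')
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix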
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer("english")

def load_glove(word_dict, lemma_dict):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))