def create_edges(line):
    # Emit every ordered pair of node ids that appear on one input line
    # (both directions, so the graph behaves as undirected).
    a = [int(x) for x in line.split(" ")]
    edges_list = []
    for i in range(0, len(a) - 1):
        for j in range(i + 1, len(a)):
            edges_list.append((a[i], a[j]))
            edges_list.append((a[j], a[i]))
    return edges_list
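For illustration, a single input line such as "1 2 3" (a made-up example) expands to every ordered pair:

edges = create_edges("1 2 3")
# [(1, 2), (2, 1), (1, 3), (3, 1), (2, 3), (3, 2)]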
from graphframes import *

def vertices(line):
    # Collect every node id that appears on an input line.
    vert = [int(x) for x in line.split(" ")]
    return vert

vertices = adjacency_list.flatMap(lambda x: vertices(x)).distinct().collect()
vertices = sqlContext.createDataFrame([[x] for x in vertices], ["id"])

def create_edges(line):
    a = [int(x) for x in line.split(" ")]
    edges_list = []
    for i in range(0, len(a) - 1):
        for j in range(i + 1, len(a)):
            edges_list.append((a[i], a[j]))
            edges_list.append((a[j], a[i]))
    return edges_list
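A sketch of how these pieces would typically be assembled into a GraphFrame. It assumes adjacency_list is an RDD of input lines, sqlContext and sc are the usual SQLContext and SparkContext, and the checkpoint path is arbitrary:

# Build the edges DataFrame from the same RDD and combine it with the vertices.
edges = adjacency_list.flatMap(create_edges)
edges = sqlContext.createDataFrame(edges, ["src", "dst"])

g = GraphFrame(vertices, edges)
# Example query: connected components (GraphFrames requires a checkpoint directory).
sc.setCheckpointDir("/tmp/graphframes_cps")
cc = g.connectedComponents()
cc.show()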
from nltk.stem import SnowballStemmer
from nltk.tokenize.toktok import ToktokTokenizer

def stem_text(text):
    # Tokenize, then reduce each token to its Snowball (Porter2) stem.
    tokenizer = ToktokTokenizer()
    stemmer = SnowballStemmer('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)
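A quick check of what the stemmer does (output shown approximately; exact stems depend on the Snowball rules):

print(stem_text("The programmers were running several programs"))
# roughly: "the programm were run sever program"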
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemma_text(text):
    # Tokenize, then map each token to its WordNet lemma.
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)
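For comparison, the lemmatizer keeps real dictionary words. It needs the WordNet corpus (nltk.download('wordnet')); output shown approximately:

print(lemma_text("The programmers were running several programs"))
# roughly: "The programmer were running several program"
# (lemmatize() defaults to noun POS, so "running" is left unchanged)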
def clean_sentence(x):
    # Full cleaning pipeline: lower-case, strip punctuation, drop numbers,
    # fix common misspellings, remove stopwords, expand contractions, lemmatize.
    x = x.lower()
    x = clean_text(x)
    x = clean_numbers(x)
    x = replace_typical_misspell(x)
    x = remove_stopwords(x)
    x = replace_contractions(x)
    x = lemma_text(x)
    x = x.replace("'", "")
    return x
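clean_sentence chains several helpers that are not shown in these snippets. The stand-ins below are only a minimal sketch of what they might look like; the original notebook's regexes, misspelling map, contraction map, and stopword source may differ.

import re
from nltk.corpus import stopwords  # requires nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))
CONTRACTIONS = {"can't": "cannot", "won't": "will not", "it's": "it is"}  # hypothetical subset
MISSPELLINGS = {"teh": "the", "recieve": "receive"}                       # hypothetical subset

def clean_text(x):
    # Keep letters, digits, apostrophes and spaces.
    return re.sub(r"[^a-z0-9' ]", " ", x)

def clean_numbers(x):
    # Drop standalone numbers.
    return re.sub(r"\d+", " ", x)

def replace_typical_misspell(x):
    return " ".join(MISSPELLINGS.get(w, w) for w in x.split())

def remove_stopwords(x):
    return " ".join(w for w in x.split() if w not in STOPWORDS)

def replace_contractions(x):
    return " ".join(CONTRACTIONS.get(w, w) for w in x.split())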
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

cnt_vectorizer = CountVectorizer(dtype=np.float32,
                                 strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                 ngram_range=(1, 3), min_df=3)
# We fit the count vectorizer on both train and test data to capture n-grams from each.
cnt_vectorizer.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain_cntv = cnt_vectorizer.transform(train_df.cleaned_text.values)
xtest_cntv = cnt_vectorizer.transform(test_df.cleaned_text.values)
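A quick sanity check on the resulting sparse matrices and the shared n-gram vocabulary:

print(xtrain_cntv.shape, xtest_cntv.shape)
print("n-gram vocabulary size:", len(cnt_vectorizer.vocabulary_))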
from sklearn.feature_extraction.text import TfidfVectorizer

# Always start with these features. They work (almost) every time!
tfv = TfidfVectorizer(dtype=np.float32, min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words='english')
# Fitting TF-IDF to both training and test sets (semi-supervised learning).
tfv.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain_tfv = tfv.transform(train_df.cleaned_text.values)
xvalid_tfv = tfv.transform(test_df.cleaned_text.values)
from sklearn.feature_extraction.text import HashingVectorizer

# Always start with these features. They work (almost) every time!
# Note: alternate_sign=False replaces the older non_negative=True argument
# in recent scikit-learn releases.
hv = HashingVectorizer(dtype=np.float32,
                       strip_accents='unicode', analyzer='word',
                       ngram_range=(1, 4), n_features=2**12, alternate_sign=False)
# Fitting the hashing vectorizer to both training and test sets (semi-supervised learning).
hv.fit(list(train_df.cleaned_text.values) + list(test_df.cleaned_text.values))
xtrain_hv = hv.transform(train_df.cleaned_text.values)
xvalid_hv = hv.transform(test_df.cleaned_text.values)

y_train = train_df.target.values
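With y_train in place, any of the three sparse feature sets can feed a simple baseline model. This is only a sketch: it assumes a binary target and uses scikit-learn's LogisticRegression, which is not necessarily the model used later.

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0, solver="liblinear")
clf.fit(xtrain_tfv, y_train)                 # xtrain_cntv or xtrain_hv work the same way
test_preds = clf.predict_proba(xvalid_tfv)[:, 1]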
# Load the GloVe vectors into a dictionary:
def load_glove_index():
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    return embeddings_index

embeddings_index = load_glove_index()
print('Found %s word vectors.' % len(embeddings_index))
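A common next step is to align embeddings_index with a model vocabulary. The sketch below assumes a word_index dict ({word: integer index}, e.g. from a Keras Tokenizer) that is not defined above, and an arbitrary max_features.

# Build an embedding matrix: row i holds the GloVe vector for the word with index i.
max_features = 100000
embed_size = 300
embedding_matrix = np.zeros((max_features, embed_size), dtype="float32")
for word, idx in word_index.items():
    if idx >= max_features:
        continue
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec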
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer("english")

def load_glove(word_dict, lemma_dict):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))