@LowriWilliams
Created August 16, 2020 10:53
sms_adversarial/word2vec
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time taken to train word2vec model: 2.8230369091033936\n"
]
}
],
"source": [
"# Skip-gram model (sg = 1)\n",
"size = 1000\n",
"window = 3\n",
"min_count = 1\n",
"workers = 3\n",
"sg = 1\n",
"\n",
"start_time = time.time()\n",
"tokens = pd.Series(df['patterns']).values\n",
"# Train the Word2Vec Model\n",
"w2v_model = Word2Vec(tokens, min_count = min_count, size = size, workers = workers, window = window, sg = sg)\n",
"print(\"Time taken to train word2vec model: \" + str(time.time() - start_time))\n",
"\n",
"word2vec_model_file = 'word2vec_' + str(size) + '.model'\n",
"w2v_model.save(word2vec_model_file)"
]
},
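{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the trained embeddings (a minimal sketch): look up the nearest neighbours of a query token. `'free'` is only a placeholder; substitute any token that actually occurs in `df['patterns']`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: nearest neighbours of a query token in the trained model.\n",
"# 'free' is a placeholder; replace it with any token from the corpus.\n",
"query_token = 'free'\n",
"if query_token in w2v_model.wv.vocab:\n",
"    print(w2v_model.wv.most_similar(query_token, topn=5))\n",
"else:\n",
"    print(query_token + ' is not in the vocabulary')"
]
},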
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total number of words\n",
"8355\n"
]
}
],
"source": [
"# Load the model from the model file\n",
"sg_w2v_model = Word2Vec.load(word2vec_model_file)\n",
"\n",
"# Total number of the words \n",
"print(\"Total number of words\")\n",
"print(len(sg_w2v_model.wv.vocab))"
]
},
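{
"cell_type": "markdown",
"metadata": {},
"source": [
"The notebook uses the gensim 3.x API (`.wv.vocab`, `size=`). As an optional check, sketched below, individual word vectors can be inspected to confirm they have the expected dimensionality; `'free'` is again a placeholder token."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at a few vocabulary entries and confirm vector dimensionality (should equal size = 1000).\n",
"# 'free' is a placeholder token; any word in sg_w2v_model.wv.vocab will do.\n",
"print(list(sg_w2v_model.wv.vocab)[:10])\n",
"\n",
"example_token = 'free'\n",
"if example_token in sg_w2v_model.wv.vocab:\n",
"    print(sg_w2v_model.wv[example_token].shape)  # expected: (1000,)"
]
},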
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:6: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" \n",
"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/numpy/core/fromnumeric.py:3335: RuntimeWarning: Mean of empty slice.\n",
" out=out, **kwargs)\n",
"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/numpy/core/_methods.py:161: RuntimeWarning: invalid value encountered in double_scalars\n",
" ret = ret.dtype.type(ret / rcount)\n"
]
}
],
"source": [
"# Store the vectors for train data in following file\n",
"word2vec_filename = 'all_review_word2vec.csv'\n",
"\n",
"with open(word2vec_filename, 'w+') as word2vec_file:\n",
" for index, row in df.iterrows():\n",
" model_vector = (np.mean([sg_w2v_model[token] for token in row['patterns']], axis=0)).tolist()\n",
" \n",
" if index == 0:\n",
" header = \",\".join(str(ele) for ele in range(1000))\n",
" word2vec_file.write(header)\n",
" word2vec_file.write(\"\\n\")\n",
" # Check if the line exists else it is vector of zeros\n",
" if type(model_vector) is list: \n",
" line1 = \",\".join( [str(vector_element) for vector_element in model_vector] )\n",
" else:\n",
" line1 = \",\".join([str(0) for i in range(size)])\n",
" word2vec_file.write(line1)\n",
" word2vec_file.write('\\n')"
]
},
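{
"cell_type": "markdown",
"metadata": {},
"source": [
"The deprecation and empty-slice warnings above come from indexing the model object directly and from averaging rows whose token list yields no vectors. The cell below sketches an equivalent, warning-free version of the same averaging step; it assumes `df['patterns']` holds lists of tokens and writes to a separate, hypothetical filename so the original output file is left untouched."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Warning-free variant of the averaging step above (a sketch):\n",
"# index vectors via model.wv and fall back to a zero vector when a row has no known tokens.\n",
"def document_vector(tokens, model, vector_size):\n",
"    vectors = [model.wv[token] for token in tokens if token in model.wv.vocab]\n",
"    if len(vectors) == 0:\n",
"        return np.zeros(vector_size)\n",
"    return np.mean(vectors, axis=0)\n",
"\n",
"doc_vectors = pd.DataFrame(\n",
"    [document_vector(row, sg_w2v_model, size) for row in df['patterns']]\n",
")\n",
"# Hypothetical alternative filename, chosen so all_review_word2vec.csv is not overwritten\n",
"doc_vectors.to_csv('all_review_word2vec_wv.csv', index=False)"
]
},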
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>992</th>\n",
" <th>993</th>\n",
" <th>994</th>\n",
" <th>995</th>\n",
" <th>996</th>\n",
" <th>997</th>\n",
" <th>998</th>\n",
" <th>999</th>\n",
" <th>label_not_spam</th>\n",
" <th>label_spam</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-0.002518</td>\n",
" <td>0.005691</td>\n",
" <td>0.028719</td>\n",
" <td>0.112553</td>\n",
" <td>0.004954</td>\n",
" <td>-0.000118</td>\n",
" <td>-0.047354</td>\n",
" <td>0.063511</td>\n",
" <td>0.015165</td>\n",
" <td>0.065314</td>\n",
" <td>...</td>\n",
" <td>0.022455</td>\n",
" <td>0.023590</td>\n",
" <td>0.049012</td>\n",
" <td>0.042182</td>\n",
" <td>0.067843</td>\n",
" <td>0.012421</td>\n",
" <td>-0.026096</td>\n",
" <td>0.048588</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.003335</td>\n",
" <td>0.006316</td>\n",
" <td>0.030497</td>\n",
" <td>0.118717</td>\n",
" <td>0.004992</td>\n",
" <td>0.000358</td>\n",
" <td>-0.049837</td>\n",
" <td>0.067021</td>\n",
" <td>0.016288</td>\n",
" <td>0.069339</td>\n",
" <td>...</td>\n",
" <td>0.024083</td>\n",
" <td>0.024686</td>\n",
" <td>0.051802</td>\n",
" <td>0.044244</td>\n",
" <td>0.071909</td>\n",
" <td>0.013398</td>\n",
" <td>-0.027529</td>\n",
" <td>0.051413</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.001751</td>\n",
" <td>0.004171</td>\n",
" <td>0.024952</td>\n",
" <td>0.101387</td>\n",
" <td>0.005779</td>\n",
" <td>-0.000073</td>\n",
" <td>-0.042342</td>\n",
" <td>0.057126</td>\n",
" <td>0.014099</td>\n",
" <td>0.059079</td>\n",
" <td>...</td>\n",
" <td>0.020235</td>\n",
" <td>0.020780</td>\n",
" <td>0.044079</td>\n",
" <td>0.038243</td>\n",
" <td>0.060210</td>\n",
" <td>0.011311</td>\n",
" <td>-0.023804</td>\n",
" <td>0.043236</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.004231</td>\n",
" <td>0.008591</td>\n",
" <td>0.040617</td>\n",
" <td>0.158573</td>\n",
" <td>0.006669</td>\n",
" <td>0.000397</td>\n",
" <td>-0.066657</td>\n",
" <td>0.089698</td>\n",
" <td>0.021396</td>\n",
" <td>0.092416</td>\n",
" <td>...</td>\n",
" <td>0.032019</td>\n",
" <td>0.033373</td>\n",
" <td>0.069072</td>\n",
" <td>0.059283</td>\n",
" <td>0.096124</td>\n",
" <td>0.017709</td>\n",
" <td>-0.036909</td>\n",
" <td>0.068895</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.003645</td>\n",
" <td>0.007694</td>\n",
" <td>0.037233</td>\n",
" <td>0.145041</td>\n",
" <td>0.005848</td>\n",
" <td>0.000056</td>\n",
" <td>-0.060643</td>\n",
" <td>0.081297</td>\n",
" <td>0.019609</td>\n",
" <td>0.084285</td>\n",
" <td>...</td>\n",
" <td>0.028998</td>\n",
" <td>0.030250</td>\n",
" <td>0.062514</td>\n",
" <td>0.054107</td>\n",
" <td>0.087722</td>\n",
" <td>0.015767</td>\n",
" <td>-0.033356</td>\n",
" <td>0.062634</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.002665</td>\n",
" <td>0.006103</td>\n",
" <td>0.031361</td>\n",
" <td>0.123847</td>\n",
" <td>0.005783</td>\n",
" <td>-0.000024</td>\n",
" <td>-0.051800</td>\n",
" <td>0.069786</td>\n",
" <td>0.016936</td>\n",
" <td>0.072035</td>\n",
" <td>...</td>\n",
" <td>0.024850</td>\n",
" <td>0.025738</td>\n",
" <td>0.053763</td>\n",
" <td>0.046302</td>\n",
" <td>0.074421</td>\n",
" <td>0.013556</td>\n",
" <td>-0.028769</td>\n",
" <td>0.053455</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>-0.002668</td>\n",
" <td>0.005939</td>\n",
" <td>0.029417</td>\n",
" <td>0.115549</td>\n",
" <td>0.005098</td>\n",
" <td>-0.000145</td>\n",
" <td>-0.048460</td>\n",
" <td>0.064702</td>\n",
" <td>0.015695</td>\n",
" <td>0.067226</td>\n",
" <td>...</td>\n",
" <td>0.023183</td>\n",
" <td>0.024084</td>\n",
" <td>0.050333</td>\n",
" <td>0.043130</td>\n",
" <td>0.069733</td>\n",
" <td>0.012727</td>\n",
" <td>-0.026763</td>\n",
" <td>0.049763</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>-0.001704</td>\n",
" <td>0.003695</td>\n",
" <td>0.020013</td>\n",
" <td>0.079906</td>\n",
" <td>0.003758</td>\n",
" <td>-0.000446</td>\n",
" <td>-0.033470</td>\n",
" <td>0.044813</td>\n",
" <td>0.010834</td>\n",
" <td>0.046494</td>\n",
" <td>...</td>\n",
" <td>0.016119</td>\n",
" <td>0.016377</td>\n",
" <td>0.034478</td>\n",
" <td>0.029979</td>\n",
" <td>0.047723</td>\n",
" <td>0.008557</td>\n",
" <td>-0.018586</td>\n",
" <td>0.034297</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>-0.002183</td>\n",
" <td>0.004303</td>\n",
" <td>0.029799</td>\n",
" <td>0.123783</td>\n",
" <td>0.007773</td>\n",
" <td>-0.001676</td>\n",
" <td>-0.052463</td>\n",
" <td>0.070131</td>\n",
" <td>0.016861</td>\n",
" <td>0.072545</td>\n",
" <td>...</td>\n",
" <td>0.025195</td>\n",
" <td>0.024295</td>\n",
" <td>0.053709</td>\n",
" <td>0.046649</td>\n",
" <td>0.073422</td>\n",
" <td>0.012588</td>\n",
" <td>-0.029430</td>\n",
" <td>0.052398</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>-0.002857</td>\n",
" <td>0.006036</td>\n",
" <td>0.040144</td>\n",
" <td>0.166509</td>\n",
" <td>0.010320</td>\n",
" <td>0.000007</td>\n",
" <td>-0.070213</td>\n",
" <td>0.093398</td>\n",
" <td>0.022464</td>\n",
" <td>0.097874</td>\n",
" <td>...</td>\n",
" <td>0.033678</td>\n",
" <td>0.034048</td>\n",
" <td>0.072506</td>\n",
" <td>0.062497</td>\n",
" <td>0.098301</td>\n",
" <td>0.018354</td>\n",
" <td>-0.039930</td>\n",
" <td>0.069753</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 1002 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 -0.002518 0.005691 0.028719 0.112553 0.004954 -0.000118 -0.047354 \n",
"1 -0.003335 0.006316 0.030497 0.118717 0.004992 0.000358 -0.049837 \n",
"2 -0.001751 0.004171 0.024952 0.101387 0.005779 -0.000073 -0.042342 \n",
"3 -0.004231 0.008591 0.040617 0.158573 0.006669 0.000397 -0.066657 \n",
"4 -0.003645 0.007694 0.037233 0.145041 0.005848 0.000056 -0.060643 \n",
"5 -0.002665 0.006103 0.031361 0.123847 0.005783 -0.000024 -0.051800 \n",
"6 -0.002668 0.005939 0.029417 0.115549 0.005098 -0.000145 -0.048460 \n",
"7 -0.001704 0.003695 0.020013 0.079906 0.003758 -0.000446 -0.033470 \n",
"8 -0.002183 0.004303 0.029799 0.123783 0.007773 -0.001676 -0.052463 \n",
"9 -0.002857 0.006036 0.040144 0.166509 0.010320 0.000007 -0.070213 \n",
"\n",
" 7 8 9 ... 992 993 994 995 \\\n",
"0 0.063511 0.015165 0.065314 ... 0.022455 0.023590 0.049012 0.042182 \n",
"1 0.067021 0.016288 0.069339 ... 0.024083 0.024686 0.051802 0.044244 \n",
"2 0.057126 0.014099 0.059079 ... 0.020235 0.020780 0.044079 0.038243 \n",
"3 0.089698 0.021396 0.092416 ... 0.032019 0.033373 0.069072 0.059283 \n",
"4 0.081297 0.019609 0.084285 ... 0.028998 0.030250 0.062514 0.054107 \n",
"5 0.069786 0.016936 0.072035 ... 0.024850 0.025738 0.053763 0.046302 \n",
"6 0.064702 0.015695 0.067226 ... 0.023183 0.024084 0.050333 0.043130 \n",
"7 0.044813 0.010834 0.046494 ... 0.016119 0.016377 0.034478 0.029979 \n",
"8 0.070131 0.016861 0.072545 ... 0.025195 0.024295 0.053709 0.046649 \n",
"9 0.093398 0.022464 0.097874 ... 0.033678 0.034048 0.072506 0.062497 \n",
"\n",
" 996 997 998 999 label_not_spam label_spam \n",
"0 0.067843 0.012421 -0.026096 0.048588 1 0 \n",
"1 0.071909 0.013398 -0.027529 0.051413 1 0 \n",
"2 0.060210 0.011311 -0.023804 0.043236 0 1 \n",
"3 0.096124 0.017709 -0.036909 0.068895 1 0 \n",
"4 0.087722 0.015767 -0.033356 0.062634 1 0 \n",
"5 0.074421 0.013556 -0.028769 0.053455 0 1 \n",
"6 0.069733 0.012727 -0.026763 0.049763 1 0 \n",
"7 0.047723 0.008557 -0.018586 0.034297 1 0 \n",
"8 0.073422 0.012588 -0.029430 0.052398 0 1 \n",
"9 0.098301 0.018354 -0.039930 0.069753 0 1 \n",
"\n",
"[10 rows x 1002 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"word2vec_df = pd.read_csv(word2vec_filename)\n",
"word2vec_df['label'] = df['label']\n",
"\n",
"# Encode labels\n",
"word2vec_df = pd.get_dummies(word2vec_df, columns=['label'])\n",
"\n",
"display(word2vec_df.head(10))"
]
}
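,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the next step is to train a classifier, the embedding columns and a single binary target can be separated as sketched below; `label_spam` is one of the dummy columns created above, and the two dummies are complementary, so keeping one of them as the target is enough."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the 1000 embedding columns from the one-hot label columns.\n",
"# label_not_spam is the complement of label_spam, so a single target column suffices.\n",
"feature_columns = [col for col in word2vec_df.columns if not col.startswith('label_')]\n",
"X = word2vec_df[feature_columns]\n",
"y = word2vec_df['label_spam']\n",
"print(X.shape, y.shape)"
]
}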
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}