Created
July 29, 2019 04:45
-
-
Save yongjun823/a856610f65812f8d249dfb1659ba578d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import pandas as pd\n", | |
"from tqdm import tqdm_notebook as tqdm\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import collections" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>diffbotUri</th>\n", | |
" <th>images</th>\n", | |
" <th>specs</th>\n", | |
" <th>text</th>\n", | |
" <th>title</th>\n", | |
" <th>tokens</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>product|4|-1577418857</td>\n", | |
" <td>[product|4|-1577418857_1, product|4|-157741885...</td>\n", | |
" <td>{'color': 'Black', 'size': 'All Sizes'}</td>\n", | |
" <td>Our organization is involved in fabrication of...</td>\n", | |
" <td>Offering you a complete choice of products whi...</td>\n", | |
" <td>[Offering, you, a, complete, choice, of, produ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>product|4|2080697291</td>\n", | |
" <td>[product|4|2080697291_1, product|4|2080697291_0]</td>\n", | |
" <td>{'specifications': '-'}</td>\n", | |
" <td>We are engaged in offering wide range of Cast ...</td>\n", | |
" <td>Acrylic Sheet in Surat</td>\n", | |
" <td>[Acrylic, Sheet, in, Surat, We, are, engaged, ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>product|4|802308696</td>\n", | |
" <td>[product|4|802308696_0]</td>\n", | |
" <td>{'hard_drive_interface': 'Serial ATA', 'proces...</td>\n", | |
" <td>Application data, virtual images, client files...</td>\n", | |
" <td>Netgear ReadyNAS 312 2-Bay, 2x2TB Desktop Drive</td>\n", | |
" <td>[Netgear, ReadyNAS, 312, 2-Bay, ,, 2x2TB, Desk...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" diffbotUri images \\\n", | |
"0 product|4|-1577418857 [product|4|-1577418857_1, product|4|-157741885... \n", | |
"1 product|4|2080697291 [product|4|2080697291_1, product|4|2080697291_0] \n", | |
"2 product|4|802308696 [product|4|802308696_0] \n", | |
"\n", | |
" specs \\\n", | |
"0 {'color': 'Black', 'size': 'All Sizes'} \n", | |
"1 {'specifications': '-'} \n", | |
"2 {'hard_drive_interface': 'Serial ATA', 'proces... \n", | |
"\n", | |
" text \\\n", | |
"0 Our organization is involved in fabrication of... \n", | |
"1 We are engaged in offering wide range of Cast ... \n", | |
"2 Application data, virtual images, client files... \n", | |
"\n", | |
" title \\\n", | |
"0 Offering you a complete choice of products whi... \n", | |
"1 Acrylic Sheet in Surat \n", | |
"2 Netgear ReadyNAS 312 2-Bay, 2x2TB Desktop Drive \n", | |
"\n", | |
" tokens \n", | |
"0 [Offering, you, a, complete, choice, of, produ... \n", | |
"1 [Acrylic, Sheet, in, Surat, We, are, engaged, ... \n", | |
"2 [Netgear, ReadyNAS, 312, 2-Bay, ,, 2x2TB, Desk... " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_json('../../mae_data/sample/dataset-000-sample.json')\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"dir_name = '../../mae_data/val/'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"jsons = os.listdir(dir_name)\n", | |
"df_arr = []\n", | |
"\n", | |
"for val in tqdm(jsons):\n", | |
" df_t = pd.read_json(dir_name + val)\n", | |
" df_arr.append(df_t)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.concat(df_arr, ignore_index=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"t_l = list(df['text'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"len(t_l)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
   "source": [
    "# Tokenization\n",
    "For a textual description D, we first tokenize the text using the Stanford\n",
    "tokenizer (Stanford CoreNLP natural language processing toolkit), then embed\n",
    "all of the words with the GloVe algorithm (Pennington et al., EMNLP 2014),\n",
    "trained on all of the descriptions in the training data. "
   ]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import nltk \n", | |
"from nltk.tokenize import word_tokenize, sent_tokenize\n", | |
"from nltk.tokenize.stanford import StanfordTokenizer\n", | |
"from nltk.tag import pos_tag\n", | |
"from nltk.stem import PorterStemmer, WordNetLemmatizer" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"col0 = df.iloc[1]\n", | |
"col0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"nltk_tokens = word_tokenize(col0['title'] + ' ' + col0['text'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"temp_tokens = col0['tokens']\n", | |
"\n", | |
"print(nltk_tokens)\n", | |
"print()\n", | |
"print(temp_tokens)\n", | |
"# for t, n in zip(nltk_tokens, temp_tokens):\n", | |
"# print(t, n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"set(temp_tokens) - set(nltk_tokens)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true, | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"nltk_tag = pos_tag(nltk_tokens)\n", | |
"\n", | |
"print(nltk_tag)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"tb = TextBlob(col0['text'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"tb.sentiment" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"sentiment_arr = []\n", | |
"str_len_arr = []\n", | |
"\n", | |
"for idx, row in tqdm(df.iterrows(), total=224417):\n", | |
" polarity = TextBlob(row['text']).polarity\n", | |
" str_len = len(row['text'])\n", | |
" \n", | |
" sentiment_arr.append(polarity)\n", | |
" str_len_arr.append(str_len)\n", | |
" \n", | |
"df['sentiment'] = sentiment_arr\n", | |
"df['length'] = str_len_arr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.hist(df['sentiment'], 50)\n", | |
"plt.title(f'{dir_name[:-1]} sentiment')\n", | |
"plt.ylabel('count')\n", | |
"plt.xlabel('sentiment')\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.hist(df['length'], 50)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
   "source": [
    "# Word-embedding test (Doc2Vec)"
   ]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import gensim\n", | |
"from gensim.test.utils import common_texts\n", | |
"from gensim.models.doc2vec import Doc2Vec, TaggedDocument" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(list(df['tokens'][:1000]))]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"model.build_vocab(documents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"model.infer_vector(['today', 'is', 'monday'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"model.save('doc2vec_model')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"model = Doc2Vec.load('doc2vec_model')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 1.90684968e-03, 5.10863215e-03, -6.83930330e-03, 4.24504466e-02,\n", | |
" -8.52093566e-03, 2.18098294e-02, 1.32803489e-02, -1.64024420e-02,\n", | |
" -1.08477520e-02, -2.82693040e-02, -1.81713346e-02, -5.12728058e-02,\n", | |
" 9.23898350e-03, 3.69524513e-03, 2.79692058e-02, -8.50660354e-03,\n", | |
" -2.50089038e-02, -3.43031175e-02, -1.37462299e-02, -1.66270784e-05,\n", | |
" 2.29600780e-02, 2.84677893e-02, 8.70808400e-03, -4.41240408e-02,\n", | |
" 3.41164470e-02, -3.41066346e-03, -1.31926257e-02, 1.75585523e-02,\n", | |
" 1.46337249e-03, -3.57093737e-02, 1.09483898e-02, -2.06415392e-02,\n", | |
" 3.47126834e-02, 1.82276927e-02, 1.78174134e-02, 1.06351534e-02,\n", | |
" -5.87584637e-03, 2.28420668e-03, -1.01194605e-02, 4.11109962e-02,\n", | |
" -4.67140274e-03, -1.49065945e-02, -2.38524959e-03, 3.70601304e-02,\n", | |
" 1.55035721e-03, -3.38417664e-03, -5.95479039e-03, -2.35019345e-02,\n", | |
" 2.09632199e-02, -1.36642819e-02], dtype=float32)" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model.infer_vector(['today', 'is', 'monday'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ 0.02318487 0.12941211 -0.01019498 0.39590803 -0.19890492 0.29937196\n", | |
" 0.19320437 -0.20907912 -0.01436204 -0.37281448 -0.08047993 -0.3886785\n", | |
" 0.09956551 0.06531233 0.1989694 -0.05316336 -0.36244977 -0.4163024\n", | |
" -0.19148456 0.13530931 0.18599701 0.2871262 0.07562612 -0.40462962\n", | |
" 0.28658265 0.03622538 -0.11750802 0.22731116 0.10037821 -0.24162376\n", | |
" 0.14992128 -0.10280453 0.27019066 0.12914793 0.15215388 0.04479128\n", | |
" -0.15198174 -0.04264252 0.10957595 0.28032976 -0.10376939 -0.1324438\n", | |
" -0.11523126 0.34913477 -0.10260321 -0.03481036 -0.12959634 -0.11055352\n", | |
" 0.1551514 -0.14948249]\n", | |
"[ 0.02728441 0.08616875 -0.03975851 0.17530279 -0.0712239 0.11183552\n", | |
" 0.06924889 -0.08384503 0.00166007 -0.10335442 0.00223895 -0.19318974\n", | |
" -0.02932651 -0.04717765 0.10307036 0.00750166 -0.14363228 -0.21201877\n", | |
" -0.08054679 -0.00451265 0.06114795 0.14366579 -0.00170667 -0.12150924\n", | |
" 0.03641564 -0.06039124 -0.03471309 0.11358725 0.05010426 -0.08537217\n", | |
" 0.08282162 -0.02722 0.10330863 0.01061063 0.01847071 0.03365278\n", | |
" -0.03798063 -0.03324975 0.03354255 0.10738426 -0.08342923 -0.0341348\n", | |
" -0.04731661 0.19942264 0.02280398 0.09444627 -0.06170543 -0.0471497\n", | |
" 0.06000689 -0.13715923]\n", | |
"[ 0.04497992 0.04371653 0.13521104 0.5063594 -0.04960288 0.13490444\n", | |
" 0.31478998 -0.21063672 -0.07391322 -0.6034117 -0.1887687 -0.46626717\n", | |
" 0.23339225 0.34287342 0.5355682 -0.17295438 -0.36015546 -0.40443167\n", | |
" -0.12740871 0.33432946 0.16438791 0.46586597 -0.03368901 -0.63075244\n", | |
" 0.43255258 -0.12147865 -0.15518221 0.26541817 0.07995774 -0.41066536\n", | |
" 0.13746564 -0.33804914 0.56365365 0.14739208 0.31324977 0.34184933\n", | |
" -0.13105187 0.03928495 -0.18239 0.54022294 -0.20116769 -0.14667669\n", | |
" 0.03750107 0.4277502 0.00952854 -0.2622945 -0.379095 -0.19737807\n", | |
" 0.4030422 -0.32747307]\n" | |
] | |
} | |
], | |
"source": [ | |
"for tokens in list(df['tokens']):\n", | |
" print(model.infer_vector(tokens))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"ranks = []\n", | |
"second_ranks = []\n", | |
"for doc_id in range(len(documents)):\n", | |
" inferred_vector = model.infer_vector(documents[doc_id].words)\n", | |
" sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))\n", | |
" rank = [docid for docid, sim in sims].index(doc_id)\n", | |
" ranks.append(rank)\n", | |
" \n", | |
" second_ranks.append(sims[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"collections.Counter(ranks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.6" | |
}, | |
"widgets": { | |
"state": { | |
"42a2357e7de140aaa445b8522e3d6957": { | |
"views": [ | |
{ | |
"cell_index": 3 | |
} | |
] | |
} | |
}, | |
"version": "1.2.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment