Created
August 1, 2017 03:07
-
-
Save zevisert/53348d6a1606d320dcc7f1727bbcb849 to your computer and use it in GitHub Desktop.
How I did NLP assignment 3 Q2 - maybe useful for our project as we move into MLPs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import html\n", | |
"import xmltodict\n", | |
"import pandas as pd\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn.neural_network import MLPClassifier\n", | |
"from sklearn.preprocessing import MaxAbsScaler\n", | |
"from sklearn.metrics import classification_report, confusion_matrix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Replacement text for the <head> / </head> tags; set headTags to None\n", | |
"# to leave those tags in place.\n", | |
"headTags = (\"000\", \"000\")\n", | |
"\n", | |
"# Unescape HTML entities line by line and wrap the whole file in a single\n", | |
"# <EnglishLS> root element before it is parsed.\n", | |
"with open(\"./EnglishLS.train\") as encoded, open(\"./EnglishLS.train.decoded\", \"w\") as decoded:\n", | |
"    decoded.write(\"<EnglishLS>\")\n", | |
"    for line in encoded:\n", | |
"        line = html.unescape(line)\n", | |
"        if headTags is not None:\n", | |
"            line = line.replace(\"<head>\", headTags[0])\n", | |
"            line = line.replace(\"</head>\", headTags[1])\n", | |
"        decoded.write(line)\n", | |
"    decoded.write(\"</EnglishLS>\")\n", | |
"\n", | |
"# Read the decoded copy back and parse it with xmltodict.\n", | |
"with open('./EnglishLS.train.decoded') as decoded_xml:\n", | |
"    trainxml = xmltodict.parse(decoded_xml.read())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Select the lexelt whose item is \"appear.v\" (the last one, should several match)\n", | |
"# and pull out its instances.\n", | |
"matching_lexelts = [\n", | |
"    entry for entry in trainxml['EnglishLS']['lexelt']\n", | |
"    if entry['@item'] == \"appear.v\"\n", | |
"]\n", | |
"appear = matching_lexelts[-1]\n", | |
"instances = appear['instance']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Tabulate the instances (assumes `instances` is a list of records, one per row).\n", | |
"train = pd.DataFrame(data=instances)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Some rows hold a list of answers instead of a single one;\n", | |
"# for those rows keep only the first answer.\n", | |
"mask = train['answer'].apply(lambda answer: isinstance(answer, list))\n", | |
"first_answers = train.loc[mask, 'answer'].apply(lambda answers: answers[0])\n", | |
"train.loc[mask, 'answer'] = first_answers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Features: word counts over each row's context, with English stop words removed.\n", | |
"X_text = train['context'].values\n", | |
"\n", | |
"vectorizer_options = {\n", | |
"    'input': 'content',\n", | |
"    'stop_words': 'english',\n", | |
"    'max_df': 1.0,\n", | |
"    'min_df': 1,\n", | |
"    'binary': False,\n", | |
"}\n", | |
"vectorizer = CountVectorizer(**vectorizer_options)\n", | |
"X = vectorizer.fit_transform(X_text)\n", | |
"\n", | |
"# Labels: the sense id stored on each row's answer.\n", | |
"y = train['answer'].apply(lambda answer: answer['@senseid']).values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Hold out part of the data for evaluation. The fixed seed makes the\n", | |
"# partition (and therefore every metric below) the same on each run.\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"MaxAbsScaler(copy=True)" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Learn the scaling from the training split only, then display the fitted scaler.\n", | |
"scaler = MaxAbsScaler().fit(X_train)\n", | |
"scaler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Apply the fitted scaler to both splits.\n", | |
"# NOTE: this overwrites X_train / X_test, so re-running the cell scales them again.\n", | |
"X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n", | |
" beta_2=0.999, early_stopping=False, epsilon=1e-08,\n", | |
" hidden_layer_sizes=(1000, 500, 30), learning_rate='constant',\n", | |
" learning_rate_init=0.001, max_iter=200, momentum=0.9,\n", | |
" nesterovs_momentum=True, power_t=0.5, random_state=None,\n", | |
" shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n", | |
" verbose=False, warm_start=False)" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Multi-layer perceptron with three hidden layers (1000, 500 and 30 units).\n", | |
"# random_state seeds weight initialisation and batch sampling so that\n", | |
"# repeated runs train the same network; the stored output below predates\n", | |
"# the seed and is refreshed on the next run.\n", | |
"mlp = MLPClassifier(\n", | |
"    hidden_layer_sizes=(1000, 500, 30),\n", | |
"    random_state=42\n", | |
")\n", | |
"\n", | |
"mlp.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Predict a label for every row of the held-out split.\n", | |
"predictions = mlp.predict(X=X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Train set size: (198, 6263)\n", | |
"Test set size: (67, 6263)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Report the shape of each split.\n", | |
"size_report = \"Train set size: {}\\nTest set size: {}\".format(X_train.shape, X_test.shape)\n", | |
"print(size_report)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[[ 7 15 2]\n", | |
" [ 2 32 3]\n", | |
" [ 0 3 3]]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Rows are the true labels, columns the predicted labels.\n", | |
"print(confusion_matrix(y_true=y_test, y_pred=predictions))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" 190901 0.78 0.29 0.42 24\n", | |
" 190902 0.64 0.86 0.74 37\n", | |
" 190903 0.38 0.50 0.43 6\n", | |
"\n", | |
"avg / total 0.67 0.63 0.60 67\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Per-label precision, recall, F1 and support on the held-out split.\n", | |
"print(classification_report(y_true=y_test, y_pred=predictions))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment