Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save bfarzin/b2eef9af92c42572bd4c055dceae8dd7 to your computer and use it in GitHub Desktop.
Save bfarzin/b2eef9af92c42572bd4c055dceae8dd7 to your computer and use it in GitHub Desktop.
NBSVM Baseline
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"NB-SVM baseline, based on Jeremy Howard's Kaggle kernel:\n",
"https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd, numpy as np\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"from pdb import set_trace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reading the texts"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Directory containing haha_2019_train.csv; change this one value to run the notebook elsewhere.\n",
"DATA_DIR = '/home/farzin/rnn_python_code/tweet_es_finetune'\n",
"all_texts_df = pd.read_csv(f'{DATA_DIR}/haha_2019_train.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>text</th>\n",
" <th>is_humor</th>\n",
" <th>votes_no</th>\n",
" <th>votes_1</th>\n",
" <th>votes_2</th>\n",
" <th>votes_3</th>\n",
" <th>votes_4</th>\n",
" <th>votes_5</th>\n",
" <th>funniness_average</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>705196579758583809</td>\n",
" <td>Niveles de retraso mental: \\r\\n\\r\\n— Bajo.\\r\\n...</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>678040651817213952</td>\n",
" <td>—Vamos Luke desenfunda tu sable, demuestra tu ...</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>546750892213829633</td>\n",
" <td>- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>965807211292364801</td>\n",
" <td>No se porqué me hago la cabeza deooos</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>638403841839484928</td>\n",
" <td>Quisiera saber que hago durante la siesta de l...</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id text \\\n",
"0 705196579758583809 Niveles de retraso mental: \\r\\n\\r\\n— Bajo.\\r\\n... \n",
"1 678040651817213952 —Vamos Luke desenfunda tu sable, demuestra tu ... \n",
"2 546750892213829633 - ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ... \n",
"3 965807211292364801 No se porqué me hago la cabeza deooos \n",
"4 638403841839484928 Quisiera saber que hago durante la siesta de l... \n",
"\n",
" is_humor votes_no votes_1 votes_2 votes_3 votes_4 votes_5 \\\n",
"0 1 1 2 2 0 0 0 \n",
"1 1 1 3 0 1 0 0 \n",
"2 1 0 2 1 0 1 1 \n",
"3 0 3 0 0 0 0 0 \n",
"4 0 4 0 1 0 0 0 \n",
"\n",
" funniness_average \n",
"0 1.5 \n",
"1 1.5 \n",
"2 2.6 \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_texts_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"rnd_seed = 20190313\n",
"np.random.seed(rnd_seed)\n",
"\n",
"# Shuffle the row positions, then carve the shuffled order into\n",
"# train | validation | held-out test.\n",
"idx = np.random.permutation(len(all_texts_df))\n",
"test_cut = int(0.15 * len(idx))\n",
"# The earlier `len(idx-test_cut)` subtracted a scalar from every element of `idx`,\n",
"# which leaves the length unchanged, so this has always been 15% of the full dataset.\n",
"valid_cut = int(0.15 * len(idx))\n",
"train_end = len(idx) - valid_cut - test_cut\n",
"\n",
"# Non-negative slice bounds: negative-offset forms such as `idx[:-(a+b)]` or\n",
"# `idx[-(a+b):-b]` collapse to an empty slice when a cut rounds down to 0 (-0 == 0).\n",
"df_train = all_texts_df.iloc[idx[:train_end],:]\n",
"# NOTE: `df_test` holds the validation slice; the last `test_cut` shuffled rows are\n",
"# left untouched by this notebook.\n",
"df_test = all_texts_df.iloc[idx[train_end:train_end+valid_cut],:]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((16800, 10), (3600, 10))"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.shape, df_test.shape"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import re, string\n",
"re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')\n",
"def tokenize(s): \n",
" return re_tok.sub(r' \\1 ', s).split()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Word uni- and bi-gram TF-IDF features. The vocabulary and IDF weights are fitted on\n",
"# df_train only; df_test is projected into that same feature space.\n",
"# Boolean options are passed as real booleans: recent scikit-learn releases reject ints here.\n",
"vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,\n",
"                      min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,\n",
"                      smooth_idf=True, sublinear_tf=True)\n",
"trn_term_doc = vec.fit_transform(df_train['text'])\n",
"test_term_doc = vec.transform(df_test['text'])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(<16800x22072 sparse matrix of type '<class 'numpy.float64'>'\n",
" \twith 388461 stored elements in Compressed Sparse Row format>,\n",
" <3600x22072 sparse matrix of type '<class 'numpy.float64'>'\n",
" \twith 79494 stored elements in Compressed Sparse Row format>)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trn_term_doc, test_term_doc"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def pr(y_i, y):\n",
"    \"\"\"Add-one-smoothed per-feature average over the rows of `x` whose label equals `y_i`.\n",
"\n",
"    `x` is not a parameter: it is looked up as a notebook-level global at call time,\n",
"    so it must be assigned before the first call.\n",
"    \"\"\"\n",
"    in_class = (y == y_i)\n",
"    feature_totals = x[in_class].sum(0)\n",
"    return (feature_totals + 1) / (in_class.sum() + 1)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"x = trn_term_doc\n",
"test_x = test_term_doc"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def get_mdl(y):\n",
"    \"\"\"Fit the NB-SVM style classifier for one binary label column.\n",
"\n",
"    y: label column exposing `.values` (e.g. a pandas Series); its entries are compared\n",
"       against 1 and 0 and must line up with the rows of the global `x`.\n",
"    Returns `(model, r)`: the fitted LogisticRegression and the log-ratio vector `r`.\n",
"    Features passed to the model later must be multiplied by the same `r`.\n",
"    \"\"\"\n",
"    y = y.values\n",
"    # Log of the smoothed class-1 / class-0 feature ratio (the naive Bayes term).\n",
"    r = np.log(pr(1,y) / pr(0,y))\n",
"    # dual=True is only implemented by the liblinear solver. scikit-learn >= 0.22 defaults\n",
"    # to lbfgs and raises on that combination, so the solver is named explicitly.\n",
"    m = LogisticRegression(C=4, dual=True, solver='liblinear')\n",
"    x_nb = x.multiply(r)\n",
"    return m.fit(x_nb, y), r"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fit is_humor\n"
]
}
],
"source": [
"label_cols = ['is_humor']\n",
"# One column per label, filled with the predicted probability of the model's second class.\n",
"preds = np.zeros((len(df_test), len(label_cols)))\n",
"\n",
"for col_idx, label in enumerate(label_cols):\n",
"    print('fit', label)\n",
"    m, r = get_mdl(df_train[label])\n",
"    # Evaluation features get the same log-ratio scaling the model was trained with.\n",
"    scaled_test_x = test_x.multiply(r)\n",
"    preds[:, col_idx] = m.predict_proba(scaled_test_x)[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"TP = ((preds > 0.5).astype(int) == df_test[label_cols].values)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Accuracy: 0.8344444444444444'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean over every cell of TP, so the figure stays a fraction in [0, 1] even if more label\n",
"# columns are added; dividing TP.sum() by the row count is only right for a single column.\n",
"f'Accuracy: {TP.mean()}'"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7 fasta.ai1 DEV",
"language": "python",
"name": "fastai1_dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment