bfarzin · May 10, 2019 16:19
diff --git a/spanish_joke_classifier_NBSVM_baseline.ipynb b/spanish_joke_classifier_NBSVM_baseline.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "NB-SVM baseline, based on Jeremy Howard's Kaggle kernel:\n",
    "https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd, numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from pdb import set_trace"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading the texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the training CSV into a DataFrame.\n",
    "train_csv_path = '/home/farzin/rnn_python_code/tweet_es_finetune/haha_2019_train.csv'\n",
    "all_texts_df = pd.read_csv(train_csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>is_humor</th>\n",
       "      <th>votes_no</th>\n",
       "      <th>votes_1</th>\n",
       "      <th>votes_2</th>\n",
       "      <th>votes_3</th>\n",
       "      <th>votes_4</th>\n",
       "      <th>votes_5</th>\n",
       "      <th>funniness_average</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>705196579758583809</td>\n",
       "      <td>Niveles de retraso mental: \\r\\n\\r\\n— Bajo.\\r\\n...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>678040651817213952</td>\n",
       "      <td>—Vamos Luke desenfunda tu sable, demuestra tu ...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>546750892213829633</td>\n",
       "      <td>- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>965807211292364801</td>\n",
       "      <td>No se porqué me hago la cabeza deooos</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>638403841839484928</td>\n",
       "      <td>Quisiera saber que hago durante la siesta de l...</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   id                                               text  \\\n",
       "0  705196579758583809  Niveles de retraso mental: \\r\\n\\r\\n— Bajo.\\r\\n...   \n",
       "1  678040651817213952  —Vamos Luke desenfunda tu sable, demuestra tu ...   \n",
       "2  546750892213829633  - ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...   \n",
       "3  965807211292364801              No se porqué me hago la cabeza deooos   \n",
       "4  638403841839484928  Quisiera saber que hago durante la siesta de l...   \n",
       "\n",
       "   is_humor  votes_no  votes_1  votes_2  votes_3  votes_4  votes_5  \\\n",
       "0         1         1        2        2        0        0        0   \n",
       "1         1         1        3        0        1        0        0   \n",
       "2         1         0        2        1        0        1        1   \n",
       "3         0         3        0        0        0        0        0   \n",
       "4         0         4        0        1        0        0        0   \n",
       "\n",
       "   funniness_average  \n",
       "0                1.5  \n",
       "1                1.5  \n",
       "2                2.6  \n",
       "3                NaN  \n",
       "4                NaN  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows to check that the columns parsed as expected.\n",
    "all_texts_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "rnd_seed = 20190313\n",
    "np.random.seed(rnd_seed)  # fixed seed so the shuffle below is reproducible\n",
    "\n",
    "# Shuffle row positions, then carve fixed-size slices off the tail of the permutation.\n",
    "idx = np.random.permutation(len(all_texts_df))\n",
    "test_cut = int(0.15 * len(idx))\n",
    "# This used to read len(idx-test_cut): subtracting a scalar from the array leaves its\n",
    "# length unchanged, so the cut has always been 15% of ALL rows. Spell that out directly\n",
    "# instead of implying it is 15% of the remainder (the resulting split is unchanged).\n",
    "valid_cut = int(0.15 * len(idx))\n",
    "\n",
    "# Layout of idx: [ train | validation (valid_cut rows) | held-out test (test_cut rows) ]\n",
    "df_train = all_texts_df.iloc[idx[:-(valid_cut+test_cut)],:]\n",
    "# NOTE: despite its name, df_test is the validation slice; the final test_cut rows are\n",
    "# not used anywhere in this notebook.\n",
    "df_test  = all_texts_df.iloc[idx[-(valid_cut+test_cut):-test_cut],:]\n",
    "# To score on the held-out rows instead, use:\n",
    "# df_test      = all_texts_df.iloc[idx[-test_cut:],:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((16800, 10), (3600, 10))"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row/column counts of the training split and the evaluation split.\n",
    "(df_train.shape, df_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re, string\n",
    "\n",
    "# Escape the ASCII punctuation before placing it inside a character class: unescaped, it\n",
    "# only matched correctly because ',-.' happens to form a harmless range.\n",
    "_extra_punct = '“”¨«»®´·º½¾¿¡§£₤‘’'  # typographic symbols not in string.punctuation\n",
    "re_tok = re.compile(f'([{re.escape(string.punctuation)}{_extra_punct}])')\n",
    "\n",
    "def tokenize(s):\n",
    "    \"\"\"Split `s` on whitespace after padding every punctuation mark with spaces.\n",
    "\n",
    "    Each punctuation character matched by `re_tok` therefore becomes its own token.\n",
    "    \"\"\"\n",
    "    return re_tok.sub(r' \\1 ', s).split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = all_texts_df.shape[0]\n",
    "# Word uni- and bi-gram TF-IDF features. The flags are real booleans because newer\n",
    "# scikit-learn releases validate parameter types and reject the integer 1.\n",
    "vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,\n",
    "               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,\n",
    "               smooth_idf=True, sublinear_tf=True)\n",
    "# Fit the vocabulary and IDF weights on the training split only, then reuse them.\n",
    "trn_term_doc = vec.fit_transform(df_train['text'])\n",
    "test_term_doc = vec.transform(df_test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<16800x22072 sparse matrix of type '<class 'numpy.float64'>'\n",
       " \twith 388461 stored elements in Compressed Sparse Row format>,\n",
       " <3600x22072 sparse matrix of type '<class 'numpy.float64'>'\n",
       " \twith 79494 stored elements in Compressed Sparse Row format>)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the sparse document-term matrices built for the two splits.\n",
    "(trn_term_doc, test_term_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pr(y_i, y, feats=None):\n",
    "    \"\"\"Smoothed per-feature average over the rows whose label equals `y_i`.\n",
    "\n",
    "    y_i   -- class value to select.\n",
    "    y     -- array of labels, one per row of the feature matrix.\n",
    "    feats -- feature matrix to aggregate; when omitted, the notebook-level `x` is used,\n",
    "             which is how existing two-argument callers behave.\n",
    "\n",
    "    Returns (column sums over the selected rows + 1) / (number of selected rows + 1).\n",
    "    \"\"\"\n",
    "    if feats is None:\n",
    "        feats = x  # global assigned in a later cell; must be set before pr() is called\n",
    "    mask = (y == y_i)\n",
    "    p = feats[mask].sum(0)\n",
    "    return (p+1) / (mask.sum()+1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Short aliases for the feature matrices; pr() and get_mdl() read `x` as a global.\n",
    "x, test_x = trn_term_doc, test_term_doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_mdl(y):\n",
    "    \"\"\"Fit the NB-weighted logistic regression for one binary label column.\n",
    "\n",
    "    y -- pandas Series of 0/1 labels aligned with the rows of the global `x`.\n",
    "\n",
    "    Returns (fitted model, r), where r is the per-feature log-count ratio that\n",
    "    must also be multiplied into any matrix passed to the model at predict time.\n",
    "    \"\"\"\n",
    "    y = y.values\n",
    "    # Log of the ratio between the smoothed class-1 and class-0 feature averages.\n",
    "    r = np.log(pr(1,y) / pr(0,y))\n",
    "    # dual=True is only implemented by the liblinear solver. Naming the solver explicitly\n",
    "    # keeps this working on scikit-learn versions whose default solver is lbfgs.\n",
    "    m = LogisticRegression(C=4, dual=True, solver='liblinear')\n",
    "    x_nb = x.multiply(r)\n",
    "    return m.fit(x_nb, y), r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fit is_humor\n"
     ]
    }
   ],
   "source": [
    "label_cols = ['is_humor']\n",
    "# One column of positive-class probabilities per label.\n",
    "preds = np.zeros((len(df_test), len(label_cols)))\n",
    "\n",
    "for col_idx, label in enumerate(label_cols):\n",
    "    print('fit', label)\n",
    "    m, r = get_mdl(df_train[label])\n",
    "    # Apply the same log-count ratio scaling to the evaluation features before scoring.\n",
    "    class_probs = m.predict_proba(test_x.multiply(r))\n",
    "    preds[:, col_idx] = class_probs[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Element-wise correctness mask: True wherever the 0.5-thresholded prediction equals the\n",
    "# label. Despite the name, this counts true negatives as well as true positives.\n",
    "hard_preds = (preds > 0.5).astype(int)\n",
    "TP = (hard_preds == df_test[label_cols].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Accuracy: 0.8344444444444444'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of evaluation rows whose thresholded prediction matches the label.\n",
    "'Accuracy: {}'.format(TP.sum() / len(df_test))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7 fasta.ai1 DEV",
   "language": "python",
   "name": "fastai1_dev"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"NB-SVM baseline, based on Jeremy Howard's Kaggle kernel:\n",
	"https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"%reload_ext autoreload\n",
	"%autoreload 2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd, numpy as np\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
	"from pdb import set_trace"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Reading the texts"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load the training CSV into a DataFrame.\n",
	"train_csv_path = '/home/farzin/rnn_python_code/tweet_es_finetune/haha_2019_train.csv'\n",
	"all_texts_df = pd.read_csv(train_csv_path)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>text</th>\n",
	" <th>is_humor</th>\n",
	" <th>votes_no</th>\n",
	" <th>votes_1</th>\n",
	" <th>votes_2</th>\n",
	" <th>votes_3</th>\n",
	" <th>votes_4</th>\n",
	" <th>votes_5</th>\n",
	" <th>funniness_average</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>705196579758583809</td>\n",
	" <td>Niveles de retraso mental: \\r\\n\\r\\n— Bajo.\\r\\n...</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>2</td>\n",
	" <td>2</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>1.5</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>678040651817213952</td>\n",
	" <td>—Vamos Luke desenfunda tu sable, demuestra tu ...</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>3</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>1.5</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>546750892213829633</td>\n",
	" <td>- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>2</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>2.6</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>965807211292364801</td>\n",
	" <td>No se porqué me hago la cabeza deooos</td>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>638403841839484928</td>\n",
	" <td>Quisiera saber que hago durante la siesta de l...</td>\n",
	" <td>0</td>\n",
	" <td>4</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" id text \\\n",
	"0 705196579758583809 Niveles de retraso mental: \\r\\n\\r\\n— Bajo.\\r\\n... \n",
	"1 678040651817213952 —Vamos Luke desenfunda tu sable, demuestra tu ... \n",
	"2 546750892213829633 - ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ... \n",
	"3 965807211292364801 No se porqué me hago la cabeza deooos \n",
	"4 638403841839484928 Quisiera saber que hago durante la siesta de l... \n",
	"\n",
	" is_humor votes_no votes_1 votes_2 votes_3 votes_4 votes_5 \\\n",
	"0 1 1 2 2 0 0 0 \n",
	"1 1 1 3 0 1 0 0 \n",
	"2 1 0 2 1 0 1 1 \n",
	"3 0 3 0 0 0 0 0 \n",
	"4 0 4 0 1 0 0 0 \n",
	"\n",
	" funniness_average \n",
	"0 1.5 \n",
	"1 1.5 \n",
	"2 2.6 \n",
	"3 NaN \n",
	"4 NaN "
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Preview the first five rows to check that the columns parsed as expected.\n",
	"all_texts_df.head(n=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"rnd_seed = 20190313\n",
	"np.random.seed(rnd_seed)  # fixed seed so the shuffle below is reproducible\n",
	"\n",
	"# Shuffle row positions, then carve fixed-size slices off the tail of the permutation.\n",
	"idx = np.random.permutation(len(all_texts_df))\n",
	"test_cut = int(0.15 * len(idx))\n",
	"# This used to read len(idx-test_cut): subtracting a scalar from the array leaves its\n",
	"# length unchanged, so the cut has always been 15% of ALL rows. Spell that out directly\n",
	"# instead of implying it is 15% of the remainder (the resulting split is unchanged).\n",
	"valid_cut = int(0.15 * len(idx))\n",
	"\n",
	"# Layout of idx: [ train | validation (valid_cut rows) | held-out test (test_cut rows) ]\n",
	"df_train = all_texts_df.iloc[idx[:-(valid_cut+test_cut)],:]\n",
	"# NOTE: despite its name, df_test is the validation slice; the final test_cut rows are\n",
	"# not used anywhere in this notebook.\n",
	"df_test = all_texts_df.iloc[idx[-(valid_cut+test_cut):-test_cut],:]\n",
	"# To score on the held-out rows instead, use:\n",
	"# df_test = all_texts_df.iloc[idx[-test_cut:],:]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"((16800, 10), (3600, 10))"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Row/column counts of the training split and the evaluation split.\n",
	"(df_train.shape, df_test.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"import re, string\n",
	"\n",
	"# Escape the ASCII punctuation before placing it inside a character class: unescaped, it\n",
	"# only matched correctly because ',-.' happens to form a harmless range.\n",
	"_extra_punct = '“”¨«»®´·º½¾¿¡§£₤‘’'  # typographic symbols not in string.punctuation\n",
	"re_tok = re.compile(f'([{re.escape(string.punctuation)}{_extra_punct}])')\n",
	"\n",
	"def tokenize(s):\n",
	"    \"\"\"Split `s` on whitespace after padding every punctuation mark with spaces.\n",
	"\n",
	"    Each punctuation character matched by `re_tok` therefore becomes its own token.\n",
	"    \"\"\"\n",
	"    return re_tok.sub(r' \\1 ', s).split()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"n = all_texts_df.shape[0]\n",
	"# Word uni- and bi-gram TF-IDF features. The flags are real booleans because newer\n",
	"# scikit-learn releases validate parameter types and reject the integer 1.\n",
	"vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,\n",
	"               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,\n",
	"               smooth_idf=True, sublinear_tf=True)\n",
	"# Fit the vocabulary and IDF weights on the training split only, then reuse them.\n",
	"trn_term_doc = vec.fit_transform(df_train['text'])\n",
	"test_term_doc = vec.transform(df_test['text'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(<16800x22072 sparse matrix of type '<class 'numpy.float64'>'\n",
	" \twith 388461 stored elements in Compressed Sparse Row format>,\n",
	" <3600x22072 sparse matrix of type '<class 'numpy.float64'>'\n",
	" \twith 79494 stored elements in Compressed Sparse Row format>)"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Show the sparse document-term matrices built for the two splits.\n",
	"(trn_term_doc, test_term_doc)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [],
	"source": [
	"def pr(y_i, y, feats=None):\n",
	"    \"\"\"Smoothed per-feature average over the rows whose label equals `y_i`.\n",
	"\n",
	"    y_i   -- class value to select.\n",
	"    y     -- array of labels, one per row of the feature matrix.\n",
	"    feats -- feature matrix to aggregate; when omitted, the notebook-level `x` is used,\n",
	"             which is how existing two-argument callers behave.\n",
	"\n",
	"    Returns (column sums over the selected rows + 1) / (number of selected rows + 1).\n",
	"    \"\"\"\n",
	"    if feats is None:\n",
	"        feats = x  # global assigned in a later cell; must be set before pr() is called\n",
	"    mask = (y == y_i)\n",
	"    p = feats[mask].sum(0)\n",
	"    return (p+1) / (mask.sum()+1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Short aliases for the feature matrices; pr() and get_mdl() read `x` as a global.\n",
	"x, test_x = trn_term_doc, test_term_doc"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_mdl(y):\n",
	"    \"\"\"Fit the NB-weighted logistic regression for one binary label column.\n",
	"\n",
	"    y -- pandas Series of 0/1 labels aligned with the rows of the global `x`.\n",
	"\n",
	"    Returns (fitted model, r), where r is the per-feature log-count ratio that\n",
	"    must also be multiplied into any matrix passed to the model at predict time.\n",
	"    \"\"\"\n",
	"    y = y.values\n",
	"    # Log of the ratio between the smoothed class-1 and class-0 feature averages.\n",
	"    r = np.log(pr(1,y) / pr(0,y))\n",
	"    # dual=True is only implemented by the liblinear solver. Naming the solver explicitly\n",
	"    # keeps this working on scikit-learn versions whose default solver is lbfgs.\n",
	"    m = LogisticRegression(C=4, dual=True, solver='liblinear')\n",
	"    x_nb = x.multiply(r)\n",
	"    return m.fit(x_nb, y), r"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"fit is_humor\n"
	]
	}
	],
	"source": [
	"label_cols = ['is_humor']\n",
	"# One column of positive-class probabilities per label.\n",
	"preds = np.zeros((len(df_test), len(label_cols)))\n",
	"\n",
	"for col_idx, label in enumerate(label_cols):\n",
	"    print('fit', label)\n",
	"    m, r = get_mdl(df_train[label])\n",
	"    # Apply the same log-count ratio scaling to the evaluation features before scoring.\n",
	"    class_probs = m.predict_proba(test_x.multiply(r))\n",
	"    preds[:, col_idx] = class_probs[:, 1]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Element-wise correctness mask: True wherever the 0.5-thresholded prediction equals the\n",
	"# label. Despite the name, this counts true negatives as well as true positives.\n",
	"hard_preds = (preds > 0.5).astype(int)\n",
	"TP = (hard_preds == df_test[label_cols].values)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Accuracy: 0.8344444444444444'"
	]
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Share of evaluation rows whose thresholded prediction matches the label.\n",
	"'Accuracy: {}'.format(TP.sum() / len(df_test))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3.7 fasta.ai1 DEV",
	"language": "python",
	"name": "fastai1_dev"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	},
	"varInspector": {
	"cols": {
	"lenName": 16,
	"lenType": 16,
	"lenVar": 40
	},
	"kernels_config": {
	"python": {
	"delete_cmd_postfix": "",
	"delete_cmd_prefix": "del ",
	"library": "var_list.py",
	"varRefreshCmd": "print(var_dic_list())"
	},
	"r": {
	"delete_cmd_postfix": ") ",
	"delete_cmd_prefix": "rm(",
	"library": "var_list.r",
	"varRefreshCmd": "cat(var_dic_list()) "
	}
	},
	"types_to_exclude": [
	"module",
	"function",
	"builtin_function_or_method",
	"instance",
	"_Feature"
	],
	"window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}