LowriWilliams · August 15, 2020 15:44
diff --git a/pre-processing.ipynb b/pre-processing.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>sms_text</th>\n",
       "      <th>patterns</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>Go until jurong point, crazy.. Available only ...</td>\n",
       "      <td>[go, jurong, point, crazy, available, bugis, n...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>Ok lar... Joking wif u oni...</td>\n",
       "      <td>[ok, lar, joking, wif, u, oni]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>spam</td>\n",
       "      <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
       "      <td>[free, entry, wkly, comp, win, fa, cup, final,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>U dun say so early hor... U c already then say...</td>\n",
       "      <td>[u, dun, say, early, hor, u, c, already, say]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
       "      <td>[nah, dont, think, go, usf, life, around, though]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5567</th>\n",
       "      <td>spam</td>\n",
       "      <td>This is the 2nd time we have tried 2 contact u...</td>\n",
       "      <td>[2nd, time, tried, contact, u, u, å750, pound,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5568</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>Will Ì_ b going to esplanade fr home?</td>\n",
       "      <td>[ì_, b, going, esplanade, fr, home]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5569</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>Pity, * was in mood for that. So...any other s...</td>\n",
       "      <td>[pity, mood, soany, suggestion]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5570</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>The guy did some bitching but I acted like i'd...</td>\n",
       "      <td>[guy, bitching, acted, like, id, interested, b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5571</th>\n",
       "      <td>not_spam</td>\n",
       "      <td>Rofl. Its true to its name</td>\n",
       "      <td>[rofl, true, name]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5572 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         label                                           sms_text  \\\n",
       "0     not_spam  Go until jurong point, crazy.. Available only ...   \n",
       "1     not_spam                      Ok lar... Joking wif u oni...   \n",
       "2         spam  Free entry in 2 a wkly comp to win FA Cup fina...   \n",
       "3     not_spam  U dun say so early hor... U c already then say...   \n",
       "4     not_spam  Nah I don't think he goes to usf, he lives aro...   \n",
       "...        ...                                                ...   \n",
       "5567      spam  This is the 2nd time we have tried 2 contact u...   \n",
       "5568  not_spam              Will Ì_ b going to esplanade fr home?   \n",
       "5569  not_spam  Pity, * was in mood for that. So...any other s...   \n",
       "5570  not_spam  The guy did some bitching but I acted like i'd...   \n",
       "5571  not_spam                         Rofl. Its true to its name   \n",
       "\n",
       "                                               patterns  \n",
       "0     [go, jurong, point, crazy, available, bugis, n...  \n",
       "1                        [ok, lar, joking, wif, u, oni]  \n",
       "2     [free, entry, wkly, comp, win, fa, cup, final,...  \n",
       "3         [u, dun, say, early, hor, u, c, already, say]  \n",
       "4     [nah, dont, think, go, usf, life, around, though]  \n",
       "...                                                 ...  \n",
       "5567  [2nd, time, tried, contact, u, u, å750, pound,...  \n",
       "5568                [ì_, b, going, esplanade, fr, home]  \n",
       "5569                    [pity, mood, soany, suggestion]  \n",
       "5570  [guy, bitching, acted, like, id, interested, b...  \n",
       "5571                                 [rofl, true, name]  \n",
       "\n",
       "[5572 rows x 3 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Build a cleaned, tokenised `patterns` column from the raw `sms_text` column.\n",
    "# `patterns` is rebuilt from `sms_text` on the first step, so re-running this cell\n",
    "# does not compound the transformations.\n",
    "\n",
    "# Lower-case every token and collapse runs of whitespace into single spaces.\n",
    "df['patterns'] = df['sms_text'].apply(lambda text: ' '.join(token.lower() for token in text.split()))\n",
    "# Drop whole tokens that are a substring of string.punctuation (e.g. a lone '*' or '_').\n",
    "df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if token not in string.punctuation))\n",
    "# Strip the punctuation left inside tokens. regex=True is passed explicitly: pandas 2.0\n",
    "# changed the default to a literal match, which would silently remove nothing here.\n",
    "df['patterns'] = df['patterns'].str.replace(r'[^\\w\\s]', '', regex=True)\n",
    "# Drop tokens made up only of digits (mixed tokens such as '2nd' are kept).\n",
    "df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if not token.isdigit()))\n",
    "# Drop stop words.\n",
    "df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if token not in stop_words))\n",
    "# Lemmatise each remaining token.\n",
    "df['patterns'] = df['patterns'].apply(lambda text: ' '.join(wordnet_lemmatizer.lemmatize(token) for token in text.split()))\n",
    "# Tokenise the cleaned text; a column-wise apply avoids building a row object per message.\n",
    "df['patterns'] = df['patterns'].apply(nltk.word_tokenize)\n",
    "\n",
    "display(df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>label</th>\n",
	" <th>sms_text</th>\n",
	" <th>patterns</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>not_spam</td>\n",
	" <td>Go until jurong point, crazy.. Available only ...</td>\n",
	" <td>[go, jurong, point, crazy, available, bugis, n...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>not_spam</td>\n",
	" <td>Ok lar... Joking wif u oni...</td>\n",
	" <td>[ok, lar, joking, wif, u, oni]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>spam</td>\n",
	" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
	" <td>[free, entry, wkly, comp, win, fa, cup, final,...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>not_spam</td>\n",
	" <td>U dun say so early hor... U c already then say...</td>\n",
	" <td>[u, dun, say, early, hor, u, c, already, say]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>not_spam</td>\n",
	" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
	" <td>[nah, dont, think, go, usf, life, around, though]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5567</th>\n",
	" <td>spam</td>\n",
	" <td>This is the 2nd time we have tried 2 contact u...</td>\n",
	" <td>[2nd, time, tried, contact, u, u, å750, pound,...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5568</th>\n",
	" <td>not_spam</td>\n",
	" <td>Will Ì_ b going to esplanade fr home?</td>\n",
	" <td>[ì_, b, going, esplanade, fr, home]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5569</th>\n",
	" <td>not_spam</td>\n",
	" <td>Pity, * was in mood for that. So...any other s...</td>\n",
	" <td>[pity, mood, soany, suggestion]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5570</th>\n",
	" <td>not_spam</td>\n",
	" <td>The guy did some bitching but I acted like i'd...</td>\n",
	" <td>[guy, bitching, acted, like, id, interested, b...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5571</th>\n",
	" <td>not_spam</td>\n",
	" <td>Rofl. Its true to its name</td>\n",
	" <td>[rofl, true, name]</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>5572 rows × 3 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" label sms_text \\\n",
	"0 not_spam Go until jurong point, crazy.. Available only ... \n",
	"1 not_spam Ok lar... Joking wif u oni... \n",
	"2 spam Free entry in 2 a wkly comp to win FA Cup fina... \n",
	"3 not_spam U dun say so early hor... U c already then say... \n",
	"4 not_spam Nah I don't think he goes to usf, he lives aro... \n",
	"... ... ... \n",
	"5567 spam This is the 2nd time we have tried 2 contact u... \n",
	"5568 not_spam Will Ì_ b going to esplanade fr home? \n",
	"5569 not_spam Pity, * was in mood for that. So...any other s... \n",
	"5570 not_spam The guy did some bitching but I acted like i'd... \n",
	"5571 not_spam Rofl. Its true to its name \n",
	"\n",
	" patterns \n",
	"0 [go, jurong, point, crazy, available, bugis, n... \n",
	"1 [ok, lar, joking, wif, u, oni] \n",
	"2 [free, entry, wkly, comp, win, fa, cup, final,... \n",
	"3 [u, dun, say, early, hor, u, c, already, say] \n",
	"4 [nah, dont, think, go, usf, life, around, though] \n",
	"... ... \n",
	"5567 [2nd, time, tried, contact, u, u, å750, pound,... \n",
	"5568 [ì_, b, going, esplanade, fr, home] \n",
	"5569 [pity, mood, soany, suggestion] \n",
	"5570 [guy, bitching, acted, like, id, interested, b... \n",
	"5571 [rofl, true, name] \n",
	"\n",
	"[5572 rows x 3 columns]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"# Build a cleaned, tokenised `patterns` column from the raw `sms_text` column.\n",
	"# `patterns` is rebuilt from `sms_text` on the first step, so re-running this cell\n",
	"# does not compound the transformations.\n",
	"\n",
	"# Lower-case every token and collapse runs of whitespace into single spaces.\n",
	"df['patterns'] = df['sms_text'].apply(lambda text: ' '.join(token.lower() for token in text.split()))\n",
	"# Drop whole tokens that are a substring of string.punctuation (e.g. a lone '*' or '_').\n",
	"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if token not in string.punctuation))\n",
	"# Strip the punctuation left inside tokens. regex=True is passed explicitly: pandas 2.0\n",
	"# changed the default to a literal match, which would silently remove nothing here.\n",
	"df['patterns'] = df['patterns'].str.replace(r'[^\\w\\s]', '', regex=True)\n",
	"# Drop tokens made up only of digits (mixed tokens such as '2nd' are kept).\n",
	"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if not token.isdigit()))\n",
	"# Drop stop words.\n",
	"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if token not in stop_words))\n",
	"# Lemmatise each remaining token.\n",
	"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(wordnet_lemmatizer.lemmatize(token) for token in text.split()))\n",
	"# Tokenise the cleaned text; a column-wise apply avoids building a row object per message.\n",
	"df['patterns'] = df['patterns'].apply(nltk.word_tokenize)\n",
	"\n",
	"display(df)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}