Created
August 15, 2020 15:44
-
-
Save LowriWilliams/645bbf894a19979ae374a99674a7b49e to your computer and use it in GitHub Desktop.
sms_adversarial/pre-processing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>label</th>\n", | |
| " <th>sms_text</th>\n", | |
| " <th>patterns</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>Go until jurong point, crazy.. Available only ...</td>\n", | |
| " <td>[go, jurong, point, crazy, available, bugis, n...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>Ok lar... Joking wif u oni...</td>\n", | |
| " <td>[ok, lar, joking, wif, u, oni]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>spam</td>\n", | |
| " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n", | |
| " <td>[free, entry, wkly, comp, win, fa, cup, final,...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>U dun say so early hor... U c already then say...</td>\n", | |
| " <td>[u, dun, say, early, hor, u, c, already, say]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>Nah I don't think he goes to usf, he lives aro...</td>\n", | |
| " <td>[nah, dont, think, go, usf, life, around, though]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5567</th>\n", | |
| " <td>spam</td>\n", | |
| " <td>This is the 2nd time we have tried 2 contact u...</td>\n", | |
| " <td>[2nd, time, tried, contact, u, u, å750, pound,...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5568</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>Will Ì_ b going to esplanade fr home?</td>\n", | |
| " <td>[ì_, b, going, esplanade, fr, home]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5569</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>Pity, * was in mood for that. So...any other s...</td>\n", | |
| " <td>[pity, mood, soany, suggestion]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5570</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>The guy did some bitching but I acted like i'd...</td>\n", | |
| " <td>[guy, bitching, acted, like, id, interested, b...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5571</th>\n", | |
| " <td>not_spam</td>\n", | |
| " <td>Rofl. Its true to its name</td>\n", | |
| " <td>[rofl, true, name]</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5572 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " label sms_text \\\n", | |
| "0 not_spam Go until jurong point, crazy.. Available only ... \n", | |
| "1 not_spam Ok lar... Joking wif u oni... \n", | |
| "2 spam Free entry in 2 a wkly comp to win FA Cup fina... \n", | |
| "3 not_spam U dun say so early hor... U c already then say... \n", | |
| "4 not_spam Nah I don't think he goes to usf, he lives aro... \n", | |
| "... ... ... \n", | |
| "5567 spam This is the 2nd time we have tried 2 contact u... \n", | |
| "5568 not_spam Will Ì_ b going to esplanade fr home? \n", | |
| "5569 not_spam Pity, * was in mood for that. So...any other s... \n", | |
| "5570 not_spam The guy did some bitching but I acted like i'd... \n", | |
| "5571 not_spam Rofl. Its true to its name \n", | |
| "\n", | |
| " patterns \n", | |
| "0 [go, jurong, point, crazy, available, bugis, n... \n", | |
| "1 [ok, lar, joking, wif, u, oni] \n", | |
| "2 [free, entry, wkly, comp, win, fa, cup, final,... \n", | |
| "3 [u, dun, say, early, hor, u, c, already, say] \n", | |
| "4 [nah, dont, think, go, usf, life, around, though] \n", | |
| "... ... \n", | |
| "5567 [2nd, time, tried, contact, u, u, å750, pound,... \n", | |
| "5568 [ì_, b, going, esplanade, fr, home] \n", | |
| "5569 [pity, mood, soany, suggestion] \n", | |
| "5570 [guy, bitching, acted, like, id, interested, b... \n", | |
| "5571 [rofl, true, name] \n", | |
| "\n", | |
| "[5572 rows x 3 columns]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Build df['patterns'] from df['sms_text']; each step below rewrites the column.\n", | |
| "# Lower-case every whitespace-separated token and re-join with single spaces.\n", | |
| "df['patterns'] = df['sms_text'].apply(lambda x:' '.join(x.lower() for x in x.split()))\n", | |
| "# Drop tokens that occur inside string.punctuation (e.g. stand-alone punctuation marks).\n", | |
| "df['patterns']= df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in string.punctuation))\n", | |
| "# Strip every character that is neither a word character nor whitespace.\n", | |
| "# regex=True is passed explicitly: pandas 2.0 changed the default to a literal match,\n", | |
| "# which would silently leave punctuation in place.\n", | |
| "df['patterns']= df['patterns'].str.replace(r'[^\\w\\s]', '', regex=True)\n", | |
| "# Drop tokens made up only of digits.\n", | |
| "df['patterns']= df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if not x.isdigit()))\n", | |
| "# Drop tokens listed in stop_words (defined in an earlier cell).\n", | |
| "df['patterns'] = df['patterns'].apply(lambda x:' '.join(x for x in x.split() if not x in stop_words))\n", | |
| "# Lemmatize each remaining token with wordnet_lemmatizer (defined in an earlier cell).\n", | |
| "df['patterns'] = df['patterns'].apply(lambda x: \" \".join([wordnet_lemmatizer.lemmatize(word) for word in x.split()]))\n", | |
| "# Replace the cleaned string with its list of tokens.\n", | |
| "df['patterns'] = df.apply(lambda row: nltk.word_tokenize(row['patterns']), axis=1)\n", | |
| "\n", | |
| "display(df)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment