Skip to content

Instantly share code, notes, and snippets.

@LowriWilliams
Created August 15, 2020 15:44
Show Gist options
  • Save LowriWilliams/645bbf894a19979ae374a99674a7b49e to your computer and use it in GitHub Desktop.
Save LowriWilliams/645bbf894a19979ae374a99674a7b49e to your computer and use it in GitHub Desktop.
sms_adversarial/pre-processing
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>sms_text</th>\n",
" <th>patterns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>not_spam</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" <td>[go, jurong, point, crazy, available, bugis, n...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>not_spam</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" <td>[ok, lar, joking, wif, u, oni]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" <td>[free, entry, wkly, comp, win, fa, cup, final,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>not_spam</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" <td>[u, dun, say, early, hor, u, c, already, say]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>not_spam</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" <td>[nah, dont, think, go, usf, life, around, though]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5567</th>\n",
" <td>spam</td>\n",
" <td>This is the 2nd time we have tried 2 contact u...</td>\n",
" <td>[2nd, time, tried, contact, u, u, å750, pound,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5568</th>\n",
" <td>not_spam</td>\n",
" <td>Will Ì_ b going to esplanade fr home?</td>\n",
" <td>[ì_, b, going, esplanade, fr, home]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5569</th>\n",
" <td>not_spam</td>\n",
" <td>Pity, * was in mood for that. So...any other s...</td>\n",
" <td>[pity, mood, soany, suggestion]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5570</th>\n",
" <td>not_spam</td>\n",
" <td>The guy did some bitching but I acted like i'd...</td>\n",
" <td>[guy, bitching, acted, like, id, interested, b...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5571</th>\n",
" <td>not_spam</td>\n",
" <td>Rofl. Its true to its name</td>\n",
" <td>[rofl, true, name]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5572 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" label sms_text \\\n",
"0 not_spam Go until jurong point, crazy.. Available only ... \n",
"1 not_spam Ok lar... Joking wif u oni... \n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina... \n",
"3 not_spam U dun say so early hor... U c already then say... \n",
"4 not_spam Nah I don't think he goes to usf, he lives aro... \n",
"... ... ... \n",
"5567 spam This is the 2nd time we have tried 2 contact u... \n",
"5568 not_spam Will Ì_ b going to esplanade fr home? \n",
"5569 not_spam Pity, * was in mood for that. So...any other s... \n",
"5570 not_spam The guy did some bitching but I acted like i'd... \n",
"5571 not_spam Rofl. Its true to its name \n",
"\n",
" patterns \n",
"0 [go, jurong, point, crazy, available, bugis, n... \n",
"1 [ok, lar, joking, wif, u, oni] \n",
"2 [free, entry, wkly, comp, win, fa, cup, final,... \n",
"3 [u, dun, say, early, hor, u, c, already, say] \n",
"4 [nah, dont, think, go, usf, life, around, though] \n",
"... ... \n",
"5567 [2nd, time, tried, contact, u, u, å750, pound,... \n",
"5568 [ì_, b, going, esplanade, fr, home] \n",
"5569 [pity, mood, soany, suggestion] \n",
"5570 [guy, bitching, acted, like, id, interested, b... \n",
"5571 [rofl, true, name] \n",
"\n",
"[5572 rows x 3 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Build a normalised token list for every SMS in the 'patterns' column.\n",
"# Only 'sms_text' is read, so re-running this cell gives the same result.\n",
"\n",
"# Lowercase every whitespace-separated token (this also collapses runs of whitespace).\n",
"df['patterns'] = df['sms_text'].apply(lambda text: ' '.join(token.lower() for token in text.split()))\n",
"# Drop tokens that occur as a substring of string.punctuation (e.g. a lone '&' or '-').\n",
"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if token not in string.punctuation))\n",
"# Strip every character that is neither a word character nor whitespace.\n",
"# regex=True is explicit: pandas >= 2.0 defaults to a literal match, which would silently skip this step.\n",
"df['patterns'] = df['patterns'].str.replace(r'[^\\w\\s]', '', regex=True)\n",
"# Drop tokens made up solely of digits.\n",
"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if not token.isdigit()))\n",
"# Drop tokens contained in stop_words (defined outside this cell).\n",
"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(token for token in text.split() if token not in stop_words))\n",
"# Lemmatise each remaining token with wordnet_lemmatizer (defined outside this cell).\n",
"df['patterns'] = df['patterns'].apply(lambda text: ' '.join(wordnet_lemmatizer.lemmatize(token) for token in text.split()))\n",
"# Turn each cleaned string into a list of tokens.\n",
"df['patterns'] = df['patterns'].apply(nltk.word_tokenize)\n",
"\n",
"display(df)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment