Skip to content

Instantly share code, notes, and snippets.

@galvanic
Last active May 25, 2017 20:19
Show Gist options
  • Select an option

  • Save galvanic/5e92ebcb86bf05669b85ae8212efce0f to your computer and use it in GitHub Desktop.

Select an option

Save galvanic/5e92ebcb86bf05669b85ae8212efce0f to your computer and use it in GitHub Desktop.
Parse the TREC 2007 spam email dataset
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import re
#import os ## TODO use this to traverse folders
from email.parser import Parser
from sklearn.feature_extraction.text import CountVectorizer
# Build `labels`: a Series of 'spam'/'ham' categories indexed by email number
# (kept as a string, exactly as it appears in the index file).
with open('trec2007/full/index', 'r') as ifile:
    raw_labels = ifile.readlines()

# In a raw string a single backslash is already the regex escape: `\.` is a
# literal dot and `\d` a digit. (Doubling them would instead require a literal
# backslash in the index line, so nothing would ever match.)
# Presumably each index line looks like "spam ../data/inmail.123" -- verify
# against the dataset.
label_pattern = re.compile(r'((?:sp|h)am) .*?inmail\.(\d{1,5})')

labels = []
for label in raw_labels:
    match = label_pattern.search(label)
    if match:
        category = match.group(1)
        email_num = match.group(2)
        labels.append((email_num, category))

# zip(*[]) yields nothing to unpack; fail with an explanation instead of an
# opaque "not enough values to unpack" error.
if not labels:
    raise ValueError("no 'spam'/'ham' entries found in trec2007/full/index")

email_nums, categories = zip(*labels)
labels = pd.Series(categories, index=email_nums)
# Build `corpus`: one plain-text body per email, paired with its email number.
# Emails without a usable body are skipped, so after this block `email_nums`
# is rebound to only the emails that made it into `corpus`.
parser = Parser()
corpus = []
for email_num in email_nums:
    filepath = 'trec2007/data/inmail.%s' % email_num
    with open(filepath, 'r', encoding='ISO-8859-1') as ifile:
        email = parser.parse(ifile)

    # Reset for every message: otherwise a multipart email with no inline
    # text/plain part would silently reuse the previous email's body.
    body = None

    ## TODO also keep the subject header
    ## code and details from:
    ## https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not/32840516#32840516
    if email.is_multipart():
        for part in email.walk():
            content_type = part.get_content_type()
            content_dispo = str(part.get('Content-Disposition'))
            # skip any text/plain (txt) attachments
            if content_type == 'text/plain' and 'attachment' not in content_dispo:
                # NOTE: decode=False leaves any Content-Transfer-Encoding
                # (quoted-printable / base64) in place in the returned text.
                body = part.get_payload(decode=False)
                break  ## only keep the first inline text/plain part
    else:
        body = email.get_payload(decode=False)

    ## TODO strip html stuff ? or valuable features ?
    # Nothing usable was found for this email: leave it out of the corpus.
    if not isinstance(body, str):
        continue
    corpus.append((email_num, body))

# zip(*[]) yields nothing to unpack; fail with an explanation instead.
if not corpus:
    raise ValueError('no email bodies could be extracted from trec2007/data')

email_nums, corpus = zip(*corpus)
# Vectorize the corpus and write the labels and the word-presence features
# to 'labels.csv' and 'features.csv'.
vectorizer = CountVectorizer(min_df=1, max_features=10000) # df: document frequency
X = vectorizer.fit_transform(corpus)

# get_feature_names() is gone from recent scikit-learn releases; prefer the
# replacement and fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

## we are only interested in presence of a word not frequency
# Binarize while the matrix is still sparse: raw counts above 255 do not fit
# in uint8 (a count of 256 would wrap to 0 and read as "absent"), and this
# also avoids materializing a dense matrix of full-width integer counts.
presence = (X > 0).astype(np.uint8)

# one row per email (datapoint), one column per word (feature), 0/1 as data
df = pd.DataFrame(presence.toarray(), index=email_nums, columns=feature_names, dtype=np.uint8)
labels.to_csv('labels.csv')
df.to_csv('features.csv')
@galvanic
Copy link
Author

galvanic commented Jun 19, 2016

There are loads of French words

word
the              576502
font             532248
to               356458
td               350453
http             298259
and              290423
of               263801
com              243001
size             208665
in               197843
style            190827
width            172911
tr               170676
you              164156
br               159457
color            157344
3d               153277
www              151494
for              145346
is               143672
class            135812
family           125395
border           123353
arial            119418
that             117642
this             116215
span             114586
href             113576
align            112953
it               108837
                  ...  
apr                3222
seems              3220
80                 3219
space              3216
wish               3211
love               3208
model              3205
simple             3205
220                3203
values             3202
seen               3190
été                3189
bool               3188
48                 3188
general            3180
ok                 3178
conjuguer          3175
avoirs             3175
caisse             3171
problème           3171
apprécie           3169
compréhension      3169
assuré             3169
vérifier           3169
vérification       3169
accéder            3169
protéger           3169
variable           3153
word               3149
pay                3146
dtype: int64

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment