Last active
May 25, 2017 20:19
-
-
Save galvanic/5e92ebcb86bf05669b85ae8212efce0f to your computer and use it in GitHub Desktop.
Parse the TREC 2007 spam email dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| import numpy as np | |
| import pandas as pd | |
| import re | |
| #import os ## TODO use this to traverse folders | |
| from email.parser import Parser | |
| from sklearn.feature_extraction.text import CountVectorizer | |
## ---------------------------------------------------------------------------
## 1. Parse the ground-truth index.  Each line looks like
##    "spam ../data/inmail.123" or "ham ../data/inmail.456".
## ---------------------------------------------------------------------------
## FIX: the pattern previously appeared with doubled backslashes inside a raw
## string ('inmail\\.' / '\\d'), which matches a literal backslash rather than
## a dot / digit, so no index line could ever match.  Compiled once here since
## it is reused for every line of the index.
label_pattern = re.compile(r'((?:sp|h)am) .*?inmail\.(\d{1,5})')

with open('trec2007/full/index', 'r') as ifile:
    raw_labels = ifile.readlines()

labels = []
for label in raw_labels:
    match = label_pattern.search(label)
    if match:
        category = match.group(1)   # 'spam' or 'ham'
        email_num = match.group(2)  # numeric suffix of the email's filename
        labels.append((email_num, category))

email_nums, categories = zip(*labels)
labels = pd.Series(categories, index=email_nums)

## ---------------------------------------------------------------------------
## 2. Read every referenced email and extract its plain-text body.
## ---------------------------------------------------------------------------
parser = Parser()


def _extract_body(email):
    """Return the plain-text body of a parsed email message, or None.

    For multipart messages, returns the payload of the first text/plain
    part that is not an attachment; for non-multipart messages, returns
    the whole payload.  Returns None when a multipart message has no
    usable text/plain part.

    ## code and details from:
    ## https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not/32840516#32840516
    """
    if email.is_multipart():
        for part in email.walk():
            content_type = part.get_content_type()
            content_dispo = str(part.get('Content-Disposition'))
            # skip any text/plain (txt) attachments
            if content_type == 'text/plain' and 'attachment' not in content_dispo:
                return part.get_payload(decode=False)  ## only keep the first part
        return None
    return email.get_payload(decode=False)


corpus = []
for email_num in email_nums:
    filepath = 'trec2007/data/inmail.%s' % email_num
    with open(filepath, 'r', encoding='ISO-8859-1') as ifile:
        email = parser.parse(ifile)
    ## TODO also keep the subject header
    body = _extract_body(email)
    ## BUG FIX: the original caught NameError to skip body-less emails, but
    ## 'body' survived from the previous loop iteration, so a multipart email
    ## with no text/plain part silently reused the PREVIOUS email's body.
    ## Resetting via the helper's return value each iteration fixes that.
    if body is not None:
        ## TODO strip html stuff ? or valuable features ?
        corpus.append((email_num, body))

email_nums, corpus = zip(*corpus)

## ---------------------------------------------------------------------------
## 3. Vectorize and persist.
## ---------------------------------------------------------------------------
vectorizer = CountVectorizer(min_df=1, max_features=10000)  # df: document frequency
X = vectorizer.fit_transform(corpus)

# dataframe indexed by email number, one uint8 count column per vocabulary word
df = pd.DataFrame(X.toarray(), index=email_nums,
                  columns=vectorizer.get_feature_names(), dtype=np.uint8)

labels.to_csv('labels.csv')
## we are only interested in presence of a word not frequency
df.astype(bool).astype(np.uint8).to_csv('features.csv')
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
There are loads of French words