galvanic · May 25, 2017 20:19 · galvanic · Jun 19, 2016
diff --git a/parse_TREC.py b/parse_TREC.py
 #!/usr/bin/env python
 # coding: utf-8

 import numpy as np
 import pandas as pd
 import re
 #import os ## TODO use this to traverse folders
 from email.parser import Parser
 from sklearn.feature_extraction.text import CountVectorizer

 # Read the TREC 2007 index file; each matching line yields an
 # (email number, category) pair.
 with open('trec2007/full/index', 'r') as ifile:
     raw_labels = ifile.readlines()

 # Captures the category ("spam" or "ham") and the numeric suffix of the
 # "inmail.<N>" file name. Inside a raw string the escapes must be single
 # backslashes: a doubled backslash matches a literal backslash, so the
 # pattern would never match an index line.
 label_pattern = re.compile(r'((?:sp|h)am) .*?inmail\.(\d{1,5})')

 labels = []
 for label in raw_labels:
     match = label_pattern.search(label)
     if match:
         category = match.group(1)
         email_num = match.group(2)
         labels.append((email_num, category))

 if not labels:
     # fail with a clear message instead of an opaque unpacking error below
     raise ValueError('no labels could be parsed from trec2007/full/index')

 email_nums, categories = zip(*labels)
 # labels cover every email listed in the index, keyed by email number
 labels = pd.Series(categories, index=email_nums)

 parser = Parser()

 # list of (email number, body text) for every email with a usable body
 corpus = []
 for email_num in email_nums:
     filepath = 'trec2007/data/inmail.%s' % email_num

     with open(filepath, 'r', encoding='ISO-8859-1') as ifile:
         email = parser.parse(ifile)

     ## TODO also keep the subject header

     # Reset for every email: otherwise a multipart message without an
     # inline text/plain part would silently reuse the previous email's body.
     body = None

     ## code and details from:
     ## https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not/32840516#32840516
     if email.is_multipart():
         for part in email.walk():
             content_type = part.get_content_type()
             content_dispo = str(part.get('Content-Disposition'))

             # skip any text/plain (txt) attachments
             if content_type == 'text/plain' and 'attachment' not in content_dispo:
                 body = part.get_payload(decode=False)
                 break ## only keep the first inline text/plain part
     else:
         body = email.get_payload(decode=False)

     ## TODO strip html stuff ? or valuable features ?
     if body is None:
         # no inline text/plain part found: leave this email out of the features
         continue
     corpus.append((email_num, body))

 # from here on email_nums only holds the emails that made it into the corpus
 email_nums, corpus = zip(*corpus)

 vectorizer = CountVectorizer(min_df=1, max_features=10000) # df: document frequency
 X = vectorizer.fit_transform(corpus)

 # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
 # accessor when it exists and fall back to the old one otherwise
 get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

 # dataframe with one row per email and one column per word, count as data;
 # counts are saturated at 255 so they fit in uint8 (only presence is exported)
 df = pd.DataFrame(np.minimum(X.toarray(), 255), index=email_nums, columns=get_names(), dtype=np.uint8)

 labels.to_csv('labels.csv')
 ## we are only interested in presence of a word not frequency
 df.astype(bool).astype(np.uint8).to_csv('features.csv')
	#!/usr/bin/env python
	# coding: utf-8

	import numpy as np
	import pandas as pd
	import re
	#import os ## TODO use this to traverse folders
	from email.parser import Parser
	from sklearn.feature_extraction.text import CountVectorizer

	# Read the TREC 2007 index file; each matching line yields an
	# (email number, category) pair.
	with open('trec2007/full/index', 'r') as ifile:
	    raw_labels = ifile.readlines()

	# Captures the category ("spam" or "ham") and the numeric suffix of the
	# "inmail.<N>" file name. The alternation bar must be unescaped and the
	# other escapes must be single backslashes, otherwise the pattern looks
	# for a literal pipe / backslash and never matches an index line.
	label_pattern = re.compile(r'((?:sp|h)am) .*?inmail\.(\d{1,5})')

	labels = []
	for label in raw_labels:
	    match = label_pattern.search(label)
	    if match:
	        category = match.group(1)
	        email_num = match.group(2)
	        labels.append((email_num, category))

	if not labels:
	    # fail with a clear message instead of an opaque unpacking error below
	    raise ValueError('no labels could be parsed from trec2007/full/index')

	email_nums, categories = zip(*labels)
	# labels cover every email listed in the index, keyed by email number
	labels = pd.Series(categories, index=email_nums)

	parser = Parser()

	# list of (email number, body text) for every email with a usable body
	corpus = []
	for email_num in email_nums:
	    filepath = 'trec2007/data/inmail.%s' % email_num

	    with open(filepath, 'r', encoding='ISO-8859-1') as ifile:
	        email = parser.parse(ifile)

	    ## TODO also keep the subject header

	    # Reset for every email: otherwise a multipart message without an
	    # inline text/plain part would silently reuse the previous email's body.
	    body = None

	    ## code and details from:
	    ## https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not/32840516#32840516
	    if email.is_multipart():
	        for part in email.walk():
	            content_type = part.get_content_type()
	            content_dispo = str(part.get('Content-Disposition'))

	            # skip any text/plain (txt) attachments
	            if content_type == 'text/plain' and 'attachment' not in content_dispo:
	                body = part.get_payload(decode=False)
	                break ## only keep the first inline text/plain part
	    else:
	        body = email.get_payload(decode=False)

	    ## TODO strip html stuff ? or valuable features ?
	    if body is None:
	        # no inline text/plain part found: leave this email out of the features
	        continue
	    corpus.append((email_num, body))

	# from here on email_nums only holds the emails that made it into the corpus
	email_nums, corpus = zip(*corpus)

	vectorizer = CountVectorizer(min_df=1, max_features=10000) # df: document frequency
	X = vectorizer.fit_transform(corpus)

	# get_feature_names() was removed in scikit-learn 1.2; prefer the newer
	# accessor when it exists and fall back to the old one otherwise
	get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

	# dataframe with one row per email and one column per word, count as data;
	# counts are saturated at 255 so they fit in uint8 (only presence is exported)
	df = pd.DataFrame(np.minimum(X.toarray(), 255), index=email_nums, columns=get_names(), dtype=np.uint8)

	labels.to_csv('labels.csv')
	## we are only interested in presence of a word not frequency
	df.astype(bool).astype(np.uint8).to_csv('features.csv')
No results found