Created
March 8, 2016 16:42
-
-
Save blutarche/58612aa3dc64f340d0ad to your computer and use it in GitHub Desktop.
Spam/ham detection with an example dataset.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys | |
import codecs | |
import json | |
from collections import Counter | |
# ARFF header: relation name followed by a blank line.
print("@relation Spamster")
print()

# Maps a 1-based line number of the index file to the first
# space-separated field of that line (filled in further down, and read by
# analyze() to emit the class column).
classified = {}
def attri(j, s):
    """Return the value stored under key ``s`` in ``j`` as a string.

    Args:
        j: Container to look in (called with a message's ``headers`` dict).
        s: Key to look up.

    Returns:
        ``str(j[s])`` when ``s`` is present in ``j``; otherwise ``"?"``,
        which is how ARFF spells a missing value.
    """
    if s in j:
        return str(j[s])
    return "?"
def isexist(j, s):
    """Return 1 if key ``s`` is present in ``j``, otherwise 0.

    Args:
        j: Container to look in (called with a message's ``headers`` dict).
        s: Key to test for.

    Returns:
        The integer 1 or 0, so the result can be printed directly as a
        numeric column.
    """
    return 1 if s in j else 0
def analyze(j, i):
    """Print one comma-separated data row for message number ``i``.

    The row holds, in order: the ``content-length`` and ``lines`` header
    values (``"?"`` when absent), five 0/1 flags telling whether the
    ``x-spam-checker-version``, ``x-spam-level``, ``x-spam-status``,
    ``x-rc-spam`` and ``x-rc-virus`` headers are present, and finally the
    double-quoted class taken from the module-level ``classified`` mapping.

    Args:
        j: Parsed message; must have a ``'headers'`` entry supporting ``in``
            and item lookup.
        i: Key into ``classified`` for this message.

    Raises:
        KeyError: If ``j`` has no ``'headers'`` entry or ``classified`` has
            no entry for ``i``.
    """
    h = j['headers']

    # Header values that are emitted as-is (or "?" when missing).
    # NOTE: the 'from' header is not part of the row.
    row = [attri(h, 'content-length'), attri(h, 'lines')]

    # Presence flags, in the same order as the @attribute declarations.
    presence_headers = (
        'x-spam-checker-version',
        'x-spam-level',
        'x-spam-status',
        'x-rc-spam',
        'x-rc-virus',
    )
    row.extend(str(isexist(h, name)) for name in presence_headers)

    print("%s, \"%s\"" % (", ".join(row), classified[i]))
# ATTRIBUTE
# Load the index: classified[i] is the first space-separated field of the
# i-th line (1-based) of full/index. The `with` block makes sure the file
# handle is closed once it has been read.
# NOTE: splitting on '\n' means a trailing newline in the index produces one
# extra final entry whose value is the empty string.
with codecs.open("full/index", 'r', 'utf-8') as f:
    lines = f.read().split('\n')
for i, line in enumerate(lines, 1):
    classified[i] = line.split(" ")[0]

# One declaration per column printed by analyze(), in the same order.
print("@attribute 'Length' numeric")
print("@attribute 'Lines' numeric")
print("@attribute 'X spam checker' numeric")
print("@attribute 'X spam level' numeric")
print("@attribute 'X spam status' numeric")
print("@attribute 'X rc spam' numeric")
print("@attribute 'X rc virus' numeric")
# The 'From' attribute is currently disabled:
# print("@attribute 'From' string")
print("@attribute \"Class\" {'spam', 'ham'}")
print()

# DATA
print("@data")
print()

# Number of message files to process: json/inmail.1.json .. json/inmail.<FILES>.json
FILES = 75419
def read_json_from_file(f):
    """Parse the JSON document stored at path ``f`` and return the result.

    Args:
        f: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON (``json.JSONDecodeError``
            is a subclass) or is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default; this also matches how the index file is read.
    with open(f, encoding="utf-8") as json_file:
        return json.load(json_file)
# Emit one data row per message file, numbered 1..FILES; the same number is
# used to look up the message's class in `classified`.
for i in range(1, FILES + 1):
    j = read_json_from_file("json/inmail." + str(i) + ".json")
    analyze(j, i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment