Skip to content

Instantly share code, notes, and snippets.

@blutarche
Created March 8, 2016 16:42
Show Gist options
  • Save blutarche/58612aa3dc64f340d0ad to your computer and use it in GitHub Desktop.
Save blutarche/58612aa3dc64f340d0ad to your computer and use it in GitHub Desktop.
Spam/Ham detect with example dataset.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import codecs
import json
from collections import Counter
# Start the ARFF output: the relation declaration followed by one empty line.
for header_line in ("@relation Spamster", ""):
    print(header_line)

# Message number -> class label.  Populated from full/index further down and
# read by analyze() when it prints the class column of a data row.
classified = dict()
def attri(j, s):
    """Return the value stored under ``s`` in ``j`` as a string.

    Args:
        j: Container supporting ``in`` and item lookup (the caller passes
            the message's ``headers`` mapping).
        s: Key to look up.

    Returns:
        ``str(j[s])`` when ``s`` is present, otherwise ``"?"`` -- the token
        ARFF uses for a missing value.
    """
    if s not in j:
        return "?"
    return str(j[s])
def isexist(j, s):
    """Return 1 if ``s`` is present in ``j``, otherwise 0.

    Args:
        j: Container supporting ``in`` (the caller passes the message's
            ``headers`` mapping).
        s: Key to test for.

    Returns:
        The integer 1 or 0, so the result can be printed directly as a
        numeric ARFF value.
    """
    return 1 if s in j else 0
def analyze(j, i):
    """Print one ARFF data row for message number ``i``.

    The row has eight comma-separated columns, in the same order as the
    ``@attribute`` declarations printed by this script: content length,
    line count, five 1/0 flags for the presence of spam-filter headers, and
    the double-quoted class label.

    Args:
        j: Parsed message.  Must have a ``'headers'`` entry that supports
            ``in`` and lookup by header name (presumably a dict keyed by
            lower-cased header names -- confirm against the JSON dump).
        i: Message number; used as the key into the module-level
            ``classified`` mapping.

    Raises:
        KeyError: If ``j`` has no ``'headers'`` entry, or ``classified``
            has no label for ``i``.
    """
    headers = j['headers']
    row = (
        # Raw header values, or "?" when the header is absent.
        attri(headers, 'content-length'),
        attri(headers, 'lines'),
        # 1/0 flags: was the message stamped with each of these headers?
        isexist(headers, 'x-spam-checker-version'),
        isexist(headers, 'x-spam-level'),
        isexist(headers, 'x-spam-status'),
        isexist(headers, 'x-rc-spam'),
        isexist(headers, 'x-rc-virus'),
        # Class label loaded from the index file.
        classified[i],
    )
    # %s stringifies each field itself, so no explicit str() calls are needed.
    print("%s, %s, %s, %s, %s, %s, %s, \"%s\"" % row)
# ATTRIBUTE
# Load the class labels.  Line n (1-based) of full/index supplies the label
# for message n: the text before the first space on that line.
# The with-block closes the file, and iterating it line by line avoids the
# phantom empty entry that read().split('\n') yields after a trailing newline.
with open("full/index", encoding="utf-8") as index_file:
    for i, line in enumerate(index_file, 1):
        classified[i] = line.rstrip("\n").split(" ", 1)[0]

# Numeric attributes, in the same order as the columns analyze() prints.
for attribute_name in (
    "Length",
    "Lines",
    "X spam checker",
    "X spam level",
    "X spam status",
    "X rc spam",
    "X rc virus",
):
    print("@attribute '%s' numeric" % attribute_name)
print("@attribute \"Class\" {'spam', 'ham'}")
print()

# DATA
print("@data")
print()

# Number of messages to convert: json/inmail.1.json .. json/inmail.<FILES>.json.
FILES = 75419
def read_json_from_file(f):
    """Parse the JSON document stored at path ``f`` and return it.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's default text encoding.

    Args:
        f: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for it).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON or not valid UTF-8.
    """
    with open(f, encoding="utf-8") as json_file:
        return json.load(json_file)
# Emit one data row per message, in message-number order.  Message n is read
# from json/inmail.<n>.json; analyze() looks up its label in ``classified``.
for i in range(1, FILES + 1):
    j = read_json_from_file("json/inmail.%d.json" % i)
    analyze(j, i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment