Skip to content

Instantly share code, notes, and snippets.

@melpomene
Created April 18, 2013 15:50
Show Gist options
  • Save melpomene/5413822 to your computer and use it in GitHub Desktop.
Save melpomene/5413822 to your computer and use it in GitHub Desktop.
Takes tab-separated card files and turns them into SVMlight files for a course in intelligent systems.
#!/usr/bin/env python
# encoding: utf-8
import codecs
from collections import Counter
class Card:
    """One quiz card: header fields plus a list of questions.

    Each entry of ``questions`` is a list ``[line_nr, value, text, answer]``
    where ``line_nr`` (1-6) is derived from the question's point value.
    """

    # Maps a question's point value (as written in the input) to its line
    # number on the card.
    _VALUE_TO_LINE = {
        "250": 1,
        "500": 2,
        "1000": 3,
        "2000": 4,
        "5000": 5,
        "10000": 6,
    }

    def __init__(self, data):
        """Build a card from one split input row.

        ``data`` must hold at least seven fields: id, category, star, name,
        question value, question text and answer. The first question of the
        card is taken from fields 4-6.
        """
        self.idnr = data[0].strip("\n")
        self.category = data[1].strip("\n")
        self.star = data[2].strip("\n")
        self.name = data[3].strip("\n")
        self.questions = list()
        self.addQuestion(data[4].strip("\n"), data[5].strip("\n"), data[6].strip("\n"))

    def addQuestion(self, value, text, answer):
        """Append a question to the card.

        ``value`` is the point value as text (surrounding whitespace is
        ignored). Raises ``Exception("Parse error")`` when the value is not
        one of the known point values.
        """
        value = value.strip()
        try:
            nr = self._VALUE_TO_LINE[value]
        except KeyError:
            raise Exception("Parse error")
        self.questions.append([nr, value, text.strip("\n"), answer.strip("\n")])

    def __str__(self):
        """Render the card as a Turtle-like ``kvitt:`` description."""
        # Every piece is formatted exactly once. Formatting the assembled
        # string a second time would misinterpret any "{" or "}" that occurs
        # in a question or answer.
        parts = [
            u"kvitt:card{0} rdf:type kvitt:Card;\n".format(self.idnr),
            u"\tkvitt:header [\n",
            u"\t\tkvitt:id\t{0};\n".format(self.idnr),
            u'\t\tkvitt:category\t"{0}";\n'.format(self.category),
            u'\t\tkvitt:star\t"{0}";\n'.format(self.star),
            u'\t\tkvitt:name\t"{0}";\n'.format(self.name),
            u'\t\tkvitt:questions \n',
        ]
        last_index = len(self.questions) - 1
        for index, q in enumerate(self.questions):
            # The final question closes the header block; all others are
            # separated by commas. Decided by position so that cards with
            # fewer than six questions are still terminated correctly.
            last = u"]." if index == last_index else u","
            parts.append(
                u'\t\t\t\t[kvitt:line\t{0}; kvitt:value\t{1}; kvitt:text\t"{2}"; kvitt:answer\t"{3}"]{4}\n'.format(
                    q[0], q[1], q[2], q[3], last))
        return u"".join(parts)
def _load_cards(path):
    """Parse the tab-separated question file at ``path`` into a dict of Card objects keyed by the raw id field."""
    cards = dict()
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present.
    with codecs.open(path, 'r', "utf-8-sig") as src:
        for line in src:
            data = line.split('\t')
            if len(data) < 7:
                # A row needs seven fields; report the short row and skip it
                # instead of failing with an IndexError further down.
                print(data)
                continue
            if data[0] in cards:
                cards[data[0]].addQuestion(data[4], data[5], data[6])
            else:
                cards[data[0]] = Card(data)
    return cards


def toSVM(src_path='fragor.txt', out_path='bockerfilm_training_ngram.dat',
          positive_category=u"Böcker och film"):
    """Convert the question file into a sparse ``label index:value ...`` training file.

    One output row is written per question. The label is ``1`` when the
    card's category equals ``positive_category`` and ``-1`` otherwise.
    Features are numbered from 1: first one feature per distinct normalized
    word, then one per distinct n-gram. A feature's value is its count in
    the question divided by its total count over all questions.

    The defaults reproduce the original hard-coded file names and category.
    """
    cards = _load_cards(src_path)

    # Pass 1: corpus-wide totals for words and n-grams.
    # NOTE: despite its name, document_frequency holds total occurrences of
    # each word, not the number of questions containing it.
    document_frequency = Counter()
    ngrams_freq = Counter()
    for card in cards.values():
        for q in card.questions:
            # q[2] is the question text; the answer is deliberately not used.
            words = [normalize(w) for w in q[2].split()]
            append_ngrams(ngrams_freq, words)
            document_frequency.update(words)

    # Freeze an ordering so every row uses the same feature numbering.
    all_words = list(document_frequency)
    all_ngrams = list(ngrams_freq)
    i_from = len(all_words)  # n-gram features are numbered after the words

    # Pass 2: emit one sparse row per question.
    with open(out_path, 'w') as output:
        for card in cards.values():
            label = "1 " if card.category == positive_category else "-1 "
            for q in card.questions:
                words = [normalize(w) for w in q[2].split()]
                word_counts = Counter(words)
                # Computed once per question rather than once per feature.
                ng = ngrams(words)
                row = [label]
                # Walk the vocabulary in order so feature indices ascend.
                for i, word in enumerate(all_words):
                    if word in word_counts:
                        row.append(str(i + 1) + ":" + str(word_counts[word] / float(document_frequency[word])) + " ")
                for i, gram in enumerate(all_ngrams):
                    if gram in ng:
                        row.append(str(i_from + i + 1) + ":" + str(ng[gram] / float(ngrams_freq[gram])) + " ")
                output.write("".join(row) + "\n")
def ngrams(wordlist, n=2):
    """Count the n-grams of consecutive words in ``wordlist``.

    Each n-gram is keyed by its words joined with ``","``. Only windows that
    fit entirely inside the list are counted, so a list shorter than ``n``
    yields an empty Counter.

    Raises ValueError if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    words = list(wordlist)  # accept any iterable, e.g. a map object
    data = Counter()
    # Slide a window of n words; the slice starts at i so each n-gram is
    # words[i], words[i+1], ..., words[i+n-1].
    for i in range(len(words) - n + 1):
        data[",".join(words[i:i + n])] += 1
    return data


def append_ngrams(counter, wordlist, n=2):
    """Add the n-gram counts of ``wordlist`` to ``counter`` in place and return it.

    Uses the same n-gram definition and key format as ``ngrams`` so that
    keys accumulated here can be looked up in a Counter returned by it.
    """
    for gram, count in ngrams(wordlist, n).items():
        counter[gram] += count
    return counter
def in_ngram(ngrams, words, n=2):
    """Return whether the n-gram made of the first ``n`` words is in ``ngrams``.

    The lookup key is the first ``n`` entries of ``words`` joined with
    ``","``. ``words`` must contain at least ``max(n, 1)`` entries,
    otherwise an IndexError is raised.
    """
    pieces = [words[0]]
    for offset in range(1, n):
        pieces.append(words[offset])
    key = ",".join(pieces)
    return key in ngrams
def normalize(word):
    """Return ``word`` trimmed of whitespace, lower-cased, and stripped of
    leading/trailing ``,!.;?`` characters.

    Punctuation inside the word is left untouched.
    """
    lowered = word.strip().lower()
    return lowered.strip(",!.;?")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    toSVM()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment