Skip to content

Instantly share code, notes, and snippets.

@melpomene
Last active December 16, 2015 11:18
Show Gist options
  • Save melpomene/5425995 to your computer and use it in GitHub Desktop.
Save melpomene/5425995 to your computer and use it in GitHub Desktop.
Converts the card into XRFF format, allowing WEKA to read UTF-8 data.
#!/usr/bin/env python
# encoding: utf-8
import codecs
from collections import Counter
class Card:
    """One card: its header fields plus the questions collected for it.

    ``data`` is the list of tab-separated fields of one input row, in the
    order: id, category, star, name, question value, question text, answer.
    The first question of the card is taken from that same row; further
    questions are attached with :meth:`addQuestion`.
    """

    # Maps the value string found in the input to the line number that is
    # emitted as ``kvitt:line`` for the question.
    _LINE_BY_VALUE = {
        "250": 1,
        "500": 2,
        "1000": 3,
        "2000": 4,
        "5000": 5,
        "10000": 6,
    }

    def __init__(self, data):
        self.idnr = data[0].strip("\n")
        self.category = data[1].strip("\n")
        self.star = data[2].strip("\n")
        self.name = data[3].strip("\n")
        self.questions = list()
        self.addQuestion(data[4].strip("\n"), data[5], data[6].strip("\n"))

    def addQuestion(self, value, text, answer):
        """Append a question as ``[line_nr, value, text, answer]``.

        ``value`` is stripped of surrounding whitespace; ``text`` and
        ``answer`` only lose leading/trailing newlines.

        Raises:
            ValueError: if ``value`` is not one of the known value strings.
        """
        value = value.strip()
        nr = self._LINE_BY_VALUE.get(value)
        if nr is None:
            raise ValueError("Parse error: unknown question value %r" % (value,))
        self.questions.append([nr, value, text.strip("\n"), answer.strip("\n")])

    def __str__(self):
        """Render the card header followed by one line per question."""
        header = (
            u"kvitt:card{0} rdf:type kvitt:Card;\n"
            u"\tkvitt:header [\n"
            u"\t\tkvitt:id\t{0};\n"
            u'\t\tkvitt:category\t"{1}";\n'
            u'\t\tkvitt:star\t"{2}";\n'
            u'\t\tkvitt:name\t"{3}";\n'
            u'\t\tkvitt:questions \n'
        ).format(self.idnr, self.category, self.star, self.name)
        # The header is formatted on its own so that braces occurring in the
        # question text or answer are never interpreted as format fields.
        parts = [header]
        for nr, value, text, answer in self.questions:
            # Line 6 closes the question list; every other line is followed
            # by a separator.
            last = u"]." if nr == 6 else u","
            parts.append(
                u'\t\t\t\t[kvitt:line\t{0}; kvitt:value\t{1}; kvitt:text\t"{2}"; kvitt:answer\t"{3}"]{4}\n'.format(
                    nr, value, text, answer, last
                )
            )
        return u"".join(parts)
# Fixed XRFF preamble: XML declaration, inline DTD and the start of the
# dataset header, up to the opening <labels> tag of the class attribute.
_XRFF_HEADER = u"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE dataset
[
<!ELEMENT dataset (header,body)>
<!ATTLIST dataset name CDATA #REQUIRED>
<!ATTLIST dataset version CDATA "3.5.4">
<!ELEMENT header (notes?,attributes)>
<!ELEMENT body (instances)>
<!ELEMENT notes ANY> <!-- comments, information, copyright, etc. -->
<!ELEMENT attributes (attribute+)>
<!ELEMENT attribute (labels?,metadata?,attributes?)>
<!ATTLIST attribute name CDATA #REQUIRED>
<!ATTLIST attribute type (numeric|date|nominal|string|relational) #REQUIRED>
<!ATTLIST attribute format CDATA #IMPLIED>
<!ATTLIST attribute class (yes|no) "no">
<!ELEMENT labels (label*)> <!-- only for type "nominal" -->
<!ELEMENT label ANY>
<!ELEMENT metadata (property*)>
<!ELEMENT property ANY>
<!ATTLIST property name CDATA #REQUIRED>
<!ELEMENT instances (instance*)>
<!ELEMENT instance (value*)>
<!ATTLIST instance type (normal|sparse) "normal">
<!ATTLIST instance weight CDATA #IMPLIED>
<!ELEMENT value (#PCDATA|instances)*>
<!ATTLIST value index CDATA #IMPLIED> <!-- 1-based index (only used for instance format "sparse") -->
<!ATTLIST value missing (yes|no) "no">
]
>
<dataset name="book" version="3.5.3">
<header>
<attributes>
<attribute name="data" type="string"/>
<attribute class="yes" name="class" type="nominal">
<labels>"""

# Closes the class-label list and the header, and opens the instance list.
_XRFF_BODY_START = u"""
</labels>
</attribute>
</attributes>
</header>
<body>
<instances>
"""


def toSVM(src_path='fragor.txt', out_path='trainingset_stringkernel.xrff'):
    """Convert the tab-separated question file into an XRFF dataset.

    Each input row is split on tabs; rows sharing the same first field are
    grouped into one ``Card``. The output holds one instance per question,
    with the question text as the string attribute and the card's category
    (lower-cased, spaces removed) as the nominal class.

    Args:
        src_path: Path of the UTF-8 (optionally BOM-prefixed) input file.
        out_path: Path of the XRFF file to write (UTF-8).
    """
    # Local import so this block does not depend on the file-level imports.
    from xml.sax.saxutils import escape

    cards = dict()
    categories = set()
    with codecs.open(src_path, 'r', "utf-8-sig") as src:
        for line in src:
            data = line.split('\t')
            if len(data) < 7:
                # Report the malformed row and skip it; indexing data[4..6]
                # below would otherwise raise IndexError.
                print(data)
                continue
            # Register the category for every row (not only for rows of an
            # already-seen card) so each class value used by an instance is
            # also declared as a label.
            categories.add(data[1].lower().strip("\n").replace(' ', ''))
            if data[0] in cards:
                cards[data[0]].addQuestion(data[4], data[5], data[6])
            else:
                cards[data[0]] = Card(data)

    with codecs.open(out_path, 'w', 'utf-8') as output:
        output.write(_XRFF_HEADER)
        for c in categories:
            output.write(u"<label>%s</label>" % escape(c))
        output.write(_XRFF_BODY_START)
        for card in cards.values():
            label = escape(card.category.lower().replace(' ', ''))
            for q in card.questions:
                # q[2] is the question text; escape &, < and > so the
                # resulting document stays well-formed XML.
                row = u"<instance><value>" + escape(q[2]) + u"</value><value>" + label
                row += u"</value></instance>\n"
                output.write(row + u"\n")
        output.write(u"</instances></body></dataset>")
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    toSVM()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment