Last active
December 16, 2015 11:18
-
-
Save melpomene/5425995 to your computer and use it in GitHub Desktop.
Converts the card into XRFF format, allowing WEKA to read UTF-8 data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# encoding: utf-8
import codecs
from collections import Counter
from xml.sax.saxutils import escape
class Card: | |
def __init__(self, data): | |
self.idnr= data[0].strip("\n") | |
self.category = data[1].strip("\n") | |
self.star = data[2].strip("\n") | |
self.name = data[3].strip("\n") | |
self.questions = list() | |
self.addQuestion(data[4].strip("\n"), data[5], data[6].strip("\n")) | |
def addQuestion(self,value, text, answer): | |
value = value.strip() | |
if value== "250": nr = 1 | |
elif value == "500": nr = 2 | |
elif value == "1000": nr = 3 | |
elif value == "2000": nr = 4 | |
elif value == "5000": nr = 5 | |
elif value == "10000": nr = 6 | |
else: raise Exception("Parse error") | |
self.questions.append([nr, value.strip("\n"), text.strip("\n"), answer.strip("\n")]) | |
def __str__(self): | |
s = u"kvitt:card{0} rdf:type kvitt:Card;\n" | |
s += u"\tkvitt:header [\n" | |
s += u"\t\tkvitt:id\t{0};\n" | |
s += u'\t\tkvitt:category\t"{1}";\n' | |
s += u'\t\tkvitt:star\t"{2}";\n' | |
s += u'\t\tkvitt:name\t"{3}";\n' | |
s += u'\t\tkvitt:questions \n' | |
for q in self.questions: | |
if q[0] == 6: | |
last = u"]." | |
else: | |
last = u"," | |
s += u'\t\t\t\t[kvitt:line\t{0}; kvitt:value\t{1}; kvitt:text\t"{2}"; kvitt:answer\t"{3}"]{4}\n'.format(q[0],q[1],q[2],q[3],last) | |
return s.format(self.idnr, self.category, self.star, self.name) | |
def toSVM(src_path='fragor.txt', out_path='trainingset_stringkernel.xrff'):
    """Convert the tab-separated question file into an XRFF dataset.

    Each input row is split on tabs; rows sharing the same first column are
    collected into one ``Card``. The output holds one instance per question:
    the question text as a string attribute and the card's category
    (lower-cased, spaces removed) as the nominal class attribute.

    Args:
        src_path: input file, read as UTF-8 with an optional BOM.
        out_path: XRFF file to write, encoded as UTF-8.

    Rows with fewer than seven tab-separated fields are printed and skipped.
    """
    cards = dict()
    categories = set()

    # Read everything first so the output file is only created once the
    # input has been parsed, and so both handles are closed deterministically.
    with codecs.open(src_path, 'r', "utf-8-sig") as src:
        for line in src:
            data = line.split('\t')
            if len(data) < 7:
                # Report the malformed row and move on instead of failing
                # later with an IndexError.
                print(data)
                continue
            # Record the category for every row so cards with a single
            # question still get a label declared in the header.
            categories.add(data[1].lower().strip("\n"))
            if data[0] in cards:
                cards[data[0]].addQuestion(data[4], data[5], data[6])
            else:
                cards[data[0]] = Card(data)

    with codecs.open(out_path, 'w', 'utf-8') as output:
        output.write("""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE dataset
[
<!ELEMENT dataset (header,body)>
<!ATTLIST dataset name CDATA #REQUIRED>
<!ATTLIST dataset version CDATA "3.5.4">
<!ELEMENT header (notes?,attributes)>
<!ELEMENT body (instances)>
<!ELEMENT notes ANY> <!-- comments, information, copyright, etc. -->
<!ELEMENT attributes (attribute+)>
<!ELEMENT attribute (labels?,metadata?,attributes?)>
<!ATTLIST attribute name CDATA #REQUIRED>
<!ATTLIST attribute type (numeric|date|nominal|string|relational) #REQUIRED>
<!ATTLIST attribute format CDATA #IMPLIED>
<!ATTLIST attribute class (yes|no) "no">
<!ELEMENT labels (label*)> <!-- only for type "nominal" -->
<!ELEMENT label ANY>
<!ELEMENT metadata (property*)>
<!ELEMENT property ANY>
<!ATTLIST property name CDATA #REQUIRED>
<!ELEMENT instances (instance*)>
<!ELEMENT instance (value*)>
<!ATTLIST instance type (normal|sparse) "normal">
<!ATTLIST instance weight CDATA #IMPLIED>
<!ELEMENT value (#PCDATA|instances)*>
<!ATTLIST value index CDATA #IMPLIED> <!-- 1-based index (only used for instance format "sparse") -->
<!ATTLIST value missing (yes|no) "no">
]
>
<dataset name="book" version="3.5.3">
<header>
<attributes>
<attribute name="data" type="string"/>
<attribute class="yes" name="class" type="nominal">
<labels>""")
        # Sorted so the generated file is identical from run to run.
        for c in sorted(categories):
            output.write("<label>%s</label>" % escape(c.replace(' ', '')))
        output.write("""
</labels>
</attribute>
</attributes>
</header>
<body>
<instances>
""")
        for card in cards.values():
            # Must match the label text written in the header above.
            label = escape(card.category.lower().replace(' ', ''))
            for q in card.questions:
                # escape() replaces &, < and > so free text cannot break
                # the XML structure.
                row = "<instance><value>" + escape(q[2]) + "</value><value>" + label
                row += "</value></instance>\n"
                output.write(row + "\n")
        output.write("</instances></body></dataset>")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    toSVM()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment