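// tokenizer.groovy -- Gist nazt/303956 (2010-02-14)
// Tokenizes Thai text files with the LongLexTo dictionary-based word
// segmenter, builds a term -> per-document frequency index, and answers
// the queries in search.text with term-frequency and document-frequency stats.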
import org.nazt.lexto.*

System.setProperty("file.encoding", "UTF-8")

class Tokenizer
{
  private def lextoObj = new LongLexTo("SELF")
  private def typeList, indexList = []
  def tokenizedTextList

  Tokenizer(String srcText)
  {
    // Load the word list (lexitron.txt) into the LexTo dictionary, one entry per line.
    new File('lexitron.txt').eachLine { lextoObj.addDict(it) }
    tokenize(srcText)
  }

  def tokenize(String srcText)
  {
    tokenizedTextList = []
    lextoObj.lineInstance(srcText)
    def indexPtr = 0
    typeList  = lextoObj.getTypeList()
    indexList = lextoObj.getIndexList()
    // indexList holds the end offset of each token; slicing the source
    // text between consecutive offsets recovers the tokens themselves.
    indexList.each { val ->
      tokenizedTextList << srcText[indexPtr..<val]
      indexPtr = val
    }
  }

  def getTokenizedList() { this.tokenizedTextList }

  // Join the most recent tokenization with '|' separators, e.g. for display.
  def getTokenizedText() { this.tokenizedTextList.join("|") }

  def getTypeList()  { this.typeList }
  def getIndexList() { this.indexList }
}
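// Minimal usage sketch (input/output hypothetical; the actual segmentation
// depends on which entries are present in lexitron.txt):
//   def t = new Tokenizer("สวัสดีชาวโลก")
//   t.getTokenizedList()   // e.g. ["สวัสดี", "ชาว", "โลก"]
//   t.getTokenizedText()   // e.g. "สวัสดี|ชาว|โลก"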
// Construct once (this loads the dictionary); "x" is just a warm-up input.
def c = new Tokenizer("x")
println c.getTokenizedText()
// map       : term -> TreeMap of (file name -> count, plus "sum" -> corpus-wide count)
// summarize : file name -> total token count, used later for normalization
def map = new TreeMap()
def summarize = [:]
def stopWords = [" ","%","&","'","+","#","","(",")","_","appVersion","arrow","is","of","the","!","\"",",",".","*", "\$","\t","\\","-","|","/",":",";","?",">","<","All","Click","Copyright","ffffff","]","["]
new File("./contents").eachFileRecurse { fn-> | |
if (fn.isFile()) { | |
/* tokenizedFile=new File("tokenized/"+fn.name+"_tokized")*/ | |
def filex=new File("contents/"+fn.name) | |
c.tokenize(filex.getText("UTF-8")) | |
sumarize_counter=0 | |
(c.getTokenizedList()-stopWords).each{ | |
vocab=it.trim() | |
mapValue=map.get(vocab) | |
if(mapValue==null) | |
{ | |
treemap=new TreeMap() | |
treemap[fn.name]=1 | |
treemap["sum"]=1 | |
map[vocab]=treemap | |
} | |
else | |
{ | |
mapFileValue=mapValue.get(fn.name) | |
mapValue["sum"]+=1 | |
if(mapFileValue==null) | |
map[vocab][fn.name]=1 | |
else | |
map[vocab][fn.name]+=1 | |
} | |
sumarize_counter++ | |
} | |
sumarize[fn.name]=sumarize_counter | |
/* println "\t\t\t\t" + map.size*/ | |
/* tokenizedFile.append(c.getTokenizedText("String"),"UTF-8") */ | |
} | |
} | |
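// Illustrative shape of the index at this point (file names hypothetical):
//   map       == [ "คำ": [ "doc1.txt": 2, "doc2.txt": 1, "sum": 3 ], ... ]
//   summarize == [ "doc1.txt": 120, "doc2.txt": 87 ]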
/* println "First Key = " + map.firstKey()*/ | |
println sumarize | |
println "size = " + sumarize.size() | |
map.remove(map.firstKey()) | |
def inverted=new File('inverted.index') | |
def invertedFile=new File('inverted.file') | |
/*map.each { k,v-> | |
v.each { kk,vv -> invertedFile.append(k+"\t"+kk+"->"+vv+"\n","UTF-8") } | |
}*/ | |
def searched=new File('search.finish') | |
println map.size() | |
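// search.text holds one query term per line, e.g. (hypothetical content):
//   ภาษา
//   คอมพิวเตอร์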
new File('search.text').eachLine {
  def result = map.get(it.trim())
  if (result == null) {
    println "ERROR!!! term not found: " + it
    return   // skip to the next query line
  }
  inverted.append(it + "\n", "UTF-8")
  // result["sum"] is the corpus-wide term frequency; the remaining entries
  // are the documents containing the term, so result.size() - 1 is the
  // document frequency (the figure an IDF would be computed from).
  searched.append("Searching -> " + it + "\n\tTF = " + result["sum"] + "\t", "UTF-8")
  searched.append("IDF = " + (result.size() - 1) + "\n", "UTF-8")
  result.each { k, v ->
    if (k != "sum") {
      // Normalize each raw count by that document's total token count.
      searched.append("K " + k + " v raw = " + v + " v normalize = " + (((v / summarize[k]) as Float) * 100) + " sum " + summarize[k] + "\n", "UTF-8")
    }
  }
}
/* Flat dump: map.each { k, v -> invertedFile.append(k + " " + v + "\n", "UTF-8") } */
// Run with: groovy -cp ./LexTo.jar tokenizer.groovy
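// Expected working-directory layout (assumed from the paths used above):
//   LexTo.jar                      LongLexTo classes (org.nazt.lexto)
//   lexitron.txt                   dictionary, one word per line
//   contents/                      the documents to index
//   search.text                    query terms, one per line
//   inverted.index, search.finish  output files, appended to on each run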