Skip to content

Instantly share code, notes, and snippets.

@nazt
Created February 14, 2010 11:03
Show Gist options
  • Save nazt/303956 to your computer and use it in GitHub Desktop.
Save nazt/303956 to your computer and use it in GitHub Desktop.
import org.nazt.lexto.*
System.setProperty("file.encoding","UTF-8")
class Tokenizer
{
private def lextoObj = new LongLexTo("SELF")
private def TypeList,IndexList=[]
def tokenizedTextList
Tokenizer(String srcText)
{
new File('lexitron.txt').eachLine { lextoObj.addDict(it)}
tokenize(srcText)
}
def tokenize(String srcText)
{
tokenizedTextList=[]
lextoObj.lineInstance(srcText)
def indexPtr=0
TypeList = lextoObj.getTypeList()
IndexList= lextoObj.getIndexList()
IndexList.eachWithIndex{ val, idx ->
tokenizedTextList<<srcText[indexPtr..<val]
indexPtr=val
}
}
def getTokenizedList()
{
return this.tokenizedTextList
}
def getTokenizedText(String str)
{
this.tokenizedTextList.join("|")
}
def getTypeList() {this.TypeList}
def getIndexList() {this.IndexList}
}
def c = new Tokenizer("x")
println c.getTokenizedText("String")
def tokenizedFile
def map=new TreeMap()
def sumarize= [:]
def stopWords=[" ","%","&","'","+","#","","(",")","_","appVersion","arrow","is","of","the","!","\"",",",".","*", "\$","\t","\\","-","|","/",":",";","?",">","<","All","Click","Copyright","ffffff","]","["]
new File("./contents").eachFileRecurse { fn->
if (fn.isFile()) {
/* tokenizedFile=new File("tokenized/"+fn.name+"_tokized")*/
def filex=new File("contents/"+fn.name)
c.tokenize(filex.getText("UTF-8"))
sumarize_counter=0
(c.getTokenizedList()-stopWords).each{
vocab=it.trim()
mapValue=map.get(vocab)
if(mapValue==null)
{
treemap=new TreeMap()
treemap[fn.name]=1
treemap["sum"]=1
map[vocab]=treemap
}
else
{
mapFileValue=mapValue.get(fn.name)
mapValue["sum"]+=1
if(mapFileValue==null)
map[vocab][fn.name]=1
else
map[vocab][fn.name]+=1
}
sumarize_counter++
}
sumarize[fn.name]=sumarize_counter
/* println "\t\t\t\t" + map.size*/
/* tokenizedFile.append(c.getTokenizedText("String"),"UTF-8") */
}
}
/* println "First Key = " + map.firstKey()*/
println sumarize
println "size = " + sumarize.size()
map.remove(map.firstKey())
def inverted=new File('inverted.index')
def invertedFile=new File('inverted.file')
/*map.each { k,v->
v.each { kk,vv -> invertedFile.append(k+"\t"+kk+"->"+vv+"\n","UTF-8") }
}*/
def searched=new File('search.finish')
println map.size()
def result=[]
new File('search.text').eachLine{
result=map.get(it.trim())
try {
inverted.append(it+"\n","UTF-8")
searched.append("Searching -> " + it+ "\n\tTF = " + result["sum"]+ "\t","UTF-8")
searched.append("IDF = " + result.size()+"\n","UTF-8")
}
catch(Exception e) {
println "ERROR!!! "
}
result.each{k,v->
if(k!="sum")
{searched.append("K " + k + " v raw = " + v + " v normalize = " + (v/sumarize[k] as Float)*100 + "sum "+sumarize[k] +"\n","UTF-8") }
}
}
/*mapFile.each { k,v-> invertedFile.append(k + " " + v +"\n","UTF-8") }*/
//groovy -cp ./LexTo.jar tokenizer.groovy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment