Created
December 2, 2010 22:47
-
-
Save MasterEx/726252 to your computer and use it in GitHub Desktop.
A Lucene application for my Information Retrieval class.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* A lucene application for my Information Retrieval class. | |
*/ | |
package luceneir; | |
import java.io.IOException; | |
import java.util.Enumeration; | |
import java.util.Iterator; | |
import java.util.List; | |
import org.apache.lucene.analysis.SimpleAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.index.CorruptIndexException; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.TermDocs; | |
import org.apache.lucene.index.TermEnum; | |
import org.apache.lucene.queryParser.ParseException; | |
import org.apache.lucene.queryParser.QueryParser; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.ScoreDoc; | |
import org.apache.lucene.search.TopScoreDocCollector; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.LockObtainFailedException; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.util.Version; | |
/** | |
* | |
* @author Periklis Ntanasis | |
*/ | |
public class Main { | |
/** | |
* @param args the command line arguments | |
*/ | |
public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException { | |
SimpleAnalyzer analyzer = new SimpleAnalyzer(); | |
Directory index = new RAMDirectory(); | |
IndexWriter writer = new IndexWriter(index,analyzer,true,IndexWriter.MaxFieldLength.UNLIMITED); | |
System.out.println("\t\t~~The texts~~"); | |
addDoc(writer,"0","Visual retrieval engine","Search for images by content is a challenge"); | |
addDoc(writer,"1","Semantic retrieval","Use of semantic retrieval as a technique to retrieve images"); | |
addDoc(writer,"2","Object Oriented Programming","C++ is a object oriented programming language"); | |
addDoc(writer,"3","Natural Language processing","NLP and programming techniques"); | |
addDoc(writer,"4","Language models","Semantic retrieval using the language model"); | |
addDoc(writer,"5","Multimedia retrieval","Combined visual and semantic retrieval of images"); | |
addDoc(writer,"6","Semantic retrieval of images","impact of semantic retrieval to image retrieval"); | |
addDoc(writer,"7","Java","Programming tools with java"); | |
writer.optimize(); | |
writer.close(); | |
IndexReader ir = IndexReader.open(index, true); | |
System.out.println("\nTotal number of Docs: "+ir.numDocs()+"\n"); | |
System.out.println("\t\t~~INDEX~~"); | |
TermEnum ten = ir.terms(); | |
TermDocs tdoc = ir.termDocs(); | |
System.out.println("DocID TF DF IDF \t TF*IDF \t TERM"); | |
Double tmp; | |
while(ten.next()) | |
{ | |
tdoc.seek(ten); | |
tmp = idf(8.0D,ten.docFreq()); | |
while(tdoc.next()) | |
{ | |
System.out.println(tdoc.doc()+"\t"+tdoc.freq()+"\t"+ten.docFreq()+"\t"+String.format("%f", tmp)+"\t"+String.format("%f",(tmp*tdoc.freq()))+"\t"+ten.term().text()); | |
} | |
} | |
System.out.println("\t\t~~DOCUMENT VECTORS~~"); | |
ten = ir.terms(); | |
tdoc = ir.termDocs(); | |
while(ten.next()) | |
{ | |
tdoc.seek(ten); | |
tmp = idf(8.0D,ten.docFreq()); | |
System.out.println("DOCID\tTerm: "+ten.term().text()); | |
while(tdoc.next()) | |
{ | |
System.out.println(tdoc.doc()+" "+String.format("%f",(tmp*tdoc.freq()))); | |
} | |
} | |
System.out.println("\t\t~~QUERIES~~"); | |
String[] queries = {"image retrieval engines","image retrieval","image retrieval image" | |
,"processing with programming languages processes","Visual multimedia","java" | |
,"Visual and semantic multimedia retrieval","models"}; | |
IndexSearcher searcher = new IndexSearcher(ir); | |
TopScoreDocCollector collector; | |
Query q; | |
QueryParser tparser = new QueryParser(Version.LUCENE_29, "title", analyzer); | |
QueryParser bparser = new QueryParser(Version.LUCENE_29, "body", analyzer); | |
for(int i=0;i<queries.length;i++) | |
{ | |
System.out.println("*Query* : "+queries[i]); | |
q = tparser.parse(queries[i]); | |
collector = TopScoreDocCollector.create(10, true); | |
searcher.search(q, collector); | |
ScoreDoc[] hits = collector.topDocs().scoreDocs; | |
q = bparser.parse(queries[i]); | |
collector = TopScoreDocCollector.create(10, true); | |
searcher.search(q, collector); | |
ScoreDoc[] bhits = collector.topDocs().scoreDocs; | |
System.out.println("In title"); | |
for(int j=0;j<hits.length;j++) | |
System.out.println(">"+hits[j]); | |
System.out.println("In body"); | |
for(int k=0;k<bhits.length;k++) | |
System.out.println(">"+bhits[k]); | |
} | |
} | |
private static void addDoc(IndexWriter w,String docid, String title,String body) throws IOException { | |
Document doc = new Document(); | |
doc.add(new Field("docid", docid, Field.Store.YES, Field.Index.ANALYZED)); | |
doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED)); | |
doc.add(new Field("body", body, Field.Store.YES, Field.Index.ANALYZED)); | |
printAllFieldsNew(doc); | |
w.addDocument(doc); | |
} | |
/** | |
* | |
* @param doc | |
* @deprecated | |
*/ | |
public static void printAllFields(Document doc) | |
{ | |
Enumeration fieldEnum = doc.fields(); | |
while(fieldEnum.hasMoreElements()) | |
{ | |
Field field = (Field)fieldEnum.nextElement(); | |
String value = doc.get(field.name()); | |
System.out.println(field.name() + ":" + value); | |
} | |
} | |
public static void printAllFieldsNew(Document doc) | |
{ | |
List fieldList = doc.getFields(); | |
Iterator<Field> iterator = fieldList.iterator(); | |
while(iterator.hasNext()) | |
{ | |
Field field = (Field)iterator.next(); | |
String value = doc.get(field.name()); | |
System.out.println(field.name() + ":" + value); | |
} | |
} | |
private static double idf(double d,int dt) { | |
return Math.log(d/dt)/Math.log(10); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
~~The texts~~ | |
docid:0 | |
title:Visual retrieval engine | |
body:Search for images by content is a challenge | |
docid:1 | |
title:Semantic retrieval | |
body:Use of semantic retrieval as a technique to retrieve images | |
docid:2 | |
title:Object Oriented Programming | |
body:C++ is a object oriented programming language | |
docid:3 | |
title:Natural Language processing | |
body:NLP and programming techniques | |
docid:4 | |
title:Language models | |
body:Semantic retrieval using the language model | |
docid:5 | |
title:Multimedia retrieval | |
body:Combined visual and semantic retrieval of images | |
docid:6 | |
title:Semantic retrieval of images | |
body:impact of semantic retrieval to image retrieval | |
docid:7 | |
title:Java | |
body:Programming tools with java | |
Total number of Docs: 8 | |
~~INDEX~~ | |
DocID TF DF IDF TF*IDF TERM | |
0 1 3 0.425969 0.425969 a | |
1 1 3 0.425969 0.425969 a | |
2 1 3 0.425969 0.425969 a | |
3 1 2 0.602060 0.602060 and | |
5 1 2 0.602060 0.602060 and | |
1 1 1 0.903090 0.903090 as | |
0 1 1 0.903090 0.903090 by | |
2 1 1 0.903090 0.903090 c | |
0 1 1 0.903090 0.903090 challenge | |
5 1 1 0.903090 0.903090 combined | |
0 1 1 0.903090 0.903090 content | |
0 1 1 0.903090 0.903090 for | |
6 1 1 0.903090 0.903090 image | |
0 1 3 0.425969 0.425969 images | |
1 1 3 0.425969 0.425969 images | |
5 1 3 0.425969 0.425969 images | |
6 1 1 0.903090 0.903090 impact | |
0 1 2 0.602060 0.602060 is | |
2 1 2 0.602060 0.602060 is | |
7 1 1 0.903090 0.903090 java | |
2 1 2 0.602060 0.602060 language | |
4 1 2 0.602060 0.602060 language | |
4 1 1 0.903090 0.903090 model | |
3 1 1 0.903090 0.903090 nlp | |
2 1 1 0.903090 0.903090 object | |
1 1 3 0.425969 0.425969 of | |
5 1 3 0.425969 0.425969 of | |
6 1 3 0.425969 0.425969 of | |
2 1 1 0.903090 0.903090 oriented | |
2 1 3 0.425969 0.425969 programming | |
3 1 3 0.425969 0.425969 programming | |
7 1 3 0.425969 0.425969 programming | |
1 1 4 0.301030 0.301030 retrieval | |
4 1 4 0.301030 0.301030 retrieval | |
5 1 4 0.301030 0.301030 retrieval | |
6 2 4 0.301030 0.602060 retrieval | |
1 1 1 0.903090 0.903090 retrieve | |
0 1 1 0.903090 0.903090 search | |
1 1 4 0.301030 0.301030 semantic | |
4 1 4 0.301030 0.301030 semantic | |
5 1 4 0.301030 0.301030 semantic | |
6 1 4 0.301030 0.301030 semantic | |
1 1 1 0.903090 0.903090 technique | |
3 1 1 0.903090 0.903090 techniques | |
4 1 1 0.903090 0.903090 the | |
1 1 2 0.602060 0.602060 to | |
6 1 2 0.602060 0.602060 to | |
7 1 1 0.903090 0.903090 tools | |
1 1 1 0.903090 0.903090 use | |
4 1 1 0.903090 0.903090 using | |
5 1 1 0.903090 0.903090 visual | |
7 1 1 0.903090 0.903090 with | |
0 1 1 0.903090 0.903090 engine | |
6 1 1 0.903090 0.903090 images | |
7 1 1 0.903090 0.903090 java | |
3 1 2 0.602060 0.602060 language | |
4 1 2 0.602060 0.602060 language | |
4 1 1 0.903090 0.903090 models | |
5 1 1 0.903090 0.903090 multimedia | |
3 1 1 0.903090 0.903090 natural | |
2 1 1 0.903090 0.903090 object | |
6 1 1 0.903090 0.903090 of | |
2 1 1 0.903090 0.903090 oriented | |
3 1 1 0.903090 0.903090 processing | |
2 1 1 0.903090 0.903090 programming | |
0 1 4 0.301030 0.301030 retrieval | |
1 1 4 0.301030 0.301030 retrieval | |
5 1 4 0.301030 0.301030 retrieval | |
6 1 4 0.301030 0.301030 retrieval | |
1 1 2 0.602060 0.602060 semantic | |
6 1 2 0.602060 0.602060 semantic | |
0 1 1 0.903090 0.903090 visual | |
~~DOCUMENT VECTORS~~ | |
DOCID Term: a | |
0 0.425969 | |
1 0.425969 | |
2 0.425969 | |
DOCID Term: and | |
3 0.602060 | |
5 0.602060 | |
DOCID Term: as | |
1 0.903090 | |
DOCID Term: by | |
0 0.903090 | |
DOCID Term: c | |
2 0.903090 | |
DOCID Term: challenge | |
0 0.903090 | |
DOCID Term: combined | |
5 0.903090 | |
DOCID Term: content | |
0 0.903090 | |
DOCID Term: for | |
0 0.903090 | |
DOCID Term: image | |
6 0.903090 | |
DOCID Term: images | |
0 0.425969 | |
1 0.425969 | |
5 0.425969 | |
DOCID Term: impact | |
6 0.903090 | |
DOCID Term: is | |
0 0.602060 | |
2 0.602060 | |
DOCID Term: java | |
7 0.903090 | |
DOCID Term: language | |
2 0.602060 | |
4 0.602060 | |
DOCID Term: model | |
4 0.903090 | |
DOCID Term: nlp | |
3 0.903090 | |
DOCID Term: object | |
2 0.903090 | |
DOCID Term: of | |
1 0.425969 | |
5 0.425969 | |
6 0.425969 | |
DOCID Term: oriented | |
2 0.903090 | |
DOCID Term: programming | |
2 0.425969 | |
3 0.425969 | |
7 0.425969 | |
DOCID Term: retrieval | |
1 0.301030 | |
4 0.301030 | |
5 0.301030 | |
6 0.602060 | |
DOCID Term: retrieve | |
1 0.903090 | |
DOCID Term: search | |
0 0.903090 | |
DOCID Term: semantic | |
1 0.301030 | |
4 0.301030 | |
5 0.301030 | |
6 0.301030 | |
DOCID Term: technique | |
1 0.903090 | |
DOCID Term: techniques | |
3 0.903090 | |
DOCID Term: the | |
4 0.903090 | |
DOCID Term: to | |
1 0.602060 | |
6 0.602060 | |
DOCID Term: tools | |
7 0.903090 | |
DOCID Term: use | |
1 0.903090 | |
DOCID Term: using | |
4 0.903090 | |
DOCID Term: visual | |
5 0.903090 | |
DOCID Term: with | |
7 0.903090 | |
DOCID Term: engine | |
0 0.903090 | |
DOCID Term: images | |
6 0.903090 | |
DOCID Term: java | |
7 0.903090 | |
DOCID Term: language | |
3 0.602060 | |
4 0.602060 | |
DOCID Term: models | |
4 0.903090 | |
DOCID Term: multimedia | |
5 0.903090 | |
DOCID Term: natural | |
3 0.903090 | |
DOCID Term: object | |
2 0.903090 | |
DOCID Term: of | |
6 0.903090 | |
DOCID Term: oriented | |
2 0.903090 | |
DOCID Term: processing | |
3 0.903090 | |
DOCID Term: programming | |
2 0.903090 | |
DOCID Term: retrieval | |
0 0.301030 | |
1 0.301030 | |
5 0.301030 | |
6 0.301030 | |
DOCID Term: semantic | |
1 0.602060 | |
6 0.602060 | |
DOCID Term: visual | |
0 0.903090 | |
~~QUERIES~~ | |
*Query* : image retrieval engines | |
In title | |
>doc=1 score=0.09794413 | |
>doc=5 score=0.09794413 | |
>doc=0 score=0.078355305 | |
>doc=6 score=0.078355305 | |
In body | |
>doc=6 score=0.5253691 | |
>doc=4 score=0.06487 | |
>doc=5 score=0.06487 | |
>doc=1 score=0.054058336 | |
*Query* : image retrieval | |
In title | |
>doc=1 score=0.19789651 | |
>doc=5 score=0.19789651 | |
>doc=0 score=0.15831721 | |
>doc=6 score=0.15831721 | |
In body | |
>doc=6 score=1.1707845 | |
>doc=4 score=0.14456275 | |
>doc=5 score=0.14456275 | |
>doc=1 score=0.12046896 | |
*Query* : image retrieval image | |
In title | |
>doc=1 score=0.09794413 | |
>doc=5 score=0.09794413 | |
>doc=0 score=0.078355305 | |
>doc=6 score=0.078355305 | |
In body | |
>doc=6 score=1.4715583 | |
>doc=4 score=0.073380694 | |
>doc=5 score=0.073380694 | |
>doc=1 score=0.06115058 | |
*Query* : processing with programming languages processes | |
In title | |
>doc=2 score=0.09021962 | |
>doc=3 score=0.09021962 | |
In body | |
>doc=7 score=0.28145066 | |
>doc=3 score=0.047122654 | |
>doc=2 score=0.03534199 | |
*Query* : Visual multimedia | |
In title | |
>doc=5 score=0.52730155 | |
>doc=0 score=0.42184123 | |
In body | |
>doc=5 score=0.2740635 | |
*Query* : java | |
In title | |
>doc=7 score=2.3862944 | |
In body | |
>doc=7 score=1.1931472 | |
*Query* : Visual and semantic multimedia retrieval | |
In title | |
>doc=5 score=0.37824464 | |
>doc=0 score=0.30259573 | |
>doc=1 score=0.2929821 | |
>doc=6 score=0.23438568 | |
In body | |
>doc=5 score=0.8640942 | |
>doc=6 score=0.16169022 | |
>doc=4 score=0.13394856 | |
>doc=1 score=0.1116238 | |
>doc=3 score=0.08107259 | |
*Query* : models | |
In title | |
>doc=4 score=1.491434 | |
In body |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment