Skip to content

Instantly share code, notes, and snippets.

@MasterEx
Created December 2, 2010 22:47
Show Gist options
  • Save MasterEx/726252 to your computer and use it in GitHub Desktop.
Save MasterEx/726252 to your computer and use it in GitHub Desktop.
A lucene application for my Information Retrieval class.
/**
* A lucene application for my Information Retrieval class.
*/
package luceneir;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Demo for an Information Retrieval class: builds a small in-memory Lucene
 * index from eight hard-coded documents, prints per-term TF / DF / IDF /
 * TF*IDF statistics, and runs a fixed list of queries against the
 * {@code title} and {@code body} fields.
 *
 * @author Periklis Ntanasis
 */
public class Main {

    /** Maximum number of hits collected per query and field. */
    private static final int MAX_HITS = 10;

    /** Queries executed (once per searchable field) by {@link #main(String[])}. */
    private static final String[] QUERIES = {
        "image retrieval engines", "image retrieval", "image retrieval image",
        "processing with programming languages processes", "Visual multimedia", "java",
        "Visual and semantic multimedia retrieval", "models"
    };

    /**
     * Builds the index, dumps its statistics and runs the demo queries.
     *
     * @param args the command line arguments (unused)
     * @throws CorruptIndexException if the index is corrupt
     * @throws LockObtainFailedException if the index write lock cannot be obtained
     * @throws IOException on any other index I/O failure
     * @throws ParseException if one of the demo queries cannot be parsed
     */
    public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
        SimpleAnalyzer analyzer = new SimpleAnalyzer();
        Directory index = new RAMDirectory();
        buildIndex(index, analyzer);

        IndexReader ir = IndexReader.open(index, true);
        try {
            System.out.println("\nTotal number of Docs: "+ir.numDocs()+"\n");
            System.out.println("\t\t~~INDEX~~");
            printIndex(ir);
            System.out.println("\t\t~~DOCUMENT VECTORS~~");
            printDocumentVectors(ir);
            System.out.println("\t\t~~QUERIES~~");
            runQueries(ir, analyzer);
        } finally {
            // The reader was never closed in the original code.
            ir.close();
        }
    }

    /** Creates a fresh index in {@code index} containing the eight demo documents. */
    private static void buildIndex(Directory index, SimpleAnalyzer analyzer) throws IOException {
        IndexWriter writer = new IndexWriter(index,analyzer,true,IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            System.out.println("\t\t~~The texts~~");
            addDoc(writer,"0","Visual retrieval engine","Search for images by content is a challenge");
            addDoc(writer,"1","Semantic retrieval","Use of semantic retrieval as a technique to retrieve images");
            addDoc(writer,"2","Object Oriented Programming","C++ is a object oriented programming language");
            addDoc(writer,"3","Natural Language processing","NLP and programming techniques");
            addDoc(writer,"4","Language models","Semantic retrieval using the language model");
            addDoc(writer,"5","Multimedia retrieval","Combined visual and semantic retrieval of images");
            addDoc(writer,"6","Semantic retrieval of images","impact of semantic retrieval to image retrieval");
            addDoc(writer,"7","Java","Programming tools with java");
            writer.optimize();
        } finally {
            // Close even when adding a document fails, so the writer is not left open.
            writer.close();
        }
    }

    /**
     * Prints one line per (term, document) posting with TF, DF, IDF and TF*IDF.
     * The enumeration is not restricted to a single field, so the same term
     * text may be listed once for each field it occurs in.
     */
    private static void printIndex(IndexReader ir) throws IOException {
        TermEnum terms = ir.terms();
        try {
            TermDocs postings = ir.termDocs();
            try {
                System.out.println("DocID TF DF IDF \t TF*IDF \t TERM");
                // Corpus size comes from the reader instead of a hard-coded 8.
                final double docCount = ir.numDocs();
                while (terms.next()) {
                    postings.seek(terms);
                    double termIdf = idf(docCount, terms.docFreq());
                    while (postings.next()) {
                        System.out.println(postings.doc()+"\t"+postings.freq()+"\t"+terms.docFreq()+"\t"+String.format("%f", termIdf)+"\t"+String.format("%f",(termIdf*postings.freq()))+"\t"+terms.term().text());
                    }
                }
            } finally {
                postings.close();
            }
        } finally {
            terms.close();
        }
    }

    /** Prints, for every term, the TF*IDF weight of each document containing it. */
    private static void printDocumentVectors(IndexReader ir) throws IOException {
        TermEnum terms = ir.terms();
        try {
            TermDocs postings = ir.termDocs();
            try {
                final double docCount = ir.numDocs();
                while (terms.next()) {
                    postings.seek(terms);
                    double termIdf = idf(docCount, terms.docFreq());
                    System.out.println("DOCID\tTerm: "+terms.term().text());
                    while (postings.next()) {
                        System.out.println(postings.doc()+" "+String.format("%f",(termIdf*postings.freq())));
                    }
                }
            } finally {
                postings.close();
            }
        } finally {
            terms.close();
        }
    }

    /** Runs every entry of {@link #QUERIES} against the title and body fields and prints the hits. */
    private static void runQueries(IndexReader ir, SimpleAnalyzer analyzer) throws IOException, ParseException {
        IndexSearcher searcher = new IndexSearcher(ir);
        try {
            QueryParser tparser = new QueryParser(Version.LUCENE_29, "title", analyzer);
            QueryParser bparser = new QueryParser(Version.LUCENE_29, "body", analyzer);
            for (String queryText : QUERIES) {
                System.out.println("*Query* : "+queryText);
                // Both searches run before anything is printed, as in the original flow.
                ScoreDoc[] hits = search(searcher, tparser, queryText);
                ScoreDoc[] bhits = search(searcher, bparser, queryText);
                System.out.println("In title");
                for (ScoreDoc hit : hits) {
                    System.out.println(">"+hit);
                }
                System.out.println("In body");
                for (ScoreDoc hit : bhits) {
                    System.out.println(">"+hit);
                }
            }
        } finally {
            searcher.close();
        }
    }

    /** Parses {@code queryText} with {@code parser} and returns up to {@link #MAX_HITS} top hits. */
    private static ScoreDoc[] search(IndexSearcher searcher, QueryParser parser, String queryText) throws IOException, ParseException {
        Query q = parser.parse(queryText);
        TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_HITS, true);
        searcher.search(q, collector);
        return collector.topDocs().scoreDocs;
    }

    /**
     * Builds a document with stored, analyzed {@code docid}, {@code title} and
     * {@code body} fields, prints it, and adds it to the index.
     *
     * @param w writer to add the document to
     * @param docid value of the {@code docid} field
     * @param title value of the {@code title} field
     * @param body value of the {@code body} field
     * @throws IOException if the writer fails to add the document
     */
    private static void addDoc(IndexWriter w,String docid, String title,String body) throws IOException {
        Document doc = new Document();
        doc.add(new Field("docid", docid, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("body", body, Field.Store.YES, Field.Index.ANALYZED));
        printAllFieldsNew(doc);
        w.addDocument(doc);
    }

    /**
     * Prints every field of {@code doc} as {@code name:value}.
     *
     * @param doc the document to print
     * @deprecated use {@link #printAllFieldsNew(Document)}; this method now
     *             simply delegates to it so it no longer relies on the
     *             deprecated {@code Document.fields()} enumeration.
     */
    @Deprecated
    public static void printAllFields(Document doc)
    {
        printAllFieldsNew(doc);
    }

    /**
     * Prints every field of {@code doc} as {@code name:value}, one per line,
     * in the order returned by {@code doc.getFields()}.
     *
     * @param doc the document to print
     */
    public static void printAllFieldsNew(Document doc)
    {
        // Iterate as Object so no raw/unchecked Iterator<Field> assignment is needed.
        for (Object entry : doc.getFields()) {
            Field field = (Field) entry;
            String value = doc.get(field.name());
            System.out.println(field.name() + ":" + value);
        }
    }

    /**
     * Inverse document frequency, {@code log10(d / dt)}.
     *
     * @param d total number of documents in the collection
     * @param dt number of documents containing the term
     * @return the base-10 logarithm of {@code d / dt}
     */
    private static double idf(double d,int dt) {
        return Math.log10(d/dt);
    }
}
~~The texts~~
docid:0
title:Visual retrieval engine
body:Search for images by content is a challenge
docid:1
title:Semantic retrieval
body:Use of semantic retrieval as a technique to retrieve images
docid:2
title:Object Oriented Programming
body:C++ is a object oriented programming language
docid:3
title:Natural Language processing
body:NLP and programming techniques
docid:4
title:Language models
body:Semantic retrieval using the language model
docid:5
title:Multimedia retrieval
body:Combined visual and semantic retrieval of images
docid:6
title:Semantic retrieval of images
body:impact of semantic retrieval to image retrieval
docid:7
title:Java
body:Programming tools with java
Total number of Docs: 8
~~INDEX~~
DocID TF DF IDF TF*IDF TERM
0 1 3 0.425969 0.425969 a
1 1 3 0.425969 0.425969 a
2 1 3 0.425969 0.425969 a
3 1 2 0.602060 0.602060 and
5 1 2 0.602060 0.602060 and
1 1 1 0.903090 0.903090 as
0 1 1 0.903090 0.903090 by
2 1 1 0.903090 0.903090 c
0 1 1 0.903090 0.903090 challenge
5 1 1 0.903090 0.903090 combined
0 1 1 0.903090 0.903090 content
0 1 1 0.903090 0.903090 for
6 1 1 0.903090 0.903090 image
0 1 3 0.425969 0.425969 images
1 1 3 0.425969 0.425969 images
5 1 3 0.425969 0.425969 images
6 1 1 0.903090 0.903090 impact
0 1 2 0.602060 0.602060 is
2 1 2 0.602060 0.602060 is
7 1 1 0.903090 0.903090 java
2 1 2 0.602060 0.602060 language
4 1 2 0.602060 0.602060 language
4 1 1 0.903090 0.903090 model
3 1 1 0.903090 0.903090 nlp
2 1 1 0.903090 0.903090 object
1 1 3 0.425969 0.425969 of
5 1 3 0.425969 0.425969 of
6 1 3 0.425969 0.425969 of
2 1 1 0.903090 0.903090 oriented
2 1 3 0.425969 0.425969 programming
3 1 3 0.425969 0.425969 programming
7 1 3 0.425969 0.425969 programming
1 1 4 0.301030 0.301030 retrieval
4 1 4 0.301030 0.301030 retrieval
5 1 4 0.301030 0.301030 retrieval
6 2 4 0.301030 0.602060 retrieval
1 1 1 0.903090 0.903090 retrieve
0 1 1 0.903090 0.903090 search
1 1 4 0.301030 0.301030 semantic
4 1 4 0.301030 0.301030 semantic
5 1 4 0.301030 0.301030 semantic
6 1 4 0.301030 0.301030 semantic
1 1 1 0.903090 0.903090 technique
3 1 1 0.903090 0.903090 techniques
4 1 1 0.903090 0.903090 the
1 1 2 0.602060 0.602060 to
6 1 2 0.602060 0.602060 to
7 1 1 0.903090 0.903090 tools
1 1 1 0.903090 0.903090 use
4 1 1 0.903090 0.903090 using
5 1 1 0.903090 0.903090 visual
7 1 1 0.903090 0.903090 with
0 1 1 0.903090 0.903090 engine
6 1 1 0.903090 0.903090 images
7 1 1 0.903090 0.903090 java
3 1 2 0.602060 0.602060 language
4 1 2 0.602060 0.602060 language
4 1 1 0.903090 0.903090 models
5 1 1 0.903090 0.903090 multimedia
3 1 1 0.903090 0.903090 natural
2 1 1 0.903090 0.903090 object
6 1 1 0.903090 0.903090 of
2 1 1 0.903090 0.903090 oriented
3 1 1 0.903090 0.903090 processing
2 1 1 0.903090 0.903090 programming
0 1 4 0.301030 0.301030 retrieval
1 1 4 0.301030 0.301030 retrieval
5 1 4 0.301030 0.301030 retrieval
6 1 4 0.301030 0.301030 retrieval
1 1 2 0.602060 0.602060 semantic
6 1 2 0.602060 0.602060 semantic
0 1 1 0.903090 0.903090 visual
~~DOCUMENT VECTORS~~
DOCID Term: a
0 0.425969
1 0.425969
2 0.425969
DOCID Term: and
3 0.602060
5 0.602060
DOCID Term: as
1 0.903090
DOCID Term: by
0 0.903090
DOCID Term: c
2 0.903090
DOCID Term: challenge
0 0.903090
DOCID Term: combined
5 0.903090
DOCID Term: content
0 0.903090
DOCID Term: for
0 0.903090
DOCID Term: image
6 0.903090
DOCID Term: images
0 0.425969
1 0.425969
5 0.425969
DOCID Term: impact
6 0.903090
DOCID Term: is
0 0.602060
2 0.602060
DOCID Term: java
7 0.903090
DOCID Term: language
2 0.602060
4 0.602060
DOCID Term: model
4 0.903090
DOCID Term: nlp
3 0.903090
DOCID Term: object
2 0.903090
DOCID Term: of
1 0.425969
5 0.425969
6 0.425969
DOCID Term: oriented
2 0.903090
DOCID Term: programming
2 0.425969
3 0.425969
7 0.425969
DOCID Term: retrieval
1 0.301030
4 0.301030
5 0.301030
6 0.602060
DOCID Term: retrieve
1 0.903090
DOCID Term: search
0 0.903090
DOCID Term: semantic
1 0.301030
4 0.301030
5 0.301030
6 0.301030
DOCID Term: technique
1 0.903090
DOCID Term: techniques
3 0.903090
DOCID Term: the
4 0.903090
DOCID Term: to
1 0.602060
6 0.602060
DOCID Term: tools
7 0.903090
DOCID Term: use
1 0.903090
DOCID Term: using
4 0.903090
DOCID Term: visual
5 0.903090
DOCID Term: with
7 0.903090
DOCID Term: engine
0 0.903090
DOCID Term: images
6 0.903090
DOCID Term: java
7 0.903090
DOCID Term: language
3 0.602060
4 0.602060
DOCID Term: models
4 0.903090
DOCID Term: multimedia
5 0.903090
DOCID Term: natural
3 0.903090
DOCID Term: object
2 0.903090
DOCID Term: of
6 0.903090
DOCID Term: oriented
2 0.903090
DOCID Term: processing
3 0.903090
DOCID Term: programming
2 0.903090
DOCID Term: retrieval
0 0.301030
1 0.301030
5 0.301030
6 0.301030
DOCID Term: semantic
1 0.602060
6 0.602060
DOCID Term: visual
0 0.903090
~~QUERIES~~
*Query* : image retrieval engines
In title
>doc=1 score=0.09794413
>doc=5 score=0.09794413
>doc=0 score=0.078355305
>doc=6 score=0.078355305
In body
>doc=6 score=0.5253691
>doc=4 score=0.06487
>doc=5 score=0.06487
>doc=1 score=0.054058336
*Query* : image retrieval
In title
>doc=1 score=0.19789651
>doc=5 score=0.19789651
>doc=0 score=0.15831721
>doc=6 score=0.15831721
In body
>doc=6 score=1.1707845
>doc=4 score=0.14456275
>doc=5 score=0.14456275
>doc=1 score=0.12046896
*Query* : image retrieval image
In title
>doc=1 score=0.09794413
>doc=5 score=0.09794413
>doc=0 score=0.078355305
>doc=6 score=0.078355305
In body
>doc=6 score=1.4715583
>doc=4 score=0.073380694
>doc=5 score=0.073380694
>doc=1 score=0.06115058
*Query* : processing with programming languages processes
In title
>doc=2 score=0.09021962
>doc=3 score=0.09021962
In body
>doc=7 score=0.28145066
>doc=3 score=0.047122654
>doc=2 score=0.03534199
*Query* : Visual multimedia
In title
>doc=5 score=0.52730155
>doc=0 score=0.42184123
In body
>doc=5 score=0.2740635
*Query* : java
In title
>doc=7 score=2.3862944
In body
>doc=7 score=1.1931472
*Query* : Visual and semantic multimedia retrieval
In title
>doc=5 score=0.37824464
>doc=0 score=0.30259573
>doc=1 score=0.2929821
>doc=6 score=0.23438568
In body
>doc=5 score=0.8640942
>doc=6 score=0.16169022
>doc=4 score=0.13394856
>doc=1 score=0.1116238
>doc=3 score=0.08107259
*Query* : models
In title
>doc=4 score=1.491434
In body
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment