Created
August 17, 2011 03:01
-
-
Save javajosh/1150723 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedInputStream; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.index.IndexWriterConfig.OpenMode; | |
import org.apache.lucene.queryParser.ParseException; | |
import org.apache.lucene.queryParser.QueryParser; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.NIOFSDirectory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.util.Version; | |
import org.xml.sax.Attributes; | |
import org.xml.sax.InputSource; | |
import org.xml.sax.XMLReader; | |
import org.xml.sax.helpers.DefaultHandler; | |
import org.xml.sax.helpers.XMLReaderFactory; | |
import com.csvreader.CsvReader; | |
public class Main { | |
static final boolean DEBUG = true; | |
/** | |
* Find out if the activity log note field mentions any known procedure. | |
* | |
* TODO: Doesn't work! There are far too many false positives! I suspect a problem with query construction. | |
* | |
* @param args | |
* @throws Exception | |
*/ | |
public static void main(String[] args) throws Exception { | |
long start = System.currentTimeMillis(); | |
List<ActivityRecord> activityLog = parseActivityLog("Activity.csv"); | |
List<String> procedureNames = parseProcedureNames("Procedures.xml"); | |
// Now find the overlap with Lucene. We will write our index into | |
// memory. | |
Directory directory = DEBUG ? new NIOFSDirectory(new File( | |
"lucene_index")) : new RAMDirectory(); // allows us to use Luke | |
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_33); | |
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_33, | |
analyzer); | |
iwc.setOpenMode(OpenMode.CREATE); | |
IndexWriter writer = new IndexWriter(directory, iwc); | |
// Index the log notes | |
for (ActivityRecord activity : activityLog) { | |
if (activity.note.trim().isEmpty()) | |
continue; | |
Document doc = new Document(); | |
doc.add(new Field("start", activity.start, Field.Store.YES, | |
Field.Index.NOT_ANALYZED)); | |
doc.add(new Field("note", activity.note, Field.Store.YES, | |
Field.Index.ANALYZED)); | |
// System.out.println(activity); | |
writer.addDocument(doc); | |
} | |
writer.close(); | |
// Do one search per procedure name on the log notes index | |
IndexSearcher searcher = new IndexSearcher(directory); | |
QueryParser queryParser = new QueryParser(Version.LUCENE_33, "note", | |
analyzer); | |
for (String procedureName : procedureNames) { | |
try { | |
// This may actually be wrong, as strange characters may appear. | |
Query query = queryParser.parse(procedureName); // may throw | |
// parse | |
// exception | |
TopDocs rs = searcher.search(query, null, 10); | |
if (rs.totalHits > 0) { | |
Document firstHit = searcher.doc(rs.scoreDocs[0].doc); | |
System.out.printf("start: %s hits: %s proc: %s note: %s\n", | |
firstHit.getFieldable("start").stringValue(), | |
rs.totalHits, procedureName, | |
firstHit.getFieldable("note").stringValue()); | |
} | |
} catch (ParseException e) { | |
// | |
} | |
} | |
System.out.printf("Duration: %sms", System.currentTimeMillis() - start); | |
} | |
}} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment