Created
June 23, 2011 16:29
-
-
Save rbramley/1042921 to your computer and use it in GitHub Desktop.
Script to walk an unoptimised Lucene index to determine how many deletions were due to updates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Copying and distribution of this file, with or without modification,
 * are permitted in any medium without royalty provided the copyright
 * notice and this notice are preserved. This file is offered as-is,
 * without any warranty.
 *
 * @author Robin Bramley (c) 2011
 *
 * Purpose:
 * Of the deletions in an unoptimised Lucene index, how many were
 * due to updates?
 *
 * This script performs an undelete (not committed), then walks a
 * Lucene index checking a 'unique' key field for duplicates. In
 * verbose mode the unique keys and matching document numbers are
 * output (e.g. for inspection using Luke).
 *
 * This is quick & dirty and has been tested using Groovy 1.8.0 with
 * Lucene 2.9.1 DirectoryIndexReader (against an older Solr install).
 *
 * WARNING: You are advised to work on a backup copy of the index.
 *
 * Sample output:
 * Docs (initial): 33426
 * Deleted docs: 545
 * ----------
 * Docs: 33971
 * Updates: 545 (updated records: 461)
 * Deletions: 0
 * ----------
 * NYR48 : [8187, 33745]
 * ...
 * TAK141 : [30951, 33620, 33960]
 * ...
 * ----------
 */
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.index.IndexReader
import org.apache.lucene.index.IndexWriter

// Which version of Lucene to Grab
@Grab(group='org.apache.lucene', module='lucene-core', version='2.9.1')
// TODO: You might want to set these
def uniqueKeyField = 'id'   // name of the field whose value is read from every document as its key
def indexPath = './index'   // directory containing the Lucene index to inspect
def verbose = true          // when true, each duplicated key and its document numbers are printed

// key working Lucene objects
// (assigned without 'def', so they are script-binding variables rather than locals)
path = new File(indexPath)
fsd = FSDirectory.open(path)
ir = IndexReader.open(fsd, false) // open read-write so we can undelete
// Undelete everything in memory, walk every document collecting the values
// of the unique key field, and report how many documents repeat a key that
// was already seen ("updates") versus the remaining deletions.
try {
    // working state
    working = [:]          // unique key -> list of document numbers carrying that key
    dupes = [] as Set      // unique keys found on more than one document
    updates = 0            // number of documents that repeat an already-seen key
    deleted = ir.numDeletedDocs()

    println "Docs (initial): ${ir.numDocs()}"

    // check for deletions
    if (ir.hasDeletions()) {
        println "Deleted docs: ${deleted}"
        ir.undeleteAll()
        // we don't commit this undelete
    }
    println '----------'

    n = ir.numDocs()
    println "Docs: ${n}"

    // iterate through all the documents
    // Half-open range: the inclusive 0..(n-1) turns into the reverse range
    // [0, -1] when the index is empty and would request non-existent documents.
    for (i in 0..<n) {
        def keyField = ir.document(i).getField(uniqueKeyField)
        if (keyField == null) {
            // Fail with a message that names the document instead of a bare NPE.
            throw new IllegalStateException(
                "Document ${i} has no '${uniqueKeyField}' field")
        }
        def id = keyField.stringValue()

        if (working.containsKey(id)) {
            // the list is appended to in place, so no re-put is needed
            working.get(id) << i
            dupes.add(id)
            updates++
        } else {
            working.put(id, [i])
        }
    }

    println "Updates: ${updates} (updated records: ${dupes.size()})"
    println "Deletions: ${deleted - updates}"
    println '----------'

    if (verbose) {
        dupes.each { println "${it} : ${working.get(it)}" }
        println '----------'
    }
} finally {
    // clean up: release the write lock, swap to a read-only reader, close
    // that reader, then close the directory. Each step sits in its own
    // finally so a failure in one does not skip the ones after it.
    // NOTE(review): presumably the lock is dropped and the reader swapped for
    // a read-only one so that closing does not persist the undelete -- confirm
    // against the Lucene 2.9 IndexReader.close() / reopen() behaviour.
    try {
        if (ir.getClass().name.endsWith('org.apache.lucene.index.DirectoryReader')) {
            // access private variable thanks to Groovy
            // Null-safe: when nothing was undeleted the reader was never
            // modified, so there may be no lock object to release.
            ir.writeLock?.release()
        } else {
            IndexWriter.unlock(fsd)
        }
    } finally {
        try {
            ir = ir.reopen(true) // fail safe re-open as read-only
        } finally {
            try {
                ir.close()
            } finally {
                fsd.close()
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment