rbramley · June 23, 2011 16:29
diff --git a/DuplicateId_LuceneIndexWalker.groovy b/DuplicateId_LuceneIndexWalker.groovy
 /**
 * Copying and distribution of this file, with or without modification,
 * are permitted in any medium without royalty provided the copyright
 * notice and this notice are preserved.  This file is offered as-is,
 * without any warranty.
 *
 * @author Robin Bramley (c) 2011
 *
 * Purpose:
 * Of the deletions in an unoptimised Lucene index, how many were 
 * due to updates?
 *
 * This script performs an undelete (not committed), then walks a 
 * Lucene index checking a 'unique' key field for duplicates. In 
 * verbose mode the unique keys and matching document numbers are 
 * output (e.g. for inspection using Luke).
 *
 * This is quick & dirty and has been tested using Groovy 1.8.0 with 
 * Lucene 2.9.1 DirectoryIndexReader (against an older Solr install).
 *
 * WARNING: You are advised to work on a backup copy of the index. 
 * 
 * Sample output:
 *   Docs (initial): 33426
 *   Deleted docs: 545
 *   ----------
 *   Docs: 33971
 *   Updates: 545 (updated records: 461)
 *   Deletions: 0
 *   ----------
 *   NYR48 : [8187, 33745]
 *     ...
 *   TAK141 : [30951, 33620, 33960]
 *     ...
 *   ----------
 */
 
 import org.apache.lucene.store.FSDirectory
 import org.apache.lucene.index.IndexReader
 import org.apache.lucene.index.IndexWriter

 // Which version of Lucene to Grab.
 // The @Grab annotation below is attached to the next declaration (uniqueKeyField).
 @Grab(group='org.apache.lucene', module='lucene-core', version='2.9.1')

 // TODO: You might want to set these
 def uniqueKeyField = 'id'   // name of the field that should be unique per record
 def indexPath = './index'   // directory holding the Lucene index to inspect
 def verbose = true          // when true, print every duplicated key with its document numbers

 // key working Lucene objects
 path = new File(indexPath)
 fsd = FSDirectory.open(path)
 ir = IndexReader.open(fsd, false) // open read-write so we can undelete

 try {
     // variables
     working = [:]       // unique key -> list of document numbers carrying that key
     dupes = [] as Set   // unique keys found on more than one document
     updates = 0         // documents whose key had already been seen on an earlier document
     missingKey = 0      // documents that have no value for the unique key field
     deleted = ir.numDeletedDocs()

     println "Docs (initial): ${ir.numDocs()}"

     // check for deletions
     if(ir.hasDeletions()) {
       println "Deleted docs: ${deleted}"
       ir.undeleteAll()
       // we don't commit this undelete
     }

     println '----------'
     n = ir.numDocs()
     println "Docs: ${n}"

     // Iterate through all the documents. The half-open range 0..<n is empty
     // when n == 0, whereas 0..(n-1) would become the reverse range 0..-1
     // and try to load documents 0 and -1 from an empty index.
     for (i in 0..<n) {
         // getField() yields null when the document lacks the field, so
         // navigate safely instead of failing the whole scan.
         def key = ir.document(i).getField(uniqueKeyField)?.stringValue()
         if (key == null) {
             missingKey++
             continue
         }

         def docNumbers = working.get(key)
         if (docNumbers == null) {
             // first sighting of this key
             working.put(key, [i])
         } else {
             // key already seen: the list is updated in place, no re-put needed
             docNumbers << i
             dupes.add(key)
             updates++
         }
     }

     if (missingKey > 0) {
         println "Docs without '${uniqueKeyField}' field: ${missingKey}"
     }
     println "Updates: ${updates} (updated records: ${dupes.size()})"
     println "Deletions: ${deleted - updates}"
     println '----------'
     if(verbose) {
         dupes.each { println "${it} : ${working.get(it)}" }
         println '----------'
     }
 } finally {
     // Clean up. The order of the steps below is kept exactly as originally
     // written: release the write lock, re-open read-only, close the reader,
     // close the directory.
     // NOTE(review): the original read-write reader is replaced rather than
     // closed, presumably so the pending undelete is never committed - confirm
     // against the Lucene version in use.
     try {
         // Compare the class name directly; going through ir.properties
         // would invoke every property getter on the reader just to read 'class'.
         if(ir.getClass().name == 'org.apache.lucene.index.DirectoryReader') {
             // access private variable thanks to Groovy
             ir.writeLock.release()
         } else {
             IndexWriter.unlock(fsd)
         }
     } finally {
         try {
             ir = ir.reopen(true) // fail safe re-open as read-only
         } finally {
             try {
                 ir.close()
             } finally {
                 fsd.close()
             }
         }
     }
 }
	/**
	* Copying and distribution of this file, with or without modification,
	* are permitted in any medium without royalty provided the copyright
	* notice and this notice are preserved. This file is offered as-is,
	* without any warranty.
	*
	* @author Robin Bramley (c) 2011
	*
	* Purpose:
	* Of the deletions in an unoptimised Lucene index, how many were
	* due to updates?
	*
	* This script performs an undelete (not committed), then walks a
	* Lucene index checking a 'unique' key field for duplicates. In
	* verbose mode the unique keys and matching document numbers are
	* output (e.g. for inspection using Luke).
	*
	* This is quick & dirty and has been tested using Groovy 1.8.0 with
	* Lucene 2.9.1 DirectoryIndexReader (against an older Solr install).
	*
	* WARNING: You are advised to work on a backup copy of the index.
	*
	* Sample output:
	* Docs (initial): 33426
	* Deleted docs: 545
	* ----------
	* Docs: 33971
	* Updates: 545 (updated records: 461)
	* Deletions: 0
	* ----------
	* NYR48 : [8187, 33745]
	* ...
	* TAK141 : [30951, 33620, 33960]
	* ...
	* ----------
	*/

	import org.apache.lucene.store.FSDirectory
	import org.apache.lucene.index.IndexReader
	import org.apache.lucene.index.IndexWriter

	// Which version of Lucene to Grab.
	// The @Grab annotation below is attached to the next declaration (uniqueKeyField).
	@Grab(group='org.apache.lucene', module='lucene-core', version='2.9.1')

	// TODO: You might want to set these
	def uniqueKeyField = 'id'   // name of the field that should be unique per record
	def indexPath = './index'   // directory holding the Lucene index to inspect
	def verbose = true          // when true, print every duplicated key with its document numbers

	// key working Lucene objects
	path = new File(indexPath)
	fsd = FSDirectory.open(path)
	ir = IndexReader.open(fsd, false) // open read-write so we can undelete

	try {
	    // variables
	    working = [:]       // unique key -> list of document numbers carrying that key
	    dupes = [] as Set   // unique keys found on more than one document
	    updates = 0         // documents whose key had already been seen on an earlier document
	    missingKey = 0      // documents that have no value for the unique key field
	    deleted = ir.numDeletedDocs()

	    println "Docs (initial): ${ir.numDocs()}"

	    // check for deletions
	    if(ir.hasDeletions()) {
	        println "Deleted docs: ${deleted}"
	        ir.undeleteAll()
	        // we don't commit this undelete
	    }

	    println '----------'
	    n = ir.numDocs()
	    println "Docs: ${n}"

	    // Iterate through all the documents. The half-open range 0..<n is empty
	    // when n == 0, whereas 0..(n-1) would become the reverse range 0..-1
	    // and try to load documents 0 and -1 from an empty index.
	    for (i in 0..<n) {
	        // getField() yields null when the document lacks the field, so
	        // navigate safely instead of failing the whole scan.
	        def key = ir.document(i).getField(uniqueKeyField)?.stringValue()
	        if (key == null) {
	            missingKey++
	            continue
	        }

	        def docNumbers = working.get(key)
	        if (docNumbers == null) {
	            // first sighting of this key
	            working.put(key, [i])
	        } else {
	            // key already seen: the list is updated in place, no re-put needed
	            docNumbers << i
	            dupes.add(key)
	            updates++
	        }
	    }

	    if (missingKey > 0) {
	        println "Docs without '${uniqueKeyField}' field: ${missingKey}"
	    }
	    println "Updates: ${updates} (updated records: ${dupes.size()})"
	    println "Deletions: ${deleted - updates}"
	    println '----------'
	    if(verbose) {
	        dupes.each { println "${it} : ${working.get(it)}" }
	        println '----------'
	    }
	} finally {
	    // Clean up. The order of the steps below is kept exactly as originally
	    // written: release the write lock, re-open read-only, close the reader,
	    // close the directory.
	    // NOTE(review): the original read-write reader is replaced rather than
	    // closed, presumably so the pending undelete is never committed - confirm
	    // against the Lucene version in use.
	    try {
	        // Compare the class name directly; going through ir.properties
	        // would invoke every property getter on the reader just to read 'class'.
	        if(ir.getClass().name == 'org.apache.lucene.index.DirectoryReader') {
	            // access private variable thanks to Groovy
	            ir.writeLock.release()
	        } else {
	            IndexWriter.unlock(fsd)
	        }
	    } finally {
	        try {
	            ir = ir.reopen(true) // fail safe re-open as read-only
	        } finally {
	            try {
	                ir.close()
	            } finally {
	                fsd.close()
	            }
	        }
	    }
	}