package org.apache.solr.highlight;

import com.google.common.primitives.Longs;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.payloads.OcrInfo;
import org.apache.lucene.analysis.payloads.OcrPayloadHelper;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// ====================================================================================================================
// NOTE: The use of arrays instead of generics throughout this class might be irritating at first, but this is
//       in line with the rest of the Solr/Lucene code base. I didn't find any references that give a reason for
//       this style, but I assume that it is intended to reduce heap allocations as much as possible and to make the
//       operations as fast as possible. Think stuff like allowing the Hotspot VM to unroll loops since it
//       knows the size of the arrays in advance. (jbaiter)
// ====================================================================================================================

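// Illustrative usage sketch (not part of the original gist): assuming the highlighter is wired into a request
// handler and the OCR text lives in a field called "ocr_text" (both names are made up here), a request could look
// like the one below. Only hl.maxPerDoc and hl.maxPerPage are read directly by this class; enabling highlighting
// and picking the fields is handled by the inherited isHighlightingEnabled()/getHighlightFields() helpers used in
// doHighlighting() below.
//
//   http://localhost:8983/solr/mycore/select?q=ocr_text:berlin&hl=true&hl.fl=ocr_text&hl.maxPerDoc=100&hl.maxPerPage=10
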
/**
 * Custom highlighter that returns OCR coordinates for matches in matching documents without needing the stored
 * document content in the index.
 *
 * Parameters are:
 *   hl.maxPerDoc:  Maximum number of OCR coordinates returned for a single document
 *   hl.maxPerPage: Maximum number of OCR coordinates returned for an individual page
 */
public class CoordinateSolrHighlighter extends SolrHighlighter {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Generates a list of highlighted query term coordinates for each item in a
   * list of documents, or returns null if highlighting is disabled.
   *
   * @param docs query results
   * @param query the query
   * @param req the current request
   * @param defaultFields default list of fields to summarize
   *
   * @return NamedList containing a NamedList for each document, which in
   *         turn contains (field, coordinates) pairs.
   */
  @Override
  public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req,
                                          String[] defaultFields) throws IOException {
    SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params)) {
      return null;
    }
    int maxHighlightsPerDoc = params.getInt("hl.maxPerDoc", -1);
    int maxHighlightsPerPage = params.getInt("hl.maxPerPage", -1);
    IndexReader reader = req.getSearcher().getIndexReader();
    FieldQueryAdapter fq = new FieldQueryAdapter(query, reader);
    int[] docIds = toDocIDs(docs);
    String[] keys = getUniqueKeys(req.getSearcher(), docIds);
    String[] fieldNames = getHighlightFields(query, req, defaultFields);

    // For each document, obtain a mapping from field names to their matching OCR boxes
    List<Map<String, OcrInfo[]>> boxes = new ArrayList<>();
    for (int docId : docIds) {
      Map<String, OcrInfo[]> docBoxes = new HashMap<>();
      for (String fieldName : fieldNames) {
        // We grab the terms in their UTF-8 encoded form to avoid costly decoding operations
        // when checking for term equality down the line
        Set<BytesRef> termSet = fq.getBytesTermSet(fieldName);
        docBoxes.put(fieldName, getOcrInfos(reader, docId, fieldName, termSet, maxHighlightsPerDoc, maxHighlightsPerPage));
      }
      boxes.add(docBoxes);
    }
    return encodeSnippets(keys, fieldNames, boxes);
  }

  /**
   * Retrieve unique keys for matching documents.
   */
  private String[] getUniqueKeys(SolrIndexSearcher searcher, int[] docIds) throws IOException {
    IndexSchema schema = searcher.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    if (keyField != null) {
      Set<String> selector = Collections.singleton(keyField.getName());
      String[] uniqueKeys = new String[docIds.length];
      for (int i = 0; i < docIds.length; i++) {
        int docId = docIds[i];
        Document doc = searcher.doc(docId, selector);
        String id = schema.printableUniqueKey(doc);
        uniqueKeys[i] = id;
      }
      return uniqueKeys;
    } else {
      return new String[docIds.length];
    }
  }

  /**
   * Retrieve Document IDs from the list of matching documents.
   */
  private int[] toDocIDs(DocList docs) {
    int[] ids = new int[docs.size()];
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < ids.length; i++) {
      if (!iterator.hasNext()) {
        throw new AssertionError();
      }
      ids[i] = iterator.nextDoc();
    }
    if (iterator.hasNext()) {
      throw new AssertionError();
    }
    return ids;
  }

  /**
   * Retrieve all {@link OcrInfo}s for matching terms from a given field in a document.
   *
   * @param reader A reader into the search index
   * @param docId Identifier of the matching document
   * @param fieldName Field to obtain OCR information from
   * @param termSet Set of matching terms
   * @param maxHighlightsPerDoc Maximum number of OCR terms per document
   * @param maxHighlightsPerPage Maximum number of OCR terms per page
   * @return All OCR information for matching terms on all positions in the field
   * @throws IOException Error during retrieval from index
   */
  private OcrInfo[] getOcrInfos(IndexReader reader, int docId, String fieldName, Set<BytesRef> termSet,
                                int maxHighlightsPerDoc, int maxHighlightsPerPage) throws IOException {
    List<OcrInfo> ocrList = new ArrayList<>();
    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
      return null;
    }
    final Terms vector = vectors.terms(fieldName);
    if (vector == null || !vector.hasPositions() || !vector.hasPayloads()) {
      return null;
    }
    final TermsEnum termsEnum = vector.iterator();
    PostingsEnum dpEnum = null;
    BytesRef text;
    int currentPage = -1;
    int matchesOnCurrentPage = 0;
    // TODO: This is currently O(n) with respect to the document's vocabulary size.
    //       Unfortunately there's no easy way to avoid a linear scan with TermsEnum :/
    while ((text = termsEnum.next()) != null && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc)) {
      if (!termSet.contains(text)) {
        continue;
      }
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.POSITIONS | PostingsEnum.PAYLOADS);
      dpEnum.nextDoc();
      final int freq = dpEnum.freq();
      for (int i = 0; i < freq && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc); i++) {
        int pos = dpEnum.nextPosition();
        BytesRef payload = dpEnum.getPayload();
        if (payload.length != 7) {
          log.warn("Payload for matching term {} at position {} in document {} does not contain encoded OCR information " +
              "({} bytes instead of 7), ignoring it during highlighting.", text.utf8ToString(), pos, docId, payload.length);
          continue;
        }
        // NOTE: By encoding the payload directly into a long, we avoid one extra allocation.
        //       The previous approach was to copy the data into a byte[] and then to copy that into a long, i.e.
        //       two allocations instead of one.
        // TODO: Isn't there a less verbose way to do this?
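        // (A possible answer, not from the original gist: a more compact alternative that avoids Guava would be a
        //  plain shift loop over the seven payload bytes, producing the same big-endian value as the
        //  Longs.fromBytes() call below:
        //    long encoded = 0;
        //    for (int b = 0; b < 7; b++) {
        //      encoded = (encoded << 8) | (payload.bytes[payload.offset + b] & 0xFF);
        //    }
        //  )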
        long encoded = Longs.fromBytes(
            (byte) 0x00, payload.bytes[payload.offset], payload.bytes[payload.offset + 1],
            payload.bytes[payload.offset + 2], payload.bytes[payload.offset + 3], payload.bytes[payload.offset + 4],
            payload.bytes[payload.offset + 5], payload.bytes[payload.offset + 6]);
        OcrInfo info = OcrPayloadHelper.decodeOcrInfo(encoded);
        if (info.getPageNumber() != currentPage) {  // Are we on a new page?
          matchesOnCurrentPage = 0;
          currentPage = info.getPageNumber();
        }
        if (maxHighlightsPerPage < 0 || matchesOnCurrentPage < maxHighlightsPerPage) {  // Limit matches per page?
          info.setTerm(text.utf8ToString());
          info.setPosition(pos);
          ocrList.add(info);
          matchesOnCurrentPage++;
        }
      }
    }
    OcrInfo[] out = new OcrInfo[ocrList.size()];
    out = ocrList.toArray(out);
    return out;
  }

  /**
   * Encode the highlighting result into a format that can be used by upstream users.
   */
  private NamedList<Object> encodeSnippets(String[] keys, String[] fieldNames, List<Map<String, OcrInfo[]>> ocrInfos) {
    NamedList<Object> list = new SimpleOrderedMap<>();
    for (int i = 0; i < keys.length; i++) {
      NamedList<Object> summary = new SimpleOrderedMap<>();
      Map<String, OcrInfo[]> docBoxes = ocrInfos.get(i);
      for (String field : fieldNames) {
        OcrInfo[] boxes = docBoxes.get(field);
        if (boxes == null) {
          // getOcrInfos returns null when the field has no term vectors with positions and payloads,
          // so skip the field instead of running into a NullPointerException below.
          continue;
        }
        NamedList[] encodedBoxes = new NamedList[boxes.length];
        for (int j = 0; j < boxes.length; j++) {
          OcrInfo info = boxes[j];
          NamedList<Object> encoded = new SimpleOrderedMap<>();
          encoded.add("page", info.getPageNumber());
          encoded.add("position", info.getPosition());
          encoded.add("term", info.getTerm());
          encoded.add("x", info.getHorizontalOffset());
          encoded.add("y", info.getVerticalOffset());
          encoded.add("width", info.getWidth());
          encoded.add("height", info.getHeight());
          encodedBoxes[j] = encoded;
        }
        summary.add(field, encodedBoxes);
      }
      list.add(keys[i], summary);
    }
    return list;
  }
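
  // Illustrative shape of the structure built above (not taken from a real response; the document id, field name
  // and values are invented). Each matching document is keyed by its unique key and maps every highlighted field
  // to an array of OCR boxes; in a standard Solr setup this NamedList ends up in the highlighting section of the
  // response:
  //
  //   "doc-0001": {
  //     "ocr_text": [
  //       { "page": 3, "position": 17, "term": "berlin", "x": 102, "y": 381, "width": 54, "height": 12 },
  //       ...
  //     ]
  //   }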
}