package org.apache.solr.highlight;
import com.google.common.primitives.Longs;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.payloads.OcrInfo;
import org.apache.lucene.analysis.payloads.OcrPayloadHelper;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// ====================================================================================================================
// NOTE: The use of arrays instead of generics throughout this class might be irritating at first, but it is
// in line with the rest of the Solr/Lucene code base. I didn't find any references that give a reason for
// this style, but I assume it is intended to reduce heap allocations as much as possible and to make the
// operations as fast as possible, e.g. by allowing the HotSpot VM to unroll loops since it knows the size
// of the arrays in advance. (jbaiter)
// ====================================================================================================================
/**
* Custom highlighter that returns OCR coordinates for matches in matching documents without needing the stored
* document content in the index.
*
* Parameters are:
* hl.maxPerDoc: Maximum number of OCR coordinates returned for a single document
* hl.maxPerPage: Maximum number of OCR coordinates returned for an individual page
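*
* A request using these parameters might look like this (a sketch; "ocr_text" is a hypothetical field
* name and stands for a field indexed with term vectors, positions and OCR payloads):
*
*   /select?q=ocr_text:berlin&hl=true&hl.fl=ocr_text&hl.maxPerDoc=100&hl.maxPerPage=10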
*/
public class CoordinateSolrHighlighter extends SolrHighlighter {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* Generates a list of highlighted query term coordinates for each item in a
* list of documents, or returns null if highlighting is disabled.
*
* @param docs query results
* @param query the query
* @param req the current request
* @param defaultFields default list of fields to summarize
*
* @return NamedList containing a NamedList for each document, which in
* turn contains (field, coordinates) pairs.
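*
* A single response entry, serialized to JSON, might look roughly like this (a sketch with made-up
* values; "doc-1" and "ocr_text" are hypothetical identifiers):
*
*   "doc-1": {
*     "ocr_text": [
*       {"page": 3, "position": 17, "term": "berlin",
*        "x": 100, "y": 200, "width": 50, "height": 20}
*     ]
*   }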
*/
@Override
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req,
String[] defaultFields) throws IOException {
SolrParams params = req.getParams();
if (!isHighlightingEnabled(params)) {
return null;
}
int maxHighlightsPerDoc = params.getInt("hl.maxPerDoc", -1);
int maxHighlightsPerPage = params.getInt("hl.maxPerPage", -1);
IndexReader reader = req.getSearcher().getIndexReader();
FieldQueryAdapter fq = new FieldQueryAdapter(query, reader);
int[] docIds = toDocIDs(docs);
String[] keys = getUniqueKeys(req.getSearcher(), docIds);
String[] fieldNames = getHighlightFields(query, req, defaultFields);
// For each document, obtain a mapping from field names to their matching OCR boxes
List<Map<String, OcrInfo[]>> boxes = new ArrayList<>();
for (int docId : docIds) {
Map<String, OcrInfo[]> docBoxes = new HashMap<>();
for (String fieldName : fieldNames) {
// We grab the terms in their UTF-8 encoded form to avoid costly decoding operations
// when checking for term equality down the line
Set<BytesRef> termSet = fq.getBytesTermSet(fieldName);
docBoxes.put(fieldName, getOcrInfos(reader, docId, fieldName, termSet, maxHighlightsPerDoc, maxHighlightsPerPage));
}
boxes.add(docBoxes);
}
return encodeSnippets(keys, fieldNames, boxes);
}
/**
* Retrieve unique keys for matching documents.
*/
private String[] getUniqueKeys(SolrIndexSearcher searcher, int[] docIds) throws IOException {
IndexSchema schema = searcher.getSchema();
SchemaField keyField = schema.getUniqueKeyField();
if (keyField != null) {
Set<String> selector = Collections.singleton(keyField.getName());
String[] uniqueKeys = new String[docIds.length];
for (int i=0; i < docIds.length; i++) {
int docId = docIds[i];
Document doc = searcher.doc(docId, selector);
String id = schema.printableUniqueKey(doc);
uniqueKeys[i] = id;
}
return uniqueKeys;
} else {
return new String[docIds.length];
}
}
/**
* Retrieve Document IDs from the list of matching documents.
*/
private int[] toDocIDs(DocList docs) {
int[] ids = new int[docs.size()];
DocIterator iterator = docs.iterator();
for (int i = 0; i < ids.length; i++) {
if (!iterator.hasNext()) {
throw new AssertionError();
}
ids[i] = iterator.nextDoc();
}
if (iterator.hasNext()) {
throw new AssertionError();
}
return ids;
}
/**
* Retrieve all {@link OcrInfo}s for matching terms from a given field in a document.
*
* @param reader A reader into the search index
* @param docId Identifier of the matching document
* @param fieldName Field to obtain OCR information from
* @param termSet Set of matching terms
* @param maxHighlightsPerDoc Maximum number of OCR terms per document
* @param maxHighlightsPerPage Maximum number of OCR terms per page
* @return All OCR information for matching terms on all positions in the field
* @throws IOException Error during retrieval from index
*/
private OcrInfo[] getOcrInfos(IndexReader reader, int docId, String fieldName, Set<BytesRef> termSet,
int maxHighlightsPerDoc, int maxHighlightsPerPage) throws IOException {
List<OcrInfo> ocrList = new ArrayList<>();
final Fields vectors = reader.getTermVectors(docId);
if (vectors == null) {
return null;
}
final Terms vector = vectors.terms(fieldName);
if (vector == null || !vector.hasPositions() || !vector.hasPayloads()) {
return null;
}
final TermsEnum termsEnum = vector.iterator();
PostingsEnum dpEnum = null;
BytesRef text;
int currentPage = -1;
int matchesOnCurrentPage = 0;
// TODO: This is currently O(n) with respect to the document's vocabulary size.
// Unfortunately there's no easy way to avoid a linear scan with TermsEnum :/
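// A possible alternative (a sketch, not benchmarked against this code): when the query term set is
// small, seek to each query term directly instead of scanning the whole vocabulary, assuming the
// term-vector TermsEnum supports seekExact:
//
//   for (BytesRef queryTerm : termSet) {
//     if (termsEnum.seekExact(queryTerm)) {
//       // read postings and payloads for this term as in the loop body below
//     }
//   }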
while ((text = termsEnum.next()) != null && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc)) {
if (!termSet.contains(text)) {
continue;
}
dpEnum = termsEnum.postings(dpEnum, PostingsEnum.POSITIONS | PostingsEnum.PAYLOADS);
dpEnum.nextDoc();
final int freq = dpEnum.freq();
for (int i=0; i < freq && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc); i++) {
int pos = dpEnum.nextPosition();
BytesRef payload = dpEnum.getPayload();
if (payload == null || payload.length != 7) {
log.warn("Payload for matching term {} at position {} in document {} does not contain encoded OCR information " +
"({} bytes instead of 7), ignoring it during highlighting.", text.utf8ToString(), pos, docId,
payload == null ? 0 : payload.length);
continue;
}
// NOTE: By encoding the payload directly into a long, we avoid one extra allocation
// The previous approach was to copy the data into a byte[] and then to copy that into a long, i.e.
// two allocations instead of one.
// TODO: Isn't there a less verbose way to do this?
long encoded = Longs.fromBytes(
(byte) 0x00, payload.bytes[payload.offset], payload.bytes[payload.offset + 1],
payload.bytes[payload.offset + 2], payload.bytes[payload.offset + 3], payload.bytes[payload.offset + 4],
payload.bytes[payload.offset + 5], payload.bytes[payload.offset + 6]);
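// A less verbose equivalent would be to shift-accumulate the seven payload bytes into the long
// directly (a sketch, not used here so the original code path stays intact):
//
//   long encoded = 0;
//   for (int b = 0; b < 7; b++) {
//     encoded = (encoded << 8) | (payload.bytes[payload.offset + b] & 0xFFL);
//   }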
OcrInfo info = OcrPayloadHelper.decodeOcrInfo(encoded);
if (info.getPageNumber() != currentPage) { // Are we on a new page?
matchesOnCurrentPage = 0;
currentPage = info.getPageNumber();
}
if (maxHighlightsPerPage < 0 || matchesOnCurrentPage < maxHighlightsPerPage) { // Limit matches per page?
info.setTerm(text.utf8ToString());
info.setPosition(pos);
ocrList.add(info);
matchesOnCurrentPage++;
}
}
}
OcrInfo[] out = new OcrInfo[ocrList.size()];
out = ocrList.toArray(out);
return out;
}
/**
* Encode the highlighting result into a format that can be used by upstream users.
*/
private NamedList<Object> encodeSnippets(String[] keys, String[] fieldNames, List<Map<String, OcrInfo[]>> ocrInfos) {
NamedList<Object> list = new SimpleOrderedMap<>();
for (int i=0; i < keys.length; i++) {
NamedList<Object> summary = new SimpleOrderedMap<>();
Map<String, OcrInfo[]> docBoxes = ocrInfos.get(i);
for (String field : fieldNames) {
OcrInfo[] boxes = docBoxes.get(field);
if (boxes == null) {
// getOcrInfos returns null for fields without term vectors, positions or payloads
continue;
}
NamedList[] encodedBoxes = new NamedList[boxes.length];
for (int j=0; j < boxes.length; j++) {
OcrInfo info = boxes[j];
NamedList<Object> encoded = new SimpleOrderedMap<>();
encoded.add("page", info.getPageNumber());
encoded.add("position", info.getPosition());
encoded.add("term", info.getTerm());
encoded.add("x", info.getHorizontalOffset());
encoded.add("y", info.getVerticalOffset());
encoded.add("width", info.getWidth());
encoded.add("height", info.getHeight());
encodedBoxes[j] = encoded;
}
summary.add(field, encodedBoxes);
}
list.add(keys[i], summary);
}
return list;
}
}