package org.apache.solr.highlight;

import com.google.common.primitives.Longs;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.payloads.OcrInfo;
import org.apache.lucene.analysis.payloads.OcrPayloadHelper;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// ====================================================================================================================
// NOTE: The use of arrays instead of generics throughout this class might be irritating at first, but this is
//       in line with the rest of the Solr/Lucene code base. I didn't find any references that give a reason for
//       this style, but I assume that it is intended to reduce heap allocations as much as possible and to make the
//       operations as fast as possible. Think stuff like allowing the Hotspot VM to unroll loops since it
//       knows the size of the arrays in advance. (jbaiter)
// ====================================================================================================================

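// Illustrative usage sketch (not part of the original gist): assuming the highlighter is wired into a request
// handler and the OCR text lives in a field called "ocr_text" (both names are made up here), a request could look
// like the one below. Only hl.maxPerDoc and hl.maxPerPage are read directly by this class; enabling highlighting
// and picking the fields is handled by the inherited isHighlightingEnabled()/getHighlightFields() helpers used in
// doHighlighting() below.
//
//   http://localhost:8983/solr/mycore/select?q=ocr_text:berlin&hl=true&hl.fl=ocr_text&hl.maxPerDoc=100&hl.maxPerPage=10
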
/**
 * Custom highlighter that returns OCR coordinates for matches in matching documents without needing the stored
 * document content in the index.
 *
 * Parameters are:
 *   hl.maxPerDoc:  Maximum number of OCR coordinates returned for a single document
 *   hl.maxPerPage: Maximum number of OCR coordinates returned for an individual page
 */
public class CoordinateSolrHighlighter extends SolrHighlighter {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Generates a list of highlighted query term coordinates for each item in a
   * list of documents, or returns null if highlighting is disabled.
   *
   * @param docs query results
   * @param query the query
   * @param req the current request
   * @param defaultFields default list of fields to summarize
   *
   * @return NamedList containing a NamedList for each document, which in
   *         turn contains (field, coordinates) pairs.
   */
  @Override
  public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req,
                                          String[] defaultFields) throws IOException {
    SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params)) {
      return null;
    }
    int maxHighlightsPerDoc = params.getInt("hl.maxPerDoc", -1);
    int maxHighlightsPerPage = params.getInt("hl.maxPerPage", -1);
    IndexReader reader = req.getSearcher().getIndexReader();
    FieldQueryAdapter fq = new FieldQueryAdapter(query, reader);
    int[] docIds = toDocIDs(docs);
    String[] keys = getUniqueKeys(req.getSearcher(), docIds);
    String[] fieldNames = getHighlightFields(query, req, defaultFields);

    // For each document, obtain a mapping from field names to their matching OCR boxes
    List<Map<String, OcrInfo[]>> boxes = new ArrayList<>();
    for (int docId : docIds) {
      Map<String, OcrInfo[]> docBoxes = new HashMap<>();
      for (String fieldName : fieldNames) {
        // We grab the terms in their UTF-8 encoded form to avoid costly decoding operations
        // when checking for term equality down the line
        Set<BytesRef> termSet = fq.getBytesTermSet(fieldName);
        docBoxes.put(fieldName, getOcrInfos(reader, docId, fieldName, termSet, maxHighlightsPerDoc, maxHighlightsPerPage));
      }
      boxes.add(docBoxes);
    }
    return encodeSnippets(keys, fieldNames, boxes);
  }

  /**
   * Retrieve unique keys for matching documents.
   */
  private String[] getUniqueKeys(SolrIndexSearcher searcher, int[] docIds) throws IOException {
    IndexSchema schema = searcher.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    if (keyField != null) {
      Set<String> selector = Collections.singleton(keyField.getName());
      String[] uniqueKeys = new String[docIds.length];
      for (int i = 0; i < docIds.length; i++) {
        int docId = docIds[i];
        Document doc = searcher.doc(docId, selector);
        String id = schema.printableUniqueKey(doc);
        uniqueKeys[i] = id;
      }
      return uniqueKeys;
    } else {
      return new String[docIds.length];
    }
  }

  /**
   * Retrieve Document IDs from the list of matching documents.
   */
  private int[] toDocIDs(DocList docs) {
    int[] ids = new int[docs.size()];
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < ids.length; i++) {
      if (!iterator.hasNext()) {
        throw new AssertionError();
      }
      ids[i] = iterator.nextDoc();
    }
    if (iterator.hasNext()) {
      throw new AssertionError();
    }
    return ids;
  }

  /**
   * Retrieve all {@link OcrInfo}s for matching terms from a given field in a document.
   *
   * @param reader A reader into the search index
   * @param docId Identifier of the matching document
   * @param fieldName Field to obtain OCR information from
   * @param termSet Set of matching terms
   * @param maxHighlightsPerDoc Maximum number of OCR terms per document
   * @param maxHighlightsPerPage Maximum number of OCR terms per page
   * @return All OCR information for matching terms on all positions in the field
   * @throws IOException Error during retrieval from index
   */
  private OcrInfo[] getOcrInfos(IndexReader reader, int docId, String fieldName, Set<BytesRef> termSet,
                                int maxHighlightsPerDoc, int maxHighlightsPerPage) throws IOException {
    List<OcrInfo> ocrList = new ArrayList<>();
    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
      return null;
    }
    final Terms vector = vectors.terms(fieldName);
    if (vector == null || !vector.hasPositions() || !vector.hasPayloads()) {
      return null;
    }
    final TermsEnum termsEnum = vector.iterator();
    PostingsEnum dpEnum = null;
    BytesRef text;
    int currentPage = -1;
    int matchesOnCurrentPage = 0;
    // TODO: This is currently O(n) with respect to the document's vocabulary size.
    //       Unfortunately there's no easy way to avoid a linear scan with TermsEnum :/
    while ((text = termsEnum.next()) != null && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc)) {
      if (!termSet.contains(text)) {
        continue;
      }
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.POSITIONS | PostingsEnum.PAYLOADS);
      dpEnum.nextDoc();
      final int freq = dpEnum.freq();
      for (int i = 0; i < freq && (maxHighlightsPerDoc < 0 || ocrList.size() < maxHighlightsPerDoc); i++) {
        int pos = dpEnum.nextPosition();
        BytesRef payload = dpEnum.getPayload();
        if (payload.length != 7) {
          log.warn("Payload for matching term {} at position {} in document {} does not contain encoded OCR information " +
              "({} bytes instead of 7), ignoring it during highlighting.", text.utf8ToString(), pos, docId, payload.length);
          continue;
        }
        // NOTE: By encoding the payload directly into a long, we avoid one extra allocation.
        //       The previous approach was to copy the data into a byte[] and then to copy that into a long, i.e.
        //       two allocations instead of one.
        // TODO: Isn't there a less verbose way to do this?
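        // (A possible answer, not from the original gist: a more compact alternative that avoids Guava would be a
        //  plain shift loop over the seven payload bytes, producing the same big-endian value as the
        //  Longs.fromBytes() call below:
        //    long encoded = 0;
        //    for (int b = 0; b < 7; b++) {
        //      encoded = (encoded << 8) | (payload.bytes[payload.offset + b] & 0xFF);
        //    }
        //  )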
        long encoded = Longs.fromBytes(
            (byte) 0x00, payload.bytes[payload.offset], payload.bytes[payload.offset + 1],
            payload.bytes[payload.offset + 2], payload.bytes[payload.offset + 3], payload.bytes[payload.offset + 4],
            payload.bytes[payload.offset + 5], payload.bytes[payload.offset + 6]);
        OcrInfo info = OcrPayloadHelper.decodeOcrInfo(encoded);
        if (info.getPageNumber() != currentPage) {  // Are we on a new page?
          matchesOnCurrentPage = 0;
          currentPage = info.getPageNumber();
        }
        if (maxHighlightsPerPage < 0 || matchesOnCurrentPage < maxHighlightsPerPage) {  // Limit matches per page?
          info.setTerm(text.utf8ToString());
          info.setPosition(pos);
          ocrList.add(info);
          matchesOnCurrentPage++;
        }
      }
    }
    OcrInfo[] out = new OcrInfo[ocrList.size()];
    out = ocrList.toArray(out);
    return out;
  }

  /**
   * Encode the highlighting result into a format that can be used by upstream users.
   */
  private NamedList<Object> encodeSnippets(String[] keys, String[] fieldNames, List<Map<String, OcrInfo[]>> ocrInfos) {
    NamedList<Object> list = new SimpleOrderedMap<>();
    for (int i = 0; i < keys.length; i++) {
      NamedList<Object> summary = new SimpleOrderedMap<>();
      Map<String, OcrInfo[]> docBoxes = ocrInfos.get(i);
      for (String field : fieldNames) {
        OcrInfo[] boxes = docBoxes.get(field);
        if (boxes == null) {
          // getOcrInfos returns null when the field has no term vectors with positions and payloads,
          // so skip the field instead of running into a NullPointerException below.
          continue;
        }
        NamedList[] encodedBoxes = new NamedList[boxes.length];
        for (int j = 0; j < boxes.length; j++) {
          OcrInfo info = boxes[j];
          NamedList<Object> encoded = new SimpleOrderedMap<>();
          encoded.add("page", info.getPageNumber());
          encoded.add("position", info.getPosition());
          encoded.add("term", info.getTerm());
          encoded.add("x", info.getHorizontalOffset());
          encoded.add("y", info.getVerticalOffset());
          encoded.add("width", info.getWidth());
          encoded.add("height", info.getHeight());
          encodedBoxes[j] = encoded;
        }
        summary.add(field, encodedBoxes);
      }
      list.add(keys[i], summary);
    }
    return list;
  }
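
  // Illustrative shape of the structure built above (not taken from a real response; the document id, field name
  // and values are invented). Each matching document is keyed by its unique key and maps every highlighted field
  // to an array of OCR boxes; in a standard Solr setup this NamedList ends up in the highlighting section of the
  // response:
  //
  //   "doc-0001": {
  //     "ocr_text": [
  //       { "page": 3, "position": 17, "term": "berlin", "x": 102, "y": 381, "width": 54, "height": 12 },
  //       ...
  //     ]
  //   }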
}