Skip to content

Instantly share code, notes, and snippets.

@slorber
Created July 5, 2012 12:57
Show Gist options
  • Save slorber/3053540 to your computer and use it in GitHub Desktop.
Save slorber/3053540 to your computer and use it in GitHub Desktop.
ElasticSearch mapping multi_field analyzer
package com.stample.search;
import com.google.common.collect.Collections2;
import com.stample.search.engine.SearchEngineBuilder;
import com.stample.search.enums.EnumIndex;
import com.stample.search.enums.EnumType;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.AdminClient;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.common.Preconditions;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.node.Node;
import org.elasticsearch.node.NodeBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.annotations.*;
import org.testng.collections.Lists;
import java.io.IOException;
import java.util.*;
/**
 * Base class for TestNG tests that need an ElasticSearch {@link Client} and a {@link SearchEngine}.
 * <p>
 * Once per test class, three local data nodes are built and started, a client is obtained from the
 * first node and a search engine is built on top of that client. The nodes are closed after the
 * class has run.
 *
 * @author Sebastien Lorber (<i>[email protected]</i>)
 * Date: 02/07/12 - Time: 12:05
 */
public abstract class AbstractStampleUnitTest {

    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractStampleUnitTest.class);

    private Node node1;
    private Node node2;
    private Node node3;
    private Client client;
    private AdminClient adminClient;
    private SearchEngine searchEngine;

    /**
     * Builds (without starting) a local data node that loads its configuration from
     * {@code ./target/classes/config} and stores its data in a fresh, randomly named directory
     * under {@code ./target/es-data/}.
     *
     * @return the built node
     */
    private Node buildTestNode() {
        NodeBuilder nodeBuilder = NodeBuilder.nodeBuilder().local(true).data(true).loadConfigSettings(true);
        nodeBuilder.getSettings().put("plugin.mandatory", "");
        nodeBuilder.getSettings().put("index.mapper.dynamic", "false");
        //nodeBuilder.getSettings().put("index.mapper.default_mapping_location","config/mappings/");
        nodeBuilder.getSettings().put("path.conf", "./target/classes/config");
        // A random directory per node keeps the nodes (and successive runs) from sharing data files.
        nodeBuilder.getSettings().put("path.data", "./target/es-data/" + UUID.randomUUID().toString());
        return nodeBuilder.build();
    }

    /**
     * Starts the three test nodes and creates the client, admin client and search engine
     * used by the tests.
     */
    @BeforeClass
    public void setUp() {
        node1 = buildTestNode();
        node2 = buildTestNode();
        node3 = buildTestNode();
        node1.start();
        node2.start();
        node3.start();
        //
        client = node1.client();
        adminClient = client.admin();
        searchEngine = new SearchEngineBuilder()
                .setHighlightPreTag("<b>")
                .setHighlightPostTag("</b>")
                .setHighlightFragmentNumber(3)
                .setHighlightFragmentSize(20)
                .build(client);
        LOGGER.info("\n" +
                "###############################################\n" +
                "######## ELASTICSEARCH TEST SETUP OK ##########\n" +
                "###############################################\n"
        );
    }

    /**
     * Closes the three test nodes. Nodes that were never built are skipped.
     */
    @AfterClass
    public void shutDown() {
        closeIfBuilt(node1);
        closeIfBuilt(node2);
        closeIfBuilt(node3);
    }

    /**
     * Closes the given node unless it is {@code null}.
     */
    private static void closeIfBuilt(Node node) {
        if (node != null) {
            node.close();
        }
    }

    /**
     * Checks that every index declared in {@link EnumIndex} exists.
     * The {@code @BeforeMethod} annotation is commented out, so TestNG does not invoke this
     * automatically.
     *
     * @throws IllegalStateException presumably, via {@code Preconditions.checkState}, when an index is missing
     */
    //@BeforeMethod
    public void beforeTest() {
        for (EnumIndex index : EnumIndex.values()) {
            Preconditions.checkState(indiceExists(index), "The " + index + " index does not exist");
        }
    }

    /**
     * Asks the cluster whether the given index exists and logs the answer.
     *
     * @param index the index to look for
     * @return {@code true} if the index exists
     */
    protected boolean indiceExists(EnumIndex index) {
        boolean bool = adminClient.indices().prepareExists(index.getIndexName()).execute().actionGet().exists();
        LOGGER.info("Index found: {}={}", index, bool);
        return bool;
    }

    /**
     * Copies a collection of strings into a new {@code String[]}.
     */
    private String[] collectionToArray(Collection<String> collection) {
        return collection.toArray(new String[collection.size()]);
    }

    /**
     * Refreshes the given indexes and blocks until the refresh request has completed.
     *
     * @param indexes the indexes to refresh
     */
    protected void refreshIndex(EnumIndex... indexes) {
        List<String> indexNames = Lists.newArrayList(Collections2.transform(Arrays.asList(indexes), EnumIndex.TO_INDEX_NAME));
        adminClient.indices().prepareRefresh(collectionToArray(indexNames)).setWaitForOperations(true).execute().actionGet();
        LOGGER.info("Indexes refreshed: {}", indexNames);
    }

    /**
     * @return the client obtained from the first test node
     */
    protected Client getClient() {
        return client;
    }

    /**
     * @return the search engine built on top of the test client
     */
    public SearchEngine getSearchEngine() {
        return searchEngine;
    }

    /**
     * Indexes a stample document made of the given title and description plus the current time as
     * {@code creationDate}, then refreshes the stamples index. When debug logging is enabled, the
     * analysis of both fields is logged as well.
     *
     * @param title       the stample title
     * @param description the stample description
     * @throws RuntimeException if the JSON document cannot be built
     */
    protected void createStample(String title, String description) {
        try {
            // Each field is written exactly once: a repeated name would yield a duplicate JSON key.
            String stampleDoc = XContentFactory.jsonBuilder()
                    .startObject()
                    .field("title", title)
                    .field("description", description)
                    .field("creationDate", String.valueOf(new Date().getTime()))
                    .endObject()
                    .string();
            IndexResponse response = client.prepareIndex()
                    .setIndex(EnumIndex.STAMPLES.getIndexName())
                    .setType(EnumType.STAMPLE.getTypeName())
                    .setSource(stampleDoc)
                    .execute()
                    .actionGet();
            LOGGER.info("Stample indexed with id={} and title={}", response.getId(), title);
            refreshIndex(EnumIndex.STAMPLES); // refresh so that the stample is immediately visible to the search engine
            // logAnalysis returns immediately when debug logging is off, so no guard is needed here.
            logAnalysis(title, "title");
            logAnalysis(description, "description");
        } catch (IOException e) {
            throw new RuntimeException("Can't insert stample", e);
        }
    }

    /**
     * Reads the mapping metadata of a type from the cluster state.
     *
     * @param index the index holding the type
     * @param type  the type whose mapping is wanted
     * @return the mapping metadata, never {@code null}
     * @throws NullPointerException presumably, via {@code Preconditions.checkNotNull}, when the index
     *                              or the mapping is not found
     */
    protected MappingMetaData getMapping(EnumIndex index, EnumType type) {
        ClusterState cs = adminClient.cluster().prepareState().setFilterIndices(index.getIndexName()).execute().actionGet().getState();
        IndexMetaData imd = Preconditions.checkNotNull(cs.getMetaData().index(index.getIndexName()), "Index metadata not found for " + index);
        return Preconditions.checkNotNull(imd.mapping(type.getTypeName()), "Mapping metadata not found on index " + index + " for type " + type);
    }

    /**
     * Logs, at debug level, the tokens produced when {@code text} is analyzed as the given field of
     * the stamples index. Does nothing when debug logging is disabled. This is a best-effort
     * diagnostic: any failure is logged as a warning and never propagated.
     *
     * @param text      the text to analyze
     * @param fieldName the field whose analyzer should be used
     */
    private void logAnalysis(String text, String fieldName) {
        if (!LOGGER.isDebugEnabled()) {
            return; // avoid the analyze request when nobody will read the output
        }
        try {
            AnalyzeResponse res = adminClient.indices()
                    .prepareAnalyze(EnumIndex.STAMPLES.getIndexName(), text)
                    .setField(fieldName)
                    .execute().actionGet();
            StringBuilder sb = new StringBuilder();
            sb.append("The text for field [").append(fieldName).append("] will be analyzed as:");
            for (AnalyzeResponse.AnalyzeToken token : res.getTokens()) {
                sb.append("\n");
                sb.append("[offset=").append(token.getStartOffset()).append("-").append(token.getEndOffset()).append("]");
                sb.append("[type=").append(token.getType()).append("]");
                sb.append("[term=").append(token.getTerm()).append("]");
            }
            LOGGER.debug(sb.toString());
        } catch (Exception e) {
            LOGGER.warn("Can't log analysis for field {}", fieldName, e);
        }
    }
}
package com.stample.search.engine;
import com.google.common.base.Function;
import com.google.common.collect.Collections2;
import com.google.common.collect.Lists;
import com.stample.search.SearchEngine;
import com.stample.search.dto.SearchResultDTO;
import com.stample.search.dto.SuggestionDTO;
import com.stample.search.enums.EnumIndex;
import com.stample.search.enums.EnumType;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.Preconditions;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.TextQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.highlight.HighlightField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.List;
/**
 * {@link SearchEngine} implementation that queries the stamples index through an ElasticSearch
 * {@link Client}.
 *
 * @author Sebastien Lorber (<i>[email protected]</i>)
 * Date: 28/06/12 - Time: 21:38
 */
public class SearchEngineImpl implements SearchEngine {

    private static final Logger LOGGER = LoggerFactory.getLogger(SearchEngineImpl.class);

    /**
     * Sub-fields of the {@code title} multi-field.
     * The order is significant: it is the order in which highlights are preferred.
     */
    private static final List<String> TITLE_MULTI_FIELDS = Arrays.asList(
            "title.title",
            "title.english",
            "title.edgengrams"
    );

    /**
     * Picks the title highlight of a hit: the first fragment of the first title sub-field (in
     * {@link #TITLE_MULTI_FIELDS} order) that carries a highlight. The intent is that an exact
     * match is preferred over a stemmed match, which is preferred over an n-gram match.
     * Yields {@code null} when the hit has no highlight on any title sub-field.
     */
    private static final Function<SearchHit, String> HIT_TO_TITLE_HIGHLIGHT = new Function<SearchHit, String>() {
        @Override
        public String apply(SearchHit hit) {
            if (hit.getHighlightFields() == null) {
                return null;
            }
            for (String candidate : TITLE_MULTI_FIELDS) {
                HighlightField highlighted = hit.getHighlightFields().get(candidate);
                if (highlighted == null) {
                    continue;
                }
                String firstFragment = highlighted.getFragments()[0];
                Preconditions.checkNotNull(firstFragment, "Highlight should not be null! (?)");
                return firstFragment;
            }
            return null;
        }
    };

    /**
     * Converts a hit into a {@link SuggestionDTO} carrying the hit's basic data, its title
     * highlight and the value of its {@code title} field. A hit without a title highlight is
     * rejected.
     */
    private static final Function<SearchHit, SuggestionDTO> HIT_TO_SUGGESTION = new Function<SearchHit, SuggestionDTO>() {
        @Override
        public SuggestionDTO apply(SearchHit hit) {
            SuggestionDTO suggestion = new SuggestionDTO();
            suggestion.fillBasicData(hit);

            String titleHighlight = HIT_TO_TITLE_HIGHLIGHT.apply(hit);
            Preconditions.checkNotNull(titleHighlight, "title highlight should not be null on a suggestion search hit");
            suggestion.setHighlight(titleHighlight);

            String title = hit.getFields().get("title").getValue();
            suggestion.setSuggestion(title);
            return suggestion;
        }
    };

    private final Client client;
    private final String highlightPreTag;
    private final String highlightPostTag;
    private final int highlightFragmentSize;
    private final int highlightFragmentNumber;

    /**
     * Package scope on purpose: instances are meant to be created through the builder.
     *
     * @param client                  the ElasticSearch client, must not be {@code null}
     * @param highlightPreTag         tag inserted before a highlighted term, must not be {@code null}
     * @param highlightPostTag        tag inserted after a highlighted term, must not be {@code null}
     * @param highlightFragmentSize   stored highlight fragment size
     * @param highlightFragmentNumber stored highlight fragment count
     */
    SearchEngineImpl(Client client,
                     String highlightPreTag, String highlightPostTag,
                     int highlightFragmentSize, int highlightFragmentNumber) {
        Preconditions.checkNotNull(client, "No client provided!");
        Preconditions.checkNotNull(highlightPreTag);
        Preconditions.checkNotNull(highlightPostTag);
        this.highlightFragmentNumber = highlightFragmentNumber;
        this.highlightFragmentSize = highlightFragmentSize;
        this.highlightPostTag = highlightPostTag;
        this.highlightPreTag = highlightPreTag;
        this.client = client;
    }

    /**
     * Builds the title query: for every title sub-field, a phrase text query boosted by 2 and a
     * boolean text query, all added as "should" clauses of which at least one has to match.
     *
     * @param text the text typed by the user
     * @return the boolean query over all title sub-fields
     */
    private BoolQueryBuilder titleQuery(String text) {
        BoolQueryBuilder query = new BoolQueryBuilder();
        for (String subField : TITLE_MULTI_FIELDS) {
            query.should(new TextQueryBuilder(subField, text).type(TextQueryBuilder.Type.PHRASE).boost(2f));
            query.should(new TextQueryBuilder(subField, text).type(TextQueryBuilder.Type.BOOLEAN));
        }
        query.minimumNumberShouldMatch(1);
        return query;
    }

    /**
     * Searches the stamples whose title matches the given text and returns one suggestion per hit,
     * each with a highlighted title.
     *
     * @param text   the text to find suggestions for
     * @param userId not used by this implementation
     * @return the suggestions, in the order of the search hits
     */
    @Override
    public List<SuggestionDTO> getSuggestions(String text, String userId) {
        SearchRequestBuilder request = client.prepareSearch(EnumIndex.STAMPLES.getIndexName());
        request.setTypes(EnumType.STAMPLE.getTypeName());
        request.addField("title");
        request.setHighlighterPreTags(highlightPreTag);
        request.setHighlighterPostTags(highlightPostTag);
        for (String subField : TITLE_MULTI_FIELDS) {
            request.addHighlightedField(subField, 0, 0);
        }
        request.setQuery(titleQuery(text));

        SearchResponse response = request.execute().actionGet();
        List<SearchHit> searchHits = Arrays.asList(response.getHits().hits());
        List<SuggestionDTO> suggestions = Lists.newArrayList(Collections2.transform(searchHits, HIT_TO_SUGGESTION));

        LOGGER.info("Suggestions found for [{}] = {}", text, suggestions.size());
        LOGGER.debug("Suggestion results displayed for [{}] = {}", text, suggestions);
        return suggestions;
    }
}
{
"stample":{
"dynamic":false,
"_timestamp":{
"enabled":true,
"path":"creationDate"
},
"_source":{
"enabled":false
},
"properties":{
"_id":{
"store":"yes",
"type":"string"
},
"title": {
"store":"yes",
"type" : "multi_field",
"fields" : {
"title" : {
"store":"yes",
"type" : "string",
"index" : "not_analyzed",
"boost":3
},
"english" : {
"store":"yes",
"type":"string",
"boost":1.5,
"analyzer":"english_analyzer"
},
"edgengrams" : {
"store":"yes",
"type":"string",
"boost":0.5,
"analyzer":"title_edgengrams"
}
}
},
"description":{
"store":"yes",
"type":"string",
"term_vector":"with_positions_offsets",
"analyzer":"english_analyzer"
},
"isPublic":{
"store":"yes",
"type":"boolean"
},
"userId":{
"store":"yes",
"type":"string"
}
}
}
}
package com.stample.search;
import com.stample.search.dto.SuggestionDTO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.annotations.Test;
import java.util.List;
import static org.testng.Assert.*;
/**
 * Tests of the title suggestions returned by the search engine.
 * <p>
 * NOTE(review): both tests index a stample into the same index, and the base class starts its nodes
 * once per class, so the "exactly one suggestion" assertions may depend on test execution order.
 * Confirm, and isolate the tests if needed.
 *
 * @author Sebastien Lorber (<i>[email protected]</i>)
 * Date: 04/07/12 - Time: 13:10
 */
public class SuggestionsTest extends AbstractStampleUnitTest {

    private static final Logger LOGGER = LoggerFactory.getLogger(SuggestionsTest.class);

    /** Placeholder user id passed to the search engine. */
    private static final String USER_ID = "TODO";

    /**
     * Checks which prefixes of the word "description" produce a suggestion and what gets
     * highlighted in the title.
     */
    @Test
    public void testSuggestions() {
        createStample(
                "the description of those things is a cool naming",
                "blablabla random useless description that will be tested whenever it becomes possible"
        );

        // Too short: per the original author's note, the edge n-grams are 4 to 8 characters long.
        assertNoSuggestion("des");

        // Prefixes within the n-gram size range: only the typed prefix is highlighted.
        assertSingleSuggestion("desc", "<b>desc</b>");
        assertSingleSuggestion("descr", "<b>descr</b>");
        assertSingleSuggestion("descri", "<b>descri</b>");
        assertSingleSuggestion("descrip", "<b>descrip</b>");

        // Per the original author's note, the english analyzer stems "description" to "descript",
        // so from this prefix on the whole word is highlighted.
        assertSingleSuggestion("descript", "<b>description</b>");
        assertSingleSuggestion("descripti", "<b>description</b>");
        assertSingleSuggestion("descriptio", "<b>description</b>");

        // Exact word.
        assertSingleSuggestion("description", "<b>description</b>");
    }

    /**
     * Checks that inflected forms of a title word still produce a suggestion, while misspelled
     * forms do not.
     */
    @Test
    public void testSuggestionsStemming() {
        createStample(
                "the description of those things is a cool name",
                "blablabla random useless description that will be tested whenever it becomes possible"
        );

        // The word as it appears in the title.
        assertSingleSuggestion("name", "<b>name</b>");

        // Inflected forms: presumably matched through the stemmed (english) title sub-field,
        // which is why the highlight is the title's own word "name" rather than the typed text.
        assertSingleSuggestion("naming", "<b>name</b>");
        assertSingleSuggestion("namings", "<b>name</b>");

        // Misspellings must not match.
        assertNoSuggestion("namongs");
        assertNoSuggestion("namungs");
        assertNoSuggestion("namong");
    }

    /**
     * Asserts that the query yields exactly one suggestion whose highlight contains the given text.
     *
     * @param query             the text to ask suggestions for
     * @param expectedHighlight the text the suggestion's highlight must contain
     */
    private void assertSingleSuggestion(String query, String expectedHighlight) {
        List<SuggestionDTO> suggestions = getSearchEngine().getSuggestions(query, USER_ID);
        assertEquals(suggestions.size(), 1, "Suggestion count for [" + query + "]");
        String highlight = suggestions.get(0).getHighlight();
        assertTrue(highlight.contains(expectedHighlight), "Highlight = " + highlight);
    }

    /**
     * Asserts that the query yields no suggestion at all.
     *
     * @param query the text to ask suggestions for
     */
    private void assertNoSuggestion(String query) {
        List<SuggestionDTO> suggestions = getSearchEngine().getSuggestions(query, USER_ID);
        assertEquals(suggestions.size(), 0, "Suggestion count for [" + query + "]");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment