Created
July 5, 2012 12:57
-
-
Save slorber/3053540 to your computer and use it in GitHub Desktop.
ElasticSearch mapping multi_field analyzer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.stample.search; | |
import com.google.common.collect.Collections2; | |
import com.stample.search.engine.SearchEngineBuilder; | |
import com.stample.search.enums.EnumIndex; | |
import com.stample.search.enums.EnumType; | |
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; | |
import org.elasticsearch.action.index.IndexResponse; | |
import org.elasticsearch.client.AdminClient; | |
import org.elasticsearch.client.Client; | |
import org.elasticsearch.cluster.ClusterState; | |
import org.elasticsearch.cluster.metadata.IndexMetaData; | |
import org.elasticsearch.cluster.metadata.MappingMetaData; | |
import org.elasticsearch.common.Preconditions; | |
import org.elasticsearch.common.xcontent.XContentFactory; | |
import org.elasticsearch.node.Node; | |
import org.elasticsearch.node.NodeBuilder; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import org.testng.annotations.*; | |
import org.testng.collections.Lists; | |
import java.io.IOException; | |
import java.util.*; | |
/** | |
* @author Sebastien Lorber (<i>[email protected]</i>) | |
* Date: 02/07/12 - Time: 12:05 | |
*/ | |
public abstract class AbstractStampleUnitTest { | |
private static final Logger LOGGER = LoggerFactory.getLogger(AbstractStampleUnitTest.class); | |
private Node node1; | |
private Node node2; | |
private Node node3; | |
private Client client; | |
private AdminClient adminClient; | |
private SearchEngine searchEngine; | |
private Node buildTestNode() { | |
NodeBuilder nodeBuilder = NodeBuilder.nodeBuilder().local(true).data(true).loadConfigSettings(true); | |
nodeBuilder.getSettings().put("plugin.mandatory",""); | |
nodeBuilder.getSettings().put("index.mapper.dynamic","false"); | |
//nodeBuilder.getSettings().put("index.mapper.default_mapping_location","config/mappings/"); | |
nodeBuilder.getSettings().put("path.conf","./target/classes/config"); | |
nodeBuilder.getSettings().put("path.data","./target/es-data/"+ UUID.randomUUID().toString()); | |
return nodeBuilder.build(); | |
} | |
@BeforeClass | |
public void setUp() { | |
node1 = buildTestNode(); | |
node2 = buildTestNode(); | |
node3 = buildTestNode(); | |
node1.start(); | |
node2.start(); | |
node3.start(); | |
// | |
client = node1.client(); | |
adminClient = client.admin(); | |
searchEngine = new SearchEngineBuilder() | |
.setHighlightPreTag("<b>") | |
.setHighlightPostTag("</b>") | |
.setHighlightFragmentNumber(3) | |
.setHighlightFragmentSize(20) | |
.build(client); | |
LOGGER.info("\n" + | |
"###############################################\n" + | |
"######## ELASTICSEARCH TEST SETUP OK ##########\n" + | |
"###############################################\n" | |
); | |
} | |
@AfterClass | |
public void shutDown() { | |
node1.close(); | |
node2.close(); | |
node3.close(); | |
} | |
//@BeforeMethod | |
public void beforeTest() { | |
for ( EnumIndex index : EnumIndex.values() ) { | |
Preconditions.checkState( indiceExists(index), "The "+index+" index does not exist"); | |
} | |
} | |
protected boolean indiceExists(EnumIndex index) { | |
boolean bool = adminClient.indices().prepareExists(index.getIndexName()).execute().actionGet().exists(); | |
LOGGER.info("Index found: {}={}",index,bool); | |
return bool; | |
} | |
private String[] collectionToArray(Collection<String> collection) { | |
return Arrays.copyOf(collection.toArray(),collection.size(),String[].class); | |
} | |
protected void refreshIndex(EnumIndex... indexes) { | |
List<String> indexNames = Lists.newArrayList( Collections2.transform(Arrays.asList(indexes), EnumIndex.TO_INDEX_NAME) ); | |
adminClient.indices().prepareRefresh( collectionToArray(indexNames) ).setWaitForOperations(true).execute().actionGet(); | |
LOGGER.info("Indexes refreshed: {}",indexNames); | |
} | |
protected Client getClient() { | |
return client; | |
} | |
public SearchEngine getSearchEngine() { | |
return searchEngine; | |
} | |
protected void createStample(String title,String description) { | |
try { | |
String stampleDoc = XContentFactory.jsonBuilder() | |
.startObject() | |
.field("title", title) | |
.field("description", description) | |
.field("description", description) | |
.field("creationDate", String.valueOf(new Date().getTime())) | |
.endObject() | |
.string(); | |
IndexResponse response = client.prepareIndex() | |
.setIndex(EnumIndex.STAMPLES.getIndexName()) | |
.setType(EnumType.STAMPLE.getTypeName()) | |
.setSource(stampleDoc) | |
.execute() | |
.actionGet(); | |
LOGGER.info("Stample indexed with id={} and title={}",response.getId(),title); | |
refreshIndex(EnumIndex.STAMPLES); // refresh so that the stample is immediately visible to the search engine | |
if ( LOGGER.isDebugEnabled() ) { | |
logAnalysis(title,"title"); | |
logAnalysis(description,"description"); | |
} | |
} catch (IOException e) { | |
throw new RuntimeException("Can't insert stample",e); | |
} | |
} | |
protected MappingMetaData getMapping(EnumIndex index,EnumType type) { | |
ClusterState cs = adminClient.cluster().prepareState().setFilterIndices(index.getIndexName()).execute().actionGet().getState(); | |
IndexMetaData imd = Preconditions.checkNotNull(cs.getMetaData().index(index.getIndexName()), "Index metadata not found for " + index); | |
return Preconditions.checkNotNull(imd.mapping(type.getTypeName()), "Mapping metadata not found on index " + index + " for type " + type); | |
} | |
private void logAnalysis(String text,String fieldName) { | |
try { | |
if ( !LOGGER.isDebugEnabled() ) { | |
return; | |
} | |
AnalyzeResponse res = adminClient.indices() | |
.prepareAnalyze(EnumIndex.STAMPLES.getIndexName(),text) | |
.setField(fieldName) | |
.execute().actionGet(); | |
StringBuilder sb = new StringBuilder(); | |
sb.append("The text for field [").append(fieldName).append("] will be analyzed as:"); | |
for ( AnalyzeResponse.AnalyzeToken token : res.getTokens() ) { | |
sb.append("\n"); | |
sb.append("[offset=").append(token.getStartOffset()).append("-").append(token.getEndOffset()).append("]"); | |
sb.append("[type=").append(token.getType()).append("]"); | |
sb.append("[term=").append(token.getTerm()).append("]"); | |
} | |
LOGGER.debug(sb.toString()); | |
} catch ( Exception e ) { | |
LOGGER.warn("Can't log analysis for field {}",fieldName,e); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.stample.search.engine; | |
import com.google.common.base.Function; | |
import com.google.common.collect.Collections2; | |
import com.google.common.collect.Lists; | |
import com.stample.search.SearchEngine; | |
import com.stample.search.dto.SearchResultDTO; | |
import com.stample.search.dto.SuggestionDTO; | |
import com.stample.search.enums.EnumIndex; | |
import com.stample.search.enums.EnumType; | |
import org.elasticsearch.action.search.SearchRequest; | |
import org.elasticsearch.action.search.SearchRequestBuilder; | |
import org.elasticsearch.action.search.SearchResponse; | |
import org.elasticsearch.client.Client; | |
import org.elasticsearch.common.Preconditions; | |
import org.elasticsearch.index.query.BoolQueryBuilder; | |
import org.elasticsearch.index.query.TextQueryBuilder; | |
import org.elasticsearch.search.SearchHit; | |
import org.elasticsearch.search.highlight.HighlightField; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import java.util.Arrays; | |
import java.util.List; | |
/** | |
* @author Sebastien Lorber (<i>[email protected]</i>) | |
* Date: 28/06/12 - Time: 21:38 | |
*/ | |
public class SearchEngineImpl implements SearchEngine { | |
private static final Logger LOGGER = LoggerFactory.getLogger(SearchEngineImpl.class); | |
private final Client client; | |
private final String highlightPreTag; | |
private final String highlightPostTag; | |
private final int highlightFragmentSize; | |
private final int highlightFragmentNumber; | |
// package scope: Use the builder | |
SearchEngineImpl(Client client, | |
String highlightPreTag,String highlightPostTag, | |
int highlightFragmentSize,int highlightFragmentNumber) { | |
Preconditions.checkNotNull(client, "No client provided!"); | |
Preconditions.checkNotNull(highlightPreTag); | |
Preconditions.checkNotNull(highlightPostTag); | |
this.client = client; | |
this.highlightPreTag = highlightPreTag; | |
this.highlightPostTag = highlightPostTag; | |
this.highlightFragmentSize = highlightFragmentSize; | |
this.highlightFragmentNumber = highlightFragmentNumber; | |
} | |
/** | |
* Because the title is a multifield! | |
* Order matters for retrieving the most appropriate highlight! | |
*/ | |
private static final List<String> TITLE_MULTI_FIELDS = Arrays.asList( | |
"title.title", | |
"title.english", | |
"title.edgengrams" | |
); | |
/** | |
* Get the most appropriate title highlight from a search/ suggestion hit | |
* As we can get highlights for all title fields we consider exact match is more appropriate than stemmed match than ngrams match | |
* This may return null for search results, but normally not null for suggestions | |
*/ | |
private static final Function<SearchHit,String> HIT_TO_TITLE_HIGHLIGHT = new Function<SearchHit,String>() { | |
@Override | |
public String apply(SearchHit hit) { | |
if ( hit.getHighlightFields() != null ) { | |
for ( String highlightField : TITLE_MULTI_FIELDS ) { | |
if ( hit.getHighlightFields().get(highlightField) != null ) { | |
String highlight = hit.getHighlightFields().get(highlightField).getFragments()[0]; | |
Preconditions.checkNotNull(highlight,"Highlight should not be null! (?)"); | |
return highlight; | |
} | |
} | |
} | |
return null; | |
} | |
}; | |
private static final Function<SearchHit,SuggestionDTO> HIT_TO_SUGGESTION = new Function<SearchHit,SuggestionDTO>() { | |
@Override | |
public SuggestionDTO apply(SearchHit hit) { | |
SuggestionDTO dto = new SuggestionDTO(); | |
dto.fillBasicData(hit); | |
String highlight = HIT_TO_TITLE_HIGHLIGHT.apply(hit); | |
Preconditions.checkNotNull(highlight,"title highlight should not be null on a suggestion search hit"); | |
dto.setHighlight(highlight); | |
String fieldValue = hit.getFields().get("title").getValue(); | |
dto.setSuggestion( fieldValue ); | |
return dto; | |
} | |
}; | |
private BoolQueryBuilder titleQuery(String text) { | |
BoolQueryBuilder builder = new BoolQueryBuilder(); | |
for ( String titleField : TITLE_MULTI_FIELDS) { | |
builder | |
.should( | |
new TextQueryBuilder(titleField, text).type(TextQueryBuilder.Type.PHRASE).boost(2f) | |
) | |
.should( | |
new TextQueryBuilder(titleField, text).type(TextQueryBuilder.Type.BOOLEAN) | |
); | |
} | |
builder.minimumNumberShouldMatch(1); | |
return builder; | |
} | |
@Override | |
public List<SuggestionDTO> getSuggestions(String text, String userId) { | |
SearchRequestBuilder builder = client.prepareSearch(EnumIndex.STAMPLES.getIndexName()) | |
.setTypes(EnumType.STAMPLE.getTypeName()) | |
.addField("title") | |
.setHighlighterPreTags(highlightPreTag) | |
.setHighlighterPostTags(highlightPostTag); | |
for ( String titleField : TITLE_MULTI_FIELDS) { | |
builder.addHighlightedField(titleField, 0, 0); | |
} | |
builder.setQuery( titleQuery(text) ); | |
SearchResponse res = builder.execute().actionGet(); | |
List<SearchHit> hits = Arrays.asList(res.getHits().hits()); | |
List<SuggestionDTO> results = Lists.newArrayList( Collections2.transform(hits, HIT_TO_SUGGESTION) ); | |
LOGGER.info("Suggestions found for [{}] = {}",text,results.size()); | |
LOGGER.debug("Suggestion results displayed for [{}] = {}",text,results); | |
return results; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"stample":{ | |
"dynamic":false, | |
"_timestamp":{ | |
"enabled":true, | |
"path":"creationDate" | |
}, | |
"_source":{ | |
"enabled":false | |
}, | |
"properties":{ | |
"_id":{ | |
"store":"yes", | |
"type":"string" | |
}, | |
"title": { | |
"store":"yes", | |
"type" : "multi_field", | |
"fields" : { | |
"title" : { | |
"store":"yes", | |
"type" : "string", | |
"index" : "not_analyzed", | |
"boost":3 | |
}, | |
"english" : { | |
"store":"yes", | |
"type":"string", | |
"boost":1.5, | |
"analyzer":"english_analyzer" | |
}, | |
"edgengrams" : { | |
"store":"yes", | |
"type":"string", | |
"boost":0.5, | |
"analyzer":"title_edgengrams" | |
} | |
} | |
}, | |
"description":{ | |
"store":"yes", | |
"type":"string", | |
"term_vector":"with_positions_offsets", | |
"analyzer":"english_analyzer" | |
}, | |
"isPublic":{ | |
"store":"yes", | |
"type":"boolean" | |
}, | |
"userId":{ | |
"store":"yes", | |
"type":"string" | |
} | |
} | |
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.stample.search; | |
import com.stample.search.dto.SuggestionDTO; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import org.testng.annotations.Test; | |
import java.util.List; | |
import static org.testng.Assert.*; | |
/** | |
* @author Sebastien Lorber (<i>[email protected]</i>) | |
* Date: 04/07/12 - Time: 13:10 | |
*/ | |
public class SuggestionsTest extends AbstractStampleUnitTest { | |
private static final Logger LOGGER = LoggerFactory.getLogger(SuggestionsTest.class); | |
@Test | |
public void testSuggestions() { | |
createStample( | |
"the description of those things is a cool naming", | |
"blablabla random useless description that will be tested whenever it becomes possible" | |
); | |
List<SuggestionDTO> suggestions; | |
// | |
suggestions = getSearchEngine().getSuggestions("des", "TODO"); | |
assertEquals(suggestions.size(),0); // no match because ngram is size is 4-8 | |
// | |
suggestions = getSearchEngine().getSuggestions("desc", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>desc</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("descr", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>descr</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("descr", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>descr</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("descri", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>descri</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("descrip", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>descrip</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("descript", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because english stems to descript! | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>description</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("descripti", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because english stems to descript! | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>description</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("descriptio", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because english stems to descript! | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>description</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("description", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because exact match | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>description</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
} | |
@Test | |
public void testSuggestionsStemming() { | |
createStample( | |
"the description of those things is a cool name", | |
"blablabla random useless description that will be tested whenever it becomes possible" | |
); | |
List<SuggestionDTO> suggestions; | |
// | |
// | |
suggestions = getSearchEngine().getSuggestions("name", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>name</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("naming", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>name</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("namings", "TODO"); | |
assertEquals(suggestions.size(),1); // 1 match because ngram is size is 4-8 | |
assertTrue( suggestions.get(0).getHighlight().contains("<b>name</b>") , "Highlight = " + suggestions.get(0).getHighlight()); | |
// | |
suggestions = getSearchEngine().getSuggestions("namongs", "TODO"); | |
assertEquals(suggestions.size(),0); | |
suggestions = getSearchEngine().getSuggestions("namungs", "TODO"); | |
assertEquals(suggestions.size(),0); | |
suggestions = getSearchEngine().getSuggestions("namong", "TODO"); | |
assertEquals(suggestions.size(),0); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment