We create an index with:
- two filters: `synonyms_expand` and `synonyms_contract`
- two analyzers: `synonyms_expand` and `synonyms_contract`
- three text fields:
  - `text_1` uses the `synonyms_expand` analyzer at index and search time
  - `text_2` uses the `synonyms_expand` analyzer at index time, but the `standard` analyzer at search time
  - `text_3` uses the `synonyms_contract` analyzer at index and search time.
curl -XPUT 'http://127.0.0.1:9200/test/?pretty=1' -d '
{
"settings" : {
"analysis" : {
"filter" : {
"synonyms_expand" : {
"synonyms" : [
"foo,bar,baz"
],
"type" : "synonym"
},
"synonyms_contract" : {
"expand" : 0,
"synonyms" : [
"foo,bar,baz"
],
"type" : "synonym"
}
},
"analyzer" : {
"synonyms_expand" : {
"filter" : [
"standard",
"lowercase",
"stop",
"synonyms_expand"
],
"type" : "custom",
"tokenizer" : "standard"
},
"synonyms_contract" : {
"filter" : [
"standard",
"lowercase",
"stop",
"synonyms_contract"
],
"type" : "custom",
"tokenizer" : "standard"
}
}
}
},
"mappings" : {
"test" : {
"properties" : {
"text_1" : {
"type" : "string",
"analyzer" : "synonyms_expand"
},
"text_2" : {
"search_analyzer" : "standard",
"index_analyzer" : "synonyms_expand",
"type" : "string"
},
"text_3" : {
"type" : "string",
"analyzer" : "synonyms_contract"
}
}
}
}
}
'
Create a doc which includes a word from our synonyms list, `foo`:
curl -XPUT 'http://127.0.0.1:9200/test/test/1?pretty=1' -d '
{
"text_3" : "foo dog cat",
"text_2" : "foo dog cat",
"text_1" : "foo dog cat"
}
'
See what tokens have been stored in each of our fields:
curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1' -d '
{
"facets" : {
"text_3" : {
"terms" : {
"field" : "text_3"
}
},
"text_1" : {
"terms" : {
"field" : "text_1"
}
},
"text_2" : {
"terms" : {
"field" : "text_2"
}
}
},
"size" : 0
}
'
# "facets" : {
# "text_3" : {
# "other" : 0,
# "terms" : [
# {
# "count" : 1,
# "term" : "foo"
# },
# {
# "count" : 1,
# "term" : "dog"
# },
# {
# "count" : 1,
# "term" : "cat"
# }
# ],
# "missing" : 0,
# "_type" : "terms",
# "total" : 3
# },
# "text_2" : {
# "other" : 0,
# "terms" : [
# {
# "count" : 1,
# "term" : "foo"
# },
# {
# "count" : 1,
# "term" : "dog"
# },
# {
# "count" : 1,
# "term" : "cat"
# },
# {
# "count" : 1,
# "term" : "baz"
# },
# {
# "count" : 1,
# "term" : "bar"
# }
# ],
# "missing" : 0,
# "_type" : "terms",
# "total" : 5
# },
# "text_1" : {
# "other" : 0,
# "terms" : [
# {
# "count" : 1,
# "term" : "foo"
# },
# {
# "count" : 1,
# "term" : "dog"
# },
# {
# "count" : 1,
# "term" : "cat"
# },
# {
# "count" : 1,
# "term" : "baz"
# },
# {
# "count" : 1,
# "term" : "bar"
# }
# ],
# "missing" : 0,
# "_type" : "terms",
# "total" : 5
# }
So, `text_1` and `text_2` have expanded `foo` into the full synonym list. `text_3` has indexed just `foo` (which is the first synonym in the contracted list).
Searching on `text_1` uses the `synonyms_expand` analyzer on the query string, so the query below becomes a query for `"foo bar baz"`:
curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1' -d '
{
"query" : {
"match" : {
"text_1" : "bar"
}
},
"explain" : 1
}
'
# {
# "hits" : {
# "hits" : [
# {
# "_source" : {
# "text_3" : "foo dog cat",
# "text_1" : "foo dog cat",
# "text_2" : "foo dog cat"
# },
# "_score" : 0.26574233,
# "_index" : "test",
# "_shard" : 2,
# "_id" : "1",
# "_node" : "Yit05d94RgiUwMg9vzMOgw",
# "_type" : "test",
# "_explanation" : {
# "value" : 0.26574233,
# "details" : [
# {
# "value" : 0.08858078,
# "details" : [
# {
# "value" : 0.57735026,
# "details" : [
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 1.8815218,
# "description" : "queryNorm"
# }
# ],
# "description" : "queryWeight(text_1:foo), product of:"
# },
# {
# "value" : 0.15342641,
# "details" : [
# {
# "value" : 1,
# "description" : "tf(termFreq(text_1:foo)=1)"
# },
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 0.5,
# "description" : "fieldNorm(field=text_1, doc=0)"
# }
# ],
# "description" : "fieldWeight(text_1:foo in 0), product of:"
# }
# ],
# "description" : "weight(text_1:foo in 0), product of:"
# },
# {
# "value" : 0.08858078,
# "details" : [
# {
# "value" : 0.57735026,
# "details" : [
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 1.8815218,
# "description" : "queryNorm"
# }
# ],
# "description" : "queryWeight(text_1:bar), product of:"
# },
# {
# "value" : 0.15342641,
# "details" : [
# {
# "value" : 1,
# "description" : "tf(termFreq(text_1:bar)=1)"
# },
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 0.5,
# "description" : "fieldNorm(field=text_1, doc=0)"
# }
# ],
# "description" : "fieldWeight(text_1:bar in 0), product of:"
# }
# ],
# "description" : "weight(text_1:bar in 0), product of:"
# },
# {
# "value" : 0.08858078,
# "details" : [
# {
# "value" : 0.57735026,
# "details" : [
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 1.8815218,
# "description" : "queryNorm"
# }
# ],
# "description" : "queryWeight(text_1:baz), product of:"
# },
# {
# "value" : 0.15342641,
# "details" : [
# {
# "value" : 1,
# "description" : "tf(termFreq(text_1:baz)=1)"
# },
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 0.5,
# "description" : "fieldNorm(field=text_1, doc=0)"
# }
# ],
# "description" : "fieldWeight(text_1:baz in 0), product of:"
# }
# ],
# "description" : "weight(text_1:baz in 0), product of:"
# }
# ],
# "description" : "sum of:"
# }
# }
# ],
# "max_score" : 0.26574233,
# "total" : 1
# },
# "timed_out" : false,
# "_shards" : {
# "failed" : 0,
# "successful" : 5,
# "total" : 5
# },
# "took" : 3
# }
Searching on `text_2` uses the `standard` analyzer on the query string, so the query below remains a query for `"bar"`. But because `text_2` contains all 3 synonyms in the index, we find our doc:
curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1' -d '
{
"query" : {
"match" : {
"text_2" : "bar"
}
},
"explain" : 1
}
'
# {
# "hits" : {
# "hits" : [
# {
# "_source" : {
# "text_3" : "foo dog cat",
# "text_1" : "foo dog cat",
# "text_2" : "foo dog cat"
# },
# "_score" : 0.15342641,
# "_index" : "test",
# "_shard" : 2,
# "_id" : "1",
# "_node" : "Yit05d94RgiUwMg9vzMOgw",
# "_type" : "test",
# "_explanation" : {
# "value" : 0.15342641,
# "details" : [
# {
# "value" : 1,
# "description" : "tf(termFreq(text_2:bar)=1)"
# },
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 0.5,
# "description" : "fieldNorm(field=text_2, doc=0)"
# }
# ],
# "description" : "fieldWeight(text_2:bar in 0), product of:"
# }
# }
# ],
# "max_score" : 0.15342641,
# "total" : 1
# },
# "timed_out" : false,
# "_shards" : {
# "failed" : 0,
# "successful" : 5,
# "total" : 5
# },
# "took" : 2
# }
Searching on `text_3` uses the `synonyms_contract` analyzer on the query string, so the query below becomes a query for `"foo"` (the contracted form of `"bar"`):
curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1' -d '
{
"query" : {
"match" : {
"text_3" : "bar"
}
},
"explain" : 1
}
'
# {
# "hits" : {
# "hits" : [
# {
# "_source" : {
# "text_3" : "foo dog cat",
# "text_1" : "foo dog cat",
# "text_2" : "foo dog cat"
# },
# "_score" : 0.15342641,
# "_index" : "test",
# "_shard" : 2,
# "_id" : "1",
# "_node" : "Yit05d94RgiUwMg9vzMOgw",
# "_type" : "test",
# "_explanation" : {
# "value" : 0.15342641,
# "details" : [
# {
# "value" : 1,
# "description" : "tf(termFreq(text_3:foo)=1)"
# },
# {
# "value" : 0.30685282,
# "description" : "idf(docFreq=1, maxDocs=1)"
# },
# {
# "value" : 0.5,
# "description" : "fieldNorm(field=text_3, doc=0)"
# }
# ],
# "description" : "fieldWeight(text_3:foo in 0), product of:"
# }
# }
# ],
# "max_score" : 0.15342641,
# "total" : 1
# },
# "timed_out" : false,
# "_shards" : {
# "failed" : 0,
# "successful" : 5,
# "total" : 5
# },
# "took" : 1
# }
Thank you for this! =D