# ElasticSearch - filename search using nGram

# Delete previous tests
curl -XDELETE 'http://127.0.0.1:9200/files/?pretty=1'

# Setup
curl -XPUT 'http://127.0.0.1:9200/files/?pretty=1' -d '
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "filename_analyzer" : {
          "tokenizer" : "filename_tokenizer",
          "filter" : ["lowercase"]
        }
      },
      "tokenizer" : {
        "filename_tokenizer" : {
"type" : "NGram", | |
"max_gram" : 100, | |
"min_gram" : 2 | |
} | |
} | |
} | |
}, | |
"mappings" : { | |
"file" : { | |
"properties" : { | |
"filename" : { | |
"type" : "string", | |
"analyzer" : "filename_analyzer" | |
} | |
} | |
} | |
} | |
} | |
' | |
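
# (Sanity check, not part of the original gist: run the custom analyzer
# against a short sample string to confirm the nGram setup was accepted.)
curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=ab.txt&analyzer=filename_analyzer'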

# Insert some documents:
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.05.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.05.01.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.08.27.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2011.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "file_01_2012.09.09.txt" }'
curl -X POST 'http://localhost:9200/files/_refresh'

# Find all documents except the one containing "2011":
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } }
      ],
      "slop": 100,
      "in_order" : true
    }
  }
}
'
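
# (Note, not in the original gist: a span_near with a single clause matches
# wherever that one term occurs, so a plain term query on the nGram-analyzed
# field should return the same documents.)
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : { "term" : { "filename" : "2012" } }
}
'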

# Find all documents which contain "12" followed by "01" (the first three
# documents):
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "12" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop": 100,
      "in_order" : true
    }
  }
}
'

# BUT this search does not work:
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop": 100,
      "in_order" : true
    }
  }
}
'

# However, with "in_order" set to false it works, but it also returns the
# unwanted "file_01_2012.09.09.txt":
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop": 100,
      "in_order" : false
    }
  }
}
'

# I think the "2012 followed by 01" query does not work because of how the
# nGram tokenizer assigns each token's position, which the span_near query
# uses to determine the order of the terms:
curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=My_file_2012.01.05.txt&analyzer=filename_analyzer'
...
{
  "token" : "01",        ------------> "01" inside "2012"
  "start_offset" : 9,
  "end_offset" : 11,
  "type" : "word",
  "position" : 10
},
{
  "token" : "12",
  "start_offset" : 10,
  "end_offset" : 12,
  "type" : "word",
  "position" : 11
},
...
{
  "token" : "01",        ------------> "01" inside ".01."
  "start_offset" : 13,
  "end_offset" : 15,
  "type" : "word",
  "position" : 14
}
...
{
  "token" : "2012",
  "start_offset" : 8,
  "end_offset" : 12,
  "type" : "word",
  "position" : 50
}
...

# The nGram tokenizer simply increments the position for each token it emits:
# first it generates all tokens with two characters, which get "position"
# values from 1 to 21. Then it generates the tokens with three characters
# (positions 22 to 41), and so on.
# So when searching for "2012" and "01", the position values 50 and 10 are
# compared. They are not in order, so the files are not found. But when using
# "12" and "01", the values 11 and 14 are compared, which are in order.