# ElasticSearch - filename search using nGram

# Delete previous tests
curl -XDELETE 'http://127.0.0.1:9200/files/?pretty=1'

# Setup
curl -XPUT 'http://127.0.0.1:9200/files/?pretty=1' -d '
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "filename_analyzer" : {
          "tokenizer" : "filename_tokenizer",
          "filter" : ["lowercase"]
        }
      },
      "tokenizer" : {
        "filename_tokenizer" : {
"type" : "NGram", | |
"max_gram" : 100, | |
"min_gram" : 2 | |
} | |
} | |
} | |
}, | |
"mappings" : { | |
"file" : { | |
"properties" : { | |
"filename" : { | |
"type" : "string", | |
"analyzer" : "filename_analyzer" | |
} | |
} | |
} | |
} | |
} | |
' | |
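
# (Sanity check, not part of the original gist: run the custom analyzer
# against a short sample string to confirm the nGram setup was accepted.)
curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=ab.txt&analyzer=filename_analyzer'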

# Insert some documents:
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.05.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.05.01.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.08.27.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2011.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "file_01_2012.09.09.txt" }'
curl -X POST 'http://localhost:9200/files/_refresh'

# Find all documents except the one containing "2011":
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } }
      ],
      "slop": 100,
      "in_order" : true
    }
  }
}
'
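
# (Note, not in the original gist: a span_near with a single clause matches
# wherever that one term occurs, so a plain term query on the nGram-analyzed
# field should return the same documents.)
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : { "term" : { "filename" : "2012" } }
}
'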

# Find all documents which contain "12" followed by "01" (the first three
# documents):
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "12" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop": 100,
      "in_order" : true
    }
  }
}
'

# BUT this search does not work:
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop": 100,
      "in_order" : true
    }
  }
}
'

# However, with "in_order" set to false it works, but it also returns the
# unwanted "file_01_2012.09.09.txt":
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop": 100,
      "in_order" : false
    }
  }
}
'

# I think the "2012 followed by 01" query does not work because of how the
# nGram tokenizer assigns each token's position, which the span_near query
# uses to determine the order of the terms:
curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=My_file_2012.01.05.txt&analyzer=filename_analyzer'
...
{
  "token" : "01",        ------------> "01" inside "2012"
  "start_offset" : 9,
  "end_offset" : 11,
  "type" : "word",
  "position" : 10
},
{
  "token" : "12",
  "start_offset" : 10,
  "end_offset" : 12,
  "type" : "word",
  "position" : 11
},
...
{
  "token" : "01",        ------------> "01" inside ".01."
  "start_offset" : 13,
  "end_offset" : 15,
  "type" : "word",
  "position" : 14
}
...
{
  "token" : "2012",
  "start_offset" : 8,
  "end_offset" : 12,
  "type" : "word",
  "position" : 50
}
...

# The nGram tokenizer simply increments the position for each token it emits:
# first it generates all tokens with two characters, which get "position"
# values from 1 to 21. Then it generates the tokens with three characters
# (positions 22 to 41), and so on.
# So when searching for "2012" and "01", the position values 50 and 10 are
# compared. They are not in order, so the files are not found. But when using
# "12" and "01", the values 11 and 14 are compared, which are in order.