xeraa · May 12, 2024 21:56
diff --git a/chunking-multi-vector.json b/chunking-multi-vector.json
 // Start from a clean slate; ignore_unavailable keeps the very first run from failing with a 404
 DELETE my-long-text-index?ignore_unavailable=true

 // One parent document holds several chunks, each chunk carrying its own vector
 PUT my-long-text-index
 {
   "mappings": {
     "properties": {
       "my_long_text_field": {
         "type": "nested", // because there can be multiple vectors per doc
         "properties": {
           "vector": {
             "type": "dense_vector", // the vector used for ranking
             "dims": 2, // every vector indexed below has two components
             "index": true, // the knn searches below need an indexed vector field
             "similarity": "cosine" // stated explicitly instead of relying on the version default
           },
           "text_chunk": {
             "type": "text" // the text from which the vector was created
           }
         }
       }
     }
   }
 }

 // Query vector used by the searches in this file: [5,5]
 // Doc 1: contains the single closest chunk to that query vector
 PUT my-long-text-index/_doc/1
 {
   "my_long_text_field": [
     { "vector": [5, 4], "text_chunk": "doc 1 chunk 1" },
     { "vector": [5, 1], "text_chunk": "doc 1 chunk 2" },
     { "vector": [5, 0], "text_chunk": "doc 1 chunk 3" }
   ]
 }
 // Doc 2: contains the second- and third-closest chunks
 PUT my-long-text-index/_doc/2
 {
   "my_long_text_field": [
     { "vector": [5, 3], "text_chunk": "doc 2 chunk 1" },
     { "vector": [5, 2], "text_chunk": "doc 2 chunk 2" },
     { "vector": [5, 0], "text_chunk": "doc 2 chunk 3" }
   ]
 }
 // Doc 3: no single standout chunk, but its chunks are the closest when taken together
 PUT my-long-text-index/_doc/3
 {
   "my_long_text_field": [
     { "vector": [5, 1.9], "text_chunk": "doc 3 chunk 1" },
     { "vector": [5, 1.8], "text_chunk": "doc 3 chunk 2" },
     { "vector": [5, 1.7], "text_chunk": "doc 3 chunk 3" }
   ]
 }

 // The 2 closest documents to [5,5], each reported with only its best chunk
 GET my-long-text-index/_search
 {
   "_source": false,
   "size": 2, // documents to return
   "knn": {
     "field": "my_long_text_field.vector",
     "query_vector": [5, 5],
     "inner_hits": {
       "_source": false,
       "size": 1, // chunks to return per document
       "fields": ["my_long_text_field.text_chunk"]
     }
   }
 }

 // The single best document for [5,5], reported with its 2 best chunks
 GET my-long-text-index/_search
 {
   "_source": false,
   "size": 1, // documents to return
   "knn": {
     "field": "my_long_text_field.vector",
     "query_vector": [5, 5],
     "inner_hits": {
       "_source": false,
       "size": 2, // chunks to return per document
       "fields": ["my_long_text_field.text_chunk"]
     }
   }
 }
	// Start from a clean slate; ignore_unavailable keeps the very first run from failing with a 404
	DELETE my-long-text-index?ignore_unavailable=true

	// One parent document holds several chunks, each chunk carrying its own vector
	PUT my-long-text-index
	{
	  "mappings": {
	    "properties": {
	      "my_long_text_field": {
	        "type": "nested", // because there can be multiple vectors per doc
	        "properties": {
	          "vector": {
	            "type": "dense_vector", // the vector used for ranking
	            "dims": 2, // every vector indexed below has two components
	            "index": true, // the knn searches below need an indexed vector field
	            "similarity": "cosine" // stated explicitly instead of relying on the version default
	          },
	          "text_chunk": {
	            "type": "text" // the text from which the vector was created
	          }
	        }
	      }
	    }
	  }
	}

	// Query vector used by the searches in this file: [5,5]
	// Doc 1: contains the single closest chunk to that query vector
	PUT my-long-text-index/_doc/1
	{
	  "my_long_text_field": [
	    { "vector": [5, 4], "text_chunk": "doc 1 chunk 1" },
	    { "vector": [5, 1], "text_chunk": "doc 1 chunk 2" },
	    { "vector": [5, 0], "text_chunk": "doc 1 chunk 3" }
	  ]
	}
	// Doc 2: contains the second- and third-closest chunks
	PUT my-long-text-index/_doc/2
	{
	  "my_long_text_field": [
	    { "vector": [5, 3], "text_chunk": "doc 2 chunk 1" },
	    { "vector": [5, 2], "text_chunk": "doc 2 chunk 2" },
	    { "vector": [5, 0], "text_chunk": "doc 2 chunk 3" }
	  ]
	}
	// Doc 3: no single standout chunk, but its chunks are the closest when taken together
	PUT my-long-text-index/_doc/3
	{
	  "my_long_text_field": [
	    { "vector": [5, 1.9], "text_chunk": "doc 3 chunk 1" },
	    { "vector": [5, 1.8], "text_chunk": "doc 3 chunk 2" },
	    { "vector": [5, 1.7], "text_chunk": "doc 3 chunk 3" }
	  ]
	}

	// The 2 closest documents to [5,5], each reported with only its best chunk
	GET my-long-text-index/_search
	{
	  "_source": false,
	  "size": 2, // documents to return
	  "knn": {
	    "field": "my_long_text_field.vector",
	    "query_vector": [5, 5],
	    "inner_hits": {
	      "_source": false,
	      "size": 1, // chunks to return per document
	      "fields": ["my_long_text_field.text_chunk"]
	    }
	  }
	}

	// The single best document for [5,5], reported with its 2 best chunks
	GET my-long-text-index/_search
	{
	  "_source": false,
	  "size": 1, // documents to return
	  "knn": {
	    "field": "my_long_text_field.vector",
	    "query_vector": [5, 5],
	    "inner_hits": {
	      "_source": false,
	      "size": 2, // chunks to return per document
	      "fields": ["my_long_text_field.text_chunk"]
	    }
	  }
	}