Last active
July 13, 2024 00:11
-
-
Save xeraa/c827d265b73560edde2bf5157fa6f95b to your computer and use it in GitHub Desktop.
Include metadata with dense vector chunks in Elasticsearch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create the "audio" index. Each document stores its chunks as a list of
# nested objects under "my_long_content", so every chunk keeps its own
# vector together with the metadata (text, time offset) it belongs to.
PUT audio
{
  "mappings": {
    "properties": {
      "my_long_content": {
        "type": "nested", // because there can be multiple vectors per doc
        "properties": {
          "vector": {
            // NOTE(review): no "dims" or "similarity" is set, so the server defaults apply;
            // confirm the target Elasticsearch version accepts a dense_vector without "dims".
            "type": "dense_vector" // the vector used for ranking
          },
          "text": {
            "type": "text" // the extracted text from which the vector was created
          },
          "time_ms": {
            "type": "long" // offset from the start in ms of the chunk
          }
        }
      }
    }
  }
}
# Index document 1 with two chunks; each chunk carries its own 3-dimensional
# vector, the source text, and its offset in milliseconds.
PUT audio/_doc/1
{
  "my_long_content": [
    {
      "vector": [23, 14, 8],
      "text": "doc 1 chunk 1",
      "time_ms": 0
    },
    {
      "vector": [34, 95, 17],
      "text": "doc 1 chunk 2",
      "time_ms": 34
    }
  ]
}
# Index document 2 with two chunks, using the same chunk layout as document 1
# (vector, source text, offset in milliseconds).
PUT audio/_doc/2
{
  "my_long_content": [
    {
      "vector": [3, 2, 890],
      "text": "doc 2 chunk 1",
      "time_ms": 0
    },
    {
      "vector": [129, 765, 13],
      "text": "doc 2 chunk 2",
      "time_ms": 90
    }
  ]
}
# kNN search over the nested chunk vectors. Only the single best document is
# requested, and "inner_hits" asks for the one best-matching chunk inside it,
# including that chunk's source so its metadata comes back with the hit.
GET audio/_search
{
  "_source": false, // omit the parent document source from the hits
  "size": 1,
  "knn": {
    "field": "my_long_content.vector",
    "query_vector": [35, 90, 20],
    "inner_hits": {
      "_source": true, // return the matching chunk's own source
      "size": 1
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment