Last active
December 25, 2023 07:52
-
-
Save smnh/30f96028511e1440b7b02ea559858af4 to your computer and use it in GitHub Desktop.
Function for flattening data before indexing it to Elasticsearch (http://smnh.me/indexing-and-searching-arbitrary-json-data-using-elasticsearch)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const _ = require('lodash'); | |
module.exports = { | |
flattenData | |
}; | |
/**
 * Flattens arbitrary JSON-like data into a flat array of objects having four fields:
 *  - "key": the path of the field in the original object (parts joined by ".")
 *  - "type": the type of the field value ("string", "date", "long", "float", "boolean" or "null")
 *  - "key_type": the key and the type concatenated by a "." (for faster aggregations)
 *  - "value_{type}": the value of the field. The name of this field is created by concatenating the string "value_"
 *    with the value of the type field (e.g.: value_string, value_float, value_long, etc.).
 *
 * This is to deal with Elasticsearch dynamic field mapping while indexing documents with arbitrary data.
 * @see {@link http://smnh.me/indexing-and-searching-arbitrary-json-data-using-elasticsearch Indexing and Searching Arbitrary JSON Data using Elasticsearch}
 * @see {@link https://www.elastic.co/blog/great-mapping-refactoring The Great Mapping Refactoring}
 *
 * Array values are flattened as well, but they do not add any additional part to the "key", thus conforming to the
 * multi-value fields nature of Elasticsearch. If an array has values of different types, its values will be
 * grouped in separate objects by their types.
 *
 * For example calling:
 *     flattenData({
 *         'key1': 'val1',
 *         'key2': true,
 *         'key3': {
 *             'innerKey1': 1,
 *             'innerKey2': 1.5
 *         },
 *         'key4': ['val2', 'val3', 3, {'key5': 'val4'}, {'key5': 'val5'}, {'key5': 5}],
 *         'key6': [{'key7': ['val6', 'val7']}, {'key7': ['val8', 'val9']}]
 *     })
 * Will produce:
 *     [
 *         {"key": "key1", "type": "string", "key_type": "key1.string", "value_string": "val1"},
 *         {"key": "key2", "type": "boolean", "key_type": "key2.boolean", "value_boolean": true},
 *         {"key": "key3.innerKey1", "type": "long", "key_type": "key3.innerKey1.long", "value_long": 1},
 *         {"key": "key3.innerKey2", "type": "float", "key_type": "key3.innerKey2.float", "value_float": 1.5},
 *         {"key": "key4", "type": "string", "key_type": "key4.string", "value_string": ["val2", "val3"]},
 *         {"key": "key4", "type": "long", "key_type": "key4.long", "value_long": [3]},
 *         {"key": "key4.key5", "type": "string", "key_type": "key4.key5.string", "value_string": ["val4", "val5"]},
 *         {"key": "key4.key5", "type": "long", "key_type": "key4.key5.long", "value_long": [5]},
 *         {"key": "key6.key7", "type": "string", "key_type": "key6.key7.string", "value_string": ["val6", "val7", "val8", "val9"]}
 *     ]
 *
 * Root scalar values, or root arrays of scalar values will have empty string "key":
 *     flattenData('stringValue') => [{"key": "", "type": "string", "key_type": ".string", "value_string": "stringValue"}]
 *     flattenData(['val1', 'val2', 10, 20]) => [
 *         {"key": "", "type": "string", "key_type": ".string", "value_string": ["val1", "val2"]},
 *         {"key": "", "type": "long", "key_type": ".long", "value_long": [10, 20]}
 *     ]
 *
 * Values that are neither plain objects, arrays, strings, numbers, booleans nor null (e.g. undefined or
 * functions) produce no output entries.
 *
 * @param {*} data - the value to flatten
 * @param {string} [prefix=""] - key path accumulated so far; for internal (recursive) use
 * @returns {Array.<Object>} flat list of {key, type, key_type, value_{type}} objects
 */
function flattenData(data, prefix = "") {
    if (_.isPlainObject(data)) {
        // Parse plain object recursively by extending prefixes with property keys
        const prefixDot = prefix ? `${prefix}.` : '';
        const flattened = [];
        for (const [key, value] of Object.entries(data)) {
            // Push one by one: spreading/applying a very large array as call arguments can throw a RangeError.
            for (const entry of flattenData(value, prefixDot + key)) {
                flattened.push(entry);
            }
        }
        return flattened;
    }

    if (Array.isArray(data)) {
        // key -> type -> collected values.
        // The dictionaries have a null prototype so that data keys such as "constructor", "toString" or
        // "__proto__" are stored as ordinary entries instead of resolving to members of Object.prototype
        // (which would silently drop the values and write onto built-in objects).
        const valuesByKeyAndType = Object.create(null);
        for (const item of data) {
            for (const entry of flattenData(item, prefix)) {
                const { key, type } = entry;
                if (!(key in valuesByKeyAndType)) {
                    valuesByKeyAndType[key] = Object.create(null);
                }
                const valuesByType = valuesByKeyAndType[key];
                if (!(type in valuesByType)) {
                    valuesByType[type] = [];
                }
                // Nested arrays already produced an array of values; scalars produced a single value.
                const value = entry[flatDataValueKey(type)];
                if (Array.isArray(value)) {
                    for (const element of value) {
                        valuesByType[type].push(element);
                    }
                } else {
                    valuesByType[type].push(value);
                }
            }
        }

        // Emit one entry per (key, type) pair holding all values collected for that pair.
        const grouped = [];
        for (const key of Object.keys(valuesByKeyAndType)) {
            for (const type of Object.keys(valuesByKeyAndType[key])) {
                grouped.push(flatDataObject(key, type, valuesByKeyAndType[key][type]));
            }
        }
        return grouped;
    }

    let result = null;
    if (typeof data === "string") {
        // Only the textual shape is checked here, not whether the date is a valid calendar date.
        if (/^\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?$/.test(data)) {
            // Date (strict_date_optional_time)
            result = flatDataObject(prefix, 'date', data);
        } else {
            // String
            result = flatDataObject(prefix, 'string', data);
        }
    } else if (typeof data === "number") {
        if (Number.isInteger(data)) {
            // Long
            result = flatDataObject(prefix, 'long', data);
        } else {
            // Float (also covers NaN and +/-Infinity, which are not integers)
            result = flatDataObject(prefix, 'float', data);
        }
    } else if (typeof data === "boolean") {
        // Boolean
        result = flatDataObject(prefix, 'boolean', data);
    } else if (data === null) {
        // Null
        // Original author's note: the index mapping defines "null_value" to be of type boolean mapped to
        // false value (the mapping itself is not part of this file).
        // https://www.elastic.co/guide/en/elasticsearch/reference/current/null-value.html
        result = flatDataObject(prefix, 'null', data);
    }
    // Any other type is ignored. If you expect to have other types, make sure to process them here
    // as well as adding them to the Elasticsearch index.
    return result ? [result] : [];
}
/**
 * Builds a single flattened entry.
 *
 * @param {string} key - path of the field in the original data
 * @param {string} type - type name of the value (e.g. "string", "long")
 * @param {*} value - the value, or array of values, stored under the "value_{type}" field
 * @returns {Object} object with the fields "key", "type", "key_type" and "value_{type}"
 */
function flatDataObject(key, type, value) {
    return {
        key,
        type,
        key_type: flatDataKeyTypeValue(key, type),
        // The name of the value field depends on the type, e.g. "value_string" or "value_long".
        [flatDataValueKey(type)]: value
    };
}
/**
 * Builds the value of the "key_type" field: the key and the type joined by a ".".
 *
 * @param {string} key - path of the field
 * @param {string} type - type name of the value
 * @returns {string} e.g. "key3.innerKey1.long"; for an empty key the result starts with "." (".string")
 */
function flatDataKeyTypeValue(key, type) {
    return `${key}.${type}`;
}
/**
 * Builds the name of the field that holds the value for the given type.
 *
 * @param {string} type - type name of the value (e.g. "string", "float")
 * @returns {string} the type prefixed with "value_", e.g. "value_string"
 */
function flatDataValueKey(type) {
    return `value_${type}`;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment