Skip to content

Instantly share code, notes, and snippets.

@smnh
Last active December 25, 2023 07:52
Show Gist options
  • Save smnh/30f96028511e1440b7b02ea559858af4 to your computer and use it in GitHub Desktop.
Save smnh/30f96028511e1440b7b02ea559858af4 to your computer and use it in GitHub Desktop.
Function for flattening data before indexing it to Elasticsearch (http://smnh.me/indexing-and-searching-arbitrary-json-data-using-elasticsearch)
const _ = require('lodash');
// Public API of this module: flattenData is the only exported name.
module.exports = {
flattenData
};
/**
 * Flattens arbitrary JSON-like data into a flat array of objects, each having four fields:
 * - "key": the dot-separated path of the field in the original object
 * - "type": the detected type of the field value, one of
 *   "string", "date", "long", "float", "boolean" or "null"
 * - "key_type": the key and the type concatenated by a "." (for faster aggregations)
 * - "value_{type}": the value of the field. The name of this field is created by concatenating
 *   the string "value_" with the value of the type field (e.g.: value_string, value_float,
 *   value_long, etc.).
 *
 * This is to deal with Elasticsearch dynamic field mapping while indexing documents with arbitrary data.
 * @see {@link http://smnh.me/indexing-and-searching-arbitrary-json-data-using-elasticsearch Indexing and Searching Arbitrary JSON Data using Elasticsearch}
 * @see {@link https://www.elastic.co/blog/great-mapping-refactoring The Great Mapping Refactoring}
 *
 * Array values are flattened as well, but they do not add any additional part to the "key", thus
 * conforming to the multi-value fields nature of Elasticsearch. If an array has values of different
 * types, its values will be grouped in separate objects by their types.
 *
 * For example calling:
 *   flattenData({
 *     'key1': 'val1',
 *     'key2': true,
 *     'key3': {
 *       'innerKey1': 1,
 *       'innerKey2': 1.5
 *     },
 *     'key4': ['val2', 'val3', 3, {'key5': 'val4'}, {'key5': 'val5'}, {'key5': 5}],
 *     'key6': [{'key7': ['val6', 'val7']}, {'key7': ['val8', 'val9']}]
 *   })
 * Will produce:
 *   [
 *     {"key": "key1", "type": "string", "key_type": "key1.string", "value_string": "val1"},
 *     {"key": "key2", "type": "boolean", "key_type": "key2.boolean", "value_boolean": true},
 *     {"key": "key3.innerKey1", "type": "long", "key_type": "key3.innerKey1.long", "value_long": 1},
 *     {"key": "key3.innerKey2", "type": "float", "key_type": "key3.innerKey2.float", "value_float": 1.5},
 *     {"key": "key4", "type": "string", "key_type": "key4.string", "value_string": ["val2", "val3"]},
 *     {"key": "key4", "type": "long", "key_type": "key4.long", "value_long": [3]},
 *     {"key": "key4.key5", "type": "string", "key_type": "key4.key5.string", "value_string": ["val4", "val5"]},
 *     {"key": "key4.key5", "type": "long", "key_type": "key4.key5.long", "value_long": [5]},
 *     {"key": "key6.key7", "type": "string", "key_type": "key6.key7.string", "value_string": ["val6", "val7", "val8", "val9"]}
 *   ]
 *
 * Root scalar values, or root arrays of scalar values will have empty string "key":
 *   flattenData('stringValue') => [{"key": "", "type": "string", "key_type": ".string", "value_string": "stringValue"}]
 *   flattenData(['val1', 'val2', 10, 20]) => [
 *     {"key": "", "type": "string", "key_type": ".string", "value_string": ["val1", "val2"]},
 *     {"key": "", "type": "long", "key_type": ".long", "value_long": [10, 20]}
 *   ]
 *
 * Type detection for scalars:
 * - strings matching YYYY-MM-DD with an optional "Thh:mm:ss", optional fraction and optional
 *   "Z" / "+hh:mm" offset are typed "date", every other string is typed "string";
 * - integral numbers within the signed 64-bit range [-2^63, 2^63) are typed "long", every other
 *   number (fractions, out-of-range integers, NaN, +/-Infinity) is typed "float";
 * - null is typed "null";
 * - values of any other type (undefined, functions, non-plain objects, ...) are dropped.
 *
 * @param {*} data - the value to flatten
 * @param {string} [prefix=""] - key path accumulated so far; for internal (recursive) use
 * @returns {Array.<Object>} the flat list of {key, type, key_type, value_{type}} objects
 */
function flattenData(data, prefix = "") {
    if (_.isPlainObject(data)) {
        // Parse plain object recursively by extending prefixes with property keys
        const prefixDot = (prefix ? prefix + '.' : '');
        return _.transform(data, (accumulator, value, key) => {
            // Push one by one: passing a huge array through apply()/spread can exceed the
            // engine's argument-count limit and throw a RangeError.
            for (const entry of flattenData(value, prefixDot + key)) {
                accumulator.push(entry);
            }
        }, []);
    }

    if (_.isArray(data)) {
        // key -> type -> collected values. Null-prototype dictionaries are required here: keys
        // come from user data, and with a plain {} a key such as "constructor" or "__proto__"
        // would be "found" on Object.prototype, losing the entry and polluting built-ins.
        const valuesByKeyAndType = Object.create(null);
        for (const item of data) {
            for (const entry of flattenData(item, prefix)) {
                const key = entry.key;
                const type = entry.type;
                if (!(key in valuesByKeyAndType)) {
                    valuesByKeyAndType[key] = Object.create(null);
                }
                const valuesByType = valuesByKeyAndType[key];
                if (!(type in valuesByType)) {
                    valuesByType[type] = [];
                }
                const bucket = valuesByType[type];
                // A nested array already produced an array of values, a scalar produced a single
                // value; castArray normalizes both so they merge into one multi-value field.
                for (const value of _.castArray(entry[flatDataValueKey(type)])) {
                    bucket.push(value);
                }
            }
        }

        const grouped = [];
        for (const key of Object.keys(valuesByKeyAndType)) {
            for (const type of Object.keys(valuesByKeyAndType[key])) {
                grouped.push(flatDataObject(key, type, valuesByKeyAndType[key][type]));
            }
        }
        return grouped;
    }

    let result = null;
    if (typeof data === "string") {
        if (/^\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?)?$/.test(data)) {
            // Date (strict_date_optional_time)
            result = flatDataObject(prefix, 'date', data);
        } else {
            // String
            result = flatDataObject(prefix, 'string', data);
        }
    } else if (typeof data === "number") {
        // Elasticsearch "long" is a signed 64-bit integer: [-2^63, 2^63 - 1].
        // Both bounds below are exactly representable as doubles.
        const longBound = Math.pow(2, 63);
        if (Number.isInteger(data) && data >= -longBound && data < longBound) {
            // Long
            result = flatDataObject(prefix, 'long', data);
        } else {
            // Float (also integral values too large for a long, NaN and +/-Infinity)
            result = flatDataObject(prefix, 'float', data);
        }
    } else if (typeof data === "boolean") {
        // Boolean
        result = flatDataObject(prefix, 'boolean', data);
    } else if (data === null) {
        // Null
        // We have defined "null_value" to be of type boolean mapped to false value
        // https://www.elastic.co/guide/en/elasticsearch/reference/current/null-value.html
        result = flatDataObject(prefix, 'null', data);
    } else {
        // If you expect to have any other types, make sure to process them here
        // as well as adding them to Elasticsearch index.
    }
    return result ? [result] : [];
}
/**
 * Builds a single flat entry for the given key path, type name and value.
 *
 * @param {string} key - path of the field
 * @param {string} type - type name of the value
 * @param {*} value - the value (or array of values) stored under "value_{type}"
 * @returns {Object} object with the fields "key", "type", "key_type" and "value_{type}"
 */
function flatDataObject(key, type, value) {
    const entry = { key, type };
    entry.key_type = flatDataKeyTypeValue(key, type);
    // The name of the value field depends on the type, e.g. "value_string".
    entry[flatDataValueKey(type)] = value;
    return entry;
}
/**
 * Produces the "key_type" value: the key and the type joined by a ".".
 *
 * @param {string} key - path of the field
 * @param {string} type - type name of the value
 * @returns {string} e.g. "key3.innerKey1.long"
 */
function flatDataKeyTypeValue(key, type) {
    return `${key}.${type}`;
}
/**
 * Produces the name of the field that holds the value for the given type.
 *
 * @param {string} type - type name of the value
 * @returns {string} "value_" followed by the type, e.g. "value_string"
 */
function flatDataValueKey(type) {
    return `value_${type}`;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment