Last active
September 1, 2015 19:59
-
-
Save russorat/fdc2264ba54665dfd23a to your computer and use it in GitHub Desktop.
This code is meant to be used on script.google.com in conjuction with a google spreadsheet. It will transform a set of rows and columns (with headers) into json documents and index them in an Elasticsearch cluster.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// If you are running this from a spreadsheet, you can leave this blank and | |
// it will pick up the active spreadsheet. If you are running this from a | |
// new script.google.com project, you should put the url of the google sheet here. | |
var SPREADSHEET_URL = ''; | |
var ES_HOST = { | |
host : '', // for found.io, something.loc.aws.found.io | |
port : 9243, // the port, usually 9200 or 9243 | |
username : '', | |
password : '', | |
use_ssl : true // set to false to use http | |
} | |
// You can enter your own index or use the default | |
// which is the name of the spreadsheet lowercased | |
// and underscored | |
var INDEX = '' || getDefaultIndex(); | |
var INDEX_TYPE = 'spreadsheet'; | |
// If this template does not exist, it will create one | |
// using the DEFULT_TEMPLATE found at the bottom of this doc. | |
// Leave empty to disable template management. | |
var TEMPLATE_NAME = 'google_spreadsheet'; | |
// If you have a lot of data, enable this to use the bulk api | |
// https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html | |
var BULK = true; | |
function sendToES() { | |
var data = getActiveSheet().getDataRange().getValues(); | |
var headers = data.shift(); | |
for(var i in headers) { | |
headers[i] = headers[i].replace(/[^0-9a-zA-Z]/g,'_'); // clean up the column names for index keys | |
headers[i] = headers[i].toLowerCase(); | |
} | |
var bulkList = []; | |
if(TEMPLATE_NAME) { createTemplate(); } | |
for(var i in data) { | |
var row = data[i]; | |
var toInsert = {}; | |
for(var c in row) { | |
toInsert[headers[c]] = row[c]; | |
} | |
if(BULK) { | |
bulkList.push(JSON.stringify({ "index" : { "_index" : INDEX, "_type" : INDEX_TYPE } })); | |
bulkList.push(JSON.stringify(toInsert)); | |
// Don't hit the UrlFetchApp limits of 10MB for POST calls. | |
if(bulkList.length >= 2000) { | |
postDataToES(bulkList.join("\n")+"\n"); | |
bulkList = []; | |
} | |
} else { | |
postDataToES(JSON.stringify(toInsert)); | |
} | |
} | |
if(BULK && bulkList.length > 0) { | |
postDataToES(bulkList.join("\n")+"\n"); | |
} | |
} | |
function postDataToES(data) { | |
var url = [(ES_HOST.use_ssl) ? 'https://' : 'http://', | |
ES_HOST.host,':',ES_HOST.port].join(''); | |
if(BULK) { | |
url += '/_bulk'; | |
} else { | |
url += ['/',INDEX,'/',INDEX_TYPE].join(''); | |
} | |
var options = getDefaultOptions(); | |
options.method = 'POST'; | |
options['payload'] = data; | |
options.headers["Content-Type"] = "application/json"; | |
var resp = UrlFetchApp.fetch(url, options); | |
} | |
function createTemplate() { | |
var url = [(ES_HOST.use_ssl) ? 'https://' : 'http://', | |
ES_HOST.host,':',ES_HOST.port, | |
'/_template/',TEMPLATE_NAME].join('') | |
var options = getDefaultOptions(); | |
options['muteHttpExceptions'] = true; | |
var resp = UrlFetchApp.fetch(url, options); | |
if(resp.getResponseCode() == 404) { | |
options = getDefaultOptions(); | |
options.method = 'POST'; | |
options['payload'] = JSON.stringify(DEFAULT_TEMPLATE); | |
options.headers["Content-Type"] = "application/json"; | |
resp = UrlFetchApp.fetch(url, options); | |
} else { | |
Logger.log('Template already exists'); | |
} | |
} | |
function getDefaultOptions() { | |
var options = { | |
method : 'GET', | |
headers : { }, | |
} | |
if(ES_HOST.username) { | |
options.headers["Authorization"] = "Basic " + Utilities.base64Encode(ES_HOST.username + ":" + ES_HOST.password); | |
} | |
return options; | |
} | |
function getDefaultIndex() { | |
var index_name = getActiveSheet().getParent().getName(); | |
return index_name.replace(/[^0-9a-zA-Z]/g,'_').toLowerCase() | |
} | |
function getActiveSheet() { | |
if(SPREADSHEET_URL == '') { | |
return SpreadsheetApp.getActiveSheet(); | |
} else { | |
return SpreadsheetApp.openByUrl(SPREADSHEET_URL).getActiveSheet(); | |
} | |
} | |
var DEFAULT_TEMPLATE = { | |
"order": 0, | |
"template": INDEX, | |
"settings": { | |
"index.refresh_interval": "5s", | |
"index.analysis.analyzer.default.type": "standard", | |
"index.number_of_replicas": "1", | |
"index.number_of_shards": "1", | |
"index.analysis.analyzer.default.stopwords": "_none_" | |
}, | |
"mappings": { | |
"_default_": { | |
"dynamic_templates": [ | |
{ | |
"string_fields": { | |
"mapping": { | |
"fields": { | |
"{name}": { | |
"index": "analyzed", | |
"omit_norms": true, | |
"type": "string" | |
}, | |
"raw": { | |
"search_analyzer": "keyword", | |
"ignore_above": 256, | |
"index": "not_analyzed", | |
"type": "string" | |
} | |
}, | |
"type": "multi_field" | |
}, | |
"match_mapping_type": "string", | |
"match": "*" | |
} | |
} | |
], | |
"_all": { | |
"enabled": true | |
} | |
} | |
}, | |
"aliases": { | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment