Skip to content

Instantly share code, notes, and snippets.

@linglung
Forked from KenDUemura/getTableByXpath.gs
Created October 17, 2018 10:07
Show Gist options
  • Save linglung/73662f8585d1e50b86f2c3b9050c6ed4 to your computer and use it in GitHub Desktop.
Save linglung/73662f8585d1e50b86f2c3b9050c6ed4 to your computer and use it in GitHub Desktop.
Google Apps Script that parses a table element from an HTML string using XmlService and returns its cells as a 2D array
/**
 * Reads the cell text of an HTML table located by a simple XPath-like path.
 *
 * The markup is parsed with XmlService.parse, and every element lookup uses
 * the 'http://www.w3.org/1999/xhtml' namespace, so the document must declare
 * that namespace.
 *
 * Only plain child steps are understood, optionally followed by a 1-based
 * numeric index (e.g. /html/body/table[3]). The last step must name a table.
 * Rows are taken from the table's tbody when one exists, otherwise from the
 * tr elements directly under the table. Only td cells are read; th, thead
 * and tfoot are not supported yet.
 *
 * @param {string} text XML/XHTML source to parse.
 * @param {string} path Path to the table, e.g. "/html/body/table[3]".
 * @returns {Array<Array<string>>} One inner array per tr, holding the text of
 *     each td. Empty when the last path step is not a table.
 * @throws {Error} When a path step cannot be resolved to an element, or when
 *     a bracketed step does not contain a numeric index.
 */
function getDataFromXpath (text, path) {
  var xmlDoc = XmlService.parse(text);
  Logger.log("INFO: xmlDoc \n" + xmlDoc);

  // The root element already is <html>, so that leading step is dropped.
  var tags = path.replace("/html/", "").split("/");
  Logger.log("tags : " + tags);

  var namespace = XmlService.getNamespace('http://www.w3.org/1999/xhtml');
  var element = xmlDoc.getRootElement();

  for (var i = 0; i < tags.length; i++) {
    var tag = tags[i];
    Logger.log("Tag : " + tag);

    // Fail with a readable message instead of a null dereference.
    if (!element) {
      throw new Error("No element found before path step '" + tag + "' in " + path);
    }

    var bracket = tag.indexOf("[");
    if (bracket !== -1) {
      var indexMatch = tag.match(/\[(\d+)\]/);
      if (!indexMatch) {
        throw new Error("Unsupported predicate in path step '" + tag + "'");
      }
      // XPath positions are 1-based; arrays are 0-based.
      var position = parseInt(indexMatch[1], 10);
      element = element.getChildren(tag.substring(0, bracket), namespace)[position - 1];
    } else {
      element = element.getChild(tag, namespace);
    }
    Logger.log(element);
  }

  var data = [];
  var lastTag = tags[tags.length - 1];
  if (!lastTag.match('table')) {
    Logger.log("Unsupported tag type: " + lastTag);
    return data;
  }
  if (!element) {
    throw new Error("No table element found at path " + path);
  }

  Logger.log("Parsing Table");
  // TODO: thead
  // Rows may sit inside a tbody or directly under the table.
  var tbody = element.getChild("tbody", namespace);
  var rows = (tbody || element).getChildren("tr", namespace);
  for (var r = 0; r < rows.length; r++) {
    var cols = rows[r].getChildren("td", namespace);
    var row = [];
    for (var c = 0; c < cols.length; c++) {
      row.push(cols[c].getValue());
    }
    // The original built each row but never stored it, so the result was always empty.
    data.push(row);
  }
  // TODO: tfoot
  return data;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment