// Created
// January 22, 2018 20:55
// -
// -
// Save Andrews54757/fef89705a1fafdc48c85a21823c30987 to your computer and use it in GitHub Desktop.
// Get text from website preserving position
// This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
/**
 * Reports whether an element is taken out of rendering by `display: none`.
 *
 * Only the element's own computed `display` value is checked; other ways of
 * hiding content (such as `visibility` or `opacity`) are not considered.
 *
 * @param {Element} el - Element to inspect.
 * @returns {boolean} `true` when the computed `display` is `'none'`.
 */
function isHidden(el) {
    const style = window.getComputedStyle(el);
    return style.display === 'none';
}
/**
 * Sums an element's offsets along its `offsetParent` chain.
 *
 * At every step the element's own scroll position is subtracted from its
 * offset, so scrolled ancestors shift the result accordingly.
 *
 * @param {?HTMLElement} el - Element to measure; a falsy value yields `{ top: 0, left: 0 }`.
 * @returns {{top: number, left: number}} Accumulated offsets.
 */
function getOffset(el) {
    let left = 0;
    let top = 0;
    let current = el;
    // Stop at the end of the offsetParent chain, or as soon as a node does not
    // expose numeric offsets (e.g. a non-HTML element object).
    while (current && Number.isFinite(current.offsetLeft) && Number.isFinite(current.offsetTop)) {
        left += current.offsetLeft - current.scrollLeft;
        top += current.offsetTop - current.scrollTop;
        current = current.offsetParent;
    }
    return { top, left };
}
/**
 * Recursively collects the visible text under `node` into a sparse 2-D grid
 * keyed by the position of each text node's parent element.
 *
 * Layout of the result: `grid[row][col]` is an array of strings, where
 * `row = floor(top / 10)` and `col = floor(left / 10)` of the offset returned
 * by `getOffset` for the element that directly contains the text.
 *
 * Element children tagged `script`, `link`, `img` or `style` are ignored, as
 * are elements for which `isHidden` returns true (including their subtrees).
 * Text is trimmed and runs of whitespace collapse to a single space;
 * whitespace-only text nodes are dropped.
 *
 * Note: a negative offset produces a negative index, which JavaScript stores
 * as an ordinary property rather than an array element, so such text is not
 * visited by array iteration methods.
 *
 * @param {Node} node - Root whose `childNodes` are traversed.
 * @param {Array} [arr] - Existing grid to append to; a new one is created when omitted.
 * @returns {Array} The grid (the same array as `arr` when one was supplied).
 */
function buildFromNode(node, arr) {
    const CELL_SIZE = 10; // pixels covered by one grid row / column
    const SKIPPED_TAGS = ['script', 'link', 'img', 'style'];
    const ELEMENT_NODE = 1;
    const TEXT_NODE = 3;

    const grid = arr || [];
    // All text children of `node` share its position; read it lazily and only
    // once, because offset reads force layout.
    let position = null;

    for (const child of Array.from(node.childNodes)) {
        if (child.nodeType === ELEMENT_NODE) {
            const tag = child.tagName ? child.tagName.toLowerCase() : '';
            if (SKIPPED_TAGS.includes(tag) || isHidden(child)) {
                continue;
            }
            buildFromNode(child, grid);
        } else if (child.nodeType === TEXT_NODE) {
            const text = child.nodeValue.trim().replace(/\s+/g, ' ');
            if (!text) {
                continue;
            }
            if (position === null) {
                position = getOffset(node);
            }
            const row = Math.floor(position.top / CELL_SIZE);
            const col = Math.floor(position.left / CELL_SIZE);
            if (!grid[row]) {
                grid[row] = [];
            }
            if (!grid[row][col]) {
                grid[row][col] = [];
            }
            grid[row][col].push(text);
        }
    }
    return grid;
}
/**
 * Extracts the text under `node` as rows of words.
 *
 * The grid produced by `buildFromNode` is flattened row by row: the strings
 * in each cell are joined with a space, the cells of a row are joined with a
 * space, and the resulting line is split on single spaces. Empty cells and
 * empty rows are left out, so the result is a dense array.
 *
 * @param {Node} node - Root node to extract text from.
 * @returns {string[][]} One array of words per non-empty grid row, in row order.
 */
function getTextFromNode(node) {
    const grid = buildFromNode(node);
    return grid
        .map((row) => row
            .filter((cell) => cell && cell.length > 0)
            .map((cell) => cell.join(' ')))
        // filter() skips the holes left by the sparse grid and drops empty rows.
        .filter((cells) => cells && cells.length > 0)
        .map((cells) => cells.join(' ').split(' '));
}
// Sign up for free
// to join this conversation on GitHub.
// Already have an account?
// Sign in to comment