|
import * as puppeteer from "puppeteer"; |
|
|
|
/**
 * One visible element of a page, as exported by `createTree`.
 *
 * The geometry fields are the element's bounding box after it has been
 * cropped against the rectangle of its nearest clipping ancestor.
 */
export interface Tree {
  /** `nodeName` of the DOM element. */
  nodeName: string;
  /** XPath expression locating the element in the document. */
  xpath: string;
  /** Attributes of the element, keyed by attribute name. */
  attrs: Record<string, string>;
  /** Left edge of the bounding box. */
  x: number;
  /** Top edge of the bounding box. */
  y: number;
  /** Width of the bounding box. */
  width: number;
  /** Height of the bounding box. */
  height: number;
  /** Visible child elements, in document order. */
  children: this[];
}
|
|
|
/** An axis-aligned rectangle described by its top-left corner and its size. */
export interface Rect {
  /** Horizontal position of the left edge. */
  x: number;
  /** Vertical position of the top edge. */
  y: number;
  /** Horizontal extent. */
  width: number;
  /** Vertical extent. */
  height: number;
}
|
|
|
/** Settings handed to `page.emulate()`: the viewport size and the user agent string. */
interface EmulateOptions {
  /** Size of the emulated viewport. */
  viewport: {
    width: number;
    height: number;
  };
  /** User agent string reported by the emulated browser. */
  userAgent: string;
}
|
|
|
/**
 * Gives the page time to finish rendering.
 *
 * Inside the page it first sleeps for `wait` milliseconds and then waits for
 * the next idle callback, which is requested with a 5000 ms timeout.
 *
 * @param page Page in which to wait.
 * @param wait Initial delay in milliseconds.
 */
async function waitRendering(page: puppeteer.Page, wait: number) {
  // This function is evaluated inside the page, so it may only use its own
  // argument and browser globals.
  const settle = async (delayMs: number) => {
    await new Promise(done => setTimeout(done, delayMs));
    await new Promise(done => {
      (window as any).requestIdleCallback(done, { timeout: 5000 });
    });
  };
  return await page.evaluate(settle, wait);
}
|
|
|
/**
 * Reads the scrollable size of the document.
 *
 * @param page Page to measure.
 * @returns `[scrollWidth, scrollHeight]` of `document.documentElement`.
 */
async function getPageSize(page: puppeteer.Page) {
  const size = await page.evaluate(() => {
    const root = document.documentElement;
    return [root.scrollWidth, root.scrollHeight];
  });
  return size as [number, number];
}
|
|
|
/**
 * Widens the emulated viewport until the document no longer overflows it
 * horizontally, and reports the resulting document size.
 *
 * The viewport is first set to the measured document size. Then, for at most
 * 50 rounds, the page is re-emulated at the current width, given time to
 * render and measured again; the loop stops as soon as the measured width no
 * longer grows.
 *
 * Note: `options.viewport` is modified in place.
 *
 * @param page    Page to resize.
 * @param options Emulation options; `viewport.width` / `viewport.height` are overwritten.
 * @returns `[width, height]` of the document as last measured.
 */
async function fixPageSize(page: puppeteer.Page, options: EmulateOptions): Promise<[number, number]> {
  const maxAttempts = 50;
  const settleMs = 50;

  await waitRendering(page, settleMs);
  let [w, h] = await getPageSize(page);
  options.viewport.height = h;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    options.viewport.width = w;
    await page.emulate(options);
    await waitRendering(page, settleMs);
    const [nw, nh] = await getPageSize(page);
    // Re-emulating can reflow the page, so always keep the newest height.
    // Updating it only when the width grew would return a height measured
    // before the final layout.
    h = nh;
    if (w >= nw) break;
    w = nw;
  }
  return [w, h];
}
|
|
|
/**
 * Builds a tree of the visible elements of the page, rooted at `document.body`.
 *
 * The whole computation runs inside the page via `page.evaluate`. For every
 * element it decides whether the element is visible, computes its bounding
 * box cropped against the nearest clipping ancestor (or the whole document),
 * and records its node name, attributes and an XPath.
 *
 * Side effect in the page: while measuring, each element's inline `position`
 * is briefly set to `absolute` and then restored.
 *
 * @param page Page to inspect.
 * @returns The exported tree for `document.body`.
 */
export async function createTree(page: puppeteer.Page): Promise<Tree> {
  return await page.evaluate(() => {
    // Elements that are never reported. The pattern is anchored so that
    // elements whose name merely contains one of these words (for example a
    // custom element such as NAV-LINK) are not dropped by accident.
    const NON_RENDERED_TAG = /^(?:META|SCRIPT|LINK|STYLE|IFRAME)$/;

    /** `document.querySelector` that returns `null` instead of throwing on an invalid selector. */
    function qs(selector: string) {
      try {
        return document.querySelector(selector);
      } catch (e) {
        return null;
      }
    }

    // https://stackoverflow.com/questions/2661818/javascript-get-xpath-of-a-node
    /**
     * Returns an XPath for `element`: `id("...")` when the element has an id
     * that `#id` resolves back to it, otherwise the parent's path followed by
     * `name[n]`, where `n` is the 1-based position among same-named siblings.
     */
    function getXPathForElement(element: Element): string {
      // 1 + number of preceding element siblings sharing the same localName.
      const idx = (sib: any, name?: any): any => sib
        ? idx(sib.previousElementSibling, name || sib.localName) + (sib.localName == name)
        : 1;
      const segs = (elm: any): any => !elm || elm.nodeType !== 1
        ? ['']
        : elm.id && qs(`#${elm.id}`) === elm
          ? [`id("${elm.id}")`]
          : [...segs(elm.parentNode), `${elm.localName.toLowerCase()}[${idx(elm)}]`];
      return segs(element).join('/');
    }

    /**
     * Measures `el` while its inline `position` is forced to `absolute`, then
     * restores the previous inline value.
     */
    function measureAsAbsolute(el: HTMLElement) {
      const hadStyleAttr = el.hasAttribute("style");
      const origPosition = el.style.position;
      el.style.position = "absolute";
      try {
        return el.getBoundingClientRect();
      } finally {
        el.style.position = origPosition;
        // Writing through `el.style` creates a `style` attribute. Remove it
        // again when the element had none, so that the exported attributes
        // do not contain a spurious empty `style`.
        if (!hadStyleAttr) el.removeAttribute("style");
      }
    }

    /**
     * Wraps one element and, when it is visible, its visible descendants.
     * `rect` and `children` are only assigned for visible elements.
     */
    class DomTreeWrapper {
      visible: boolean;
      rect: Rect;
      children: DomTreeWrapper[];

      /**
       * @param el         Element to wrap.
       * @param parentRect Rectangle this element's box is cropped against.
       */
      constructor(
        private el: HTMLElement,
        private parentRect: Rect,
      ) {
        const computed = window.getComputedStyle(el);
        const r = el.getBoundingClientRect();
        const sourceRect = {
          x: r.left,
          y: r.top,
          width: r.width,
          height: r.height,
        };
        this.visible = this.setupVisible(sourceRect, computed);
        if (!this.visible) return;
        this.rect = this.setupRect(sourceRect);
        // Any element other than BODY whose overflow is hidden/auto/scroll
        // becomes the crop rectangle for its descendants.
        const clipsChildren = this.el.nodeName !== "BODY" &&
          [computed.overflow, computed.overflowX, computed.overflowY]
            .some(value => /hidden|auto|scroll/.test(value));
        this.children = this.getVisibleChildren(clipsChildren ? this.rect : this.parentRect);
      }

      /** Decides whether the element should be part of the tree. */
      private setupVisible(sourceRect: Rect, computed: CSSStyleDeclaration) {
        if (NON_RENDERED_TAG.test(this.el.nodeName)) return false;
        // The element is not displayed.
        if (computed.display === "none" ||
          computed.visibility === "hidden" ||
          computed.visibility === "collapse" ||
          (this.el.nodeName === "INPUT" && (this.el as HTMLInputElement).type === "hidden")) {
          return false;
        }
        // The element is fully transparent.
        if (computed.opacity === "0") return false;
        // The element has a negative z-index.
        if (computed.zIndex && +computed.zIndex < 0) return false;
        // Zero width or height combined with overflow: hidden. Either axis
        // counts: a collapsed box that hides overflow shows none of its
        // content, whichever axis carries the `hidden`.
        const { x, y } = sourceRect;
        const hidesOverflow = computed.overflow === "hidden" ||
          computed.overflowX === "hidden" ||
          computed.overflowY === "hidden";
        if ((sourceRect.width === 0 || sourceRect.height === 0) && hidesOverflow) return false;
        // Force a measurable width and height.
        const { width, height } = measureAsAbsolute(this.el);
        // Width or height is still zero.
        if (width === 0 || height === 0) return false;
        // The element lies outside the page.
        const documentWidth = document.documentElement.scrollWidth;
        const documentHeight = document.documentElement.scrollHeight;
        if (x + width <= 0) return false;
        if (x >= documentWidth) return false;
        if (y + height <= 0) return false;
        if (y >= documentHeight) return false;
        return true;
      }

      /**
       * Intersects `rect` with `parentRect`. When the two do not overlap the
       * returned width and/or height is negative.
       */
      private crop(rect: Rect) {
        const r = this.parentRect;
        const left = Math.max(rect.x, r.x);
        const top = Math.max(rect.y, r.y);
        const right = Math.min(rect.x + rect.width, r.x + r.width);
        const bottom = Math.min(rect.y + rect.height, r.y + r.height);
        return { x: left, y: top, width: right - left, height: bottom - top };
      }

      /** Computes the element's box, cropped by the parent box (whole page, or an overflow: auto etc. ancestor). */
      private setupRect(sourceRect: Rect) {
        let { x, y, width, height } = sourceRect;
        // Handle the rare elements whose width or height comes out as zero.
        if (width === 0 || height === 0) {
          // Their position is sometimes off, so take it from the first visible child.
          const children = this.getVisibleChildren(this.parentRect);
          if (children.length) {
            const r = children[0].rect;
            x = r.x;
            y = r.y;
          }
          // Correct the width and height.
          const forced = measureAsAbsolute(this.el);
          width = forced.width;
          height = forced.height;
        }
        return this.crop({ x, y, width, height });
      }

      /** Collects the element's attributes into a plain name → value object. */
      private getAttrs() {
        const attrs: { [key: string]: string } = {};
        for (const { name, value } of Array.from(this.el.attributes)) {
          attrs[name] = value;
        }
        return attrs;
      }

      /** Wraps every child element, cropping against `rect`, and keeps the visible ones. */
      private getVisibleChildren(rect: Rect) {
        return (Array.from(this.el.children) as HTMLElement[])
          .map(el => new DomTreeWrapper(el, rect))
          .filter(dtw => dtw.visible);
      }

      /** Converts this wrapper and its visible descendants into plain `Tree` data. */
      exportTree(): Tree {
        return {
          nodeName: this.el.nodeName,
          ...this.rect,
          attrs: this.getAttrs(),
          xpath: getXPathForElement(this.el),
          children: this.children.map(child => child.exportTree()),
        };
      }
    }

    // The root is cropped against the whole scrollable document.
    const domTreeWrapper = new DomTreeWrapper(document.body, {
      x: 0,
      y: 0,
      width: document.documentElement.scrollWidth,
      height: document.documentElement.scrollHeight,
    });
    return domTreeWrapper.exportTree();
  }) as Tree;
}
|
|
|
/**
 * Opens `url` in a freshly launched browser, grows the viewport to fit the
 * document, and extracts the tree of visible elements.
 *
 * The browser is always closed before this function settles, including when
 * navigation, tree extraction or the screenshot fails.
 *
 * @param url           Address of the page to load.
 * @param screenshotOpt When given, a screenshot is taken with these options
 *                      after the tree has been built.
 * @returns The element tree plus the document width and height.
 */
export async function parse(url: string, screenshotOpt?: puppeteer.ScreenshotOptions) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    const options = {
      viewport: {
        width: 1024,
        height: 600,
      },
      userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    };
    await page.emulate(options);
    await page.goto(url);
    const [width, height] = await fixPageSize(page, options);
    const tree = await createTree(page);
    if (screenshotOpt) await page.screenshot(screenshotOpt);
    return {
      tree,
      documentWidth: width,
      documentHeight: height,
    };
  } finally {
    // Without this, any failure above would leave the browser process running.
    await browser.close();
  }
}