Skip to content

Instantly share code, notes, and snippets.

@ukyo
Last active May 14, 2024 05:55
Show Gist options
  • Save ukyo/906764beb36481301405c586abd81011 to your computer and use it in GitHub Desktop.
Save ukyo/906764beb36481301405c586abd81011 to your computer and use it in GitHub Desktop.
maincontent取るくん

Install, build, and take a screenshot:

git clone https://gist.github.com/906764beb36481301405c586abd81011.git
cd 906764beb36481301405c586abd81011
npm install
npm run build
npm run screenshot https://gist.github.com/ukyo/906764beb36481301405c586abd81011
open out.html
// Minimal ambient typings for the "jStat" package.
// Only the members declared below (normal/gamma distributions and their pdf) are typed.
declare module "jStat" {
export = jStat;
}
// Global value that the module declaration above exports.
declare var jStat: JStatStatic;
// Shape of the module export: the distribution factories live under the `jStat` property.
interface JStatStatic {
jStat: {
// Creates a gamma distribution object from a shape and a scale parameter.
gamma(shape: number, scale: number): JStatGamma;
// Creates a normal distribution object from a mean and a standard deviation.
normal(mean: number, std: number): JStatNormal;
}
}
// Gamma distribution object; `pdf` evaluates the density at x.
interface JStatGamma {
pdf(x: number): number;
}
// Normal distribution object; `pdf` evaluates the density at x.
interface JStatNormal {
pdf(x: number): number;
}
{
"name": "maincontent-torukun",
"version": "0.1.0",
"description": "maincontent torukun",
"author": "ukyo",
"license": "ISC",
"scripts": {
"build": "tsc",
"screenshot": "node ./screenshot.js"
},
"dependencies": {
"jStat": "^1.7.1",
"puppeteer": "^0.10.2"
},
"devDependencies": {
"@types/node": "^8.0.28",
"@types/puppeteer": "^0.12.2",
"typescript": "^2.5.2"
}
}
import { createTree, parse } from "./simple-scraper";
import { detectMainContent } from "./simple-maincontents";
import * as fs from "fs";
// Command-line entry point: `node ./screenshot.js <url>`.
// Renders the page, detects the best main-content candidate and writes
// `out.html`, which overlays the candidate's rectangle on `screenshot.png`.
const url = process.argv[2];
(async () => {
  // Without a URL there is nothing to load; report usage instead of passing `undefined` to parse().
  if (!url) {
    console.error("Usage: npm run screenshot <url>");
    process.exitCode = 1;
    return;
  }
  const { tree, documentWidth, documentHeight } = await parse(url, { fullPage: true, path: "screenshot.png" });
  const mainContents = detectMainContent(tree, documentWidth, documentHeight);
  // The detector may filter out every node; destructuring `mainContents[0]` would then throw.
  if (mainContents.length === 0) {
    console.error(`No main content candidate found for ${url}`);
    process.exitCode = 1;
    return;
  }
  const { rect, xpath } = mainContents[0];
  // The xpath can embed element ids taken from the page, so escape it before writing it as HTML text.
  const escapedXpath = xpath
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
  fs.writeFileSync("out.html", `
<div style="position: relative;">
<img src="screenshot.png">
<div style="
position: absolute;
background-color: rgba(126, 185, 255, 0.3);
left: ${rect.x}px;
top: ${rect.y}px;
width: ${rect.width}px;
height: ${rect.height}px;
">${escapedXpath}</div>
</div>
`, "utf8");
})().catch(err => {
  // Surface navigation/launch failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
import { Tree, Rect } from "./simple-scraper";
import { jStat } from "jStat";
// Score breakdown attached to a node by MainContentDetector.
export interface Score {
// Product of xScore, yScore, widthScore, heightScore and domScore.
totalScore: number;
// Score derived from the horizontal centre of the node relative to the document width.
xScore: number;
// Score derived from the top edge of the node relative to the document height.
yScore: number;
// Score derived from the node width relative to the document width.
widthScore: number;
// Score derived from the node height relative to the document height.
heightScore: number;
// Multiplier inherited down the tree; doubled for each node that looks like "main".
domScore: number;
}
// A Tree node that carries the score assigned to it during detection.
export interface ScoredTree extends Tree {
score: Score;
}
// Flat description of one main-content candidate, as returned by the detector.
export interface MainContentSummary {
// nodeName of the candidate element.
nodeName: string;
// XPath of the candidate element, copied from the tree node.
xpath: string;
// Attributes of the candidate element, keyed by attribute name.
attrs: { [key: string]: string};
// Score breakdown computed for the candidate.
score: Score;
// Position and size of the candidate, copied from the tree node.
rect: Rect;
}
// Distribution objects used to rate where a node sits and how wide it is.
// Every input is a ratio (node measurement divided by the document size).
const _xFn = jStat.normal(0.5, 0.35);   // normal(mean, std) for the horizontal centre
const _yFn = jStat.normal(0.05, 0.35);  // normal(mean, std) for the top edge
const _widthFn = jStat.gamma(6.7, 0.11); // gamma(shape, scale) for the width

// Per-dimension rating functions consumed by MainContentDetector.
const distributions = {
  x(ratio: number) {
    return _xFn.pdf(ratio);
  },
  y(ratio: number) {
    // A node starting exactly at the top of the page gets a fixed rating.
    if (ratio === 0) return .5;
    return _yFn.pdf(ratio);
  },
  width(ratio: number) {
    return _widthFn.pdf(ratio);
  },
  height(ratio: number) {
    // Taller is better, capped at 0.9.
    return Math.min(ratio, 0.9);
  },
};
// Roughly chosen weights.
// Each per-dimension rating is raised to the power of its weight (see the Math.pow calls
// in MainContentDetector), so a larger weight makes that dimension more decisive.
const weights = {
x: 1,
y: 1,
width: 2.5,
height: 1,
};
// Maximum difference at which a parent and its only child are treated as the same box.
const SKIP_THRESHOLD = 5;

/**
 * Tells whether a node is a mere wrapper (nested only for design/CMS reasons):
 * it has exactly one child and that child's position and size differ from the
 * node's own by less than SKIP_THRESHOLD on every axis.
 */
function isSkippable(t: Tree): boolean {
  if (t.children.length !== 1) return false;
  const only = t.children[0];
  const deltas = [
    t.x - only.x,
    t.y - only.y,
    t.width - only.width,
    t.height - only.height,
  ];
  return deltas.every(delta => Math.abs(delta) < SKIP_THRESHOLD);
}
/**
 * Tells whether the node's element name could plausibly be the main content.
 * Navigation/landmark, heading, paragraph-level and table-header style elements are excluded.
 */
function isElementNameValid(t: Tree) {
  const excludedNames = /^(NAV|ASIDE|HEADER|FOOTER|H[1-6]|P|BLOCKQUOTE|PRE|A|THEAD|TFOOT|TH|DD|DT|MENU)$/;
  const isExcluded = excludedNames.test(t.nodeName);
  return !isExcluded;
}
/**
 * Tells whether the node looks like the page's main region: either a MAIN element
 * or an element whose id contains "main" (case-insensitive).
 *
 * Always returns a real boolean; a missing or empty id yields `false` rather than
 * leaking `undefined` / `""` out of the predicate.
 */
function isMain(t: Tree): boolean {
  if (t.nodeName === "MAIN") return true;
  const id = t.attrs.id;
  return typeof id === "string" && /main/i.test(id);
}
/**
 * Scores the nodes below a root tree and reports the best main-content candidates.
 * Each node's total score is the product of four geometric ratings (x, y, width, height)
 * and a DOM multiplier that doubles for every node on the path that looks like "main".
 */
class MainContentDetector {
  constructor(
    private root: Tree,
    private documentWidth: number,
    private documentHeight: number,
  ) { }

  /** Rating of the node's horizontal centre, as a fraction of the document width. */
  private calcXScore(t: Tree) {
    const centerRatio = (t.x + t.width / 2) / this.documentWidth;
    return Math.pow(distributions.x(centerRatio), weights.x);
  }

  /** Rating of the node's top edge, as a fraction of the document height. */
  private calcYScore(t: Tree) {
    const topRatio = t.y / this.documentHeight;
    return Math.pow(distributions.y(topRatio), weights.y);
  }

  /** Rating of the node's width, as a fraction of the document width. */
  private calcWidthScore(t: Tree) {
    const widthRatio = t.width / this.documentWidth;
    return Math.pow(distributions.width(widthRatio), weights.width);
  }

  /** Rating of the node's height, as a fraction of the document height. */
  private calcHeightScore(t: Tree) {
    const heightRatio = t.height / this.documentHeight;
    return Math.pow(distributions.height(heightRatio), weights.height);
  }

  /** True when the node covers less than 5% of the document area. */
  private isElementTooSmall(t: Tree) {
    const minimumArea = this.documentWidth * this.documentHeight * 0.05;
    return t.width * t.height < minimumArea;
  }

  /** Keeps nodes with an acceptable element name that are not too small. */
  private filterElement(t: Tree) {
    if (!isElementNameValid(t)) return false;
    return !this.isElementTooSmall(t);
  }

  /**
   * Scores `t` and its descendants in document (pre-)order, appending every scored node to `out`.
   * Wrapper nodes (see isSkippable) are not scored themselves; their only child is scored instead.
   */
  private scoreSubtree(t: ScoredTree, inheritedDomScore: number, out: ScoredTree[]): void {
    const domScore = isMain(t) ? inheritedDomScore * 2 : inheritedDomScore;
    if (isSkippable(t)) {
      this.scoreSubtree(t.children[0], domScore, out);
      return;
    }
    const xScore = this.calcXScore(t);
    const yScore = this.calcYScore(t);
    const widthScore = this.calcWidthScore(t);
    const heightScore = this.calcHeightScore(t);
    const totalScore = xScore * yScore * widthScore * heightScore * domScore;
    // The score is stored on the node itself.
    t.score = { totalScore, xScore, yScore, widthScore, heightScore, domScore };
    out.push(t);
    t.children.forEach(child => this.scoreSubtree(child, domScore, out));
  }

  /**
   * Returns up to `limit` candidates, best total score first.
   * Only the children of the root (not the root itself) are scored.
   */
  detect(limit = 5): MainContentSummary[] {
    const scored: ScoredTree[] = [];
    this.root.children.forEach(child => this.scoreSubtree(child as ScoredTree, 1, scored));

    const summaries = scored
      .filter(candidate => this.filterElement(candidate))
      .map((candidate): MainContentSummary => {
        const { x, y, width, height } = candidate;
        return {
          nodeName: candidate.nodeName,
          xpath: candidate.xpath,
          attrs: candidate.attrs,
          score: candidate.score,
          rect: { x, y, width, height },
        };
      });
    summaries.sort((left, right) => right.score.totalScore - left.score.totalScore);
    return summaries.slice(0, limit);
  }
}
/**
 * Descends through single-child wrappers to find the node where the layout actually branches.
 * Children whose nodeName matches IMG or IFRAME are ignored when counting; descent stops at the
 * first node that does not have exactly one remaining child.
 */
function getRootTree(t: Tree) {
  let current = t;
  for (;;) {
    const candidates = current.children.filter(child => !/IMG|IFRAME/.test(child.nodeName));
    if (candidates.length !== 1) return current;
    current = candidates[0];
  }
}
/**
 * Detects main-content candidates in a layout tree.
 *
 * @param tree Layout tree of the page (as produced by the scraper).
 * @param documentWidth Full document width the tree coordinates refer to.
 * @param documentHeight Full document height the tree coordinates refer to.
 * @param limit Maximum number of candidates to return. Defaults to 10, the value that
 *   was previously hard-coded, so existing three-argument callers are unaffected.
 * @returns Candidates sorted by total score, best first.
 */
export function detectMainContent(tree: Tree, documentWidth: number, documentHeight: number, limit = 10) {
  // Skip single-child wrappers so scoring starts where the layout branches.
  const rootTree = getRootTree(tree);
  const detector = new MainContentDetector(rootTree, documentWidth, documentHeight);
  return detector.detect(limit);
}
import * as puppeteer from "puppeteer";
// One visible element of the page, with its geometry and visible children.
export interface Tree {
// nodeName of the element.
nodeName: string;
// XPath computed for the element.
xpath: string;
// Attributes of the element, keyed by attribute name.
attrs: { [key: string]: string };
// Left/top position and size of the element's (cropped) rectangle.
x: number;
y: number;
width: number;
height: number;
// Child nodes; typed as `this[]` so subtypes get children of their own type.
children: this[];
}
// Plain rectangle: left/top position plus size.
export interface Rect {
x: number;
y: number;
width: number;
height: number;
}
// Options object handed to page.emulate(): viewport size and user-agent string.
interface EmulateOptions {
viewport: {
width: number;
height: number;
};
userAgent: string;
}
/**
 * Waits inside the page for rendering to settle: first a fixed delay of `wait`
 * milliseconds, then the next idle callback (with a 5000 ms timeout).
 */
async function waitRendering(page: puppeteer.Page, wait: number) {
  const settled = await page.evaluate(async delayMs => {
    await new Promise(done => setTimeout(done, delayMs));
    await new Promise(done => (window as any).requestIdleCallback(done, { timeout: 5000 }));
  }, wait);
  return settled;
}
/**
 * Reads the document's scroll size inside the page.
 * @returns `[scrollWidth, scrollHeight]` of `document.documentElement`.
 */
async function getPageSize(page: puppeteer.Page) {
  const size = await page.evaluate(() => {
    const root = document.documentElement;
    return [root.scrollWidth, root.scrollHeight];
  });
  return size as [number, number];
}
/**
 * Grows the emulated viewport until the document no longer gets wider, so the page is
 * laid out at its full size.
 *
 * `options.viewport` is updated in place with the size used for the last emulation.
 *
 * @param page Page to resize.
 * @param options Emulation options; the viewport is overwritten, the user agent is kept.
 * @returns `[width, height]` of the document as last measured. The height always comes
 *   from the most recent measurement, so it matches the page state seen by later
 *   in-page code even when the final emulation changed the document height.
 */
async function fixPageSize(page: puppeteer.Page, options: EmulateOptions): Promise<[number, number]> {
  // Upper bound on resize rounds, in case the page keeps growing with the viewport.
  let attempts = 50;
  await waitRendering(page, 50);
  let [w, h] = await getPageSize(page);
  while (attempts--) {
    // Emulate with the latest measured size (height included, not just width).
    options.viewport.width = w;
    options.viewport.height = h;
    await page.emulate(options);
    await waitRendering(page, 50);
    const [nw, nh] = await getPageSize(page);
    // Record the fresh height before deciding whether to stop; otherwise the value
    // returned on the final round would predate the last emulation.
    h = nh;
    if (w >= nw) break;
    w = nw;
  }
  return [w, h];
}
/**
 * Builds a Tree of the visible elements of the page, starting at document.body.
 * Everything inside the callback runs in the page via page.evaluate; the resulting
 * plain object is returned to the caller as a Tree.
 */
export async function createTree(page: puppeteer.Page) {
return await page.evaluate(() => {
// Runs document.querySelector(selector) and returns null if the selector throws.
// NOTE(review): the `el` parameter is not used; the lookup is always document-wide.
function qs(el: HTMLElement, selector: string) {
try {
return document.querySelector(selector);
} catch (e) {
return null;
}
}
// https://stackoverflow.com/questions/2661818/javascript-get-xpath-of-a-node
// Builds an XPath for the element: `id("...")` when the element's id resolves back to the
// element itself, otherwise the parent's path plus `localname[index]`.
function getXPathForElement(element: Element): string {
// 1-based position of `sib` among its preceding element siblings with the same localName.
const idx = (sib: any, name?: any): any => sib
? idx(sib.previousElementSibling, name || sib.localName) + (sib.localName == name)
: 1;
// Path segments from the top down to `elm`; a non-element ancestor ends the recursion with [''].
const segs = (elm: any): any => !elm || elm.nodeType !== 1
? ['']
: elm.id && qs(elm, `#${elm.id}`) === elm
? [`id("${elm.id}")`]
: [...segs(elm.parentNode), `${elm.localName.toLowerCase()}[${idx(elm)}]`];
return segs(element).join('/');
}
// Wraps one element: decides whether it is visible, computes its cropped rectangle and
// recursively wraps its visible children.
class DomTreeWrapper {
visible: boolean;
rect: Rect;
children: DomTreeWrapper[];
constructor(
private el: HTMLElement,
private parentRect: Rect,
) {
const computed = window.getComputedStyle(el);
const r = el.getBoundingClientRect();
const sourceRect = {
x: r.left,
y: r.top,
width: r.width,
height: r.height,
};
this.visible = this.setupVisible(sourceRect, computed);
// Invisible elements keep `rect` and `children` unset; callers filter them out by `visible`.
if (!this.visible) return;
this.rect = this.setupRect(sourceRect, computed);
// A non-BODY element with overflow hidden/auto/scroll crops its descendants to its own rect;
// otherwise descendants keep being cropped by the inherited parent rect.
const crop = ["overflow", "overflowX", "overflowY"].some((k: any) => this.el.nodeName !== "BODY" && /hidden|auto|scroll/.test(computed[k]));
this.children = this.getVisibleChildren(crop ? this.rect : this.parentRect);
}
// Returns false for elements that should not appear in the tree.
private setupVisible(sourceRect: Rect, computed: CSSStyleDeclaration) {
// Non-rendered / excluded element kinds.
if (/META|SCRIPT|LINK|STYLE|IFRAME/.test(this.el.nodeName)) return false;
// Element is hidden
if (computed.display === "none" ||
computed.visibility === "hidden" ||
computed.visibility === "collapse" ||
(this.el.nodeName === "INPUT" && (this.el as HTMLInputElement).type === "hidden")) {
return false;
}
// Element is fully transparent
if (computed.opacity === "0") return false;
// Negative z-index
if (computed.zIndex && +computed.zIndex < 0) return false;
// Width or height is 0 together with overflow: hidden
let { x, y, width, height } = sourceRect;
if (width === 0 && (computed.overflow === "hidden" || computed.overflowY === "hidden")) return false;
if (height === 0 && (computed.overflow === "hidden" || computed.overflowX === "hidden")) return false;
// Force a width/height measurement: temporarily set the inline position to absolute,
// re-measure, then restore the previous inline value.
const origPosition = this.el.style.position;
this.el.style.position = "absolute";
const r = this.el.getBoundingClientRect();
this.el.style.position = origPosition;
width = r.width;
height = r.height;
// Width or height is still 0
if (width === 0 || height === 0) return false;
// Element lies outside the page bounds (x/y are from the original measurement,
// width/height from the forced one).
const documentWidth = document.documentElement.scrollWidth;
const documentHeight = document.documentElement.scrollHeight;
if (x + width <= 0) return false;
if (x >= documentWidth) return false;
if (y + height <= 0) return false;
if (y >= documentHeight) return false;
return true;
}
// Intersects `rect` with this.parentRect and returns the overlapping rectangle.
private crop(rect: Rect) {
let { x, y, width, height } = rect;
const r = this.parentRect;
const left = Math.max(x, r.x);
const top = Math.max(y, r.y);
const right = Math.min(x + width, r.x + r.width);
const bottom = Math.min(y + height, r.y + r.height);
x = left;
y = top;
width = right - left;
height = bottom - top;
return { x, y, width, height };
}
// Computes the element's rectangle, repairing zero-sized measurements, then crops it.
private setupRect(sourceRect: Rect, computed: CSSStyleDeclaration) {
// Crop by the parent's box (the whole page, or an overflow: auto container, etc.)
let { x, y, width, height } = sourceRect;
// Handle the rare case where width or height comes out as 0
if (width === 0 || height === 0) {
// The position is sometimes off for some reason, so take it from the first visible child.
// Note: this builds wrappers for the children only to read the first one's rect.
const children = this.getVisibleChildren(this.parentRect);
if (children.length) {
const r = children[0].rect;
x = r.x;
y = r.y;
}
// Fix up width and height using the forced (position: absolute) measurement
const origPosition = this.el.style.position;
this.el.style.position = "absolute";
const r = this.el.getBoundingClientRect();
width = r.width;
height = r.height;
this.el.style.position = origPosition;
}
return this.crop({ x, y, width, height });
}
// Copies the element's attributes into a plain name -> value object.
private getAttrs() {
const attrs: { [key: string]: string } = {};
for (let i = 0; i < this.el.attributes.length; ++i) {
const { name, value } = this.el.attributes.item(i);
attrs[name] = value;
}
return attrs;
}
// Wraps every child element (cropped against `rect`) and keeps only the visible ones.
private getVisibleChildren(rect: Rect) {
return [...this.el.children as any as HTMLElement[]]
.map(el => new DomTreeWrapper(el, rect))
.filter(dtw => dtw.visible);
}
// Converts this wrapper and its descendants into plain Tree objects.
exportTree(): Tree {
return {
nodeName: this.el.nodeName,
...this.rect,
attrs: this.getAttrs(),
xpath: getXPathForElement(this.el),
children: this.children.map(child => child.exportTree()),
};
}
}
// Start at <body>, cropping against the full scrollable document area.
const domTreeWrapper = new DomTreeWrapper(document.body, {
x: 0,
y: 0,
width: document.documentElement.scrollWidth,
height: document.documentElement.scrollHeight,
});
return domTreeWrapper.exportTree();
}) as Tree;
}
/**
 * Loads `url` in a headless browser, sizes the viewport to the full document,
 * extracts the layout tree and optionally takes a screenshot.
 *
 * @param url Page to load.
 * @param screenshotOpt When given, passed to `page.screenshot` after the tree is built.
 * @returns The layout tree plus the measured document width and height.
 * @throws Whatever navigation, evaluation or the screenshot rejects with; the browser
 *   is closed in every case so a failure does not leave a browser process behind.
 */
export async function parse(url: string, screenshotOpt?: puppeteer.ScreenshotOptions) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    const options = {
      viewport: {
        width: 1024,
        height: 600,
      },
      userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    };
    await page.emulate(options);
    await page.goto(url);
    const [width, height] = await fixPageSize(page, options);
    const tree = await createTree(page);
    if (screenshotOpt) await page.screenshot(screenshotOpt);
    return {
      tree,
      documentWidth: width,
      documentHeight: height,
    };
  } finally {
    // Always release the browser, including when any step above throws.
    await browser.close();
  }
}
{
"compilerOptions": {
"target": "es2017",
"module": "commonjs",
"outDir": "./",
"rootDir": "./",
"strict": true
},
"include": [
"./"
],
"exclude": [
"node_modules"
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment