Created
January 5, 2022 23:08
-
-
Save biancadanforth/b490f7669cc1d9e1062053504aaa71f3 to your computer and use it in GitHub Desktop.
Fathom article webpage ruleset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This Source Code Form is subject to the terms of the Mozilla Public | |
* License, v. 2.0. If a copy of the MPL was not distributed with this | |
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | |
/* eslint-disable max-len, arrow-body-style */ | |
import {linearScale} from "fathom-web/utilsForFrontend"; | |
import {dom, out, rule, ruleset, score, type} from "fathom-web"; | |
// Per-rule coefficients handed to the ruleset, grouped by the type each rule scores.
// Every entry is a [ruleName, coefficient] pair; the names match the `name` given to
// the corresponding scoring rule in makeRuleset.
const coefficients = {
  paragraph: [
    ["pElementHasListItemAncestor", -2.86763596534729],
    ["hasLongTextContent", 5.575725555419922],
    ["containsElipsisAtEndOfText", -0.13708636164665222],
    ["classNameOfSelfOrParentContainsUnlikelyWord", -2.073239326477051],
  ],
  article: [
    ["hasEnoughParagraphs", -1.0311405658721924],
    ["hasExactlyOneArticleElement", -1.2359271049499512],
    ["paragraphElementsHaveSiblingsWithSameTagName", 12.159211158752441],
    ["mostParagraphElementsAreHorizontallyAligned", 0.5681423544883728],
    ["moreParagraphElementsThanListItemsOrTableRows", -2.6533799171447754],
    ["headerElementIsSiblingToParagraphElements", 12.294110298156738],
    ["hasMultipleArticleElements", -3.300487756729126],
    ["hasMultipleParagraphsWhoseClassNameIncludesArticle", 0.26676997542381287],
  ],
};

// One [typeName, bias] pair per type; passed to the ruleset alongside the coefficients.
const biases = [
  ["paragraph", -4.550228595733643],
  ["article", -2.676619291305542],
];
/** | |
* Fathom ruleset | |
* | |
* These are the features used to extract different types of information on a page (or categorize the entire page). | |
*/ | |
// Module-level caches for expensive results. Each one starts out undefined and is
// filled in on first use by the "article" scoring callbacks further down.
let highestScoringParagraphs;
let numParagraphsInAllDivs;

// Number of characters at which a run of text is treated as a full paragraph.
const MIN_PARAGRAPH_LENGTH = 234; // Optimized with 10 sample pages

// Case-insensitive class-name fragments that count against a paragraph candidate.
const UNLIKELY_WORDS_IN_PARAGRAPH_CLASSNAMES = /comment|caption/i;
// Text nodes are not targetable via document.querySelectorAll (i.e. Fathom's `dom` method), so we instead use | |
// different heuristics based on the child elements contained inside the <div>. | |
/**
 * Estimate how many paragraph-like chunks of text a <div> contains. The ruleset
 * attaches the result to <div> candidates as their "paragraph" note.
 *
 * @param {Object} fnode - Only its `element` property (the <div>) is read.
 * @returns {number} The estimated count. In the <br> branch this is a difference of
 *   counts plus one, so it is not clamped and can be zero or negative.
 */
function numParagraphTextNodesInDiv({element}) {
  // A text node long enough to count as a paragraph on its own.
  const isLongTextNode = (node) => Boolean(node)
    && node.nodeType === Node.TEXT_NODE
    && node.length >= MIN_PARAGRAPH_LENGTH;

  if (!divHasBrChildElement({element})) {
    // No <br> child: the only other divs the ruleset sends here contain just text nodes,
    // <a>s and <span>s, so call it one paragraph if the direct text adds up to enough.
    let totalLength = 0;
    for (const node of Array.from(element.childNodes)) {
      if (node.nodeType === Node.TEXT_NODE) {
        totalLength += node.nodeValue.length;
      }
    }
    return totalLength >= MIN_PARAGRAPH_LENGTH ? 1 : 0;
  }

  // With <br>s present, each <br> is assumed to split two chunks (a paragraph or a list).
  const numLists = element.querySelectorAll("ol").length + element.querySelectorAll("ul").length;
  const numParagraphElements = element.querySelectorAll("p").length;
  // Only count a <br> that touches at least one sufficiently long text node, since
  // pages often sprinkle extra <br>s in purely for spacing.
  const numSeparatingBrs = Array.from(element.querySelectorAll("br")).filter(
    ({previousSibling, nextSibling}) => isLongTextNode(previousSibling) || isLongTextNode(nextSibling)
  ).length;
  return numSeparatingBrs - numLists - numParagraphElements + 1;
}
/**
 * Add up the "paragraph" notes of every <div> among the given fnodes.
 *
 * @param {Array} highestScoringParagraphs - Fnodes exposing `element` and `noteFor`.
 * @returns {number} The summed notes of the DIV fnodes; 0 when there are none.
 */
function getNumParagraphsInAllDivs(highestScoringParagraphs) {
  let total = 0;
  for (const fnode of highestScoringParagraphs) {
    if (fnode.element.tagName === "DIV") {
      total += fnode.noteFor("paragraph");
    }
  }
  return total;
}
// Returns true if an element's center coordinates are somewhere likely to be the main content area of the page. | |
/**
 * Decide whether the centre point of an element's bounding rectangle falls inside the
 * horizontal middle half of the window and at least 200px down from its top edge.
 *
 * @param {Element} element - The element to test.
 * @returns {boolean} True when the centre lies in that region (boundaries included).
 */
function elementIsInTheMainContentArea(element) {
  const MAIN_CONTENT_VERTICAL_CUTOFF = 200; // TODO Optimize
  const rect = element.getBoundingClientRect();
  const xCenter = rect.left + (rect.width / 2);
  const yCenter = rect.top + (rect.height / 2);

  // Middle 50% of the window in the x-direction (TODO: Optimize %).
  const win = element.ownerDocument.defaultView;
  const docLeftCutoff = win.innerWidth / 4;
  const docRightCutoff = 3 * win.innerWidth / 4;

  const isHorizontallyCentral = xCenter >= docLeftCutoff && xCenter <= docRightCutoff;
  const isBelowTopCutoff = yCenter >= MAIN_CONTENT_VERTICAL_CUTOFF;
  return isHorizontallyCentral && isBelowTopCutoff;
}
/** | |
* Positive ``when`` callbacks | |
*/ | |
/**
 * `when` callback: keep only elements that are not explicitly hidden, i.e. whose inline
 * style is not `display: none` and which carry no `hidden` attribute.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} True when neither hiding mechanism is present.
 */
function isElementVisible({element}) {
  // Have to null-check element.style to deal with SVG and MathML nodes.
  const hiddenByInlineStyle = Boolean(element.style) && element.style.display === "none";
  return !hiddenByInlineStyle && !element.hasAttribute("hidden");
}
/**
 * `when` callback: true when every direct child node of the element is a text node,
 * an <a> or a <span>. An element with no child nodes also passes.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} Whether all child nodes are of the allowed kinds.
 */
function divHasOnlyTextNodesAnchorElementsOrSpanElements({element}) {
  const allowedTagNames = ["A", "SPAN"];
  for (const node of Array.from(element.childNodes)) {
    const isAllowed = node.nodeType === Node.TEXT_NODE || allowedTagNames.includes(node.tagName);
    if (!isAllowed) {
      return false;
    }
  }
  return true;
}
/**
 * `when` callback: true when at least one direct child element is a <br>.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} Whether a BR appears among the element's children.
 */
function divHasBrChildElement({element}) {
  const childTagNames = Array.from(element.children, (child) => child.tagName);
  return childTagNames.includes("BR");
}
/** | |
* Negative "paragraph" rules | |
*/ | |
/**
 * Scoring callback: true when the element matches the selector "li p", i.e. it is a
 * <p> with an <li> somewhere among its ancestors.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} The result of the selector match.
 */
function pElementHasListItemAncestor({element}) {
  const paragraphInsideListItem = "li p";
  return element.matches(paragraphInsideListItem);
}
// This probably means this is just a preview of a complete paragraph | |
/**
 * Scoring callback: true when the element's text ends in an ellipsis, which usually
 * marks a teaser rather than a complete paragraph.
 *
 * Both the three-dot form ("...") and the single Unicode character (U+2026) count, and
 * whitespace after the ellipsis is ignored. When `innerText` is not a string,
 * `textContent` is used instead; if neither is a string the result is false.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} Whether the text ends with an ellipsis.
 */
function containsElipsisAtEndOfText({element}) {
  let text = element.innerText;
  if (typeof text !== "string") {
    text = element.textContent;
  }
  if (typeof text !== "string") {
    return false;
  }
  return /(?:\.{3}|\u2026)\s*$/.test(text);
}
// Modeled after toolkit/components/reader/Readability-readerable.js in Firefox | |
/**
 * Scoring callback: true when the class name of the element or of its parent node
 * matches UNLIKELY_WORDS_IN_PARAGRAPH_CLASSNAMES.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} Whether either class name contains an unlikely word.
 */
function classNameOfSelfOrParentContainsUnlikelyWord({element}) {
  const ownClassName = element.className;
  const parentClassName = element.parentNode.className;
  // Test both names in one pass by joining them with a space.
  return UNLIKELY_WORDS_IN_PARAGRAPH_CLASSNAMES.test(`${ownClassName} ${parentClassName}`);
}
/** | |
* Positive "paragraph" rules | |
*/ | |
/**
 * Scoring callback: grade the element by the length of its trimmed text content.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {number} Whatever `linearScale(length, 0, MIN_PARAGRAPH_LENGTH)` yields;
 *   presumably 0 at length 0 rising to 1 at MIN_PARAGRAPH_LENGTH (confirm against
 *   Fathom's `linearScale`).
 */
function hasLongTextContent({element}) {
  const trimmedText = element.textContent.trim();
  return linearScale(trimmedText.length, 0, MIN_PARAGRAPH_LENGTH);
}
/**
 * Ask the ruleset an fnode belongs to for its "paragraph" output.
 *
 * @param {Object} fnode - An fnode whose `_ruleset` exposes `get`.
 * @returns {*} Whatever `fnode._ruleset.get("paragraph")` returns (the callers in this
 *   file treat it as an array of fnodes).
 */
function getHighestScoringParagraphs(fnode) {
  const {_ruleset: boundRuleset} = fnode;
  return boundRuleset.get("paragraph");
}
/** | |
 * Negative "article" rules
*/ | |
// Often homepages of news websites have article previews (i.e. not a single, encapsulated article). | |
/**
 * Scoring callback: true when the element's document holds more than one <article>.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} Whether two or more <article> elements exist in the document.
 */
function hasMultipleArticleElements({element}) {
  const {length: numArticleElements} = element.ownerDocument.querySelectorAll("article");
  return numArticleElements > 1;
}
/**
 * Scoring callback: true when more than one of the highest-scoring paragraphs has a
 * class name containing "article" (case-insensitive).
 *
 * Fills the module-level `highestScoringParagraphs` cache on first use.
 *
 * @param {Object} fnode - Used only to reach the ruleset when the cache is empty.
 * @returns {boolean} Whether at least two such paragraphs exist.
 */
function hasMultipleParagraphsWhoseClassNameIncludesArticle(fnode) {
  if (!highestScoringParagraphs) {
    highestScoringParagraphs = getHighestScoringParagraphs(fnode);
  }
  let numMatchingParagraphs = 0;
  for (const {element} of highestScoringParagraphs) {
    if (element.className.toLowerCase().includes("article")) {
      numMatchingParagraphs += 1;
    }
  }
  return numMatchingParagraphs > 1;
}
/** | |
* Positive "article" rules | |
*/ | |
/**
 * Scoring callback: true when the page has at least 9 paragraphs, counting each
 * highest-scoring paragraph fnode plus the paragraph notes of the <div>s among them.
 *
 * Fills the module-level `highestScoringParagraphs` and `numParagraphsInAllDivs`
 * caches on first use.
 *
 * @param {Object} fnode - Used only to reach the ruleset when the cache is empty.
 * @returns {boolean} Whether the combined paragraph count reaches the threshold.
 */
function hasEnoughParagraphs(fnode) {
  if (!highestScoringParagraphs) {
    highestScoringParagraphs = getHighestScoringParagraphs(fnode);
  }
  // Compare against undefined rather than relying on truthiness: 0 is a legitimate
  // cached count and must not trigger a recomputation.
  if (numParagraphsInAllDivs === undefined) {
    numParagraphsInAllDivs = getNumParagraphsInAllDivs(highestScoringParagraphs);
  }
  return (highestScoringParagraphs.length + numParagraphsInAllDivs) >= 9; // Optimized with 40 training samples
}
/**
 * Scoring callback: true when the element's document holds exactly one <article>.
 *
 * @param {Object} fnode - Only its `element` property is read.
 * @returns {boolean} Whether the document has a single <article> element.
 */
function hasExactlyOneArticleElement({element}) {
  // TODO: May want to award less points the more article elements a page has. Revisit.
  const numArticleElements = element.ownerDocument.querySelectorAll("article").length;
  return numArticleElements === 1;
}
/**
 * Scoring callback: true when the highest-scoring paragraphs have, on average (rounded),
 * at least 3 siblings of the same tag name.
 *
 * For a <div> the sibling count is its "paragraph" note minus one; for any other element
 * it is the number of other children of its parent that share its tag name.
 * Fills the module-level `highestScoringParagraphs` cache on first use.
 *
 * @param {Object} fnode - Used only to reach the ruleset when the cache is empty.
 * @returns {boolean} Whether the rounded average sibling count is 3 or more.
 */
function paragraphElementsHaveSiblingsWithSameTagName(fnode) {
  if (!highestScoringParagraphs) {
    highestScoringParagraphs = getHighestScoringParagraphs(fnode);
  }

  let totalSiblings = 0;
  for (const paragraphFnode of highestScoringParagraphs) {
    const {element} = paragraphFnode;
    if (element.tagName === "DIV") {
      totalSiblings += paragraphFnode.noteFor("paragraph") - 1;
      continue;
    }
    totalSiblings += Array.from(element.parentNode.children).reduce(
      (count, node) => ((node.tagName === element.tagName && node !== element) ? count + 1 : count),
      0
    );
  }

  // With no paragraphs this is 0 / 0 = NaN, and NaN >= 3 is false, so the feature is false.
  const averageSiblings = Math.round(totalSiblings / highestScoringParagraphs.length);
  return averageSiblings >= 3; // Optimized with 40 training samples
}
/**
 * Scoring callback: the fraction of highest-scoring paragraphs that share the most
 * common left edge (as reported by getBoundingClientRect).
 *
 * Fills the module-level `highestScoringParagraphs` cache on first use.
 *
 * @param {Object} fnode - Used only to reach the ruleset when the cache is empty.
 * @returns {number} A ratio in (0, 1], or 0 when there are fewer than two paragraphs.
 */
function mostParagraphElementsAreHorizontallyAligned(fnode) {
  // TODO: Include paragraphs inside divs with brs, see 'getNumParagraphsInAllDivs'
  if (!highestScoringParagraphs) {
    highestScoringParagraphs = getHighestScoringParagraphs(fnode);
  }
  const numParagraphs = highestScoringParagraphs.length;

  // Tally how many paragraphs start at each left position.
  const countsByLeftPosition = new Map();
  for (const {element} of highestScoringParagraphs) {
    const {left} = element.getBoundingClientRect();
    countsByLeftPosition.set(left, (countsByLeftPosition.get(left) || 0) + 1);
  }

  // This rule compares paragraphs with each other, so a page with a single paragraph
  // must not get the maximum score; bailing out here also avoids dividing by 0.
  if (numParagraphs < 2) {
    return 0;
  }
  return Math.max(...countsByLeftPosition.values()) / numParagraphs;
}
/**
 * Scoring callback: true when the highest-scoring paragraphs outnumber both the <tr>
 * elements and the <li> elements located in the main content area of the document.
 *
 * Fills the module-level `highestScoringParagraphs` cache on first use.
 *
 * @param {Object} fnode - Supplies the document via `element.ownerDocument`.
 * @returns {boolean} Whether paragraphs outnumber each of the two other counts.
 */
function moreParagraphElementsThanListItemsOrTableRows(fnode) {
  if (!highestScoringParagraphs) {
    highestScoringParagraphs = getHighestScoringParagraphs(fnode);
  }
  const countInMainContentArea = (elements) => Array.from(elements)
    .filter((node) => elementIsInTheMainContentArea(node))
    .length;

  const doc = fnode.element.ownerDocument;
  const numParagraphElements = highestScoringParagraphs.length;
  const numTableRows = countInMainContentArea(doc.querySelectorAll("tr"));
  const numListItems = countInMainContentArea(doc.getElementsByTagName("li"));
  // TODO: Include paragraphs inside divs with brs, see 'getNumParagraphsInAllDivs'
  // TODO: the greater the difference, the higher the score
  return numParagraphElements > Math.max(numTableRows, numListItems);
}
/**
 * Scoring callback: count the highest-scoring paragraphs that have an <h1> or <h2>
 * sibling and scale that count with `linearScale(count, 4, 11)`.
 *
 * Fills the module-level `highestScoringParagraphs` cache on first use.
 *
 * @param {Object} fnode - Used only to reach the ruleset when the cache is empty.
 * @returns {number} The value returned by `linearScale` for the count.
 */
function headerElementIsSiblingToParagraphElements(fnode) {
  if (!highestScoringParagraphs) {
    highestScoringParagraphs = getHighestScoringParagraphs(fnode);
  }
  const isHeader = ({tagName}) => tagName === "H1" || tagName === "H2";
  const numParagraphsWithHeaderSibling = highestScoringParagraphs.filter(
    ({element}) => Array.from(element.parentNode.children).some(
      (child) => child !== element && isHeader(child)
    )
  ).length;
  // TODO: Include paragraphs inside divs with brs, see 'getNumParagraphsInAllDivs'
  return linearScale(numParagraphsWithHeaderSibling, 4, 11); // oneAt cut-off optimized with 40 samples
}
/**
 * Build the Fathom ruleset that finds "paragraph" elements and scores the page as an
 * "article".
 *
 * @param {Array} coeffs - [ruleName, coefficient] pairs, forwarded to `ruleset`.
 * @param {Array} biases - [typeName, bias] pairs, forwarded to `ruleset`.
 * @returns {*} The value returned by Fathom's `ruleset`.
 */
function makeRuleset(coeffs, biases) {
  // A named scoring rule over every fnode of the given type.
  const scoringRule = (typeName, name, scorer) => rule(type(typeName), score(scorer), {name});

  return ruleset(
    [
      /**
       * Paragraph rules
       */
      // Consider all visible paragraph-ish elements
      rule(dom("p, pre").when(isElementVisible), type("paragraph")),
      rule(
        dom("div").when(isElementVisible).when(divHasBrChildElement),
        type("paragraph").note(numParagraphTextNodesInDiv)
      ),
      rule(
        dom("div").when(isElementVisible).when(divHasOnlyTextNodesAnchorElementsOrSpanElements),
        type("paragraph").note(numParagraphTextNodesInDiv)
      ),
      scoringRule("paragraph", "pElementHasListItemAncestor", pElementHasListItemAncestor),
      scoringRule("paragraph", "hasLongTextContent", hasLongTextContent),
      scoringRule("paragraph", "containsElipsisAtEndOfText", containsElipsisAtEndOfText),
      scoringRule("paragraph", "classNameOfSelfOrParentContainsUnlikelyWord", classNameOfSelfOrParentContainsUnlikelyWord),
      // return paragraph-ish element(s) with max score
      rule(type("paragraph").max(), out("paragraph")),

      /**
       * Article rules
       */
      rule(dom("html"), type("article")),
      scoringRule("article", "hasEnoughParagraphs", hasEnoughParagraphs),
      scoringRule("article", "hasExactlyOneArticleElement", hasExactlyOneArticleElement),
      scoringRule("article", "paragraphElementsHaveSiblingsWithSameTagName", paragraphElementsHaveSiblingsWithSameTagName),
      scoringRule("article", "mostParagraphElementsAreHorizontallyAligned", mostParagraphElementsAreHorizontallyAligned),
      scoringRule("article", "moreParagraphElementsThanListItemsOrTableRows", moreParagraphElementsThanListItemsOrTableRows),
      scoringRule("article", "headerElementIsSiblingToParagraphElements", headerElementIsSiblingToParagraphElements),
      scoringRule("article", "hasMultipleArticleElements", hasMultipleArticleElements),
      scoringRule("article", "hasMultipleParagraphsWhoseClassNameIncludesArticle", hasMultipleParagraphsWhoseClassNameIncludesArticle),
      rule(type("article"), out("article")),
    ],
    coeffs,
    biases
  );
}
/** | |
* FathomFox sends the fathom-trainees extension a ``trainees`` object to execute the Fathom ruleset on the page. | |
*/ | |
// Map from feature name to the trainee description for that feature; this is the
// module's default export.
const trainees = new Map();

const VIEWPORT_SIZE = {width: 1680, height: 950};

const FEATURES = ["paragraph", "article"];

FEATURES.forEach((feature) => {
  trainees.set(feature, {
    // Only this feature's own coefficients are listed here...
    coeffs: new Map(coefficients[feature]),
    viewportSize: VIEWPORT_SIZE,
    vectorType: feature,
    // ...but the ruleset itself is always built with the coefficients of both features.
    rulesetMaker: () => makeRuleset(
      [...coefficients.paragraph, ...coefficients.article],
      biases
    ),
  });
});

export default trainees;
/** | |
* Ruleset development helpers | |
* | |
* These helpers run each Fathom ruleset when the page is loaded; this allows debugging and iterating without | |
* having to use the Vectorizer. These would not ship with the ruleset in the Fathom application. | |
*/ | |
/**
 * Development helper: run a freshly built ruleset against the global `document` and
 * return the elements of its "paragraph" output.
 *
 * @returns {Array} The `element` of each fnode returned for "paragraph".
 */
function getHighestScoringParagraphElements() {
  const rules = makeRuleset([
    ...coefficients.paragraph,
    ...coefficients.article,
  ], biases);
  const results = rules.against(document);
  return results.get("paragraph").map((fnode) => fnode.element);
}
// Compare what the ruleset found for "paragraph" against the elements labelled
// data-fathom='paragraph' in the page, and log the differences.
const highScoringParagraphElementsList = getHighestScoringParagraphElements();
const allParagraphTargetElements = Array.from(document.querySelectorAll("*[data-fathom='paragraph']"));

// Sets give constant-time membership checks, instead of scanning an array per element.
const highScoringParagraphElementSet = new Set(highScoringParagraphElementsList);
const paragraphTargetElementSet = new Set(allParagraphTargetElements);

// target elements that Fathom doesn't find
const falseNegativesParagraphs = allParagraphTargetElements.filter(
  (element) => !highScoringParagraphElementSet.has(element)
);
// candidate elements that Fathom wrongly thinks are targets
const falsePositivesParagraphs = highScoringParagraphElementsList.filter(
  (element) => !paragraphTargetElementSet.has(element)
);

console.log("False Negatives Paragraph: ", falseNegativesParagraphs);
console.log("False Positives Paragraph: ", falsePositivesParagraphs);
/**
 * Development helper: run a freshly built ruleset against the global `document` and
 * return the elements of its "article" output.
 *
 * @returns {Array} The `element` of each fnode returned for "article".
 */
function getHighestScoringArticleElement() {
  const rules = makeRuleset([
    ...coefficients.paragraph,
    ...coefficients.article,
  ], biases);
  const results = rules.against(document);
  return results.get("article").map((fnode) => fnode.element);
}
// Compare what the ruleset found for "article" against the elements labelled
// data-fathom='article' in the page, and log the differences.
const highScoringArticleElementsList = getHighestScoringArticleElement();
const allArticleTargetElements = Array.from(document.querySelectorAll("*[data-fathom='article']"));

// Sets give constant-time membership checks, instead of scanning an array per element.
const highScoringArticleElementSet = new Set(highScoringArticleElementsList);
const articleTargetElementSet = new Set(allArticleTargetElements);

// target elements that Fathom doesn't find
const falseNegativesArticle = allArticleTargetElements.filter(
  (element) => !highScoringArticleElementSet.has(element)
);
// candidate elements that Fathom wrongly thinks are targets
const falsePositivesArticle = highScoringArticleElementsList.filter(
  (element) => !articleTargetElementSet.has(element)
);

console.log("False Negatives Article: ", falseNegativesArticle);
console.log("False Positives Article: ", falsePositivesArticle);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment