Readability: Turn web page into plain text and image (from yarr)
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package readability

import (
    "bytes"
    "errors"
    "fmt"
    "io"
    "math"
    "regexp"
    "strings"

    "github.com/nkanaev/yarr/src/content/htmlutil"
    "golang.org/x/net/html"
)

const (
    defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
)

var (
    divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
    sentenceRegexp       = regexp.MustCompile(`\.( |$)`)

    blacklistCandidatesRegexp  = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
    okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
    unlikelyCandidatesRegexp   = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)

    negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
    positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
)
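
// Illustrative note (added, not part of the original source): the patterns
// above are matched against an element's concatenated class and id attributes.
// For example, class="sidebar" matches unlikelyCandidatesRegexp and the
// element is dropped, while class="main-sidebar" also matches
// okMaybeItsACandidateRegexp ("main") and survives; class="article-body" adds
// 25 to getClassWeight, and class="footer" subtracts 25.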

type nodeScores map[*html.Node]float32

// ExtractContent returns relevant content.
func ExtractContent(page io.Reader) (string, error) {
    root, err := html.Parse(page)
    if err != nil {
        return "", err
    }

    for _, trash := range htmlutil.Query(root, "script,style") {
        if trash.Parent != nil {
            trash.Parent.RemoveChild(trash)
        }
    }

    transformMisusedDivsIntoParagraphs(root)
    removeUnlikelyCandidates(root)

    scores := getCandidates(root)
    //log.Printf("[Readability] Candidates: %v", scores)

    best := getTopCandidate(scores)
    if best == nil {
        for _, body := range htmlutil.Query(root, "body") {
            best = body
            break
        }
        if best == nil {
            return "", errors.New("failed to extract content")
        }
    }
    //log.Printf("[Readability] TopCandidate: %v", best)

    output := getArticle(best, scores)
    return output, nil
}
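
// exampleExtractContent is a usage sketch added for illustration; it is not
// part of the original yarr source, and the HTML below is made up. It feeds a
// small hand-written page into ExtractContent and prints the extracted markup,
// which should be the article content without the sidebar.
func exampleExtractContent() {
    page := strings.NewReader(`<html><body>
        <div class="article"><p>` + strings.Repeat("A readable sentence, plain and long enough to be scored. ", 5) + `</p></div>
        <div class="sidebar"><a href="/elsewhere">related link</a></div>
    </body></html>`)

    content, err := ExtractContent(page)
    if err != nil {
        fmt.Println("extract failed:", err)
        return
    }
    fmt.Println(content)
}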

// Now that we have the top candidate, look through its siblings for content
// that might also be related. Things like preambles, content split by ads
// that we removed, etc.
func getArticle(best *html.Node, scores nodeScores) string {
    output := bytes.NewBufferString("<div>")
    siblingScoreThreshold := float32(math.Max(10, float64(scores[best]*.2)))

    nodelist := make([]*html.Node, 0)
    nodelist = append(nodelist, best)

    // Get the candidate's siblings
    for n := best.NextSibling; n != nil; n = n.NextSibling {
        nodelist = append(nodelist, n)
    }
    for n := best.PrevSibling; n != nil; n = n.PrevSibling {
        nodelist = append(nodelist, n)
    }

    for _, node := range nodelist {
        append := false
        isP := node.Data == "p"
        if node == best {
            append = true
        } else if scores[node] >= siblingScoreThreshold {
            append = true
        } else {
            if isP {
                linkDensity := getLinkDensity(node)
                content := htmlutil.Text(node)
                contentLength := len(content)
                if contentLength >= 80 && linkDensity < .25 {
                    append = true
                } else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
                    append = true
                }
            }
        }
        if append {
            tag := "div"
            if isP {
                tag = "p"
            }
            fmt.Fprintf(output, "<%s>%s</%s>", tag, htmlutil.InnerHTML(node), tag)
        }
    }

    output.Write([]byte("</div>"))
    return output.String()
}
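
// Worked example (added for illustration, not in the original source): if the
// top candidate scored 80, a sibling needs a score of at least
// max(10, 80*0.2) = 16 to be pulled in; if it scored 30, the floor of 10
// applies. Short <p> siblings can still qualify when they contain no links and
// at least one full sentence ("." followed by a space or the end of the text).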

func removeUnlikelyCandidates(root *html.Node) {
    body := htmlutil.Query(root, "body")
    if len(body) == 0 {
        return
    }
    for _, node := range htmlutil.Query(body[0], "*") {
        str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")

        if htmlutil.Closest(node, "table,code") != nil {
            continue
        }

        blacklisted := (blacklistCandidatesRegexp.MatchString(str) ||
            (unlikelyCandidatesRegexp.MatchString(str) &&
                !okMaybeItsACandidateRegexp.MatchString(str)))
        if blacklisted && node.Parent != nil {
            node.Parent.RemoveChild(node)
        }
    }
}

func getTopCandidate(scores nodeScores) *html.Node {
    var top *html.Node
    var max float32
    for node, score := range scores {
        if score > max {
            top = node
            max = score
        }
    }
    return top
}

// Loop through all paragraphs, and assign a score to them based on how
// content-y they look. Then add their score to their parent node.
// A score is determined by things like number of commas, class names, etc.
// Maybe eventually link density.
func getCandidates(root *html.Node) nodeScores {
    scores := make(nodeScores)
    for _, node := range htmlutil.Query(root, defaultTagsToScore) {
        text := htmlutil.Text(node)

        // If this paragraph is less than 25 characters, don't even count it.
        if len(text) < 25 {
            continue
        }

        parentNode := node.Parent
        grandParentNode := parentNode.Parent

        if _, found := scores[parentNode]; !found {
            scores[parentNode] = scoreNode(parentNode)
        }
        if grandParentNode != nil {
            if _, found := scores[grandParentNode]; !found {
                scores[grandParentNode] = scoreNode(grandParentNode)
            }
        }

        // Add a point for the paragraph itself as a base.
        contentScore := float32(1.0)

        // Add points for any commas within this paragraph.
        contentScore += float32(strings.Count(text, ",") + 1)

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))

        scores[parentNode] += contentScore
        if grandParentNode != nil {
            scores[grandParentNode] += contentScore / 2.0
        }
    }

    // Scale the final candidates score based on link density. Good content
    // should have a relatively small link density (5% or less) and be mostly
    // unaffected by this operation.
    for node := range scores {
        scores[node] *= (1 - getLinkDensity(node))
    }

    return scores
}
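
// Worked example (added for illustration, not in the original source): a
// 260-character paragraph containing three commas contributes
// 1 (base) + 4 (commas + 1) + 2 (one point per full 100 characters, capped
// at 3) = 7 points to its parent and 3.5 to its grandparent, before both are
// scaled down by their link density.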

func scoreNode(node *html.Node) float32 {
    var score float32
    switch node.Data {
    case "div":
        score += 5
    case "pre", "td", "blockquote", "img":
        score += 3
    case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
        score -= 3
    case "h1", "h2", "h3", "h4", "h5", "h6", "th":
        score -= 5
    }
    return score + getClassWeight(node)
}

// Get the density of links as a percentage of the content.
// This is the amount of text that is inside a link divided by the total
// text in the node.
func getLinkDensity(n *html.Node) float32 {
    textLength := len(htmlutil.Text(n))
    if textLength == 0 {
        return 0
    }
    linkLength := 0.0
    for _, a := range htmlutil.Query(n, "a") {
        linkLength += float64(len(htmlutil.Text(a)))
    }
    return float32(linkLength) / float32(textLength)
}
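
// Worked example (added for illustration, not in the original source): a node
// with 500 characters of text, 100 of which sit inside <a> tags, has a link
// density of 100/500 = 0.2, so getCandidates would scale its score by 0.8.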

// Get an element's class/id weight. Uses regular expressions to tell if this
// element looks good or bad.
func getClassWeight(node *html.Node) float32 {
    weight := 0
    class := htmlutil.Attr(node, "class")
    id := htmlutil.Attr(node, "id")

    if class != "" {
        if negativeRegexp.MatchString(class) {
            weight -= 25
        }
        if positiveRegexp.MatchString(class) {
            weight += 25
        }
    }
    if id != "" {
        if negativeRegexp.MatchString(id) {
            weight -= 25
        }
        if positiveRegexp.MatchString(id) {
            weight += 25
        }
    }
    return float32(weight)
}

func transformMisusedDivsIntoParagraphs(root *html.Node) {
    for _, node := range htmlutil.Query(root, "div") {
        if !divToPElementsRegexp.MatchString(htmlutil.InnerHTML(node)) {
            node.Data = "p"
        }
    }
}
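
// Illustrative note (added, not in the original source): a <div> whose inner
// HTML contains none of the tags listed in divToPElementsRegexp, e.g.
// <div>Just a sentence.</div>, is renamed to <p> so it can be scored as a
// paragraph, while a <div> with nested links, images, or block children such
// as <div><p>...</p></div> is left untouched.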