-
-
Save pkrnjevic/d03f2e5aba2d2261b7084b3d290e1732 to your computer and use it in GitHub Desktop.
Simple DOM node traversal in golang using a very useful collector/matcher function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"fmt" | |
"io" | |
"strings" | |
"unicode" | |
"golang.org/x/net/html" | |
"golang.org/x/net/html/atom" | |
) | |
// htm is the sample HTML document that main parses and traverses.
//
// The markup is reproduced as-is, including the unmatched </span> in the
// "Hello and ..." paragraph.
// NOTE(review): the stray </span> is presumably there to exercise the
// parser's leniency with malformed input — confirm before "fixing" it.
const htm = `<!DOCTYPE html>
<html>
<head>
<title></title>
</head>
<body>
body content
<p>more <a href="">content</a></p>
<p>This <a href="/foo"><em>important</em> link <br> to
foo</a> is here</p>
<p>Call at <a href="mailto:[email protected]">[email protected]</a></p>
<div>
<span>[email protected]</span></div>
<p>Hello and <a href="">example.com</a>.</span></p>
<em>are all valid. You can email "john" if you need me.</em>
</body>
</html>`
func main() { | |
doc, err := html.Parse(strings.NewReader(htm)) | |
if err != nil { | |
fmt.Println(err) | |
return | |
} | |
// Find all the text nodes that are not children of a <p> | |
matcher := func(node *html.Node) (keep bool, exit bool) { | |
if node.Type == html.TextNode && strings.TrimSpace(node.Data) != "" { | |
keep = true | |
} | |
if node.DataAtom == atom.P { | |
exit = true | |
} | |
return | |
} | |
nodes = TraverseNode(doc, matcher) | |
for i, node := range nodes { | |
fmt.Println(i, renderNode(node)) | |
} | |
} | |
// TraverseNode collecting the nodes that match the given function | |
func TraverseNode(doc *html.Node, matcher func(node *html.Node) (bool, bool)) (nodes []*html.Node) { | |
var keep, exit bool | |
var f func(*html.Node) | |
f = func(n *html.Node) { | |
keep, exit = matcher(n) | |
if keep { | |
nodes = append(nodes, n) | |
} | |
if exit { | |
return | |
} | |
for c := n.FirstChild; c != nil; c = c.NextSibling { | |
f(c) | |
} | |
} | |
f(doc) | |
return nodes | |
} | |
// Note: TraverseNode works better than
// https://github.com/yhat/scrape/blob/master/scrape.go#L129
// because the matcher function can cut the search short.
func renderNode(n *html.Node) string { | |
var buf bytes.Buffer | |
w := io.Writer(&buf) | |
html.Render(w, n) | |
return buf.String() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment