Skip to content

Instantly share code, notes, and snippets.

@niski84
Last active November 9, 2023 01:29
Show Gist options
  • Save niski84/d0a9c61b44c10b11540aee15a6fbdc5e to your computer and use it in GitHub Desktop.
Save niski84/d0a9c61b44c10b11540aee15a6fbdc5e to your computer and use it in GitHub Desktop.
custom html parsing for health check report
package main
import (
"bytes"
"fmt"
"golang.org/x/net/html"
"strings"
)
// parseHTMLData parses htmlContent, locates the table row (`tr`) containing a
// `td` whose trimmed text equals serviceName exactly, and extracts three values
// from that row:
//
//   - healthStatus: the text of the first `td` in the row,
//   - linesBetween: the text of each `div` with class "line-between" that is a
//     direct child of one of the row's `td` cells,
//   - ts: the text of the `td` carrying the class "ts".
//
// A non-nil error is returned when the HTML cannot be parsed, when no row
// matches serviceName, or when the health status and/or the timestamp is
// missing. In the last case the values that were found are still returned
// together with the error.
func parseHTMLData(htmlContent, serviceName string) (healthStatus string, linesBetween []string, ts string, err error) {
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return "", nil, "", err
	}

	// Find the tr element that contains the td with the service name.
	trNode := findTRNode(doc, serviceName)
	if trNode == nil {
		return "", nil, "", fmt.Errorf("service name '%s' not found", serviceName)
	}

	healthStatus = extractHealthStatus(trNode)
	linesBetween = extractLinesBetween(trNode)
	ts = extractTimestamp(trNode)

	// Build the error from what is actually missing. Formatting a nil error
	// into the timestamp message would yield "<nil>, timestamp not found"
	// whenever only the timestamp is absent.
	switch {
	case healthStatus == "" && ts == "":
		err = fmt.Errorf("health status not found, timestamp not found")
	case healthStatus == "":
		err = fmt.Errorf("health status not found")
	case ts == "":
		err = fmt.Errorf("timestamp not found")
	}
	return healthStatus, linesBetween, ts, err
}
// findTRNode walks the tree rooted at n depth-first and returns the parent of
// the first `td` element whose trimmed text content equals serviceName. It
// returns nil when no such cell exists in the subtree.
func findTRNode(n *html.Node, serviceName string) *html.Node {
	isCell := n.Type == html.ElementNode && n.Data == "td"
	if isCell && getTextFromNode(n) == serviceName {
		// The row is taken to be the cell's immediate parent.
		return n.Parent
	}

	child := n.FirstChild
	for child != nil {
		if row := findTRNode(child, serviceName); row != nil {
			return row
		}
		child = child.NextSibling
	}
	return nil
}
// extractHealthStatus returns the trimmed text of the first `td` element that
// is a direct child of trNode, or "" when the row has no `td` child.
func extractHealthStatus(trNode *html.Node) string {
	cell := trNode.FirstChild
	// Skip whitespace text nodes and any non-td elements.
	for cell != nil && !(cell.Type == html.ElementNode && cell.Data == "td") {
		cell = cell.NextSibling
	}
	if cell == nil {
		return ""
	}
	return getTextFromNode(cell)
}
// Markup targeted by extractLinesBetween: <div class="line-between">

// extractTimestamp returns the trimmed text of the first `td` child of trNode
// whose class attribute contains the class "ts". It returns "" when no such
// cell exists.
//
// The class attribute is treated as a whitespace-separated list of class
// names, so a cell such as <td class="ts right"> is matched as well as
// <td class="ts">.
func extractTimestamp(trNode *html.Node) string {
	for c := trNode.FirstChild; c != nil; c = c.NextSibling {
		if c.Type != html.ElementNode || c.Data != "td" {
			continue
		}
		for _, a := range c.Attr {
			if a.Key != "class" {
				continue
			}
			for _, cls := range strings.Fields(a.Val) {
				if cls == "ts" {
					return getTextFromNode(c)
				}
			}
		}
	}
	return ""
}
// getTextFromNode gathers the data of every text node in the subtree rooted at
// n, in document order, and returns the concatenation with leading and
// trailing whitespace trimmed.
func getTextFromNode(n *html.Node) string {
	var buf bytes.Buffer

	// Explicit-stack pre-order traversal. Children are pushed last-to-first so
	// that the first child is popped (and visited) first.
	pending := []*html.Node{n}
	for len(pending) > 0 {
		cur := pending[len(pending)-1]
		pending = pending[:len(pending)-1]

		if cur.Type == html.TextNode {
			buf.WriteString(cur.Data)
		}
		for child := cur.LastChild; child != nil; child = child.PrevSibling {
			pending = append(pending, child)
		}
	}
	return strings.TrimSpace(buf.String())
}
// extractLinesBetween returns the trimmed text of every `div` whose class
// attribute contains the class "line-between" and that is a direct child of a
// `td` cell of trNode, in document order. It returns nil when there are none.
//
// The class attribute is treated as a whitespace-separated list of class
// names, so <div class="line-between muted"> is matched as well as
// <div class="line-between">.
func extractLinesBetween(trNode *html.Node) []string {
	var lines []string
	for td := trNode.FirstChild; td != nil; td = td.NextSibling {
		if td.Type != html.ElementNode || td.Data != "td" {
			continue
		}
		// Only direct children of the cell are inspected; nested divs are not.
		for div := td.FirstChild; div != nil; div = div.NextSibling {
			if div.Type != html.ElementNode || div.Data != "div" {
				continue
			}
			matched := false
			for _, a := range div.Attr {
				if a.Key != "class" {
					continue
				}
				for _, cls := range strings.Fields(a.Val) {
					if cls == "line-between" {
						matched = true
						break
					}
				}
			}
			// Append at most once per div, however the class list is written.
			if matched {
				lines = append(lines, getTextFromNode(div))
			}
		}
	}
	return lines
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment