Skip to content

Instantly share code, notes, and snippets.

@salmoni
Last active October 13, 2024 12:20
Show Gist options
  • Save salmoni/27aee5bb0d26536391aabe7f13a72494 to your computer and use it in GitHub Desktop.
Save salmoni/27aee5bb0d26536391aabe7f13a72494 to your computer and use it in GitHub Desktop.
Parsing HTML in Go (Golang) using goquery to extract table data from a page that contains multiple tables. Demonstrates nested Find statements.
package main
import (
"fmt"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
)
// goGet parses an embedded HTML document containing two tables and prints
// the collected heading texts and data rows to standard output.
//
// The text of every <th> found in any table is appended to headings. Each
// <tr> that contains at least one <td> contributes one entry to rows; rows
// made up solely of <th> cells are not added, so the printed row count
// covers data rows only. If the document cannot be parsed the program exits
// via log.Fatalf.
func goGet() {
	var headings []string
	var rows [][]string
	data := `<html><body>
<table>
<tr><th>Heading 1</th><th>Heading two</th></tr>
<tr><td>Data 11</td><td>Data 12</td></tr>
<tr><td>Data 21</td><td>Data 22</td></tr>
<tr><td>Data 31</td><td>Data 32</td></tr>
<tr><td>Data 41</td><td>Data 42</td></tr>
</table>
<p>Stuff in here</p>
<table>
<tr><th>Heading 21</th><th>Heading 2two</th></tr>
<tr><td>Data 211</td><td>Data 212</td></tr>
<tr><td>Data 221</td><td>Data 222</td></tr>
<tr><td>Data 231</td><td><span></span><span><a href="">Data 232</a></span></td></tr>
<tr><td>Data 241</td><td>Data 242</td></tr>
</table>
</body>
</html>
`
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(data))
	if err != nil {
		// The input is an in-memory string rather than a URL, so report
		// the failure as a parse error and log it exactly once.
		log.Fatalf("parsing HTML: %v", err)
	}
	// Walk every table, then every row inside it.
	doc.Find("table").Each(func(_ int, table *goquery.Selection) {
		table.Find("tr").Each(func(_ int, tr *goquery.Selection) {
			tr.Find("th").Each(func(_ int, th *goquery.Selection) {
				headings = append(headings, th.Text())
			})
			// A fresh slice per <tr> keeps each stored row independent.
			var row []string
			tr.Find("td").Each(func(_ int, td *goquery.Selection) {
				row = append(row, td.Text())
			})
			// Header-only rows have no <td> cells; skip them so rows does
			// not accumulate empty entries.
			if len(row) > 0 {
				rows = append(rows, row)
			}
		})
	})
	fmt.Println("####### headings = ", len(headings), headings)
	fmt.Println("####### rows = ", len(rows), rows)
}
// main is the program entry point; it simply runs goGet.
func main() {
goGet()
}
@pejman-hkh
Copy link

Another way is to use the gdp package:

https://go.dev/play/p/QDNq1qs6p2S

package main

import (
	"fmt"

	"github.com/pejman-hkh/gdp/gdp"
)

// goGet parses an embedded HTML document containing two tables with the gdp
// package and prints the collected heading texts and data rows to standard
// output.
//
// The text of every <th> found in any table is appended to headings. Each
// <tr> that contains at least one <td> contributes one entry to rows; rows
// made up solely of <th> cells are not added, so the printed row count
// covers data rows only.
func goGet() {
	var headings []string
	var rows [][]string

	data := `<html><body>
	<table>
		<tr><th>Heading 1</th><th>Heading two</th></tr>
		<tr><td>Data 11</td><td>Data 12</td></tr>
		<tr><td>Data 21</td><td>Data 22</td></tr>
		<tr><td>Data 31</td><td>Data 32</td></tr>
		<tr><td>Data 41</td><td>Data 42</td></tr>
	</table>
	<p>Stuff in here</p>
	<table>
		<tr><th>Heading 21</th><th>Heading 2two</th></tr>
		<tr><td>Data 211</td><td>Data 212</td></tr>
		<tr><td>Data 221</td><td>Data 222</td></tr>
		<tr><td>Data 231</td><td><span></span><span><a href="">Data 232</a></span></td></tr>
		<tr><td>Data 241</td><td>Data 242</td></tr>
	</table>
	</body>
	</html>
	`
	doc := gdp.Default(data)

	// Walk every table, then every row inside it.
	doc.Find("table").Each(func(_ int, table *gdp.Tag) {
		table.Find("tr").Each(func(_ int, tr *gdp.Tag) {
			tr.Find("th").Each(func(_ int, th *gdp.Tag) {
				headings = append(headings, th.Text())
			})
			// A fresh slice per <tr> keeps each stored row independent.
			var row []string
			tr.Find("td").Each(func(_ int, td *gdp.Tag) {
				row = append(row, td.Text())
			})
			// Header-only rows have no <td> cells; skip them so rows does
			// not accumulate empty entries.
			if len(row) > 0 {
				rows = append(rows, row)
			}
		})
	})
	fmt.Println("####### headings = ", len(headings), headings)
	fmt.Println("####### rows = ", len(rows), rows)
}

// main is the program entry point; it simply runs goGet.
func main() {
	goGet()
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment