Skip to content

Instantly share code, notes, and snippets.

@SimonRichardson
Created April 24, 2019 09:13
Show Gist options
  • Save SimonRichardson/6b263dd2376552cc2d168bf88976dafd to your computer and use it in GitHub Desktop.
Save SimonRichardson/6b263dd2376552cc2d168bf88976dafd to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
"io"
"net/http"
"net/url"
"sync"
"golang.org/x/net/html"
)
// Fetcher is the page-loading dependency used by Crawl. Crawl calls Fetch for
// the URL it is visiting and then crawls every URL that Fetch returns.
type Fetcher interface {
// Fetch takes the URL of a page and returns a list of URLs, or an error.
// Crawl treats the returned URLs as the next pages to visit.
Fetch(url string) (urls []string, err error)
}
// Crawl fetches url with fetcher, prints the URLs found there, and then crawls
// each of those URLs concurrently with the depth budget reduced by one. It
// returns only after every nested crawl has finished.
//
// depth bounds how many levels of links are followed; when depth <= 0 Crawl
// returns without fetching anything. A fetch error ends that branch silently
// (best-effort crawl). No record of visited URLs is kept, so a URL reachable
// through several pages is fetched once per path.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}

	urls, err := fetcher.Fetch(url)
	if err != nil {
		// Best-effort: a page that cannot be fetched simply contributes no
		// further links; the rest of the crawl carries on.
		return
	}
	fmt.Println("Found urls", urls)

	var wg sync.WaitGroup
	wg.Add(len(urls))
	for _, u := range urls {
		// Hand the URL to the goroutine as an argument. Before Go 1.22 the
		// range variable is shared by all iterations, so a closure capturing
		// it directly could crawl the same URL several times and skip others.
		go func(target string) {
			defer wg.Done()
			Crawl(target, depth-1, fetcher)
		}(u)
	}
	wg.Wait()
}
// fetcher is the HTTP-backed Fetcher that main passes to Crawl. It embeds
// *http.Client, so the client's methods (such as Do, used by Fetch) are
// promoted onto fetcher.
type fetcher struct {
*http.Client
}
// Fetch downloads the page at href with a GET request and returns the links
// extracted from it by parseBody, each resolved against the request URL.
//
// Links are resolved with URL.ResolveReference, so absolute links are kept,
// protocol-relative links ("//host/path") keep their own host, and relative
// links ("rel", "../x", "?q=1") are resolved against the page's path.
//
// An error is returned when href cannot be turned into a request, the request
// fails, the body cannot be parsed, or any extracted link is not a valid URL
// (a single bad link fails the whole call).
func (f fetcher) Fetch(href string) ([]string, error) {
	req, err := http.NewRequest(http.MethodGet, href, nil)
	if err != nil {
		// NewRequest returns a nil request on failure; touching req.Header
		// without this check would panic on a malformed href.
		return nil, fmt.Errorf("building request for %q: %w", href, err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")

	res, err := f.Do(req)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	links, err := f.parseBody(res.Body)
	if err != nil {
		return nil, err
	}

	result := make([]string, 0, len(links))
	for _, v := range links {
		link, err := url.Parse(v)
		if err != nil {
			return nil, fmt.Errorf("parsing link %q found on %q: %w", v, href, err)
		}
		result = append(result, req.URL.ResolveReference(link).String())
	}
	return result, nil
}
// parseBody parses body as an HTML document and collects the href attribute
// values of its <a> elements, in document order. Empty values and values that
// begin with '#' are left out. The values are returned exactly as written in
// the document; no URL resolution happens here.
func (f fetcher) parseBody(body io.Reader) ([]string, error) {
	root, err := html.Parse(body)
	if err != nil {
		return nil, err
	}

	var hrefs []string

	// visit records the qualifying hrefs of node (when it is an <a> element)
	// and then descends into its children from first to last.
	var visit func(node *html.Node)
	visit = func(node *html.Node) {
		isAnchor := node.Type == html.ElementNode && node.Data == "a"
		if isAnchor {
			for _, attr := range node.Attr {
				if attr.Key != "href" {
					continue
				}
				if attr.Val == "" || attr.Val[0] == '#' {
					continue
				}
				hrefs = append(hrefs, attr.Val)
			}
		}

		child := node.FirstChild
		for child != nil {
			visit(child)
			child = child.NextSibling
		}
	}
	visit(root)

	return hrefs, nil
}
// main crawls https://canonical.com, following links two levels deep, with a
// fetcher backed by a zero-value http.Client.
func main() {
	const (
		startURL = "https://canonical.com"
		maxDepth = 2
	)

	// NOTE(review): the client has no Timeout set, so a stalled server can
	// block a fetch indefinitely; consider configuring one.
	client := &http.Client{}
	Crawl(startURL, maxDepth, fetcher{client})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment