Created
April 24, 2019 09:13
-
-
Save SimonRichardson/6b263dd2376552cc2d168bf88976dafd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"sync"
	"time"

	"golang.org/x/net/html"
)
// Fetcher abstracts page retrieval so Crawl can be driven by any link
// source (live HTTP, a test stub, a cache). Fetch returns the URLs of the
// links found in the page at url, or an error if the page could not be
// retrieved or parsed.
type Fetcher interface {
	Fetch(url string) (urls []string, err error)
}
func Crawl(url string, depth int, fetcher Fetcher) { | |
if depth <= 0 { | |
return | |
} | |
urls, err := fetcher.Fetch(url) | |
if err != nil { | |
return | |
} | |
fmt.Println("Found urls", urls) | |
var wg sync.WaitGroup | |
wg.Add(len(urls)) | |
for _, u := range urls { | |
go func() { | |
defer wg.Done() | |
Crawl(u, depth-1, fetcher) | |
}() | |
} | |
wg.Wait() | |
return | |
} | |
type fetcher struct { | |
*http.Client | |
} | |
func (f fetcher) Fetch(href string) ([]string, error) { | |
req, _ := http.NewRequest("GET", href, nil) | |
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)") | |
res, err := f.Do(req) | |
if err != nil { | |
return nil, err | |
} | |
defer res.Body.Close() | |
links, err := f.parseBody(res.Body) | |
if err != nil { | |
return nil, err | |
} | |
result := make([]string, len(links)) | |
for k, v := range links { | |
link, err := url.Parse(v) | |
if err != nil { | |
return nil, err | |
} | |
if !link.IsAbs() { | |
link.Scheme = req.URL.Scheme | |
link.Host = req.URL.Host | |
} | |
result[k] = link.String() | |
} | |
return result, nil | |
} | |
func (f fetcher) parseBody(body io.Reader) ([]string, error) { | |
doc, err := html.Parse(body) | |
if err != nil { | |
return nil, err | |
} | |
var links []string | |
var parser func(*html.Node) | |
parser = func(n *html.Node) { | |
if n.Type == html.ElementNode && n.Data == "a" { | |
for _, v := range n.Attr { | |
if v.Key == "href" && len(v.Val) > 0 && v.Val[0] != '#' { | |
links = append(links, v.Val) | |
} | |
} | |
} | |
for c := n.FirstChild; c != nil; c = c.NextSibling { | |
parser(c) | |
} | |
} | |
parser(doc) | |
return links, nil | |
} | |
func main() { | |
Crawl("https://canonical.com", 2, fetcher{&http.Client{}}) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment