A solution to "A Tour of Go #73: Web Crawler": a breadth-first crawl in which every fetch runs in its own goroutine, discovered links flow back to the main loop over a single channel as batches of edges, and a counter of outstanding goroutines tells the loop when to stop.
package main

import "fmt"

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}
// Keep track of already crawled URLs. Only the goroutine running
// Crawl reads or writes this map, so no locking is needed.
var store map[string]bool

// An edge in the crawl tree pairs a URL with the
// depth at which it was discovered.
type Edge struct {
    depth int
    url   string
}
// _crawl fetches a given edge, derives the next edges from the
// URLs found on the page, and feeds them to the channel.
func _crawl(edge Edge, fetcher Fetcher, Edges chan []Edge) {
    url := edge.url
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
    } else {
        fmt.Printf("found: %s %q\n", url, body)
    }
    // Create edges from the new URLs and the updated depth.
    edges := make([]Edge, len(urls))
    for i, u := range urls {
        edges[i] = Edge{edge.depth + 1, u}
    }
    // Feed the list of edges to the channel.
    Edges <- edges
}
func Crawl(url string, depth int, fetcher Fetcher) {
    // Channel carrying pending lists of edges.
    Edges := make(chan []Edge)
    // Crawl level 0.
    go _crawl(Edge{0, url}, fetcher, Edges)
    store[url] = true
    // Crawl the next levels.
    band := 1 // number of outstanding _crawl goroutines
    for band > 0 {
        band--
        next := <-Edges // next list of edges to process
        for _, edge := range next {
            // Process an edge only if its URL is new and the
            // maximum depth has not been reached yet.
            if _, seen := store[edge.url]; !seen && edge.depth < depth {
                store[edge.url] = true
                band++
                go _crawl(edge, fetcher, Edges)
            }
        }
    }
}
func main() {
    store = make(map[string]bool)
    Crawl("http://golang.org/", 3, fetcher)
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/woot",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    // Add more examples to verify that the depth
    // limit is indeed enforced.
    "http://golang.org/pkg/woot": &fakeResult{
        "Woot Woot!",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/bla",
        },
    },
    "http://golang.org/pkg/bla": &fakeResult{
        "Bla Bla",
        []string{
            "http://golang.org/",
        },
    },
}
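
Running main with a depth of 3 should print the five lines below (their order may vary from run to run, since the fetches happen in concurrent goroutines). Note that http://golang.org/pkg/woot and http://golang.org/pkg/bla never appear: woot is discovered at depth 3 and skipped, and bla is only reachable through woot, which confirms that the depth limit is enforced.

    found: http://golang.org/ "The Go Programming Language"
    found: http://golang.org/pkg/ "Packages"
    not found: http://golang.org/cmd/
    found: http://golang.org/pkg/fmt/ "Package fmt"
    found: http://golang.org/pkg/os/ "Package os"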
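
For comparison, here is a minimal sketch of the same bounded crawl built on the standard sync package instead of the hand-maintained band counter: a sync.WaitGroup tracks outstanding goroutines, and a mutex guards the visited set because here the crawl goroutines mark URLs themselves rather than reporting back to one coordinating loop. It reuses the Fetcher interface and fetcher variable defined above; the names safeStore and CrawlWG are illustrative, not part of the original gist.

import (
    "fmt" // already imported above; repeated here for completeness
    "sync"
)

// safeStore guards the visited set so that concurrent
// goroutines can check-and-mark URLs safely.
type safeStore struct {
    mu   sync.Mutex
    seen map[string]bool
}

// visit marks url as seen and reports whether it was new.
func (s *safeStore) visit(url string) bool {
    s.mu.Lock()
    defer s.mu.Unlock()
    if s.seen[url] {
        return false
    }
    s.seen[url] = true
    return true
}

// CrawlWG behaves like Crawl above: it fetches url and everything
// reachable from it, to the given depth, visiting each URL once.
func CrawlWG(url string, depth int, fetcher Fetcher) {
    store := &safeStore{seen: make(map[string]bool)}
    var wg sync.WaitGroup
    var crawl func(url string, depth int)
    crawl = func(url string, depth int) {
        defer wg.Done()
        // Stop at the depth limit or on an already-seen URL.
        if depth <= 0 || !store.visit(url) {
            return
        }
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q\n", url, body)
        for _, u := range urls {
            wg.Add(1) // one Add per spawned goroutine, before it starts
            go crawl(u, depth-1)
        }
    }
    wg.Add(1)
    go crawl(url, depth)
    wg.Wait() // returns once every spawned crawl has finished
}

Called as CrawlWG("http://golang.org/", 3, fetcher), this should visit the same set of pages as the original Crawl. The trade-off is a mutex around the visited set in exchange for dropping both the channel and the manual goroutine accounting.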