package main

import (
    "fmt"
    "sync"
    "time"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// SafeCounter records which URLs have already been visited, guarded by a mutex.
type SafeCounter struct {
    v   map[string]bool
    mux sync.Mutex
}

var c SafeCounter = SafeCounter{v: make(map[string]bool)}

// checkvisited reports whether url has been seen before and marks it as visited.
// The pointer receiver matters here: with a value receiver every call would
// operate on its own copy of the mutex, so the map would not actually be protected.
func (s *SafeCounter) checkvisited(url string) bool {
    s.mux.Lock()
    defer s.mux.Unlock()
    if _, ok := s.v[url]; !ok {
        s.v[url] = true
        return false
    }
    return true
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }
    if c.checkvisited(url) {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        go Crawl(u, depth-1, fetcher)
    }
}

func main() {
    Crawl("http://golang.org/", 4, fetcher)
    // Crude synchronization: give the crawler goroutines time to finish.
    time.Sleep(5 * time.Second)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}
@alcocerd, thanks for the tip about using WaitGroup! 👍
I moved the WaitGroup into Crawl. I think it makes more sense there, and now the cache can be used separately from the synchronization.
package main
import (
"fmt"
"sync"
)
type mutexCache struct {
mux sync.Mutex
store map[string]bool
}
func (cache *mutexCache) setVisited(name string) bool {
cache.mux.Lock()
defer cache.mux.Unlock()
if cache.store[name] {
return true
}
cache.store[name] = true
return false
}
var cacheInstance = mutexCache{store: make(map[string]bool)}
// Fetcher interface
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
func crawlInner(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
if cacheInstance.setVisited(url) {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go crawlInner(u, depth-1, fetcher, wg)
}
return
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
waitGroup := &sync.WaitGroup{}
waitGroup.Add(1)
go crawlInner(url, depth, fetcher, waitGroup)
waitGroup.Wait()
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
I've modified your code to use the more idiomatic way of waiting for goroutines, which is to use sync.WaitGroup. This eliminates the need to use time.Sleep on the main thread.
package main

import (
    "fmt"
    // No longer needed for sleeping.
    //"time"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

type SafeCounter struct {
    v   map[string]bool
    mux sync.Mutex
    wg  sync.WaitGroup
}

var c SafeCounter = SafeCounter{v: make(map[string]bool)}

// checkvisited reports whether url has been seen before and marks it as visited.
// The pointer receiver keeps all goroutines on the same mutex and WaitGroup.
func (s *SafeCounter) checkvisited(url string) bool {
    s.mux.Lock()
    defer s.mux.Unlock()
    if _, ok := s.v[url]; !ok {
        s.v[url] = true
        return false
    }
    return true
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    defer c.wg.Done()
    if depth <= 0 {
        return
    }
    if c.checkvisited(url) {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        c.wg.Add(1)
        go Crawl(u, depth-1, fetcher)
    }
}

func main() {
    c.wg.Add(1)
    Crawl("http://golang.org/", 4, fetcher)
    c.wg.Wait()
    // Not ideal to sleep on the main thread.
    //time.Sleep(5 * time.Second)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}
Using WaitGroup is much better, THX!
Use a WaitGroup; it saves the time that would otherwise be spent in the fixed 5-second sleep in your case:
package main
import (
"fmt"
"sync"
"time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
var urlMap map[string]int
// urlMapMu guards urlMap, which is read and written from many goroutines.
var urlMapMu sync.Mutex
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher,wg *sync.WaitGroup) {
if depth <= 0 {
wg.Done()
return
}
urlMapMu.Lock()
if urlMap[url] == 1 {
urlMapMu.Unlock()
wg.Done()
return
}
urlMap[url] = 1
urlMapMu.Unlock()
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
wg.Done()
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher,wg)
}
wg.Done()
return
}
func main() {
startTime := time.Now()
urlMap = make(map[string]int)
var w sync.WaitGroup
w.Add(1)
Crawl("https://golang.org/", 4, fetcher,&w)
w.Wait()
endTime := time.Now()
diff := endTime.Sub(startTime)
fmt.Println("total time taken ", diff.Seconds(), "seconds")
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
my version.
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type History struct {
history map[string]bool
mux sync.Mutex
}
func (h *History) isFetched(url string) bool {
h.mux.Lock()
defer h.mux.Unlock()
_, ok := h.history[url]
if ok {
return ok
} else {
h.history[url] = true
}
return false
}
var history = History{history: make(map[string]bool)}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
defer wg.Done()
if depth <= 0 || history.isFetched(url) {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher, wg)
}
return
}
func syncCrawl(url string, depth int, fetcher Fetcher) {
var wg sync.WaitGroup
wg.Add(1)
go Crawl("https://golang.org/", 4, fetcher, &wg)
wg.Wait()
}
func main() {
syncCrawl("https://golang.org/", 4, fetcher)
//time.Sleep(5*time.Second)
fmt.Println("done")
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
ref
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type SafeCounter struct {
v map[string]bool
mux sync.Mutex
}
var c SafeCounter = SafeCounter{v: make(map[string]bool)}
func (s SafeCounter) checkvisited(url string) bool {
s.mux.Lock()
defer s.mux.Unlock()
_, ok := s.v[url]
if ok == false {
s.v[url] = true
return false
}
return true
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
defer wg.Done()
if depth <= 0 {
return
}
if c.checkvisited(url) {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher, wg)
}
}
func main() {
var wg sync.WaitGroup
wg.Add(1)
go Crawl("http://golang.org/", 4, fetcher, &wg)
wg.Wait()
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"http://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"http://golang.org/pkg/",
"http://golang.org/cmd/",
},
},
"http://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"http://golang.org/",
"http://golang.org/cmd/",
"http://golang.org/pkg/fmt/",
"http://golang.org/pkg/os/",
},
},
"http://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
"http://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
}
It is better to use an empty struct than a bool as the map value; the empty struct takes zero memory:
map[string]struct{}
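As a quick illustration of that suggestion, here is a minimal sketch (the visitedSet type and MarkVisited method are names made up for this example, not taken from any solution in the thread):
package main

import (
    "fmt"
    "sync"
)

type visitedSet struct {
    mu sync.Mutex
    m  map[string]struct{} // struct{} values occupy no memory
}

// MarkVisited records url and reports whether it had been seen before.
func (s *visitedSet) MarkVisited(url string) bool {
    s.mu.Lock()
    defer s.mu.Unlock()
    if _, seen := s.m[url]; seen {
        return true
    }
    s.m[url] = struct{}{}
    return false
}

func main() {
    s := visitedSet{m: make(map[string]struct{})}
    fmt.Println(s.MarkVisited("https://golang.org/")) // false: first visit
    fmt.Println(s.MarkVisited("https://golang.org/")) // true: already seen
}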
The Tutorial explains mutex, channels, maps, so the solution shall contain only these primitives. None of the solutions sticks to this limitation.
The Tutorial explains mutex, channels, maps, so the solution shall contain only these primitives. None of the solutions sticks to this limitation.
Agree
The Tutorial explains mutex, channels, maps, so the solution shall contain only these primitives. None of the solutions sticks to this limitation.
@dilyanpalauzov what solution did you end up with? I would like to check it.
I have not found any.
@zurcacielos @dilyanpalauzov I've found an implementation which uses only channels, maps and mutexes. I ended up using these primitives to build my own WaitGroup implementation.
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type UrlCache struct {
// Thread-safe cache for visited urls
url map[string]bool
mu sync.Mutex
}
func NewUrlCache() UrlCache {
return UrlCache{url: make(map[string]bool)}
}
func (c *UrlCache) Add(key string) {
// Add a new visited URL
c.mu.Lock()
c.url[key] = true
c.mu.Unlock()
}
func (c *UrlCache) Visited(key string) bool {
// Returns boolean, true for visited URLs
c.mu.Lock()
defer c.mu.Unlock()
v, ok := c.url[key]
return v && ok
}
// ---------- custom WaitGroup implementation ----------------
//
// What follows implements a custom version of WaitGroup, using only
// the primitives explained in the tutorial (channels, and mutexes)
//
// It is more inefficient than sync.WaitGroup, but it does not introduce
// new library objects
//
type WaitGroup struct {
counter int // How many goroutines were started
mu sync.Mutex // Mutex to protect the counter
ch chan bool // Channel for signaling when the counter reaches 0
}
func NewWaitGroup() WaitGroup {
return WaitGroup{ch: make(chan bool)}
}
func (c *WaitGroup) Add(n int) int {
// New goroutine starts
c.mu.Lock()
defer c.mu.Unlock()
c.counter += n
return c.counter
}
func (c *WaitGroup) Done() {
// Goroutine ends
v := c.Add(-1) // Counter is decremented
if v <= 0 {
c.ch <- true // If the counter reaches 0, we signal the end
}
}
func (c *WaitGroup) Wait() {
// Wait for the signal
<-c.ch
}
// ---------------------------------------------------------------
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
// Modified to pass the cache and the waitgroup, avoiding the use of global vars
func Crawl(url string, depth int, fetcher Fetcher, cache *UrlCache, wg *WaitGroup) {
defer wg.Done()
if cache.Visited(url) {
return
}
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
cache.Add(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher, cache, wg)
}
return
}
func main() {
cache := NewUrlCache()
wg := NewWaitGroup()
wg.Add(1)
go Crawl("https://golang.org/", 4, fetcher, &cache, &wg)
wg.Wait()
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
It's been one week since I started learning Go. Four days ago, when I saw this gist, I took completing this exercise without a WaitGroup as a challenge, only to find that somebody had already refactored the code that way. Great implementation, @jldiaz 🥇
uMap := make(map[string]bool)
var mut sync.Mutex
c := make(chan struct{}) // Create channel
q := make(chan struct{}) // Quit channel
go Crawl("https://golang.org/", 4, fetcher, &uMap, c, q, &mut)
for i := 0; ; {
    select {
    case <-c:
        i = i + 1
    case <-q:
        i = i - 1
    }
    if i == 0 {
        break
    }
}
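To make that counting idea concrete, here is a self-contained sketch of it. Everything in it (the crawl function, the fake link structure, the channel names) is made up for illustration and is not the commenter's actual Crawl; note that the in-flight counter starts at 1 so the root goroutine is counted as well.
package main

import (
    "fmt"
    "sync"
)

// crawl signals each child it spawns on c and its own completion on q.
func crawl(url string, depth int, visited map[string]bool, mu *sync.Mutex,
    c, q chan struct{}) {
    defer func() { q <- struct{}{} }() // report completion
    if depth <= 0 {
        return
    }
    mu.Lock()
    if visited[url] {
        mu.Unlock()
        return
    }
    visited[url] = true
    mu.Unlock()
    fmt.Println("visiting:", url)
    // Pretend every page links to two sub-pages of the current URL.
    for _, u := range []string{url + "a/", url + "b/"} {
        c <- struct{}{} // count the child before spawning it
        go crawl(u, depth-1, visited, mu, c, q)
    }
}

func main() {
    visited := make(map[string]bool)
    var mu sync.Mutex
    c := make(chan struct{}) // one signal per spawned crawl
    q := make(chan struct{}) // one signal per finished crawl
    go crawl("https://golang.org/", 3, visited, &mu, c, q)
    // Start at 1 to count the root crawl; stop when everything has finished.
    for inFlight := 1; inFlight > 0; {
        select {
        case <-c:
            inFlight++
        case <-q:
            inFlight--
        }
    }
    fmt.Println("done")
}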
@jldiaz I found this (official?) solution at
https://cs.opensource.google/go/x/website/+/master:tour/solutions/webcrawler.go
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"errors"
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// fetched tracks URLs that have been (or are being) fetched.
// The lock must be held while reading from or writing to the map.
// See https://golang.org/ref/spec#Struct_types section on embedded types.
var fetched = struct {
m map[string]error
sync.Mutex
}{m: make(map[string]error)}
var loading = errors.New("url load in progress") // sentinel value
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
if depth <= 0 {
fmt.Printf("<- Done with %v, depth 0.\n", url)
return
}
fetched.Lock()
if _, ok := fetched.m[url]; ok {
fetched.Unlock()
fmt.Printf("<- Done with %v, already fetched.\n", url)
return
}
// We mark the url to be loading to avoid others reloading it at the same time.
fetched.m[url] = loading
fetched.Unlock()
// We load it concurrently.
body, urls, err := fetcher.Fetch(url)
// And update the status in a synced zone.
fetched.Lock()
fetched.m[url] = err
fetched.Unlock()
if err != nil {
fmt.Printf("<- Error on %v: %v\n", url, err)
return
}
fmt.Printf("Found: %s %q\n", url, body)
done := make(chan bool)
for i, u := range urls {
fmt.Printf("-> Crawling child %v/%v of %v : %v.\n", i, len(urls), url, u)
go func(url string) {
Crawl(url, depth-1, fetcher)
done <- true
}(u)
}
for i, u := range urls {
fmt.Printf("<- [%v] %v/%v Waiting for child %v.\n", url, i, len(urls), u)
<-done
}
fmt.Printf("<- Done with %v\n", url)
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
fmt.Println("Fetching stats\n--------------")
for url, err := range fetched.m {
if err != nil {
fmt.Printf("%v failed: %v\n", url, err)
} else {
fmt.Printf("%v was fetched\n", url)
}
}
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f *fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := (*f)[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = &fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
👋 Learning Go here, without much know-how of the language yet. Here I'm getting the same results with a simpler approach.
Maybe using a struct to hold the fetched URLs would make this more organized.
Not sure whether channels are needed in this exercise.
If you have any thoughts or comments, or see something wrong, please let me know.
package main
import (
"fmt"
"sync"
"time"
"github.com/juniormayhe/currentFilename"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// lock guards the shared map of fetched URLs across all crawler goroutines.
var lock sync.Mutex
// setVisited marks url as fetched; the mutex is passed by pointer so that
// every caller synchronizes on the same lock.
func setVisited(url string, visited map[string]bool, mu *sync.Mutex) {
mu.Lock()
visited[url] = true
mu.Unlock()
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, fetched map[string]bool) {
lock.Lock()
_, exist := fetched[url]
lock.Unlock()
if depth <= 0 || exist {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
setVisited(url, fetched, &lock)
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
go Crawl(u, depth-1, fetcher, fetched)
}
setVisited(url, fetched, &lock)
time.Sleep(time.Second)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
func main() {
fmt.Println(currentFilename.GetCurrentFileName(), "\n")
fetchedUrls := make(map[string]bool)
Crawl("https://golang.org/", 4, fetcher, fetchedUrls)
fmt.Println(fetchedUrls)
}
The WaitGroup solution looks out of scope for "A Tour of Go", really. And the time.Sleep solution looks too disgusting ))
The Tutorial explains mutex, channels, maps, so the solution shall contain only these primitives. None of the solutions sticks to this limitation.
Here is my solution using chained channels. It's not ideal, since I've only been learning Go for about two weeks, but it works and covers (I hope :) these conditions:
package main
import (
"fmt"
"sync"
// "time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type UrlCache struct {
mu sync.Mutex
cache map[string]int
}
func (u *UrlCache) IsFetched(url string) bool {
u.mu.Lock()
defer u.mu.Unlock()
_, fetched := u.cache[url];
if !fetched {
u.cache[url]++
}
return fetched
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
// The cache must be passed by pointer so every goroutine shares the same mutex.
func Crawl(url string, depth int, fetcher Fetcher, c *UrlCache, out chan string) {
defer close(out)
if depth <= 0 {
return
}
if !c.IsFetched(url) {
body, urls, err := fetcher.Fetch(url)
if err != nil {
// out <- err.Error()
out <- fmt.Sprintf("%v", err)
return
}
out <- fmt.Sprintf("found: %s %q", url, body)
outs := make([]chan string, len(urls))
for i, u := range urls {
outs[i] = make(chan string)
go Crawl(u, depth-1, fetcher, c, outs[i])
}
for _, ch := range outs {
for msg := range ch {
out <- msg
}
}
}
}
func main() {
cache := UrlCache{cache: make(map[string]int)}
out := make(chan string)
go Crawl("https://golang.org/", 4, fetcher, cache, out)
// time.Sleep(time.Second*1)
for msg := range out {
fmt.Println(msg)
}
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Why didn't you guys use sync.Map for the safe cache, instead of implementing it all manually?
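For reference, a minimal sketch of what a sync.Map-based visited cache could look like (markVisited and the surrounding layout are made up for this example): sync.Map's LoadOrStore atomically records the URL and reports whether it was already present, so no explicit mutex is needed.
package main

import (
    "fmt"
    "sync"
)

var visited sync.Map // acts as a concurrent set of visited URLs

// markVisited records url and reports whether it was already recorded.
func markVisited(url string) bool {
    _, loaded := visited.LoadOrStore(url, true)
    return loaded
}

func main() {
    fmt.Println(markVisited("https://golang.org/")) // false: first visit
    fmt.Println(markVisited("https://golang.org/")) // true: already recorded
}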
package main
import (
"fmt"
"sync"
)
type Cache struct {
mu sync.Mutex
urls map[string]bool
}
func (c *Cache) InsertUrls(urls []string) {
c.mu.Lock()
defer c.mu.Unlock()
for _, url := range urls {
c.urls[url] = false
}
}
func (c *Cache) UpdateFetchedUrl(url string) {
c.mu.Lock()
defer c.mu.Unlock()
c.urls[url] = true
}
func (c *Cache) IsUrlFetched(url string) bool {
c.mu.Lock()
defer c.mu.Unlock()
return c.urls[url]
}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, cache *Cache, wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
// insert new urls in cache
// and update url fetched
cache.InsertUrls(urls)
cache.UpdateFetchedUrl(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
// check cache if url is already fetched
if !cache.IsUrlFetched(u) {
wg.Add(1)
go Crawl(u, depth-1, fetcher, cache, wg)
}
}
return
}
// synced crawl using sync.Mutex and sync.WaitGroup
func SyncCrawl(url string, depth int, fetcher Fetcher) {
var wg sync.WaitGroup
cache := Cache{urls: make(map[string]bool)}
wg.Add(1)
go Crawl(url, depth, fetcher, &cache, &wg)
wg.Wait()
}
func main() {
SyncCrawl("https://golang.org/", 4, fetcher)
fmt.Println("done")
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
package main
import (
"fmt"
"sync"
"time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type safemap struct {
m sync.Mutex
mm map[string]bool
}
var safe safemap = safemap{mm:make(map[string]bool)}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
safe.m.Lock()
safe.mm[url] = true
safe.m.Unlock()
// This implementation doesn't do either:
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
safe.m.Lock()
_, visited := safe.mm[u]
safe.m.Unlock()
if !visited {
go Crawl(u, depth-1, fetcher)
}
}
return
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
time.Sleep(time.Second)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
could you guys give me some feedback on this code?
here is my answer, using channels.
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
Fetch(url string) (body string, urls []string, err error)
}
type SafeCounter struct {
mu sync.Mutex
v map[string]bool
}
func (c *SafeCounter) Inc(key string) {
c.mu.Lock()
defer c.mu.Unlock()
c.v[key] = true
}
func (c *SafeCounter) Visited(key string) bool {
c.mu.Lock()
defer c.mu.Unlock()
return c.v[key]
}
func Crawl(url string, depth int, fetcher Fetcher, ch chan string, wg *sync.WaitGroup, c *SafeCounter) error {
defer func() {
if r := recover(); r != nil {
fmt.Println("Recovered in Crawl:", r)
}
}()
if depth <= 0 || c.Visited(url) {
return nil
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
return err
}
//body, urls, err := fetcher.Fetch(url)
//fmt.Printf("found: %s %q\n", url, body)
wg.Add(len(urls)) // Add the number of URLs to crawl before starting to crawl them
ch <- url + " " + "'" + body + "'"
c.Inc(url)
for _, u := range urls {
go func(u string) {
defer wg.Done()
Crawl(u, depth-1, fetcher, ch, wg, c)
}(u)
}
return nil
}
func main() {
ch := make(chan string)
c := SafeCounter{v: make(map[string]bool)}
var wg sync.WaitGroup
// Increment the WaitGroup before starting the crawl
wg.Add(1)
go func() {
defer wg.Done()
if err := Crawl("https://golang.org/", 4, fetcher, ch, &wg, &c); err != nil {
fmt.Println(err)
}
}()
go func() {
wg.Wait()
close(ch)
}()
for result := range ch {
fmt.Printf("found: %s \n", result)
}
wg.Wait() // Wait for all goroutines to finish before exiting
//fmt.Println(c.v)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Here's my solution. Pretty neat imo!
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type SafeMap struct {
mu sync.Mutex
m map[string]string
}
func (sm *SafeMap) addCache(url string) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.m[url] = url
}
func (sm *SafeMap) checkCache(url string) bool {
sm.mu.Lock()
defer sm.mu.Unlock()
_, ok := sm.m[url]
return ok
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, sm *SafeMap) {
defer wg.Done()
if depth <= 0 {
return
}
if sm.checkCache(url) {
return
}
sm.addCache(url)
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher, sm)
}
}
var wg sync.WaitGroup
func main() {
sm := SafeMap{m: make(map[string]string)}
wg.Add(1)
Crawl("https://golang.org/", 4, fetcher, &sm)
wg.Wait()
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}