Skip to content

Instantly share code, notes, and snippets.

@crazyyi
Created September 25, 2017 03:34
Show Gist options
  • Save crazyyi/074c4b912221e6cbe707b5c5de4d5da2 to your computer and use it in GitHub Desktop.
Save crazyyi/074c4b912221e6cbe707b5c5de4d5da2 to your computer and use it in GitHub Desktop.
A Golang concurrent Web crawler solution using channels and a mutex. Inspired by the book "The Go Programming Language".
package main
import (
"fmt"
"log"
"net/http"
"os"
"strings"
"sync"
"golang.org/x/net/html"
)
// lock guards the visited map that crawl reads and writes.
var lock = sync.RWMutex{}
// main reads the start URL from the single command-line argument, crawls it
// and prints every link received from newCrawl followed by the total count.
//
// If the argument count is wrong it prints the usage line and exits with
// status 1 instead of falling through to os.Args[1] (which would panic when no
// argument is given). A scheme is only prepended when the argument has neither
// an "http://" nor an "https://" prefix.
func main() {
	if len(os.Args) != 2 {
		fmt.Println("Usage: crawl [URL].")
		os.Exit(1)
	}

	url := os.Args[1]
	// Do not turn "https://example.com" into "http://https://example.com".
	if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
		url = "http://" + url
	}

	n := 0
	for link := range newCrawl(url, 1) {
		n++
		fmt.Println(link)
	}
	fmt.Printf("Total links: %d\n", n)
}
// newCrawl launches a crawl of url in a background goroutine and returns the
// channel on which the links reported by crawl are delivered. The channel has
// a buffer of 20 and is closed once crawl has returned, so callers can simply
// range over it.
//
// NOTE(review): num is accepted but not used; the crawl depth passed to crawl
// is fixed at 3.
func newCrawl(url string, num int) chan string {
	links := make(chan string, 20)
	seen := map[string]bool{}

	go func() {
		crawl(url, 3, links, &seen)
		close(links)
	}()

	return links
}
func crawl(url string, n int, ch chan string, visited *map[string]bool) {
if n < 1 {
return
}
resp, err := http.Get(url)
if err != nil {
log.Fatalf("Can not reach the site. Error = %v\n", err)
os.Exit(1)
}
b := resp.Body
defer b.Close()
z := html.NewTokenizer(b)
nextN := n - 1
for {
token := z.Next()
switch token {
case html.ErrorToken:
return
case html.StartTagToken:
current := z.Token()
if current.Data != "a" {
continue
}
result, ok := getHrefTag(current)
if !ok {
continue
}
hasProto := strings.HasPrefix(result, "http")
if hasProto {
lock.RLock()
ok := (*visited)[result]
lock.RUnlock()
if ok {
continue
}
done := make(chan struct{})
go func() {
crawl(result, nextN, ch, visited)
close(done)
}()
<-done
lock.Lock()
(*visited)[result] = true
lock.Unlock()
ch <- result
}
}
}
}
// getHrefTag looks through the attributes of token and returns the value of
// the first one whose key is "href" together with ok == true. When the token
// carries no href attribute it returns the empty string and false.
func getHrefTag(token html.Token) (result string, ok bool) {
	for i := range token.Attr {
		if attr := token.Attr[i]; attr.Key == "href" {
			return attr.Val, true
		}
	}
	return "", false
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment