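// Command crawler is an example web crawler built on fetchbot. Starting from
// a seed URL, it fetches HTML pages with GET, extracts their links, and
// probes each new link with a cheap HEAD request before fetching it in full.
// Crawling is restricted to the seed's host, and stop/cancel behavior and
// periodic memory reporting are controlled by command-line flags.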
package main

import (
    "bytes"
    "flag"
    "fmt"
    "log"
    "net/http"
    "net/url"
    "runtime"
    "strings"
    "sync"
    "time"

    "github.com/PuerkitoBio/fetchbot"
    "github.com/PuerkitoBio/goquery"
)

var (
    // Protect access to dup
    mu sync.Mutex
    // Duplicates table
    dup = map[string]bool{}

    // Command-line flags
    seed        = flag.String("seed", "http://golang.org", "seed URL")
    cancelAfter = flag.Duration("cancelafter", 0, "automatically cancel the fetchbot after a given time")
    cancelAtURL = flag.String("cancelat", "", "automatically cancel the fetchbot at a given URL")
    stopAfter   = flag.Duration("stopafter", 0, "automatically stop the fetchbot after a given time")
    stopAtURL   = flag.String("stopat", "", "automatically stop the fetchbot at a given URL")
    memStats    = flag.Duration("memstats", 0, "display memory statistics at a given interval")
)
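
// Example invocations (the binary name "crawler" is illustrative; the flags
// are the ones defined above):
//
//    crawler -seed http://golang.org -stopafter 30s
//    crawler -seed http://golang.org -memstats 5s
//    crawler -seed http://golang.org -cancelat http://golang.org/pkg/
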
func main() {
    flag.Parse()

    // Parse the provided seed
    u, err := url.Parse(*seed)
    if err != nil {
        log.Fatal(err)
    }

    // Create the muxer
    mux := fetchbot.NewMux()

    // Handle all errors the same
    mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
    }))
    // Handle GET requests for HTML responses: parse the body and enqueue all
    // discovered links as HEAD requests.
    mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            // Process the body to find the links. Note that
            // NewDocumentFromResponse is deprecated in newer goquery
            // releases in favor of NewDocumentFromReader.
            doc, err := goquery.NewDocumentFromResponse(res)
            if err != nil {
                fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
                return
            }
            // Enqueue all links as HEAD requests
            enqueueLinks(ctx, doc)
        }))
    // Handle HEAD requests for HTML responses coming from the source host;
    // we don't want to crawl links from other hosts.
    mux.Response().Method("HEAD").Host(u.Host).ContentType("text/html").Handler(fetchbot.HandlerFunc(
        func(ctx *fetchbot.Context, res *http.Response, err error) {
            if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
                fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
            }
        }))
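
    // Together, the two handlers above implement a HEAD-before-GET handshake:
    // a GET on an HTML page enqueues every discovered link as a HEAD command,
    // and a HEAD response that is HTML and on the seed's host is re-enqueued
    // as a GET, which in turn discovers more links. Links to other hosts or
    // to non-HTML resources are abandoned after the cheap HEAD probe.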
    // Create the Fetcher, handle the logging first, then dispatch to the Muxer
    h := logHandler(mux)
    if *stopAtURL != "" || *cancelAtURL != "" {
        // -cancelat takes precedence over -stopat if both are set
        stopURL := *stopAtURL
        if *cancelAtURL != "" {
            stopURL = *cancelAtURL
        }
        h = stopHandler(stopURL, *cancelAtURL != "", h)
    }
    f := fetchbot.New(h)

    // First mem stat print must be right after creating the fetchbot
    if *memStats > 0 {
        // Print starting stats
        printMemStats(nil)
        // Run at regular intervals
        runMemStats(f, *memStats)
        // On exit, print ending stats after a GC
        defer func() {
            runtime.GC()
            printMemStats(nil)
        }()
    }
    // Start processing
    q := f.Start()

    // If a stop or cancel is requested after some duration, launch the
    // goroutine that will stop or cancel.
    if *stopAfter > 0 || *cancelAfter > 0 {
        after := *stopAfter
        stopFunc := q.Close
        if *cancelAfter != 0 {
            after = *cancelAfter
            stopFunc = q.Cancel
        }
        go func() {
            <-time.After(after)
            stopFunc()
        }()
    }
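
    // For reference: q.Close (used for -stopafter) lets already-enqueued
    // commands finish, a graceful stop, while q.Cancel (used for
    // -cancelafter) also drops pending commands, an immediate stop.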
    // Enqueue the seed, which is the first entry in the dup map
    dup[*seed] = true
    _, err = q.SendStringGet(*seed)
    if err != nil {
        fmt.Printf("[ERR] GET %s - %s\n", *seed, err)
    }
    // Block returns only once the queue has been closed (via the flags above
    // or the stop handler); without any of those, the crawl runs until the
    // process is interrupted.
    q.Block()
}
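
// runMemStats drains fetchbot's debug-info channel and prints memory
// statistics at each tick until the process exits.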
func runMemStats(f *fetchbot.Fetcher, tick time.Duration) {
    var mu sync.Mutex
    var di *fetchbot.DebugInfo

    // Start goroutine to collect fetchbot debug info
    go func() {
        for v := range f.Debug() {
            mu.Lock()
            di = v
            mu.Unlock()
        }
    }()
    // Start ticker goroutine to print mem stats at regular intervals
    go func() {
        c := time.Tick(tick)
        for range c {
            mu.Lock()
            printMemStats(di)
            mu.Unlock()
        }
    }()
}
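
// printMemStats writes the current runtime memory statistics, plus
// fetchbot's debug info when available, to stdout.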
func printMemStats(di *fetchbot.DebugInfo) {
    var mem runtime.MemStats
    runtime.ReadMemStats(&mem)
    buf := bytes.NewBuffer(nil)
    buf.WriteString(strings.Repeat("=", 72) + "\n")
    buf.WriteString("Memory Profile:\n")
    buf.WriteString(fmt.Sprintf("\tAlloc: %d KiB\n", mem.Alloc/1024))
    buf.WriteString(fmt.Sprintf("\tTotalAlloc: %d KiB\n", mem.TotalAlloc/1024))
    buf.WriteString(fmt.Sprintf("\tNumGC: %d\n", mem.NumGC))
    buf.WriteString(fmt.Sprintf("\tGoroutines: %d\n", runtime.NumGoroutine()))
    if di != nil {
        buf.WriteString(fmt.Sprintf("\tNumHosts: %d\n", di.NumHosts))
    }
    buf.WriteString(strings.Repeat("=", 72))
    fmt.Println(buf.String())
}
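
// Illustrative output of printMemStats (values vary by run; the 72-character
// separator is shortened here):
//
//    ================================
//    Memory Profile:
//        Alloc: 1024 KiB
//        TotalAlloc: 4096 KiB
//        NumGC: 3
//        Goroutines: 12
//        NumHosts: 1
//    ================================
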
// stopHandler stops the fetcher if the stopurl is reached. Otherwise it
// dispatches the call to the wrapped Handler.
func stopHandler(stopurl string, cancel bool, wrapped fetchbot.Handler) fetchbot.Handler {
    return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        if ctx.Cmd.URL().String() == stopurl {
            fmt.Printf(">>>>> STOP URL %s\n", ctx.Cmd.URL())
            // It is generally not a good idea to stop/block from a handler
            // goroutine, so do it in a separate goroutine.
            go func() {
                if cancel {
                    ctx.Q.Cancel()
                } else {
                    ctx.Q.Close()
                }
            }()
            return
        }
        wrapped.Handle(ctx, res, err)
    })
}

// logHandler prints the fetch information and dispatches the call to the
// wrapped Handler.
func logHandler(wrapped fetchbot.Handler) fetchbot.Handler {
    return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
        if err == nil {
            fmt.Printf("[%d] %s %s - %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL(), res.Header.Get("Content-Type"))
        }
        wrapped.Handle(ctx, res, err)
    })
}

func enqueueLinks(ctx *fetchbot.Context, doc *goquery.Document) {
    mu.Lock()
    defer mu.Unlock()
    doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
        val, _ := s.Attr("href")
        // Resolve the address relative to the page that linked to it
        u, err := ctx.Cmd.URL().Parse(val)
        if err != nil {
            fmt.Printf("error: resolve URL %s - %s\n", val, err)
            return
        }
        if !dup[u.String()] {
            if _, err := ctx.Q.SendStringHead(u.String()); err != nil {
                fmt.Printf("error: enqueue head %s - %s\n", u, err)
            } else {
                dup[u.String()] = true
            }
        }
    })
}
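
// Note on locking: enqueueLinks holds mu for the entire document scan, which
// also serializes the SendStringHead calls. That keeps the dup map simple at
// the cost of holding the lock across enqueue I/O; a larger crawler would
// likely narrow the critical section to the dup-map check and update.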