Identify broken links based on a GitHub directory listing.
// Based on https://github.com/PuerkitoBio/gocrawl/blob/master/examples_test.go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"time"

	"github.com/PuerkitoBio/gocrawl"
	"github.com/PuerkitoBio/goquery"
	"github.com/mewkiz/pkg/errutil"
)
// Only enqueue the paths related to the diary directory.
var rxOk = regexp.MustCompile(`https?://github\.com/xh3b4sd/anna/(tree|blob)/master/doc/diary.*?$`)
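// For example, the whitelist admits the listing itself and file views under
// doc/diary (the second path below is hypothetical):
//
//	https://github.com/xh3b4sd/anna/tree/master/doc/diary
//	https://github.com/xh3b4sd/anna/blob/master/doc/diary/2016/04/27.md
//
// and rejects anything outside it, e.g. https://github.com/xh3b4sd/anna/issues.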
// Create the Extender implementation, based on the gocrawl-provided
// DefaultExtender, because we don't want/need to override all methods.
type Extender struct {
	gocrawl.DefaultExtender // Will use the default implementation of all but Visit and Filter.
}
// Visit mirrors each crawled page to disk and lets gocrawl enqueue the links
// it finds.
func (x *Extender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	s, err := doc.Html()
	if err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Derive a local path from the URL; directory listings (no file
	// extension) are stored as index.html.
	url := filepath.Join(doc.Url.Host, doc.Url.Path)
	dir := url
	name := "index.html"
	if ext := filepath.Ext(url); len(ext) > 0 {
		dir, name = filepath.Split(url)
	}
	path := filepath.Join(dir, name)
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatal(errutil.Err(err))
	}
	if err := ioutil.WriteFile(path, []byte(s), 0644); err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Return nil and true - let gocrawl find the links.
	return nil, true
}
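// For example (the .md path is hypothetical), visiting
//
//	https://github.com/xh3b4sd/anna/blob/master/doc/diary/2016/04/27.md
//
// mirrors the page HTML to
//
//	github.com/xh3b4sd/anna/blob/master/doc/diary/2016/04/27.md
//
// under the working directory, while the extension-less directory listing at
//
//	https://github.com/xh3b4sd/anna/tree/master/doc/diary
//
// is stored as github.com/xh3b4sd/anna/tree/master/doc/diary/index.html.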
// notFound caches the probe result per URL: true means the URL answered 404.
var notFound = make(map[string]bool)
// Filter probes each candidate link once, reports those that answer 404, and
// only lets gocrawl enqueue unvisited URLs within the diary directory.
func (x *Extender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	url := ctx.NormalizedURL().String()
	if _, ok := notFound[url]; !ok {
		resp, err := http.Get(url)
		if err != nil {
			fmt.Printf("GET ERROR; %v (from %s)\n", err, ctx.SourceURL())
			return !isVisited && rxOk.MatchString(url)
		}
		defer resp.Body.Close()
		notFound[url] = resp.StatusCode == http.StatusNotFound
	}
	if notFound[url] {
		fmt.Printf("NOT FOUND: %s (from %s)\n", url, ctx.SourceURL())
	}
	return !isVisited && rxOk.MatchString(url)
}
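// isNotFound is a lighter-weight alternative probe (a sketch added for
// illustration, not part of the original gist): it issues a HEAD request so
// the response body is never transferred. Assumption: the server answers
// HEAD with the same status code it would give GET.
func isNotFound(url string) (bool, error) {
	resp, err := http.Head(url)
	if err != nil {
		return false, err
	}
	resp.Body.Close()
	return resp.StatusCode == http.StatusNotFound, nil
}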
func main() {
	// Set custom options.
	opts := gocrawl.NewOptions(new(Extender))

	// Always set a robot name so that gocrawl looks for the most specific
	// rules possible in robots.txt.
	opts.RobotUserAgent = "CCBot"
	// Reflect that in the user-agent string used to make requests, ideally
	// with a link so site owners can contact you if there's an issue.
	opts.UserAgent = "CCBot"

	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = gocrawl.LogAll
	opts.WorkerIdleTTL = 0

	// Create the crawler and start at the root of the diary directory.
	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("https://github.com/xh3b4sd/anna/tree/master/doc/diary")
}
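To try it out (assuming the file is saved as main.go and a GOPATH-era Go toolchain, matching the 2016 vintage of this gist):

    go get github.com/PuerkitoBio/gocrawl github.com/PuerkitoBio/goquery github.com/mewkiz/pkg/errutil
    go run main.go

Broken links are reported on stdout as NOT FOUND: <url> (from <source>) lines, and each visited page is mirrored under github.com/ in the working directory.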