Identify broken links based on a GitHub directory listing.
// Based on https://github.com/PuerkitoBio/gocrawl/blob/master/examples_test.go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"time"

	"github.com/PuerkitoBio/gocrawl"
	"github.com/PuerkitoBio/goquery"
	"github.com/mewkiz/pkg/errutil"
)
// Only enqueue the paths related to the diary directory.
var rxOk = regexp.MustCompile(`https?://github\.com/xh3b4sd/anna/(tree|blob)/master/doc/diary.*?$`)
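// For example, the whitelist admits the listing itself and file views under
// doc/diary (the second path below is hypothetical):
//
//	https://github.com/xh3b4sd/anna/tree/master/doc/diary
//	https://github.com/xh3b4sd/anna/blob/master/doc/diary/2016/04/27.md
//
// and rejects anything outside it, e.g. https://github.com/xh3b4sd/anna/issues.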
// Create the Extender implementation, based on the gocrawl-provided
// DefaultExtender, because we don't want/need to override all methods.
type Extender struct {
	gocrawl.DefaultExtender // Will use the default implementation of all but Visit and Filter.
}
// Visit mirrors each crawled page to disk and lets gocrawl enqueue the links
// it finds.
func (x *Extender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	s, err := doc.Html()
	if err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Derive a local path from the URL; directory listings (no file
	// extension) are stored as index.html.
	url := filepath.Join(doc.Url.Host, doc.Url.Path)
	dir := url
	name := "index.html"
	if ext := filepath.Ext(url); len(ext) > 0 {
		dir, name = filepath.Split(url)
	}
	path := filepath.Join(dir, name)
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatal(errutil.Err(err))
	}
	if err := ioutil.WriteFile(path, []byte(s), 0644); err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Return nil and true - let gocrawl find the links.
	return nil, true
}
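// For example (the .md path is hypothetical), visiting
//
//	https://github.com/xh3b4sd/anna/blob/master/doc/diary/2016/04/27.md
//
// mirrors the page HTML to
//
//	github.com/xh3b4sd/anna/blob/master/doc/diary/2016/04/27.md
//
// under the working directory, while the extension-less directory listing at
//
//	https://github.com/xh3b4sd/anna/tree/master/doc/diary
//
// is stored as github.com/xh3b4sd/anna/tree/master/doc/diary/index.html.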
// notFound caches the probe result per URL: true means the URL answered 404.
var notFound = make(map[string]bool)
// Filter probes each candidate link once, reports those that answer 404, and
// only lets gocrawl enqueue unvisited URLs within the diary directory.
func (x *Extender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	url := ctx.NormalizedURL().String()
	if _, ok := notFound[url]; !ok {
		resp, err := http.Get(url)
		if err != nil {
			fmt.Printf("GET ERROR; %v (from %s)\n", err, ctx.SourceURL())
			return !isVisited && rxOk.MatchString(url)
		}
		defer resp.Body.Close()
		notFound[url] = resp.StatusCode == http.StatusNotFound
	}
	if notFound[url] {
		fmt.Printf("NOT FOUND: %s (from %s)\n", url, ctx.SourceURL())
	}
	return !isVisited && rxOk.MatchString(url)
}
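// isNotFound is a lighter-weight alternative probe (a sketch added for
// illustration, not part of the original gist): it issues a HEAD request so
// the response body is never transferred. Assumption: the server answers
// HEAD with the same status code it would give GET.
func isNotFound(url string) (bool, error) {
	resp, err := http.Head(url)
	if err != nil {
		return false, err
	}
	resp.Body.Close()
	return resp.StatusCode == http.StatusNotFound, nil
}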
func main() {
	// Set custom options.
	opts := gocrawl.NewOptions(new(Extender))

	// Always set a robot name so that gocrawl looks for the most specific
	// rules possible in robots.txt.
	opts.RobotUserAgent = "CCBot"
	// Reflect that in the user-agent string used to make requests, ideally
	// with a link so site owners can contact you if there's an issue.
	opts.UserAgent = "CCBot"

	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = gocrawl.LogAll
	opts.WorkerIdleTTL = 0

	// Create the crawler and start at the root of the diary directory.
	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("https://github.com/xh3b4sd/anna/tree/master/doc/diary")
}
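To try it out (assuming the file is saved as main.go and a GOPATH-era Go toolchain, matching the 2016 vintage of this gist):

    go get github.com/PuerkitoBio/gocrawl github.com/PuerkitoBio/goquery github.com/mewkiz/pkg/errutil
    go run main.go

Broken links are reported on stdout as NOT FOUND: <url> (from <source>) lines, and each visited page is mirrored under github.com/ in the working directory.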