Website crawler looking for missing IMG tag ALT attributes on all linked pages (recursively)
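To run it, set BaseURL below to the site to scan (no trailing slash) and execute the file with go run; findings for every fetched page are written to the standard logger.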
package main

import (
	"errors"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"sync"
)
// BaseURL is the base URL of the site to crawl. No trailing slash.
const BaseURL = "http://???"

// BasePath is the path the crawl starts from.
const BasePath = "/"
// Fetcher executes the main fetching operation and remembers visited paths.
type Fetcher struct {
	Base    *url.URL
	History map[string]bool
}

// mutex guards the shared state (the queue and the history map) and
// serializes log output.
var mutex sync.Mutex
// FetchRecursive kicks off the crawl. It works in waves: every path in the
// current queue is fetched concurrently, and the links found form the next queue.
func (f *Fetcher) FetchRecursive() {
	q := []string{BasePath}
	var wg sync.WaitGroup
	for len(q) > 0 {
		currentQ := q
		q = []string{}
		wg.Add(len(currentQ))
		for _, currentPath := range currentQ {
			go func(currentPath string) {
				defer wg.Done()
				refs := f.Fetch(currentPath)
				// q is shared between the worker goroutines; guard the append.
				mutex.Lock()
				q = append(q, refs...)
				mutex.Unlock()
			}(currentPath)
		}
		wg.Wait()
	}
}
// Fetch retrieves one resource, analyzes it, and returns the paths it links to.
func (f *Fetcher) Fetch(currentPath string) []string {
	// Check and mark the path in a single critical section so that two
	// goroutines cannot both claim an unseen path.
	mutex.Lock()
	if f.History[currentPath] {
		mutex.Unlock()
		// Seen this path before.
		return nil
	}
	f.History[currentPath] = true
	mutex.Unlock()

	currentURL := BaseURL + currentPath
	cr, err := fetch(currentURL)
	if err != nil {
		log.Println(err)
		return nil
	}
	cr.AnalyzeAltTags()
	return cr.CollectReferences()
}
// NewFetcher is a constructor for Fetcher.
func NewFetcher(path string) *Fetcher {
	u, err := url.Parse(path)
	if err != nil {
		log.Panicln("Base path is not a valid URL:", path)
	}
	return &Fetcher{
		Base:    u,
		History: make(map[string]bool),
	}
}
// CrawlResult stores the result data of a single fetch.
type CrawlResult struct {
	Path       string
	Response   *http.Response
	BodyCached string
}

// NewCrawlResult creates a new CrawlResult object.
func NewCrawlResult(path string, response *http.Response) *CrawlResult {
	return &CrawlResult{
		Path:     path,
		Response: response,
	}
}
// GetBody returns the response body as a string, caching it so the body
// reader is only consumed once.
func (cr *CrawlResult) GetBody() string {
	if len(cr.BodyCached) != 0 {
		return cr.BodyCached
	}
	defer cr.Response.Body.Close()
	body, err := ioutil.ReadAll(cr.Response.Body)
	if err != nil {
		log.Println("Cannot read body for:", cr.Path)
		return ""
	}
	cr.BodyCached = string(body)
	return cr.BodyCached
}
// AnalyzeAltTags checks every IMG tag for an ALT attribute and logs the findings.
func (cr *CrawlResult) AnalyzeAltTags() {
	reImg := regexp.MustCompile("<img[^>]*>")
	// A missing and an empty alt attribute both count as a finding.
	reAlt := regexp.MustCompile("alt=\"[^\"]+\"")
	imgs := reImg.FindAllString(cr.GetBody(), -1)
	out := "Analyzing " + strconv.Itoa(len(imgs)) + " tags on page " + cr.Path
	for _, img := range imgs {
		if !reAlt.MatchString(img) {
			out += "\nMissing ALT attribute on page: " + cr.Path + " In tag: " + img
		}
	}
	// Serialize logging so concurrent goroutines do not interleave reports.
	mutex.Lock()
	log.Println(out + "\n")
	mutex.Unlock()
}
// GetSourceHost returns the host of the fetched resource.
func (cr *CrawlResult) GetSourceHost() string {
	u, err := url.Parse(cr.Path)
	if err != nil {
		return ""
	}
	return u.Host
}
// CollectReferences returns all same-site paths referenced by anchor tags.
func (cr *CrawlResult) CollectReferences() []string {
	// Capture group 1 is an href that is either an absolute path or points
	// to this host, with any #fragment stripped.
	reA := regexp.MustCompile("<a[^>]+href=\"((/|" + cr.GetSourceHost() + ")[^\"#]*)(|#.*)\"")
	matches := reA.FindAllStringSubmatch(cr.GetBody(), -1)
	var paths []string
	for _, match := range matches {
		paths = append(paths, match[1])
	}
	return paths
}
func main() {
	f := NewFetcher(BaseURL)
	f.FetchRecursive()
}
// fetch performs the HTTP GET and wraps the response in a CrawlResult.
func fetch(path string) (*CrawlResult, error) {
	resp, err := http.Get(path)
	if err != nil {
		return nil, errors.New("could not fetch resource: " + path + ": " + err.Error())
	}
	return NewCrawlResult(path, resp), nil
}
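
To try the ALT checker without pointing it at a live site, one option is a small smoke test served from net/http/httptest. This is a hypothetical sketch, not part of the original gist; it assumes the code above is in package main and that this file is saved next to it as, say, main_test.go:

package main

import (
	"net/http"
	"net/http/httptest"
	"testing"
)

// TestAnalyzeAltTags serves a fixture page with one compliant and one
// non-compliant IMG tag, fetches it, and runs the ALT analysis. Expect a
// single "Missing ALT attribute" line in the log output, for b.png.
func TestAnalyzeAltTags(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(`<img src="a.png" alt="described"><img src="b.png">`))
	}))
	defer srv.Close()

	cr, err := fetch(srv.URL)
	if err != nil {
		t.Fatal(err)
	}
	cr.AnalyzeAltTags()
}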