Website crawler looking for missing IMG tag ALT attributes on all linked pages (recursively)
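To try it, set BaseURL below to the site to scan (no trailing slash, per the comment in the code) and run the file with go run.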
package main

import (
	"errors"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"sync"
)

// Base URL. No trailing slash.
const BaseURL = "http://???"

// Base path.
const BasePath = "/"

// Fetcher struct to execute the main fetching operation on.
type Fetcher struct {
	Base    *url.URL
	History map[string]bool
}

// Mutex guarding the shared crawl state (history map, queue, log output).
var mutex sync.Mutex

// FetchRecursive is the kick-off for the crawl: it processes the queue in
// waves, fetching every path of the current wave concurrently.
func (f *Fetcher) FetchRecursive() {
	q := []string{BasePath}
	var wg sync.WaitGroup
	for len(q) > 0 {
		wg.Add(len(q))
		currentQ := q
		q = []string{}
		for _, currentPath := range currentQ {
			go func(currentPath string) {
				defer wg.Done()
				found := f.Fetch(currentPath)
				// The queue is shared between goroutines; guard the append.
				mutex.Lock()
				q = append(q, found...)
				mutex.Unlock()
			}(currentPath)
		}
		wg.Wait()
	}
}

// Fetch is executing one resource fetch.
func (f *Fetcher) Fetch(currentPath string) []string {
	// Check and mark the path in a single critical section so that two
	// goroutines cannot both claim an unseen path.
	mutex.Lock()
	seen := f.History[currentPath]
	if !seen {
		f.History[currentPath] = true
	}
	mutex.Unlock()
	if seen {
		// Seen this path before.
		return nil
	}

	currentURL := BaseURL + currentPath
	cr, err := fetch(currentURL)
	if err != nil {
		log.Println(err)
		return nil
	}

	cr.AnalyzeAltTags()
	return cr.CollectReferences()
}

// NewFetcher is a constructor for Fetcher.
func NewFetcher(path string) *Fetcher {
	u, err := url.Parse(path)
	if err != nil {
		log.Panicln("Base path is not a valid URL:", path)
	}
	f := &Fetcher{Base: u}
	f.History = make(map[string]bool)
	return f
}

// CrawlResult is the type to store result data.
type CrawlResult struct {
	Path       string
	Response   *http.Response
	BodyCached string
}

// NewCrawlResult creates a new CrawlResult object.
func NewCrawlResult(path string, response *http.Response) *CrawlResult {
	cr := &CrawlResult{
		Path:     path,
		Response: response,
	}
	return cr
}

// GetBody returns the body string of the content, reading and caching it on
// first use.
func (cr *CrawlResult) GetBody() string {
	if len(cr.BodyCached) != 0 {
		return cr.BodyCached
	}
	body, err := ioutil.ReadAll(cr.Response.Body)
	// The response body must be closed to avoid leaking connections.
	cr.Response.Body.Close()
	if err != nil {
		log.Println("Cannot read body for:", cr.Path)
		return ""
	}
	cr.BodyCached = string(body)
	return cr.BodyCached
}

// AnalyzeAltTags will do the ALT attribute check and print out the findings.
func (cr *CrawlResult) AnalyzeAltTags() {
	reImg, err := regexp.Compile("<img[^>]*>")
	if err != nil {
		log.Panicln("Cannot create regular expression")
	}
	imgs := reImg.FindAllString(cr.GetBody(), -1)
	reAlt, err := regexp.Compile("alt=\"[^\"]+\"")
	if err != nil {
		log.Panicln("Cannot create regular expression")
	}
	out := "Analyzing " + strconv.Itoa(len(imgs)) + " tags on page " + cr.Path
	for _, img := range imgs {
		if !reAlt.MatchString(img) {
			out += "\nMissing ALT attribute on page: " + cr.Path + " In tag: " + img
		}
	}
	// Serialize logging so concurrent goroutines do not interleave reports.
	mutex.Lock()
	log.Println(out + "\n")
	mutex.Unlock()
}

// GetSourceHost returns the host part of the crawled URL.
func (cr *CrawlResult) GetSourceHost() string {
	u, err := url.Parse(cr.Path)
	if err != nil {
		return ""
	}
	return u.Host
}

// CollectReferences will return all the referred site paths.
func (cr *CrawlResult) CollectReferences() []string {
	reA, err := regexp.Compile("<a[^>]+href=\"((/|" + cr.GetSourceHost() + ")[^\"#]*)(|#.*)\"")
	if err != nil {
		log.Panicln("Cannot create regular expression")
	}
	// Capture group 1 holds the referred path without the fragment part.
	matches := reA.FindAllStringSubmatch(cr.GetBody(), -1)
	var paths []string
	for _, match := range matches {
		paths = append(paths, match[1])
	}
	return paths
}

func main() {
	f := NewFetcher(BaseURL)
	f.FetchRecursive()
}

func fetch(path string) (*CrawlResult, error) {
	resp, err := http.Get(path)
	if err != nil {
		return nil, errors.New("could not fetch resource: " + path)
	}
	return NewCrawlResult(path, resp), nil
}