Website crawler looking for missing IMG tag ALT attributes on all linked pages (recursively)
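To try it, set BaseURL below to the site to scan (no trailing slash, per the comment in the code) and run the file with go run.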
package main

import (
	"errors"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"sync"
)

// Base URL. No trailing slash.
const BaseURL = "http://???"

// Base path.
const BasePath = "/"

// Fetcher struct to execute the main fetching operation on.
type Fetcher struct {
	Base    *url.URL
	History map[string]bool
}

// Mutex guarding the shared crawl state (history map, queue, log output).
var mutex sync.Mutex

// FetchRecursive is the kick-off for the crawl: it processes the queue in
// waves, fetching every path of the current wave concurrently.
func (f *Fetcher) FetchRecursive() {
	q := []string{BasePath}
	var wg sync.WaitGroup
	for len(q) > 0 {
		wg.Add(len(q))
		currentQ := q
		q = []string{}
		for _, currentPath := range currentQ {
			go func(currentPath string) {
				defer wg.Done()
				found := f.Fetch(currentPath)
				// The queue is shared between goroutines; guard the append.
				mutex.Lock()
				q = append(q, found...)
				mutex.Unlock()
			}(currentPath)
		}
		wg.Wait()
	}
}

// Fetch is executing one resource fetch.
func (f *Fetcher) Fetch(currentPath string) []string {
	// Check and mark the path in a single critical section so that two
	// goroutines cannot both claim an unseen path.
	mutex.Lock()
	seen := f.History[currentPath]
	if !seen {
		f.History[currentPath] = true
	}
	mutex.Unlock()
	if seen {
		// Seen this path before.
		return nil
	}

	currentURL := BaseURL + currentPath
	cr, err := fetch(currentURL)
	if err != nil {
		log.Println(err)
		return nil
	}

	cr.AnalyzeAltTags()
	return cr.CollectReferences()
}

// NewFetcher is a constructor for Fetcher.
func NewFetcher(path string) *Fetcher {
	u, err := url.Parse(path)
	if err != nil {
		log.Panicln("Base path is not a valid URL:", path)
	}
	f := &Fetcher{Base: u}
	f.History = make(map[string]bool)
	return f
}

// CrawlResult is the type to store result data.
type CrawlResult struct {
	Path       string
	Response   *http.Response
	BodyCached string
}

// NewCrawlResult creates a new CrawlResult object.
func NewCrawlResult(path string, response *http.Response) *CrawlResult {
	cr := &CrawlResult{
		Path:     path,
		Response: response,
	}
	return cr
}

// GetBody returns the body string of the content, reading and caching it on
// first use.
func (cr *CrawlResult) GetBody() string {
	if len(cr.BodyCached) != 0 {
		return cr.BodyCached
	}
	body, err := ioutil.ReadAll(cr.Response.Body)
	// The response body must be closed to avoid leaking connections.
	cr.Response.Body.Close()
	if err != nil {
		log.Println("Cannot read body for:", cr.Path)
		return ""
	}
	cr.BodyCached = string(body)
	return cr.BodyCached
}

// AnalyzeAltTags will do the ALT attribute check and print out the findings.
func (cr *CrawlResult) AnalyzeAltTags() {
	reImg, err := regexp.Compile("<img[^>]*>")
	if err != nil {
		log.Panicln("Cannot create regular expression")
	}
	imgs := reImg.FindAllString(cr.GetBody(), -1)
	reAlt, err := regexp.Compile("alt=\"[^\"]+\"")
	if err != nil {
		log.Panicln("Cannot create regular expression")
	}
	out := "Analyzing " + strconv.Itoa(len(imgs)) + " tags on page " + cr.Path
	for _, img := range imgs {
		if !reAlt.MatchString(img) {
			out += "\nMissing ALT attribute on page: " + cr.Path + " In tag: " + img
		}
	}
	// Serialize logging so concurrent goroutines do not interleave reports.
	mutex.Lock()
	log.Println(out + "\n")
	mutex.Unlock()
}

// GetSourceHost returns the host part of the crawled URL.
func (cr *CrawlResult) GetSourceHost() string {
	u, err := url.Parse(cr.Path)
	if err != nil {
		return ""
	}
	return u.Host
}

// CollectReferences will return all the referred site paths.
func (cr *CrawlResult) CollectReferences() []string {
	reA, err := regexp.Compile("<a[^>]+href=\"((/|" + cr.GetSourceHost() + ")[^\"#]*)(|#.*)\"")
	if err != nil {
		log.Panicln("Cannot create regular expression")
	}
	// Capture group 1 holds the referred path without the fragment part.
	matches := reA.FindAllStringSubmatch(cr.GetBody(), -1)
	var paths []string
	for _, match := range matches {
		paths = append(paths, match[1])
	}
	return paths
}

func main() {
	f := NewFetcher(BaseURL)
	f.FetchRecursive()
}

func fetch(path string) (*CrawlResult, error) {
	resp, err := http.Get(path)
	if err != nil {
		return nil, errors.New("could not fetch resource: " + path)
	}
	return NewCrawlResult(path, resp), nil
}