@raedatoui
Last active January 10, 2017 17:56
Scrape and test links from files using Golang
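The program takes a single argument, the directory containing the HTML files to scan, collects every http(s) link found in them, checks each link with an HTTP GET, and prints the ones that respond without an error status. Assuming the source below is saved as main.go and the golang.org/x/net/html package is available, it can be run with something like go run main.go ./pages/ (the ./pages/ path is only a placeholder).
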
package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"

	"golang.org/x/net/html"
)

// Helper function to pull the href attribute from a Token
func getHref(t html.Token) (ok bool, href string) {
	// Iterate over all of the Token's attributes until we find an "href"
	for _, a := range t.Attr {
		if a.Key == "href" {
			href = a.Val
			ok = true
		}
	}
	// "Bare" return returns the named result values (ok, href) declared in
	// the function signature
	return
}

// Extract all http(s) links from a given webpage
func crawlPage(url string, ch chan string, chFinished chan bool) {
	resp, err := http.Get(url)
	defer func() {
		// Notify that we're done after this function returns
		chFinished <- true
	}()
	if err != nil {
		fmt.Println("ERROR: Failed to crawl \"" + url + "\"")
		return
	}
	b := resp.Body
	defer b.Close() // close Body when the function returns
	parse(b, ch)
}

// Extract all http(s) links from a file
func readFile(file string, ch chan string, chFinished chan bool) {
	reader, err := os.Open(file)
	defer func() {
		// Notify that we're done after this function returns
		chFinished <- true
	}()
	if err != nil {
		// Don't log.Fatal here: this runs in a goroutine, and exiting would
		// skip the deferred send on chFinished and kill the whole program.
		log.Println(err)
		return
	}
	defer reader.Close()
	parse(reader, ch)
}

func testStatus(url string, ch chan string, chFinished chan bool) {
	resp, err := http.Get(url)
	defer func() {
		// Notify that we're done after this function returns
		chFinished <- true
	}()
	if err != nil {
		fmt.Printf("ERROR: Failed to check %v %v\n", url, err)
		return
	}
	defer resp.Body.Close() // close Body when the function returns
	if resp.StatusCode >= 400 {
		// Treat 4xx and 5xx responses as bad links
		//fmt.Printf("ERROR: Not good %v %v\n", url, resp.Status)
		return
	}
	ch <- url
}

func parse(r io.Reader, ch chan string) {
	z := html.NewTokenizer(r)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			// End of the document, we're done
			return
		case html.StartTagToken:
			t := z.Token()
			// Check if the token is an <a> tag
			if t.Data != "a" {
				continue
			}
			// Extract the href value, if there is one
			ok, url := getHref(t)
			if !ok {
				continue
			}
			// Make sure the url begins with http(s)
			if strings.HasPrefix(url, "http") {
				ch <- url
			}
		}
	}
}

func readFiles(dir string) map[string]bool {
	foundUrls := make(map[string]bool)
	// Channels
	chUrls := make(chan string)
	chFinished := make(chan bool)
	// Read the files in the directory
	files, err := ioutil.ReadDir(dir)
	if err != nil {
		log.Fatal(err)
	}
	// Kick off the parsing (concurrently, one goroutine per file)
	for _, file := range files {
		fmt.Println(file.Name())
		go readFile(filepath.Join(dir, file.Name()), chUrls, chFinished)
	}
	// Subscribe to both channels until every goroutine has reported back
	for c := 0; c < len(files); {
		select {
		case url := <-chUrls:
			foundUrls[url] = true
		case <-chFinished:
			c++
		}
	}
	close(chUrls)
	close(chFinished)
	// We're done! Print the results...
	fmt.Println("\nFound", len(foundUrls), "unique urls:\n")
	return foundUrls
}

func testUrls(urls []string) map[string]bool {
	// Buffered channels let a handful of checkers report without waiting on the receiver
	chUrls := make(chan string, 5)
	chFinished := make(chan bool, 5)
	for _, url := range urls {
		go func(v string) {
			testStatus(v, chUrls, chFinished)
		}(url)
	}
	goodUrls := make(map[string]bool)
	for c := 0; c < len(urls); {
		select {
		case url := <-chUrls:
			goodUrls[url] = true
		case <-chFinished:
			c++
		}
	}
	fmt.Println("\nFound", len(goodUrls), "good urls:\n")
	close(chUrls)
	close(chFinished)
	return goodUrls
}

func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: pass a directory of html files as the only argument")
	}
	directory := os.Args[1]
	foundUrls := readFiles(directory)
	list := make([]string, len(foundUrls))
	i := 0
	for k := range foundUrls {
		list[i] = k
		i++
	}
	sort.Strings(list)
	goodUrls := testUrls(list)
	list = make([]string, len(goodUrls))
	i = 0
	for k := range goodUrls {
		list[i] = k
		i++
	}
	sort.Strings(list)
	for _, url := range list {
		fmt.Println(url)
	}
}
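
For reference, here is a minimal, self-contained sketch of the same tokenizer approach used by parse() and getHref() above, run against an in-memory HTML snippet instead of a file or HTTP response; the snippet and the printed URL are illustrative only.

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	// Hypothetical snippet; real input would come from a file or resp.Body
	doc := `<p><a href="https://example.com">one</a> <a href="/relative">two</a></p>`
	z := html.NewTokenizer(strings.NewReader(doc))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			return // end of the snippet
		}
		if tt != html.StartTagToken {
			continue
		}
		t := z.Token()
		if t.Data != "a" {
			continue
		}
		for _, a := range t.Attr {
			if a.Key == "href" && strings.HasPrefix(a.Val, "http") {
				fmt.Println(a.Val) // prints only https://example.com
			}
		}
	}
}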