Scrape and test links from files using Golang
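The program below walks a directory of saved HTML files, tokenizes each one with golang.org/x/net/html to collect every href that starts with http, then issues a GET request against each unique URL concurrently and prints the sorted list of URLs that responded without an error status.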
package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"

	"golang.org/x/net/html"
)
// Helper function to pull the href attribute from a Token
func getHref(t html.Token) (ok bool, href string) {
	// Iterate over all of the Token's attributes until we find an "href"
	for _, a := range t.Attr {
		if a.Key == "href" {
			href = a.Val
			ok = true
		}
	}
	// "bare" return will return the variables (ok, href) as defined in
	// the function definition
	return
}
// Extract all http** links from a given webpage
func crawlPage(url string, ch chan string, chFinished chan bool) {
	resp, err := http.Get(url)
	defer func() {
		// Notify that we're done after this function
		chFinished <- true
	}()
	if err != nil {
		fmt.Println("ERROR: Failed to crawl \"" + url + "\"")
		return
	}
	b := resp.Body
	defer b.Close() // close Body when the function returns
	parse(b, ch)
}
// Extract all http** links from a file
func readFile(file string, ch chan string, chFinished chan bool) {
	reader, err := os.Open(file)
	defer func() {
		// Notify that we're done after this function
		chFinished <- true
	}()
	if err != nil {
		// log.Fatal would exit the whole program from inside a goroutine
		// and skip the deferred chFinished send; log the error and move on.
		log.Println(err)
		return
	}
	defer reader.Close()
	parse(reader, ch)
}
// Issue a GET request against a URL and report it on ch if it
// responds with a non-error status.
func testStatus(url string, ch chan string, chFinished chan bool) {
	resp, err := http.Get(url)
	defer func() {
		// Notify that we're done after this function
		chFinished <- true
	}()
	if err != nil {
		fmt.Printf("ERROR: Failed to check %v %v\n", url, err)
		return
	}
	defer resp.Body.Close() // close Body when the function returns
	// >= 400 covers every client and server error, including 400 itself
	if resp.StatusCode >= 400 {
		//fmt.Printf("ERROR: Not good %v %v\n", url, resp.Status)
		return
	}
	ch <- url
}
// Walk the HTML token stream and send every http** link found
// in an <a> tag down ch.
func parse(r io.Reader, ch chan string) {
	z := html.NewTokenizer(r)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			// End of the document, we're done
			return
		case html.StartTagToken:
			t := z.Token()
			// Check if the token is an <a> tag
			if t.Data != "a" {
				continue
			}
			// Extract the href value, if there is one
			ok, url := getHref(t)
			if !ok {
				continue
			}
			// Make sure the url begins with http**
			if strings.HasPrefix(url, "http") {
				ch <- url
			}
		}
	}
}
// Read every file in dir, collect the links they contain, and
// return the set of unique URLs.
func readFiles(dir string) map[string]bool {
	foundUrls := make(map[string]bool)
	// Channels
	chUrls := make(chan string)
	chFinished := make(chan bool)
	// read files from directory
	files, err := ioutil.ReadDir(dir)
	if err != nil {
		log.Fatal(err)
	}
	// Kick off the crawl process (concurrently)
	for _, file := range files {
		fmt.Println(file.Name())
		// filepath.Join works whether or not dir has a trailing slash
		go readFile(filepath.Join(dir, file.Name()), chUrls, chFinished)
	}
	// Subscribe to both channels
	for c := 0; c < len(files); {
		select {
		case url := <-chUrls:
			foundUrls[url] = true
		case <-chFinished:
			c++
		}
	}
	close(chUrls)
	close(chFinished)
	// We're done! Print the results...
	fmt.Printf("\nFound %d unique urls:\n\n", len(foundUrls))
	return foundUrls
}
// Check every URL concurrently and return the set that responded
// without an error status.
func testUrls(urls []string) map[string]bool {
	// Unbuffered channels, as in readFiles: with a buffered channel a
	// URL could still be sitting in the buffer when the last "finished"
	// signal is counted, and it would be silently dropped.
	chUrls := make(chan string)
	chFinished := make(chan bool)
	for _, url := range urls {
		go func(v string) {
			testStatus(v, chUrls, chFinished)
		}(url)
	}
	goodUrls := make(map[string]bool)
	for c := 0; c < len(urls); {
		select {
		case url := <-chUrls:
			goodUrls[url] = true
		case <-chFinished:
			c++
		}
	}
	fmt.Printf("\nFound %d good urls:\n\n", len(goodUrls))
	close(chUrls)
	close(chFinished)
	return goodUrls
}
func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: pass the directory of HTML files as the first argument")
	}
	directory := os.Args[1]
	foundUrls := readFiles(directory)
	list := make([]string, len(foundUrls))
	i := 0
	for k := range foundUrls {
		list[i] = k
		i++
	}
	sort.Strings(list)
	goodUrls := testUrls(list)
	list = make([]string, len(goodUrls))
	i = 0
	for k := range goodUrls {
		list[i] = k
		i++
	}
	sort.Strings(list)
	for _, url := range list {
		fmt.Println(url)
	}
}
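A minimal way to try it, assuming the source above is saved as main.go and ./pages is a directory of saved HTML files (both names are hypothetical):

go get golang.org/x/net/html
go run main.go ./pages

Both phases reuse the same fan-in pattern: each worker goroutine sends results on one channel and signals completion on a second, and the caller selects on both until every worker has reported in.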