Created
May 30, 2020 03:23
-
-
Save ahmadrosid/070a9c333413a63f991c0d35a3eaea1a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"crypto/tls" | |
"flag" | |
"fmt" | |
"github.com/jackdanger/collectlinks" | |
"net/http" | |
"net/url" | |
"os" | |
"strings" | |
) | |
// usage writes the command synopsis and the registered flag defaults to
// standard error, then terminates the process with exit status 2.
// It is installed as flag.Usage by main.
func usage() {
	const synopsis = "usage: crawl http://example.com/path/file.html\n"

	fmt.Fprint(os.Stderr, synopsis)
	flag.PrintDefaults()
	os.Exit(2)
}
// host describes the crawl scope derived from the start URL given on the
// command line. main assigns it once before any worker goroutine is started,
// and fixUrl reads it to drop links that fall outside that scope.
var host string
func main() { | |
flag.Usage = usage | |
flag.Parse() | |
args := flag.Args() | |
fmt.Println(args) | |
if len(args) < 1 { | |
usage() | |
fmt.Println("Please specify start page") | |
os.Exit(1) | |
} | |
input := args[0] | |
uri, err := url.ParseRequestURI(input) | |
if err != nil { | |
fmt.Println("Invalid URL!", input) | |
return | |
} | |
host = uri.String() | |
queue := make(chan string) | |
filteredQueue := make(chan string) | |
go func() { queue <- args[0] }() | |
go filterQueue(queue, filteredQueue) | |
// introduce a bool channel to synchronize execution of concurrently running crawlers | |
done := make(chan bool) | |
// pull from the filtered queue, add to the unfiltered queue | |
for i := 0; i < 5; i++ { | |
go func() { | |
for uri := range filteredQueue { | |
enqueue(uri, queue) | |
} | |
done <- true | |
}() | |
} | |
<-done | |
} | |
// filterQueue forwards each distinct value received on in to out, in arrival
// order, dropping any value it has already forwarded. It blocks until in is
// closed and then closes out, so consumers ranging over out terminate too.
//
// filterQueue is the only sender on out and must be the only closer.
func filterQueue(in chan string, out chan string) {
	defer close(out)

	seen := make(map[string]struct{})
	for val := range in {
		if _, dup := seen[val]; dup {
			continue
		}
		seen[val] = struct{}{}
		out <- val
	}
}
func enqueue(uri string, queue chan string) { | |
if uri == "" { | |
return | |
} | |
fmt.Println("fetching", uri) | |
transport := &http.Transport{ | |
TLSClientConfig: &tls.Config{ | |
InsecureSkipVerify: true, | |
}, | |
} | |
client := http.Client{Transport: transport} | |
resp, err := client.Get(uri) | |
if err != nil { | |
fmt.Println(err) | |
return | |
} | |
defer resp.Body.Close() | |
links := collectlinks.All(resp.Body) | |
for _, link := range links { | |
absolute := fixUrl(link, uri) | |
if uri != "" { | |
go func() { queue <- absolute }() | |
} | |
} | |
} | |
func fixUrl(href, base string) string { | |
uri, err := url.Parse(href) | |
if err != nil { | |
return "" | |
} | |
baseUrl, err := url.Parse(base) | |
if err != nil { | |
return "" | |
} | |
uri = baseUrl.ResolveReference(uri) | |
if strings.Contains(uri.String(), host) { | |
return uri.String() | |
} | |
return "" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment