Skip to content

Instantly share code, notes, and snippets.

@jodoherty
Created October 6, 2017 21:07
Show Gist options
  • Save jodoherty/0f2b14dab8770dc0f473a063f955b020 to your computer and use it in GitHub Desktop.
Save jodoherty/0f2b14dab8770dc0f473a063f955b020 to your computer and use it in GitHub Desktop.
Recursive relative path downloader
package main
import (
"errors"
"fmt"
"golang.org/x/net/html"
"io"
"net/http"
"net/url"
"os"
"path"
"regexp"
"strings"
"sync"
)
// NumWorkers is the number of goroutines that consume entries from the work
// channel in main.
const NumWorkers = 8

// nonRelMatcher reports hrefs that are not plain relative links: ones that
// begin with two or more dots, ones where a run of non-colon characters and an
// optional colon is followed by "//" (such as "http://host" or "//host"), and
// ones that begin with "/".
var nonRelMatcher = regexp.MustCompile(`^(\.\.+|[^:]*:?//|/)`)

// EntryType distinguishes the two kinds of link found in a listing.
type EntryType int

const (
	// DirEntry marks a link that is fetched as a further listing.
	DirEntry EntryType = iota
	// FileEntry marks a link that is downloaded to disk.
	FileEntry
)

// Entry is one unit of work discovered while reading a listing.
type Entry struct {
	// EntryType selects how the entry is processed.
	EntryType EntryType
	// UrlString is the URL to request for this entry.
	UrlString string
	// OutDir is the local directory handed to the request function.
	OutDir string
}
// requestFile downloads urlString and stores the response body in outdir,
// creating outdir if necessary. The local file is named after the last element
// of the URL path.
//
// An existing file of the same name is truncated before writing so that a
// shorter download never leaves stale bytes at the end. Any response other
// than 200 OK is reported as an error and nothing is written for it.
//
// It returns the first error from parsing the URL, performing the request,
// creating the directory or file, copying the body, or closing the file.
func requestFile(urlString, outdir string) error {
	u, err := url.Parse(urlString)
	if err != nil {
		return err
	}
	// path.Base yields ".", ".." or "/" when the URL path has no usable final
	// element; none of those can name a regular file inside outdir.
	name := path.Base(u.Path)
	if name == "." || name == ".." || name == "/" {
		return fmt.Errorf("no file name in URL %s", urlString)
	}

	resp, err := http.Get(urlString)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status %s for %s", resp.Status, urlString)
	}

	// Only touch the filesystem once the server has agreed to send the file.
	if err := os.MkdirAll(outdir, 0755); err != nil {
		return err
	}
	f, err := os.OpenFile(
		path.Join(outdir, name),
		os.O_WRONLY|os.O_CREATE|os.O_TRUNC,
		0644,
	)
	if err != nil {
		return err
	}
	if _, err := io.Copy(f, resp.Body); err != nil {
		f.Close()
		return err
	}
	// A failed Close can mean buffered data never reached the disk.
	return f.Close()
}
// requestListing downloads the HTML page at urlString, treats it as a
// directory index, and sends one Entry on c for each relative link it finds.
//
// Every entry carries outdir/<last element of the URL path> as its OutDir. A
// link whose href contains a "." is sent as a FileEntry; any other link is
// sent as a DirEntry with a trailing "/" removed from its URL. Links matched
// by nonRelMatcher are skipped, as are empty hrefs and hrefs that consist only
// of a query string or fragment: those resolve to the page being listed, and
// following them would crawl the same directory forever.
//
// Sends on c block until a receiver takes the entry. The function returns nil
// once the whole page has been tokenized; otherwise it returns the first error
// from URL parsing, the HTTP request, a non-200 status, a non-HTML content
// type, or the tokenizer.
func requestListing(urlString, outdir string, c chan Entry) error {
	u, err := url.Parse(urlString)
	if err != nil {
		return err
	}
	outdir = strings.TrimSuffix(outdir, "/") + "/" + path.Base(u.Path)

	resp, err := http.Get(urlString)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status %s for %s", resp.Status, urlString)
	}
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		return errors.New("Invalid content type " + contentType)
	}

	// Links are resolved by plain concatenation onto the listing URL.
	base := strings.TrimSuffix(urlString, "/") + "/"

	z := html.NewTokenizer(resp.Body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// io.EOF is the tokenizer's normal end-of-document signal.
			if err := z.Err(); err != io.EOF {
				return err
			}
			return nil
		case html.StartTagToken:
			tagName, more := z.TagName()
			if string(tagName) != "a" {
				continue
			}
			// The loop condition is re-checked after every attribute, so
			// skipping one can never read past the end of the tag.
			for more {
				var key, val []byte
				key, val, more = z.TagAttr()
				if string(key) != "href" {
					continue
				}
				href := string(val)
				if href == "" ||
					strings.HasPrefix(href, "?") ||
					strings.HasPrefix(href, "#") ||
					nonRelMatcher.MatchString(href) {
					continue
				}
				e := Entry{
					EntryType: DirEntry,
					UrlString: base + href,
					OutDir:    outdir,
				}
				// Heuristic: a name containing a dot is taken to be a file.
				if strings.Contains(href, ".") {
					e.EntryType = FileEntry
				} else {
					e.UrlString = strings.TrimSuffix(e.UrlString, "/")
				}
				c <- e
			}
		}
	}
}
// main crawls every listing URL given on the command line and downloads the
// files it finds into the current directory, recursing into sub-directories.
//
// Listings run in their own goroutines and feed entries to NumWorkers worker
// goroutines over an unbuffered channel. Workers download files themselves and
// start a new listing goroutine for each directory, so a worker never blocks
// sending on the channel it is supposed to drain.
func main() {
	if len(os.Args) < 2 {
		fmt.Println("Usage: download <url>")
		return
	}

	c := make(chan Entry)

	// wg counts outstanding work: one unit per running listing plus one unit
	// per entry that has been queued but not yet fully handled by a worker.
	// Every Add happens while the counter is already positive, so the counter
	// cannot touch zero (and release wg.Wait below) while anything may still
	// send on c.
	var wg sync.WaitGroup
	// done tracks the worker goroutines themselves.
	var done sync.WaitGroup

	// list runs one listing and forwards its entries to the workers. The
	// caller must have called wg.Add(1) for it beforehand.
	list := func(urlString, outdir string) {
		defer wg.Done()
		found := make(chan Entry)
		go func() {
			defer close(found)
			if err := requestListing(urlString, outdir, found); err != nil {
				fmt.Println(err)
			}
		}()
		for e := range found {
			// Count the entry before handing it over; the worker releases it
			// only after it has registered any follow-up listing.
			wg.Add(1)
			c <- e
		}
	}

	for _, arg := range os.Args[1:] {
		wg.Add(1)
		go list(arg, ".")
	}

	done.Add(NumWorkers)
	for w := 0; w < NumWorkers; w++ {
		go func() {
			defer done.Done()
			for e := range c {
				switch e.EntryType {
				case DirEntry:
					wg.Add(1)
					go list(e.UrlString, e.OutDir)
				case FileEntry:
					fmt.Println("Downloading", e.UrlString)
					if err := requestFile(e.UrlString, e.OutDir); err != nil {
						fmt.Println(
							"Failed to download",
							e.UrlString,
							":",
							err,
						)
					} else {
						fmt.Println("Downloaded", e.UrlString)
					}
				}
				// Matches the Add made when this entry was queued.
				wg.Done()
			}
		}()
	}

	wg.Wait()
	// Nothing can send any more, so closing c is safe and ends the workers.
	close(c)
	done.Wait()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment