junaidk · January 11, 2021 15:15
diff --git a/reddit-download.go b/reddit-download.go
 package main

 import (
 	"fmt"
 	"github.com/anaskhan96/soup"
 	"github.com/dustin/go-humanize"
 	"io"
 	"io/ioutil"
 	"net/http"
 	"os"
 	"path"
 	"strings"
 	"sync"
 )


 const basePath = "<download-base-path>"
 const paralledDownload = 2

 func main() {

 	// file obtained with
 	// https://redditmanager.com/
 	dat, err := ioutil.ReadFile("<path to reddit_export.html")

 	if err != nil {
 		fmt.Println(err.Error())
 		os.Exit(1)
 	}
 	doc := soup.HTMLParse(string(dat))
 	links := doc.FindAll("li")

 	jobs := make(chan Job, len(links))
 	var wg sync.WaitGroup

 	for w := 1; w <= paralledDownload; w++ {
 		wg.Add(1)
 		go worker(w, jobs, &wg)
 	}

 	for index, link := range links {
 		anchors := link.FindAll("a")
 		if len(anchors) < 2 {
 			continue
 		}
 		split := strings.Split(anchors[1].Attrs()["href"], "/")

 		name := split[len(split)-2]
 		folder := split[4]

 		folderPath := path.Join(basePath, folder)
 		filePath := path.Join(folderPath, name)
 		url := anchors[0].Attrs()["href"]

 		job := Job{
 			Index:      index,
 			Url:        url,
 			FilePath:   filePath,
 			FolderPath: folderPath,
 		}

 		jobs <- job

 	}

 	// to stop the worker, first close the job channel
 	close(jobs)

 	// then wait using the WaitGroup
 	wg.Wait()
 }

 type Job struct {
 	Index      int
 	Url        string
 	FilePath   string
 	FolderPath string
 }

 func worker(id int, jobs <-chan Job, wg *sync.WaitGroup) {
 	defer wg.Done()
 	for job := range jobs {

 		newUrl := urlGenerator(job.Url)

 		fmt.Printf("------------------------------\n"+
 			"worker %d started\norignal_url: %s \nnew_url: %s \nfilename: %s\n", id, job.Url, newUrl, job.FilePath,
 		)

 		if len(newUrl) == 0 {
 			continue
 		}

 		ensureDir(job.FolderPath)
 		putFile(job.FilePath, newUrl)

 		//fmt.Println("worker", id, "finished job",job.Url)
 	}
 	fmt.Println("worker", id, "finished")
 }

 func urlGenerator(urlInput string) string {

 	var urlOut string

 	if strings.Contains(urlInput, "i.redd.it") ||
 		strings.Contains(urlInput, "gfycat.com") ||
 		strings.Contains(urlInput, "imgur") ||
 		strings.Contains(urlInput, "redgifs") {

 		if strings.Contains(urlInput, "gfycat") {
 			urlOut = gfyCatMP4Url(urlInput)
 			if strings.Contains(urlOut, "redgifs") {
 				urlOut = strings.Replace(urlOut, "-mobile", "", 1)
 			} else {
 				urlOut = strings.Replace(urlOut, "thumbs", "giant", 1)
 				urlOut = strings.Replace(urlOut, "-mobile", "", 1)
 			}

 		} else if strings.Contains(urlInput, "i.imgur") {
 			urlOut = strings.Replace(urlInput, "gifv", "mp4", 1)
 		} else if strings.Contains(urlInput, "redgifs") {
 			urlOut = redgifMP4Url(urlInput)
 			urlOut = strings.Replace(urlOut, "-mobile", "", 1)
 		} else {
 			urlOut = urlInput
 		}
 	} else {
 		urlOut = ""
 	}
 	return urlOut
 }
 func redgifMP4Url(urlInput string) string {

 	resp, err := soup.Get(urlInput)
 	if err != nil {
 		fmt.Println(err.Error())
 		return ""
 	}
 	doc := soup.HTMLParse(string(resp))
 	link := doc.Find("source", "type", "video/mp4")

 	if link.Error != nil {
 		return ""
 	}

 	urlOut, ok := link.Attrs()["src"]
 	if !ok {
 		return ""
 	}
 	return urlOut

 }
 func gfyCatMP4Url(urlInput string) string {
 	resp, err := soup.Get(urlInput)
 	if err != nil {
 		fmt.Println(err.Error())
 		return ""
 	}
 	doc := soup.HTMLParse(string(resp))
 	link := doc.Find("meta", "property", "og:video")

 	if link.Error != nil {
 		return ""
 	}

 	urlOut, ok := link.Attrs()["content"]
 	if !ok {
 		return ""
 	}
 	return urlOut
 }

 func putFile(fileName, url string) {

 	client := httpClient()
 	resp, err := client.Get(url)
 	if err != nil {
 		fmt.Printf("error in downloading %s, %s\n", url, err.Error())
 		return
 	}
 	defer resp.Body.Close()

 	ext := getExtention(resp.Header["Content-Type"][0])
 	filePath := fileName + "." + ext

 	if ensureFile(filePath) {
 		fmt.Printf("file %s exists, not downloading\n", filePath)
 		return
 	}
 	file := createImage(filePath)
 	//counter := &WriteCounter{}
 	//size, err := io.Copy(file, io.TeeReader(resp.Body, counter))
 	_, err = io.Copy(file, resp.Body)
 	fmt.Println()

 	defer file.Close()
 	checkError(err)
 	//fmt.Printf("Just Downloaded a file %s with size %s\n", fileName, humanize.Bytes(uint64(size)))
 }

 func getExtention(url string) string {

 	splits := strings.Split(url, "/")
 	ext := splits[len(splits)-1]

 	return ext
 }

 func httpClient() *http.Client {
 	client := http.Client{
 		CheckRedirect: func(r *http.Request, via []*http.Request) error {
 			r.URL.Opaque = r.URL.Path
 			return nil
 		},
 	}

 	return &client
 }

 func createImage(fileName string) *os.File {
 	file, err := os.Create(fileName)

 	checkError(err)
 	return file
 }

 func ensureFile(filePath string) bool {

 	if _, err := os.Stat(filePath); err == nil {
 		return true
 	} else if os.IsNotExist(err) {
 		return false
 	}
 	return false
 }

 func ensureDir(dirName string) error {
 	err := os.Mkdir(dirName, os.ModeDir)
 	if err == nil || os.IsExist(err) {
 		return nil
 	} else {
 		return err
 	}
 }

 func checkError(err error) {
 	if err != nil {
 		fmt.Println(err)
 	}
 }

 type WriteCounter struct {
 	Total uint64
 }

 func (wc *WriteCounter) Write(p []byte) (int, error) {
 	n := len(p)
 	wc.Total += uint64(n)
 	wc.PrintProgress()
 	return n, nil
 }

 // PrintProgress prints the progress of a file write
 func (wc WriteCounter) PrintProgress() {
 	// Clear the line by using a character return to go back to the start and remove
 	// the remaining characters by filling it with spaces
 	fmt.Printf("\r%s", strings.Repeat(" ", 50))

 	// Return again and print current status of download
 	// We use the humanize package to print the bytes in a meaningful way (e.g. 10 MB)
 	fmt.Printf("\rDownloading... %s complete", humanize.Bytes(wc.Total))
 }
	package main

	import (
	"fmt"
	"github.com/anaskhan96/soup"
	"github.com/dustin/go-humanize"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path"
	"strings"
	"sync"
	)


	const basePath = "<download-base-path>"
	const paralledDownload = 2

	func main() {

	// file obtained with
	// https://redditmanager.com/
	dat, err := ioutil.ReadFile("<path to reddit_export.html")

	if err != nil {
	fmt.Println(err.Error())
	os.Exit(1)
	}
	doc := soup.HTMLParse(string(dat))
	links := doc.FindAll("li")

	jobs := make(chan Job, len(links))
	var wg sync.WaitGroup

	for w := 1; w <= paralledDownload; w++ {
	wg.Add(1)
	go worker(w, jobs, &wg)
	}

	for index, link := range links {
	anchors := link.FindAll("a")
	if len(anchors) < 2 {
	continue
	}
	split := strings.Split(anchors[1].Attrs()["href"], "/")

	name := split[len(split)-2]
	folder := split[4]

	folderPath := path.Join(basePath, folder)
	filePath := path.Join(folderPath, name)
	url := anchors[0].Attrs()["href"]

	job := Job{
	Index: index,
	Url: url,
	FilePath: filePath,
	FolderPath: folderPath,
	}

	jobs <- job

	}

	// to stop the worker, first close the job channel
	close(jobs)

	// then wait using the WaitGroup
	wg.Wait()
	}

	type Job struct {
	Index int
	Url string
	FilePath string
	FolderPath string
	}

	func worker(id int, jobs <-chan Job, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobs {

	newUrl := urlGenerator(job.Url)

	fmt.Printf("------------------------------\n"+
	"worker %d started\norignal_url: %s \nnew_url: %s \nfilename: %s\n", id, job.Url, newUrl, job.FilePath,
	)

	if len(newUrl) == 0 {
	continue
	}

	ensureDir(job.FolderPath)
	putFile(job.FilePath, newUrl)

	//fmt.Println("worker", id, "finished job",job.Url)
	}
	fmt.Println("worker", id, "finished")
	}

	func urlGenerator(urlInput string) string {

	var urlOut string

	if strings.Contains(urlInput, "i.redd.it") \|\|
	strings.Contains(urlInput, "gfycat.com") \|\|
	strings.Contains(urlInput, "imgur") \|\|
	strings.Contains(urlInput, "redgifs") {

	if strings.Contains(urlInput, "gfycat") {
	urlOut = gfyCatMP4Url(urlInput)
	if strings.Contains(urlOut, "redgifs") {
	urlOut = strings.Replace(urlOut, "-mobile", "", 1)
	} else {
	urlOut = strings.Replace(urlOut, "thumbs", "giant", 1)
	urlOut = strings.Replace(urlOut, "-mobile", "", 1)
	}

	} else if strings.Contains(urlInput, "i.imgur") {
	urlOut = strings.Replace(urlInput, "gifv", "mp4", 1)
	} else if strings.Contains(urlInput, "redgifs") {
	urlOut = redgifMP4Url(urlInput)
	urlOut = strings.Replace(urlOut, "-mobile", "", 1)
	} else {
	urlOut = urlInput
	}
	} else {
	urlOut = ""
	}
	return urlOut
	}
	func redgifMP4Url(urlInput string) string {

	resp, err := soup.Get(urlInput)
	if err != nil {
	fmt.Println(err.Error())
	return ""
	}
	doc := soup.HTMLParse(string(resp))
	link := doc.Find("source", "type", "video/mp4")

	if link.Error != nil {
	return ""
	}

	urlOut, ok := link.Attrs()["src"]
	if !ok {
	return ""
	}
	return urlOut

	}
	func gfyCatMP4Url(urlInput string) string {
	resp, err := soup.Get(urlInput)
	if err != nil {
	fmt.Println(err.Error())
	return ""
	}
	doc := soup.HTMLParse(string(resp))
	link := doc.Find("meta", "property", "og:video")

	if link.Error != nil {
	return ""
	}

	urlOut, ok := link.Attrs()["content"]
	if !ok {
	return ""
	}
	return urlOut
	}

	func putFile(fileName, url string) {

	client := httpClient()
	resp, err := client.Get(url)
	if err != nil {
	fmt.Printf("error in downloading %s, %s\n", url, err.Error())
	return
	}
	defer resp.Body.Close()

	ext := getExtention(resp.Header["Content-Type"][0])
	filePath := fileName + "." + ext

	if ensureFile(filePath) {
	fmt.Printf("file %s exists, not downloading\n", filePath)
	return
	}
	file := createImage(filePath)
	//counter := &WriteCounter{}
	//size, err := io.Copy(file, io.TeeReader(resp.Body, counter))
	_, err = io.Copy(file, resp.Body)
	fmt.Println()

	defer file.Close()
	checkError(err)
	//fmt.Printf("Just Downloaded a file %s with size %s\n", fileName, humanize.Bytes(uint64(size)))
	}

	func getExtention(url string) string {

	splits := strings.Split(url, "/")
	ext := splits[len(splits)-1]

	return ext
	}

	func httpClient() *http.Client {
	client := http.Client{
	CheckRedirect: func(r http.Request, via []http.Request) error {
	r.URL.Opaque = r.URL.Path
	return nil
	},
	}

	return &client
	}

	func createImage(fileName string) *os.File {
	file, err := os.Create(fileName)

	checkError(err)
	return file
	}

	func ensureFile(filePath string) bool {

	if _, err := os.Stat(filePath); err == nil {
	return true
	} else if os.IsNotExist(err) {
	return false
	}
	return false
	}

	func ensureDir(dirName string) error {
	err := os.Mkdir(dirName, os.ModeDir)
	if err == nil \|\| os.IsExist(err) {
	return nil
	} else {
	return err
	}
	}

	func checkError(err error) {
	if err != nil {
	fmt.Println(err)
	}
	}

	type WriteCounter struct {
	Total uint64
	}

	func (wc *WriteCounter) Write(p []byte) (int, error) {
	n := len(p)
	wc.Total += uint64(n)
	wc.PrintProgress()
	return n, nil
	}

	// PrintProgress prints the progress of a file write
	func (wc WriteCounter) PrintProgress() {
	// Clear the line by using a character return to go back to the start and remove
	// the remaining characters by filling it with spaces
	fmt.Printf("\r%s", strings.Repeat(" ", 50))

	// Return again and print current status of download
	// We use the humanize package to print the bytes in a meaningful way (e.g. 10 MB)
	fmt.Printf("\rDownloading... %s complete", humanize.Bytes(wc.Total))
	}