Skip to content

Instantly share code, notes, and snippets.

@junaidk
Last active January 11, 2021 15:15
Show Gist options
  • Save junaidk/0732f61336b823887776deca6388fe5c to your computer and use it in GitHub Desktop.
Download images and videos from reddit saved posts
package main
import (
"fmt"
"github.com/anaskhan96/soup"
"github.com/dustin/go-humanize"
"io"
"io/ioutil"
"net/http"
"os"
"path"
"strings"
"sync"
)
// basePath is the root directory downloads are written under; each post's
// subreddit becomes a subdirectory of it. Placeholder — set before building.
const basePath = "<download-base-path>"

// paralledDownload is the number of concurrent download workers.
const paralledDownload = 2
// main reads a reddit export HTML file (generated by
// https://redditmanager.com/), extracts each saved post's media link and
// permalink, and downloads the media concurrently via a small worker pool.
func main() {
	// File obtained with https://redditmanager.com/
	dat, err := ioutil.ReadFile("<path to reddit_export.html>")
	if err != nil {
		fmt.Println(err.Error())
		os.Exit(1)
	}
	doc := soup.HTMLParse(string(dat))
	links := doc.FindAll("li")

	// Buffered with len(links) so enqueueing all jobs never blocks,
	// even before any worker has started consuming.
	jobs := make(chan Job, len(links))
	var wg sync.WaitGroup
	for w := 1; w <= paralledDownload; w++ {
		wg.Add(1)
		go worker(w, jobs, &wg)
	}
	for index, link := range links {
		anchors := link.FindAll("a")
		if len(anchors) < 2 {
			continue
		}
		// Second anchor is the reddit permalink, e.g.
		// /r/<subreddit>/comments/<id>/<name>/ — derive the subreddit
		// folder (index 4) and the post name (second-to-last segment).
		split := strings.Split(anchors[1].Attrs()["href"], "/")
		if len(split) < 5 {
			// Malformed permalink; indexing below would panic.
			continue
		}
		name := split[len(split)-2]
		folder := split[4]
		folderPath := path.Join(basePath, folder)
		filePath := path.Join(folderPath, name)
		// First anchor is the actual media URL.
		url := anchors[0].Attrs()["href"]
		jobs <- Job{
			Index:      index,
			Url:        url,
			FilePath:   filePath,
			FolderPath: folderPath,
		}
	}
	// To stop the workers, first close the job channel so their range
	// loops terminate, then wait for them to drain and exit.
	close(jobs)
	wg.Wait()
}
// Job describes one media download: where the media came from and where
// it should be written on disk.
type Job struct {
	Index      int    // position of the post in the export file
	Url        string // media link taken from the saved post's first anchor
	FilePath   string // destination path, extension added after download
	FolderPath string // directory (per subreddit) the file is written into
}
// worker consumes jobs until the channel is closed, resolving each saved
// post's URL to a direct media URL and downloading it. It signals
// completion through wg.
func worker(id int, jobs <-chan Job, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobs {
		newUrl := urlGenerator(job.Url)
		fmt.Printf("------------------------------\n"+
			"worker %d started\noriginal_url: %s \nnew_url: %s \nfilename: %s\n", id, job.Url, newUrl, job.FilePath,
		)
		// Empty result means the host is unsupported or scraping failed.
		if len(newUrl) == 0 {
			continue
		}
		// Original ignored this error; a failed mkdir made the download
		// below fail anyway, so report it and skip the job explicitly.
		if err := ensureDir(job.FolderPath); err != nil {
			fmt.Printf("error creating dir %s: %s\n", job.FolderPath, err.Error())
			continue
		}
		putFile(job.FilePath, newUrl)
	}
	fmt.Println("worker", id, "finished")
}
// urlGenerator maps a saved-post URL to a directly downloadable media
// URL. Supported hosts are i.redd.it, gfycat.com, imgur and redgifs;
// any other host yields "" (caller skips the job). For gfycat/redgifs
// the page is scraped to find the real MP4 source.
func urlGenerator(urlInput string) string {
	supported := strings.Contains(urlInput, "i.redd.it") ||
		strings.Contains(urlInput, "gfycat.com") ||
		strings.Contains(urlInput, "imgur") ||
		strings.Contains(urlInput, "redgifs")
	if !supported {
		return ""
	}
	switch {
	case strings.Contains(urlInput, "gfycat"):
		// gfycat pages sometimes redirect to redgifs; only rewrite the
		// "thumbs" host to "giant" for genuine gfycat results.
		resolved := gfyCatMP4Url(urlInput)
		if !strings.Contains(resolved, "redgifs") {
			resolved = strings.Replace(resolved, "thumbs", "giant", 1)
		}
		return strings.Replace(resolved, "-mobile", "", 1)
	case strings.Contains(urlInput, "i.imgur"):
		// imgur serves .gifv pages; the same path with .mp4 is the video.
		return strings.Replace(urlInput, "gifv", "mp4", 1)
	case strings.Contains(urlInput, "redgifs"):
		return strings.Replace(redgifMP4Url(urlInput), "-mobile", "", 1)
	default:
		// i.redd.it and plain imgur links are already direct.
		return urlInput
	}
}
// redgifMP4Url fetches a redgifs page and returns the MP4 URL from its
// <source type="video/mp4"> element, or "" on any failure (fetch error,
// missing element, missing src attribute).
func redgifMP4Url(urlInput string) string {
	resp, err := soup.Get(urlInput)
	if err != nil {
		fmt.Println(err.Error())
		return ""
	}
	// soup.Get already returns a string; the original's string(resp)
	// conversion was redundant.
	doc := soup.HTMLParse(resp)
	link := doc.Find("source", "type", "video/mp4")
	if link.Error != nil {
		return ""
	}
	urlOut, ok := link.Attrs()["src"]
	if !ok {
		return ""
	}
	return urlOut
}
// gfyCatMP4Url fetches a gfycat page and returns the video URL from its
// og:video meta tag, or "" on any failure (fetch error, missing tag,
// missing content attribute).
func gfyCatMP4Url(urlInput string) string {
	resp, err := soup.Get(urlInput)
	if err != nil {
		fmt.Println(err.Error())
		return ""
	}
	// soup.Get already returns a string; the original's string(resp)
	// conversion was redundant.
	doc := soup.HTMLParse(resp)
	link := doc.Find("meta", "property", "og:video")
	if link.Error != nil {
		return ""
	}
	urlOut, ok := link.Attrs()["content"]
	if !ok {
		return ""
	}
	return urlOut
}
// putFile downloads url and writes it to fileName plus an extension
// derived from the response Content-Type. Files that already exist are
// skipped. Errors are logged, not returned (best-effort, matching the
// worker's fire-and-forget use).
func putFile(fileName, url string) {
	client := httpClient()
	resp, err := client.Get(url)
	if err != nil {
		fmt.Printf("error in downloading %s, %s\n", url, err.Error())
		return
	}
	defer resp.Body.Close()
	// Original saved whatever came back; a 404/403 page would have been
	// written to disk as media. Require a 200.
	if resp.StatusCode != http.StatusOK {
		fmt.Printf("error in downloading %s, status %s\n", url, resp.Status)
		return
	}
	// Header.Get avoids the original's index panic when the header is absent.
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		fmt.Printf("no Content-Type for %s, skipping\n", url)
		return
	}
	ext := getExtention(contentType)
	filePath := fileName + "." + ext
	if ensureFile(filePath) {
		fmt.Printf("file %s exists, not downloading\n", filePath)
		return
	}
	// Create directly so a creation failure is handled here instead of
	// yielding a nil *os.File (as createImage could).
	file, err := os.Create(filePath)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()
	if _, err := io.Copy(file, resp.Body); err != nil {
		fmt.Println(err)
	}
	fmt.Println()
}
// getExtention returns the file extension implied by a MIME content type,
// e.g. "image/jpeg" -> "jpeg". Media-type parameters such as
// "; charset=utf-8" are stripped, which the original did not handle.
// (The misspelled name is kept for compatibility with callers.)
func getExtention(contentType string) string {
	// Drop parameters: "image/png; charset=utf-8" -> "image/png".
	if i := strings.Index(contentType, ";"); i >= 0 {
		contentType = contentType[:i]
	}
	contentType = strings.TrimSpace(contentType)
	splits := strings.Split(contentType, "/")
	return splits[len(splits)-1]
}
// httpClient returns an *http.Client whose redirect hook copies the
// request URL's Path into Opaque, so redirect targets are re-requested
// with the literal path rather than a re-encoded one.
func httpClient() *http.Client {
	return &http.Client{
		CheckRedirect: func(req *http.Request, _ []*http.Request) error {
			req.URL.Opaque = req.URL.Path
			return nil
		},
	}
}
func createImage(fileName string) *os.File {
file, err := os.Create(fileName)
checkError(err)
return file
}
func ensureFile(filePath string) bool {
if _, err := os.Stat(filePath); err == nil {
return true
} else if os.IsNotExist(err) {
return false
}
return false
}
func ensureDir(dirName string) error {
err := os.Mkdir(dirName, os.ModeDir)
if err == nil || os.IsExist(err) {
return nil
} else {
return err
}
}
func checkError(err error) {
if err != nil {
fmt.Println(err)
}
}
// WriteCounter counts bytes written through it so download progress can
// be shown. It implements io.Writer and is intended for use with
// io.TeeReader around a response body (see the commented-out code in
// putFile).
type WriteCounter struct {
	Total uint64 // cumulative number of bytes written
}

// Write implements io.Writer: it adds len(p) to the running total and
// redraws the progress line. It never returns an error.
func (wc *WriteCounter) Write(p []byte) (int, error) {
	n := len(p)
	wc.Total += uint64(n)
	wc.PrintProgress()
	return n, nil
}

// PrintProgress prints the progress of a file write.
//
// Fix: this now uses a pointer receiver for consistency with Write —
// mixing value and pointer receivers on one type is flagged by go vet
// and can surprise callers via differing method sets.
func (wc *WriteCounter) PrintProgress() {
	// Clear the line: carriage return back to the start, overwrite with
	// spaces, then return again and print the current status.
	fmt.Printf("\r%s", strings.Repeat(" ", 50))
	// humanize renders the byte count in a readable form (e.g. 10 MB).
	fmt.Printf("\rDownloading... %s complete", humanize.Bytes(wc.Total))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment