Last active
January 11, 2021 15:15
-
-
Save junaidk/0732f61336b823887776deca6388fe5c to your computer and use it in GitHub Desktop.
Download images and videos from reddit saved posts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path"
	"strings"
	"sync"
	"time"

	"github.com/anaskhan96/soup"
	"github.com/dustin/go-humanize"
)
// basePath is the directory under which one folder per subreddit is
// created; replace the placeholder before running.
const basePath = "<download-base-path>"

// paralledDownload is the number of concurrent download workers.
// NOTE(review): name looks like a typo for "parallelDownload" — left
// unchanged because main references it.
const paralledDownload = 2
func main() { | |
// file obtained with | |
// https://redditmanager.com/ | |
dat, err := ioutil.ReadFile("<path to reddit_export.html") | |
if err != nil { | |
fmt.Println(err.Error()) | |
os.Exit(1) | |
} | |
doc := soup.HTMLParse(string(dat)) | |
links := doc.FindAll("li") | |
jobs := make(chan Job, len(links)) | |
var wg sync.WaitGroup | |
for w := 1; w <= paralledDownload; w++ { | |
wg.Add(1) | |
go worker(w, jobs, &wg) | |
} | |
for index, link := range links { | |
anchors := link.FindAll("a") | |
if len(anchors) < 2 { | |
continue | |
} | |
split := strings.Split(anchors[1].Attrs()["href"], "/") | |
name := split[len(split)-2] | |
folder := split[4] | |
folderPath := path.Join(basePath, folder) | |
filePath := path.Join(folderPath, name) | |
url := anchors[0].Attrs()["href"] | |
job := Job{ | |
Index: index, | |
Url: url, | |
FilePath: filePath, | |
FolderPath: folderPath, | |
} | |
jobs <- job | |
} | |
// to stop the worker, first close the job channel | |
close(jobs) | |
// then wait using the WaitGroup | |
wg.Wait() | |
} | |
// Job describes one saved post to download.
type Job struct {
	Index      int    // position of the post in the export file
	Url        string // original post URL taken from the export
	FilePath   string // destination path, without file extension
	FolderPath string // per-subreddit folder under basePath
}
func worker(id int, jobs <-chan Job, wg *sync.WaitGroup) { | |
defer wg.Done() | |
for job := range jobs { | |
newUrl := urlGenerator(job.Url) | |
fmt.Printf("------------------------------\n"+ | |
"worker %d started\norignal_url: %s \nnew_url: %s \nfilename: %s\n", id, job.Url, newUrl, job.FilePath, | |
) | |
if len(newUrl) == 0 { | |
continue | |
} | |
ensureDir(job.FolderPath) | |
putFile(job.FilePath, newUrl) | |
//fmt.Println("worker", id, "finished job",job.Url) | |
} | |
fmt.Println("worker", id, "finished") | |
} | |
func urlGenerator(urlInput string) string { | |
var urlOut string | |
if strings.Contains(urlInput, "i.redd.it") || | |
strings.Contains(urlInput, "gfycat.com") || | |
strings.Contains(urlInput, "imgur") || | |
strings.Contains(urlInput, "redgifs") { | |
if strings.Contains(urlInput, "gfycat") { | |
urlOut = gfyCatMP4Url(urlInput) | |
if strings.Contains(urlOut, "redgifs") { | |
urlOut = strings.Replace(urlOut, "-mobile", "", 1) | |
} else { | |
urlOut = strings.Replace(urlOut, "thumbs", "giant", 1) | |
urlOut = strings.Replace(urlOut, "-mobile", "", 1) | |
} | |
} else if strings.Contains(urlInput, "i.imgur") { | |
urlOut = strings.Replace(urlInput, "gifv", "mp4", 1) | |
} else if strings.Contains(urlInput, "redgifs") { | |
urlOut = redgifMP4Url(urlInput) | |
urlOut = strings.Replace(urlOut, "-mobile", "", 1) | |
} else { | |
urlOut = urlInput | |
} | |
} else { | |
urlOut = "" | |
} | |
return urlOut | |
} | |
func redgifMP4Url(urlInput string) string { | |
resp, err := soup.Get(urlInput) | |
if err != nil { | |
fmt.Println(err.Error()) | |
return "" | |
} | |
doc := soup.HTMLParse(string(resp)) | |
link := doc.Find("source", "type", "video/mp4") | |
if link.Error != nil { | |
return "" | |
} | |
urlOut, ok := link.Attrs()["src"] | |
if !ok { | |
return "" | |
} | |
return urlOut | |
} | |
func gfyCatMP4Url(urlInput string) string { | |
resp, err := soup.Get(urlInput) | |
if err != nil { | |
fmt.Println(err.Error()) | |
return "" | |
} | |
doc := soup.HTMLParse(string(resp)) | |
link := doc.Find("meta", "property", "og:video") | |
if link.Error != nil { | |
return "" | |
} | |
urlOut, ok := link.Attrs()["content"] | |
if !ok { | |
return "" | |
} | |
return urlOut | |
} | |
func putFile(fileName, url string) { | |
client := httpClient() | |
resp, err := client.Get(url) | |
if err != nil { | |
fmt.Printf("error in downloading %s, %s\n", url, err.Error()) | |
return | |
} | |
defer resp.Body.Close() | |
ext := getExtention(resp.Header["Content-Type"][0]) | |
filePath := fileName + "." + ext | |
if ensureFile(filePath) { | |
fmt.Printf("file %s exists, not downloading\n", filePath) | |
return | |
} | |
file := createImage(filePath) | |
//counter := &WriteCounter{} | |
//size, err := io.Copy(file, io.TeeReader(resp.Body, counter)) | |
_, err = io.Copy(file, resp.Body) | |
fmt.Println() | |
defer file.Close() | |
checkError(err) | |
//fmt.Printf("Just Downloaded a file %s with size %s\n", fileName, humanize.Bytes(uint64(size))) | |
} | |
// getExtention derives a file extension from a MIME content type,
// e.g. "image/jpeg" -> "jpeg". Parameters after ";" (as in
// "video/mp4; charset=utf-8") are stripped, which the original did not
// do. The misspelled name is kept for compatibility with callers.
func getExtention(contentType string) string {
	// Drop optional MIME parameters: "type/subtype; key=value".
	if i := strings.IndexByte(contentType, ';'); i >= 0 {
		contentType = contentType[:i]
	}
	splits := strings.Split(contentType, "/")
	return strings.TrimSpace(splits[len(splits)-1])
}
func httpClient() *http.Client { | |
client := http.Client{ | |
CheckRedirect: func(r *http.Request, via []*http.Request) error { | |
r.URL.Opaque = r.URL.Path | |
return nil | |
}, | |
} | |
return &client | |
} | |
func createImage(fileName string) *os.File { | |
file, err := os.Create(fileName) | |
checkError(err) | |
return file | |
} | |
func ensureFile(filePath string) bool { | |
if _, err := os.Stat(filePath); err == nil { | |
return true | |
} else if os.IsNotExist(err) { | |
return false | |
} | |
return false | |
} | |
func ensureDir(dirName string) error { | |
err := os.Mkdir(dirName, os.ModeDir) | |
if err == nil || os.IsExist(err) { | |
return nil | |
} else { | |
return err | |
} | |
} | |
func checkError(err error) { | |
if err != nil { | |
fmt.Println(err) | |
} | |
} | |
// WriteCounter accumulates the total number of bytes written through
// it so download progress can be reported; intended for use with
// io.TeeReader (see the commented-out code in putFile).
type WriteCounter struct {
	Total uint64 // running byte count across all Write calls
}
func (wc *WriteCounter) Write(p []byte) (int, error) { | |
n := len(p) | |
wc.Total += uint64(n) | |
wc.PrintProgress() | |
return n, nil | |
} | |
// PrintProgress prints the progress of a file write | |
func (wc WriteCounter) PrintProgress() { | |
// Clear the line by using a character return to go back to the start and remove | |
// the remaining characters by filling it with spaces | |
fmt.Printf("\r%s", strings.Repeat(" ", 50)) | |
// Return again and print current status of download | |
// We use the humanize package to print the bytes in a meaningful way (e.g. 10 MB) | |
fmt.Printf("\rDownloading... %s complete", humanize.Bytes(wc.Total)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment