Skip to content

Instantly share code, notes, and snippets.

@KristofferK
Last active July 30, 2019 20:42
Show Gist options
  • Save KristofferK/f0bc519824e46815939b240c4f6a7553 to your computer and use it in GitHub Desktop.
Save KristofferK/f0bc519824e46815939b240c4f6a7553 to your computer and use it in GitHub Desktop.
Simple script to download images and videos from an Imgur category
import os
import re
import hashlib
import urllib.request
class ImgurDownloader:
    """Download the images listed on an Imgur subreddit category page.

    The "top of the month" listing for ``category`` is fetched once, every
    ``<img alt="" src="//...">`` tag on it is extracted with a regular
    expression, and each image is saved into ``destinationFolder`` under a
    file name derived from the SHA-224 hash of its URL.
    """

    # Folder the downloaded files are written to; created on construction.
    destinationFolder = 'D:\\imgur-downloads\\'
    # Captures the protocol-relative address of each image on the listing.
    pattern = re.compile('<img alt="" src="//([^"]+)"')

    def __init__(self, category):
        """Store *category* and make sure the destination folder exists."""
        self.category = category
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.destinationFolder, exist_ok=True)

    def getCategoryLink(self):
        """Return the URL of the category's top-of-the-month listing."""
        return "https://imgur.com/r/%s/top/month" % self.category

    def getResources(self):
        """Return the list of image URLs found on the category listing.

        Raises whatever ``urllib.request.urlopen`` raises when the page
        cannot be fetched.
        """
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(self.getCategoryLink()) as response:
            # The Content-Type header may not declare a charset, in which
            # case get_content_charset() returns None; fall back to UTF-8.
            charset = response.headers.get_content_charset() or "utf-8"
            # The fallback is a guess, so undecodable bytes are replaced
            # rather than aborting; the extracted URLs are plain ASCII.
            source = response.read().decode(charset, errors="replace")
        # Dropping the "b" in front of ".jpg" presumably turns Imgur's
        # thumbnail address into the full-size one -- TODO confirm.
        return [
            "https://" + link.replace("b.jpg", ".jpg")
            for link in self.pattern.findall(source)
        ]

    def getDestination(self, imgurLink):
        """Return the local file path *imgurLink* is saved to.

        The name is the SHA-224 hex digest of the URL plus ``.jpg``, so the
        same URL always maps to the same file.
        """
        digest = hashlib.sha224(imgurLink.encode()).hexdigest()
        return self.destinationFolder + digest + ".jpg"

    def download(self):
        """Fetch the listing and save every image found on it to disk."""
        images = self.getResources()
        print("Found %i images" % len(images))
        for image in images:
            destination = self.getDestination(image)
            print(image + " --> " + destination)
            urllib.request.urlretrieve(image, destination)
class AdvancedImgurDownloader(ImgurDownloader):
    """Download images and videos by visiting each post page of a category.

    Unlike the base class, which takes the image addresses straight from
    the category listing, this downloader follows every post link on the
    listing and reads the media URL from that page's ``<meta>`` tags.

    Construction is inherited unchanged from ``ImgurDownloader``.
    """

    # Raw strings keep the regex escapes (\s, \?) out of Python's own
    # string-escape processing.
    # Captures the site-relative link of each post on the listing.
    pagePattern = re.compile(r'image-list-link" href="(/r[^"]+)"')
    # Captures the og:image URL up to (not including) any query string.
    imagePattern = re.compile(r'<meta property="og:image"\s+content="([^"\?]+)')
    # Captures the twitter:player:stream URL of a post.
    videoPattern = re.compile(r'<meta name="twitter:player:stream"\s+content="([^"]+)"')

    @staticmethod
    def _fetchSource(url):
        """Return the decoded body of *url*, closing the connection after."""
        with urllib.request.urlopen(url) as response:
            # get_content_charset() returns None when the Content-Type
            # header declares no charset; fall back to UTF-8 in that case.
            charset = response.headers.get_content_charset() or "utf-8"
            return response.read().decode(charset, errors="replace")

    def getResources(self):
        """Return a dict mapping each post page URL to its resource.

        Each value is a dict as returned by ``getResourceLink``. Pages on
        which neither a video nor an image was found are left out, so every
        value can safely be passed to ``getDestination``.
        """
        resources = {}
        for pageLink in self.getPageLinks():
            resource = self.getResourceLink(pageLink)
            if resource is None:
                print('No image or video found on ' + pageLink)
                continue
            resources[pageLink] = resource
            print('Found ' + str(resource))
        return resources

    def getPageLinks(self):
        """Return the absolute URLs of the posts on the category listing."""
        source = self._fetchSource(self.getCategoryLink())
        # Matched links already start with "/", so the host is joined
        # without a trailing slash to avoid "imgur.com//r/...".
        return ["https://imgur.com" + link for link in self.pagePattern.findall(source)]

    def getResourceLink(self, pageLink):
        """Return the media resource of the post at *pageLink*.

        Returns ``{'url': str, 'isVideo': bool}``, or ``None`` when the page
        has neither a twitter:player:stream nor an og:image meta tag.
        """
        source = self._fetchSource(pageLink)
        # The video tag is checked first, so a page carrying both tags is
        # treated as a video.
        videoMatch = self.videoPattern.search(source)
        if videoMatch:
            return {'url': videoMatch.group(1), 'isVideo': True}
        imageMatch = self.imagePattern.search(source)
        if imageMatch:
            return {'url': imageMatch.group(1), 'isVideo': False}
        return None

    def getDestination(self, resource):
        """Return the local file path *resource* is saved to.

        *resource* is a dict as returned by ``getResourceLink``. The name is
        the SHA-224 hex digest of its URL plus ``.mp4`` for videos or
        ``.jpg`` for images.
        """
        extension = ".mp4" if resource["isVideo"] else ".jpg"
        digest = hashlib.sha224(resource["url"].encode()).hexdigest()
        return self.destinationFolder + digest + extension

    def download(self):
        """Collect the resources of every post and save them to disk."""
        resources = self.getResources()
        print("Found %i resources" % len(resources))
        for resource in resources.values():
            destination = self.getDestination(resource)
            print(resource["url"] + " --> " + destination)
            urllib.request.urlretrieve(resource["url"], destination)
# Script entry point: download this month's top posts of the "cats"
# category, following each post page so videos are saved as well as images.
advancedDownloader = AdvancedImgurDownloader(category="cats")
advancedDownloader.download()

# Alternative kept for reference: the basic downloader, which only reads the
# image addresses found directly on the category listing.
#downloader = ImgurDownloader("cats")
#print("About to download from " + downloader.getCategoryLink())
#downloader.download()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment