Last active
July 30, 2019 20:42
-
-
Save KristofferK/f0bc519824e46815939b240c4f6a7553 to your computer and use it in GitHub Desktop.
Simple script to download images and videos from an Imgur category
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import hashlib | |
import urllib.request | |
class ImgurDownloader:
    """Download the images listed on an Imgur ``/r/<category>`` gallery page.

    The gallery page for the category is fetched, every ``<img alt="" ...>``
    source link on it is extracted with a regular expression, and each image
    is saved into ``destinationFolder`` under a name derived from the SHA-224
    hash of its URL.
    """

    # Folder the files are written to. The trailing separator matters because
    # destinations are built by plain string concatenation.
    destinationFolder = 'D:\\imgur-downloads\\'
    # Captures the protocol-relative image link without its leading "//".
    pattern = re.compile('<img alt="" src="//([^"]+)"')

    def __init__(self, category):
        """Remember ``category`` and make sure the destination folder exists."""
        self.category = category
        # exist_ok avoids the race between a separate existence check and the
        # directory creation.
        os.makedirs(self.destinationFolder, exist_ok=True)

    def getCategoryLink(self):
        """Return the URL of the category's "top of the month" gallery page."""
        return "https://imgur.com/r/%s/top/month" % self.category

    def _fetchSource(self, url):
        """Return the body of ``url`` decoded to text.

        The response is closed as soon as it has been read. When the server
        does not declare a charset, ``get_content_charset()`` returns ``None``;
        UTF-8 is used in that case instead of failing inside ``decode``.
        """
        with urllib.request.urlopen(url) as response:
            charset = response.headers.get_content_charset() or 'utf-8'
            return response.read().decode(charset)

    def getResources(self):
        """Return the list of image URLs found on the gallery page."""
        source = self._fetchSource(self.getCategoryLink())
        # "b.jpg" is presumably Imgur's thumbnail suffix; dropping the "b"
        # requests the full-size file instead.
        return ["https://" + link.replace("b.jpg", ".jpg")
                for link in self.pattern.findall(source)]

    def getDestination(self, imgurLink):
        """Return the local file path that ``imgurLink`` is saved to."""
        digest = hashlib.sha224(imgurLink.encode()).hexdigest()
        return self.destinationFolder + digest + ".jpg"

    def download(self):
        """Download every image returned by ``getResources``."""
        images = self.getResources()
        print("Found %i images" % len(images))
        for image in images:
            destination = self.getDestination(image)
            print(image + " --> " + destination)
            urllib.request.urlretrieve(image, destination)
class AdvancedImgurDownloader(ImgurDownloader):
    """Download images and videos by visiting each post of an Imgur category.

    Instead of reading image links straight off the gallery page, this class
    collects the links to the individual post pages and reads the media URL
    from each page's ``og:image`` / ``twitter:player:stream`` meta tag.
    """

    # Raw strings keep "\s" and "\?" as regex escapes rather than (invalid)
    # string-literal escapes.
    # Captures the site-relative link ("/r/...") of every post in the gallery.
    pagePattern = re.compile(r'image-list-link" href="(/r[^"]+)"')
    # Captures the image URL up to, but not including, any query string.
    imagePattern = re.compile(r'<meta property="og:image"\s+content="([^"\?]+)')
    # Captures the URL of the video stream.
    videoPattern = re.compile(r'<meta name="twitter:player:stream"\s+content="([^"]+)"')

    def __init__(self, category):
        """Remember ``category`` and make sure the destination folder exists."""
        super().__init__(category)

    def _fetchSource(self, url):
        """Return the body of ``url`` decoded to text.

        The response is closed as soon as it has been read. When the server
        does not declare a charset, ``get_content_charset()`` returns ``None``;
        UTF-8 is used in that case instead of failing inside ``decode``.
        """
        with urllib.request.urlopen(url) as response:
            charset = response.headers.get_content_charset() or 'utf-8'
            return response.read().decode(charset)

    def getResources(self):
        """Return a dict mapping each post page URL to its media description.

        Each value is a dict with the keys ``'url'`` and ``'isVideo'``. Pages
        on which neither an image nor a video could be found are reported and
        left out, so every value can be downloaded.
        """
        resources = {}
        for pageLink in self.getPageLinks():
            resource = self.getResourceLink(pageLink)
            if resource is None:
                print('No image or video found on ' + pageLink)
                continue
            resources[pageLink] = resource
            print('Found ' + str(resource))
        return resources

    def getPageLinks(self):
        """Return the absolute URLs of all post pages linked from the gallery."""
        source = self._fetchSource(self.getCategoryLink())
        # The captured links already start with "/", so the host is appended
        # without a trailing slash to avoid "https://imgur.com//r/...".
        return ["https://imgur.com" + link
                for link in self.pagePattern.findall(source)]

    def getResourceLink(self, pageLink):
        """Return the media found on ``pageLink``, or ``None`` if there is none.

        A video stream takes precedence over an image when both are present.
        """
        source = self._fetchSource(pageLink)
        videoMatch = self.videoPattern.search(source)
        if videoMatch:
            return {'url': videoMatch.group(1), 'isVideo': True}
        imageMatch = self.imagePattern.search(source)
        if imageMatch:
            return {'url': imageMatch.group(1), 'isVideo': False}
        return None

    def getDestination(self, resource):
        """Return the local file path that ``resource`` is saved to."""
        extension = ".mp4" if resource["isVideo"] else ".jpg"
        digest = hashlib.sha224(resource["url"].encode()).hexdigest()
        return self.destinationFolder + digest + extension

    def download(self):
        """Download every resource returned by ``getResources``."""
        resources = self.getResources()
        print("Found %i resources" % len(resources))
        for resource in resources.values():
            destination = self.getDestination(resource)
            print(resource["url"] + " --> " + destination)
            urllib.request.urlretrieve(resource["url"], destination)
# Run the download only when the file is executed as a script, so that
# importing the module does not start network traffic as a side effect.
if __name__ == "__main__":
    # Fetch the images and videos of the "cats" category.
    advancedDownloader = AdvancedImgurDownloader("cats")
    advancedDownloader.download()

    # Alternative: use the simpler downloader that reads the image links
    # directly from the gallery page.
    #downloader = ImgurDownloader("cats")
    #print("About to download from " + downloader.getCategoryLink())
    #downloader.download()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment