Last active
October 30, 2016 19:04
-
-
Save suhithr/9a06a422f68a155a6b16494d81f5a21f to your computer and use it in GitHub Desktop.
A very simple 9gag scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code used for the Delta Workshop on Python and Linux 2016 at NIT, Trichy | |
import urllib | |
from time import sleep | |
import requests # http://docs.python-requests.org/en/master/ | |
from bs4 import BeautifulSoup # https://www.crummy.com/software/BeautifulSoup/bs4/doc/ | |
# Try printing out the variables at various stages to view the objects | |
# Better yet execute this in your Python interpreter to see all the action happening | |
# and print each variable | |
# Fetch the 9gag front page and save the first few feed images to the current
# directory, naming each file after the image's alt text.
url = "http://9gag.com"
# A timeout keeps the script from hanging forever if the server never answers
r = requests.get(url, timeout=10)
r.raise_for_status()  # Fail loudly on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.text, 'html.parser')

# select() takes a CSS selector: 'img.badge-item-img' matches every <img> tag
# that has the class "badge-item-img" (the general format is tag.class#id).
# We are not using soup.find_all('img') because it gets all the images,
# and some of them are badges and icons; we only want images that are part of the feed.
img_tags = soup.select('img.badge-item-img')

# We will only download the first ten images
n = 10
for img in img_tags:
    if n <= 0:
        break  # Quota reached, no need to look at the remaining tags

    # Tag.get() returns None when the attribute is missing,
    # whereas img['src'] would raise a KeyError
    src = img.get('src')
    alt = img.get('alt')
    if not src or not alt:
        continue  # Without a source or a name there is nothing to save

    sleep(1)  # Adding a delay so we don't flood 9gag's servers

    try:
        image = requests.get(src, timeout=10)
        image.raise_for_status()
    except requests.RequestException as err:
        # One broken image should not abort the whole run
        print("Skipping {}: {}".format(src, err))
        continue

    # The alt text doubles as the file name; a '/' in it would be read as a
    # directory separator, so replace it
    filename = alt.replace('/', '_')
    with open(filename, 'wb') as f:
        f.write(image.content)

    # print(...) with a single argument behaves the same on Python 2 and 3
    print("Downloaded {}".format(alt))
    n -= 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment