Simple web scraper to download cartoon strips from the cricinfo site.
# python 2.7.6
# Cricinfoscraper_py2.py
"""
Simple webscraper to download cartoons from cricinfo site.
The url is http://www.espncricinfo.com/ci/content/story/author.html?author=333
Changed the code to dynamically scroll the page to fetch more cartoons.
Using Splinter/ Selenium library to fetch javascript rendered content
"""
import requests
from BeautifulSoup import BeautifulSoup
import os
# Splinter abstracts selenium and webdriver
from splinter.browser import Browser
from PIL import Image # To resize image
from selenium import webdriver
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
import time  # to pause while the browser loads content
# Initialize parameters
base_url = "http://www.cricinfo.com/"
comics_url = "http://www.espncricinfo.com/ci/content/story/author.html?author=333"
# This is the folder where the cartoon strips get downloaded
base_folder = "cricinfo"


def downsize_images(ratio):
    """
    The images downloaded from the cricinfo site are large,
    which makes them difficult to view in an HTML page.
    Programmatically resize each image by the given ratio.
    """
    for path, dirs, files in os.walk(base_folder):
        for file_name in files:
            try:
                if file_name.split(".")[-1].lower() == "jpg":
                    im = Image.open(os.path.join(path, file_name))
                    w, h = im.size  # returns tuple with width & height
                    newIm = im.resize((int(w * ratio), int(h * ratio)))
                    # Save the resized image back with the original name
                    newIm.save(os.path.join(path, file_name))
            except Exception as e:
                print(e)
                # If any of the images cannot be resized due to file
                # corruption etc., skip and move on
                continue
    print("Images scaled down")


def write_html(text, mode):
    """
    Write all comic strips to a HTML page
    """
    with open(os.path.join(base_folder, "cricinfo" + ".html"), mode) as html:
        html.write(text)


def build_block(alt, title, src):
    """
    Construct an HTML block for each comic strip
    """
    block = "<tr><td>"
    block += "<b>" + alt + "</b>"
    block += "<p></p>"
    block += "<img src=\"" + src + "\"/>"
    block += "<p><i>" + title + "</i></p>"
    block += "<p></p>"
    block += "<p>-----------------------</p>"
    block += "</td></tr>"
    return block
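
# For example, build_block("Title text", "Caption", "218653.jpg") returns
# (as a single string):
# <tr><td><b>Title text</b><p></p><img src="218653.jpg"/>
# <p><i>Caption</i></p><p></p><p>-----------------------</p></td></tr>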


def render_content_splinter():
    """
    Fetch the Javascript rendered content using Splinter
    Splinter uses Selenium in the background
    Firefox is used as the webdriver for ease of use
    """
    # Firefox is used as webdriver
    browser = Browser("firefox")
    # This is the url for the cartoons page
    browser.visit(comics_url)
    html = browser.html
    # close the browser
    browser.quit()
    # The rendered HTML is returned back
    return html
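
# Note: this Splinter version grabs the page as first loaded, without
# scrolling, so it only captures the initial batch of strips; main() uses
# render_content() below instead.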


def render_content():
    """
    Fetch the Javascript rendered content
    """
    driver = webdriver.Firefox()
    driver.get(comics_url)
    # Scroll the page until the end
    # End of page is determined by comparing the height at each loop
    lastHeight = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        # WebDriverWait(driver, 20).until(EC.presence_of_element_located(
        #     driver.find_element_by_class_name('img-full')))
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
            break
        lastHeight = newHeight
    # wait for some time to let the page load fully
    time.sleep(20)
    html = driver.page_source
    # Close the browser
    driver.quit()
    # The rendered HTML is returned back
    return html
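
# The scroll loop assumes each 2-second pause is enough for the next batch
# of strips to load; on a slow connection the loop may stop early, so the
# sleep values may need tuning.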


def main():
    """
    Starting block of the program
    Render the cricinfo comics using a browser driver
    """
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)
    # Fetch the Javascript rendered content
    html = render_content()
    soup = BeautifulSoup(html)
    cartoon_div = soup.findAll("div", {"class": "story-imgwrap"})
    # Start constructing the HTML file, overwrite existing content
    write_html("<html><body><table>", "w")
    for tag in cartoon_div:
        attrs = tag.findAll("img", {"class": "img-full"})
        for attr in attrs:
            src = attr.get("src")  # Image url
            alt = attr.get("alt")  # alt text
            title = attr.get("title")  # title of the comic
            # Some urls are corrupted and don't point to a jpg;
            # skip such files
            if not (src.split(".")[-1].lower() == "jpg"):
                print("{} skipped".format(src))
                continue
            # The filename of the cartoon may be formatted to indicate a
            # preview, e.g. /db/PICTURES/CMS/218600/218653.4.jpg.
            # The .x.jpg at the end indicates a preview image;
            # strip the digit following the dot to get the full image
            parts = src.split(".")
            if len(parts) > 2 and parts[-2].isdigit():
                src = ".".join(parts[:-2]) + ".jpg"
            # Old comics have a full URL,
            # e.g. http://p.imgci.com/db/PICTURES/CMS/166300/166361.jpg
            if src.startswith("http"):
                resp = requests.get(src, stream=True)
            # Recent comics have a relative url,
            # e.g. /db/PICTURES/CMS/216000/216029.jpg
            else:
                resp = requests.get(base_url + src.lstrip("/"), stream=True)
            # strip out the relative path to get the file name
            img_file = src.split("/")[-1]
            try:
                with open(os.path.join(base_folder, img_file), "wb") as jpg:
                    for chunk in resp.iter_content(chunk_size=1024):
                        if chunk:
                            jpg.write(chunk)
                print(
                    "{} - {} downloaded from {}".format(img_file, title, src))
            except Exception as e:
                print(e)
                continue
            # Build the html block for this iteration's comic and append
            # it to the html file
            write_html(build_block(alt, title, img_file), "a")
    # End constructing the HTML file, in append mode
    write_html("</table></body></html>", "a")
    print("\nComics file created at path {0}".format(
        os.path.join(base_folder, "cricinfo" + ".html")))
    # Reduce the size of the downloaded images
    downsize_images(0.5)
    print("--------------Done---------------------")
if __name__ == "__main__":
# Starting block of program
main()