Last active
January 21, 2016 13:57
-
-
Save vinovator/2799d606b5c8183bea2a to your computer and use it in GitHub Desktop.
Simple webscraper to download cartoon strips from cricinfo site.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python 2.7.6
# Cricinfoscraper_py2.py
"""
Simple web scraper that downloads cartoon strips from the Cricinfo site.
The URL is http://www.espncricinfo.com/ci/content/story/author.html?author=333
The page is scrolled dynamically so that more cartoons are fetched.
The Splinter / Selenium libraries are used to fetch JavaScript-rendered content.
"""
import requests | |
from BeautifulSoup import BeautifulSoup | |
import os | |
# Splinter abstracts selenium and webdriver | |
from splinter.browser import Browser | |
from PIL import Image # To resize image | |
from selenium import webdriver | |
# from selenium.webdriver.support.ui import WebDriverWait | |
# from selenium.webdriver.support import expected_conditions as EC | |
import time # to induce sleep time to browser | |
# Initialize parameters
# Site root; main() prepends it to the relative image paths of recent comics
base_url = "http://www.cricinfo.com/"
# Page that lists the cartoon strips; this is what the browser driver opens
comics_url = "http://www.espncricinfo.com/ci/content/story/author.html?author=333"
# This is the folder where the cartoon strips get downloaded
base_folder = "cricinfo"
def downsize_images(ratio):
    """
    Scale every JPG found under base_folder by the given ratio, in place.

    The images downloaded from cricinfo site are larger in size
    which makes it difficult to view in HTML page, so they are
    programmatically resized (main() passes 0.5 to halve them).

    ratio -- multiplier applied to both the width and the height.

    Images that cannot be opened, resized or saved are reported on stdout
    and skipped; nothing is returned.
    """
    for path, _dirs, files in os.walk(base_folder):
        for file_name in files:
            if file_name.split(".")[-1].lower() != "jpg":
                continue
            # Join with the directory os.walk is currently visiting rather
            # than base_folder, so files inside sub-folders are found too
            image_path = os.path.join(path, file_name)
            try:
                im = Image.open(image_path)
                w, h = im.size  # returns tuple with width & height
                newIm = im.resize((int(w * ratio), int(h * ratio)))
                # Save the resized image back with original name
                newIm.save(image_path)
            except Exception as e:
                # Deliberately best-effort: if any of the images cannot be
                # resized due to file corruption etc, skip and move on
                print(e)
                continue
    print("Images scaled down")
def write_html(text, mode):
    """
    Write text to the comics HTML page, <base_folder>/cricinfo.html.

    text -- markup to write; a byte string is decoded as UTF-8 first.
    mode -- text file mode handed to open, "w" to overwrite the page
            or "a" to append to it.
    """
    # io.open accepts an encoding on both Python 2 and Python 3
    import io

    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # Explicit UTF-8: with the default codec a non-ASCII character in the
    # scraped alt text or title would raise UnicodeEncodeError on Python 2
    html_path = os.path.join(base_folder, "cricinfo" + ".html")
    with io.open(html_path, mode, encoding="utf-8") as html:
        html.write(text)
def build_block(alt, title, src):
    """
    Construct HTML block (one table row) for each comic strip

    alt -- text placed in bold above the image.
    title -- text placed in italics below the image.
    src -- value used for the src attribute of the img tag.

    A None alt or title is rendered as empty text.
    Returns the markup as one string.
    """
    # None cannot be concatenated with a string; treat it as empty text
    alt = "" if alt is None else alt
    title = "" if title is None else title
    parts = [
        "<tr><td>",
        "<head><b>", alt, "</b></head>",
        "<p></P>",
        "<img src=", "\"", src, "\"", "/>",
        "<body><p><i>", title, "</i></p></body>",
        "<p></P>",
        "<p>-----------------------</p>",
        "</td></tr>",
    ]
    return "".join(parts)
def render_content_splinter():
    """
    Fetch the Javascript rendered content using Splinter
    Splinter uses Selenium in the background
    Firefox is used as the webdriver for ease of use

    Returns the HTML of comics_url as reported by the browser.
    """
    # Firefox is used as webdriver
    browser = Browser("firefox")
    try:
        # This is the url for cartoons page
        browser.visit(comics_url)
        # The rendered HTML is returned back
        return browser.html
    finally:
        # close the browser even when the visit fails, so no Firefox
        # instance is left behind
        browser.quit()
def render_content(scroll_pause=2, settle_time=20):
    """
    Fetch the Javascript rendered content

    Opens comics_url in Firefox through Selenium and keeps scrolling to the
    bottom of the page until the page height stops growing.

    scroll_pause -- seconds to sleep after each scroll before the page
                    height is measured again.
    settle_time -- seconds to sleep after the last scroll, before the page
                   source is read.

    Returns the page source as reported by the driver.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(comics_url)
        # Scroll the page until the end
        # End of page is determined by comparing height at each loop
        lastHeight = driver.execute_script(
            "return document.body.scrollHeight")
        while True:
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            newHeight = driver.execute_script(
                "return document.body.scrollHeight")
            if newHeight == lastHeight:
                break
            lastHeight = newHeight
        # wait for sometime to load the page fully
        time.sleep(settle_time)
        # The rendered HTML is returned back
        return driver.page_source
    finally:
        # Close the browser even when loading or scrolling fails
        driver.quit()
def _full_image_url(src):
    """
    Return src with the preview marker removed from its file name.

    The filename of the cartoon is formatted to indicate preview,
    for e.g. /db/PICTURES/CMS/218600/218653.4.jpg becomes
    /db/PICTURES/CMS/218600/218653.jpg.
    A file name without such a marker (name.jpg) is returned unchanged.
    """
    # Only look at the file name, so dots in the host part of a full URL
    # are never mistaken for the marker
    directory, slash, file_name = src.rpartition("/")
    pieces = file_name.split(".")
    if len(pieces) > 2:
        file_name = ".".join(pieces[:-2]) + ".jpg"
    return directory + slash + file_name


def _download_image(src, target_path):
    """
    Download the image at src and store it at target_path.

    Old comics have a full URL, recent comics have a URL relative to
    base_url; both are resolved against base_url.
    Raises requests.RequestException when the request fails or the server
    answers with an error status, so no error page is saved as an image.
    """
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    # urljoin keeps absolute URLs as they are and avoids the double slash
    # that plain concatenation of base_url and "/db/..." produces
    resp = requests.get(urljoin(base_url, src), timeout=30)
    resp.raise_for_status()
    with open(target_path, "wb") as jpg:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                jpg.write(chunk)


def main():
    """
    Starting block of the program
    Render the cricinfo comics using browser driver, download every comic
    image into base_folder, list them in an HTML page and scale them down.
    """
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)
    # Fetch the Javascript rendered content
    html = render_content()
    soup = BeautifulSoup(html)
    cartoon_div = soup.findAll("div", {"class": "story-imgwrap"})
    # Start constructing the HTML file, overwrite existing content
    write_html("<html>", "w")
    for tag in cartoon_div:
        for attr in tag.findAll("img", {"class": "img-full"}):
            src = attr.get("src")  # Image url
            alt = attr.get("alt")  # alt text
            title = attr.get("title")  # title of the comics
            # An img tag without src cannot be downloaded
            if not src:
                print("image without src skipped")
                continue
            # Some urls are corrupted and dont point to jpg
            # Skip such files
            if src.split(".")[-1].lower() != "jpg":
                print("{} skipped".format(src))
                continue
            # Drop the preview marker to get the full image
            src = _full_image_url(src)
            # strip out the relative path to get file name
            img_file = src.split("/")[-1]
            try:
                _download_image(src, os.path.join(base_folder, img_file))
                print(
                    "{} - {} downloaded from {}".format(img_file, title, src))
            except Exception as e:
                # Deliberately best-effort: one comic that cannot be
                # fetched or stored must not stop the remaining downloads
                print(e)
                continue
            # Append the html file with the block for this comic; missing
            # alt/title are passed as empty text
            write_html(build_block(alt or "", title or "", img_file), "a")
    # End constructing HTML file, in append mode
    write_html("</html>", "a")
    print("\nComics file created at path {0}".format(
        os.path.join(base_folder, "cricinfo" + ".html")))
    # Reduce the size of the images downloaded
    downsize_images(0.5)
    print("--------------Done---------------------")


if __name__ == "__main__":
    # Starting block of program
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment