vinovator · January 21, 2016 13:57
diff --git a/Cricinfoscraper_py2.py b/Cricinfoscraper_py2.py
 # python 2.7.6
 # Cricinfoscraper_py2.py

 """
 Simple webscraper to download cartoons from cricinfo site.
 The url is http://www.espncricinfo.com/ci/content/story/author.html?author=333
 Changed the code to dynamically scroll the page to fetch more cartoons.
 Using Splinter/ Selenium library to fetch javascript rendered content
 """

 import requests
 from BeautifulSoup import BeautifulSoup
 import os
 # Splinter abstracts selenium and webdriver
 from splinter.browser import Browser
 from PIL import Image  # To resize image
 from selenium import webdriver
 # from selenium.webdriver.support.ui import WebDriverWait
 # from selenium.webdriver.support import expected_conditions as EC
 import time  # to induce sleep time to browser


 # Initialize parameters
 # Site root; relative image paths scraped from the page are resolved
 # against it before download
 base_url = "http://www.cricinfo.com/"
 # Cartoons listing page that is loaded in the browser and scraped
 comics_url = "http://www.espncricinfo.com/ci/content/story/author.html?author=333"

 # This is the folder where the cartoon strips and the generated HTML
 # page get written
 base_folder = "cricinfo"


 def downsize_images(ratio):
    """
    Scale every JPEG found under ``base_folder`` by ``ratio``, in place.

    The images downloaded from the cricinfo site are large, which makes
    them awkward to view in the generated HTML page.

    ratio -- multiplier applied to both width and height
             (e.g. 0.5 halves each dimension)

    A file that cannot be opened, resized or saved is reported on stdout
    and skipped, so one corrupt download does not stop the run.
    """
    for path, dirs, files in os.walk(base_folder):
        for file_name in files:
            if file_name.split(".")[-1].lower() != "jpg":
                continue
            # os.walk also descends into sub-folders, so the file lives in
            # `path`, which is not necessarily base_folder itself.
            image_path = os.path.join(path, file_name)
            try:
                im = Image.open(image_path)
                w, h = im.size  # tuple of (width, height)
                resized = im.resize((int(w * ratio), int(h * ratio)))
                # Overwrite the original file with the scaled-down copy
                resized.save(image_path)
            except Exception as e:
                # Best effort: report the problem and move on
                print(e)
    print("Images scaled down")


 def write_html(text, mode):
    """
    Write ``text`` to the comics HTML page inside ``base_folder``.

    text -- markup to write
    mode -- file mode handed to open(); callers in this file pass "w" to
            start a fresh page and "a" to append to it
    """
    page_path = os.path.join(base_folder, "cricinfo" + ".html")
    with open(page_path, mode) as page:
        page.write(text)


 def build_block(alt, title, src):
    """
    Construct the HTML block for one comic strip.

    alt   -- heading shown in bold above the image
    title -- caption shown in italics below the image
    src   -- value placed in the <img> src attribute

    An argument that is None is rendered as an empty string: a scraped
    <img> tag may lack an alt or title attribute, and concatenating None
    to a string would raise TypeError.

    Returns the whole block as a single string.
    """
    alt = "" if alt is None else alt
    title = "" if title is None else title
    src = "" if src is None else src

    # NOTE(review): values are inserted verbatim (no HTML escaping), as
    # before; confirm whether the scraped text is already entity-encoded
    # before adding escaping here.
    parts = [
        "<tr><td>",
        "<head><b>" + alt + "</b></head>",
        "<p></P>",
        "<img src=\"" + src + "\"/>",
        "<body><p><i>" + title + "</i></p></body>",
        "<p></P>",
        "<p>-----------------------</p>",
        "</td></tr>",
    ]
    return "".join(parts)


 def render_content_splinter():
    """
    Fetch the Javascript rendered content using Splinter.

    Splinter uses Selenium in the background; Firefox is used as the
    webdriver for ease of use.

    Returns the HTML of ``comics_url`` as reported by the browser.
    """
    # Firefox is used as webdriver
    browser = Browser("firefox")
    try:
        # This is the url for cartoons page
        browser.visit(comics_url)
        return browser.html
    finally:
        # Close the browser even when loading the page fails, so no
        # browser instance is left running
        browser.quit()


 def render_content():
    """
    Fetch the Javascript rendered content with a Selenium Firefox driver.

    The page is scrolled to the bottom repeatedly so that further content
    gets loaded; scrolling stops once the document height no longer
    changes between two passes.

    Returns the page source of ``comics_url`` after scrolling.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(comics_url)

        # End of page is determined by comparing height at each loop
        last_height = driver.execute_script(
            "return document.body.scrollHeight")

        while True:
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Pause before re-measuring the page height
            time.sleep(2)

            new_height = driver.execute_script(
                "return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Wait for some time to let the page load fully
        time.sleep(20)

        return driver.page_source
    finally:
        # Close the browser even when something above fails, so no
        # browser instance is left running
        driver.quit()


 def _full_image_src(src):
    """Return src with the preview index (the "4" in "218653.4.jpg") removed."""
    folder, sep, file_name = src.rpartition("/")
    parts = file_name.split(".")
    # Only "<name>.<digits>.jpg" is a preview name; a plain "<name>.jpg"
    # must be left untouched or its name would be stripped away.
    if len(parts) >= 3 and parts[-2].isdigit():
        file_name = ".".join(parts[:-2]) + ".jpg"
    return folder + sep + file_name


 def _absolute_image_url(src):
    """Return a fetchable URL for src, resolving relative paths on base_url."""
    # Protocol-relative URL, e.g. //p.imgci.com/db/PICTURES/x.jpg
    if src.startswith("//"):
        return "http:" + src
    # Old comics have a full URL
    # e.g. http://p.imgci.com/db/PICTURES/CMS/166300/166361.jpg
    if "http" in src.split("/")[0]:
        return src
    # Recent comics have a relative url
    # e.g. /db/PICTURES/CMS/216000/216029.jpg
    # Join with exactly one slash between site root and path
    return base_url.rstrip("/") + "/" + src.lstrip("/")


 def _download_image(url, dest_path):
    """Save the content of url to dest_path; raises on network/HTTP errors."""
    resp = requests.get(url, timeout=30)
    # Fail before touching the disk so an error page is never saved as .jpg
    resp.raise_for_status()
    with open(dest_path, "wb") as jpg:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                jpg.write(chunk)


 def main():
    """
    Starting block of the program.

    Renders the cricinfo comics page using a browser driver, downloads
    every cartoon image found on it into ``base_folder``, writes an HTML
    page listing the downloaded strips and finally scales the images down.

    An image that cannot be downloaded or saved is reported on stdout and
    left out of the HTML page; the run continues with the next image.
    """
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)

    # Fetch the Javascript rendered content
    html = render_content()

    soup = BeautifulSoup(html)

    cartoon_div = soup.findAll("div", {"class": "story-imgwrap"})

    # Start constructing the HTML file, overwrite existing content
    write_html("<html>", "w")

    for tag in cartoon_div:
        for img in tag.findAll("img", {"class": "img-full"}):
            src = img.get("src")  # Image url
            # A missing attribute comes back as None; use "" so the HTML
            # block can still be built
            alt = img.get("alt") or ""  # alt text
            title = img.get("title") or ""  # title of the comics

            # Some urls are missing or corrupted and dont point to jpg
            # Skip such files
            if not src or src.split(".")[-1].lower() != "jpg":
                print("{} skipped".format(src))
                continue

            # The filename of the cartoon is formatted to indicate preview,
            # e.g. /db/PICTURES/CMS/218600/218653.4.jpg.
            # Drop the preview index to get the full image
            src = _full_image_src(src)

            # strip out the relative path to get file name
            img_file = src.split("/")[-1]

            try:
                _download_image(
                    _absolute_image_url(src),
                    os.path.join(base_folder, img_file))
                print(
                    "{} - {} downloaded from {}".format(img_file, title, src))
            except Exception as e:
                # Best effort: report the failure and go on to the next one
                print(e)
                continue

            # Append the block for this comic to the html file
            write_html(build_block(alt, title, img_file), "a")

    # End constructing HTML file, in append mode
    write_html("</html>", "a")

    print("\nComics file created at path {0}".format(
        os.path.join(base_folder, "cricinfo" + ".html")))

    # Reduce the size of the images downloaded
    downsize_images(0.5)

    print("--------------Done---------------------")


 if __name__ == "__main__":
    # Run the scraper only when the file is executed as a script
    main()
	# python 2.7.6
	# Cricinfoscraper_py2.py

	"""
	Simple webscraper to download cartoons from cricinfo site.
	The url is http://www.espncricinfo.com/ci/content/story/author.html?author=333
	Changed the code to dynamically scroll the page to fetch more cartoons.
	Using Splinter/ Selenium library to fetch javascript rendered content
	"""

	import requests
	from BeautifulSoup import BeautifulSoup
	import os
	# Splinter abstracts selenium and webdriver
	from splinter.browser import Browser
	from PIL import Image # To resize image
	from selenium import webdriver
	# from selenium.webdriver.support.ui import WebDriverWait
	# from selenium.webdriver.support import expected_conditions as EC
	import time # to induce sleep time to browser


	# Initialize parameters
	# Site root; relative image paths scraped from the page are resolved
	# against it before download
	base_url = "http://www.cricinfo.com/"
	# Cartoons listing page that is loaded in the browser and scraped
	comics_url = "http://www.espncricinfo.com/ci/content/story/author.html?author=333"

	# This is the folder where the cartoon strips and the generated HTML
	# page get written
	base_folder = "cricinfo"


	def downsize_images(ratio):
	    """
	    Scale every JPEG found under ``base_folder`` by ``ratio``, in place.

	    The images downloaded from the cricinfo site are large, which makes
	    them awkward to view in the generated HTML page.

	    ratio -- multiplier applied to both width and height
	             (e.g. 0.5 halves each dimension)

	    A file that cannot be opened, resized or saved is reported on stdout
	    and skipped, so one corrupt download does not stop the run.
	    """
	    for path, dirs, files in os.walk(base_folder):
	        for file_name in files:
	            if file_name.split(".")[-1].lower() != "jpg":
	                continue
	            # os.walk also descends into sub-folders, so the file lives
	            # in `path`, which is not necessarily base_folder itself.
	            image_path = os.path.join(path, file_name)
	            try:
	                im = Image.open(image_path)
	                w, h = im.size  # tuple of (width, height)
	                resized = im.resize((int(w * ratio), int(h * ratio)))
	                # Overwrite the original file with the scaled-down copy
	                resized.save(image_path)
	            except Exception as e:
	                # Best effort: report the problem and move on
	                print(e)
	    print("Images scaled down")


	def write_html(text, mode):
	    """
	    Write ``text`` to the comics HTML page inside ``base_folder``.

	    text -- markup to write
	    mode -- file mode handed to open(); callers in this file pass "w" to
	            start a fresh page and "a" to append to it
	    """
	    page_path = os.path.join(base_folder, "cricinfo" + ".html")
	    with open(page_path, mode) as page:
	        page.write(text)


	def build_block(alt, title, src):
	    """
	    Construct the HTML block for one comic strip.

	    alt   -- heading shown in bold above the image
	    title -- caption shown in italics below the image
	    src   -- value placed in the <img> src attribute

	    An argument that is None is rendered as an empty string: a scraped
	    <img> tag may lack an alt or title attribute, and concatenating None
	    to a string would raise TypeError.

	    Returns the whole block as a single string.
	    """
	    alt = "" if alt is None else alt
	    title = "" if title is None else title
	    src = "" if src is None else src

	    # NOTE(review): values are inserted verbatim (no HTML escaping), as
	    # before; confirm whether the scraped text is already entity-encoded
	    # before adding escaping here.
	    parts = [
	        "<tr><td>",
	        "<head><b>" + alt + "</b></head>",
	        "<p></P>",
	        "<img src=\"" + src + "\"/>",
	        "<body><p><i>" + title + "</i></p></body>",
	        "<p></P>",
	        "<p>-----------------------</p>",
	        "</td></tr>",
	    ]
	    return "".join(parts)


	def render_content_splinter():
	    """
	    Fetch the Javascript rendered content using Splinter.

	    Splinter uses Selenium in the background; Firefox is used as the
	    webdriver for ease of use.

	    Returns the HTML of ``comics_url`` as reported by the browser.
	    """
	    # Firefox is used as webdriver
	    browser = Browser("firefox")
	    try:
	        # This is the url for cartoons page
	        browser.visit(comics_url)
	        return browser.html
	    finally:
	        # Close the browser even when loading the page fails, so no
	        # browser instance is left running
	        browser.quit()


	def render_content():
	    """
	    Fetch the Javascript rendered content with a Selenium Firefox driver.

	    The page is scrolled to the bottom repeatedly so that further content
	    gets loaded; scrolling stops once the document height no longer
	    changes between two passes.

	    Returns the page source of ``comics_url`` after scrolling.
	    """
	    driver = webdriver.Firefox()
	    try:
	        driver.get(comics_url)

	        # End of page is determined by comparing height at each loop
	        last_height = driver.execute_script(
	            "return document.body.scrollHeight")

	        while True:
	            driver.execute_script(
	                "window.scrollTo(0, document.body.scrollHeight);")
	            # Pause before re-measuring the page height
	            time.sleep(2)

	            new_height = driver.execute_script(
	                "return document.body.scrollHeight")
	            if new_height == last_height:
	                break
	            last_height = new_height

	        # Wait for some time to let the page load fully
	        time.sleep(20)

	        return driver.page_source
	    finally:
	        # Close the browser even when something above fails, so no
	        # browser instance is left running
	        driver.quit()


	def _full_image_src(src):
	    """Return src with the preview index (the "4" in "218653.4.jpg") removed."""
	    folder, sep, file_name = src.rpartition("/")
	    parts = file_name.split(".")
	    # Only "<name>.<digits>.jpg" is a preview name; a plain "<name>.jpg"
	    # must be left untouched or its name would be stripped away.
	    if len(parts) >= 3 and parts[-2].isdigit():
	        file_name = ".".join(parts[:-2]) + ".jpg"
	    return folder + sep + file_name


	def _absolute_image_url(src):
	    """Return a fetchable URL for src, resolving relative paths on base_url."""
	    # Protocol-relative URL, e.g. //p.imgci.com/db/PICTURES/x.jpg
	    if src.startswith("//"):
	        return "http:" + src
	    # Old comics have a full URL
	    # e.g. http://p.imgci.com/db/PICTURES/CMS/166300/166361.jpg
	    if "http" in src.split("/")[0]:
	        return src
	    # Recent comics have a relative url
	    # e.g. /db/PICTURES/CMS/216000/216029.jpg
	    # Join with exactly one slash between site root and path
	    return base_url.rstrip("/") + "/" + src.lstrip("/")


	def _download_image(url, dest_path):
	    """Save the content of url to dest_path; raises on network/HTTP errors."""
	    resp = requests.get(url, timeout=30)
	    # Fail before touching the disk so an error page is never saved as .jpg
	    resp.raise_for_status()
	    with open(dest_path, "wb") as jpg:
	        for chunk in resp.iter_content(chunk_size=1024):
	            if chunk:
	                jpg.write(chunk)


	def main():
	    """
	    Starting block of the program.

	    Renders the cricinfo comics page using a browser driver, downloads
	    every cartoon image found on it into ``base_folder``, writes an HTML
	    page listing the downloaded strips and finally scales the images down.

	    An image that cannot be downloaded or saved is reported on stdout and
	    left out of the HTML page; the run continues with the next image.
	    """
	    if not os.path.exists(base_folder):
	        os.makedirs(base_folder)

	    # Fetch the Javascript rendered content
	    html = render_content()

	    soup = BeautifulSoup(html)

	    cartoon_div = soup.findAll("div", {"class": "story-imgwrap"})

	    # Start constructing the HTML file, overwrite existing content
	    write_html("<html>", "w")

	    for tag in cartoon_div:
	        for img in tag.findAll("img", {"class": "img-full"}):
	            src = img.get("src")  # Image url
	            # A missing attribute comes back as None; use "" so the HTML
	            # block can still be built
	            alt = img.get("alt") or ""  # alt text
	            title = img.get("title") or ""  # title of the comics

	            # Some urls are missing or corrupted and dont point to jpg
	            # Skip such files
	            if not src or src.split(".")[-1].lower() != "jpg":
	                print("{} skipped".format(src))
	                continue

	            # The filename of the cartoon is formatted to indicate
	            # preview, e.g. /db/PICTURES/CMS/218600/218653.4.jpg.
	            # Drop the preview index to get the full image
	            src = _full_image_src(src)

	            # strip out the relative path to get file name
	            img_file = src.split("/")[-1]

	            try:
	                _download_image(
	                    _absolute_image_url(src),
	                    os.path.join(base_folder, img_file))
	                print(
	                    "{} - {} downloaded from {}".format(
	                        img_file, title, src))
	            except Exception as e:
	                # Best effort: report the failure and go on to the next
	                print(e)
	                continue

	            # Append the block for this comic to the html file
	            write_html(build_block(alt, title, img_file), "a")

	    # End constructing HTML file, in append mode
	    write_html("</html>", "a")

	    print("\nComics file created at path {0}".format(
	        os.path.join(base_folder, "cricinfo" + ".html")))

	    # Reduce the size of the images downloaded
	    downsize_images(0.5)

	    print("--------------Done---------------------")


	if __name__ == "__main__":
	    # Run the scraper only when the file is executed as a script
	    main()