Created
March 23, 2019 10:51
-
-
Save wasi0013/e6a51de8f9d0cdb2a657969f44967fa3 to your computer and use it in GitHub Desktop.
Find unoptimized images on a webpage using the requests_html Python library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests_html | |
def unoptimized_images(url):
    """
    Find unoptimized (oversized) images in a webpage.

    Fetches the page, requests every ``<img>`` element that has a ``src``
    attribute, and reports the images whose ``Content-Length`` response
    header indicates a large file. Images that cannot be fetched, or that
    do not advertise a usable ``Content-Length``, are reported on stdout
    and left out of the result list.

    :param url: webpage_url
    :return: tuple ``(image_count, images)``: ``image_count`` is the number
        of images fetched with HTTP status 200; ``images`` is a list of
        ``{'url': ..., 'size(KB)': ...}`` dicts, one per oversized image
    """
    from urllib.parse import urljoin

    images = []
    image_count = 0

    # The session owns a connection pool; the context manager closes it even
    # when the page request itself raises.
    with requests_html.HTMLSession() as session:
        response = session.get(url)

        for element in response.html.find("img"):
            src = element.attrs.get("src")
            if not src:
                continue

            # urljoin leaves absolute URLs untouched and correctly resolves
            # relative, root-relative ("/a.png") and protocol-relative
            # ("//cdn.example/a.png") references against the base URL.
            image_url = urljoin(element.base_url, src)

            try:
                image_response = session.get(image_url)
            except (ValueError, OSError) as error:
                # requests' exceptions derive from OSError (malformed-URL
                # ones also from ValueError), e.g. for "data:" URIs or
                # unreachable hosts. One bad image must not abort the scan.
                print("Error fetching image:", image_url, error)
                continue

            if image_response.status_code != 200:
                continue
            image_count += 1

            try:
                # Size in units of 1000 bytes, taken from the header only.
                image_size = int(image_response.headers.get("Content-Length")) / 1000
            except (TypeError, ValueError):
                # Header missing (None) or not an integer.
                print("Error fetching image size for:", image_url)
                continue

            # Flag anything of roughly 1 MB or more.
            if image_size >= 1024:
                images.append({
                    'url': image_url,
                    'size(KB)': image_size,
                })

    return image_count, images
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment