Created
April 26, 2016 13:59
-
-
Save bactisme/8c56aa6f04f3df1fce7b4346a2522049 to your computer and use it in GitHub Desktop.
Take a file containing a list of URLs; print the average, sum and max file size
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import requests | |
from lxml import html | |
import sys | |
def sum_avg_max(images):
    """Return the total, average and maximum size of a list of images.

    Args:
        images: sequence of ``(url, size)`` pairs.

    Returns:
        A ``(total, average, max_size)`` tuple. When ``images`` is empty
        the result is ``(0, 0, 0)``.
    """
    if not images:
        # An empty list would otherwise divide by zero (and max() of an
        # empty sequence raises ValueError).
        return (0, 0, 0)
    sizes = [size for _, size in images]
    total = sum(sizes)
    return (total, total / len(sizes), max(sizes))
def get_page_content(url):
    """Fetch a page and measure the images found in its post content.

    The page at ``url`` is downloaded and parsed; the ``src`` of every
    ``<img>`` located under a ``<div>`` whose class attribute is exactly
    ``post-content`` is collected. A HEAD request is then issued for each
    image and its ``Content-Length`` header is read as the image size.

    Args:
        url: address of the page to inspect.

    Returns:
        A list of ``(image_url, size)`` tuples. Images whose size could
        not be determined (request failure, missing or non-numeric
        ``Content-Length``) are left out.

    Raises:
        requests.RequestException: if the page itself cannot be fetched.
    """
    r = requests.get(url, timeout=15)
    tree = html.fromstring(r.content)
    srcs = tree.xpath('//div[@class="post-content"]//img/@src')
    results = []
    for image in srcs:
        # Protocol-relative URLs ("//host/path") need an explicit scheme.
        if image.startswith("//"):
            image = "http:" + image
        try:
            head = requests.head(image, timeout=5)
            size = int(head.headers["content-length"])
        except (requests.RequestException, KeyError, ValueError):
            # Best effort: skip images that cannot be measured, but do not
            # hide unrelated errors or KeyboardInterrupt.
            continue
        results.append((image, size))
    return results
def checkimagesize(file):
    """Print one comma-separated line of image statistics per listed URL.

    Each non-blank line of ``file`` is treated as a page URL; lines that
    start with ``www`` get an ``http://`` prefix. For every page the
    output line is ``url,image_count,total,average,max``.

    Args:
        file: path of a text file containing one URL per line.
    """
    # The context manager closes the file even if a request raises.
    with open(file, "r") as fp:
        for line in fp:
            line = line.strip(' \t\n\r')
            if not line:
                # Blank lines are not URLs; skip them instead of requesting "".
                continue
            if line.startswith("www"):
                line = "http://" + line
            images = get_page_content(line)
            # A page without measurable images has no average or max;
            # report zeros rather than crashing.
            stats = sum_avg_max(images) if images else (0, 0, 0)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(line + (",%d" % len(images)) + (",%d,%d,%d" % stats))
if __name__ == "__main__":
    # Run only when executed as a script, so importing the module has no
    # side effects; the first argument is the file listing the URLs.
    if len(sys.argv) < 2:
        sys.exit("usage: %s URL_LIST_FILE" % sys.argv[0])
    checkimagesize(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment