Created
February 22, 2017 07:21
-
-
Save moskomule/e50a09e7d505b8d883ae1abf5c7a8a23 to your computer and use it in GitHub Desktop.
Save images from the specified pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import shutil
import sys
import urllib.parse
import urllib.request
from html.parser import HTMLParser
def download(url, path):
    """Download the resource at ``url`` into the directory ``path``.

    The local file name is the last segment of the URL's path component, so a
    query string or fragment never leaks into the file name.

    Args:
        url: Address of the resource to fetch. Surrounding whitespace (such
            as a trailing newline from a text file) is ignored.
        path: Directory in which the file is written.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If ``url`` has no final path segment to use as a file
            name (for example, it ends with ``/``).
    """
    url = url.strip()
    name = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1]
    if not name:
        # Fail before any network traffic instead of trying to open the
        # target directory itself as a file.
        raise ValueError(
            "cannot derive a file name from URL: {!r}".format(url)
        )

    file_name = os.path.join(path, name)
    # Open the remote side first so that a failed request leaves no empty
    # local file behind; copy in chunks rather than buffering the whole body.
    with urllib.request.urlopen(url) as remote, open(file_name, "wb") as local:
        shutil.copyfileobj(remote, local)
    return file_name
class TestParser(HTMLParser):
    """HTML parser that downloads images advertised by ``<link>`` tags.

    For every ``<link>`` start tag whose ``rel`` attribute contains the token
    ``image_src`` and that has a non-empty ``href``, the ``href`` target is
    passed to ``download`` together with the module-level name ``path``.
    """

    def __init__(self):
        super().__init__()
        # Nothing in this class assigns a non-empty value to ``url``; it is
        # kept because ``handle_endtag`` reads it.
        self.url = ""

    def handle_starttag(self, tag, attrs):
        """Download the ``href`` of a ``<link rel="image_src">`` tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs; a value is ``None`` when
                the attribute is written without one.
        """
        if tag != "link":
            return
        attrs = dict(attrs)
        # ``rel`` is a space-separated token list and may be missing or
        # valueless, so never index it directly.
        rel_tokens = (attrs.get("rel") or "").lower().split()
        href = attrs.get("href")
        if "image_src" in rel_tokens and href:
            # NOTE(review): ``path`` is not defined in this class; it is
            # presumably a module-level download directory - confirm.
            download(href, path)

    def handle_endtag(self, tag):
        """Clear ``self.url`` when it holds a value starting with ``https``."""
        if self.url and re.match('^https', self.url):
            self.url = ""
parser = TestParser()

# NOTE(review): ``csv`` is not defined in the visible source; it is used here
# as the path of a text file with one page URL per line. ``path`` (read by
# TestParser.handle_starttag) is likewise expected to exist - confirm both
# are set before this point.
with open(csv) as f:
    # Drop surrounding whitespace and skip blank lines so that an empty line
    # is not treated as a URL.
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    try:
        with urllib.request.urlopen(url) as response:
            # Honour the charset declared by the server; fall back to UTF-8
            # and replace undecodable bytes rather than dropping the page.
            charset = response.headers.get_content_charset() or "utf-8"
            page = response.read().decode(charset, errors="replace")
        # Start each page from a clean parser state.
        parser.reset()
        parser.feed(page)
        parser.close()
    except Exception as exc:
        # Best effort: one bad page must not stop the rest, but the failure
        # is reported instead of being silently discarded.
        print("skipping {}: {!r}".format(url, exc), file=sys.stderr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment