Skip to content

Instantly share code, notes, and snippets.

@bongkook
Last active August 29, 2015 13:57
Show Gist options
  • Save bongkook/9447005 to your computer and use it in GitHub Desktop.
Save bongkook/9447005 to your computer and use it in GitHub Desktop.
Parses an HTML file and extracts only the image file links
# -*- coding: utf-8 -*-
import re, sys
def extractimgs(html):
    """Find Naver webtoon image links inside ``<img>`` tags of *html*.

    Only ``src`` values on host imgcomic.naver.net under
    ``/webtoon/<digits>/<digits>/`` that end in jpg, png or gif are matched.
    Both http and https links are accepted.

    Args:
        html: Text of the HTML document to scan.

    Returns:
        A list with one ``(full_link, file_name, extension)`` tuple per
        matching tag, in document order; an empty list when nothing matches.
    """
    # [^>] keeps the scan inside a single <img ...> tag (and, unlike '.',
    # also matches newlines, so tags wrapped over several lines are found).
    # [^"] keeps the file name inside the src attribute value so it cannot
    # run on into a later attribute that happens to end in an image suffix.
    exp = re.compile(
        r'<img[^>]+?src="'
        r'(https?://imgcomic\.naver\.net/webtoon/[0-9]+/[0-9]+/'
        r'([^"]+?\.(jpg|png|gif)))'
        r'"[^>]*>'
    )
    # With several groups, findall() yields one tuple of groups per match.
    return exp.findall(html)
def main(argv):
    """Command-line entry point: print every matching image of an HTML file.

    Args:
        argv: Argument vector in ``sys.argv`` form; it must hold exactly two
            items, the second being the path of the HTML file to scan.

    Returns:
        0 when at least one image was printed. 1 when the argument count is
        wrong (a usage line is printed to stdout) or when no image matched
        ("No images!" is written to stderr).
    """
    if len(argv) != 2:
        print('Usage: extractimgs.py <filename>')
        return 1

    # The with-block closes the file even if read() raises.
    with open(argv[1], 'r') as f:
        html = f.read()

    imgs = extractimgs(html)
    if not imgs:
        # Written straight to the stream so the line is valid on both
        # Python 2 and Python 3.
        sys.stderr.write("No images!\n")
        return 1

    for img in imgs:
        print(img[0])  # full link
        print(img[1])  # file name
        print(img[2])  # extension
    return 0
# Run main() only when executed as a script, and hand its return value to
# the interpreter as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment