Last active
January 28, 2016 06:38
-
-
Save liuderchi/6ebf5066677d5ce89cb3 to your computer and use it in GitHub Desktop.
Crawl sticker images from the LINE Store and download them, using beautifulsoup4.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Web Crawler for line stickers
# USAGE:
# $ python line_sticker_printer.py <url>
# example url:
# https://store.line.me/stickershop/home/user/zh-Hant
# https://store.line.me/stickershop/home/general/zh-Hant
#
# credit:
# jminh@github and hour of code
#
# Reference:
# main: http://pycontw.blogspot.tw/2015/12/hour-of-code-90.html
# code: https://github.com/jminh/hour_of_code_python_2015
# hackpad notes: https://hocpython.hackpad.com/Hour-of-Code-Python--oQL8j5m00dp
#
# Revised by D. Liu
from bs4 import BeautifulSoup as BS | |
import os | |
import requests | |
import sys | |
def download_file(url, dir='.'):
    """Stream the resource at *url* into *dir* and return the saved file name.

    The local file name is the third-from-last '/'-separated segment of the
    URL. For a sticker thumbnail such as
    ``http://.../products/0/0/1/1239040/LINEStorePC/thumbnail_shop.png``
    that segment is ``1239040``.

    Args:
        url: Address of the file to fetch.
        dir: Directory to write into; defaults to the current directory.
            (The name shadows the ``dir`` builtin but is kept because
            callers may pass it by keyword.)

    Returns:
        The bare file name (not the joined path) that was written.

    Raises:
        IndexError: If *url* has fewer than three '/'-separated segments.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    local_filename = url.split('/')[-3]
    target_path = os.path.join(dir, local_filename)
    # stream=True lets the body be read chunk by chunk; the with-block
    # closes the response once the body has been consumed or an error occurs.
    with requests.get(url, stream=True) as r:
        # Fail before opening the file so an HTTP error page is never
        # saved under an image's name.
        r.raise_for_status()
        with open(target_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename
def main():
    """Download every sticker thumbnail found on the page at ``sys.argv[1]``.

    The page is fetched and parsed with the ``html.parser`` backend. The
    ``src`` of the ``<img>`` inside each ``<div class="mdCMN05Img">`` is
    collected, and each image is saved via ``download_file`` into a folder
    under the current working directory. The folder is named after the part
    of the page ``<title>`` that precedes the first '-'.

    Raises:
        requests.HTTPError: If the page request answers with a 4xx/5xx status.
    """
    # make request and get response
    res = requests.get(sys.argv[1])
    res.raise_for_status()

    # parse txt content of response
    # NOTE check response_snippet
    soup = BS(res.text, 'html.parser')

    # NOTE this way is customized to get div tags
    divs = soup.find_all('div', 'mdCMN05Img')

    # get image link from src attributes; divs without an <img>, or whose
    # <img> has no (or an empty) src, are skipped instead of raising
    stickers = [
        div.img['src']
        for div in divs
        if div.img is not None and div.img.get('src')
    ]

    # make folder named as webpage title; fall back to a fixed name when the
    # page has no <title> or the part before the first '-' is blank
    page_title = soup.title.text if soup.title is not None else ''
    title = page_title.split('-')[0].strip() or 'line_stickers'
    download_dir = os.path.join(os.getcwd(), title)
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(download_dir, exist_ok=True)

    # download image
    for url in stickers:
        print(download_file(url, download_dir), "downloaded!")
if __name__ == '__main__':
    # The URL argument is required. A missing one is a usage error, so the
    # message goes to stderr and the process exits with a non-zero status
    # that shells and calling scripts can detect.
    if len(sys.argv) < 2:
        print("Usage: line_sticker_printer.py [URL]", file=sys.stderr)
        sys.exit(1)
    main()
# Sample of the store-page markup that main() scrapes, kept for reference
# only (nothing in this file reads it). Each sticker is an <li> whose
# <div class="mdCMN05Img"> wraps the thumbnail <img>.
response_snippet = u"""
<div class="MdCMN02List">
<ul class="mdCMN02Ul">
<li class="mdCMN02Li">
<a href="/stickershop/product/1236945/zh-Hant" data-gcl="sticker|click|sticker_item_1236945">
<div class="MdCMN05Item mdCMN05Sticker">
<div class="mdCMN05Img">
<img src="https://sdl-stickershop.line.naver.jp/products/0/0/1/1236945/LINEStorePC/thumbnail_shop.png" height="120" width="120">
</div>
<p class="mdCMN05Ttl">title of the stickers</p>
</div></a>
</li>
<li class="mdCMN02Li">
<a href="/stickershop/product/1238025/zh-Hant" data-gcl="sticker|click|sticker_item_1238025">
<div class="MdCMN05Item mdCMN05Sticker">
<div class="mdCMN05Img">
<img src="https://sdl-stickershop.line.naver.jp/products/0/0/1/1238025/LINEStorePC/thumbnail_shop.png" height="120" width="120">
</div>
<p class="mdCMN05Ttl">title of the stickers</p>
</div></a>
</li>
<!--..More li tags here...-->
</ul>
</div>
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment