Skip to content

Instantly share code, notes, and snippets.

@seungwonpark
Last active September 8, 2018 06:27
Show Gist options
  • Select an option

  • Save seungwonpark/7997b6a676b04a7daab8514e6833f6ed to your computer and use it in GitHub Desktop.

Select an option

Save seungwonpark/7997b6a676b04a7daab8514e6833f6ed to your computer and use it in GitHub Desktop.
효자동사진관 사진 크롤러
# open.pss.go.kr image crawler
# n=150: 2017-12-28, Conversation with future scientists
import requests
import re
import os
baseurl = 'http://open.pss.go.kr'

# Seconds to wait for the server before giving up on a single request;
# without a timeout a stalled connection would hang the crawl forever.
REQUEST_TIMEOUT = 30

# Compiled once because both patterns are applied to every event page.
# [^"]* stops at the attribute's closing quote, so any further quoted text on
# the same line is not swallowed into the key.
FILEKEY_RE = re.compile(r'filekey=([^"]*)"')
DATE_RE = re.compile(r'"fa fa-calendar" aria-hidden="true"></i> (.*)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;')

# Walk every event number from 0 to 150. For each event page: collect the
# image file keys, read the event date, and download every image into a
# folder named after that date.
for n in range(151):
    try:
        req = requests.get(baseurl + '/picture/view/?no=%d' % n, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('n = %d: Request failed (%s).' % (n, err))
        print('')
        continue
    html = req.text
    if len(html) < 200:  # a very short page is treated as "no such event"
        print('n = %d: No such event was found.' % n)
        print('')
        continue

    keys = FILEKEY_RE.findall(html)
    print('n = %d: Total %d images found.' % (n, len(keys)))

    # The date is expected as "YYYY-MM-DD"; a missing or malformed date skips
    # this event instead of aborting the whole crawl.
    date_match = DATE_RE.search(html)
    date_parts = date_match.group(1).split('-') if date_match else []
    try:
        yy, mm, dd = map(int, date_parts)
    except ValueError:
        print('n = %d: Could not read the event date; skipping.' % n)
        print('')
        continue

    foldername = '%04d%02d%02d-pss' % (yy, mm, dd)
    if not os.path.exists(foldername):
        os.makedirs(foldername)

    total = len(keys)
    for i, key in enumerate(keys):
        local_filename = '%s/%04d%02d%02d_pss_%03d.jpeg' % (foldername, yy, mm, dd, i)
        url = baseurl + '/download/?filekey=' + key
        # Streamed download, refer to https://stackoverflow.com/a/16696317
        try:
            # The with-block releases the connection even if writing fails.
            with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
                # Check the status before opening the file so that an HTTP
                # error page is never saved under a .jpeg name.
                r.raise_for_status()
                with open(local_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)
        except requests.RequestException as err:
            print('%03d/%03d failed: %s' % (i + 1, total, err))
            continue
        print('%03d/%03d (%.2lf%%)' % (i + 1, total, (i + 1) / total * 100))
    print('Done')
    print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment