Last active
December 30, 2016 23:19
-
-
Save okiriza/858fbcfaba8b154fb7c12a9fad45a262 to your computer and use it in GitHub Desktop.
Script for downloading screenshots of Transjakarta CCTV (https://tentangdata.wordpress.com/2016/12/10/managed-database-semurah-tahu-tempe-dengan-amazon-dynamodb/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import os | |
import time | |
import boto3 | |
from boto3 import dynamodb | |
import cv2 | |
import numpy as np | |
import scipy.misc | |
# Key helpers used to build query conditions on the table's two key attributes.
LOC_COL = dynamodb.conditions.Key('loc')
TIMESTAMP_COL = dynamodb.conditions.Key('timestamp')

# Handle on the DynamoDB service and on the table holding the CCTV screenshots.
DB = boto3.resource(
    'dynamodb',
    region_name='ap-southeast-1',
)
TABLE = DB.Table('cctv-jak')
def download_images(loc, min_ts=None, max_ts=None, item_limit=None,
                    out_path=None, sleep=None, verbose=0, raise_img_error=False):
    """Query every stored image for one location and optionally save them to disk.

    Args:
        loc: value of the ``loc`` key to query.
        min_ts: lower bound for ``timestamp``; a falsy value means no lower
            bound. The special value ``'checkpoint'`` resumes after the newest
            image already saved under ``<out_path>/<loc>``.
        max_ts: upper bound for ``timestamp``; a falsy value means no upper bound.
        item_limit: stop fetching further pages once at least this many items
            have been collected. The check runs after each page, so the result
            may hold more than ``item_limit`` items.
        out_path: when set, every fetched image is written below this directory.
        sleep: seconds to wait between two consecutive page requests.
        verbose: 0 is silent, >= 1 prints a summary, >= 2 also prints per-page
            progress.
        raise_img_error: forwarded to ``_save_img``.

    Returns:
        The list of fetched items; each item additionally gets a ``'loc'`` entry.

    Raises:
        ValueError: if ``min_ts`` is ``'checkpoint'`` but ``out_path`` is not set.
    """
    if verbose >= 1:
        # No newline here: _get_time_cond finishes this line with the range.
        print('Downloading images for loc {}'.format(loc), end='')

    if min_ts == 'checkpoint':
        if not out_path:
            raise ValueError("min_ts='checkpoint' requires out_path to be set")
        min_ts = _get_latest_image_time(os.path.join(out_path, loc))
        if min_ts:
            min_ts += ' '  # add space to sort key to avoid re-downloading last item

    key_cond = LOC_COL.eq(loc)
    time_cond = _get_time_cond(min_ts, max_ts, verbose=verbose)
    if time_cond is not None:
        key_cond = dynamodb.conditions.And(key_cond, time_cond)

    query_kwargs = {
        'ProjectionExpression': 'img_bytes, #time',
        # 'timestamp' is aliased through an expression attribute name.
        'ExpressionAttributeNames': {'#time': 'timestamp'},
        'KeyConditionExpression': key_cond,
    }

    results = []
    while True:
        response = TABLE.query(**query_kwargs)
        items = response['Items']
        results.extend(items)

        if (verbose >= 2) and items:
            print(' Downloaded {} images, ending at {}'.format(len(items), items[-1]['timestamp']))

        for item in items:
            item['loc'] = loc
            if out_path:
                _save_img(item['img_bytes'], loc, item['timestamp'], out_path,
                          raise_img_error=raise_img_error)

        if item_limit and (len(results) >= item_limit):
            if verbose >= 1:
                print('Exceeded downloaded images limit of {}, exiting...'.format(item_limit))
            break

        # A missing LastEvaluatedKey means this was the last page.
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        # Resume right after the last evaluated item on the next request.
        query_kwargs['ExclusiveStartKey'] = last_key

        if sleep:
            time.sleep(sleep)

    if verbose >= 1:
        print('Finished downloading, total images: {}'.format(len(results)))
    return results
def _get_latest_image_time(dir_path):
    """Return the newest timestamp among the ``.jpg`` files in *dir_path*.

    File names are expected to look like ``<loc>_<timestamp>.jpg`` where every
    ':' of the timestamp was written as '.'; the returned value has the ':'
    restored. ``.jpg`` files without an underscore are ignored.

    Args:
        dir_path: directory to scan.

    Returns:
        The greatest timestamp string found, or None when the directory does
        not exist or holds no matching file.
    """
    if not os.path.isdir(dir_path):
        return None

    stamps = []
    for fname in os.listdir(dir_path):
        if not fname.endswith('.jpg'):
            continue
        # Split on the LAST underscore so a location name that itself contains
        # underscores does not get mistaken for the timestamp.
        # NOTE(review): assumes timestamps never contain '_' -- confirm.
        _, sep, stamp = fname[:-4].rpartition('_')
        if sep:
            stamps.append(stamp)

    if not stamps:
        return None
    return max(stamps).replace('.', ':')
def _get_time_cond(min_ts, max_ts, verbose=0):
    """Build the key condition restricting ``timestamp`` to the given range.

    Args:
        min_ts: lower bound, or a falsy value for no lower bound.
        max_ts: upper bound, or a falsy value for no upper bound.
        verbose: when >= 1, print the chosen range followed by a newline
            (an empty line when there is no bound). Nothing is printed
            otherwise.

    Returns:
        A condition built from ``TIMESTAMP_COL`` (``between``, ``gte`` or
        ``lte``), or None when neither bound is given.
    """
    if min_ts and max_ts:
        time_cond = TIMESTAMP_COL.between(min_ts, max_ts)
        range_msg = ' from {} to {}'.format(min_ts, max_ts)
    elif min_ts:
        time_cond = TIMESTAMP_COL.gte(min_ts)
        range_msg = ' from {}'.format(min_ts)
    elif max_ts:
        time_cond = TIMESTAMP_COL.lte(max_ts)
        range_msg = ' up to {}'.format(max_ts)
    else:
        time_cond = None
        range_msg = ''

    # Only talk when asked to: the unbounded case used to print a blank line
    # even with verbose=0.
    if verbose >= 1:
        print(range_msg)
    return time_cond
def _save_img(img_bytes, loc, timestamp, out_path, raise_img_error=False, verbose=1):
    """Decode one image and write it to ``<out_path>/<loc>/<loc>_<timestamp>.jpg``.

    Every ':' in the file name is replaced by '.'. The location directory is
    created when missing.

    Args:
        img_bytes: image data as a text string; it is encoded with ISO-8859-1
            to recover the raw bytes before decoding.
        loc: location name, used for the sub-directory and the file name.
        timestamp: timestamp of the image, used for the file name.
        out_path: root output directory.
        raise_img_error: when true, re-raise a ValueError hit while handling
            the image instead of skipping the image.
        verbose: when >= 1 (the default), print a message for a skipped image.

    Raises:
        ValueError: only when ``raise_img_error`` is true and the image could
            not be encoded, decoded or saved.
    """
    loc_path = os.path.join(out_path, loc)
    if not os.path.isdir(loc_path):
        os.makedirs(loc_path)

    fname = '{}_{}.jpg'.format(loc, timestamp).replace(':', '.')
    fpath = os.path.join(loc_path, fname)

    try:
        raw = img_bytes.encode('ISO-8859-1')
        img_array = cv2.imdecode(np.frombuffer(raw, dtype=np.uint8), cv2.CV_LOAD_IMAGE_COLOR)
        if img_array is None:
            # imdecode signals undecodable data by returning None, not by raising.
            raise ValueError('could not decode image data')
        # NOTE(review): cv2 decodes to BGR channel order -- confirm the saved
        # colours are as intended.
        scipy.misc.imsave(fpath, img_array)
    except ValueError:
        # Something wrong when handling the image
        if raise_img_error:
            raise
        if verbose >= 1:
            print('Failed to save image for {} {}'.format(loc, timestamp))
if __name__ == '__main__':
    # Example run: fetch the screenshots of one CCTV location into data/cctv.
    loc = 'Pondok Indah 1 S'
    out_path = 'data/cctv'

    # 'checkpoint' continues after the newest image already saved on disk;
    # no upper bound on the timestamp.
    min_ts = 'checkpoint'
    max_ts = None

    # Print per-page progress and pause one second between page requests.
    verbose = 2
    sleep = 1

    download_images(
        loc,
        min_ts=min_ts,
        max_ts=max_ts,
        out_path=out_path,
        sleep=sleep,
        verbose=verbose,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment