perrygeo · June 2, 2020 16:21
diff --git a/.geotiff_headers.md b/.geotiff_headers.md
diff --git a/.gitignore b/.gitignore
 *.tif
 *.pyc
 _*
 .*.swp
diff --git a/example.sh b/example.sh
 # For every s3 object in the BUCKET matching the PARTIAL_KEY
 # read the geotiff tags and construct bboxes
 # then find all the unique tiles at zoom level 13

 export TMPFILE="/tmp/bboxes.txt"
 export ZOOMS="/tmp/zooms.txt"

 aws s3 ls s3://$BUCKET/$PARTIAL_KEY/ \
    | grep tif | head -n 20 | rev | cut -d " " -f1 | rev \
    | parallel python geotiff_bbox.py s3://$BUCKET/$PARTIAL_KEY/{} | tee -a $TMPFILE

 while read line           
 do           
   mercantile tiles 13 "$line"
 done < $TMPFILE | sort | uniq | tee $ZOOMS
diff --git a/geotiff_bbox.py b/geotiff_bbox.py
 #!/usr/bin/env python
 from __future__ import print_function
 import struct
 import sys
 import json

 import s3reader

 def tags_to_bbox(header_vals, to_wgs84=True):
    tiepoint = header_vals["ModelTiePointTag"]
    scalex, scaley, _ = header_vals["ModelPixelScaleTag"]
    width = header_vals["ImageWidth"]
    height = header_vals["ImageLength"]
    try:
        epsg = header_vals["epsg"]
    except KeyError:
        epsg = None

    if tiepoint[:3] == (0, 0, 0):  # corner of UL
        ulx, uly = tiepoint[3:5]
    elif tiepoint[:3] == (0.5, 0.5, 0):  # center of UL
        culx, culy = tiepoint[3:5]
        ulx = culx - (0.5 * scalex)
        uly = culy + (0.5 * scaley)
    else:
        raise Exception("not tied to UL")

    lrx = ulx + (scalex * width)
    lry = uly + (-1 * scaley * height)

    bounds = [ulx, lry, lrx, uly]

    if to_wgs84 and epsg and epsg != 4326:
        import rasterio.warp
        xs = bounds[::2]
        ys = bounds[1::2]
        xs, ys = rasterio.warp.transform({'init': 'EPSG:{}'.format(epsg)},
                                         {'init': 'EPSG:4326'},
                                         xs, ys)
        result = [0]*len(bounds)
        result[::2] = xs
        result[1::2] = ys
        return result
    else:
        return bounds



 def read_tags(path, tags):
    struct_fmt = {
        2: ('c', 1),  # char
        3: ('H', 2),  # unsigned short int
        4: ('I', 4),  # unsigned long
        12: ('d', 8),  # 8 byte double
    }

    try:
        s3reader.parse_path(path)
        # s3 URI, use s3reader
        _open = s3reader.open
    except ValueError:
        # normal file
        _open = open

    with _open(path, 'rb') as src:
        header = src.read(8)

        endian_tuple = struct.unpack('2c', header[0:2])

        if endian_tuple == ('I', 'I'):
            endian = '<'  # little
        elif endian_tuple == ('M', 'M'):
            endian = '>'
        else:
            raise Exception

        meaning_of_life = struct.unpack(endian + 'h', header[2:4])[0]
        if meaning_of_life != 42:
            raise Exception("Not a GeoTiff")

        ifd_offset = struct.unpack(endian + "i", header[4:8])[0]

        header_vals = {}

        src.seek(ifd_offset)
        nde = src.read(2)
        n_dir_entries = struct.unpack(endian + 'h', nde)[0]

        tags_bytes = src.read(12 * n_dir_entries)

        for d in range(n_dir_entries):
            start = d * 12
            end = (d + 1) * 12
            data = tags_bytes[start:end]
            tag = struct.unpack(endian + 'H', data[0:2])[0]

            tag, typ, count, val_offset = struct.unpack(endian + 'HHII', data)

            try:
                tagname = tags[tag]
            except KeyError:
                # special case
                tagname = 'GeoTIFF-34735'
                if tag != 34735:
                    continue

            fmtchar, bytesper = struct_fmt[typ]
            total_bytes = bytesper * count
            if total_bytes <= 4:
                values = val_offset  # not an offset but an actual value
            else:
                src.seek(val_offset)
                values_packed = src.read(total_bytes)
                fmt = "{}{}".format(count, fmtchar)
                values = struct.unpack(endian + fmt, values_packed)

            if fmtchar == 'c':
                values = ''.join(values)

            if tag == 34735:
                # geotiff, lets get the epsg code
                keyids = values[0::4]
                ttagloc = values[1::4]
                keyvals = values[3::4]
                try:
                    if dict(zip(keyids, ttagloc))[3072] == 0:
                        # it's a value not and offset
                        epsg = dict(zip(keyids, keyvals))[3072]
                        header_vals['epsg'] = epsg
                except KeyError:
                    continue

            header_vals[tagname] = values

    return header_vals


 def main(path):

    tags = {
        256: 'ImageWidth',
        257: 'ImageLength',
        33922: "ModelTiePointTag",
        33550: "ModelPixelScaleTag"}

    print(json.dumps(tags_to_bbox(read_tags(path, tags))))


 if __name__ == "__main__":
    path = sys.argv[1]
    main(path)
diff --git a/requirements.txt b/requirements.txt
 tifffile
 mercantile
 shapely
 boto3

diff --git a/s3reader.py b/s3reader.py
 #!/usr/bin/env python
 import boto3
 s3 = boto3.resource('s3')


 def open(path, mode='rb', *args, **kwargs):
    if mode != 'rb':
        raise ValueError("s3reader only works in binary read ('rb') mode")
    bucket, key = parse_path(path)
    return S3Reader(bucket, key)


 def parse_path(path):
    if not path.startswith("s3://"):
        raise ValueError("s3reader.open requires an s3:// URI")
    path = path.replace("s3://", "")
    parts = path.split("/")
    bucket = parts[0]
    key = "/".join(parts[1:])
    return bucket, key


 class S3Reader(object):
    def __init__(self, bucket, key):
        self.bucket = bucket
        self.key = key
        self.obj = s3.Object(self.bucket, self.key)
        self.size = self.obj.content_length
        self.pos = 0  # pointer to starting read position

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.pos = 0

    def range_string(self, start, stop):
        return "bytes={}-{}".format(start, stop)

    def read(self, nbytes=None):
        if not nbytes:
            nbytes = self.size - self.pos
        # TODO confirm that start and stop bytes are within 0 to size
        rng = self.range_string(self.pos, self.pos + nbytes - 1)
        self.pos += nbytes  # TODO wait to move pointer until read confirmed
        return self.obj.get(Range=rng)['Body'].read()

    def seek(self, offset, whence=0):
        self.pos = whence + offset


 if __name__ == "__main__":
    with open("s3://perrygeo-gdal/test1.tif") as test:
        aa = test.read(5)
        bb = test.read()
        assert len(aa + bb) == test.obj.content_length

    with open("s3://perrygeo-gdal/test1.tif") as test:
        aa = test.read(5)
        bb = test.read(5)
        test.seek(2)
        cc = test.read(8)
        assert (aa + bb)[2:] == cc
	# For every s3 object in the BUCKET matching the PARTIAL_KEY
	# read the geotiff tags and construct bboxes
	# then find all the unique tiles at zoom level 13

	export TMPFILE="/tmp/bboxes.txt"
	export ZOOMS="/tmp/zooms.txt"

	aws s3 ls s3://$BUCKET/$PARTIAL_KEY/ \
	\| grep tif \| head -n 20 \| rev \| cut -d " " -f1 \| rev \
	\| parallel python geotiff_bbox.py s3://$BUCKET/$PARTIAL_KEY/{} \| tee -a $TMPFILE

	while read line
	do
	mercantile tiles 13 "$line"
	done < $TMPFILE \| sort \| uniq \| tee $ZOOMS
	#!/usr/bin/env python
	from __future__ import print_function
	import struct
	import sys
	import json

	import s3reader

	def tags_to_bbox(header_vals, to_wgs84=True):
	tiepoint = header_vals["ModelTiePointTag"]
	scalex, scaley, _ = header_vals["ModelPixelScaleTag"]
	width = header_vals["ImageWidth"]
	height = header_vals["ImageLength"]
	try:
	epsg = header_vals["epsg"]
	except KeyError:
	epsg = None

	if tiepoint[:3] == (0, 0, 0): # corner of UL
	ulx, uly = tiepoint[3:5]
	elif tiepoint[:3] == (0.5, 0.5, 0): # center of UL
	culx, culy = tiepoint[3:5]
	ulx = culx - (0.5 * scalex)
	uly = culy + (0.5 * scaley)
	else:
	raise Exception("not tied to UL")

	lrx = ulx + (scalex * width)
	lry = uly + (-1 * scaley * height)

	bounds = [ulx, lry, lrx, uly]

	if to_wgs84 and epsg and epsg != 4326:
	import rasterio.warp
	xs = bounds[::2]
	ys = bounds[1::2]
	xs, ys = rasterio.warp.transform({'init': 'EPSG:{}'.format(epsg)},
	{'init': 'EPSG:4326'},
	xs, ys)
	result = [0]*len(bounds)
	result[::2] = xs
	result[1::2] = ys
	return result
	else:
	return bounds



	def read_tags(path, tags):
	struct_fmt = {
	2: ('c', 1), # char
	3: ('H', 2), # unsigned short int
	4: ('I', 4), # unsigned long
	12: ('d', 8), # 8 byte double
	}

	try:
	s3reader.parse_path(path)
	# s3 URI, use s3reader
	_open = s3reader.open
	except ValueError:
	# normal file
	_open = open

	with _open(path, 'rb') as src:
	header = src.read(8)

	endian_tuple = struct.unpack('2c', header[0:2])

	if endian_tuple == ('I', 'I'):
	endian = '<' # little
	elif endian_tuple == ('M', 'M'):
	endian = '>'
	else:
	raise Exception

	meaning_of_life = struct.unpack(endian + 'h', header[2:4])[0]
	if meaning_of_life != 42:
	raise Exception("Not a GeoTiff")

	ifd_offset = struct.unpack(endian + "i", header[4:8])[0]

	header_vals = {}

	src.seek(ifd_offset)
	nde = src.read(2)
	n_dir_entries = struct.unpack(endian + 'h', nde)[0]

	tags_bytes = src.read(12 * n_dir_entries)

	for d in range(n_dir_entries):
	start = d * 12
	end = (d + 1) * 12
	data = tags_bytes[start:end]
	tag = struct.unpack(endian + 'H', data[0:2])[0]

	tag, typ, count, val_offset = struct.unpack(endian + 'HHII', data)

	try:
	tagname = tags[tag]
	except KeyError:
	# special case
	tagname = 'GeoTIFF-34735'
	if tag != 34735:
	continue

	fmtchar, bytesper = struct_fmt[typ]
	total_bytes = bytesper * count
	if total_bytes <= 4:
	values = val_offset # not an offset but an actual value
	else:
	src.seek(val_offset)
	values_packed = src.read(total_bytes)
	fmt = "{}{}".format(count, fmtchar)
	values = struct.unpack(endian + fmt, values_packed)

	if fmtchar == 'c':
	values = ''.join(values)

	if tag == 34735:
	# geotiff, lets get the epsg code
	keyids = values[0::4]
	ttagloc = values[1::4]
	keyvals = values[3::4]
	try:
	if dict(zip(keyids, ttagloc))[3072] == 0:
	# it's a value not and offset
	epsg = dict(zip(keyids, keyvals))[3072]
	header_vals['epsg'] = epsg
	except KeyError:
	continue

	header_vals[tagname] = values

	return header_vals


	def main(path):

	tags = {
	256: 'ImageWidth',
	257: 'ImageLength',
	33922: "ModelTiePointTag",
	33550: "ModelPixelScaleTag"}

	print(json.dumps(tags_to_bbox(read_tags(path, tags))))


	if __name__ == "__main__":
	path = sys.argv[1]
	main(path)
	#!/usr/bin/env python
	import boto3
	s3 = boto3.resource('s3')


	def open(path, mode='rb', args, *kwargs):
	if mode != 'rb':
	raise ValueError("s3reader only works in binary read ('rb') mode")
	bucket, key = parse_path(path)
	return S3Reader(bucket, key)


	def parse_path(path):
	if not path.startswith("s3://"):
	raise ValueError("s3reader.open requires an s3:// URI")
	path = path.replace("s3://", "")
	parts = path.split("/")
	bucket = parts[0]
	key = "/".join(parts[1:])
	return bucket, key


	class S3Reader(object):
	def __init__(self, bucket, key):
	self.bucket = bucket
	self.key = key
	self.obj = s3.Object(self.bucket, self.key)
	self.size = self.obj.content_length
	self.pos = 0 # pointer to starting read position

	def __enter__(self):
	return self

	def __exit__(self, type, value, traceback):
	self.pos = 0

	def range_string(self, start, stop):
	return "bytes={}-{}".format(start, stop)

	def read(self, nbytes=None):
	if not nbytes:
	nbytes = self.size - self.pos
	# TODO confirm that start and stop bytes are within 0 to size
	rng = self.range_string(self.pos, self.pos + nbytes - 1)
	self.pos += nbytes # TODO wait to move pointer until read confirmed
	return self.obj.get(Range=rng)['Body'].read()

	def seek(self, offset, whence=0):
	self.pos = whence + offset


	if __name__ == "__main__":
	with open("s3://perrygeo-gdal/test1.tif") as test:
	aa = test.read(5)
	bb = test.read()
	assert len(aa + bb) == test.obj.content_length

	with open("s3://perrygeo-gdal/test1.tif") as test:
	aa = test.read(5)
	bb = test.read(5)
	test.seek(2)
	cc = test.read(8)
	assert (aa + bb)[2:] == cc