wbuchwalter · February 28, 2020 23:08
diff --git a/custom_datasets.py b/custom_datasets.py
 # CALL WITH:
 # Driver script: reads two positional integers from the command line and
 # forwards them to test_new_stuff.
 import argparse
 from shared_code.custom_datasets import test_new_stuff

 parser = argparse.ArgumentParser()
 parser.add_argument('i', type=int, help='GPU Index')
 # ArgumentParser only provides add_argument (singular); the previous
 # add_arguments call raised AttributeError before any parsing happened.
 parser.add_argument('s', type=int, help='dataset size')

 args = parser.parse_args()

 test_new_stuff(args.i, args.s)

 #-----------------------------

 import pdb
 import os
 import os.path
 import re
 import sys
 import json
 import time
 import copy
 import itertools
 import pickle
 import numpy as np
 import torch

 from PIL import Image
 from collections import defaultdict
 from torchvision.datasets.vision import VisionDataset
 from torch._six import container_abcs, string_classes, int_classes

 from .detectron_models import load_predictor_coco, load_predictor_vg, \
    dummy_aux_data_dict, extract_raw_features, process_raw_features, \
    update_box_features


 def load_image(img_path, img_format='cv2'):
    '''
    Load an image from img_path and return either cv2 or PIL-style image.

    Input:
      img_path   : path of the image file to open
      img_format : 'PIL' to get an RGB PIL image, or 'cv2' to get a numpy
                   array whose last axis is reversed relative to the RGB image
    Output:
      raw_img    : the loaded image in the requested format
    Raises:
      AssertionError if img_format is neither 'cv2' nor 'PIL'.
    '''
    assert (img_format in ['cv2', 'PIL']), 'img_format must be cv2 or PIL.'
    # convert() hands back a separate image object, so the handle opened by
    # Image.open can be released as soon as the conversion is done instead
    # of lingering until garbage collection.
    with Image.open(img_path) as src_img:
        raw_img = src_img.convert('RGB')
    if img_format == 'cv2':
        raw_img = np.array(raw_img)
        # reverse the channel axis and copy so the result owns its own memory
        raw_img = raw_img[:, :, ::-1].copy()
    return raw_img


 class CustomCocoCaptions(VisionDataset):
    """`MS Coco Captions <http://mscoco.org/dataset/#captions-challenge2015>`_ Dataset.

    -- This custom version adds functionality for loading dicts of auxiliary data.

    Args:
      root      : directory holding the source images
      root_aux  : directory holding the auxiliary data dicts (in .pkl form)
      annFile   : path to JSON annotations for linking images/captions/aux data
      transform : transforms for augmenting/etc source images for the model
    """
    def __init__(self, root, root_aux, annFile, transform=None):
        super(CustomCocoCaptions, self).__init__(root, None, transform, None)
        self.root_img = root
        self.root_aux = root_aux  # directory holding per-image auxiliary pkl files
        self.coco = CustomCoCo(annFile)
        # sorted image ids give a stable dataset-index -> coco-id mapping
        self.ids = sorted(self.coco.imgs.keys())
        print('Loading CoCo dataset with {} images...'.format(len(self.coco.imgs)))

    def __getitem__(self, index):
        """
        Args:
            index (int)   : index of item to get (in pytorch frame of reference)
        Returns:
            tuple (img, target, aux_dict) :
                img      -- the image after self.transforms, or the raw PIL
                            image when no transforms are configured
                target   -- list of caption strings found for this image
                aux_dict -- dict unpickled from the image's aux file, or {}
                            when the image has no 'aux_file' annotation
        """
        coco = self.coco
        # get coco index for the image with this pytorch/dataset index
        img_id = self.ids[index]
        # get the annotations associated with this image
        anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        # split the annotations into captions and (at most one) aux file;
        # if several annotations carry 'aux_file', the last one wins
        target = []
        aux_file = None
        for ann in anns:
            if 'caption' in ann:
                target.append(ann['caption'])
            elif 'aux_file' in ann:
                aux_file = ann['aux_file']

        # load image file using path from the image's info dict
        img_info = coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.root_img, img_info['file_name'])
        raw_img = load_image(img_path, img_format='PIL')

        # load auxiliary info file if available -- it should just be a dict!
        aux_dict = {}
        if aux_file is not None:
            aux_path = os.path.join(self.root_aux, aux_file)
            # NOTE: pickle.load can execute arbitrary code contained in the
            # file; aux files must come from a trusted source.
            with open(aux_path, 'rb') as aux_handle:
                aux_dict = pickle.load(aux_handle)

        # Start from the untransformed image so `img` is always bound; the
        # previous code raised UnboundLocalError when self.transforms was None.
        img = raw_img
        if self.transforms is not None:
            img, target = self.transforms(raw_img, target)
        return img, target, aux_dict

    def __len__(self):
        return len(self.ids)


 # dtype.str codes that mark string/object numpy arrays, which cannot be
 # turned into tensors
 np_str_obj_array_pattern = re.compile(r'[SaUO]')


 # message template for unsupported batch element types
 coco_collate_err_msg_format = (
    "coco_collate: batch must contain tensors, numpy arrays, numbers, "
    "dicts or lists; found {}")


 def coco_collate(batch):
    r"""Puts each data field into a tensor with outer dimension batch size.

    Dispatches on the type of the first element of ``batch``:
      * tensors are stacked along a new leading dimension
      * numpy arrays are converted to tensors and collated recursively
      * numpy scalars, floats and ints become a single tensor
      * strings are returned as the unmodified batch
      * mappings are collated key by key, (named)tuples and sequences
        field by field
    Raises TypeError for any other element type, and for numpy arrays with
    a string/object dtype.
    """
    first = batch[0]
    first_type = type(first)

    if isinstance(first, torch.Tensor):
        shared_out = None
        if torch.utils.data.get_worker_info() is not None:
            # inside a data-loader worker: stack straight into shared
            # memory so the result does not need an extra copy
            total = sum(item.numel() for item in batch)
            shared_storage = first.storage()._new_shared(total)
            shared_out = first.new(shared_storage)
        return torch.stack(batch, 0, out=shared_out)

    from_numpy = first_type.__module__ == 'numpy'
    if from_numpy and first_type.__name__ not in ('str_', 'string_'):
        if first_type.__name__ == 'ndarray':
            # arrays of strings or objects cannot be collated
            if np_str_obj_array_pattern.search(first.dtype.str) is not None:
                raise TypeError(coco_collate_err_msg_format.format(first.dtype))
            return coco_collate([torch.as_tensor(arr) for arr in batch])
        if first.shape == ():  # numpy scalars
            return torch.as_tensor(batch)
        # any other numpy object is unsupported
        raise TypeError(coco_collate_err_msg_format.format(first_type))

    if isinstance(first, float):
        return torch.tensor(batch, dtype=torch.float64)
    if isinstance(first, int_classes):
        return torch.tensor(batch)
    if isinstance(first, string_classes):
        return batch
    if isinstance(first, container_abcs.Mapping):
        # mappings are collated per key here (returning the raw batch
        # instead was left disabled)
        return {key: coco_collate([sample[key] for sample in batch])
                for key in first}
    if isinstance(first, tuple) and hasattr(first, '_fields'):  # namedtuple
        return first_type(*(coco_collate(field) for field in zip(*batch)))
    if isinstance(first, container_abcs.Sequence):
        return [coco_collate(field) for field in zip(*batch)]

    raise TypeError(coco_collate_err_msg_format.format(first_type))


 #
 # HELPER STUFF FOR WORKING WITH COCO-STYLE ANNOTATED IMAGE DATASETS
 #


 def _isArrayLike(obj):
    return hasattr(obj, '__iter__') and hasattr(obj, '__len__')


 class CustomCoCo:
    def __init__(self, annotation_file):
        """
        Constructor of Microsoft CustomCoCo helper class for reading annotations.
        :param annotation_file (str): location of annotation file
        :return:
        """
        self.dataset, self.anns, self.cats, self.imgs = \
            dict(), dict(), dict(), dict()
        self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
        # load dataset
        print('Loading annotations into memory...')
        tic = time.time()
        # the context manager closes the file even if the JSON is malformed
        with open(annotation_file, 'r') as json_file:
            dataset = json.load(json_file)
        assert isinstance(dataset, dict), \
            'annotation file format {} not supported'.format(type(dataset))
        self.dataset = dataset
        self.buildIndex()
        print('Done (t={:0.2f}s)'.format(time.time() - tic))

    def buildIndex(self):
        '''
        Rebuild every lookup table from the current self.dataset.
        Sections missing from the dataset simply yield empty tables.
        '''
        print('creating index...')
        anns, cats, imgs = {}, {}, {}
        imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
        if 'annotations' in self.dataset:
            for ann in self.dataset['annotations']:
                imgToAnns[ann['image_id']].append(ann)
                anns[ann['id']] = ann
        if 'images' in self.dataset:
            for img in self.dataset['images']:
                imgs[img['id']] = img
        if 'categories' in self.dataset:
            for cat in self.dataset['categories']:
                cats[cat['id']] = cat
        if 'annotations' in self.dataset and 'categories' in self.dataset:
            for ann in self.dataset['annotations']:
                catToImgs[ann['category_id']].append(ann['image_id'])
        print('index created!')
        # create class members
        self.anns = anns            # map from ann ids to anns
        self.imgToAnns = imgToAnns  # map from img ids to their anns
        self.catToImgs = catToImgs  # map from cat ids to their imgs
        self.imgs = imgs            # map from img ids to imgs
        self.cats = cats            # map from cat ids to cats

    def saveDataset(self, output_file):
        '''
        Save the current set of coco-style annotations to some file (JSON).
        '''
        self.buildIndex()  # rebuild index to make sure it's up-to-date
        print('Writing dataset JSON to: {0:s}'.format(output_file))
        # serialize before opening, so a serialization error cannot leave
        # a truncated output file behind
        payload = json.dumps(self.dataset)
        with open(output_file, "w+") as json_file:
            json_file.write(payload)

    def addAnn(self, ann_dict):
        '''
        Add a dict-style annotation to the dataset.
        -- check if this annotation is already in dataset before adding it
        -- an existing annotation of the same image counts as a match when
           it holds every key of ann_dict with an equal value
        -- ann_dict must provide 'image_id' and 'id'
        '''
        img_id = ann_dict['image_id']  # image this annotation belongs to
        for ann in self.imgToAnns[img_id]:
            if all((k in ann) and (ann_dict[k] == ann[k]) for k in ann_dict):
                # already present -- nothing to add
                return
        # a dataset loaded without an 'annotations' section gets one here,
        # rather than failing with KeyError
        self.dataset.setdefault('annotations', []).append(ann_dict)
        self.imgToAnns[img_id].append(ann_dict)
        self.anns[ann_dict['id']] = ann_dict

    def info(self):
        """
        Print information about the annotation file.
        Prints nothing when the dataset has no 'info' section.
        :return:
        """
        for key, value in self.dataset.get('info', {}).items():
            print('{}: {}'.format(key, value))

    def getAllIds(self):
        '''
        Get the set of all ids used by any image/annotation/category.
        '''
        return set(self.imgs) | set(self.anns) | set(self.cats)

    def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
        """
        Get ann ids that satisfy given filter conditions. default skips that filter
        :param imgIds  (int array)     : get anns for given imgs
               catIds  (int array)     : get anns for given cats
               areaRng (float array)   : get anns for given area range (e.g. [0 inf])
               iscrowd (boolean)       : get anns for given crowd label (False or True)
        :return: ids (int array)       : integer array of ann ids
        """
        imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        # start from the annotations of the requested images (unknown image
        # ids are ignored), or from all annotations when no image is given
        if len(imgIds) != 0:
            lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
            anns = list(itertools.chain.from_iterable(lists))
        else:
            anns = self.dataset['annotations']
        if len(catIds) != 0:
            anns = [ann for ann in anns if ann['category_id'] in catIds]
        if len(areaRng) != 0:
            # the area range is exclusive at both ends
            anns = [ann for ann in anns
                    if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
        if iscrowd is not None:
            return [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
        return [ann['id'] for ann in anns]

    def getCatIds(self, catNms=[], supNms=[], catIds=[]):
        """
        filtering parameters. default skips that filter.
        :param catNms (str array)  : get cats for given cat names
        :param supNms (str array)  : get cats for given supercategory names
        :param catIds (int array)  : get cats for given cat ids
        :return: ids (int array)   : integer array of cat ids
        """
        catNms = catNms if _isArrayLike(catNms) else [catNms]
        supNms = supNms if _isArrayLike(supNms) else [supNms]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        cats = self.dataset['categories']
        if len(catNms) != 0:
            cats = [cat for cat in cats if cat['name'] in catNms]
        if len(supNms) != 0:
            cats = [cat for cat in cats if cat['supercategory'] in supNms]
        if len(catIds) != 0:
            cats = [cat for cat in cats if cat['id'] in catIds]
        return [cat['id'] for cat in cats]

    def getImgIds(self, imgIds=[], catIds=[]):
        '''
        Get img ids that satisfy given filter conditions.
        :param imgIds (int array) : get imgs for given ids
        :param catIds (int array) : get imgs with all given cats
        :return: ids (int array)  : integer array of img ids
        '''
        imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(imgIds) == len(catIds) == 0:
            ids = self.imgs.keys()
        else:
            ids = set(imgIds)
            for i, catId in enumerate(catIds):
                if i == 0 and len(ids) == 0:
                    # no explicit image ids: seed with the first category
                    ids = set(self.catToImgs[catId])
                else:
                    # keep only the images that also have this category
                    ids &= set(self.catToImgs[catId])
        return list(ids)

    def loadAnns(self, ids=[]):
        """
        Load anns with the specified ids.
        :param ids (int array)       : integer ids specifying anns
        :return: anns (object array) : loaded ann objects
                 (None when ids is neither array-like nor an int)
        """
        if _isArrayLike(ids):
            return [self.anns[id] for id in ids]
        elif type(ids) == int:
            return [self.anns[ids]]

    def loadCats(self, ids=[]):
        """
        Load cats with the specified ids.
        :param ids (int array)       : integer ids specifying cats
        :return: cats (object array) : loaded cat objects
                 (None when ids is neither array-like nor an int)
        """
        if _isArrayLike(ids):
            return [self.cats[id] for id in ids]
        elif type(ids) == int:
            return [self.cats[ids]]

    def loadImgs(self, ids=[]):
        """
        Load imgs with the specified ids.
        :param ids (int array)       : integer ids specifying img
        :return: imgs (object array) : loaded img objects
                 (None when ids is neither array-like nor an int)
        """
        if _isArrayLike(ids):
            return [self.imgs[id] for id in ids]
        elif type(ids) == int:
            return [self.imgs[ids]]


 class CoCoAuxData:
    # number of images accumulated before their auxiliary data is computed
    # and written to disk in one go
    _BATCH_SIZE = 5

    def __init__(self, img_dir):
        '''
        Class for scanning over a CoCo-style dataset and adding some auxiliary
        information for each of the underlying images.

        Input:
          img_dir         : directory where underlying images are stored
        '''
        self.img_dir = img_dir

    def new_aux_data(self, start_idx, num_samples, predictor, ann_in_file, ann_out_file, aux_out_dir,
                     num_boxes=50, img_extension='.jpg', real_aux_data=False):
        '''
        Compute totally new auxiliary data for the underlying CoCo dataset.

        Only image ids in range(start_idx, start_idx + num_samples) are
        visited. An id is skipped when it is not in the dataset or when
        '<id, zero-padded to 7 digits>.pkl' already exists in aux_out_dir.
        Images that fail to load are reported and skipped.

        Input:
          start_idx     : first image id to process
          num_samples   : number of consecutive image ids to scan
          predictor     : detectron2-style model to use for boxes and features
          ann_in_file   : input file for CoCo JSON with source annotations
          ann_out_file  : output file for CoCo JSON with new annotations
          aux_out_dir   : directory to write the new auxiliary data to
          num_boxes     : number of boxes to detect for each image
          img_extension : file extension for the source images
          real_aux_data : whether to compute real aux data or dummy data
        Output:
          -- writes one .pkl file per processed image into aux_out_dir
        Raises:
          AssertionError if an image file name does not end in img_extension.
        '''
        # load the source coco annotations/dataset
        self.coco = CustomCoCo(ann_in_file)
        # exist_ok avoids the check-then-create race when several chunk
        # workers start at the same time
        os.makedirs(aux_out_dir, exist_ok=True)
        # TODO: improve method for assigning unique IDs -- aux annotations
        # currently reuse the image id as their annotation id
        tic = time.time()
        print('Adding auxiliary data to {0:d} images:'.format(len(self.coco.imgs)))
        raw_images, aux_anns = [], []
        for i in range(start_idx, start_idx + num_samples):
            if i not in self.coco.imgs:
                continue
            # skip images whose aux file was already written; look in the
            # directory we write to rather than a hard-coded location
            if os.path.isfile(os.path.join(aux_out_dir, '{:07.0f}.pkl'.format(i))):
                continue

            img = self.coco.imgs[i]
            img_name = img['file_name']
            if not img_name.endswith(img_extension):
                # explicit raise: unlike `assert False`, not stripped by -O
                raise AssertionError('Invalid image file extension!')
            # swap only the trailing extension (str.replace would also
            # rewrite earlier occurrences inside the name)
            aux_name = img_name[:len(img_name) - len(img_extension)] + '.pkl'
            # load the image from disk; unreadable images are best-effort
            # skipped so one bad file does not abort the whole run
            img_path = os.path.join(self.img_dir, img_name)
            try:
                img_data = load_image(img_path, img_format='cv2')
            except Exception as e:
                print(e)
                continue
            raw_images.append(img_data)
            # make an annotation json dict for this image's aux data
            aux_anns.append({'id': img['id'],
                             'image_id': img['id'],
                             'aux_file': aux_name})
            if len(raw_images) == self._BATCH_SIZE:
                # compute and dump auxiliary data for a full batch
                aux_data_dicts = \
                    self._compute_aux_data(raw_images, predictor, num_boxes, real_aux_data)
                self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
                raw_images, aux_anns = [], []
            if ((i + 1) % 100) == 0:
                # progress indicator
                img_per_sec = 100. / (time.time() - tic)
                print('-- {0:7d} images completed, {1:.2f} img/sec...'.format((i + 1), img_per_sec))
                tic = time.time()
        if raw_images:
            # compute and dump auxiliary data for the final partial batch
            aux_data_dicts = \
                self._compute_aux_data(raw_images, predictor, num_boxes, real_aux_data)
            self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
        # NOTE(review): saving the updated JSON is disabled here, yet the
        # message below is still printed -- confirm this is intended.
        #self.coco.saveDataset(ann_out_file)
        print('Writing updated dataset to JSON: {0:s}, Good Job!'.format(ann_out_file))
        return

    def new_aux_ftrs(self, encoder, ann_in_file, ann_out_file,
                     aux_in_dir, aux_out_dir, real_aux_data=False):
        '''
        Given a CoCo dataset with existing bbox annotations, compute new
        auxiliary features for each bbox using resized crops from the raw images
        as input to an encoder (rather than box features from the MaskRCNN).
        -- we want to decouple box quality/cost from feature quality/cost

        Input:
          encoder       : encoder to turn cropped+resized regions into features
          ann_in_file   : input file for CoCo JSON with new annotations
          ann_out_file  : output file for CoCo JSON with new annotations
          aux_in_dir    : directory to load existing auxiliary data from
          aux_out_dir   : directory to write new auxiliary data to
          real_aux_data : whether to compute real aux data or dummy data
        Output:
          -- writes one .pkl file per image into aux_out_dir and the
             dataset JSON to ann_out_file
        Raises:
          IndexError if an image has no annotation carrying 'aux_file'.
        '''
        # load the source coco annotations/dataset
        self.coco = CustomCoCo(ann_in_file)
        # exist_ok avoids a check-then-create race on the output directory
        os.makedirs(aux_out_dir, exist_ok=True)
        tic = time.time()
        print('Computing new auxiliary features for {0:d} images:'.format(len(self.coco.imgs)))
        raw_images, aux_anns, aux_data_dicts = [], [], []
        for i, img_id in enumerate(self.coco.imgs):
            # fetch image info from the coco manager
            img = self.coco.imgs[img_id]
            img_file = img['file_name']
            # fetch the image from disk...
            img_path = os.path.join(self.img_dir, img_file)
            raw_images.append(load_image(img_path, img_format='cv2'))
            # fetch the auxiliary data annotation for this image
            img_anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
            aux_ann = [ann for ann in img_anns if ('aux_file' in ann)][0]
            aux_anns.append(aux_ann)
            # fetch the auxiliary data for this image
            # NOTE: pickle.load can execute arbitrary code contained in the
            # file; aux files must come from a trusted source.
            aux_file = os.path.join(aux_in_dir, aux_ann['aux_file'])
            with open(aux_file, 'rb') as aux_handle:
                aux_data_dicts.append(pickle.load(aux_handle))
            if len(raw_images) == self._BATCH_SIZE:
                # compute new auxiliary features for a full batch
                aux_data_dicts = \
                    self._compute_aux_features(encoder, raw_images, aux_data_dicts, real_aux_data)
                self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
                raw_images, aux_anns, aux_data_dicts = [], [], []
            if ((i + 1) % 100) == 0:
                # progress indicator
                img_per_sec = 100. / (time.time() - tic)
                print('-- {0:7d} images completed, {1:.2f} img/sec...'.format((i + 1), img_per_sec))
                tic = time.time()
        if raw_images:
            # compute and dump auxiliary data for the final partial batch
            aux_data_dicts = \
                self._compute_aux_features(encoder, raw_images, aux_data_dicts, real_aux_data)
            self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
        # store dataset json file including update for new annotations
        self.coco.saveDataset(ann_out_file)
        print('Writing updated dataset to JSON: {0:s}, Good Job!'.format(ann_out_file))
        return

    def _compute_aux_data(self, raw_images, predictor, num_boxes, real_aux_data):
        '''
        Compute an auxiliary data dict for each image in raw_images.

        Input:
          raw_images     : list of images as numpy arrays in 'cv2' format
          predictor      : model handed to extract_raw_features
          num_boxes      : box count handed to extract_raw_features
          real_aux_data  : flag for whether to compute real or dummy aux data
        Output:
          aux_data_dicts : a dict of aux data for each image in raw_images
        '''
        if real_aux_data:
            # compute auxiliary data using a detectron2-style model
            instances, features, probs = \
                extract_raw_features(raw_images, predictor, num_boxes)
            return process_raw_features(raw_images, instances, features, probs)
        # NOTE(review): the dummy path passes a fixed 50 instead of
        # num_boxes -- confirm whether it should follow num_boxes.
        return [dummy_aux_data_dict(50) for _ in raw_images]

    def _compute_aux_features(self, encoder, raw_images, aux_data_dicts, real_aux_data):
        '''
        Compute new per-box features for these images, using this encoder.
        When real_aux_data is False the given aux_data_dicts are returned
        unchanged.
        '''
        if real_aux_data:
            aux_data_dicts = update_box_features(encoder, raw_images, aux_data_dicts)
        return aux_data_dicts

    def _write_aux_data(self, aux_dir, aux_anns, aux_data_dicts):
        '''
        Write batch of auxiliary data to disk and update coco annotation file.

        Input:
          aux_dir        : directory the aux .pkl files are written into
          aux_anns       : list of coco annotations linking aux data to image
          aux_data_dicts : list of dicts holding aux data for each image
        '''
        for aux_ann, aux_data_dict in zip(aux_anns, aux_data_dicts):
            # add annotation to the in-memory coco annotations
            self.coco.addAnn(aux_ann)
            # save auxiliary data; the context manager closes the file even
            # if pickling fails
            with open(os.path.join(aux_dir, aux_ann['aux_file']), 'wb') as aux_file:
                pickle.dump(aux_data_dict, aux_file)


 def test_new_stuff(i, ds_size, stuff_type='aux_data', num_chunks=4):
    '''
    Run one of the CoCoAuxData jobs against the hard-coded GCC train paths.

    Input:
      i          : chunk index; this call handles image ids starting at
                   i * chunk_size
      ds_size    : dataset size used to derive the chunk size
      stuff_type : 'aux_data' computes new auxiliary data for one chunk;
                   any other value recomputes auxiliary features for the
                   whole dataset
      num_chunks : number of chunks the dataset is split into (default 4,
                   the previously hard-coded value)
    '''
    # hardcoded BS for now
    print('Loading default detectron2 predictor...')
    # NOTE(review): the coco predictor is loaded and immediately replaced
    # by the vg predictor -- confirm whether the first load is still needed.
    predictor, _ = load_predictor_coco()
    predictor, _ = load_predictor_vg()
    # set directories for source dataset
    source_ann_json = '/gcc/GCC/train/captions_train.json'
    source_img_dir = '/gcc/GCC/train/train'
    # set directories for new annotation files
    new_ann_json = '/gcc/GCC/train/AUX_train.json'
    new_aux_data_dir = '/gcc/GCC/train/AUX_train'

    # run the stuff...
    print('Building CoCoAuxData...')
    cad = CoCoAuxData(source_img_dir)
    if stuff_type == 'aux_data':
        print('Adding auxiliary data to dataset...')
        # +1 so that num_chunks chunks always cover ds_size ids
        chunk_size = (ds_size // num_chunks) + 1
        start_idx = chunk_size * i
        cad.new_aux_data(start_idx, chunk_size, predictor, source_ann_json, new_ann_json, new_aux_data_dir,
                         num_boxes=50, img_extension='.jpg', real_aux_data=True)
    else:
        print('Adding new auxiliary features to dataset...')
        encoder = predictor
        # features are recomputed in place: read from and write to the
        # new annotation file and aux data directory
        source_ann_json = new_ann_json
        source_aux_data_dir = new_aux_data_dir
        cad.new_aux_ftrs(encoder, source_ann_json, new_ann_json,
                         source_aux_data_dir, new_aux_data_dir,
                         real_aux_data=True)






 ##############
 # EYE BUFFER #
 ##############