Last active
April 14, 2018 23:15
-
-
Save Yatoom/01aaf6026269081f8c7c89a4f14a8df7 to your computer and use it in GitHub Desktop.
Add ids and image ids columns
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tqdm import tqdm | |
import string | |
import json | |
import collections | |
import numpy as np | |
import os | |
DATA_PATH = "data"

################################################################################################
# IMPORT NUMPY FILES
################################################################################################
# Each file is read back with .item(), i.e. it is expected to hold one pickled
# mapping wrapped in a 0-d object array. NumPy >= 1.16.3 refuses to unpickle
# object arrays unless allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
np_train_data = np.load(os.path.join(DATA_PATH, 'train_data.npy'), allow_pickle=True)
np_val_data = np.load(os.path.join(DATA_PATH, 'val_data.npy'), allow_pickle=True)
np_test_data = np.load(os.path.join(DATA_PATH, 'test_data.npy'), allow_pickle=True)


def _extract_split(np_data):
    """Copy the 'caps' and 'ims' entries of a loaded split into an OrderedDict.

    Args:
        np_data: Array returned by np.load whose .item() is a mapping.

    Returns:
        An OrderedDict with keys 'caps' and 'ims' (in that order), or an empty
        OrderedDict when the stored mapping is empty.
    """
    raw = np_data.item()
    split = collections.OrderedDict()
    # Only read the entries when the stored mapping is non-empty, so an empty
    # mapping produces an empty split instead of a KeyError.
    if raw:
        split['caps'] = raw['caps']
        split['ims'] = raw['ims']
    return split


train_data = _extract_split(np_train_data)
val_data = _extract_split(np_val_data)
test_data = _extract_split(np_test_data)
################################################################################################ | |
# IMPORT FILES JSON FILES | |
################################################################################################ | |
# JSON text is UTF-8 by specification; pass the encoding explicitly so decoding
# does not depend on the platform's locale default.
with open(os.path.join(DATA_PATH, 'instances_val2014.json'), encoding='utf-8') as json_file:
    coco_instances_val = json.load(json_file)
with open(os.path.join(DATA_PATH, 'captions_val2014.json'), encoding='utf-8') as json_file:
    coco_caption_val = json.load(json_file)
################################################################################################ | |
# HELPER FUNCTIONS | |
################################################################################################ | |
# Group captions for the same image together
def group_captions(data, captions_per_image=5):
    """Reshape the flat caption sequence into fixed-size rows.

    Args:
        data: Mapping with a 'caps' entry holding a flat sequence of captions.
            Presumably consecutive captions describe the same image -- verify
            against the code that produced the data.
        captions_per_image: Number of consecutive captions placed in each row.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        A 2-D numpy array with `captions_per_image` columns.

    Raises:
        ValueError: Raised by numpy's reshape when the number of captions is
            not a multiple of `captions_per_image`.
    """
    return np.array(data['caps']).reshape(-1, captions_per_image)
# Remove punctuation and make it lowercase
def process_annotations(annotations):
    """Add a normalized "processed" caption to every annotation.

    The normalized caption is the annotation's "caption" with all characters
    from string.punctuation removed, then lowercased.

    Args:
        annotations: Iterable of dicts, each with a str "caption" entry.
            The dicts are modified in place.

    Returns:
        A new list containing the same (now updated) annotation dicts, in
        their original order.
    """
    # Build the deletion table once; translate() then strips punctuation in a
    # single pass per caption instead of a Python-level check per character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    result = []
    for annotation in annotations:
        annotation["processed"] = annotation["caption"].translate(strip_punctuation).lower()
        result.append(annotation)
    return result
# Get matches using the captions
def get_matches(captions, lookup):
    """Find one lookup entry for every group of captions.

    Args:
        captions: Sequence of caption groups; each group is an iterable of
            queries accepted by get_match.
        lookup: Collection of entries searched by get_match.

    Returns:
        A list with one element per group: the first truthy result that
        get_match returns for a caption of that group, or None when no
        caption of the group matches.
    """
    found_entries = []
    for group in tqdm(captions):
        # The first caption is rarely missing from the lookup; when it is, the
        # lazy generator simply moves on to the next caption of the group.
        hits = (get_match(query=candidate, lookup=lookup) for candidate in group)
        found_entries.append(next((hit for hit in hits if hit), None))
    return found_entries
# Get item that matches query
def get_match(query, lookup):
    """Return the first lookup entry whose processed caption contains the query.

    Args:
        query: Caption to search for, either UTF-8 encoded bytes or a str.
        lookup: Iterable of dicts, each with a str "processed" entry.

    Returns:
        The first entry for which the decoded query is a substring of
        entry["processed"], or None when there is no such entry.
    """
    # str(x, "utf8") raises TypeError for an already-decoded str, so only
    # decode when the query is not text yet.
    q = query if isinstance(query, str) else str(query, "utf8")
    for entry in lookup:
        if q in entry["processed"]:
            return entry
    return None
################################################################################################ | |
# MAIN FUNCTIONS | |
################################################################################################ | |
# Use the captions to find the ids
def get_ids(coco_caption, data):
    """Look up the annotation id and image id for every caption group.

    Args:
        coco_caption: Mapping with an "annotations" entry: a list of dicts
            carrying "caption", "id" and "image_id". The dicts gain a
            "processed" entry as a side effect of process_annotations.
        data: Mapping with a 'caps' entry, grouped via group_captions.

    Returns:
        A tuple (ids, image_ids, caps) where caps is the grouped caption
        array and ids / image_ids hold one element per caption group. The
        element is None for a group that has no matching annotation.
    """
    caps = group_captions(data)
    lookup = process_annotations(coco_caption["annotations"])
    matches = get_matches(caps, lookup)
    # get_matches yields None for groups without a match; indexing None would
    # raise TypeError, so propagate None for those groups instead.
    ids = [match["id"] if match is not None else None for match in matches]
    image_ids = [match["image_id"] if match is not None else None for match in matches]
    return ids, image_ids, caps
# Adding the ids and image ids as columns to data, and group the captions
def add_ids(coco_caption, data):
    """Store grouped captions, ids and image ids on `data` and return it.

    Args:
        coco_caption: Caption annotations forwarded to get_ids.
        data: Mutable mapping with a 'caps' entry; it is updated in place.

    Returns:
        The same `data` object, with "caps" replaced by the grouped captions
        and the "ids" and "image_ids" entries set.
    """
    caption_ids, img_ids, grouped_caps = get_ids(coco_caption, data)
    columns = (
        ("caps", grouped_caps),
        ("ids", caption_ids),
        ("image_ids", img_ids),
    )
    for column, values in columns:
        data[column] = values
    return data
################################################################################################
# ADD COLUMNS IDS AND IMAGE IDS, AND GROUP CAPTIONS
################################################################################################
# All three splits are matched against the same caption annotations
# (coco_caption_val), in the order val, train, test.
val_data, train_data, test_data = [
    add_ids(coco_caption_val, split)
    for split in (val_data, train_data, test_data)
]

# Persist the enriched splits; np.save appends ".npy" to each file name.
for filename, split in (
    ("val_data_with_ids", val_data),
    ("train_data_with_ids", train_data),
    ("test_data_with_ids", test_data),
):
    np.save(filename, split)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To retrieve the new files:
Example: