Last active
June 6, 2018 20:04
-
-
Save rosterloh/380f7a6ce0e8989e7cac57ca59dfc24e to your computer and use it in GitHub Desktop.
CV-ND Project 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
sys.path.append('/opt/cocoapi/PythonAPI') | |
from pycocotools.coco import COCO | |
import nltk | |
nltk.download('punkt') | |
from data_loader import get_loader | |
from torchvision import transforms | |
import torch | |
from collections import Counter | |
import numpy as np | |
import torch.utils.data as data | |
from model import EncoderCNN, DecoderRNN | |
# Pre-processing pipeline applied to every training image.
transform_train = transforms.Compose([
    transforms.Resize(256),             # smaller edge of the image is resized to 256
    transforms.RandomCrop(224),         # 224x224 crop taken from a random location
    transforms.RandomHorizontalFlip(),  # horizontal flip with probability 0.5
    transforms.ToTensor(),              # PIL Image -> tensor
    transforms.Normalize(mean=(0.485, 0.456, 0.406),  # normalisation expected by
                         std=(0.229, 0.224, 0.225)),  # the pre-trained model
])

# Batch size, and the minimum word count a token needs to enter the vocabulary.
batch_size = 10
vocab_threshold = 4

# Build the training data loader.
data_loader = get_loader(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=True,  # False if tweaking threshold
)

# Report the size of the vocabulary.
print('Total number of tokens in vocabulary:', len(data_loader.dataset.vocab))

# (Optional) Tally the total number of training captions with each length.
# counter = Counter(data_loader.dataset.caption_lengths)
# lengths = sorted(counter.items(), key=lambda pair: pair[1], reverse=True)
# for value, count in lengths:
#     print('value: %2d --- count: %5d' % (value, count))

# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)

# Make the loader draw its next batch from exactly those indices.
new_sampler = data.sampler.SubsetRandomSampler(indices)
data_loader.batch_sampler.sampler = new_sampler

# Pull a single batch out of the loader.
batch_iterator = iter(data_loader)
images, captions = next(batch_iterator)

print('images.shape:', images.shape)
print('captions.shape:', captions.shape)

# (Optional) Uncomment the lines of code below to print the pre-processed images and captions.
# print('images:', images)
# print('captions:', captions)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Running on', device)

# Specify the dimensionality of the image embedding.
embed_size = 256

# Initialize the encoder and move it to the selected device.
encoder = EncoderCNN(embed_size)
encoder.to(device)

# Move the sampled batch of images to the same device as the encoder.
images = images.to(device)

# Pass the images through the encoder.
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)
print('isCUDA:', features.is_cuda)

# Check that your encoder satisfies some requirements of the project! :D
# isinstance() is the idiomatic type check, and comparing the whole shape at
# once reports a wrong-rank output through the assertion message instead of
# an IndexError raised while indexing a missing dimension.
assert isinstance(features, torch.Tensor), "Encoder output needs to be a PyTorch Tensor."
assert tuple(features.shape) == (batch_size, embed_size), \
    "The shape of the encoder output is incorrect."
# Specify the number of features in the hidden state of the RNN decoder.
hidden_size = 512

# Store the size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the decoder and move it to the selected device.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.to(device)

# Move the sampled batch of captions to the same device as the decoder.
captions = captions.to(device)

# Pass the encoder output and captions through the decoder.
outputs = decoder(features, captions)

print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)

# Check that your decoder satisfies some requirements of the project! :D
# isinstance() is the idiomatic type check, and comparing the whole shape at
# once reports a wrong-rank output through the assertion message instead of
# an IndexError raised while indexing a missing dimension.
assert isinstance(outputs, torch.Tensor), "Decoder output needs to be a PyTorch Tensor."
assert tuple(outputs.shape) == (batch_size, captions.shape[1], vocab_size), \
    "The shape of the decoder output is incorrect."
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.nn as nn | |
import torchvision.models as models | |
from torch.nn.utils.rnn import pack_padded_sequence | |
class EncoderCNN(nn.Module):
    """Image encoder: a pre-trained ResNet-152 trunk plus a trainable projection."""

    def __init__(self, embed_size):
        """Load the pre-trained ResNet-152 and swap its classifier for an embedding head.

        Args:
            embed_size (int): Size of the feature vector produced per image.
        """
        super().__init__()
        backbone = models.resnet152(pretrained=True)
        # Keep every child module except the final fully connected layer.
        trunk_layers = list(backbone.children())
        trunk_layers.pop()
        self.resnet = nn.Sequential(*trunk_layers)
        # Projection from the ResNet feature width to the embedding size,
        # followed by batch normalisation of the embedding.
        self.linear = nn.Linear(backbone.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Return one (batch-normalised) feature vector per input image."""
        # The ResNet trunk is evaluated without gradient tracking.
        with torch.no_grad():
            pooled = self.resnet(images)
        # Collapse everything after the batch dimension into a single axis.
        flattened = pooled.reshape(pooled.size(0), -1)
        projected = self.linear(flattened)
        return self.bn(projected)
class DecoderRNN(nn.Module):
    r"""
    Provides functionality for decoding in https://arxiv.org/pdf/1411.4555.pdf

    Args:
        embed_size (int): The size of each embedding vector
        hidden_size (int): The number of features in the hidden state of the RNN decoder
        vocab_size (int): The size of the dictionary of embeddings
        num_layers (int): The number of recurrent layers in the LSTM

    Inputs:
        features (batch_size, embed_size): Tensor containing the encoder output
        captions (batch_size, caption_length): Tensor of integer token indices

    Outputs:
        out (batch_size, caption_length, vocab_size): Unnormalised vocabulary
            scores for every position of the caption
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_size.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        # The linear layer that maps from hidden state space to vocabulary space.
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Not applied in forward(), which returns raw scores. The dimension is
        # given explicitly because the implicit-dim form is deprecated; -1 is
        # the vocabulary axis of the scores produced by self.linear.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, features, captions):
        """Decode image feature vectors and generate caption scores.

        The image feature is fed to the LSTM as the first time step, followed
        by the embeddings of every caption token except the last one, so the
        output has as many time steps as ``captions`` has columns.
        """
        # Drop the last token of each caption (the <end> token).
        embeddings = self.embed(captions[:, :-1])
        # Prepend the image feature as the first element of the sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(embeddings)
        return self.linear(hiddens)

    def sample(self, inputs, states=None, max_len=20):
        """Greedily decode a caption for a single image.

        Args:
            inputs: Tensor fed to the LSTM at the first step. Expected to be
                the encoder output for one image shaped
                (1, 1, embed_size) -- TODO confirm against the caller.
            states: Optional initial LSTM state; ``None`` starts from zeros.
            max_len (int): Number of tokens to generate.

        Returns:
            list of int: The ``max_len`` predicted token indices. No early
            stop is applied because the index of the end token is not known
            to this class.
        """
        token_ids = []
        with torch.no_grad():  # inference only, no gradients needed
            for _ in range(max_len):
                hiddens, states = self.lstm(inputs, states)
                scores = self.linear(hiddens.squeeze(1))
                # Greedy choice: the highest-scoring word at this step.
                best = scores.argmax(dim=1)
                token_ids.append(best.item())
                # The chosen word becomes the input of the next step.
                inputs = self.embed(best).unsqueeze(1)
        return token_ids
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/home/rosterloh/miniconda3/envs/cv-nd/bin/python /home/rosterloh/cv-nd/P2_Image_Captioning/analyse.py | |
[nltk_data] Downloading package punkt to /home/rosterloh/nltk_data... | |
[nltk_data] Package punkt is already up-to-date! | |
Vocabulary successfully loaded from vocab.pkl file! | |
loading annotations into memory... | |
Done (t=0.54s) | |
creating index... | |
index created! | |
Obtaining caption lengths... | |
100%|██████████| 414113/414113 [00:38<00:00, 10650.50it/s] | |
Total number of tokens in vocabulary: 9956 | |
sampled indices: [199955, 76317, 13471, 289884, 182100, 101662, 386165, 85410, 384008, 80888] | |
images.shape: torch.Size([10, 3, 224, 224]) | |
captions.shape: torch.Size([10, 15]) | |
Running on cuda | |
type(features): <class 'torch.Tensor'> | |
features.shape: torch.Size([10, 256]) | |
isCUDA: True | |
Embeddings: torch.Size([10, 15, 256]) | |
LSTM Output: torch.Size([10, 16, 512]) | |
Linear Output: torch.Size([10, 16, 9956]) | |
type(outputs): <class 'torch.Tensor'> | |
outputs.shape: torch.Size([10, 16, 9956]) | |
Traceback (most recent call last): | |
File "/home/rosterloh/cv-nd/P2_Image_Captioning/analyse.py", line 112, in <module> | |
assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==captions.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect." | |
AssertionError: The shape of the decoder output is incorrect. | |
Process finished with exit code 1 | |
Embeddings: torch.Size([10, 11, 256]) | |
LSTM Output: torch.Size([10, 12, 512]) | |
Linear Output: torch.Size([10, 12, 9955]) | |
type(outputs): <class 'torch.Tensor'> | |
outputs.shape: torch.Size([10, 12, 9955]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment