How to load external dataset - python

How can I load an external dataset instead of MNIST?
# underscore to omit the label arrays
(train_images, train_labels), (_, _) = tf.keras.datasets.mnist.load_data()
train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')
train_images = (train_images - 127.5) / 127.5 # Normalize the images to [-1, 1]
BUFFER_SIZE = 60000
BATCH_SIZE = 256
# Batch and shuffle the data
train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
I've tried flow_from_directory with "(train_images, _) = train_generator", but I cannot extract the image values the way the code above does:
train_image_generator = ImageDataGenerator(
    rescale=1./255)
    #rotation_range=10,
    #zoom_range=0.10,
    #width_shift_range=0.1,
    #height_shift_range=0.1
train_generator = train_image_generator.flow_from_directory(
    batch_size=256,
    color_mode="grayscale",
    directory='../input/main-dataset110/Train',
    shuffle=True,
    target_size=(28, 28),
    class_mode='sparse')
(train_images, _) = train_generator
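A flow_from_directory generator is a DirectoryIterator, so it cannot be unpacked like a tuple; it yields one (images, labels) batch per call to next(). A minimal sketch of one way to collect its contents into arrays and feed them to the same tf.data pipeline (assuming the train_generator above; the [-1, 1] rescaling mirrors the MNIST snippet):
import numpy as np
import tensorflow as tf

# Drain the generator once: len(train_generator) is the number of batches.
batches = [next(train_generator) for _ in range(len(train_generator))]
train_images = np.concatenate([imgs for imgs, _ in batches])

# color_mode="grayscale" already gives shape (28, 28, 1);
# rescale [0, 1] -> [-1, 1] to match the MNIST preprocessing.
train_images = (train_images - 0.5) * 2.0

train_dataset = tf.data.Dataset.from_tensor_slices(
    train_images).shuffle(len(train_images)).batch(256)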

I found code that helped:
try:
    import tensorflow as tf
    import cv2
    import os
    import pickle
    import numpy as np
    print("Libraries loaded successfully ..........")
except ImportError:
    print("Library not found!")

class MasterImage(object):

    def __init__(self, PATH='', IMAGE_SIZE=28):
        self.PATH = PATH
        self.IMAGE_SIZE = IMAGE_SIZE
        self.image_data = []
        self.x_data = []
        self.y_data = []
        self.CATEGORIES = []
        # This will hold the list of categories
        self.list_categories = []

    def get_categories(self):
        for path in os.listdir(self.PATH):
            if '.DS_Store' in path:
                pass
            else:
                self.list_categories.append(path)
        print("Found categories ", self.list_categories, '\n')
        return self.list_categories

    def Process_Image(self):
        """
        Return numpy arrays of images and labels
        :return: X_Data, Y_Data
        """
        try:
            self.CATEGORIES = self.get_categories()
            for categories in self.CATEGORIES:                            # iterate over categories
                train_folder_path = os.path.join(self.PATH, categories)   # folder path
                class_index = self.CATEGORIES.index(categories)           # index used as the class label
                for img in os.listdir(train_folder_path):                 # iterate over the folder
                    new_path = os.path.join(train_folder_path, img)       # image path
                    try:                                                  # skip corrupted images
                        image_data_temp = cv2.imread(new_path, cv2.IMREAD_GRAYSCALE)
                        image_temp_resize = cv2.resize(image_data_temp, (self.IMAGE_SIZE, self.IMAGE_SIZE))
                        self.image_data.append([image_temp_resize, class_index])
                    except:
                        pass
            # dtype=object because each entry pairs an array with an int label
            data = np.asanyarray(self.image_data, dtype=object)
            # Iterate over the data
            for x in data:
                self.x_data.append(x[0])   # the image
                self.y_data.append(x[1])   # the label
            X_Data = np.asarray(self.x_data) / 255.0   # normalize to [0, 1]
            Y_Data = np.asarray(self.y_data)
            # Reshape X_Data
            X_Data = X_Data.reshape(-1, self.IMAGE_SIZE, self.IMAGE_SIZE, 1)
            return X_Data, Y_Data
        except:
            print("Failed to run Process_Image")

    def pickle_image(self):
        """
        Creates pickle objects of the dataset
        :return: X_Data, Y_Data
        """
        # Call the function and get the data
        X_Data, Y_Data = self.Process_Image()
        # Write the image data into a pickle file
        pickle_out = open('X_Data', 'wb')
        pickle.dump(X_Data, pickle_out)
        pickle_out.close()
        # Write the label data
        pickle_out = open('Y_Data', 'wb')
        pickle.dump(Y_Data, pickle_out)
        pickle_out.close()
        print("Pickled images successfully")
        return X_Data, Y_Data

    def load_dataset(self):
        try:
            # Read the data from the pickle objects
            X_Temp = open('X_Data', 'rb')
            X_Data = pickle.load(X_Temp)
            Y_Temp = open('Y_Data', 'rb')
            Y_Data = pickle.load(Y_Temp)
            print('Reading dataset from pickle objects')
            return X_Data, Y_Data
        except:
            print('Could not find pickle files')
            print('Loading files and building dataset ..........')
            X_Data, Y_Data = self.pickle_image()
            return X_Data, Y_Data

if __name__ == "__main__":
    path = '../input/main-dataset110/Test'
    a = MasterImage(PATH=path, IMAGE_SIZE=28)
    X_Data, Y_Data = a.load_dataset()
    print(X_Data.shape)
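One caveat if you feed this into the MNIST-style pipeline at the top: Process_Image normalizes to [0, 1], while the GAN snippet expects [-1, 1]. A short sketch of the bridge, assuming the X_Data returned above:
import tensorflow as tf

# Rescale [0, 1] -> [-1, 1] and build the same shuffled, batched dataset.
X_Data = (X_Data.astype('float32') - 0.5) * 2.0
train_dataset = tf.data.Dataset.from_tensor_slices(X_Data) \
    .shuffle(len(X_Data)).batch(256)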

Related

Am I wrongly converting VGG19 based Style Transfer pytorch model to ONNX?

https://github.com/safwankdb/Neural-Style-Transfer
I created a model file by running the code in the link above, then converted it to ONNX to run with C++ and OpenCV.
But while the input of the ONNX file I exported is (1, 3, 512, 512), the output comes out as (1, 512, 28, 28). My export code:
VGG.eval()
torch.save(VGG, 'torchmodel.pth')
dummy_input = Variable(torch.randn(1, 3, 512, 512, device='cuda:1'))
input_names = ['input']
output_names = ['output']
onnxfile='style.onnx'
torch.onnx.export(VGG,dummy_input,onnxfile,verbose=False,input_names=input_names,opset_version=11, output_names=output_names)
I tried export parameters suggested on a few sites, but it didn't work.
Does anyone have an opinion on this matter?
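For what it's worth, you can confirm what the exported graph actually declares by reading its input/output value infos with the onnx package; a minimal sketch, assuming the style.onnx produced above:
import onnx

model = onnx.load('style.onnx')
onnx.checker.check_model(model)

# Print the declared shape of every graph input and output.
for value in list(model.graph.input) + list(model.graph.output):
    dims = [d.dim_value for d in value.type.tensor_type.shape.dim]
    print(value.name, dims)

Note also that VGG in the script below is vgg19(pretrained=True).features, i.e. only the convolutional feature extractor, so a (1, 512, H, W) feature-map output is what that sub-module produces; the stylized picture in the script is the optimized image tensor, not the network's forward output.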
Original code:
# -*- coding: utf-8 -*-
"""StyleTransfer.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/15JKaqmpVNr8NhURJWgbkvIl1sd0aKS3o
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision as tv
from PIL import Image
import imageio
import numpy as np
from matplotlib import pyplot as plt

to_tensor = tv.transforms.Compose([
    tv.transforms.Resize((512, 512)),
    tv.transforms.ToTensor(),
    tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[1, 1, 1]),
])
unload = tv.transforms.Compose([
    tv.transforms.Normalize(mean=[-0.485, -0.456, -0.406],
                            std=[1, 1, 1]),
    tv.transforms.Lambda(lambda x: x.clamp(0, 1))
])
to_image = tv.transforms.ToPILImage()

style_img = 'udnie.jpg'
input_img = 'chicago.jpg'
style_img = Image.open(style_img)
input_img = Image.open(input_img)
style_img = to_tensor(style_img).cuda()
input_img = to_tensor(input_img).cuda()

def get_features(module, x, y):
    # print('here')
    features.append(y)

def gram_matrix(x):
    b, c, h, w = x.size()
    F = x.view(b, c, h * w)
    G = torch.bmm(F, F.transpose(1, 2)) / (h * w)
    return G

VGG = tv.models.vgg19(pretrained=True).features
VGG.cuda()
for i, layer in enumerate(VGG):
    if i in [0, 5, 10, 19, 21, 28]:
        VGG[i].register_forward_hook(get_features)
    elif isinstance(layer, nn.MaxPool2d):
        VGG[i] = nn.AvgPool2d(kernel_size=2)
VGG.eval()
for p in VGG.parameters():
    p.requires_grad = False

features = []
VGG(input_img.unsqueeze(0))
c_target = features[4].detach()
features = []
VGG(style_img.unsqueeze(0))
f_targets = features[:4] + features[5:]
gram_targets = [gram_matrix(i).detach() for i in f_targets]

alpha = 1
beta = 1e3
iterations = 200
image = input_img.clone().unsqueeze(0)
# image = torch.randn(1,3,512,512).cuda()
images = []
optimizer = optim.LBFGS([image.requires_grad_()], lr=1)
mse_loss = nn.MSELoss(reduction='mean')
l_c = []
l_s = []
counter = 0

for itr in range(iterations):
    features = []

    def closure():
        optimizer.zero_grad()
        VGG(image)
        t_features = features[-6:]
        content = t_features[4]
        style_features = t_features[:4] + t_features[5:]
        t_features = []
        gram_styles = [gram_matrix(i) for i in style_features]
        c_loss = alpha * mse_loss(content, c_target)
        s_loss = 0
        for i in range(5):
            n_c = gram_styles[i].shape[0]
            s_loss += beta * mse_loss(gram_styles[i], gram_targets[i]) / (n_c ** 2)
        total_loss = c_loss + s_loss
        l_c.append(c_loss)
        l_s.append(s_loss)
        total_loss.backward()
        return total_loss

    optimizer.step(closure)
    print('Step {}: S_loss: {:.8f} C_loss: {:.8f}'.format(itr, l_s[-1], l_c[-1]))
    if itr % 1 == 0:
        temp = unload(image[0].cpu().detach())
        temp = to_image(temp)
        temp = np.array(temp)
        images.append(temp)

imageio.mimsave('progress.gif', images)
plt.clf()
plt.plot(l_c, label='Content Loss')
plt.legend()
plt.savefig('loss1.png')
plt.clf()
plt.plot(l_s, label='Style Loss')
plt.legend()
plt.savefig('loss2.png')
plt.imsave('last.jpg', images[-1])

I'm trying to load BERT "tfbert-large-uncased" but I get the error "Can't load config.json file"

I'm trying to load the pre-trained BERT model, but I get an error while loading the tokenizer: it says config.json is not found.
If anyone knows how to solve this issue, please help me.
Model and path configuration
model_name = 'bert_v13'
data_dir = Path('../input/commonlitreadabilityprize/')
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
build_dir = Path('./build/')
output_dir = build_dir / model_name
trn_encode_file = output_dir / 'trn.enc.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = 'submission.csv'
pretrained_dir = '../tmp/input/tfbert-large-uncased'
id_col = 'id'
target_col = 'target'
text_col = 'excerpt'
max_len = 205
n_fold = 5
n_est = 2
n_stop = 2
batch_size = 8
seed = 42
Load Tokenizer and Model
# Tokenization using "Transformers"
# load tokenizer
def load_tokenizer():
    if not os.path.exists(pretrained_dir + '/vocab.txt'):
        Path(pretrained_dir).mkdir(parents=True, exist_ok=True)
        tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")
        tokenizer.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_dir)
    model_config = BertConfig.from_pretrained(pretrained_dir)
    model_config.output_hidden_states = True
    return tokenizer, model_config

# load bert model
def load_bert(config):
    if not os.path.exists(pretrained_dir + '/tf_model.h5'):
        Path(pretrained_dir).mkdir(parents=True, exist_ok=True)
        bert_model = TFBertModel.from_pretrained("bert-large-uncased", config=config)
        bert_model.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained model')
        bert_model = TFBertModel.from_pretrained(pretrained_dir, config=config)
    return bert_model
Encoding function
def bert_encode(texts, tokenizer, max_len=max_len):
    input_ids = []
    token_type_ids = []
    attention_mask = []
    for text in texts:
        # note: the keyword is max_length (the original code had the typo max_lenght)
        token = tokenizer(text, max_length=max_len, truncation=True,
                          padding='max_length', add_special_tokens=True)
        input_ids.append(token['input_ids'])
        token_type_ids.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])
    return np.array(input_ids), np.array(token_type_ids), np.array(attention_mask)
This code gives an error:
tokenizer, bert_config = load_tokenizer()
X = bert_encode(trn[text_col].values, tokenizer, max_len=max_len)
X_tst = bert_encode(tst[text_col].values, tokenizer, max_len=max_len)
y = trn[target_col].values
print(X[0].shape, X_tst[0].shape, y.shape)
Error
file ../tmp/input/tfbert-large-uncased/config.json not found
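If it helps: tokenizer.save_pretrained writes the vocabulary and tokenizer files but not the model's config.json, yet load_tokenizer calls BertConfig.from_pretrained(pretrained_dir) unconditionally, so on a first run it looks for a config.json that was never saved. A hedged sketch of one possible fix (same imports and pretrained_dir as above):
def load_tokenizer():
    if not os.path.exists(pretrained_dir + '/vocab.txt'):
        Path(pretrained_dir).mkdir(parents=True, exist_ok=True)
        tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")
        tokenizer.save_pretrained(pretrained_dir)
        # Also fetch and save the model config so config.json exists locally.
        config = BertConfig.from_pretrained("bert-large-uncased")
        config.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_dir)
    model_config = BertConfig.from_pretrained(pretrained_dir)
    model_config.output_hidden_states = True
    return tokenizer, model_config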

Cannot fix error: OpenCV(4.1.2) resize.cpp:3720: error: (-215:Assertion failed) !ssize.empty() in function 'resize'

I am trying to replicate this code from a repository of COVIDNet in Keras.
I fully ran the first notebook in the repo, but in the second one, I get an error.
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
# Set parameters here
INPUT_SIZE = (224, 224)
mapping = {'normal': 0, 'bacteria': 1, 'viral': 2, 'COVID-19': 3}
train_filepath = 'train_split.txt'
test_filepath = 'test_split.txt'
# load in the train and test files
file = open(train_filepath, 'r')
trainfiles = file.readlines()
file = open(test_filepath, 'r')
testfiles = file.readlines()
# Print
print('Total samples for train: ', len(trainfiles))
print('Total samples for test: ', len(testfiles))
Total samples for train: 72696
Total samples for test: 6255
# resize to input size and normalize to 0 - 1
x_train = []
x_test = []
y_train = []
y_test = []

for i in range(len(testfiles)):
    test_i = testfiles[i].split()
    imgpath = test_i[1]
    img = cv2.imread(os.path.join('data', 'test', imgpath))
    img = cv2.resize(img, INPUT_SIZE)  # resize
    img = img.astype('float32') / 255.0
    x_test.append(img)
    y_test.append(mapping[test_i[2]])

print('Shape of test images: ', x_test[0].shape)

for i in range(len(trainfiles)):
    train_i = trainfiles[i].split()
    imgpath = train_i[1]
    img = cv2.imread(os.path.join('data', 'train', imgpath))
    img = cv2.resize(img, INPUT_SIZE)  # resize
    img = img.astype('float32') / 255.0
    x_train.append(img)
    y_train.append(mapping[train_i[2]])

print('Shape of train images: ', x_train[0].shape)
Error:
error Traceback (most recent call last)
<ipython-input-31-5fa13deb65a6> in <module>()
10 imgpath = test_i[1]
11 img = cv2.imread(os.path.join('data', 'test', imgpath))
---> 12 img = cv2.resize(img, INPUT_SIZE) # resize
13 img = img.astype('float32') / 255.0
14 x_test.append(img)
There are multiple questions about similar errors, but I do not see how to fix it. Thanks!
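This assertion usually means cv2.resize received an empty image, and the most common cause is that cv2.imread returned None because the path doesn't exist or the file is unreadable (imread does not raise on a missing file). A minimal sketch of a guard, assuming the same loop variables as above:
img = cv2.imread(os.path.join('data', 'test', imgpath))
if img is None:
    # imread silently returns None for missing/corrupt files; skip and report.
    print('Could not read:', os.path.join('data', 'test', imgpath))
    continue
img = cv2.resize(img, INPUT_SIZE)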

ValueError: only one element tensors can be converted to Python scalars

I'm following this tutorial.
I'm at the last part, where we combine the models in a regression.
I'm coding this in Jupyter as follows:
import shutil
import os
import time
from datetime import datetime
import argparse
import pandas
import numpy as np
from tqdm import tqdm
from tqdm import tqdm_notebook
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torchsample.transforms import RandomRotate, RandomTranslate, RandomFlip, ToTensor, Compose, RandomAffine
from torchvision import transforms
import torch.nn.functional as F
from tensorboardX import SummaryWriter
import dataloader
from dataloader import MRDataset
import model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression  # needed for logreg below
def extract_predictions(task, plane, train=True):
    assert task in ['acl', 'meniscus', 'abnormal']
    assert plane in ['axial', 'coronal', 'sagittal']
    models = os.listdir('models/')

    model_name = list(filter(lambda name: task in name and plane in name, models))[0]
    model_path = f'models/{model_name}'

    mrnet = torch.load(model_path)
    _ = mrnet.eval()

    train_dataset = MRDataset('data/',
                              task,
                              plane,
                              transform=None,
                              train=train,
                              )
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=1,
                                               shuffle=False,
                                               num_workers=10,
                                               drop_last=False)

    predictions = []
    labels = []
    with torch.no_grad():
        for image, label, _ in tqdm_notebook(train_loader):
            logit = mrnet(image.cuda())
            prediction = torch.sigmoid(logit)
            predictions.append(prediction.item())
            labels.append(label.item())

    return predictions, labels

task = 'acl'
results = {}

for plane in ['axial', 'coronal', 'sagittal']:
    predictions, labels = extract_predictions(task, plane)
    results['labels'] = labels
    results[plane] = predictions

X = np.zeros((len(predictions), 3))
X[:, 0] = results['axial']
X[:, 1] = results['coronal']
X[:, 2] = results['sagittal']

y = np.array(labels)

logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X, y)

task = 'acl'
results_val = {}

for plane in ['axial', 'coronal', 'sagittal']:
    predictions, labels = extract_predictions(task, plane, train=False)
    results_val['labels'] = labels
    results_val[plane] = predictions

y_pred = logreg.predict_proba(X_val)[:, 1]
metrics.roc_auc_score(y_val, y_pred)
However I get this error:
ValueError Traceback (most recent call last)
<ipython-input-2-979acb314bc5> in <module>
3
4 for plane in ['axial', 'coronal', 'sagittal']:
----> 5 predictions, labels = extract_predictions(task, plane)
6 results['labels'] = labels
7 results[plane] = predictions
<ipython-input-1-647731b6b5c8> in extract_predictions(task, plane, train)
54 logit = mrnet(image.cuda())
55 prediction = torch.sigmoid(logit)
---> 56 predictions.append(prediction.item())
57 labels.append(label.item())
58
ValueError: only one element tensors can be converted to Python scalars
Here's the MRDataset code, in case it matters:
class MRDataset(data.Dataset):
    def __init__(self, root_dir, task, plane, train=True, transform=None, weights=None):
        super().__init__()
        self.task = task
        self.plane = plane
        self.root_dir = root_dir
        self.train = train
        if self.train:
            self.folder_path = self.root_dir + 'train/{0}/'.format(plane)
            self.records = pd.read_csv(
                self.root_dir + 'train-{0}.csv'.format(task), header=None, names=['id', 'label'])
        else:
            transform = None
            self.folder_path = self.root_dir + 'valid/{0}/'.format(plane)
            self.records = pd.read_csv(
                self.root_dir + 'valid-{0}.csv'.format(task), header=None, names=['id', 'label'])

        self.records['id'] = self.records['id'].map(
            lambda i: '0' * (4 - len(str(i))) + str(i))
        self.paths = [self.folder_path + filename +
                      '.npy' for filename in self.records['id'].tolist()]
        self.labels = self.records['label'].tolist()

        self.transform = transform
        if weights is None:
            pos = np.sum(self.labels)
            neg = len(self.labels) - pos
            self.weights = torch.FloatTensor([1, neg / pos])
        else:
            self.weights = torch.FloatTensor(weights)

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        array = np.load(self.paths[index])
        label = self.labels[index]
        if label == 1:
            label = torch.FloatTensor([[0, 1]])
        elif label == 0:
            label = torch.FloatTensor([[1, 0]])

        if self.transform:
            array = self.transform(array)
        else:
            array = np.stack((array,) * 3, axis=1)
            array = torch.FloatTensor(array)

        # if label.item() == 1:
        #     weight = np.array([self.weights[1]])
        #     weight = torch.FloatTensor(weight)
        # else:
        #     weight = np.array([self.weights[0]])
        #     weight = torch.FloatTensor(weight)

        return array, label, self.weights
I've only trained my models for 1 and 2 epochs per MRI plane instead of the 35 in the tutorial; I'm not sure whether that matters. Other than that, I'm stranded as to what this could be. I also removed normalize=False from the train_dataset options because it kept raising an error, and I read that it could be removed, but I'm not so sure about that either.
Only a tensor that contains a single value can be converted to a scalar with item(). Try printing the contents of prediction; I imagine it is a vector of probabilities indicating which label is most likely. Using argmax on prediction will give you the actual predicted label (assuming your labels are 0-n), as in the sketch below.
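A hedged sketch of that change inside extract_predictions (assuming the model emits one logit per class, which matches the 2-element one-hot labels built by MRDataset above):
with torch.no_grad():
    for image, label, _ in tqdm_notebook(train_loader):
        logit = mrnet(image.cuda())        # presumably shape (1, 2): one logit per class
        prediction = torch.sigmoid(logit)
        # .item() fails on a 2-element tensor; pick a single value first.
        # Keep the positive-class probability as the regression feature...
        predictions.append(prediction[0, 1].item())
        # ...and reduce the one-hot label (shape (1, 1, 2) after batching) to 0/1.
        labels.append(torch.argmax(label.squeeze()).item())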

IndexError: index 82459 is out of bounds for axis 0 with size 82459

I am trying to run code (found here) for a visual question generation model. I am running the code using Windows Subsystem for Linux, in an Anaconda virtual environment for Python 2.7. I am using TensorFlow v1.3.0, as I experienced issues using more recent versions; the repository is relatively old.
I am receiving the following error (full traceback included):
Traceback (most recent call last):
File "main.py", line 70, in <module>
tf.app.run()
File "/home/username/anaconda2/envs/py27/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "main.py", line 64, in main
model.train()
File "/home/username/VQG-tensorflow/question_generator.py", line 124, in train
feats = self.img_feature[img_list,:]
IndexError: index 82459 is out of bounds for axis 0 with size 82459
I've included the source code for main.py and question_generator.py below. Obviously, the program is trying to access an index that doesn't exist, and I can't figure out what makes it behave this way. Similar questions to this one (like this and this) were not helpful. I tried padding the array using the numpy.pad method, but that only led to a different, related error:
ValueError: Cannot feed value of shape (256, 4097) for Tensor u'Placeholder:0', which has shape '(256, 4096)'
Any and all help is greatly appreciated!
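As an aside, the (256, 4097) shape error after padding is expected behavior: np.pad with pad_width=(0, 1) on a 2-D array pads every axis, so an (N, 4096) feature matrix becomes (N+1, 4097) and no longer fits the (256, 4096) placeholder. A standalone sketch with a toy array:
import numpy as np

feats = np.zeros((3, 4096))
padded = np.pad(feats, (0, 1), 'constant', constant_values=(0, 0))
print(padded.shape)  # (4, 4097): (before=0, after=1) applied to BOTH axes

# Padding only the first axis would instead be:
padded_rows = np.pad(feats, ((0, 1), (0, 0)), 'constant')
print(padded_rows.shape)  # (4, 4096)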
Source code for main.py:
#-*- coding: utf-8 -*-
import math
import os
import tensorflow as tf
import numpy as np
import cPickle
import skimage
import pprint
import tensorflow.python.platform
from keras.preprocessing import sequence
from data_loader import *
import vgg19
import question_generator

flags = tf.app.flags
pp = pprint.PrettyPrinter().pprint

tf.app.flags.DEFINE_string('input_img_h5', './data_img.h5', 'path to the h5 file containing the image features')
tf.app.flags.DEFINE_string('input_ques_h5', './data_prepro.h5', 'path to the h5 file containing the preprocessed dataset')
tf.app.flags.DEFINE_string('input_json', './data_prepro.json', 'path to the json file containing additional info and vocab')
tf.app.flags.DEFINE_string('model_path', './models/', 'where should we save')
tf.app.flags.DEFINE_string('vgg_path', './vgg16.tfmodel', 'path to the pretrained vgg model')
tf.app.flags.DEFINE_string('gpu_fraction', '2/3', 'define the gpu fraction used')
tf.app.flags.DEFINE_string('test_image_path', './assets/demo.jpg', 'the image you want to generate a question for')
tf.app.flags.DEFINE_string('test_model_path', './models/model-250', 'model we saved')
tf.app.flags.DEFINE_integer('batch_size', 256, 'batch size for each iteration')
tf.app.flags.DEFINE_integer('dim_embed', 512, 'word embedding size')
tf.app.flags.DEFINE_integer('dim_hidden', 512, 'hidden size')
tf.app.flags.DEFINE_integer('dim_image', 4096, 'dimension of output from fc7')
tf.app.flags.DEFINE_integer('img_norm', 1, 'do normalization on image or not')
tf.app.flags.DEFINE_integer('maxlen', 26, 'max length of question')
tf.app.flags.DEFINE_integer('n_epochs', 250, 'how many epochs are we going to train')
tf.app.flags.DEFINE_float('learning_rate', '0.001', 'learning rate for adam')
tf.app.flags.DEFINE_float('momentum', 0.9, 'momentum for adam')
tf.app.flags.DEFINE_boolean('is_train', 'True', 'train or test mode')

conf = flags.FLAGS

def calc_gpu_fraction(fraction_string):
    idx, num = fraction_string.split('/')
    idx, num = float(idx), float(num)

    fraction = 1 / (num - idx + 1)
    print " [*] GPU : %.4f" % fraction
    return fraction

def main(_):
    attrs = conf.__dict__['__flags']
    pp(attrs)

    dataset, img_feature, train_data = get_data(conf.input_json, conf.input_img_h5, conf.input_ques_h5, conf.img_norm)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=calc_gpu_fraction(conf.gpu_fraction))
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model = question_generator.Question_Generator(sess, conf, dataset, img_feature, train_data)

        if conf.is_train:
            model.build_model()
            model.train()
        else:
            model.build_generator()
            model.test(test_image_path=conf.test_image_path, model_path=conf.test_model_path, maxlen=26)

if __name__ == '__main__':
    tf.app.run()
Source code for question_generator.py:
import os
import tensorflow as tf
import numpy as np
import tensorflow.python.platform
from keras.preprocessing import sequence
from data_loader import *
import vgg19

tf.pack = tf.stack
tf.select = tf.where
tf.batch_matmul = tf.matmul

class Question_Generator():
    def __init__(self, sess, conf, dataset, img_feature, train_data):
        self.sess = sess
        self.dataset = dataset
        self.img_feature = img_feature
        self.train_data = train_data
        self.dim_image = conf.dim_image
        self.dim_embed = conf.dim_embed
        self.dim_hidden = conf.dim_hidden
        self.batch_size = conf.batch_size
        self.maxlen = conf.maxlen
        self.n_lstm_steps = conf.maxlen + 2
        self.model_path = conf.model_path
        if conf.is_train:
            self.n_epochs = conf.n_epochs
            self.learning_rate = conf.learning_rate

        self.num_train = train_data['question'].shape[0]  # total number of data
        self.n_words = len(dataset['ix_to_word'].keys())  # vocabulary_size

        # word embedding
        self.Wemb = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='Wemb')
        self.bemb = tf.Variable(tf.random_uniform([self.dim_embed], -0.1, 0.1), name='bemb')

        # LSTM
        self.lstm = tf.contrib.rnn.BasicLSTMCell(self.dim_hidden)
        #self.lstm = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)

        # fc7 encoder
        self.encode_img_W = tf.Variable(tf.random_uniform([self.dim_image, self.dim_hidden], -0.1, 0.1), name='encode_img_W')
        self.encode_img_b = tf.Variable(tf.random_uniform([self.dim_hidden], -0.1, 0.1), name='encode_img_b')

        # feat -> word
        self.embed_word_W = tf.Variable(tf.random_uniform([self.dim_hidden, self.n_words], -0.1, 0.1), name='embed_word_W')
        self.embed_word_b = tf.Variable(tf.random_uniform([self.n_words], -0.1, 0.1), name='embed_word_b')

    def build_model(self):
        self.image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
        self.question = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        self.mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        image_emb = tf.nn.xw_plus_b(self.image, self.encode_img_W, self.encode_img_b)  # (batch_size, dim_hidden)
        state = self.lstm.zero_state(self.batch_size, tf.float32)

        loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):
                if i == 0:
                    current_emb = image_emb
                else:
                    tf.get_variable_scope().reuse_variables()
                    current_emb = tf.nn.embedding_lookup(self.Wemb, self.question[:, i-1]) + self.bemb

                # LSTM
                output, state = self.lstm(current_emb, state)

                if i > 0:
                    # ground truth
                    labels = tf.expand_dims(self.question[:, i], 1)
                    indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
                    concated = tf.concat([indices, labels], 1)
                    #concated = tf.concat(1, [indices, labels])
                    onehot_labels = tf.sparse_to_dense(
                        concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)

                    # predict word
                    logit_words = tf.nn.xw_plus_b(output, self.embed_word_W, self.embed_word_b)
                    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=onehot_labels)
                    cross_entropy = cross_entropy * self.mask[:, i]

                    current_loss = tf.reduce_sum(cross_entropy)
                    loss = loss + current_loss

        self.loss = loss / tf.reduce_sum(self.mask[:, 1:])

    def build_generator(self):
        self.image = tf.placeholder(tf.float32, [1, self.dim_image])  # only one image
        image_emb = tf.nn.xw_plus_b(self.image, self.encode_img_W, self.encode_img_b)

        state = tf.zeros([1, self.lstm.state_size])
        self.generated_words = []

        with tf.variable_scope("RNN"):
            output, state = self.lstm(image_emb, state)
            last_word = tf.nn.embedding_lookup(self.Wemb, [0]) + self.bemb

            for i in range(self.maxlen):
                tf.get_variable_scope().reuse_variables()

                output, state = self.lstm(last_word, state)

                logit_words = tf.nn.xw_plus_b(output, self.embed_word_W, self.embed_word_b)
                max_prob_word = tf.argmax(logit_words, 1)

                last_word = tf.nn.embedding_lookup(self.Wemb, max_prob_word)
                last_word += self.bemb

                self.generated_words.append(max_prob_word)

    def train(self):
        index = np.arange(self.num_train)
        np.random.shuffle(index)
        questions = self.train_data['question'][index, :]
        img_list = self.train_data['img_list'][index]
        print("img feature length: " + str(len(self.img_feature)))
        print("img list: " + str(img_list))
        #self.img_feature = np.pad(self.img_feature, (0,1), 'constant', constant_values=(0,0))  # pad array to prevent bug
        print("img feature length: " + str(len(self.img_feature)))
        feats = self.img_feature[img_list, :]

        self.saver = tf.train.Saver(max_to_keep=50)
        train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        tf.initialize_all_variables().run()

        for epoch in range(self.n_epochs):
            counter = 0
            for start, end in zip(
                    range(0, len(feats), self.batch_size),
                    range(self.batch_size, len(feats), self.batch_size)):
                current_feats = feats[start:end]
                current_questions = questions[start:end]

                current_question_matrix = sequence.pad_sequences(current_questions, padding='post', maxlen=self.maxlen+1)
                current_question_matrix = np.hstack([np.full((len(current_question_matrix), 1), 0), current_question_matrix]).astype(int)

                current_mask_matrix = np.zeros((current_question_matrix.shape[0], current_question_matrix.shape[1]))
                nonzeros = np.array(map(lambda x: (x != 0).sum() + 2, current_question_matrix))
                # +2 -> #START# and '.'

                for ind, row in enumerate(current_mask_matrix):
                    row[:nonzeros[ind]] = 1

                _, loss_value = self.sess.run([train_op, self.loss], feed_dict={
                    self.image: current_feats,
                    self.question: current_question_matrix,
                    self.mask: current_mask_matrix
                })

                if np.mod(counter, 100) == 0:
                    print "Epoch: ", epoch, " batch: ", counter, " Current Cost: ", loss_value
                counter = counter + 1

            if np.mod(epoch, 25) == 0:
                print "Epoch ", epoch, " is done. Saving the model ... "
                self.save_model(epoch)

    def test(self, test_image_path, model_path, maxlen):
        ixtoword = self.dataset['ix_to_word']

        images = tf.placeholder("float32", [1, 224, 224, 3])
        image_val = read_image(test_image_path)

        vgg = vgg19.Vgg19()
        with tf.name_scope("content_vgg"):
            vgg.build(images)

        fc7 = self.sess.run(vgg.relu7, feed_dict={images: image_val})

        saver = tf.train.Saver()
        saver.restore(self.sess, model_path)

        generated_word_index = self.sess.run(self.generated_words, feed_dict={self.image: fc7})
        generated_word_index = np.hstack(generated_word_index)

        generated_sentence = ''
        for x in generated_word_index:
            if x == 0:
                break
            word = ixtoword[str(x)]
            generated_sentence = generated_sentence + ' ' + word

        print ' '
        print '--------------------------------------------------------------------------------------------------------'
        print generated_sentence

    def save_model(self, epoch):
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        self.saver.save(self.sess, os.path.join(self.model_path, 'model'), global_step=epoch)
This is a really basic problem: arrays (and lists) in Python are 0-indexed. If an array has length n, its valid indices run from 0 to n-1, so accessing index n raises an IndexError. The traceback shows exactly that: img_list contains the value 82459 while img_feature has size 82459 along axis 0, i.e. valid indices 0 through 82458. See the sketch below.
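A standalone sketch reproducing the failure, plus one common fix for off-by-one indices (assuming the image ids in the data files are 1-based, which is a guess):
import numpy as np

img_feature = np.zeros((82459, 4096))   # axis 0 has size 82459: valid indices 0..82458
img_list = np.array([1, 42, 82459])     # 82459 is out of bounds

try:
    feats = img_feature[img_list, :]
except IndexError as e:
    print(e)  # index 82459 is out of bounds for axis 0 with size 82459

# If the dataset stores 1-based image ids, shifting them down fixes the lookup:
feats = img_feature[img_list - 1, :]
print(feats.shape)  # (3, 4096)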
