How do I set up a multi-variate regression problem using Trax?
I get AssertionError: Invalid shape (16, 2); expected (16,). from the code below, coming from the L2Loss object.
The following is my attempt to adapt the sentiment analysis example into a regression problem:
import os
import trax
from trax import layers as tl
from trax.supervised import training
import numpy
import random
#train_stream = trax.data.TFDS('imdb_reviews', keys=('text', 'label'), train=True)()
#eval_stream = trax.data.TFDS('imdb_reviews', keys=('text', 'label'), train=False)()
def generate_samples():
# (text, lat/lon)
data= [
("Aberdeen MS", numpy.array((33.824742, -88.554591)) ),
("Aberdeen SD", numpy.array((45.463186, -98.471033))),
("Aberdeen WA", numpy.array((46.976432, -123.795781))),
("Amite City LA", numpy.array((30.733723, -90.5208))),
("Amory MS", numpy.array((33.984789, -88.48001))),
("Amouli AS", numpy.array((-14.26556, -170.589772))),
("Amsterdam NY", numpy.array((42.953149, -74.19505)))
]
for i in range(1024*8):
yield random.choice(data)
train_stream = generate_samples()
eval_stream = generate_samples()
model = tl.Serial(
tl.Embedding(vocab_size=8192, d_feature=256),
tl.Mean(axis=1), # Average on axis 1 (length of sentence).
tl.Dense(2), # Regress to lat/lon
# tl.LogSoftmax() # Produce log-probabilities.
)
# You can print model structure.
print(model)
print(next(train_stream)) # See one example.
data_pipeline = trax.data.Serial(
trax.data.Tokenize(vocab_file='en_8k.subword', keys=[0]),
trax.data.Shuffle(),
# trax.data.FilterByLength(max_length=2048, length_keys=[0]),
trax.data.BucketByLength(boundaries=[ 8, 128,],
batch_sizes=[256, 64, 4],
length_keys=[0]),
trax.data.AddLossWeights()
)
train_batches_stream = data_pipeline(train_stream)
eval_batches_stream = data_pipeline(eval_stream)
example_batch = next(train_batches_stream)
print(f'shapes = {[x.shape for x in example_batch]}') # Check the shapes.:wq
# Training task.
train_task = training.TrainTask(
labeled_data=train_batches_stream,
# loss_layer=tl.CrossEntropyLoss(),
loss_layer=tl.L2Loss(),
optimizer=trax.optimizers.Adam(0.01),
n_steps_per_checkpoint=500,
)
# Evaluaton task.
eval_task = training.EvalTask(
labeled_data=eval_batches_stream,
metrics=[tl.L2Loss(),],
n_eval_batches=20 # For less variance in eval numbers.
)
# Training loop saves checkpoints to output_dir.
output_dir = os.path.expanduser('~/output_dir/')
training_loop = training.Loop(model,
train_task,
eval_tasks=[eval_task],
output_dir=output_dir)
# Run 2000 steps (batches).
training_loop.run(2000)
The problem might be in the generate_samples() generator: This yields only 1024*8 (=8192) samples. If I replace the line
for i in range(1024*8):
by
while True:
so that an infinite amount of samples is generated, your example works on my machine.
Since generate_samples() only yields 8192 samples, train_batches_stream only yields 32 batches of 256 samples each, so that you can only train for at most 32 steps. However, you ask for 2000 steps.
Related
I have built a random forest regressor to predict the elasticity of a certain object based on color, material, size and other features.
The model works fine and I can predict the expected elasticity given certain inputs.
Eventually, I want to be able to find the lowest elasticity with certain constraints. The inputs have limited possibilities, i.e., material can only be plastic or textile.
I would like to have a smart solution in which I don't have to brute force and try all the possible combinations and find the one with lowest elasticity. I have found that surrogate models can be used for this but I don't understand how to apply this concept to my problem. For example, what is the objective function I should optimize in my case? I thought of passing the .predict() of the random forest but I'm not sure this is the correct way.
To summarize, I'd like to have a solution that given certain conditions, tells me what should be the best set of features to have lowest elasticity. Example, I'm looking for the lowest elasticity when the object is made of plastic --> I'd like to receive the set of other features that tells me how to get lowest elasticity in that case. Or simply, what feature I should tune to improve the performance
import numpy
from scipy.optimize import minimize
import random
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, random_state=0)
model.fit(X_train,y_train)
material = [0,1]
size= list(range(1, 45))
color= list(range(1, 500))
def objective(x):
material= x[0]
size = x[1]
color = x[2]
return model.predict([[material,size,color]])
# initial guesses
n = 3
x0 = np.zeros(n)
x0[0] = random.choice(material)
x0[1] = random.choice(size)
x0[2] = random.choice(color)
# optimize
b = (None,None)
bnds = (b, b, b, b, b)
solution = minimize(objective, x0, method='nelder-mead',
options={'xtol': 1e-8, 'disp': True})
x = solution.x
print('Final Objective: ' + str(objective(x)))
This is one solution if I understood you correctly,
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from scipy.optimize import differential_evolution
model = None
def objective(x):
material= x[0]
size = x[1]
color = x[2]
return model.predict([[material,size,color]])
# define input data
material = np.random.choice([0,1], 10); material = np.expand_dims(material, 1)
size = np.arange(10); size = np.expand_dims(size, 1)
color = np.arange(20, 30); color = np.expand_dims(color, 1)
input = np.concatenate((material, size, color), 1) # shape = (10, 3)
# define output = elasticity between [0, 1] i.e. 0-100%
elasticity = np.array([0.51135295, 0.54830051, 0.42198349, 0.72614775, 0.55087905,
0.99819945, 0.3175208 , 0.78232872, 0.11621277, 0.32219236])
# model and minimize
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(input, elasticity)
limits = ((0, 1), (0, 10), (20, 30))
res = differential_evolution(objective, limits, maxiter = 10000, seed = 11111)
min_y = model.predict([res.x])[0]
print("min_elasticity ==", round(min_y, 5))
The output is minimal elasticity based on the limits
min_elasticity == 0.19029
These are random data so the RandomForestRegressor doesn't do the best job perhaps
I am using pytorch-1.5 to do some gan test. My code is very simple gan code which just fit the sin(x) function:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
# Hyper Parameters
BATCH_SIZE = 64
LR_G = 0.0001
LR_D = 0.0001
N_IDEAS = 5
ART_COMPONENTS = 15
PAINT_POINTS = np.vstack([np.linspace(-1, 1, ART_COMPONENTS) for _ in range(BATCH_SIZE)])
def artist_works(): # painting from the famous artist (real target)
r = 0.02 * np.random.randn(1, ART_COMPONENTS)
paintings = np.sin(PAINT_POINTS * np.pi) + r
paintings = torch.from_numpy(paintings).float()
return paintings
G = nn.Sequential( # Generator
nn.Linear(N_IDEAS, 128), # random ideas (could from normal distribution)
nn.ReLU(),
nn.Linear(128, ART_COMPONENTS), # making a painting from these random ideas
)
D = nn.Sequential( # Discriminator
nn.Linear(ART_COMPONENTS, 128), # receive art work either from the famous artist or a newbie like G
nn.ReLU(),
nn.Linear(128, 1),
nn.Sigmoid(), # tell the probability that the art work is made by artist
)
opt_D = torch.optim.Adam(D.parameters(), lr=LR_D)
opt_G = torch.optim.Adam(G.parameters(), lr=LR_G)
for step in range(10000):
artist_paintings = artist_works() # real painting from artist
G_ideas = torch.randn(BATCH_SIZE, N_IDEAS) # random ideas
G_paintings = G(G_ideas) # fake painting from G (random ideas)
prob_artist0 = D(artist_paintings) # D try to increase this prob
prob_artist1 = D(G_paintings) # D try to reduce this prob
D_loss = - torch.mean(torch.log(prob_artist0) + torch.log(1. - prob_artist1))
G_loss = torch.mean(torch.log(1. - prob_artist1))
opt_D.zero_grad()
D_loss.backward(retain_graph=True) # reusing computational graph
opt_D.step()
opt_G.zero_grad()
G_loss.backward()
opt_G.step()
But when i runing it got this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 1]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Is there something wrong with my code?
This happens because the opt_D.step() modifies the parameters of your discriminator inplace. But these parameters are required to compute the gradient for the generator. You can fix this by changing your code to:
for step in range(10000):
artist_paintings = artist_works() # real painting from artist
G_ideas = torch.randn(BATCH_SIZE, N_IDEAS) # random ideas
G_paintings = G(G_ideas) # fake painting from G (random ideas)
prob_artist1 = D(G_paintings) # G tries to fool D
G_loss = torch.mean(torch.log(1. - prob_artist1))
opt_G.zero_grad()
G_loss.backward()
opt_G.step()
prob_artist0 = D(artist_paintings) # D try to increase this prob
# detach here to make sure we don't backprop in G that was already changed.
prob_artist1 = D(G_paintings.detach()) # D try to reduce this prob
D_loss = - torch.mean(torch.log(prob_artist0) + torch.log(1. - prob_artist1))
opt_D.zero_grad()
D_loss.backward(retain_graph=True) # reusing computational graph
opt_D.step()
You can find more about this issue here https://github.com/pytorch/pytorch/issues/39141
A general reason why it works on 1.4 but give an error in 1.5 is because 'Before 1.5, these tests were not working properly for the optimizers. That’s why you didn’t see any error. But the computed gradients were not correct.'
You can check this link for more discussion about the influence of version: https://discuss.pytorch.org/t/solved-pytorch1-5-runtimeerror-one-of-the-variables-needed-for-gradient-computation-has-been-modified-by-an-inplace-operation/90256/4
I had an similar issue and I solved it by changing my torch version to 1.4 and it worked for me
I have created the following Python program, which, as far as I understand CTC, should be a valid CTC-based model, as well as training data. The best documentation I can find is CNTK_208_Speech_CTC Tutorial, which is what I've based this on. The program is as simple as I could make it, and it relies only on numpy and CNTK, and generates data itself.
When I run this, I get the following error:
Validating --> ForwardBackward2850 = ForwardBackward (LabelsToGraph2847, StableSigmoid2703) : [5 x labelAxis1], [5 x inputAxis1] -> []
RuntimeError: The Matrix dimension in the ForwardBackwardNode operation does not match.
This seems to be the same issue from this ticket: https://github.com/Microsoft/CNTK/issues/2156
Here is the Python program:
# cntk_ctc_hello_world.py
#
# This is a "hello world" example of using CTC (Connectionist Temporal Classification) with CNTK.
#
# The input is a sequence of vectors of size 17. We use 17 because it's easy to spot that number in
# error messages. The output is a string of codes, each code being one of 4 possible characters from
# our alphabet that we'll refer to here as "ABCD", although they're actually just represented
# by the numbers 0..3, which is typical for classification systems. To make the setup of training data
# trivial, we assign the first four elements of our 17-dimension input vector to the four characters
# of our alphabet, so that the matching is:
# 10000000000000000 A
# 01000000000000000 B
# 00100000000000000 C
# 00010000000000000 D
# In our input sequences, we repeat each code three to five times, followed by three to five codes
# containing random noise. Whether it's repeated 3,4, or 5 times, is random for each code and each
# spacer. When we emit one of our codes, we fill the first 4 values with the code, and the remaining
# 13 values with random noise.
# For example:
# Input: AAA-----CCCC---DDDDD
# Output: ACD
import cntk as C
import numpy as np
import random
import sys
InputDim = 17
NumClasses = 4 # A,B,C,D
MinibatchSize = 100
MinibatchPerEpoch = 50
NumEpochs = 10
MaxOutputSeqLen = 10 # ABCDABCDAB
inputAxis = C.Axis.new_unique_dynamic_axis('inputAxis')
labelAxis = C.Axis.new_unique_dynamic_axis('labelAxis')
inputVar = C.sequence.input_variable((InputDim), sequence_axis=inputAxis, name="input")
labelVar = C.sequence.input_variable((NumClasses+1), sequence_axis=labelAxis, name="labels")
# Construct an LSTM-based model that will perform the classification
with C.default_options(activation=C.sigmoid):
classifier = C.layers.Sequential([
C.layers.For(range(3), lambda: C.layers.Recurrence(C.layers.LSTM(128))),
C.layers.Dense(NumClasses + 1)
])(inputVar)
criteria = C.forward_backward(C.labels_to_graph(labelVar), classifier, blankTokenId=NumClasses, delayConstraint=3)
err = C.edit_distance_error(classifier, labelVar, squashInputs=True, tokensToIgnore=[NumClasses])
lr = C.learning_rate_schedule([(3, .01), (1,.001)], C.UnitType.sample)
mm = C.momentum_schedule([(1000, 0.9), (0, 0.99)], MinibatchSize)
learner = C.momentum_sgd(classifier.parameters, lr, mm)
trainer = C.Trainer(classifier, (criteria, err), learner)
# Return a numpy array of 17 elements, for this code
def make_code(code):
a = np.zeros(NumClasses) # 0,0,0,0
v = np.random.rand(InputDim - NumClasses) # 13x random
a = np.concatenate((a, v))
a[code] = 1
return a
def make_noise_code():
return np.random.rand(InputDim)
def make_onehot(code):
v = np.zeros(NumClasses+1)
v[code] = 1
return v
def gen_batch():
x_batch = []
y_batch = []
for mb in range(MinibatchSize):
yLen = random.randint(1, MaxOutputSeqLen)
x = []
y = []
for i in range(yLen):
code = random.randint(0,3)
y.append(make_onehot(code))
xLen = random.randint(3,5) # Input is 3 to 5 repetitions of the code
for j in range(xLen):
x.append(make_code(code))
spacerLen = random.randint(3,5) # Spacer is 3 to 5 repetitions of noise
for j in range(spacerLen):
x.append(make_noise_code())
x_batch.append(np.array(x, dtype='float32'))
y_batch.append(np.array(y, dtype='float32'))
return x_batch, y_batch
#######################################################################################
# Dump first X/Y training pair from minibatch
#x, y = gen_batch()
#print("\nx sequence of first sample of minibatch:\n", x[0])
#print("\ny sequence of first sample of minibatch:\n", y[0])
#######################################################################################
progress_printer = C.logging.progress_print.ProgressPrinter(tag='Training', num_epochs=NumEpochs)
for epoch in range(NumEpochs):
for mb in range(MinibatchPerEpoch):
x_batch, y_batch = gen_batch()
trainer.train_minibatch({inputVar: x_batch, labelVar: y_batch})
progress_printer.epoch_summary(with_metric=True)
For those who are facing this error, there are two points to take note of:
(1) The data provided to labels sequence tensor to labels_to_graph must have the same sequence length as the data coming out from network output at runtime.
(2) If during the model building you change the dynamic sequence axis of input sequence tensor (e.g. stride in sequential axis), then you must call reconcile_dynamic_axes on your labels sequence tensor with the network_output as the second argument to the function. This tells CNTK that labels have the same dynamic axis as the network.
Adhering to these 2 points will allow forward_backward to run.
While attempting to replicate the section 3.1 in Incorporating Discrete Translation Lexicons into Neural MT in paddle-paddle
I tried to have a static matrix that I'll need to load into the seqToseq training pipeline, e.g.:
>>> import numpy as np
>>> x = np.random.rand(3,2)
>>> x
array([[ 0.64077103, 0.03278357],
[ 0.47133411, 0.16309775],
[ 0.63986919, 0.07130613]])
# where there is 3 target words and 2 source words,
# and each cell in the matrix represents some co-occurrence probabilities.
With the seqToseq_net demo, this matrix would need to be multiplied to the attention layer output in the gru_decoder_with_attention. The original demo:
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
# This attention context layer would have been
# a vector of size |src_vocab| x 1
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with mixed_layer(size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation()) as out:
out += full_matrix_projection(input=gru_step)
return out
The goal is to affect the attention layer by multiplying it with the static matrix:
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
# This attention context layer would have been
# of size |src_vocab| x 1
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem, )
# This static matrix layer, x, would have been
# of size |trg_vocab| x |src_vocab|
static_matrix = some_sort_of_layer(x)
# This should yield a vector of size
# |trg_vocab| x 1
static_matrix_multiply_context = some_sort_of_operation_layer( static_matrix, context)
with mixed_layer(size=decoder_size * 3) as decoder_inputs:
#
decoder_inputs += full_matrix_projection(input= static_matrix_multiply_context)
decoder_inputs += full_matrix_projection(input=current_word)
I've tried looking through the code in Paddle/python/trainer_config_helps and walked-through all the demo code and I've also asked on PaddlePaddle's gitter. But I can't find how can I load a customized static matrix that doesn't need to be updated in the training process and interact with one of Paddle's layer.
How to load a matrix to change the attention layer in seqToseq demo?
What should some_sort_of_layer and some_sort_of_operation_layer be in the above example?
I'm puzzled as to why the in the code below (the section where I labeled "HERE"), would work because j+1 would make the list of list (which is the X_train_folds) go out of range when j reaches the end of the range. Why would this even work? Is it because vstack can automatically detect this change? I couldn't find any documentation for it though.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
X_train_folds = []
y_train_folds = []
################################################################################
# Split up the training data into folds. After splitting, X_train_folds and #
# y_train_folds should each be lists of length num_folds, where #
# y_train_folds[i] is the label vector for the points in X_train_folds[i]. #
# Hint: Look up the numpy array_split function. #
################################################################################
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
# print y_train_folds
# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}
################################################################################
# Perform k-fold cross validation to find the best value of k. For each #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times, #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the accuracies for all fold and all #
# values of k in the k_to_accuracies dictionary. #
################################################################################
for k in k_choices:
k_to_accuracies[k] = []
for k in k_choices:
print 'evaluating k=%d' % k
for j in range(num_folds):
X_train_cv = np.vstack(X_train_folds[0:j]+X_train_folds[j+1:])#<--------------HERE
X_test_cv = X_train_folds[j]
#print len(y_train_folds), y_train_folds[0].shape
y_train_cv = np.hstack(y_train_folds[0:j]+y_train_folds[j+1:]) #<----------------HERE
y_test_cv = y_train_folds[j]
#print 'Training data shape: ', X_train_cv.shape
#print 'Training labels shape: ', y_train_cv.shape
#print 'Test data shape: ', X_test_cv.shape
#print 'Test labels shape: ', y_test_cv.shape
classifier.train(X_train_cv, y_train_cv)
dists_cv = classifier.compute_distances_no_loops(X_test_cv)
#print 'predicting now'
y_test_pred = classifier.predict_labels(dists_cv, k)
num_correct = np.sum(y_test_pred == y_test_cv)
accuracy = float(num_correct) / num_test
k_to_accuracies[k].append(accuracy)
################################################################################
# END OF YOUR CODE #
################################################################################
# Print out the computed accuracies
for k in sorted(k_to_accuracies):
for accuracy in k_to_accuracies[k]:
print 'k = %d, accuracy = %f' % (k, accuracy)
No. vstack is not causing that, but the very powerful indexation of numpy is. The internals of numpy are complex and sometimes it returns a copy, other times a view. In both cases, however, you are launching methods. And this method in particular returns an empty array when indexation is, itself, empty (as outside the space of the array).
See the following example and the consequential outputs (in print):
import numpy as np
a = np.array([1, 2, 3])
print(a[10:]) # This will return empty
print(a[10]) # This is an error
, the result is:
[]
Traceback (most recent call last): File "C:/Users/imactuallyavegetable/temp.py", line 333, in
print(a[10]) IndexError: index 10 is out of bounds for axis 0 with size 3
First an empty array, second the exception.