The accuracy in my code (accu) doesn't work: it stays at 0, even though it should get higher.
The loss function works perfectly fine, but accu doesn't, and I don't know why it doesn't go up.
It doesn't even show an error or anything; it just stays at 0 all the time.
I also need the accu variable to visualize the training results with matplotlib.
What did I do wrong in this code?
import torch
import os
from torchvision import transforms
from PIL import Image
from os import listdir
import random
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from torchsummary import summary
normalize = transforms.Normalize(
mean = [0.485, 0.456, 0.406],
std = [0.229, 0.224, 0.225])
transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(), normalize])
train_data_list = []
target_list = []
train_data = []
waited = False
files = listdir('catsanddogs/train/')
for i in range(len(listdir('catsanddogs/train/'))):
if len(train_data) == 58 and not waited:
waited = True
continue
f = random.choice(files)
files.remove(f)
img = Image.open("catsanddogs/train/" + f)
img_tensor = transform(img)
train_data_list.append(img_tensor)
isSomething = 0
isCat = 1 if 'cat' in f else 0
isDog = 1 if 'dog' in f else 0
if isDog == 0 and isCat == 0:
isSomething = 2
target = [isCat, isDog, isSomething] #, isSomthing
target_list.append(target)
if len(train_data_list) >= 256:
train_data.append((torch.stack(train_data_list), target_list))
train_data_list = []
target_list = []
print('Loaded batch ', len(train_data), 'of ', int(len(listdir('catsanddogs/train/')) / 64))
print('Percentage Done: ', 100 * len(train_data) / int(len(listdir('catsanddogs/train/')) / 64), '%')
if len(train_data) > 2 :
break
class Netz(nn.Module):
def __init__(self):
super(Netz, self).__init__()
self.conv1 = nn.Conv2d(3, 6, kernel_size=5)
self.conv2 = nn.Conv2d(6, 12, kernel_size=5)
self.conv3 = nn.Conv2d(12, 18, kernel_size=5)
self.conv4 = nn.Conv2d(18, 24, kernel_size=5)
self.fc1 = nn.Linear(3456, 1000)
self.fc2 = nn.Linear(1000, 3)
def forward(self, x):
x = self.conv1(x)
x = F.max_pool2d(x, 2)
x = F.relu(x)
x = self.conv2(x)
x = F.max_pool2d(x, 2)
x = F.relu(x)
x = self.conv3(x)
x = F.max_pool2d(x, 2)
x = F.relu(x)
x = self.conv4(x)
x = F.max_pool2d(x, 2)
x = F.relu(x)
x = x.view(-1, 3456)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return torch.sigmoid(x)
model = Netz()
if os.path.isfile('catdognetz.pt'):
model = torch.load('catdognetz.pt')
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
count_parameters(model)
train_losses = []
train_accu = []
def train(epoch):
print('\nEpoch : %d' % epoch)
model.train()
running_loss = 0
correct = 0
total = 0
for data, target in tqdm(train_data):
target = torch.Tensor(target)
data = Variable(data)
target = Variable(target)
inputs, labels = data[0], data[1]
optimizer.zero_grad()
out = model(data)
criterion = F.binary_cross_entropy
loss = criterion(out, target)
loss.backward()
optimizer.step()
running_loss += loss.item()
_, predicted = out.max(1)
total += target.size(0)
correct += predicted.eq(labels).sum().item()
train_loss = running_loss / len(train_data)
accu = 100. * correct / total
train_accu.append(accu)
train_losses.append(train_loss)
print('Train Loss: %.3f | Accuracy: %.3f' % (train_loss, accu))
You should delete this line, because I do not think it is needed:
inputs, labels = data[0], data[1]
Where did you even use inputs? Your ground-truth (actual) classes are the targets, so you should compare the predicted classes against that same variable. Replace labels with target in this line:
correct += predicted.eq(target).sum().item()
Besides all these steps, you should print your targets to make sure they are configured as they should be.
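Since target in this training loop is built from one-hot style lists ([isCat, isDog, isSomething]), here is a minimal sketch of the accuracy bookkeeping under that assumption; the argmax on the target (my addition) recovers a class index from the one-hot row so it is comparable with predicted:
out = model(data)                                  # shape [batch, 3]
_, predicted = out.max(1)                          # predicted class index per sample
true_class = target.argmax(1)                      # class index recovered from the one-hot target
total += target.size(0)
correct += predicted.eq(true_class).sum().item()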
Related
I'm currently switching from TensorFlow to PyTorch and facing the warning UserWarning: Using a target size (torch.Size([400])) that is different to the input size (torch.Size([400, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
I came across the suggestion that unsqueeze(1) on my target could resolve the problem; however, doing so creates problems for the multi-target setup because of the shape my classification loss (cross-entropy) expects.
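To illustrate what I mean, the mismatch in isolation looks roughly like this (toy tensors; I assume MSELoss for the regression targets and CrossEntropyLoss for the class target):
import torch
import torch.nn as nn

pred_reg = torch.randn(400, 1)                               # regression head output, shape [400, 1]
target_reg = torch.rand(400)                                 # target of shape [400] triggers the broadcast warning
loss_reg = nn.MSELoss()(pred_reg, target_reg.unsqueeze(1))   # unsqueeze only the regression target

pred_clf = torch.randn(400, 2)                               # classification logits, shape [400, 2]
target_clf = torch.randint(0, 2, (400,))                     # CrossEntropyLoss wants 1-D class indices, no unsqueeze
loss_clf = nn.CrossEntropyLoss()(pred_clf, target_clf)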
Here is a minimal example to my code:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
X1 = torch.randn(400, 1, 9999)
X2 = torch.randn((400,1, 9999))
aux1 = torch.randn(400,1)
aux2 = torch.randn(400,1)
aux3 = torch.randn(400,1)
y1 = torch.rand(400,)
y2 = torch.rand(400,)
y3 = torch.rand(400,)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
# In[18]:
class MultiTaskDataset:
def __init__(self,
amplitude,
phase,
weight,
temperature,
humidity,
shelf_life_clf,
shelf_life_pred,
thickness_pred
):
self.amplitude = amplitude
self.phase = phase
self.weight = weight
self.temperature = temperature
self.humidity = humidity
self.shelf_life_clf = shelf_life_clf
self.shelf_life_pred = shelf_life_pred
self.thickness_pred = thickness_pred
def __len__(self):
return self.amplitude.shape[0]
def __getitem__(self, idx):
#inputs
amplitude = self.amplitude[idx]
phase = self.phase[idx]
weight = self.weight[idx]
temperature = self.temperature[idx]
humidity = self.humidity[idx]
#outputs
shelf_life_clf = self.shelf_life_clf[idx]
shelf_life_reg = self.shelf_life_pred[idx]
thickness_pred = self.thickness_pred[idx]
return ([torch.tensor(amplitude, dtype=torch.float32),
torch.tensor(phase, dtype=torch.float32),
torch.tensor(weight, dtype=torch.float32),
torch.tensor(temperature, dtype=torch.float32),
torch.tensor(humidity, dtype=torch.float32)],
[torch.tensor(shelf_life_clf, dtype=torch.long),
torch.tensor(shelf_life_reg, dtype=torch.float32),
torch.tensor(thickness_pred, dtype=torch.float32)])
# In[19]:
# train loader
dataset = MultiTaskDataset(X1, X2, aux1, aux2, aux3,
y1,y2,y3)
train_loader = DataLoader(dataset, batch_size=512, shuffle=True, num_workers=0)
# test loader
# In[20]:
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.features_amp = nn.Sequential(
nn.LazyConv1d(1, 3, 1),
)
self.features_phase = nn.Sequential(
nn.LazyConv1d(1, 3, 1),
)
self.backbone1 = nn.Sequential(
nn.LazyConv1d(64,3,1),
nn.LazyConv1d(64,3,1),
nn.AvgPool1d(3),
nn.Dropout(0.25),
)
self.backbone2 = nn.Sequential(
nn.Conv1d(64, 32,3,1),
nn.Conv1d(32, 32,3,1),
nn.AvgPool1d(3),
nn.Dropout(0.25),
)
self.backbone3 = nn.Sequential(
nn.Conv1d(32, 16,3,1),
nn.Conv1d(16, 16,3,1),
nn.AvgPool1d(3),
nn.Dropout(0.25),
)
self.classifier = nn.LazyLinear(2)
self.shelf_life_reg = nn.LazyLinear(1)
self.thickness_reg = nn.LazyLinear(1)
def forward(self, x1, x2, aux1, aux2, aux3):
x1 = self.features_amp(x1)
x2 = self.features_phase(x2)
x1 = x1.view(x1.size(0),-1)
x2 = x2.view(x2.size(0),-1)
x = torch.cat((x1, x2), dim=-1)
print(x.size())
x = x.unsqueeze(1)
print(x.size())
x = self.backbone1(x)
print(x.size())
x = torch.flatten(x, start_dim=1, end_dim=-1)
x = torch.cat([x, aux1, aux2, aux3], dim=-1)
shelf_life_clf = self.classifier(x)
shelf_life_reg = self.shelf_life_reg(x)
thickness_reg = self.thickness_reg(x)
return (shelf_life_clf,
shelf_life_reg,
thickness_reg)
model = MyModel()
optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.MSELoss()
criterion3 = nn.MSELoss()
# In[21]:
def train(epoch):
model.train()
#exp_lr_scheduler.step()
arr_loss = []
#first_batch = next(iter(train_loader))
for batch_idx, (data, target) in enumerate(train_loader):
#amp, phase = data
clf, reg1, reg2 = target
#print(amp.shape, phase.shape)
#print(target[2].shape)
if torch.cuda.is_available():
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data = [data[i].cuda() for i in range(len(data))]
target = [target[i].cuda() for i in range(len(target))]
model.to(device)
optimizer.zero_grad()
output1, output2, output3 = model(*data)
#losses
loss = criterion1(output1, target[0].long())
loss1 = criterion2(output2, target[1].float())
loss2 = criterion3(output3, target[2].float())
loss = loss + loss1 + loss2
#metrices
loss.backward()
optimizer.step()
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, (batch_idx + 1) * len(data), len(train_loader.dataset),
100. * (batch_idx + 1) / len(train_loader), loss.data))
arr_loss.append(loss.data)
return arr_loss
def averaged_accuracy(outputs, targets):
    assert len(outputs) == len(targets), "number of outputs should equal the number of targets"
    accuracy = []
    for i in range(len(outputs)):
        correct = 0
        total = 0
        _, predicted = torch.max(outputs[i].data, 1)
        total += targets[i].size(0)
        correct += (predicted == targets[i]).sum().item()
        acc = correct / total * 100
        accuracy.append(acc)
    return sum(accuracy) / len(accuracy)
# In[22]:
optimizer = optim.Adam(model.parameters(), lr=0.00003)
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.MSELoss()
criterion3 = nn.MSELoss()
n_epochs = 10
for epoch in range(n_epochs):
train(epoch)
Can anybody provide guidance to resolve this problem?
In pytorch, self-made dataset and testing dataset seem to exhaust all RAM
I am new to pytorch and I wrote a ResNet program in pytorch on MNIST for an experiment.
If I use the data loader as below, it is fine:
import torch as pt
from torch.utils.data import DataLoader, TensorDataset
import torchvision as ptv
mnist_train = ptv.datasets.MNIST(ROOT_DIR,
train=True,
transform=ptv.transforms.ToTensor(),
download=False)
dl = pt.utils.data.DataLoader(dataset=mnist_train,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True)
If I use a self-made dataset as below, in order to use a validation set at each iteration, the program exhausts all my RAM. The testing set is not used in each iteration; it is only used at the end to evaluate the model.
mnist_test = ptv.datasets.MNIST(ROOT_DIR,
train=False,
transform=ptv.transforms.ToTensor(),
download=False)
M_TEST, PIC_H, PIC_W = mnist_test.data.shape
x_test = mnist_test.data.double() / 255.
y_test = mnist_test.targets
a = pt.randperm(M_TEST) # ATTENTION pt.randperm
x_test = x_test[a]
y_test = y_test[a]
VAL_RATE = 0.1
M_VAL = int(np.ceil(M_TEST * VAL_RATE))
M_TEST -= M_VAL
x_test, x_val = pt.split(x_test, (M_TEST, M_VAL))
y_test, y_val = pt.split(y_test, (M_TEST, M_VAL))
x_test = x_test.view(-1, 1, PIC_H, PIC_W).double()
x_val = x_val.view(-1, 1, PIC_H, PIC_W).double()
dl_test = DataLoader(TensorDataset(x_test, y_test),
batch_size=BATCH_SIZE)
def acc(ht, yt):
return (pt.argmax(ht, 1) == yt.long()).double().mean()
# in iteration:
for epoch in range(N_EPOCHS):
for i, (bx, by) in enumerate(dl):
model.train(True)
optim.zero_grad()
bx = bx.view(-1, 1, PIC_H, PIC_W).double()
ht = model(bx)
cost = criterion(ht, by)
cost.backward()
optim.step()
model.train(False)
accv = acc(ht, by)
ht_val = model(x_val)
val_cost = criterion(ht_val, y_val)
val_acc = acc(ht_val, y_val)
So I suspected that only ptv.datasets.MNIST together with pt.utils.data.DataLoader works properly, and I removed the use of my self-made validation set at each iteration; RAM usage was normal after that removal. But the test stage still exhausts all my RAM even though I only use ptv.datasets.MNIST and pt.utils.data.DataLoader, as below (a no_grad sketch is appended after the update at the end of this post):
mnist_test = ptv.datasets.MNIST(ROOT_DIR,
train=False,
transform=ptv.transforms.ToTensor(),
download=False)
dl_test = pt.utils.data.DataLoader(dataset=mnist_test,
batch_size=BATCH_SIZE,
shuffle=False,
drop_last=True)
test_cost_avg = 0.
test_acc_avg = 0.
GROUP = int(np.ceil(M_TEST / BATCH_SIZE / 10))
for i, (bx, by) in enumerate(dl_test):
bx = bx.view(-1, 1, PIC_H, PIC_W).double()
ht = model(bx)
test_cost_avg += criterion(ht, by)
test_acc_avg += acc(ht, by)
if i % GROUP == 0:
print(f'Testing # {i + 1}')
if i % GROUP != 0:
print(f'Testing # {i + 1}')
test_cost_avg /= i + 1
test_acc_avg /= i + 1
print(f'Tested: cost = {test_cost_avg}, acc = {test_acc_avg}')
print('Over')
Please give me some help. Thanks a lot!
Update:
I suspect there is something wrong with my model, because a simple CNN model on a self-made dataset built from torchvision's MNIST does not have this RAM exhaustion problem. So I paste my model below, FYI:
def my_conv(in_side, in_ch, out_ch, kernel, stride, padding='same'):
if 'same' == padding:
ps = kernel - 1
padding = ps // 2
else:
padding = 0
print(padding) # tmp
return pt.nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=stride, padding=padding)
class MyResnetBlock(pt.nn.Module):
def __init__(self, residual, in_side, in_ch, out_ch, kernel=3, stride=1, **kwargs):
super().__init__(**kwargs)
self.residual = residual
self.in_side = in_side
self.in_ch = in_ch
self.out_ch = out_ch
self.kernel = kernel
self.stride = stride
self.conv1 = my_conv(in_side, in_ch, out_ch, kernel, stride)
self.bn1 = pt.nn.BatchNorm2d(out_ch)
self.relu1 = pt.nn.ReLU()
self.conv2 = my_conv(np.ceil(in_side / stride), out_ch, out_ch, kernel, 1)
self.bn2 = pt.nn.BatchNorm2d(out_ch)
self.relu2 = pt.nn.ReLU()
if residual:
self.conv_down = my_conv(in_side, in_ch, out_ch, kernel, stride)
def forward(self, input):
x = input
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.conv2(x)
x = self.bn2(x)
if self.residual:
res = self.conv_down(input)
else:
res = input
x += res
x = self.relu2(x)
return x
class MyResnetByPt(pt.nn.Module):
def __init__(self, blocks_spec_list, in_side, init_in_ch, init_out_ch, **kwargs):
super().__init__(**kwargs)
self.conv1 = my_conv(in_side, init_in_ch, init_out_ch, 3, 1)
in_ch = out_ch = init_out_ch
blocks = []
for block_id, n_blocks in enumerate(blocks_spec_list):
for layer_id in range(n_blocks):
if layer_id == 0:
if block_id != 0:
out_ch *= 2
block = MyResnetBlock(True, in_side, in_ch, out_ch, 3, 2)
in_ch = out_ch
in_side = int(np.ceil(in_side / 2))
else:
block = MyResnetBlock(False, in_side, in_ch, out_ch, 3, 1)
blocks.append(block)
self.blocks = pt.nn.Sequential(*blocks)
self.final_ch = out_ch
self.avg_pool = pt.nn.AvgPool2d(kernel_size=(in_side, in_side),
stride=(1, 1),
padding=(0, 0))
self.fc = pt.nn.Linear(out_ch, N_CLS)
def forward(self, input):
x = input
x = self.conv1(x)
x = self.blocks(x)
x = self.avg_pool(x)
x = x.view(-1, self.final_ch)
x = self.fc(x)
return x
model = MyResnetByPt([2, 2, 2, 2], PIC_H, 1, 16)
model = model.double()
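One thing worth checking in the loops above (an assumption based on the posted code, not a confirmed diagnosis): the validation and test forward passes run outside torch.no_grad(), and test_cost_avg += criterion(ht, by) accumulates loss tensors that keep their autograd graphs alive, which grows memory across iterations. A minimal sketch of the test loop with those two points changed:
test_cost_avg = 0.
test_acc_avg = 0.
model.train(False)
with pt.no_grad():                                    # no autograd graph is built during evaluation
    for i, (bx, by) in enumerate(dl_test):
        bx = bx.view(-1, 1, PIC_H, PIC_W).double()
        ht = model(bx)
        test_cost_avg += criterion(ht, by).item()     # .item() keeps a plain float, not a graph-holding tensor
        test_acc_avg += acc(ht, by).item()
test_cost_avg /= i + 1
test_acc_avg /= i + 1
print(f'Tested: cost = {test_cost_avg}, acc = {test_acc_avg}')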
I'm building a simple image-recognition convolutional neural network and trying to run it on my GPU, but apparently I've missed something important.
I check whether the GPU is available at the beginning and, in train, send the batches to the device (cuda:0).
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Checks if GPU is available otherwise uses CPU
if torch.cuda.is_available():
device = torch.device("cuda:0")
print("Running on the GPU!")
else:
device = torch.device("cpu")
print("Running on the CPU!")
REBUILD_DATA = False
# Data clean up and format
class DogsVsCats():
IMG_SIZE = 50
CATS = "PetImages/Cat"
DOGS = "PetImages/Dog"
LABELS = {CATS: 0, DOGS: 1}
training_data = []
catcount = 0
dogcount = 0
def make_training_data(self):
for label in self.LABELS:
print(label)
for f in tqdm(os.listdir(label)):
try:
path = os.path.join(label, f)
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (self.IMG_SIZE, self.IMG_SIZE))
self.training_data.append([np.array(img), np.eye(2)[self.LABELS[label]] ])
if label == self.CATS:
self.catcount += 1
elif label == self.DOGS:
self.dogcount += 1
except Exception as e:
pass
np.random.shuffle(self.training_data)
np.save("training_data.npy", self.training_data)
print("Cats: ", self.catcount)
print("Dogs: ", self.dogcount)
if REBUILD_DATA:
dogsvcats = DogsVsCats()
dogsvcats.make_training_data()
training_data = np.load("training_data.npy", allow_pickle=True)
# print(len("training_data.npy"))
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 5)
self.conv2 = nn.Conv2d(32, 64, 5)
self.conv3 = nn.Conv2d(64, 128, 5)
x = torch.randn(50,50).view(-1,1,50,50)
self._to_linear = None
self.convs(x)
self.fc1 = nn.Linear(self._to_linear, 512)
self.fc2 = nn.Linear(512, 2)
def convs(self, x):
x = F.max_pool2d(F.relu(self.conv1(x)), (2,2))
x = F.max_pool2d(F.relu(self.conv2(x)), (2,2))
x = F.max_pool2d(F.relu(self.conv3(x)), (2,2))
print(x[0].shape)
if self._to_linear is None:
self._to_linear = x[0].shape[0]*x[0].shape[1]*x[0].shape[2]
return x
def forward(self, x):
x = self.convs(x)
x = x.view(-1, self._to_linear)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.softmax(x, dim = 1)
net = Net().to(device)
optimizer = optim.Adam(net.parameters(), lr = 0.001)
loss_function = nn.MSELoss()
X = torch.Tensor([i[0] for i in training_data]).view(-1, 50, 50)
X = X/255.0
y = torch.Tensor([i[1] for i in training_data])
VAL_PCT = 0.1
val_size = int(len(X)*VAL_PCT)
print(val_size)
train_X = X[:-val_size]
train_y = y[:-val_size]
test_X = X[-val_size:]
test_y = y[-val_size:]
BATCH_SIZE = 100
EPOCHS = 1
def train(net):
for epoch in range(EPOCHS):
for i in tqdm(range(0, len(train_X), BATCH_SIZE)):
# print(i, i+BATCH_SIZE)
batch_X = train_X[i:i+BATCH_SIZE].view(-1,1,50,50).to(device)
batch_y = train_y[i:i+BATCH_SIZE].to(device)
net.zero_grad()
outputs = net(batch_X)
loss = loss_function(outputs, batch_y)
loss.backward()
optimizer.step()
print(loss)
correct = 0
total = 0
with torch.no_grad():
for i in tqdm(range(len(test_X))):
real_class = torch.argmax(test_y[i])
net_out = net(test_X[i].view(-1, 1, 50, 50))[0]
predicted_class = torch.argmax(net_out)
if predicted_class == real_class:
correct += 1
total += 1
print("Accuracy: ", round(correct/total,3))
train(net)
Sorry if it's too simple of a question. Thank you in advance!
You should post the line number of the error, but I'm thinking it's from this snippet:
with torch.no_grad():
for i in tqdm(range(len(test_X))):
real_class = torch.argmax(test_y[i])
net_out = net(test_X[i].view(-1, 1, 50, 50))[0]
predicted_class = torch.argmax(net_out)
if predicted_class == real_class:
correct += 1
total += 1
The input to your net must also be put on the device, so maybe change the line
net_out = net(test_X[i].view(-1, 1, 50, 50))[0]
to
net_out = net(test_X[i].view(-1, 1, 50, 50).to(device))[0]
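Put differently, a sketch of how that evaluation loop could look with the input moved to the device (assuming test_X and test_y stay on the CPU, as in the post):
correct = 0
total = 0
with torch.no_grad():
    for i in range(len(test_X)):
        real_class = torch.argmax(test_y[i])
        # move the input batch onto the same device as the network
        net_out = net(test_X[i].view(-1, 1, 50, 50).to(device))[0]
        # bring the prediction back to the CPU before comparing with the CPU label
        predicted_class = torch.argmax(net_out).cpu()
        if predicted_class == real_class:
            correct += 1
        total += 1
print("Accuracy: ", round(correct / total, 3))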
I want to use a deep neural network to classify a hyperspectral image. But every time I run this code, it gives me the error "TypeError: forward() missing 1 required positional argument: 'negative'".
The code is shown below (not complete):
import numpy as np
import scipy.io as sio
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
REBUILD_DATA = True
To read the data:
class DATA():
# Read the samples and labels, and convert them to numpy arrays
Pavia = sio.loadmat('G:\研究生\Matlab_code\dataset\Classification\paviaU.mat')
PaviaGT = sio.loadmat('G:\研究生\Matlab_code\dataset\Classification\paviaU_GT.mat')
# print(sorted(Pavia.keys()))  # returns the keys of the dictionary
# print(sorted(PaviaGT.keys()))
Sample = Pavia['data']
Sample = np.array(Sample, dtype = np.int32)
Label = PaviaGT['groundT']
Label = np.array(Label, dtype = np.int32)
# Store the size of each dimension of the samples in a, b, c for later use
[a,b,c]=Sample.shape
# Reshape the data into the same layout as in MATLAB
SampleT = Sample.transpose(1, 0, 2)
SampleX = SampleT.reshape(-1,103)
""" sio.savemat('G:\研究生\Sample.mat',{'dataX':SampleX}) """
LabelT = Label.transpose(1,0)
Label = LabelT.reshape(-1,1)
# How to merge the samples and labels; the data fed into the network has shape [-1, band]
""" sio.savemat('G:\研究生\Label.mat',{'LabelX':Label}) """
totalcount = np.zeros((10,1),dtype = np.int32)
trainset = []
testset = []
# Merge the samples and labels
def integrated_data(self):
rebuilddata = []
for i in range(0,self.a*self.b):
rebuilddata.append([np.array(self.SampleX[i]),np.array(self.Label[i])])
for j in range(0,10):
if self.Label[i] == j:
self.totalcount[j] += 1
rebuilddata = np.array(rebuilddata)
return rebuilddata
# Build the training and test sets
def make_trainset_and_testset(self, rebuilddata, ratio):
TrainIndex = []
TestIndex = []
# Extract the training and test indices for each class
for i in range(1,np.max(self.Label)+1):
class_coor = np.argwhere(self.Label == i)
index = class_coor[:,0].tolist()
np.random.shuffle(index)
VAL_SIZE = int(np.floor(len(index)*ratio))
ClassTrainIndex = index[:VAL_SIZE]
ClassTestIndex = index[-VAL_SIZE:]
TrainIndex += ClassTrainIndex
TestIndex += ClassTestIndex
# Return the training and test samples
TrainSample = rebuilddata[TrainIndex]
TestSample = rebuilddata[TestIndex]
return TrainIndex,TestIndex,TrainSample,TestSample
This is my DNN module:
class DNN(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(103, 500)
self.fc2 = nn.Linear(500, 256)
self.fc3 = nn.Linear(256, 9)
def forward(self,x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return F.softmax(x,dim=1)
The training and testing functions:
def train(dnn):
BATCH_SIZE = 100
EPOCHS = 3
for epoch in range(EPOCHS):
for i in tqdm(range(0, len(train_X), BATCH_SIZE)):
batch_X = train_X[i:i+BATCH_SIZE]
batch_y = train_y[i:i+BATCH_SIZE]
dnn.zero_grad()
outputs = dnn(batch_X)
loss = loss_function(outputs, batch_y)
loss.backward()
optimizer.step()
print(loss)
def test(net):
correct = 0
total = 0
with torch.no_grad():
for i in tqdm(range(len(test_X))):
real_class = torch.argmax(test_y[i]).to(device)
net_out = dnn(test_X[i].view(-1, 1, 50, 50).to(device))[0]
predicted_class = torch.argmax(net_out)
if predicted_class == real_class:
correct += 1
total += 1
print("Accuracy:", round(correct/total,3))
if REBUILD_DATA:
Data = DATA()
datay = Data.integrated_data()
Trainindex, Testindex, TrainSet, TestSet = Data.make_trainset_and_testset(rebuilddata=datay,ratio=0.1)
train_X = torch.Tensor([i[0] for i in TrainSet])
train_y = torch.Tensor([i[1] for i in TrainSet])
train_X = train_X/3000
test_X = torch.Tensor([i[0] for i in TestSet])
test_y = torch.Tensor([i[1] for i in TestSet])
print(train_X[0])
dnn = DNN()
optimizer = optim.SGD(dnn.parameters(), lr = 0.001)
loss_function = nn.TripletMarginLoss()
train(dnn)
You are using nn.TripletMarginLoss() as your loss function.
This specific loss function expects three inputs for computing the loss: anchor, positive and negative.
Your code passes only two arguments.
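For illustration (dummy tensors, not part of the original code): nn.TripletMarginLoss needs three embeddings per call, whereas a 9-class classification problem is usually trained with something like nn.CrossEntropyLoss, which only needs logits and integer class labels (and expects raw logits, so the final F.softmax in forward() would be dropped):
import torch
import torch.nn as nn

# Triplet loss: anchor, positive and negative embeddings
anchor = torch.randn(100, 128)
positive = torch.randn(100, 128)
negative = torch.randn(100, 128)
triplet = nn.TripletMarginLoss()(anchor, positive, negative)

# Cross-entropy for a 9-class classifier: logits of shape [batch, 9] plus labels of shape [batch]
logits = torch.randn(100, 9)
labels = torch.randint(0, 9, (100,))
ce = nn.CrossEntropyLoss()(logits, labels)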
I have extracted CNN features of size 4096 from a pretrained VGG19. I am then using a shallower architecture to train a classifier with softmax and center losses. Unfortunately, the softmax loss function returns nan. There is a detailed discussion available here; however, I am not able to resolve the problem with clipping because the labels and logits are in two different data formats (int64, float32). Furthermore, I also changed the learning rate but still got the same error.
Can someone please let me know how to resolve this situation? (A sanity-check sketch is appended after the code below.)
from __future__ import division
from __future__ import print_function
import csv
import numpy as np
import tensorflow as tf
from retrieval_model import setup_train_model
FLAGS = None
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def get_name(read_file):
feat_lst = []
identifier_lst = []
with open(read_file, 'r') as csvfile:
read_file = csv.reader(csvfile, delimiter=',')
for row in read_file:
feat = row[:-1]
s_feat = [float(i) for i in feat]
identifier = row[-1]
feat_lst.append(s_feat)
identifier_lst.append(identifier)
return feat_lst, identifier_lst
def get_batch(batch_index, batch_size, labels, f_lst):
start_ind = batch_index * batch_size
end_ind = start_ind + batch_size
return f_lst[start_ind:end_ind], labels[start_ind:end_ind]
def creat_dict(orig_labels):
dict = {}
count = 0
for x in orig_labels:
n_label = dict.get(x, None)
if n_label is None:
dict[x] = count
count += 1
return dict
def main(_):
save_dir = 'model/one-branch-ckpt'
train_file = 'gtrain.csv'
img_feat, img_labels = get_name(train_file)
map_dict = creat_dict(img_labels)
img_labels = [map_dict.get(x) for x in img_labels]
im_feat_dim = 4096
batch_size = 50
max_num_epoch = 10
steps_per_epoch = len(img_feat) // batch_size
num_steps = steps_per_epoch * max_num_epoch
# Setup placeholders for input variables.
im_feat_plh = tf.placeholder(tf.float32, shape=[batch_size, im_feat_dim])
label_plh = tf.placeholder(tf.int64, shape=(batch_size), name='labels')
train_phase_plh = tf.placeholder(tf.bool)
# Setup training operation.
t_l = setup_train_model(im_feat_plh, train_phase_plh, label_plh, classes)
# Setup optimizer.
global_step = tf.Variable(0, trainable=False)
init_learning_rate = 0.0001
learning_rate = tf.train.exponential_decay(init_learning_rate, global_step,
steps_per_epoch, 0.794, staircase=True)
optim = tf.train.AdamOptimizer(init_learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_step = optim.minimize(t_l, global_step=global_step)
# Setup model saver.
saver = tf.train.Saver(save_relative_paths=True,max_to_keep=1)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(num_steps):
im_feats, labels = get_batch(
i % steps_per_epoch, batch_size, img_labels, img_feat)
feed_dict = {
im_feat_plh: im_feats,
label_plh: labels,
train_phase_plh: True,
}
[_, loss_val] = sess.run([train_step, t_l], feed_dict=feed_dict)
if i % 100 == 0:
print('Epoch: %d Step: %d Loss: %f' % (i // steps_per_epoch, i, loss_val))
if i % steps_per_epoch == 0 and i > 0:
print('Saving checkpoint at step %d' % i)
saver.save(sess, save_dir, global_step=global_step)
if __name__ == '__main__':
np.random.seed(0)
tf.set_random_seed(0)
tf.app.run(main=main)
**************************retrieval_model********************************
def setup_train_model(im_feats, train_phase, im_labels, nrof_classes):
alfa = 0.9
# nrof_classes = 28783
i_embed = embedding_model(im_feats, train_phase, im_labels)
c_l = embedding_loss(i_embed, im_labels, alfa, nrof_classes)
loss = softmax_loss(i_embed, im_labels)
total_loss = loss + c_l
return total_loss
def add_fc(inputs, outdim, train_phase, scope_in):
fc = fully_connected(inputs, outdim, activation_fn=None, scope=scope_in + '/fc')
fc_bnorm = tf.layers.batch_normalization(fc, momentum=0.1, epsilon=1e-5,
training=train_phase, name=scope_in + '/bnorm')
fc_relu = tf.nn.relu(fc_bnorm, name=scope_in + '/relu')
fc_out = tf.layers.dropout(fc_relu, seed=0, training=train_phase, name=scope_in + '/dropout')
return fc_out
def embedding_loss(features, label, alfa, nrof_classes):
nrof_features = features.get_shape()[1]
centers = tf.get_variable('centers', [nrof_classes, nrof_features], dtype=tf.float32,
initializer=tf.constant_initializer(0), trainable=False)
label = tf.reshape(label, [-1])
centers_batch = tf.gather(centers, label)
diff = (1 - alfa) * (centers_batch - features)
#centers = tf.scatter_sub(centers, label, diff)
center_loss = tf.reduce_mean(tf.square(features - centers_batch))
#softmax_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=features))
#total_loss = softmax_loss + center_loss
return center_loss
def embedding_model(im_feats, train_phase, im_labels,
fc_dim=2048, embed_dim=512):
# Image branch.
im_fc1 = add_fc(im_feats, fc_dim, train_phase, 'im_embed_1')
im_fc2 = fully_connected(im_fc1, embed_dim, activation_fn=None,
scope='im_embed_2')
return tf.nn.l2_normalize(im_fc2, 1, epsilon=1e-10)
def softmax_loss(feat, im_labels):
label = tf.reshape(im_labels, [-1])
softmax = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=feat))
return softmax
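One sanity check worth adding (my assumption about a likely cause, not a verified fix): tf.nn.sparse_softmax_cross_entropy_with_logits silently produces nan on the GPU when a label falls outside [0, number of logits), and here the logits passed to softmax_loss are the 512-dimensional l2-normalized embedding rather than one score per class. A sketch that projects the embedding to per-class scores and asserts the label range (reusing the fully_connected helper already used in retrieval_model):
def softmax_loss(feat, im_labels, nrof_classes):
    label = tf.reshape(im_labels, [-1])
    # project the embedding onto one score per class so logits and labels agree
    logits = fully_connected(feat, nrof_classes, activation_fn=None, scope='cls_logits')
    # nan shows up if any label is outside [0, nrof_classes); fail loudly instead
    assert_op = tf.assert_less(label, tf.cast(nrof_classes, label.dtype))
    with tf.control_dependencies([assert_op]):
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logits))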