I'm working on implementing a semantic segmentation network in TensorFlow, and I'm trying to figure out how to write out summary images of the labels during training. I want to encode the images in a similar style to the class segmentation annotations used in the Pascal VOC dataset.
For example, let's assume I have a network that trains on a batch size of 1 with 4 classes. The network's final predictions have shape [1, 3, 3, 4].
Essentially I want to take the output predictions and run them through argmax to get a tensor containing the most likely class at each point in the output:
[[[0, 1, 3],
[2, 0, 1],
[3, 1, 2]]]
The annotated images use a color palette of 256 colors to encode labels. I have a tensor containing all the color triples:
[[ 0, 0, 0],
[128, 0, 0],
[ 0, 128, 0],
[128, 128, 0],
[ 0, 0, 128],
...
[224, 224, 192]]
How could I obtain a tensor of shape [1, 3, 3, 3] (a single 3x3 color image) that indexes into the color palette using the values obtained from argmax?
[[palette[0], palette[1], palette[3]],
[palette[2], palette[0], palette[1]],
[palette[3], palette[1], palette[2]]]
I could easily wrap some numpy and PIL code in tf.py_func, but I'm wondering if there is a pure TensorFlow way of obtaining this result.
EDIT:
For those curious, this is the solution I got using just numpy. It works quite well, but I still dislike the use of tf.py_func:
import numpy as np
import tensorflow as tf

def voc_colormap(N=256):
    bitget = lambda val, idx: (val & (1 << idx)) != 0

    cmap = np.zeros((N, 3), dtype=np.uint8)
    for i in range(N):
        r = g = b = 0
        c = i
        for j in range(8):
            r |= bitget(c, 0) << (7 - j)
            g |= bitget(c, 1) << (7 - j)
            b |= bitget(c, 2) << (7 - j)
            c >>= 3
        cmap[i, :] = [r, g, b]
    return cmap

VOC_COLORMAP = voc_colormap()

def grayscale_to_voc(input, name="grayscale_to_voc"):
    return tf.py_func(grayscale_to_voc_impl, [input], tf.uint8, stateful=False, name=name)

def grayscale_to_voc_impl(input):
    # Index the palette with the integer labels and drop singleton dimensions.
    return np.squeeze(VOC_COLORMAP[input])
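For reference, a hypothetical usage sketch (the logits tensor and its shape are assumptions, not part of the code above): colourizing an argmax label map with the wrapper. Note that np.squeeze also drops the batch dimension when it is 1.

labels = tf.argmax(logits, axis=3)    # e.g. shape [1, 3, 3], integer class ids
voc_image = grayscale_to_voc(labels)  # uint8 RGB image built from the palette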
You can use tf.gather_nd(), but you will need to modify the shapes of the palette and logits to obtain the desired image, for example:
import tensorflow as tf
import numpy as np
import PIL.Image as Image

# We can load the palette from some random image in the PASCAL VOC dataset
palette = Image.open('.../VOC2012/SegmentationClass/2007_000032.png').getpalette()

# We build a random logits tensor of the requested size
batch_size = 1
height = width = 3
num_classes = 4

np.random.seed(1234)
logits = np.random.random_sample((batch_size, height, width, num_classes))
logits_argmax = np.argmax(logits, axis=3)  # shape = (1, 3, 3)
# array([[[3, 3, 0],
#         [1, 3, 1],
#         [0, 2, 0]]])

sess = tf.InteractiveSession()
image = tf.gather_nd(
    params=tf.reshape(palette, [-1, 3]),  # reshaped from list to RGB
    indices=tf.reshape(logits_argmax, [batch_size, -1, 1]))
image = tf.cast(tf.reshape(image, [batch_size, height, width, 3]), tf.uint8)

sess.run(image)
# array([[[[128, 128,   0],
#          [128, 128,   0],
#          [  0,   0,   0]],
#
#         [[128,   0,   0],
#          [128, 128,   0],
#          [128,   0,   0]],
#
#         [[  0,   0,   0],
#          [  0, 128,   0],
#          [  0,   0,   0]]]], dtype=uint8)
The resulting tensor can be fed directly to tf.summary.image(), but depending on your implementation you may want to upsample it before writing the summary.
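For instance, a minimal sketch of that last step (assuming the TF1-style summary API; the upsampling factor is a made-up example value):

# Hypothetical upsampling before the summary: nearest-neighbour keeps the
# class colours crisp instead of blurring them together.
upsample_factor = 32  # assumption: depends on your network's output stride
upsampled = tf.image.resize_nearest_neighbor(
    image, [height * upsample_factor, width * upsample_factor])
summary_op = tf.summary.image('predictions', upsampled, max_outputs=1)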
Related
# initialize start token
target = tf.constant([[2]], dtype=tf.int32, shape=[1, 1])  # 2 - <BOS>
dummy_outputs = [4, 7, 1, 9, 15, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  # for testing
self.max_length = 10

def generate_next_token(tar):
    # out, _ = self.model([features, tf.expand_dims(tar, 0)], training=False)  # shape: [1, seq_length, vocab_size]
    # out = out[:, -1, :]  # take last token of sequence. shape: [1, vocab_size]
    # out = tf.random.categorical(out, 1)  # shape: [1, 1]
    # out = tf.cast(out, tf.int32)
    out = tf.constant([[dummy_outputs.pop(0)]], dtype=tf.int32, shape=[1, 1])
    print(tar.shape)
    tar = tf.concat([tar, out], axis=-1)
    return tar

def end_of_sequence_not_reached(tar):
    print(tar.shape)
    return tf.math.logical_and(tf.less(tf.shape(tar)[-1], self.max_length),
                               tf.not_equal(tar[-1], 3))  # 3 - <EOS>

target = tf.while_loop(cond=end_of_sequence_not_reached,
                       body=generate_next_token,
                       loop_vars=[target],
                       shape_invariants=[tf.TensorShape([1, None])])
Somehow, the shape of tar changes from (1, n) to (n,) after every iteration of the tf.while_loop, and dimensions of length 1 are lost, so I have to work around it by doing

if len(tar.shape) < len(prev_shape):
    tar = tf.expand_dims(tar, 0)

Why is this and how can I prevent it?
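One self-contained way to express that workaround (just a sketch, under the assumption that tar should always have shape [1, None]) is to restore the shape at the top of the loop body instead:

def generate_next_token(tar):
    # Hypothetical guard: force tar back to the [1, None] shape promised
    # by the shape invariant before operating on it.
    tar = tf.reshape(tar, [1, -1])
    out = tf.constant([[dummy_outputs.pop(0)]], dtype=tf.int32, shape=[1, 1])
    return tf.concat([tar, out], axis=-1)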
I have an image saved as a numpy array of shape [Height, Width, 3], and I want to replace every pixel with another value based on the pixel's color, so the final array will have shape [Height, Width].
My solution with for loops works, but it's pretty slow. How can I use NumPy vectorization to make it more efficient?
import cv2
import numpy as np

image = cv2.imread("myimage.png")

result = np.zeros(shape=(image.shape[0], image.shape[1]))
for h in range(0, result.shape[0]):
    for w in range(0, result.shape[1]):
        result[h, w] = get_new_value(image[h, w])
Here is the get_new_value function:
def get_new_value(array: np.ndarray) -> int:
    mapping = {
        (0, 0, 0): 0,
        (0, 0, 255): 5,
        (0, 100, 200): 8,
        # ...
    }
    return mapping[tuple(array)]
You can use np.select() as shown below:

import numpy as np

img = np.array(
    [[[123, 123, 123],
      [130, 130, 130]],
     [[128, 128, 128],
      [162, 162, 162]]])

condlist = [img == [123, 123, 123], img == [130, 130, 130],
            img == [129, 129, 129], img == [162, 162, 162]]
choicelist = [0, 5, 8, 9]

img_replaced = np.select(condlist, choicelist)
final = img_replaced[:, :, 0]

print('img_replaced')
print(img_replaced)
print('final')
print(final)
condlist is your list of colour values and choicelist is the list of replacements.
np.select then returns three channels, so you just need to take one of them to get the 2D array 'final', which I believe is the format you want.
output is:
img_replaced
[[[0 0 0]
  [5 5 5]]

 [[0 0 0]
  [9 9 9]]]
final
[[0 5]
 [0 9]]
So code specific to your example and the colour mappings shown would be:

image = cv2.imread("myimage.png")

condlist = [image == [0, 0, 0], image == [0, 0, 255], image == [0, 100, 200]]
choicelist = [0, 5, 8]

img_replaced = np.select(condlist, choicelist)
result = img_replaced[:, :, 0]
I want to ask you about calculating the histogram in Python using OpenCV. I used this code:
hist = cv2.calcHist(im, [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
The result gave me the histogram of each color channel with 8 bins, but what I want to get is:
1st bin (R=0-31, G=0-31, B=0-31),
2nd bin (R=32-63, G=0-31, B=0-31),
and so on,
so I will have 512 bins in total.
From my point of view, your cv2.calcHist call isn't correct:
hist = cv2.calcHist(im, [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
The first parameter should be a list of images:
hist = cv2.calcHist([im], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
Let's see this small example:
import cv2
import numpy as np

# Red-blue square of size [4, 4], i.e. eight red pixels (BGR (0, 0, 255))
# and eight blue pixels (BGR (255, 0, 0)); attention: BGR ordering!
image = np.zeros((4, 4, 3), dtype=np.uint8)
image[:, 0:2, 2] = 255
image[:, 2:4, 0] = 255

# Calculate histogram with two bins [0 - 127] and [128 - 255] per channel;
# the result should be hist["bin 0", "bin 0", "bin 1"] = 8 (red) and
# hist["bin 1", "bin 0", "bin 0"] = 8 (blue)

# Original cv2.calcHist call with two bins [0 - 127] and [128 - 255]
hist = cv2.calcHist(image, [0, 1, 2], None, [2, 2, 2], [0, 256, 0, 256, 0, 256])
print(hist, '\n')  # Not correct

# Correct cv2.calcHist call
hist = cv2.calcHist([image], [0, 1, 2], None, [2, 2, 2], [0, 256, 0, 256, 0, 256])
print(hist, '\n')  # Correct
[[[8. 0.]
  [0. 0.]]

 [[0. 0.]
  [0. 4.]]]

[[[0. 8.]
  [0. 0.]]

 [[8. 0.]
  [0. 0.]]]
As you can see, the counts in your version only sum to 12 in total, whereas there are 16 pixels in the image! Also, it's not clear what "bins" (if any) are represented.
So, with the proper cv2.calcHist call, your general idea/approach is correct! Maybe you just need a little hint on "how to read" the resulting hist:
import cv2
import numpy as np

# Colored rectangle of size [32, 16] with one "color" per bin for eight bins
# per channel, i.e. 512 pixels, such that each of the resulting 512 bins has value 1
x = np.linspace(16, 240, 8, dtype=np.uint8)
image = np.reshape(np.moveaxis(np.array(np.meshgrid(x, x, x)), [0, 1, 2, 3], [3, 0, 1, 2]), (32, 16, 3))

# Correct cv2.calcHist call
hist = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])

# Lengthy output of each histogram bin
for B in np.arange(hist.shape[0]):
    for G in np.arange(hist.shape[1]):
        for R in np.arange(hist.shape[2]):
            r = 'R=' + str(R*32).zfill(3) + '-' + str((R+1)*32-1).zfill(3)
            g = 'G=' + str(G*32).zfill(3) + '-' + str((G+1)*32-1).zfill(3)
            b = 'B=' + str(B*32).zfill(3) + '-' + str((B+1)*32-1).zfill(3)
            print('(' + r + ', ' + g + ', ' + b + '): ', int(hist[B, G, R]))
(R=000-031, G=000-031, B=000-031): 1
(R=032-063, G=000-031, B=000-031): 1
(R=064-095, G=000-031, B=000-031): 1
[... 506 more lines ...]
(R=160-191, G=224-255, B=224-255): 1
(R=192-223, G=224-255, B=224-255): 1
(R=224-255, G=224-255, B=224-255): 1
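If you would rather have a flat vector of 512 bin counts (an assumption about the format you are after), the (8, 8, 8) result can simply be reshaped:

# Flatten the (8, 8, 8) histogram into one vector of 512 bin counts,
# ordered by (B, G, R) bin index
hist_512 = hist.flatten()  # shape (512,)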
Hope that helps!
I have an image which is read as a uint8 array with shape (512, 512, 3).
Now I would like to convert this array to a uint8 array of shape (512, 512, 1), where each pixel value in the third axis is converted from a color value [255, 0, 0] to a single class label value [3], based on the following color/class encoding:
1 : [0, 0, 0],
2 : [0, 0, 255],
3 : [255, 0, 0],
4 : [150, 30, 150],
5 : [255, 65, 255],
6 : [150, 80, 0],
7 : [170, 120, 65],
8 : [125, 125, 125],
9 : [255, 255, 0],
10 : [0, 255, 255],
11 : [255, 150, 0],
12 : [255, 225, 120],
13 : [255, 125, 125],
14 : [200, 100, 100],
15 : [0, 255, 0],
16 : [0, 150, 80],
17 : [215, 175, 125],
18 : [220, 180, 210],
19 : [125, 125, 255]
What is the most efficient way to do this? I thought of looping through all classes and using numpy.where, but this is obviously time-consuming.
You could use a giant lookup table. Let cls be [[0, 0, 0], [0, 0, 255], ...] of dtype=np.uint8:

LUT = np.zeros((256, 256, 256), dtype='u1')
LUT[cls[:, 0], cls[:, 1], cls[:, 2]] = np.arange(cls.shape[0]) + 1
img_as_cls = LUT[img[..., 0], img[..., 1], img[..., 2]]
This solution is O(1) per pixel. It is also quite cache-efficient, because only a small fraction of the entries in LUT are actually used. It takes circa 10 ms to process a 1000x1000 image on my machine.
The solution can be slightly improved by packing the three colour channels into a single 24-bit integer.
Here is the code:
def scalarize(x):
    # Compute x[..., 2] * 65536 + x[..., 1] * 256 + x[..., 0] in an efficient way
    y = x[..., 2].astype('u4')
    y <<= 8
    y += x[..., 1]
    y <<= 8
    y += x[..., 0]
    return y

LUT = np.zeros(2**24, dtype='u1')
LUT[scalarize(cls)] = 1 + np.arange(cls.shape[0])

simg = scalarize(img)
img_to_cls = LUT[simg]
After this optimization, it takes about 5 ms to process a 1000x1000 image.
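A small usage sketch (the three-class cls array here is a made-up subset of the colour table from the question):

# Hypothetical three-class subset of the colour/class encoding above
cls = np.array([[0, 0, 0], [0, 0, 255], [255, 0, 0]], dtype=np.uint8)

# Synthetic test image assembled from known class colours
idx = np.random.randint(0, cls.shape[0], size=(1000, 1000))
img = cls[idx]

LUT = np.zeros(2**24, dtype='u1')
LUT[scalarize(cls)] = 1 + np.arange(cls.shape[0])
labels = LUT[scalarize(img)]

assert np.array_equal(labels, idx + 1)  # recovers the class ids, offset by 1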
One way: separately create the boolean arrays with True values where the input's pixel value matches one of the palette values, and then use arithmetic to combine them. Thus:
import numpy as np

palette = [
    [0, 0, 0],
    [0, 0, 255],
    [255, 0, 0],
    # etc.
]

def palettized(data, palette):
    # Initialize result array
    shape = list(data.shape)
    shape[-1] = 1
    result = np.zeros(shape)
    # Loop and add each palette index component; keepdims keeps the mask
    # broadcastable against the (H, W, 1) result.
    for value, colour in enumerate(palette, 1):
        result += (data == colour).all(axis=2, keepdims=True) * value
    return result
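Note that np.zeros defaults to float64 here, so the result holds float labels; if you need the uint8 array the question asks for, a cast afterwards works (a sketch, assuming fewer than 256 classes):

labels = palettized(image, palette).astype(np.uint8)  # shape (H, W, 1)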
Here's one based on views -
# https://stackoverflow.com/a/45313353/ @Divakar
def view1D(a, b):  # a, b are arrays
    # This function gets 1D view into 2D input arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[-1]))
    return a.view(void_dt).ravel(), b.view(void_dt).ravel()

def img2label(a, maps):
    # Get one-dimension reduced view into input image and map arrays.
    # We need to reshape the image to 2D, then feed it to view1D to get 1D
    # outputs, and then reshape the 1D image back to 2D
    A, B = view1D(a.reshape(-1, a.shape[-1]), maps)
    A = A.reshape(a.shape[:2])
    # Trace back positions of A in B using searchsorted. This gives us
    # the original order, which is the final output.
    sidx = B.argsort()
    return sidx[np.searchsorted(B, A, sorter=sidx)]
Given that your labels start from 1, you might want to add 1 to the output.
Sample run -
In [100]: # Mapping array
...: maps = np.array([[0, 0, 0],[0, 0, 255],\
...: [255, 0, 0],[150, 30, 150]],dtype=np.uint8)
...:
...: # Setup random image array
...: idx = np.array([[0,2,1,3],[1,3,2,0]])
...: img = maps[idx]
In [101]: img2label(img, maps) # should retrieve back idx
Out[101]:
array([[0, 2, 1, 3],
[1, 3, 2, 0]])
I'm trying to implement STDP (Spike-Timing-Dependent Plasticity) in TensorFlow. It's a bit complicated. Any ideas (to get it running entirely within a TensorFlow graph)?
It works like this: say I have 2 input neurons, and they connect to 3 output neurons, via this matrix: [[1.0, 1.0, 0.0], [0.0, 0.0, 1.0]] (input neuron 0 connects to output neurons 0 and 1...).
Say I have these spikes for the input neurons (2 neurons, 7 timesteps):
Input Spikes:
[[0, 0, 1, 1, 0, 1, 0],
 [1, 1, 0, 0, 0, 0, 1]]
And these spikes for the output neurons (3 neurons, 7 timesteps):
Output Spikes:
[[0, 0, 0, 1, 0, 0, 1],
 [1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 1, 1]]
Now, for each non-zero weight, I want to compute a dw. For instance, for input neuron 0 connecting to output neuron 0:
The time stamps of the spikes for input neuron 0 are [2, 3, 5], and the timestamps for output neuron 0 are [3, 6]. Now, I compute all the delta times:
Delta Times = [ 2-3, 2-6, 3-3, 3-6, 5-3, 5-6 ] = [ -1, -4, 0, -3, 2, -1 ]
Then, I compute some function (the actual STDP function, which isn't important for this question - some exponential thing)
dw = SUM [ F(-1), F(-4), F(0), F(-3), F(2), F(-1) ]
And that's the dw for the weight connecting input neuron 0 to output neuron 0. Repeat for all non-zero weights.
So I can do all this in numpy, but I'd like to be able to do it entirely within a single tensorflow graph. In particular, I'm stuck on computing the delta times. And how to do all this for all non-zero weights, in parallel.
This is the actual STDP function, btw (the constants can be parameters):

def stdp_f(x):
    return tf.where(
        tf.equal(x, 0), tf.zeros_like(x), tf.where(
            x > 0, 1.0 * tf.exp(-1.0 * x / 10.0), -1.0 * 1.0 * tf.exp(x / 10.0)))
A note on performance: the method given by @jdehesa below is both correct and clever, but it also turns out to be slow. In particular, for a real neural network of 784 input neurons feeding into 400 neurons, over 500 time steps, the spike_match step multiplies tensors of shape (784, 1, 500, 1) and (1, 400, 1, 500).
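For a rough sense of scale (my arithmetic, assuming float32): broadcasting those two tensors materializes a (784, 400, 500, 500) intermediate, which is far too large to hold in memory:

# Back-of-the-envelope size of the broadcasted spike_match tensor
num_elements = 784 * 400 * 500 * 500   # (784, 400, 500, 500)
print(num_elements * 4 / 2**30)        # ~292 GiB at 4 bytes per float32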
I am not familiar with STDP, so I hope I understood correctly what you meant. I think this does what you describe:
import tensorflow as tf

def f(x):
    # STDP function
    return x * 1

def stdp(input_spikes, output_spikes):
    input_shape = tf.shape(input_spikes)
    t = input_shape[-1]
    # Compute STDP function for all possible time difference values
    stdp_values = f(tf.cast(tf.range(-t + 1, t), dtype=input_spikes.dtype))
    # Arrange in matrix such that position [i, j] contains f(i - j)
    matrix_idx = tf.expand_dims(tf.range(t - 1, 2 * t - 1), 1) + tf.range(0, -t, -1)
    stdp_matrix = tf.gather(stdp_values, matrix_idx)
    # Find spike matches
    spike_match = (input_spikes[:, tf.newaxis, :, tf.newaxis] *
                   output_spikes[tf.newaxis, :, tf.newaxis, :])
    # Sum values where there are spike matches
    return tf.reduce_sum(spike_match * stdp_matrix, axis=(2, 3))

# Test
input_spikes = [[0, 0, 1, 1, 0, 1, 0],
                [1, 1, 0, 0, 0, 0, 1]]
output_spikes = [[0, 0, 0, 1, 0, 0, 1],
                 [1, 0, 0, 0, 0, 0, 0],
                 [0, 0, 0, 0, 1, 1, 1]]

with tf.Graph().as_default(), tf.Session() as sess:
    ins = tf.placeholder(tf.float32, [None, None])
    outs = tf.placeholder(tf.float32, [None, None])
    res = stdp(ins, outs)
    res_val = sess.run(res, feed_dict={ins: input_spikes, outs: output_spikes})
    print(res_val)
    # [[ -7.  10. -15.]
    #  [-13.   7. -24.]]
Here I assume that f is probably expensive (and that its value is the same for every pair of neurons), so I compute it only once for every possible time delta and then redistribute the computed values in a matrix, so I can multiply at the pairs of coordinates where the input and output spikes happen.
I used the identity function for f as a placeholder, so the resulting values are actually just the sum of time differences in this case.
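As a quick cross-check against the hand-computed example from the question (spike times [2, 3, 5] and [3, 6] for input neuron 0 and output neuron 0), with the identity f that entry is just the sum of all delta times:

import numpy as np

t_in = np.array([2, 3, 5])   # spike times of input neuron 0
t_out = np.array([3, 6])     # spike times of output neuron 0
print((t_in[:, None] - t_out[None, :]).sum())  # -7, matching res_val[0, 0]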
EDIT: Just for reference, replacing f with the STDP function you included:
def f(x):
    return tf.where(tf.equal(x, 0),
                    tf.zeros_like(x),
                    tf.where(x > 0,
                             1.0 * tf.exp(-1.0 * x / 10.0),
                             -1.0 * 1.0 * tf.exp(x / 10.0)))
The result is:
[[-2.4020822  2.1660795 -4.694256 ]
 [-1.974073   1.453649  -2.1197631]]