Related
I would like to ask a question about building a pose-conditioned StyleGAN in PyTorch. My intent here is to generate images of human models only in conditioned poses (based on 17x64x64 pose heatmaps). Assuming that generator adjustments are already more or less finished, how can I include pose conditioning into discriminator?
We can use Discriminator class from https://github.com/NVlabs/stylegan2-ada-pytorch/blob/main/training/networks.py as an example: here, in the forward() method of the DiscriminatorEpilogue a simple label-based conditioning is applied.
def forward(self, x, img, cmap, force_fp32=False):
    """Run the discriminator epilogue and return the (optionally conditioned) score.

    x:    feature map checked against [N, in_channels, resolution, resolution].
    img:  image tensor, only read when ``self.architecture == 'skip'``.
    cmap: conditioning vector checked against [N, cmap_dim]; only read when
          ``self.cmap_dim > 0``.
    force_fp32: accepted for interface compatibility and ignored.
    """
    expected_x_shape = [None, self.in_channels, self.resolution, self.resolution]  # [NCHW]
    misc.assert_shape(x, expected_x_shape)
    # Here, cmap is just a simple class label mapping. In my case, cmap would include
    # a 17-channel pose heatmap from a certain source image.
    _ = force_fp32  # unused
    target_dtype = torch.float32
    layout = torch.contiguous_format

    # FromRGB.
    x = x.to(dtype=target_dtype, memory_format=layout)
    if self.architecture == 'skip':
        expected_img_shape = [None, self.img_channels, self.resolution, self.resolution]
        misc.assert_shape(img, expected_img_shape)
        rgb = img.to(dtype=target_dtype, memory_format=layout)
        x = x + self.fromrgb(rgb)

    # Main layers.
    if self.mbstd is not None:
        x = self.mbstd(x)
    x = self.out(self.fc(self.conv(x).flatten(1)))

    # Conditioning: project the output onto cmap and rescale by 1/sqrt(cmap_dim).
    if self.cmap_dim > 0:
        misc.assert_shape(cmap, [None, self.cmap_dim])
        projected = (x * cmap).sum(dim=1, keepdim=True)
        x = projected * (1 / np.sqrt(self.cmap_dim))

    assert x.dtype == target_dtype
    return x
How could I adjust this code to accommodate my problem, with heatmap dimensions being [batch_size, 17, 64, 64]? I thought about flattening the heatmap, but that would lose the spatial information. Another option would be to extract a heatmap xmap from the image and calculate some form of distance between xmap and cmap (some form of pixel-wise MAE?). However, I struggle to imagine how to combine such a result with the base output x for the purpose of conditioning.
Suppose I have a batch of images as a tensor, for example:
# Example batch used throughout the question: a 4-D tensor of zeros with sizes (64, 3, 1024, 1024).
images = torch.zeros(64, 3, 1024, 1024)
Now, I want to select a patch from each of those images. All the patches are of the same size, but have different starting positions for each image in the batch.
# Patch extent along tensor dims 2 and 3 (identical for every image in the batch).
size_x = 100
size_y = 100
# Per-image patch origins. They are used as indices / slice bounds, so they
# must have an integer dtype (torch.zeros defaults to floating point).
start_x = torch.zeros(64, dtype=torch.long)
start_y = torch.zeros(64, dtype=torch.long)
I can achieve the desired result like that:
# Reference (loop-based) patch extraction: one size_x-by-size_y window per image.
# The batch tensor is named `images` in this example (the original snippet
# referred to an undefined `arr`), and slice bounds are converted to plain
# ints so that floating-point start tensors do not break the slicing.
result = []
for i in range(images.shape[0]):
    x0 = int(start_x[i])
    y0 = int(start_y[i])
    result.append(images[i, :, x0:x0 + size_x, y0:y0 + size_y])
result = torch.stack(result, dim=0)
The question is -- is it possible to do the same thing faster, without a loop? Perhaps there is some form of advanced indexing, or a PyTorch function that can do this?
You can use torch.take to get rid of a for loop. But first, an array of indices should be created with this function
def convert_inds(img_a, img_b, patch_a, patch_b, start_x, start_y, channels=3):
    """Build flat indices (for ``torch.take``) selecting one patch per image.

    The indices address a contiguous tensor of shape
    ``(len(start_x), channels, img_a, img_b)`` flattened in row-major order.

    Parameters:
        img_a, img_b:     image extent along the two spatial dims.
        patch_a, patch_b: patch extent along the same dims.
        start_x, start_y: per-image patch origins along dim ``a`` / dim ``b``
                          (any sequence convertible by ``np.asarray``).
        channels:         number of channels per image (default 3, as before).

    Returns:
        An int64 array of shape ``(len(start_x), channels, patch_a, patch_b)``,
        or ``False`` when any patch would fall outside its image (kept from the
        original interface).

    Fixes relative to the previous version: the removed ``np.int`` alias is no
    longer used; a patch touching the image border is accepted (the old check
    was off by one); row offsets use ``patch_a`` rather than ``patch_b``; and
    the batch and channel offsets are included, so image ``n`` / channel ``c``
    is actually read instead of always image 0 / channel 0.
    """
    xs = np.asarray(start_x).astype(np.int64).reshape(-1)
    ys = np.asarray(start_y).astype(np.int64).reshape(-1)
    if xs.shape != ys.shape:
        raise ValueError("start_x and start_y must have the same length")

    out_of_bounds = (
        (xs < 0) | (xs + patch_a > img_a) | (ys < 0) | (ys + patch_b > img_b)
    )
    if out_of_bounds.any():
        return False

    # Broadcast the four index components to (N, C, patch_a, patch_b).
    batch = np.arange(xs.size, dtype=np.int64)[:, None, None, None]
    chan = np.arange(channels, dtype=np.int64)[None, :, None, None]
    rows = xs[:, None, None, None] + np.arange(patch_a, dtype=np.int64)[None, None, :, None]
    cols = ys[:, None, None, None] + np.arange(patch_b, dtype=np.int64)[None, None, None, :]

    # Row-major flattening of (N, C, img_a, img_b).
    return ((batch * channels + chan) * img_a + rows) * img_b + cols
As you can see, this function essentially creates the indices for each patch you want to slice. With this function, the problem can be easily solved by
# End-to-end example: gather one size_x-by-size_y patch per image via torch.take.
size_x = 100
size_y = 100
# Patch origins are indices, so give them an integer dtype.
start_x = torch.zeros(64, dtype=torch.long)
start_y = torch.zeros(64, dtype=torch.long)
images = torch.zeros(64, 3, 1024, 1024)
# Derive the image/patch extents from the data instead of repeating literals.
selected_inds = convert_inds(images.shape[2], images.shape[3], size_x, size_y, start_x, start_y)
if selected_inds is False:
    # convert_inds signals an out-of-bounds patch by returning False.
    raise ValueError("at least one patch does not fit inside its image")
selected_inds = torch.tensor(selected_inds)
res = torch.take(images, selected_inds)
UPDATE
OP's observation is correct, the approach above is not faster than a naive approach. In order to avoid building indices every time, here is another solution based on unfold
First, build a tensor of all the possible patches
# create all possible patches
# unfold(dim, size, step) is applied to dim 2 and then to dim 3 with step 1.
# Presumably the result has six dims, (N, C, n_start_x, n_start_y, size_x, size_y),
# which is what the 6-index lookup that follows expects — TODO confirm.
all_patches = images.unfold(2,size_x,1).unfold(3,size_y,1)
Then, slice the desired patches from all_patches
# One batch index per image, paired element-wise with start_x / start_y below.
img_ind = torch.arange(images.shape[0])
# NOTE(review): start_x / start_y are used as index tensors here; they likely
# need an integer dtype (the earlier example builds them with torch.zeros(64)) — confirm.
selected_patches = all_patches[img_ind,:,start_x,start_y,:,:]
I have a function that takes a [32, 32, 3] tensor, and outputs a [256,256,3] tensor.
Specifically, the function interprets the smaller array as if it was a .svg file, and 'renders' it to a 256x256 array as a canvas using this algorithm
For an explanation of WHY I would want to do this, see This question
The function behaves exactly as intended, until I try to include it in the training loop of a GAN. The current error I'm seeing is:
NotImplementedError: Cannot convert a symbolic Tensor (mul:0) to a numpy array.
A lot of other answers to similar errors seem to boil down to "You need to re-write the function using tensorflow, not numpy"
Here's the working code using numpy - is it possible to re-write it to exclusively use tensorflow functions?
def convert_to_bitmap(input_tensor, target, j):
    """Rasterise the polygons encoded in ``input_tensor`` onto ``target``.

    Each of the first 32 rows of ``input_tensor`` describes one polygon:
    column 0 holds its colour, the remaining columns hold coordinates in the
    red/green channels (scaled by 255 here).

    Parameters:
        input_tensor: array indexed as ``[row, column, channel]``; the code
            assumes 32 rows and 32 columns (the interleaved red/green row is
            trimmed at positions 0, 1, 62 and 63).
        target: canvas passed to ``fill_polygon``.
        j: channel index whose value in column 0 is used as the fill colour.

    Returns:
        The value returned by the last ``fill_polygon`` call, or ``target``
        itself when no row produced a polygon.
    """
    array = input_tensor
    output = target
    for i in range(32):
        col = float(array[i, 0, j])
        # Skip rows whose colour cell has a negative mean. The whole sum is
        # divided by 3; previously only the third channel was divided because
        # of operator precedence.
        colour_mean = (
            float(array[i, 0, 0]) + float(array[i, 0, 1]) + float(array[i, 0, 2])
        ) / 3
        if colour_mean < 0:
            continue
        # Red and green channels of row i, scaled by 255.
        red_array = array[i, :, 0] * 255
        green_array = array[i, :, 1] * 255
        # Interleave them as r0, g0, r1, g1, ...
        combined_array = np.dstack((red_array, green_array)).flatten()
        # Drop the first two and last two entries of the interleaved row.
        clipped_array = np.delete(combined_array, [0, 1, 62, 63])
        # Keep strictly positive values only.
        filtered_array = clipped_array[clipped_array > 0]
        # An even number of values is required; drop the last one otherwise.
        if len(filtered_array) % 2:
            filtered_array = np.delete(filtered_array, -1)
        # Pair every value with its successor (wrapping around at the end).
        l = filtered_array.tolist()
        t = list(zip(l, l[1:] + l[:1]))
        if not t:
            continue
        output = fill_polygon(t, target, col)
    return output
The 'fill polygon' function is copied from the 'mahotas' library:
def fill_polygon(polygon, canvas, color):
    """Scan-line fill of ``polygon`` on ``canvas`` (modified in place).

    Parameters:
        polygon: sequence of ``(y, x)`` vertices.
        canvas:  2-D array supporting ``canvas[y, a:b] = color``.
        color:   value written to the covered pixels.

    Returns:
        ``canvas``. This now also holds for an empty polygon, which used to
        return ``None``, so callers can always use the return value.
    """
    if not len(polygon):
        return canvas
    min_y = min(int(y) for y, x in polygon)
    max_y = max(int(y) for y, x in polygon)
    polygon = [(float(y), float(x)) for y, x in polygon]
    # Include the last scan line when it still lies inside the canvas.
    if max_y < canvas.shape[0]:
        max_y += 1
    for y in range(min_y, max_y):
        nodes = []
        j = -1  # index of the previous vertex; starts at the last one
        for i, p in enumerate(polygon):
            pj = polygon[j]
            # Does the edge pj -> p cross scan line y?
            if p[0] < y and pj[0] >= y or pj[0] < y and p[0] >= y:
                dy = pj[0] - p[0]
                # A crossing edge always has dy != 0, so the former
                # `elif p[0] == y` branch could never run and was removed.
                if dy:
                    nodes.append(p[1] + (y - p[0]) / dy * (pj[1] - p[1]))
            j = i
        nodes.sort()
        # Fill between successive pairs of crossings (right end inclusive).
        for n, nn in zip(nodes[::2], nodes[1::2]):
            canvas[y, int(n):int(nn + 1)] = color
    return canvas
NOTE: I'm not trying to get someone to convert the whole thing for me! There are some functions that are pretty obvious (tf.stack instead of np.dstack), but others that I don't even know how to start, like the last few lines of the fill_polygon function above.
Yes, you can actually do this: you can wrap a plain Python function with tf.py_function (tf.py_func in TensorFlow 1.x). It is a Python wrapper, but it is extremely slow compared to plain TensorFlow. TensorFlow and CUDA are so fast because they rely on vectorization, meaning you can rewrite many of the loops as mathematical tensor operations, which are very fast.
In general:
If you want to use custom code as a custom layer, i would recommend you to rethink the algebra behind those loops and try to express them somehow different. If its just preprocessing before the training is going to start, you can use tensorflow but doing the same with numpy and other libraries is easier.
To your main question: yes, it is possible, but it is better not to use loops. TensorFlow has a built-in loop optimizer, but then you have to use tf.while_loop(), and that is annoying (maybe just for me). I only skimmed your code, but it looks like you should be able to vectorize it quite well using the standard TensorFlow vocabulary. If you want it fast, I mean really fast with GPU support, write everything in TensorFlow rather than a 50/50 mix with tf.convert_to_tensor(), because then it is going to be slow again: you keep switching between GPU and CPU, and between the plain Python interpreter and the TensorFlow low-level API. I hope I could help you at least a bit.
This code 'works', in that it only uses tensorflow functions, and does allow the model to train when used in a training loop:
def convert_image (x):
    """Return the colour for one canvas pixel using only TensorFlow ops.

    ``x`` is a flat pixel index (convert_images feeds values 0..65535, one per
    call through tf.vectorized_map). The image being rendered is NOT a
    parameter: it is read from the module-level ``img_to_convert``, which
    convert_images assigns before mapping.

    The function counts, per polygon row, how many edges satisfy the crossing
    test for this pixel, treats an odd count as "inside", and returns the
    colour cell (slice of size [1, 1, 3]) of the highest-indexed row that
    qualifies; row 0 is used when none does.
    """
    #split off the first column of the generator output, and store it for later (remove the 'colours' column)
    colours_column = tf.slice(img_to_convert, tf.constant([0,0,0], dtype=tf.int32), tf.constant([32,1,3], dtype=tf.int32))
    #split off the rest of the data, only keeping R + G, and discarding B
    image_data_red = tf.slice(img_to_convert, tf.constant([0,1,0], dtype=tf.int32), tf.constant([32,31,1], dtype=tf.int32))
    image_data_green = tf.slice(img_to_convert, tf.constant([0,1,1], dtype=tf.int32), tf.constant([32, 31,1], dtype=tf.int32))
    #roll each row by 1 position, and make two more 2D tensors
    # NOTE(review): the roll here is along axis=0, whereas rolled_data_mask
    # further down is rolled along axis=1 — confirm which axis is intended.
    rolled_red = tf.roll(image_data_red, shift=-1, axis=0)
    rolled_green = tf.roll(image_data_green, shift=-1, axis=0)
    #remove all values where either the red OR green channels are 0
    zeroes = tf.constant(0, dtype=tf.float32)
    #this is for the 'count_nonzero' command
    boolean_red_data = tf.not_equal(image_data_red, zeroes)
    boolean_green_data = tf.not_equal(image_data_green, zeroes)
    initial_data_mask = tf.logical_and(boolean_red_data, boolean_green_data)
    #count non-zero values per row and flatten it
    count = tf.math.count_nonzero(initial_data_mask, 1)
    count_flat = tf.reshape(count, [-1])
    flat_red = tf.reshape(image_data_red, [-1])
    flat_green = tf.reshape(image_data_green, [-1])
    boolean_red = tf.math.logical_not(tf.equal(flat_red, tf.zeros_like(flat_red)))
    boolean_green = tf.math.logical_not(tf.equal(flat_green, tf.zeros_like(flat_red)))
    mask = tf.logical_and(boolean_red, boolean_green)
    flat_red_without_zero = tf.boolean_mask(flat_red, mask)
    flat_green_without_zero = tf.boolean_mask(flat_green, mask)
    # create a ragged tensor
    X0_ragged = tf.RaggedTensor.from_row_lengths(values=flat_red_without_zero, row_lengths=count_flat)
    Y0_ragged = tf.RaggedTensor.from_row_lengths(values=flat_green_without_zero, row_lengths=count_flat)
    #do the same for the rolled version
    # rolled_data_mask is not referenced again in this function.
    rolled_data_mask = tf.roll(initial_data_mask, shift=-1, axis=1)
    flat_rolled_red = tf.reshape(rolled_red, [-1])
    flat_rolled_green = tf.reshape(rolled_green, [-1])
    #from SO "shift zeros to the end"
    boolean_rolled_red = tf.math.logical_not(tf.equal(flat_rolled_red, tf.zeros_like(flat_rolled_red)))
    boolean_rolled_green = tf.math.logical_not(tf.equal(flat_rolled_green, tf.zeros_like(flat_rolled_red)))
    rolled_mask = tf.logical_and(boolean_rolled_red, boolean_rolled_green)
    flat_rolled_red_without_zero = tf.boolean_mask(flat_rolled_red, rolled_mask)
    flat_rolled_green_without_zero = tf.boolean_mask(flat_rolled_green, rolled_mask)
    # create a ragged tensor
    # NOTE(review): the rolled values are masked with rolled_mask but split
    # using count_flat, which was derived from the un-rolled mask — confirm
    # that the per-row counts always agree.
    X1_ragged = tf.RaggedTensor.from_row_lengths(values=flat_rolled_red_without_zero, row_lengths=count_flat)
    Y1_ragged = tf.RaggedTensor.from_row_lengths(values=flat_rolled_green_without_zero, row_lengths=count_flat)
    #available outputs for future use are:
    # Dense versions of the ragged tensors, padded with 0.
    X0 = X0_ragged.to_tensor(default_value=0.)
    Y0 = Y0_ragged.to_tensor(default_value=0.)
    X1 = X1_ragged.to_tensor(default_value=0.)
    Y1 = Y1_ragged.to_tensor(default_value=0.)
    #Example tensor cel (replace with (x))
    # P is not referenced again in this function.
    P = tf.cast(x, dtype=tf.float32)
    #split out P.x and P.y, and fill a ragged tensor to the same shape as Rx
    # NOTE(review): the index is decomposed with a divisor of 255, while
    # convert_images reshapes the result to [256, 256, 3] — confirm the divisor.
    Px_value = tf.cast(x, dtype=tf.float32) - tf.cast((tf.math.floor(x/255)*255), dtype=tf.float32)
    Py_value = tf.cast(tf.math.floor(x/255), dtype=tf.float32)
    Px = tf.squeeze(tf.ones_like(X0)*Px_value)
    Py = tf.squeeze(tf.ones_like(Y0)*Py_value)
    #for each pair of values (Y0, Y1, make a vector, and check to see if it crosses the y-value (Py) either up or down
    a = tf.math.less(Y0, Py)
    b = tf.math.greater_equal(Y1, Py)
    c = tf.logical_and(a, b)
    d = tf.math.greater_equal(Y0, Py)
    e = tf.math.less(Y1, Py)
    f = tf.logical_and(d, e)
    g = tf.logical_or(c, f)
    #Makes boolean bitwise mask
    #calculate the intersection of the line with the y-value, assuming it intersects
    #P.x <= (G.x - R.x) * (P.y - R.y) / (G.y - R.y + R.x) - use tf.divide_no_nan for safe divide
    h = tf.math.less(Px,(tf.math.divide_no_nan(((X1-X0)*(Py-Y0)),(Y1-Y0+X0))))
    #combine using AND with the mask above
    i = tf.logical_and(g,h)
    #tf.count_nonzero
    #reshape to make a column tensor with the same dimensions as the colours
    #divide by 2 using tf.floor_mod (returns remainder of division - any remainder means the value is odd, and hence the point is IN the polygon)
    final_count = tf.cast((tf.math.count_nonzero(i, 1)), dtype=tf.int32)
    twos = tf.ones_like(final_count, dtype=tf.int32)*tf.constant([2], dtype=tf.int32)
    divide = tf.cast(tf.math.floormod(final_count, twos), dtype=tf.int32)
    # divide is 1 for rows with an odd crossing count; multiplying by the row
    # index and taking the maximum selects the highest such row (0 if none).
    index = tf.cast(tf.range(0,32, delta=1), dtype=tf.int32)
    clipped_index = divide*index
    sort = tf.sort(clipped_index)
    reverse = tf.reverse(sort, [-1])
    value = tf.slice(reverse, [0], [1])
    pair = tf.constant([0], dtype=tf.int32)
    # Start offsets [row, 0, 0] for slicing that row's colour cell.
    slice_tensor = tf.reshape(tf.stack([value, pair, pair], axis=0),[-1])
    output_colour = tf.slice(colours_column, slice_tensor, [1,1,3])
    return output_colour
This is where the 'convert_image' function is applied to every pixel using tf.vectorized_map:
def convert_images(image_to_convert):
    """Render ``image_to_convert`` to a tensor of shape [1, 256, 256, 3].

    The input is published through the module-level ``img_to_convert`` (which
    ``convert_image`` reads), ``convert_image`` is mapped over the 65536 flat
    pixel indices, and the result is rescaled with ``value / 127.5 - 1``.
    """
    global img_to_convert
    img_to_convert = image_to_convert

    # One flat pixel index per row: shape [65536, 1].
    pixel_ids = tf.range(0, 65536, delta=1, dtype=tf.int32)
    process_list = tf.reshape(pixel_ids, [65536, 1])

    colours = tf.squeeze(tf.vectorized_map(convert_image, process_list))
    canvas = tf.reshape(colours, [256, 256, 3])
    rescaled = (canvas / 127.5) - 1
    # Add the leading batch dimension.
    return tf.expand_dims(rescaled, axis=0)
It is PAINFULLY slow, though - It does not appear to be using the GPU, and looks to be single threaded as well.
I'm adding it as an answer to my own question because it clearly IS possible to implement this numpy function entirely in tensorflow - it just probably shouldn't be done like this.
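I have an image that I want to perform some calculations on. The image pixels will be represented as f(x, y) where x is the column number and y is the row number of each pixel. I want to perform a calculation using the following formula (reconstructed here from the code below, with pixels outside the image treated as 0): $D_h(x, y) = \lvert f(x, y+1) - f(x, y-1) \rvert$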
Here is the code that does the calculation:
import matplotlib.pyplot as plt
import numpy as np
import os.path
from PIL import Image
global image_width, image_height
# A. Blur Measurement
def measure_blur(f):
    """Return |f[x][y+1] - f[x][y-1]| for every element of ``f``.

    Neighbours are taken along the second axis of ``f``; a neighbour that
    falls outside the array counts as 0.

    Parameters:
        f: array-like with at least two dimensions (e.g. ``np.array(image)``).

    Returns:
        A numpy array with the same shape as ``f``, indexable as
        ``D_sub_h[x][y]`` like the nested list returned previously.

    Changes relative to the loop version: the sizes come from ``f`` itself
    instead of the ``image_width`` / ``image_height`` globals; integer input
    is widened to int64 first, so unsigned pixel data (uint8 images) can no
    longer wrap around in the subtraction; and the work is vectorised.
    """
    f = np.asarray(f)
    if f.dtype == np.bool_ or np.issubdtype(f.dtype, np.integer):
        f = f.astype(np.int64)

    # Shifted copies padded with zeros at the missing border column.
    following = np.zeros_like(f)
    preceding = np.zeros_like(f)
    following[:, :-1] = f[:, 1:]
    preceding[:, 1:] = f[:, :-1]
    return np.abs(following - preceding)


if __name__ == '__main__':
    # Process 1.jpg, 2.jpg, ... until the first missing file.
    image_counter = 1
    while True:
        image_path = str(image_counter) + '.jpg'
        if not os.path.isfile(image_path):
            break
        # Close the file as soon as the pixel data has been copied out.
        with Image.open(image_path) as image:
            # PIL reports size as (width, height).
            image_width, image_height = image.size
            f = np.array(image)
        print("Image Width : " + str(image_width))
        print("Image Height : " + str(image_height))
        D_sub_h = measure_blur(f)
        image_counter = image_counter + 1
The problem with this code is when the image size becomes large, such as (5000, 5000), it takes a very long time to complete. Is there any way or function I can use to make the execution time faster by not doing one by one or manual computation?
Since you specifically convert the input f to a numpy array, I am assuming you want to use numpy. In that case, the allocation of D_sub_h needs to change from a list to an array:
# Allocate the output with f's shape and dtype; np.empty_like leaves the
# contents uninitialised, so every element must be written afterwards.
D_sub_h = np.empty_like(f)
If we assume that everything outside your array is zeros, then the first row and last row can be computed as the second and negative second-to-last rows, respectively:
# First row: next row minus an implicit zero row.
D_sub_h[0, :] = f[1, :]
# Last row: implicit zero row minus the previous row.
# NOTE(review): with an unsigned dtype (e.g. uint8 image data) the negation
# presumably wraps around instead of going negative — confirm.
D_sub_h[-1, :] = -f[-2, :]
The remainder of the data is just the difference between the next and previous index at each location, which is idiomatically computed by shifting views: f[2:, :] - f[:-2, :]. This formulation creates a temporary array. You can avoid doing that by using np.subtract explicitly:
# Interior rows: next row minus previous row, written directly into D_sub_h via out=.
np.subtract(f[2:, :], f[:-2, :], out=D_sub_h[1:-1, :])
The entire thing takes four lines in this formulation, and is fully vectorized, which means that loops run quickly under the hood, without most of Python's overhead:
def measure_blur(f):
    """Return |f[i+1] - f[i-1]| along the first axis, with zeros outside ``f``.

    Parameters:
        f: array-like with at least two rows along axis 0.

    Returns:
        An array with the same shape as ``f``. Integer (and boolean) input is
        widened to int64 before subtracting, so unsigned image data such as
        uint8 cannot wrap around; other dtypes are kept.

    Compared with the previous version, the absolute value from the question's
    loop implementation is applied and the unsigned-wraparound problem of
    ``-f[-2]`` and of the subtraction is removed.
    """
    f = np.asarray(f)
    if f.dtype == np.bool_ or np.issubdtype(f.dtype, np.integer):
        f = f.astype(np.int64)

    D_sub_h = np.empty_like(f)
    D_sub_h[0, ...] = f[1, ...]      # next row minus implicit zeros
    D_sub_h[-1, ...] = -f[-2, ...]   # implicit zeros minus previous row
    # Interior rows are written in place, avoiding a temporary array.
    np.subtract(f[2:, ...], f[:-2, ...], out=D_sub_h[1:-1, ...])
    np.abs(D_sub_h, out=D_sub_h)
    return D_sub_h
Notice that I return the value instead of printing it. When you write functions, get in the habit of returning a value. Printing can be done later, and effectively discards the computation if it replaces a proper return.
The way shown above is fairly efficient with regards to time and space. If you want to write a one liner that uses a lot of temporary arrays, you can also do:
# One-expression variant: stack the first-row term, the interior differences and
# the last-row term along axis 0. Indexing with `None` keeps a leading axis on
# the two edge rows so they can be concatenated with the interior block.
D_sub_h = np.concatenate((f[1, None], f[2:, :] - f[:-2, :], -f[-2, None]), axis=0)
This function receives a list of numpy arrays that consist of cropped parts of an image. The crops are all the same size, except for the right-most and bottom-most images which might be of smaller size.
predictions[2] would return the 3rd sub-image that was cropped from the original image. Each crop is a numpy array. There are WxH crops, enumerated from left to right, top to bottom (so if there are 4 sub-images constituting the width, the 5th image in predictions would be the first sub-image on the left from the 2nd row of sub-images).
crops contains the necessary information to find number of horizontal and vertical images that will constitute the reconstructed images. crops[2][3] will contain the 3rd from the top, 4th from the left image cropped.
The images contained in crops are of smaller dimensions than the ones in predictions (I am basically making a model that increases the resolution of images). The reconstructed image is built from the images in predictions, arranged in the same order as the ones in crops.
def reconstruct(predictions, crops):
    """Placeholder for merging ``predictions`` back into one image.

    For now it only prints diagnostics and returns an empty array of shape
    (0, 0); the real size computation and merge are still TODO.
    """
    if len(crops) != 0:
        print("use crops")

    # TODO: properly extract the size of the full image
    height_length, width_length = 0, 0
    full_image = np.empty(shape=(height_length, width_length))
    print(full_image.shape)

    # TODO: properly merge the crops back into a single image
    n_outer = len(predictions[0])
    n_inner = len(predictions)
    for height in range(n_outer):
        for width in range(n_inner):
            # concatenate here
            print(height, width)

    return full_image
I was going to use numpy.concatenate, but according to other answers I've seen on SO it wouldn't be an efficient way of doing it (apparently numpy will just recreate a new variable in memory, copy the old one, and add the new data, etc.). So now I'm left wondering how to properly merge my multiple images into a single image. The current idea I was going for was to create a python list of the proper shape and progressively fill it with each numpy array's data, but even that I'm not sure if it's the proper idea.
Here is more or less the kind of bunch of images I'm trying to concatenate into a single image:
Here is the expected result:
And to help you out with understanding what more might be available to you, here is some more code:
def predict(args):
    """Run the model on every crop of the requested image.

    Parameters:
        args: object with ``model`` (file name under ``save_dir``) and
            ``image`` (file name appended to ``tests_path``) attributes.

    Returns:
        ``(predictions, image, crops)``: the flat list of per-crop model
        outputs in row-major crop order, the image as loaded, and the nested
        crop grid produced by ``seq_crop``.
    """
    model = load_model(save_dir + '/' + args.model)
    image = skimage.io.imread(tests_path + args.image)

    crops = seq_crop(image)  # crops into multiple sub-parts the image based on 'input_' constants
    predictions = []
    for row in crops:              # vertical crops
        for current_image in row:  # horizontal crops
            # Hack because GPU can only handle one image at a time:
            # wrap the crop in a batch where it is the only member.
            # (The original indexed `images[p]` with an undefined `p`.)
            input_img = np.expand_dims(current_image, 0)
            # model.predict returns one entry per batch member; keep the only one.
            predictions.append(model.predict(input_img)[0])

    return predictions, image, crops
# adapted from: https://stackoverflow.com/a/52463034/9768291
def seq_crop(img):
    """
    Cut the whole image into a grid of sub-images.
    The tile size comes from the "input_" variables in the 'constants' (Evaluation).
    NOTE(review): the previous docstring promised zero padding for the bottom
    and right tiles, but crop_precise only slices the array, so those tiles
    presumably come out smaller instead - confirm.
    :param img: input image
    :return: list of rows, each a list of sub-images (left to right)
    """
    n_rows = ceildiv(img.shape[0], input_height)
    n_cols = ceildiv(img.shape[1], input_width)

    return [
        [
            crop_precise(img, col * input_width, row * input_height, input_width, input_height)
            for col in range(n_cols)
        ]
        for row in range(n_rows)
    ]
def crop_precise(img, coord_x, coord_y, width_length, height_length):
    """
    Crop a rectangular portion of an image and convert it with float_im.
    NOTE(review): the previous docstring said out-of-bounds regions are padded
    with zeros; the code only slices, which presumably truncates at the image
    border instead - confirm.
    :param img: image to crop
    :param coord_x: width coordinate (top left point)
    :param coord_y: height coordinate (top left point)
    :param width_length: width of the cropped portion starting from coord_x
    :param height_length: height of the cropped portion starting from coord_y
    :return: the cropped part of the image, passed through float_im
    """
    row_span = slice(coord_y, coord_y + height_length)
    col_span = slice(coord_x, coord_x + width_length)
    return float_im(img[row_span, col_span])  # From [0,255] to [0.,1.]
# from https://stackoverflow.com/a/17511341/9768291
def ceildiv(a, b):
    """
    Divide and round the result up (towards positive infinity).
    :param a: dividend
    :param b: divisor
    :return: ceil(a / b), computed without floating-point conversion
    """
    quotient, remainder = divmod(a, b)
    # Floor division rounds down; bump by one whenever something was left over.
    return quotient + bool(remainder)
if __name__ == '__main__':
    # `args` is not defined in this excerpt; presumably it is parsed elsewhere — confirm.
    preds, original, crops = predict(args) # returns the predictions along with the original
    # TODO: reconstruct image
    enhanced = reconstruct(preds, crops) # reconstructs the enhanced image from predictions
EDIT:
The answer worked. Here is the version I've used:
# adapted from https://stackoverflow.com/a/52733370/9768291
def reconstruct(predictions, crops):
    """Stitch a flat list of tiles back into one image.

    Parameters:
        predictions: flat sequence of arrays in row-major tile order.
        crops: nested list (rows of tiles) used only as a template for how
            many tiles each row holds.

    Returns:
        A new array whose first two dimensions are the summed tile heights
        (first tile of each row) and summed tile widths (tiles of the first
        row). Trailing dimensions and dtype are taken from the first tile, so
        both 2-D (grayscale) and 3-D (multi-channel) tiles are supported;
        previously only 3-D tiles worked.
    """
    # Unflatten predictions into the same row structure as `crops`.
    flat = iter(predictions)
    grid = [[next(flat) for _ in row] for row in crops]

    row_ends = np.cumsum([row[0].shape[0] for row in grid])
    col_ends = np.cumsum([tile.shape[1] for tile in grid[0]])
    first = grid[0][0]
    recon = np.empty((row_ends[-1], col_ends[-1], *first.shape[2:]), first.dtype)

    # Copy each tile straight into its slot; no intermediate arrays are built.
    top = 0
    for bottom, row in zip(row_ends, grid):
        left = 0
        for right, tile in zip(col_ends, row):
            recon[top:bottom, left:right] = tile
            left = right
        top = bottom
    return recon
The most convenient is probably np.block
import numpy as np
from scipy import misc
import Image
# get example picture
# NOTE(review): recent SciPy releases presumably expose this sample image as
# scipy.datasets.face rather than scipy.misc.face — confirm for your version.
data = misc.face()
# chop it up
# I / J are the split points (every 200 pixels) along axis 0 / axis 1.
I, J = map(np.arange, (200, 200), data.shape[:2], (200, 200))
# chops is a list of rows, each a list of tiles.
chops = [np.split(row, J, axis=1) for row in np.split(data, I, axis=0)]
# do something with the bits
# The result is a FLAT list in row-major order (j = row index, i = column index).
predictions = [chop-(i+j)*(chop>>3) for j, row in enumerate(chops) for i, chop in enumerate(row)]
# unflatten predictions
def nest(data, template):
    """Regroup the flat iterable ``data`` into rows shaped like ``template``.

    Each output row receives as many consecutive items from ``data`` as the
    matching ``template`` row has entries.
    """
    source = iter(data)
    nested = []
    for template_row in template:
        filled = []
        for _ in template_row:
            filled.append(next(source))
        nested.append(filled)
    return nested
# "lol" = list of lists: the flat predictions regrouped into the row structure of chops.
pred_lol = nest(predictions, chops)
# almost builtin reconstruction
def np_block_2D(chops):
    """Assemble a 2-D grid (list of rows) of tiles with ``np.block``.

    Every tile is wrapped in its own one-element list, which adds a third
    nesting level before the grid is handed to ``np.block``.
    """
    nested = []
    for row in chops:
        wrapped_row = []
        for tile in row:
            wrapped_row.append([tile])
        nested.append(wrapped_row)
    return np.block(nested)
# Rebuild the full picture from the nested tiles and write it to disk.
recon = np_block_2D(pred_lol)
# NOTE(review): `Image` comes from the bare `import Image` above, presumably the
# legacy PIL entry point; with Pillow this is `from PIL import Image` — confirm.
Image.fromarray(recon).save('demo.png')
Reconstructed manipulated image:
But we can do faster than that by avoiding intermediary arrays. Instead, we copy into a preallocated array:
def speed_block_2D(chops):
    """Assemble a 2-D grid of 3-D tiles by copying into a preallocated array.

    Row heights are read from the first tile of each row and column widths
    from the tiles of the first row; the channel count and dtype come from the
    top-left tile.
    """
    row_ends = np.cumsum([row[0].shape[0] for row in chops])
    col_ends = np.cumsum([tile.shape[1] for tile in chops[0]])
    corner = chops[0][0]
    recon = np.empty((row_ends[-1], col_ends[-1], corner.shape[2]), corner.dtype)

    # Walk the grid with running offsets and fill each slot in place.
    top = 0
    for bottom, row in zip(row_ends, chops):
        left = 0
        for right, tile in zip(col_ends, row):
            recon[top:bottom, left:right][...] = tile
            left = right
        top = bottom
    return recon
Timings, also including a generalized ND-ready variant of each method:
numpy 2D: 0.991 ms
prealloc 2D: 0.389 ms
numpy general: 1.021 ms
prealloc general: 0.448 ms
Code for general case and timings:
def np_block(chops):
    """Assemble an arbitrarily nested list of tiles with ``np.block``.

    If the list nesting is shallower than the number of dimensions of the
    tiles, every tile is wrapped in additional one-element lists until the
    nesting depth equals the tile dimensionality.
    """
    # Measure nesting depth by following the first element down to a tile.
    depth = 0
    leaf = chops
    while isinstance(leaf, list):
        leaf = leaf[0]
        depth += 1

    missing_levels = leaf.ndim - depth
    if missing_levels > 0:
        def wrap(node):
            if isinstance(node, list):
                return [wrap(child) for child in node]
            for _ in range(missing_levels):
                node = [node]
            return node

        chops = wrap(chops)
    return np.block(chops)
def speed_block(chops):
    """Assemble an arbitrarily nested list of tiles into one preallocated array.

    Nesting level ``k`` of ``chops`` corresponds to axis ``k`` of the result.
    Sizes along each axis are collected by following the first element at
    every level, then each tile is copied into its slot of the output.

    The per-axis size lists are kept as plain Python lists. Wrapping them in
    ``np.array`` (as before) fails on current NumPy whenever the axes hold
    different numbers of tiles, because the list of lists is then ragged.
    """
    def line(src, i):
        # Extent along axis i of the first tile reachable from src.
        while isinstance(src, list):
            src = src[0]
        return src.shape[i]

    def hyper(src, i):
        # Returns (overall shape, dtype, per-axis tile sizes) for the sub-grid
        # `src` whose outermost list runs along axis i. The size lists are
        # ordered innermost axis first, so szs[-1] belongs to axis i.
        src = iter(src)
        fst = next(src)
        if isinstance(fst, list):
            res, dtype, szs = hyper(fst, i+1)
            szs.append([res[i], *(line(s, i) for s in src)])
            res[i] = sum(szs[-1])
            return res, dtype, szs
        res = np.array(fst.shape)
        szs = [res[i], *(s.shape[i] for s in src)]
        res[i] = sum(szs)
        return res, fst.dtype, [szs]

    shape, dtype, szs = hyper(chops, 0)
    recon = np.empty(shape, dtype)

    def cpchp(dst, src, i, szs=None):
        # Copy sub-grid `src` into the view `dst`, splitting along axis i.
        szs = hyper(src, i)[2] if szs is None else szs
        dst = np.split(dst, np.cumsum(szs[-1][:-1]), i)
        if isinstance(src[0], list):
            szs = szs[:-1]
            for ds, sr in zip(dst, src):
                cpchp(ds, sr, i+1, szs)
                # The precomputed sizes describe the first sub-grid only;
                # later ones recompute their own.
                szs = None
        else:
            for ds, sr in zip(dst, src):
                ds[...] = sr

    cpchp(recon, chops, 0, szs)
    return recon
from timeit import timeit
# Each entry is the total time in seconds for 1000 calls, which is numerically
# the mean time per call in milliseconds — hence the "ms" labels printed below.
T = (timeit(lambda: speed_block(pred_lol), number=1000),
timeit(lambda: np_block(pred_lol), number=1000),
timeit(lambda: speed_block_2D(pred_lol), number=1000),
timeit(lambda: np_block_2D(pred_lol), number=1000))
# Sanity check: all four implementations must produce identical output.
assert (np.all(speed_block(pred_lol)==np_block(pred_lol)) and
np.all(speed_block_2D(pred_lol)==np_block(pred_lol)) and
np.all(speed_block(pred_lol)==np_block_2D(pred_lol)))
# T is ordered (prealloc general, numpy general, prealloc 2D, numpy 2D), so
# the report indexes it in reverse.
print(f"""
numpy 2D: {T[3]:10.3f} ms
prealloc 2D: {T[2]:10.3f} ms
numpy general: {T[1]:10.3f} ms
prealloc general: {T[0]:10.3f} ms
""")