Calculate distance between neighbors efficiently - python

I have geographically scattered data without any kind of pattern, and I need to create an image where the value of each pixel is the average of the data points that lie less than X meters away from that pixel.
For this I use the library scipy.spatial to generate a KDTree with the data (cKDTree). Once the data structure is generated, I locate the pixel geographically and locate the geographic points that are closest.
# Generate scattered data points
coord_cart= [
] for feat in layer
# Create KDTree structure
tree = cKDTree(coord_cart)
# Get raster image dimensions
pixel_size = 5
source_layer = shapefile.GetLayer()
x_min, x_max, y_min, y_max = source_layer.GetExtent()
# Number of pixels along each axis (truncated towards zero).
x_res = int((x_max - x_min) / pixel_size)
y_res = int((y_max - y_min) / pixel_size)
# Create grid
x = np.linspace(x_min, x_max, x_res)
y = np.linspace(y_min, y_max, y_res)
X, Y = np.meshgrid(x, y)
# Each grid row is built as (y, x).
# NOTE(review): on Python 3 `zip` returns an iterator, so `np.array(zip(...))`
# presumably yields a 0-d object array rather than an (n, 2) array -- confirm.
grid = np.array(zip(Y.ravel(), X.ravel()))
# Get points that are less than 10 meters away
inds = tree.query_ball_point(grid, 10)
# inds is an np.array of lists of different length, so I need to convert it into an array of n_points x maximum number of neighbors
ll = np.array([len(l) for l in inds])
maxlen = max(ll)
arr = np.zeros((len(ll), maxlen), int)
# inds is an array of lists, so each list is padded to maxlen and copied into
# the rectangular integer array `arr`. Short lists are padded with the row's
# own index i.
for i in range(len(inds)):
    inds[i].extend([i] * (maxlen - len(inds[i])))
    arr[i] = np.array(inds[i], dtype=int)
# NOTE(review): this indexes `grid` with `inds` (not the padded `arr`), although
# the indices come from the tree built on coord_cart; and `norm` is called
# without `axis`, which reduces to a single scalar -- confirm the intent.
d = np.linalg.norm(grid - grid[inds])
Is there a better way to do this? I'm trying to use IDW to perform the interpolation between the points. I found this snippet that uses a function that gets the N nearest points but it does not work for me because I need that if there is no point in a radius R, the value of the pixel is 0.
# Distances and indices of the 10 nearest tree points for every query point.
# NOTE(review): passes a bare `zip(...)`; on Python 3 this is an iterator --
# confirm that the query accepts it.
d, inds = tree.query(zip(xt, yt, zt), k = 10)
# Inverse-squared-distance weights (a zero distance would divide by zero).
w = 1.0 / d**2
# Weighted average of the neighbours' values for each query point.
air_idw = np.sum(w * air.flatten()[inds], axis=1) / np.sum(w, axis=1)
# Reshape in place to the shape of lon_curv.
air_idw.shape = lon_curv.shape
Thanks in advance!

This may be one of the cases where KDTrees are not a good solution. This is because you are mapping to a grid, which is a very simple structure meaning there is nothing to gain from the KDTree's sophistication. Nearest grid point and distance can be found by simple arithmetic.
Below is a simple example implementation. I'm using a Gaussian kernel but changing that to IDW if you prefer should be straight-forward.
import numpy as np
from scipy import stats
def rasterize(coords, feature, gu, cutoff, kernel=stats.norm(0, 2.5).pdf):
    """Accumulate kernel-weighted point features onto a regular 2-D grid.

    Every data point contributes ``feature * kernel(distance)`` to each grid
    node that lies within ``cutoff`` of it.

    Parameters
    ----------
    coords : array-like, shape (n, 2)
        Point coordinates.
    feature : array-like, shape (n,)
        Value carried by each point.
    gu : float
        Grid unit (spacing between neighbouring grid nodes).
    cutoff : float
        Maximum point-to-node distance that still contributes.
    kernel : callable
        Maps an array of distances to an array of weights.

    Returns
    -------
    numpy.ndarray, shape ``reso + 1``
        The summed contributions per grid node, where ``reso`` is the number
        of grid units needed to span the data along each axis.
    """
    coords = np.asarray(coords, dtype=float)
    feature = np.asarray(feature)
    n_points = len(coords)

    # compute overlap (filter size / grid unit)
    ovlp = int(np.ceil(cutoff / gu))
    side = 2 * ovlp + 1

    # compute raster dimensions; the grid is centred on the data extent
    mn, mx = coords.min(axis=0), coords.max(axis=0)
    reso = np.ceil((mx - mn) / gu).astype(int)
    base = (mx + mn - reso * gu) / 2

    # map coordinates to the nearest grid node; the residual is the vector
    # from that node to the point
    grid_res = coords - base
    grid_coords = np.rint(grid_res / gu).astype(int)
    grid_res -= gu * grid_coords

    # because of overlap we must add neighbouring grid nodes to the nearest;
    # the broadcast result has shape (side, side, n_points, 2)
    gcovlp = np.c_[-ovlp:ovlp + 1, np.zeros(side, dtype=int)]
    grid_coords = (gcovlp[:, None, None, :] + gcovlp[None, :, None, ::-1]
                   + grid_coords).reshape(-1, 2)

    # the corresponding residuals have the same offset with opposite sign
    # (previously an extra +1/2 shifted every residual by one grid unit)
    gdovlp = -gu * gcovlp
    grid_res = (gdovlp[:, None, None, :] + gdovlp[None, :, None, ::-1]
                + grid_res).reshape(-1, 2)

    # discard off-grid nodes and nodes outside the cutoff
    dist2 = np.einsum('ij,ij->i', grid_res, grid_res)
    in_grid = ((grid_coords >= 0) & (grid_coords <= reso)).all(axis=1)
    valid, = np.where(in_grid & (dist2 <= cutoff * cutoff))

    # the point index is the fastest-varying axis of the flattened
    # (side, side, n_points) layout, hence the modulo
    weights = feature[valid % n_points] * kernel(np.sqrt(dist2[valid]))

    # flatten grid so we can use bincount: bins first, then weights
    grid_flat = np.ravel_multi_index(grid_coords[valid].T, reso + 1)
    return np.bincount(
        grid_flat, weights, (reso + 1).prod()).reshape(reso + 1)
# Demo: grid unit and cutoff passed to rasterize.
gu = 5
cutoff = 10
# 10,000 random points, stretched by (100, 20) along the two axes ...
coords = np.random.randn(10_000, 2) * (100, 20)
# ... then bent along a sine curve in the second coordinate.
coords[:, 1] += 80 * np.sin(coords[:, 0] / 40)
# One random feature value in [0, 1000) per point.
feature = np.random.uniform(0, 1000, (10_000,))
from timeit import timeit
# timeit returns the total seconds for 100 runs; * 10 converts that to
# milliseconds per run.
print(timeit("rasterize(coords, feature, gu, cutoff)", globals=globals(), number=100)*10, 'ms')
pic = rasterize(coords, feature, gu, cutoff)
import pylab


Poor parallelization using dask

I have a 2D grid on which there is a path. I want to calculate the distance from each point of the grid to each point on the path, then do some operations on those grids. I am using dask.dataframe and dask.array for this task.
The code is:
import dask.dataframe as dd
import dask.array as da
# 10,000 evenly spaced grid coordinates, used for both axes.
x = np.linspace(-60, 60, 10000)
# NOTE(review): `sparse` receives the string 'True' (truthy), not the boolean.
xv, yv = da.meshgrid(x, x, sparse='True')
# 100 random path points, one (x, y) pair per row.
path = da.from_array(np.random.rand(100, 2))
# Constant added in quadrature to every distance in dist_to_point.
h = 100.0
# function to calculate distance to point
def dist_to_point(x, y, p):
    """Return the distance from grid coordinates to point ``p`` at height ``h``.

    ``x`` and ``y`` are the grid coordinate arrays, ``p`` is an indexable
    ``(px, py)`` pair, and ``h`` is read from the enclosing module. The
    result is ``sqrt((x - px)**2 + (y - py)**2 + h**2)`` wrapped in a dask
    dataframe.
    """
    # One square root is enough: the original took sqrt of the planar part
    # only to square it again immediately afterwards.
    d2 = da.sqrt((x - p[0]) ** 2 + (y - p[1]) ** 2 + h ** 2)
    return dd.from_dask_array(d2)
# One distance frame per path point.
# NOTE(review): `npath` is not defined in the visible snippet; presumably the
# number of rows in `path` -- confirm.
distances = [dist_to_point(xv, yv, path[i, :]) for i in range(npath)]
# Concatenate all frames column-wise, discarding the original column labels.
distances_grid = dd.multi.concat(distances, axis=1, ignore_index=True)
So distances_grid should be the concatenation of [grid distance to point 1, grid distance to point 2, ..., grid distance to point 100].
Now suppose I want to get the max across all dataframes I apply this
l_max = distances_grid.map_partitions(lambda x: x.groupby(level=0, axis=1).max())
The dask graph for this looks like this which to me does not look like proper parallelization of the tasks. Can anyone help point me to what I am doing wrong or how I can improve this? My final application will be on 100000x100000 grids hence the use of dask
In case anyone runs into this: I solved it by broadcasting the arrays and avoiding the for loop altogether. The code I ended up using is
# Grid coordinates as a chunked dask array (used for both axes).
x = da.from_array(np.linspace(-60, 60, 10000), chunks=1000)
xv, yv = da.meshgrid(x, x, sparse=True)
# Path points, one (x, y) pair per row.
path = da.from_array(np.random.rand(10, 2))
h = 100.0
ngrid = x.shape[0]
# Per-axis offsets between every grid coordinate and every path point:
# both have shape (ngrid, npath).
xd = x[:, np.newaxis] - path[:, 0]
yd = x[:, np.newaxis] - path[:, 1]
# Squared Euclidean distance at height h, broadcast to (ngrid, ngrid, npath).
z = xd**2 + yd[:, np.newaxis]**2 + h**2
distances_grid = z**0.5
# Maximum over the path-point axis.
l_max = distances_grid.max(axis=2)
This gave me a nicer graph which I am able to balance even more by changing the sizes of the chunks.

How can I avoid using a loop in this specific snippet of python code?

I have a specific python issue, that desperately needs to be sped up by avoiding the use of a loop, yet, I am at a loss as to how to do this. I need to read in a fits image, convert this to a numpy array (roughly, 2000 x 2000 elements in size), then for each element compute the statistics of a ring of elements around it.
As I have my code now, the statistics of the ring around the element is computed with a function using masks. This is fast but, of course, I call this function 2000x2000 times (the slow part).
I am relatively new to python. I think that using the mask function is clever, but I cannot find a way around individually addressing each element. Best of thanks for any help you can provide.
# First, the function computing the statistics within a ring
# around the central pixel:
# flux = image intensity at pixel (i,j)
# rad1, rad2 = inner and outer radii
# array = image array
def snr(flux, i, j, rad1, rad2, array):
    """Return the ring-based noise estimate for pixel ``(i, j)``.

    The ring consists of all in-bounds pixels whose distance from
    ``(i, j)`` lies in ``[rad1, rad2]``. The result is
    ``0.6052697 * |N * flux - sum(ring)|`` where ``N`` is the number of ring
    pixels.

    Parameters
    ----------
    flux : number
        Image intensity at pixel ``(i, j)``.
    i, j : int
        Indices of the central pixel along the first and second axis.
    rad1, rad2 : number
        Inner and outer ring radii in pixels (both inclusive).
    array : numpy.ndarray, 2-D
        The image.
    """
    n_rows, n_cols = array.shape
    # Offsets of every pixel relative to the centre, as open (broadcastable)
    # grids.
    row_off, col_off = np.ogrid[-i:n_rows - i, -j:n_cols - j]
    dist2 = row_off * row_off + col_off * col_off
    mask = (dist2 >= rad1 * rad1) & (dist2 <= rad2 * rad2)
    n_mask = np.count_nonzero(mask)
    # Use the array's own reduction instead of the built-in sum, which
    # iterates element by element in Python.
    return 0.6052697 * abs(n_mask * flux - array[mask].sum())
# Now, the call to snr for each pixel in the array data1:
frame1 =, mode='readonly') # read in fits file
data1 = frame1[ext].data # convert to np array
ny, nx = data1.shape # array dimensions
# NOTE(review): bare `zeros` -- presumably numpy.zeros brought in by an import
# that is not visible here.
noise1 = zeros((ny, nx), float) # empty array
r1 = 5 # inner radius (pixels)
r2 = 7 # outer radius (pixels)
# The function is fast, but calling it 2k x 2k times is not:
# NOTE(review): `i` runs over nx but indexes the first axis (length ny), which
# is only consistent for square images; and the callee is named `der_snr`
# while the function shown above is `snr` -- confirm both.
for j in range(ny):
    for i in range(nx):
        noise1[i,j] = der_snr(data1[i,j], i, j, r1, r2, data1)
The operation that you are trying to do can be expressed as an image convolution. Try something like this:
import numpy as np
import scipy.ndimage
from import fits
def make_kernel(inner_radius, outer_radius):
    """Build a boolean ring-shaped kernel.

    Parameters
    ----------
    inner_radius, outer_radius : int
        Inner and outer ring radii in pixels; both bounds are inclusive.

    Returns
    -------
    numpy.ndarray of bool, shape ``(2 * outer_radius + 1,) * 2``
        True where the squared distance from the centre lies in
        ``[inner_radius**2, outer_radius**2]``.

    Raises
    ------
    ValueError
        If ``inner_radius`` is larger than ``outer_radius``.
    """
    if inner_radius > outer_radius:
        raise ValueError(
            "inner_radius ({}) must not exceed outer_radius ({})".format(
                inner_radius, outer_radius))
    x, y = np.ogrid[-outer_radius:outer_radius + 1,
                    -outer_radius:outer_radius + 1]
    r2 = x * x + y * y
    return (r2 >= inner_radius * inner_radius) & (r2 <= outer_radius * outer_radius)
in_frame = '<file path>'
ext = '...'
# NOTE(review): the next line is truncated in this copy (the call before
# `, mode='readonly')` is missing); presumably it opens `in_frame` with the
# imported `fits` module -- confirm against the original.
frame1 =, mode='readonly')
data1 = frame1[ext].data
inner_radius = 5
outer_radius = 7
# Boolean ring mask and the number of pixels it contains.
kernel = make_kernel(inner_radius, outer_radius)
n_kernel = np.count_nonzero(kernel)
# Sum of the ring pixels around every image pixel in one pass.
# NOTE(review): mode='constant' pads outside the image, so near the borders
# n_kernel still counts the full ring -- confirm that this is acceptable.
conv = scipy.ndimage.convolve(data1, kernel, mode='constant')
noise1 = 0.6052697 * np.abs(n_kernel * data1 - conv)

Need to speed up very slow loop for image manipulation on Python

I am currently completing a program in Python (3.6) as per an internal requirement. As part of it, I have to loop through a colour image (3 bytes per pixel: R, G & B) and distort the image pixel by pixel.
I have the same code in other languages (C++, C#), and non-optimized code executes in about two seconds, while optimized code executes in less than a second. By non-optimized code I mean that the matrix multiplication is performed by a 10 line function I implemented. The optimized version just uses external libraries for multiplication.
In Python, this code takes close to 300 seconds. I can't think of a way to vectorize this logic or speed it up, as there are a couple of "if"s inside the nested loop. Any help would be greatly appreciated.
import numpy as np
#for test purposes:
#roi = rect.rect(0, 0, 1200, 1200)
#input = DCImage.DCImage(1200, 1200, 3)
#correctionImage = DCImage.DCImage(1200,1200,3)
#siteToImage= np.zeros((3,3), np.float32)
#worldToSite= np.zeros ((4, 4))
#r11 = r12 = r13 = r21 = r22 = r23 = r31 = r32 = r33 = 0.0
#xMean = yMean = zMean = 0
#tx = ty = tz = 0
#epsilon = np.finfo(float).eps
#fx = fy = cx = cy = k1 = k2 = p1 = p2 = 0
# NOTE(review): indentation was lost in this copy; the nesting below is a
# reconstruction -- confirm against the original.
# Visit every pixel of the region of interest.
for i in range (roi.x, roi.x + roi.width):
    for j in range (roi.y , roi.y + roi.height):
        # Only pixels that are exactly [255, 0, 0] are processed.
        if ( (input.pixels [i] [j] == [255, 0, 0]).all()):
            #Coordinates conversion
            siteMat = np.matmul(siteToImage, [i, j, 1])
            world =np.matmul(worldToSite, [siteMat[0], siteMat[1], 0.0, 1.0])
            # Offsets relative to the mean position.
            # NOTE(review): `z_ortho` is not defined in the visible snippet.
            xLocal = world[0] - xMean
            yLocal = world[1] - yMean
            zLocal = z_ortho - zMean
            #From World to camera
            xCam = r11*xLocal + r12*yLocal + r13*zLocal + tx
            yCam = r21*xLocal + r22*yLocal + r23*zLocal + ty
            zCam = r31*xLocal + r32*yLocal + r33*zLocal + tz
            # Divide by depth unless it is within epsilon of zero.
            if (zCam > epsilon or zCam < -epsilon):
                xCam = xCam / zCam
                yCam = yCam / zCam
            # Terms built from k1, k2 (powers of r2) and p1, p2 (cross terms);
            # presumably a lens distortion model -- confirm.
            r2 = xCam*xCam + yCam*yCam
            a1 = 2*xCam*yCam
            a2 = r2 + 2*xCam*xCam
            a3 = r2 + 2*yCam*yCam
            cdist = 1 + k1*r2 + k2*r2*r2
            # Scale by fx/fy, shift by cx/cy; + 0.5 before int() truncation.
            u = int((xCam * cdist + p1 * a1 + p2 * a2) * fx + cx + 0.5)
            v = int((yCam * cdist + p1 * a3 + p2 * a1) * fy + cy + 0.5)
            # Copy the pixel only when (u, v) falls inside correctionImage.
            if (u>=0 and u<correctionImage.width and v>=0 and v < correctionImage.height):
                input.pixels [i] [j] = correctionImage.pixels [u][v]
You normally vectorize this kind of thing by making a displacement map.
Make a complex image where each pixel has the value of its own coordinate, apply the usual math operations to compute whatever transform you want, then apply the map to your source image.
For example, in pyvips you might write:
import sys
import pyvips
image = pyvips.Image.new_from_file(sys.argv[1])
# this makes a two-band image where every pixel holds its own coordinate:
# pixel (0, 0) at the top-left has value [0, 0], and the bottom-right pixel
# has value [image.width - 1, image.height - 1]
index = pyvips.Image.xyz(image.width, image.height)
# make a version with (0, 0) at the centre, negative values up and left,
# positive down and right
centre = index - [image.width / 2, image.height / 2]
# to polar space, so each pixel is now distance and angle in degrees
polar = centre.polar()
# scale sin(distance) by 1/distance to make a wavey pattern
d = 10000 * (polar[0] * 3).sin() / (1 + polar[0])
# and back to rectangular coordinates again to make a set of vectors we can
# apply to the original index image
distort = index + d.bandjoin(polar[1]).rect()
# distort the image
distorted = image.mapim(distort)
# pick pixels from either the distorted image or the original, depending on some
# condition; the per-pixel conditions must be combined with `|`, because
# Python's `or` works on whole objects and would just return its first operand
result = ((d.abs() > 10) | (image[2] > 100)).ifthenelse(distorted, image)
That's just a silly wobble pattern, but you can swap it for any distortion you want. Then run as:
$ /usr/bin/time -f %M:%e ./ ~/pics/horse1920x1080.jpg x.jpg
300ms and 55MB of memory on this two-core, 2015 laptop to make:
After much testing, the only way to speed up the function without writing it in C++ was disassembling it and vectorizing it. The way to do it in this particular instance is to create an array with the valid indexes at the beginning of the function and use them as tuples to index the final solution.
# Copy the region of interest from the input pixels into subArray.
subArray[roi.y:roi.y+roi.height,roi.x:roi.x+roi.width,] = input.pixels[roi.y:roi.y+roi.height,roi.x:roi.x+roi.width,]
#Calculate valid XY indexes
# Row/column indices of every pixel whose last axis equals [255, 0, 0].
y_index, x_index = np.where(np.all(subArray== np.array([255,0,0]), axis=-1))
#do stuff
# NOTE(review): `i` and `j` are not defined in the visible snippet; presumably
# they are the per-pixel target coordinates computed in the elided step.
#Join result values with XY indexes
ij_xy = np.column_stack((i, j, y_index, x_index))
#Only keep valid ij values
# Keep rows whose first two columns fall inside correctionImage.
valids_ij_xy = ij_xy [(ij_xy [:,0] >= 0) & (ij_xy [:,0] < correctionImage.height) & (ij_xy [:,1] >= 0) & (ij_xy [:,1] < correctionImage.width)]
#Assign values
# Columns 2-3 index the destination pixels, columns 0-1 the source pixels.
input.pixels [tuple(np.array(valids_ij_xy [:,2:]).T)] = correctionImage.pixels[tuple(np.array(valids_ij_xy [:,:2]).T)]

Integrating 2D data over an irregular grid in python

I have a 2D function which is sampled irregularly over a domain, and I want to calculate the volume underneath the surface. The data is organised in terms of [x,y,z]; taking a simple example:
def f(x, y):
    """Return ``cos(10*x*y) * exp(-x**2 - y**2)``.

    Built from NumPy ufuncs, so ``x`` and ``y`` may be scalars or
    broadcastable arrays.
    """
    return np.cos(10 * x * y) * np.exp(-x**2 - y**2)
# Coarse sampling ranges for x and y.
datrange1 = np.linspace(-5,5,1000)
datrange2 = np.linspace(-0.5,0.5,1000)
# Accumulate [x, y, f(x, y)] rows for every combination.
ar = []
for x in datrange1:
    for y in datrange2:
        ar += [[x,y, f(x,y)]]
# NOTE(review): `xrange2` and `yrange2` are not defined in the visible snippet;
# presumably a second, denser pair of ranges -- confirm.
for x in xrange2:
    for y in yrange2:
        ar += [[x,y, f(x,y)]]
val_arr1 = np.array(ar)
# NOTE(review): without `axis`, np.unique flattens its input and returns the
# sorted unique scalars, so `data.T` is 1-D and the three-way unpack below
# presumably fails; `axis=0` (unique rows) looks like the intent -- confirm.
data = np.unique(val_arr1)
xlist, ylist, zlist = data.T
where np.unique sorts the data in the first column then the second. The data is arranged in this way as I need to sample more heavily around the origin as there is a sharp feature that must be resolved.
Now I wondered about constructing a 2D interpolating function using scipy.interpolate.interp2d, then integrating over this using dblquad. As it turns out, this is not only inelegant and slow, but also kicks out the error:
RuntimeWarning: No more knots can be added because the number of B-spline
coefficients already exceeds the number of data points m.
Is there a better way to integrate data arranged in this fashion or overcoming this error?
If you can sample the data with high enough resolution around the feature of interest, then more sparsely everywhere else, the problem definition then becomes how to define the area under each sample. This is easy with regular rectangular samples, and could likely be done stepwise in increments of resolution around the origin. The approach I went after is to generate the 2D Voronoi cells for each sample in order to determine their area. I pulled most of the code from this answer, as it had almost all the components needed already.
import numpy as np
from scipy.spatial import Voronoi
#taken from: #
#computes voronoi regions bounded by a bounding box
def square_voronoi(xy, bbox): #bbox: (min_x, max_x, min_y, max_y)
    """Compute Voronoi regions clipped to an axis-aligned bounding box.

    The points inside the box are mirrored across each of its four edges
    before the diagram is computed, so that the cells of the original points
    are closed off along the box edges.

    Parameters
    ----------
    xy : array-like, shape (n, 2)
        Sample coordinates.
    bbox : sequence of 4 numbers
        ``(min_x, max_x, min_y, max_y)``.

    Returns
    -------
    scipy.spatial.Voronoi
        The diagram of the original plus mirrored points, with two extra
        attributes: ``filtered_points`` (the points inside the box) and
        ``filtered_regions`` (the vertex-index list of each such point's
        cell, in the same order).
    """
    xy = np.asarray(xy, dtype=float)
    # Select points inside the bounding box. The last term previously compared
    # bbox[2] with bbox[3], so the upper y bound was never applied.
    inside = ((bbox[0] <= xy[:, 0]) & (xy[:, 0] <= bbox[1])
              & (bbox[2] <= xy[:, 1]) & (xy[:, 1] <= bbox[3]))
    points_center = xy[inside]
    # Mirror points across each edge of the box
    points_left = np.copy(points_center)
    points_left[:, 0] = bbox[0] - (points_left[:, 0] - bbox[0])
    points_right = np.copy(points_center)
    points_right[:, 0] = bbox[1] + (bbox[1] - points_right[:, 0])
    points_down = np.copy(points_center)
    points_down[:, 1] = bbox[2] - (points_down[:, 1] - bbox[2])
    points_up = np.copy(points_center)
    points_up[:, 1] = bbox[3] + (bbox[3] - points_up[:, 1])
    points = np.concatenate(
        (points_center, points_left, points_right, points_down, points_up),
        axis=0)
    # Compute Voronoi
    vor = Voronoi(points)
    # The centre points come first in `points`, so the first
    # len(points_center) entries of point_region belong to them.
    regions = [vor.regions[vor.point_region[i]]
               for i in range(len(points_center))]
    vor.filtered_points = points_center
    vor.filtered_regions = regions
    return vor
#also stolen from:
def area_region(vertices):
    """Return the area of a closed polygon via the shoelace formula.

    Parameters
    ----------
    vertices : array-like, shape (m, 2)
        Polygon vertices in order, with the first vertex repeated at the end
        (only consecutive pairs are summed; no wrap-around edge is added).

    Returns
    -------
    float
        The absolute (orientation-independent) area; 0.0 when fewer than two
        vertices are given.
    """
    vertices = np.asarray(vertices, dtype=float)
    if vertices.shape[0] < 2:
        return 0.0
    x, y = vertices[:, 0], vertices[:, 1]
    # Sum of the cross products of consecutive vertex pairs, done in one
    # vectorised pass instead of a Python loop.
    signed = np.sum(x[:-1] * y[1:] - x[1:] * y[:-1])
    return np.abs(0.5 * signed)
def f(x, y):
    """Return ``cos(10*x*y) * exp(-x**2 - y**2)``.

    Built from NumPy ufuncs, so ``x`` and ``y`` may be scalars or
    broadcastable arrays.
    """
    return np.cos(10 * x * y) * np.exp(-x**2 - y**2)
#sampling could easily be shaped to sample origin more heavily
# 1000 uniformly random samples over x in [-5, 5) and y in [-0.5, 0.5).
sample_x = np.random.rand(1000) * 10 - 5 #same range as example linspace
sample_y = np.random.rand(1000) - .5
sample_xy = np.array([sample_x, sample_y]).T
vor = square_voronoi(sample_xy, (-5,5,-.5,.5)) #using bbox from samples
points = vor.filtered_points
# Area of each sample's cell; the first vertex is appended again so that
# area_region receives a closed polygon.
sample_areas = np.array([area_region(vor.vertices[verts+[verts[0]],:]) for verts in vor.filtered_regions])
# f is elementwise, so evaluate all samples in one call rather than looping.
sample_z = f(points[:, 0], points[:, 1])
# Volume estimate: each sample's height times the area of its cell.
volume = np.sum(sample_z * sample_areas)
I haven't exactly tested this, but the principle should work, and the math checks out.

How do I maximally separate n points in a cube in Python? How do I use scipy.integrate.ode?

I need to find n points in the unit cube that roughly maximizes their separation. I am currently looking at writing this problem as an ODE and using scipy.integrate.ode to do the work. That is, doing a simulation whereby each of the particles repel each other along with the walls of the cube. I don't care too much about efficiency.
Unfortunately I can't get it to work for problems bigger than n=5:
import numpy as np
def spread_points_in_cube(n, dimensions=3, rng=None, t_max=40000, dt=1):
    """Spread ``n`` points in the unit cube by simulating mutual repulsion.

    The points start at random positions with zero velocity and repel each
    other with an inverse-square force plus velocity damping. Positions are
    clipped to ``[0, 1]`` and forces/velocities pointing out of the cube are
    zeroed at the walls. The system is integrated with
    ``scipy.integrate.ode`` (``vode``, Adams method).

    Parameters
    ----------
    n : int
        Number of points.
    dimensions : int
        Dimensionality of the cube.
    rng : object with a ``uniform(size=...)`` method, optional
        Random source; defaults to ``numpy.random``.
    t_max : float
        Simulation end time (was hard-coded to 40000).
    dt : float
        Time increment per ``integrate`` call (was hard-coded to 1).

    Returns
    -------
    numpy.ndarray, shape (n, dimensions)
        Final positions, clipped to the unit cube.
    """
    from scipy.integrate import ode

    if rng is None:
        rng = np.random
    size = n * dimensions
    # State vector: positions in the first half, velocities in the second.
    y0 = np.zeros(2 * size)
    y0[:size] = rng.uniform(size=size)
    t0 = 0.0

    def clip_to_wall(positions, forces):
        # On the lower wall only non-negative components are kept, on the
        # upper wall only non-positive ones; elsewhere forces pass through.
        forces = np.where(positions == 0.0,
                          np.clip(forces, 0, np.inf),
                          forces)
        forces = np.where(positions == 1.0,
                          np.clip(forces, -np.inf, 0.0),
                          forces)
        return forces

    def decode(y):
        positions = np.clip(y[:size].reshape((n, dimensions)), 0, 1)
        velocities = clip_to_wall(positions, y[size:].reshape((n, dimensions)))
        return positions, velocities

    def f(t, y):
        retval = np.zeros(2 * size)
        positions, velocities = decode(y)
        delta_positions = positions[:, np.newaxis, :] - positions[np.newaxis, :, :]
        distances = np.linalg.norm(delta_positions, axis=2, ord=2)
        # Avoid division by zero on the diagonal (and for coincident points).
        distances += 1e-5
        # delta / |delta|**3, i.e. an inverse-square repulsion.
        pairwise_forces = delta_positions * (distances ** -3)[:, :, np.newaxis]
        forces = np.sum(pairwise_forces, axis=1)
        # Velocity damping.
        forces -= 0.9 * velocities
        forces = clip_to_wall(positions, forces)
        retval[:size] = velocities.reshape(size)
        retval[size:] = forces.reshape(size)
        return retval

    r = ode(f).set_integrator('vode', method='adams')
    r.set_initial_value(y0, t0)
    while r.successful() and r.t < t_max:
        r.integrate(r.t + dt)
    return decode(r.y)[0]
Poisson disk sampling is a linear-time algorithm for doing this. Reference:
Bridson, Robert. "Fast Poisson disk sampling in arbitrary dimensions." ACM SIGGRAPH. Vol. 2007. 2007.

