2D lat/lon KernelDensity Estimator for sklearn - python

Using KernelDensity from sklearn.neighbors, I am getting density values that are much smaller than I expect: roughly 1/200th of what I would expect.
I have reviewed sklearn's Kernel Density Estimate of Species Distribution, and I went down that path of converting my input lat/lon data to radians and using a haversine distance metric, but I was getting strange results.
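For reference, here is roughly what that haversine attempt looks like (a sketch based on the species-distribution recipe, not my exact code); the haversine metric expects [lat, lon] pairs in radians, so both the column order and the unit conversion are easy places to get "strange results":
import numpy as np
from sklearn.neighbors import KernelDensity

lonlat = np.array([[126.828, 8.0215], [123.0836, 15.8875]])  # (lon, lat) in degrees
latlon_rad = np.radians(lonlat[:, ::-1])                     # -> (lat, lon) in radians
kde = KernelDensity(bandwidth=0.02, metric="haversine",      # bandwidth in radians
                    kernel="gaussian", algorithm="ball_tree")
kde.fit(latlon_rad)
# exp(kde.score_samples(...)) is then a density per unit area on the unit sphere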
I've thought about this a lot, and here are the parameters that make the most sense to me:
bandwidth = 1.0 map units
metric = "euclidean" for my 2D space
kernel = "gaussian" for probability density
Questions
Does this seem like a reasonable approach to this problem?
Why are the density values so much smaller than I would expect?
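For context on the magnitudes: exp(score_samples(...)) is a normalized probability density, so it should integrate to about 1 over the whole plane, which already forces small per-unit-area values when the points span the globe. A minimal, self-contained check:
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
pts = rng.uniform(-10, 10, size=(100, 2))
kde = KernelDensity(bandwidth=1.0, kernel="gaussian").fit(pts)
xs = np.linspace(-15, 15, 301)
xx, yy = np.meshgrid(xs, xs)
dens = np.exp(kde.score_samples(np.c_[xx.ravel(), yy.ravel()]))
print(dens.sum() * (xs[1] - xs[0])**2)  # ~1.0: the density is normalized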
Here is my function and the parameters that I pass to it.
import rasterio
from rasterio.crs import CRS
from sklearn.neighbors import KernelDensity
import numpy as np

def kernel_density_lat_lon(positions, bandwidth, metric, kernel,
                           cell_size, extent, output_raster, multiplier=None):
    # NOTE: multiplier is accepted but never applied inside this function
    # Set the bounds of the output raster based on the extent
    x_min = extent[0]
    x_max = extent[1]
    y_min = extent[2]
    y_max = extent[3]
    # Create arrays, based on cell_size and bounds
    # These arrays hold x locations and y locations for each pixel in the output raster
    x = np.arange(x_min, x_max, cell_size)
    y = np.arange(y_min, y_max, cell_size)
    # Create a meshgrid, which has cells whose values are the (x,y) location at each cell
    xx, yy = np.meshgrid(x, y)
    # Pair the x locations with y locations
    xys = np.vstack((xx.ravel(), yy.ravel())).T
    # Create a density map
    x_shape = xx.shape
    # Get the kernel density estimator
    kde = KernelDensity(bandwidth=bandwidth, metric=metric,
                        kernel=kernel, algorithm='ball_tree')
    # Fit it to the coordinate pairs
    _ = kde.fit(positions)
    # Evaluate
    z = np.exp(kde.score_samples(xys))
    print(np.max(z))
    zi = np.arange(xys.shape[0])
    # Plug densities into grid
    zg = -9999 + np.zeros(xys.shape[0])
    zg[zi] = z
    xyz = np.hstack((xys[:, :2], zg[:, None]))
    # Get the density values arranged on the grid
    z = xyz[:, 2].reshape(x_shape)
    temp = z[::-1, :]
    output_arr = temp.reshape(-1, temp.shape[0], temp.shape[1])
    # Write the densities to a raster
    with rasterio.open(
        output_raster,
        'w',
        driver='GTiff',
        height=output_arr.shape[1],
        width=output_arr.shape[2],
        dtype=output_arr.dtype,
        crs=CRS.from_epsg(4326),
        count=1,
        transform=rasterio.transform.from_bounds(x_min, y_min, x_max, y_max,
                                                 output_arr.shape[2], output_arr.shape[1])
    ) as dst:
        dst.write(output_arr)
if __name__ == "__main__":
    positions = [[126.82800884821953, 8.021550450814345],
[123.0835913004416, 15.887493017360754],
[122.87172138544588, 15.155979776107289],
[122.48465193221716, 15.233649683534475],
[122.26320643954872, 16.71625103407011],
[122.13275884500477, 15.941644592949958],
[120.63772441542471, 7.078277119741588],
[120.57180822188472, 7.537689414917545],
[119.53047809084589, 1.396741864447578],
[119.51652407635684, 1.7028166423529711],
[119.35538543402562, 7.795232293743844],
[119.35371605376332, 1.7139590065581176],
[118.21983976700818, 0.2725608428591114],
[116.32507063966972, -2.0478066628388163],
[115.9455871941716, -2.2758686356158915],
[110.54879990595637, 4.849182291868757],
[109.00373897612512, 12.330559666134512],
[108.56317006080423, 23.10356852435795],
[107.95374212609899, -3.878293744564539],
[107.6618148392204, -4.215545933851648],
[107.39598092145678, -3.3557991558597426],
[107.38347877309276, -4.243848824653475],
[107.3802332907293, -4.724984303635246],
[106.92298020128571, 3.3377440975999058],
[106.8467663232349, -3.427384435159751],
[106.6198566766759, 3.327030211530555],
[106.59035576911651, 3.409433089119516],
[106.48649132403538, 3.5936300679047966],
[106.2879431146126, 3.039670857739856],
[105.96323043582797, 2.5103916023650656],
[105.9540323861389, 2.596746532847891],
[105.80111748849575, 3.388380151516756],
[105.62119198430719, 3.2169296961449554],
[105.43276377101233, 2.6840109661437204],
[105.29236334314527, 2.420170430982717],
[104.94141265184744, 3.091707354213681],
[103.08902291491331, 3.1932135322924133],
[102.59488296531873, 14.93503092216549],
[100.7213889691745, 5.834246665586201],
[100.70491932538964, 5.2594820067014245],
[100.51665775078591, 6.0369426594605855],
[100.51156199546038, 5.491942119998682],
[100.45311457726862, 5.281343969279209],
[99.984116880471, 5.658350660638604],
[93.51170627287425, 24.024373245961645],
[93.34991893283902, 23.04050533807432],
[84.93884193888668, 19.384547030288207],
[84.30999142795147, 18.825326243832105],
[84.1630944193751, 19.06013889689632],
[83.80094785724114, 18.57306909774846],
[74.16321921976069, 23.579347585345776],
[72.4113965790803, 21.875517403679595],
[49.40472412468231, 32.2487630729451],
[42.90510332039255, -12.821849510976579],
[42.408207428324495, -12.31050970009727],
[42.36825610793828, -13.083052941231413],
[42.30285486383656, -12.234780003717532],
[15.328057669295298, -7.460883355600632],
[14.631592099379093, -7.440778982157976],
[14.563929300312948, -7.140268202440664],
[14.446656807020666, -4.699494598106393],
[14.188788859460905, -6.430418645148537],
[13.44490187975298, -2.8654279482460323],
[13.301089335672936, -2.593387816196834],
[13.131727857324034, -3.412434046655619],
[11.637624067618695, 5.306602656962694],
[11.537324701566494, 1.5773310360579327],
[11.056051828014489, 5.372994263069668],
[10.981944105212998, 6.05789466930291],
[10.978615683124655, 5.7586879077143225],
[10.384229532923067, 2.6509917300959476],
[10.293978958054748, 5.6087142487617045],
[9.724503564938162, 5.965801337392755],
[9.228154036572047, 6.4564328707855605],
[8.847083818460739, 4.696640992862242],
[8.724622829999017, 5.5476494764785516],
[8.483278678008926, 6.612624047942372],
[8.44366045716664, 6.2122982089038725],
[8.4255624128847, 4.755664077859387],
[8.11860899795907, 5.659724263701104],
[7.912362077517271, 4.87480562915889],
[7.563449250527216, 2.842579773546474],
[7.2608575851074, 5.16577516485171],
[7.004069229900638, 3.5416918941072804],
[6.9915716303567494, 3.7362296571866294],
[6.468876406999725, 5.010859767233725],
[6.203147917904825, 4.992482439632923],
[5.4017709770599325, 7.676092696459705],
[5.350100368207385, 7.762605113995827],
[5.279221956366327, 4.915935839020336],
[5.213104554080347, 8.281676925077297],
[5.1108484406102805, 7.9040681892696485],
[5.059337403465768, 8.140534352024792],
[4.861618772269268, 8.322655646328752],
[4.80376638793241, 8.062341031849334],
[4.665446704573248, 7.477404025788393],
[4.6477402888853145, 7.797020093234158],
[4.609044098910636, 38.765860093618905],
[4.555126307535386, 7.873929016757312],
[4.4195324599539845, 7.394848626095032],
[4.400283930670644, 8.038284539940614],
[4.347819621721147, 8.443859742876246],
[4.240704264765369, 6.955830447603886],
[4.227870824209585, 5.751072313355475],
[4.033821062618696, 7.0740805209122595],
[3.665972118522844, 6.545536856751896],
[3.4165849005141005, 7.191717476638518],
[3.121450235674562, 8.103710628355616],
[1.8057346437941182, 1.3314371195302515],
[0.21998421850813876, 6.744306925430884],
[-12.310298533627448, 11.362835062050264],
[-49.352317054841336, 2.010101652464972],
[-49.56587070660965, 1.366869361066606],
[-49.5821860267535, 1.824258170311353],
[-70.58665807820438, 20.03257364630837],
[-70.6803277335339, 19.902301232265422],
[-70.78620439744233, 20.024999949922996],
[-70.86459827149523, 20.273742251629713],
[-71.02033226779315, 19.891866165854587],
[-73.57317798569044, 12.265930473198331],
[-75.32300214385347, -10.734649751468147],
[-75.36631826293349, -10.206201123969526],
[-75.37463804230384, -10.724232696199014],
[-75.40829227919468, -10.817431611704407],
[-75.46984739081694, -10.195876463554633],
[-75.56266706716431, -10.202240256127965],
[-75.74233061116121, -10.647556252995775],
[-75.90503122834087, -10.297561312609464],
[-75.94114020328095, -10.530481915516726],
[-78.13302896559648, -1.2629721839381856],
[-78.42506520505198, -0.6805387090496724],
[-78.68351568134375, -1.1006283268898114],
[-79.09221180056895, -1.5423219306900116],
[-90.05839881111541, 21.022199691388156],
[-91.3208074507767, 20.58263399988673],
[-91.86906142999138, 20.169783366358622],
[-91.89838954465436, 20.49386425203851]]
    bandwidth = 1.0
    cell_size = 0.1
    extent = [-180, 180, -90, 90]
    metric = "euclidean"
    kernel = "gaussian"
    output_raster = metric + "_" + kernel + "_" + str(bandwidth).split(".")[0] + ".tif"
    # The parameters that I think should do the trick
    kernel_density_lat_lon(positions, bandwidth, metric, kernel, cell_size,
                           extent, output_raster)
    # The parameters that get me closest to the desired output
    # This requires multiplying all of the density probabilities by 205...
    bandwidth = 1.0
    cell_size = 0.1
    extent = [-180, 180, -90, 90]
    metric = "euclidean"
    kernel = "epanechnikov"
    multiplier = 205
    output_raster = metric + "_" + kernel + "_" + str(bandwidth).split(".")[0] + ".tif"
    kernel_density_lat_lon(positions, bandwidth, metric, kernel, cell_size,
                           extent, output_raster, multiplier)
I am stumped as to why the density estimates come out so much smaller than I expect. Thanks for your help.

Related

Converting indices in marching cubes to original x,y,z space - visualizing isosurface 3d skimage

I want to draw a volume in x1,x2,x3-space. The volume is an isosurface found by the marching cubes algorithm in skimage. The function generating the volume is pdf_grid = f(x1,x2,x3) and
I want to draw the volume where pdf = 60% max(pdf).
My issue is that the marching cubes algorithm generates vertices and faces, but how do I map those back to the x1, x2, x3-space?
My (rather limited) understanding of marching cubes is that "vertices" refer to the indices in the volume (pdf_grid in my case). If "vertices" contained only the exact indices in the grid this would have been easy, but "vertices" contains floats, not integers. It seems that marching cubes does some interpolation between grid points (according to https://www.cs.carleton.edu/cs_comps/0405/shape/marching_cubes.html), so the question is how to recover exactly the values of x1,x2,x3?
import numpy as np
import scipy.stats
import scipy.optimize
import matplotlib.pyplot as plt

# Make some random data
cov = np.array([[1, .2, -.5],
                [.2, 1.2, .1],
                [-.5, .1, .8]])
dist = scipy.stats.multivariate_normal(mean=[1., 3., 2], cov=cov)
N = 500
x_samples = dist.rvs(size=N).T

# Create the kernel density estimator - approximation of a pdf
kernel = scipy.stats.gaussian_kde(x_samples)
x_mean = x_samples.mean(axis=1)

# Find the mode
res = scipy.optimize.minimize(lambda x: -kernel.logpdf(x),
                              x_mean)  # x0, initial guess
x_mode = res["x"]

num_el = 50  # number of elements in the grid
x_min = np.min(x_samples, axis=1)
x_max = np.max(x_samples, axis=1)
x1g, x2g, x3g = np.mgrid[x_min[0]:x_max[0]:num_el*1j,
                         x_min[1]:x_max[1]:num_el*1j,
                         x_min[2]:x_max[2]:num_el*1j]

pdf_grid = np.zeros(x1g.shape)  # implicit function/grid for the marching cubes
for a in range(x1g.shape[0]):
    for b in range(x1g.shape[1]):
        for c in range(x1g.shape[2]):
            pdf_grid[a, b, c] = kernel(np.array([x1g[a, b, c],
                                                 x2g[a, b, c],
                                                 x3g[a, b, c]]))
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from skimage import measure

iso_level = .6  # draw a volume which contains pdf_val(mode)*60%
level = float(kernel(x_mode)) * iso_level  # kernel() returns a length-1 array
verts, faces, normals, values = measure.marching_cubes(pdf_grid, level)

# How to convert the figure back to x1,x2,x3 space? Below, the output is drawn as-is,
# as in the skimage example
# (https://scikit-image.org/docs/0.16.x/auto_examples/edges/plot_marching_cubes.html),
# so you can see the volume.
# Fancy indexing: `verts[faces]` to generate a collection of triangles
mesh = Poly3DCollection(verts[faces],
                        alpha=.5,
                        label=f"KDE = {iso_level}" + r"$x_{mode}$",
                        linewidth=.1)
mesh.set_edgecolor('k')
fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
c1 = ax.add_collection3d(mesh)
c1._facecolors2d = c1._facecolor3d
c1._edgecolors2d = c1._edgecolor3d

# Plot the samples. The marching cubes volume does not capture these samples
pdf_val = kernel(x_samples)  # get density value for each point (for color-coding)
x1, x2, x3 = x_samples
scatter_plot = ax.scatter(x1, x2, x3, c=pdf_val, alpha=.2, label=r"samples")
ax.scatter(x_mode[0], x_mode[1], x_mode[2], c="r", alpha=.2, label=r"$x_{mode}$")
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")
ax.set_zlabel(r"$x_3$")
# ax.set_box_aspect([np.ptp(i) for i in x_samples])  # equal aspect ratio
cbar = fig.colorbar(scatter_plot, ax=ax)
cbar.set_label(r"$KDE(w) \approx pdf(w)$")
ax.legend()

# Set the axis limits so that the volume and samples are shown.
ax.set_xlim(-5, np.max(verts, axis=0)[0] + 3)
ax.set_ylim(-5, np.max(verts, axis=0)[1] + 3)
ax.set_zlim(-5, np.max(verts, axis=0)[2] + 3)
This is probably way too late of an answer to help the OP, but in case anyone else comes across this post looking for a solution: the issue stems from the marching cubes algorithm outputting its vertices in array space. That space is defined by the number of elements per dimension of the mesh grid, and the algorithm does indeed interpolate in this space (explaining the presence of floats).
Anyway, to transform the vertices back into x1,x2,x3 space you just need to scale and shift them by the appropriate quantities: the range and number of elements of the mesh grid, and the minimum value in each dimension, respectively. Using the variables defined in the OP, the following gives the actual location of the vertices:
verts_actual = verts*((x_max-x_min)/pdf_grid.shape) + x_min
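One refinement worth noting (my addition, not from the original answer): since np.mgrid with num_el*1j includes both endpoints, the exact per-axis step is (x_max - x_min)/(num_el - 1), and skimage's marching_cubes can also apply the scaling for you via its spacing argument:
# Using the question's variables; `spacing` is the per-axis grid step
spacing = (x_max - x_min) / (np.array(pdf_grid.shape) - 1)
verts_actual = verts * spacing + x_min
# Or let skimage scale (the shift by x_min is still needed):
# verts, faces, normals, values = measure.marching_cubes(pdf_grid, level, spacing=tuple(spacing))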

How to smoothen 2D color map in matplotlib

Is there any way to smooth a 2D color map in matplotlib? My code:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm, colors

def map():
    # setup parameters
    j = 0
    N = 719
    N2 = 35
    x = np.linspace(190, 800, N)
    y = np.linspace(10, 360, N2)  # (1,2,3), 1 - start temp, 2 - end temp + 10K, 3 - how many steps to reach it
    A = np.zeros([35, 719])  # [1 2], 1 - number of spectra, 2 - delta wavelength
    # run
    for i in range(10, 360, 10):
        Z = []
        file_no = (str(0) + str(i))[-3:]
        data = np.genfromtxt('C:\\Users\\micha_000\\Desktop\\Measure\\' + '160317_LaPONd_g500_%s_radio.txt' % file_no, skip_header=12)
        for line in data:
            Z.append(line[1] - 6000)
        A[j, :] = Z
        j = j + 1
    X, Y = np.meshgrid(x, y)
    fig, ax = plt.subplots()
    cs = ax.contourf(X, Y, A, cmap=cm.viridis)
    norm = colors.Normalize(vmin=0, vmax=1)
    plt.xlabel('wavelength [nm]')
    plt.ylabel('temperature [K]')
    plt.title('LaPONd_g500')
    cbar = fig.colorbar(cs, norm=norm)
    plt.savefig('C:\\Users\\micha_000\\Desktop\\Measure\\LaPONd_g500_radio_map.png')
    plt.show()
    plt.close()
And here is an example of what I receive:
Is there any way to make it look better by smoothing the pixel transitions?
The problem is not the palette (they are all smooth in matplotlib), but the fact that you are using contourf(), which generates a finite set of contours, each with a single color, and is therefore not smooth. The default is something like 10 contours.
One quick solution: increase the number of contour levels by specifying levels (you can also give an array of which levels to include):
cs = ax.contourf(X, Y, A, cmap=cm.viridis, levels=100)
Better yet, since it seems your data is already on a grid (e.g. X, Y, Z values for each pixel), you should use pcolormesh(X,Y,A) instead of contourf to plot it. That will plot with fully continuous values, rather than steps.
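For illustration, a minimal sketch with synthetic data standing in for the measured spectra; shading="gouraud" additionally interpolates colors between grid points:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(190, 800, 719)
y = np.linspace(10, 360, 35)
X, Y = np.meshgrid(x, y)
A = np.sin(X / 100) * np.cos(Y / 50)  # placeholder for the measured spectra
fig, ax = plt.subplots()
pcm = ax.pcolormesh(X, Y, A, cmap="viridis", shading="gouraud")
fig.colorbar(pcm)
plt.show()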
Open the PNG as an array, and blur it with a mean-value filter. Search "convolution filters" to learn more. Here I've just used a 5x5 (25-pixel) averaging filter, but you could use a Gaussian kernel to make it look smoother.
import numpy as np
from scipy import ndimage, signal, misc
# note: ndimage.imread and misc.imsave were removed in SciPy 1.2+;
# on a recent SciPy, use imageio.imread / imageio.imwrite instead
img = ndimage.imread('C:/.../Zrj50.png')
#I used msPaint to get coords... there's probably a better way
x0, y0, x1, y1 = 87,215,764,1270 #chart area (pixel coords)
#you could use a gaussian filter to get a rounder blur pattern
kernel = np.ones((5,5),)/25 #mean value convolution
#convolve roi with averaging filter
#red
img[x0:x1, y0:y1, 0] = signal.convolve2d(img[x0:x1, y0:y1, 0], kernel, mode='same', boundary='symm')
#green
img[x0:x1, y0:y1, 1] = signal.convolve2d(img[x0:x1, y0:y1, 1], kernel, mode='same', boundary='symm')
#blue
img[x0:x1, y0:y1, 2] = signal.convolve2d(img[x0:x1, y0:y1, 2], kernel, mode='same', boundary='symm')
#do it again for legend area
#...
misc.imsave('C:/.../Zrj50_blurred.png', img)
Using a gaussian instead is pretty easy:
#red
img[x0:x1, y0:y1, 0] = ndimage.gaussian_filter(img[x0:x1, y0:y1, 0], 4, mode='nearest')

Fit the gamma distribution only to a subset of the samples

I have the histogram of my input data (in black) given in the following graph:
I'm trying to fit the Gamma distribution, but not to the whole data, just to the first curve of the histogram (the first mode). The green plot in the graph above corresponds to fitting the Gamma distribution on all the samples, using the following python code which makes use of scipy.stats.gamma:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gamma
from geotiff.io import IO

img = IO.read(input_file)
data = img.flatten() + abs(np.min(img)) + 1
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins, patches = plt.hist(data, 1000, density=True)
# slice histogram here
# estimation of the parameters of the gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(data, floc=0)
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
print('(alpha, beta): (%f, %f)' % (fit_alpha, fit_beta))
# plot estimated model
plt.plot(x, y, linewidth=2, color='g')
plt.show()
How can I restrict the fitting only to the interesting subset of this data?
Update1 (slicing):
I sliced the input data by keeping only values below the max of the previous histogram, but the results were not really convincing:
This was achieved by inserting the following code below the # slice histogram here comment in the previous code:
max_data = bins[np.argmax(n)]
data = data[data < max_data]
Update2 (scipy.optimize.minimize):
The code below shows how scipy.optimize.minimize() is used to minimize an energy function to find (alpha, beta):
import matplotlib.pyplot as plt
import numpy as np
from geotiff.io import IO
from scipy.stats import gamma
from scipy.optimize import minimize

def truncated_gamma(x, max_data, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)

# read image
img = IO.read(input_file)
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins = np.histogram(data, 100, density=True)
# using minimize on a slice of the data below the histogram's peak
max_data = bins[np.argmax(n)]
data = data[data < max_data]
data = np.random.choice(data, 1000)
energy = lambda p: -np.sum(np.log(truncated_gamma(data, max_data, *p)))
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
# plot data histogram and model
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, 0, fit_beta)
plt.hist(data, 30, density=True)
plt.plot(x, y, linewidth=2, color='g')
plt.show()
The algorithm above converged for a subset of data, and the output in o was:
x: array([ 16.66912781, 6.88105559])
But as can be seen on the screenshot below, the gamma plot doesn't fit the histogram:
You can use a general optimization tool such as scipy.optimize.minimize to fit a truncated version of the desired function, resulting in a nice fit:
First, the modified function:
def truncated_gamma(x, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
This selects values from the gamma distribution where x < max_data, and zero elsewhere. The np.where part is not actually important here, because the data is exclusively to the left of max_data anyway. The key is normalization, because varying alpha and beta will change the area to the left of the truncation point in the original gamma.
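In symbols, with $f$ and $F$ the gamma pdf and cdf, the model being fitted is the truncated density

$$ f_{\mathrm{trunc}}(x;\alpha,\beta) = \frac{f(x;\alpha,\beta)}{F(x_{\max};\alpha,\beta)} \quad \text{for } x < x_{\max}, \qquad 0 \text{ otherwise}, $$

and the "energy" below is its negative log-likelihood, $E(\alpha,\beta) = -\sum_i \log f_{\mathrm{trunc}}(x_i;\alpha,\beta)$.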
The rest is just optimization technicalities.
It's common practice to work with logarithms, so I used what's sometimes called "energy", the logarithm of the inverse of the probability density:
energy = lambda p: -np.sum(np.log(truncated_gamma(data, *p)))
Minimize:
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
My output is (alpha, beta): (11.595208, 824.712481). Like the original, it is a maximum likelihood estimate.
If you're not happy with the convergence rate, you may want to
Select a sample from your rather big dataset:
data = np.random.choice(data, 10000)
Try different algorithms using the method keyword argument.
Some optimization routines output a representation of the inverse hessian, which is useful for uncertainty estimation. Enforcement of nonnegativity for the parameters may also be a good idea.
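For instance, a small sketch of that last suggestion (bounds is supported by, e.g., the SLSQP and L-BFGS-B methods):
o = minimize(energy, initial_guess, method='L-BFGS-B',
             bounds=[(1e-6, None), (1e-6, None)])  # keep alpha, beta positive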
A log-scaled plot without truncation shows the entire distribution:
Here's another possible approach, using a manually created dataset in Excel that more or less matches the given plot.
Raw Data
Outline
Import the data into a Pandas dataframe.
Mask the indices after the max response index.
Create a mirror image of the remaining data.
Append the mirror image while leaving a buffer of empty space.
Fit the desired distribution to the modified data. Below I do a normal fit by the method of moments and adjust the amplitude and width; the formula used is spelled out after this list.
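For reference, the width used in the script below is the weighted second moment of the profile,

$$ \sigma = \sqrt{\left|\frac{\sum_i (i-\bar{\imath})^2\, y_i}{\sum_i y_i}\right|}, \qquad \bar{\imath} = \frac{1}{n}\sum_i i, $$

which is exactly what scaling_param computes.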
Working Script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import data to dataframe.
df = pd.read_csv('sample.csv', header=0, index_col=0)
# Mask indices after index at max Y.
mask = df.index.values <= df.Y.argmax()
df = df.loc[mask, :]
scaled_y = 100 * df.Y.values
# Create new df with mirror image of Y appended.
sep = 6
app_zeroes = np.append(scaled_y, np.zeros(sep, dtype=float))
mir_y = np.flipud(scaled_y)
new_y = np.append(app_zeroes, mir_y)
# Using the SciPy cookbook to fit a normal by the method of moments.
idxs = np.arange(new_y.size)  # idxs = [0, 1, 2, ..., len(data)-1]
mid_idxs = idxs.mean()        # len(data)/2
# idxs - mid_idxs runs symmetrically from -len(data)/2 to +len(data)/2
scaling_param = np.sqrt(np.abs(np.sum((idxs - mid_idxs)**2 * new_y) / np.sum(new_y)))
# adjust amplitude
fmax = new_y.max() * 1.2  # adjusted function max to 120% max y
# adjust width
scaling_param = scaling_param * .7  # adjusted by 70%
# Fit normal.
fit = lambda t: fmax * np.exp(-(t - mid_idxs)**2 / (2 * scaling_param**2))
# Plot results.
plt.plot(new_y, '.')
plt.plot(fit(idxs), '--')
plt.show()
Result
See the scipy-cookbook fitting data page for more on fitting a normal using method of moments.

Find an easier way to cluster 2-D scatter data into grid array data

I have figured out a method to cluster scattered point data into a structured 2-D array (like a rasterize function), and I hope there are better ways to achieve that target.
My work
1. Intro
Each of the 1000 data points has three properties (lon, lat, emission), representing one factory located at (x, y) that emits a certain amount of CO2 into the atmosphere.
grid network: a predefined 2-D array of shape 20x20
http://i4.tietuku.com/02fbaf32d2f09fff.png
The code reproduced here:
from mpl_toolkits.basemap import Basemap
from scipy.spatial import KDTree
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### define the map area
xc1, xc2, yc1, yc2 = 113.49805889531724, 115.5030664238035, 37.39995194888143, 38.789235929357105
map = Basemap(llcrnrlon=xc1, llcrnrlat=yc1, urcrnrlon=xc2, urcrnrlat=yc2)

#### read the point data and scatter plot by position
df = pd.read_csv("xxxxx.csv")
px, py = map(df.lon, df.lat)
map.scatter(px, py, color="red", s=5, zorder=3)

#### predefine the grid network
lon_grid, lat_grid = np.linspace(xc1, xc2, 21), np.linspace(yc1, yc2, 21)
lon_x, lat_y = np.meshgrid(lon_grid, lat_grid)
grids = np.zeros(20*20).reshape(20, 20)
plt.pcolormesh(lon_x, lat_y, grids, cmap='gray', facecolor='none', edgecolor='k', zorder=3)
2. My target
Find the nearest grid point for each factory.
Add the factory's emission to that grid cell.
3. Algorithm realization
3.1 Raster grid
note: the 20x20 grid points distributed in this area are represented by blue dots.
http://i4.tietuku.com/8548554587b0cb3a.png
3.2 KD-tree
Find the nearest blue dot for each red point
sh = (20*20, 2)
grids = np.zeros(20*20*2).reshape(*sh)
sh_emission = (20*20)
grids_em = np.zeros(20*20).reshape(sh_emission)
# xx and yy (defined elsewhere) hold the per-axis grid-point coordinates
k = 0
for j in range(0, yy.shape[0], 1):
    for i in range(0, xx.shape[0], 1):
        grids[k] = np.array([lon_grid[i], lat_grid[j]])
        k += 1
T = KDTree(grids)
x_delta = (lon_grid[2] - lon_grid[1])
y_delta = (lat_grid[2] - lat_grid[1])
R = np.sqrt(x_delta**2 + y_delta**2)
for i in range(0, len(df.lon), 1):
    idx = T.query_ball_point([df.lon.iloc[i], df.lat.iloc[i]], r=R)
    # sometimes more than one blue dot is found, so I calculate the
    # distances between the factory (red point) and all listed blue dots
    if len(idx) > 1:
        distance = []
        for k in range(0, len(idx), 1):
            distance.append(np.sqrt((df.lon.iloc[i] - grids[idx[k]][0])**2
                                    + (df.lat.iloc[i] - grids[idx[k]][1])**2))
        pos_index = distance.index(min(distance))
        pos = idx[pos_index]
    # only one point found
    else:
        pos = idx[0]
    grids_em[pos] += df.so2.iloc[i]
4. Result
co2 = grids_em.reshape(20,20)
plt.pcolormesh(lon_x,lat_y,co2,cmap =plt.cm.Spectral_r,zorder=3)
http://i4.tietuku.com/6ded65c4ac301294.png
5. My question
Can someone point out drawbacks or errors in this method?
Are there algorithms better aligned with my target?
Thanks a lot!
There are many for-loops in your code; that's not the numpy way.
Make some sample data first:
import numpy as np
import pandas as pd
from scipy.spatial import KDTree
import pylab as pl
xc1, xc2, yc1, yc2 = 113.49805889531724, 115.5030664238035, 37.39995194888143, 38.789235929357105
N = 1000
GSIZE = 20
x, y = np.random.multivariate_normal([(xc1 + xc2)*0.5, (yc1 + yc2)*0.5], [[0.1, 0.02], [0.02, 0.1]], size=N).T
value = np.ones(N)
df_points = pd.DataFrame({"x":x, "y":y, "v":value})
For equally spaced grids you can use hist2d():
pl.hist2d(df_points.x, df_points.y, weights=df_points.v, bins=20, cmap="viridis");
Here is the output:
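If you only need the binned 20x20 array rather than the plot, np.histogram2d computes the same weighted sums:
H, xedges, yedges = np.histogram2d(df_points.x, df_points.y,
                                   bins=GSIZE, weights=df_points.v)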
Here is the code to use KDTree:
X, Y = np.mgrid[x.min():x.max():GSIZE*1j, y.min():y.max():GSIZE*1j]
grid = np.c_[X.ravel(), Y.ravel()]
points = np.c_[df_points.x, df_points.y]
tree = KDTree(grid)
dist, indices = tree.query(points)
grid_values = df_points.groupby(indices).v.sum()
df_grid = pd.DataFrame(grid, columns=["x", "y"])
df_grid["v"] = grid_values
fig, ax = pl.subplots(figsize=(10, 8))
ax.plot(df_points.x, df_points.y, "kx", alpha=0.2)
mapper = ax.scatter(df_grid.x, df_grid.y, c=df_grid.v,
                    cmap="viridis", linewidths=0,
                    s=100, marker="o")
pl.colorbar(mapper, ax=ax);
the output is:

How can an almost arbitrary plane in a 3D dataset be plotted by matplotlib?

There is an array containing 3D data of shape e.g. (64,64,64). How do you plot a plane, given by a point and a normal (similar to hkl planes in crystallography), through this dataset?
Similar to what can be done in MayaVi by rotating a plane through the data.
The resulting plot will contain non-square planes in most cases.
Can those be done with matplotlib (some sort of non-rectangular patch)?
Edit: I almost solved this myself (see below) but still wonder how non-rectangular patches can be plotted in matplotlib...?
Edit: Due to discussions below I restated the question.
This is funny, a similar question I replied to just today. The way to go is: interpolation. You can use griddata from scipy.interpolate:
Griddata
This page features a very nice example, and the signature of the function is really close to your data.
You still have to somehow define the points on your plane for which you want to interpolate the data. I will have a look at this; my linear algebra lessons were a couple of years ago.
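To make that concrete, a hedged sketch (synthetic stand-in for the dataset; note that griddata treats the cube as an unstructured point cloud, which works but is slow for large regular grids):
import numpy as np
from scipy.interpolate import griddata

data = np.random.rand(16, 16, 16)              # stand-in for the 3-D dataset
pts = np.indices(data.shape).reshape(3, -1).T  # the regular sample points
# Points on a plane through the cube center, spanned by two direction vectors:
p0 = np.array([8.0, 8.0, 8.0])
e1 = np.array([1.0, 0.0, 0.0])
e2 = np.array([0.0, 1.0, 1.0]) / np.sqrt(2)
u, v = np.mgrid[-7:7:15j, -7:7:15j]
plane_pts = p0 + u[..., None] * e1 + v[..., None] * e2
vals = griddata(pts, data.ravel(), plane_pts.reshape(-1, 3), method='linear')
plane = vals.reshape(u.shape)                  # (15, 15) slice through the data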
I have the penultimate solution for this problem, partially solved by using the second answer to Plot a plane based on a normal vector and a point in Matlab or matplotlib:
# coding: utf-8
import numpy as np
from matplotlib.pyplot import imshow, show

A = np.empty((64, 64, 64))  # This is the data array

def f(x, y):
    return np.sin(x/(2*np.pi)) + np.cos(y/(2*np.pi))

xx, yy = np.meshgrid(range(64), range(64))
for x in range(64):
    A[:, :, x] = f(xx, yy) * np.cos(x/np.pi)

N = np.zeros((64, 64))
"""This is the plane we cut from A.
It should be larger than 64, due to diagonal planes being larger.
Will be fixed."""

normal = np.array([-1, -1, 1])  # Define cut plane here. Normal vector components restricted to integers
point = np.array([0, 0, 0])
d = -np.sum(point * normal)

def plane(x, y):  # Get plane's z values
    return (-normal[0]*x - normal[1]*y - d) / normal[2]

def getZZ(x, y):  # Get z for all values x,y. If z>64 it's out of range
    for i in x:
        for j in y:
            if plane(i, j) < 64:
                N[i, j] = A[i, j, int(plane(i, j))]  # cast to int for indexing

getZZ(range(64), range(64))
imshow(N, interpolation="Nearest")
show()
It's not the ultimate solution since the plot is not restricted to points having a z value, planes larger than 64 * 64 are not accounted for and the planes have to be defined at (0,0,0).
For the reduced requirements, I prepared a simple example
import numpy as np
import pylab as plt

data = np.arange(64**3)
data.resize((64, 64, 64))

def get_slice(volume, orientation, index):
    orientation2slicefunc = {
        "x": lambda ar: ar[index, :, :],
        "y": lambda ar: ar[:, index, :],
        "z": lambda ar: ar[:, :, index],
    }
    return orientation2slicefunc[orientation](volume)

plt.subplot(221)
plt.imshow(get_slice(data, "x", 10), vmin=0, vmax=64**3)
plt.subplot(222)
plt.imshow(get_slice(data, "x", 39), vmin=0, vmax=64**3)
plt.subplot(223)
plt.imshow(get_slice(data, "y", 15), vmin=0, vmax=64**3)
plt.subplot(224)
plt.imshow(get_slice(data, "z", 25), vmin=0, vmax=64**3)
plt.show()
This leads to the following plot:
The main trick is the dictionary mapping orientations to lambda-methods, which saves us from writing annoying if-then-else blocks. Of course you can decide to use different names,
e.g., numbers, for the orientations.
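If you'd rather address axes by number, the same dispatch can be a single np.take call (equivalent behavior; the function name here is mine):
def get_slice_by_axis(volume, axis, index):
    # np.take selects `index` along the given axis
    return np.take(volume, index, axis=axis)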
Maybe this helps you.
Thorsten
P.S.: I didn't care about "IndexOutOfRange"; for me it's o.k. to let this exception pop up, since it is perfectly understandable in this context.
I had to do something similar for MRI data enhancement:
The code can probably be optimized, but it works as it is.
My data is a 3-dimensional numpy array representing an MRI scan. It has size [128,128,128], but the code can be modified to accept any dimensions. Also, when the plane is outside the cube boundary, you have to give default values to the variable fill in the main function; in my case I chose: data_cube[0:5,0:5,0:5].mean().
import numpy as np

def create_normal_vector(x, y, z):
    normal = np.asarray([x, y, z])
    normal = normal / np.sqrt(sum(normal**2))
    return normal

def get_plane_equation_parameters(normal, point):
    a, b, c = normal
    d = np.dot(normal, point)
    return a, b, c, d  # ax+by+cz=d

def get_point_plane_proximity(plane, point):
    # just an approximation
    return np.dot(plane[0:-1], point) - plane[-1]

def get_corner_interesections(plane, cube_dim=128):  # to reduce the search space
    # dimension is 128,128,128
    corners_list = []
    only_x = np.zeros(4)
    min_prox_x = 9999
    min_prox_y = 9999
    min_prox_z = 9999
    min_prox_yz = 9999
    for i in range(cube_dim):
        temp_min_prox_x = abs(get_point_plane_proximity(plane, np.asarray([i, 0, 0])))
        # print("pseudo distance x: {0}, point: [{1},0,0]".format(temp_min_prox_x, i))
        if temp_min_prox_x < min_prox_x:
            min_prox_x = temp_min_prox_x
            corner_intersection_x = np.asarray([i, 0, 0])
            only_x[0] = i
        temp_min_prox_y = abs(get_point_plane_proximity(plane, np.asarray([i, cube_dim, 0])))
        # print("pseudo distance y: {0}, point: [{1},{2},0]".format(temp_min_prox_y, i, cube_dim))
        if temp_min_prox_y < min_prox_y:
            min_prox_y = temp_min_prox_y
            corner_intersection_y = np.asarray([i, cube_dim, 0])
            only_x[1] = i
        temp_min_prox_z = abs(get_point_plane_proximity(plane, np.asarray([i, 0, cube_dim])))
        # print("pseudo distance z: {0}, point: [{1},0,{2}]".format(temp_min_prox_z, i, cube_dim))
        if temp_min_prox_z < min_prox_z:
            min_prox_z = temp_min_prox_z
            corner_intersection_z = np.asarray([i, 0, cube_dim])
            only_x[2] = i
        temp_min_prox_yz = abs(get_point_plane_proximity(plane, np.asarray([i, cube_dim, cube_dim])))
        # print("pseudo distance yz: {0}, point: [{1},{2},{2}]".format(temp_min_prox_yz, i, cube_dim))
        if temp_min_prox_yz < min_prox_yz:
            min_prox_yz = temp_min_prox_yz
            corner_intersection_yz = np.asarray([i, cube_dim, cube_dim])
            only_x[3] = i
    corners_list.append(corner_intersection_x)
    corners_list.append(corner_intersection_y)
    corners_list.append(corner_intersection_z)
    corners_list.append(corner_intersection_yz)
    corners_list.append(only_x.min())
    corners_list.append(only_x.max())
    return corners_list

def get_points_intersection(plane, min_x, max_x, data_cube, shape=128):
    fill = data_cube[0:5, 0:5, 0:5].mean()  # this can be a parameter
    extended_data_cube = np.ones([shape+2, shape, shape]) * fill
    extended_data_cube[1:shape+1, :, :] = data_cube
    diag_image = np.zeros([shape, shape])
    min_x_value = 999999
    for i in range(shape):
        for j in range(shape):
            for k in range(int(min_x), int(max_x) + 1):
                current_value = abs(get_point_plane_proximity(plane, np.asarray([k, i, j])))
                # print("current_value: {0}, val: [{1},{2},{3}]".format(current_value, k, i, j))
                if current_value < min_x_value:
                    diag_image[i, j] = extended_data_cube[k, i, j]
                    min_x_value = current_value
            min_x_value = 999999
    return diag_image
The way it works is the following:
you create a normal vector:
for example [5,0,3]
normal1 = create_normal_vector(5, 0, 3)  # this is only to normalize
then you create a point:
(my cube data shape is [128,128,128])
point = [64,64,64]
You calculate the plane equation parameters, [a,b,c,d] where ax+by+cz=d
plane1 = get_plane_equation_parameters(normal1, point)
then to reduce the search space you can calculate the intersection of the plane with the cube:
corners1 = get_corner_interesections(plane1,128)
where corners1 = [intersection [x,0,0],intersection [x,128,0],intersection [x,0,128],intersection [x,128,128], min intersection [x,y,z], max intersection [x,y,z]]
With all these you can calculate the intersection between the cube and the plane:
image1 = get_points_intersection(plane1,corners1[-2],corners1[-1],data_cube)
Some examples:
normal is [1,0,0] point is [64,64,64]
normal is [5,1,0],[5,1,1],[5,0,1] point is [64,64,64]:
normal is [5,3,0],[5,3,3],[5,0,3] point is [64,64,64]:
normal is [5,-5,0],[5,-5,-5],[5,0,-5] point is [64,64,64]:
Thank you.
The other answers here do not appear to be very efficient with explicit loops over pixels or using scipy.interpolate.griddata, which is designed for unstructured input data. Here is an efficient (vectorized) and generic solution.
There is a pure numpy implementation (for nearest-neighbor "interpolation") and one for linear interpolation, which delegates the interpolation to scipy.ndimage.map_coordinates. (The latter function probably didn't exist in 2013, when this question was asked.)
import numpy as np
from scipy.ndimage import map_coordinates

def slice_datacube(cube, center, eXY, mXY, fill=np.nan, interp=True):
    """Get a 2D slice from a 3-D array.

    Copyright: Han-Kwang Nienhuys, 2020.
    License: any of CC-BY-SA, CC-BY, BSD, GPL, LGPL
    Reference: https://stackoverflow.com/a/62733930/6228891

    Parameters:
    - cube: 3D array, assumed shape (nx, ny, nz).
    - center: shape (3,) with coordinates of center; can be float.
    - eXY: unit vectors, shape (2, 3) - for X and Y axes of the slice.
      (unit vectors must be orthogonal; normalization is optional).
    - mXY: size tuple of output array (mX, mY) - int.
    - fill: value to use for out-of-range points.
    - interp: whether to interpolate (rather than using 'nearest').

    Return:
    - slice: array, shape (mX, mY).
    """
    center = np.array(center, dtype=float)
    assert center.shape == (3,)
    eXY = np.array(eXY)/np.linalg.norm(eXY, axis=1)[:, np.newaxis]
    if not np.isclose(eXY[0] @ eXY[1], 0, atol=1e-6):
        raise ValueError('eX and eY not orthogonal.')
    # R: rotation matrix: data_coords = center + R @ slice_coords
    eZ = np.cross(eXY[0], eXY[1])
    R = np.array([eXY[0], eXY[1], eZ], dtype=np.float32).T
    # setup slice points P with coordinates (X, Y, 0)
    mX, mY = int(mXY[0]), int(mXY[1])
    Xs = np.arange(0.5-mX/2, 0.5+mX/2)
    Ys = np.arange(0.5-mY/2, 0.5+mY/2)
    PP = np.zeros((3, mX, mY), dtype=np.float32)
    PP[0, :, :] = Xs.reshape(mX, 1)
    PP[1, :, :] = Ys.reshape(1, mY)
    # Transform to data coordinates (x, y, z) - idx.shape == (3, mX, mY)
    if interp:
        idx = np.einsum('il,ljk->ijk', R, PP) + center.reshape(3, 1, 1)
        slice = map_coordinates(cube, idx, order=1, mode='constant', cval=fill)
    else:
        idx = np.einsum('il,ljk->ijk', R, PP) + (0.5 + center.reshape(3, 1, 1))
        idx = idx.astype(np.int16)
        # Find out which coordinates are out of range - shape (mX, mY)
        badpoints = np.any([
            idx[0, :, :] < 0,
            idx[0, :, :] >= cube.shape[0],
            idx[1, :, :] < 0,
            idx[1, :, :] >= cube.shape[1],
            idx[2, :, :] < 0,
            idx[2, :, :] >= cube.shape[2],
            ], axis=0)
        idx[:, badpoints] = 0
        slice = cube[idx[0], idx[1], idx[2]]
        slice[badpoints] = fill
    return slice
# Demonstration
nx, ny, nz = 50, 70, 100
cube = np.full((nx, ny, nz), np.float32(1))
cube[nx//4:nx*3//4, :, :] += 1
cube[:, ny//2:ny*3//4, :] += 3
cube[:, :, nz//4:nz//2] += 7
cube[nx//3-2:nx//3+2, ny//2-2:ny//2+2, :] = 0  # black dot

Rz, Rx = np.pi/6, np.pi/4  # rotation angles around z and x
cz, sz = np.cos(Rz), np.sin(Rz)
cx, sx = np.cos(Rx), np.sin(Rx)
Rmz = np.array([[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]])
Rmx = np.array([[1, 0, 0], [0, cx, -sx], [0, sx, cx]])
eXY = (Rmx @ Rmz).T[:2]

slice = slice_datacube(
    cube,
    center=[nx/3, ny/2, nz*0.7],
    eXY=eXY,
    mXY=[80, 90],
    fill=np.nan,
    interp=False
)

import matplotlib.pyplot as plt
plt.close('all')
plt.imshow(slice.T)  # imshow expects shape (mY, mX)
plt.colorbar()
Output (for interp=False):
For this test case (50x70x100 datacube, 80x90 slice size) the run time is 376 µs (interp=False) and 550 µs (interp=True) on my laptop.
