Merge histograms with different ranges - python

Is there any fast way to merge two numpy histograms with different bin ranges and bin numbers?
For example:
x = [1,2,2,3]
y = [4,5,5,6]
a = np.histogram(x, bins=10)
# a[0] = [1, 0, 0, 0, 0, 2, 0, 0, 0, 1]
# a[1] = [ 1. , 1.2, 1.4, 1.6, 1.8, 2. , 2.2, 2.4, 2.6, 2.8, 3. ]
b = np.histogram(y, bins=5)
# b[0] = [1, 0, 2, 0, 1]
# b[1] = [ 4. , 4.4, 4.8, 5.2, 5.6, 6. ]
Now I want to have some function like this:
def merge(a, b):
    # some actions here #
    return merged_a_b_values, merged_a_b_bins
Actually I don't have x and y; only a and b are known.
But the result of merge(a, b) must be equal to np.histogram(x+y, bins=10):
m = merge(a, b)
# m[0] = [1, 0, 2, 0, 1, 0, 1, 0, 2, 1]
# m[1] = [ 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. ]

I'd actually have added a comment to dangom's answer, but I lack the reputation required.
I'm a little confused by your example. You're plotting the histogram of the histogram bins if I'm not mistaken. It should rather be this, right?
plt.figure()
plt.plot(a[1][:-1], a[0], marker='.', label='a')
plt.plot(b[1][:-1], b[0], marker='.', label='b')
plt.plot(c[1][:-1], c[0], marker='.', label='c')
plt.legend()
plt.show()
Also a note on your suggestion for combining the histograms. You are of course right that there's no unique solution, as you simply don't know where the samples would have been in the finer grid you use for the combination. When the two histograms have significantly different bin widths, the suggested merging function may result in a sparse and artificial-looking histogram.
I tried combining the histograms by interpolation (assuming the samples within each count bin were distributed uniformly in the original bin - which is of course also only an assumption).
This leads, however, to a more natural-looking result, at least for data sampled from distributions I typically encounter.
import numpy as np

def merge_hist(a, b):
    edgesa = a[1]
    edgesb = b[1]
    da = edgesa[1] - edgesa[0]
    db = edgesb[1] - edgesb[0]
    dint = np.min([da, db])

    min_edge = np.min(np.hstack([edgesa, edgesb]))
    max_edge = np.max(np.hstack([edgesa, edgesb]))
    edgesc = np.arange(min_edge, max_edge, dint)

    def interpolate_hist(edgesint, edges, hist):
        cumhist = np.hstack([0, np.cumsum(hist)])
        cumhistint = np.interp(edgesint, edges, cumhist)
        histint = np.diff(cumhistint)
        return histint

    histaint = interpolate_hist(edgesc, edgesa, a[0])
    histbint = interpolate_hist(edgesc, edgesb, b[0])

    c = histaint + histbint
    return c, edgesc
An example for two gaussian distributions:
import numpy as np
import matplotlib.pyplot as plt

a = 5 + 1*np.random.randn(100)
b = 10 + 2*np.random.randn(100)
hista, edgesa = np.histogram(a, bins=10)
histb, edgesb = np.histogram(b, bins=5)
histc, edgesc = merge_hist([hista, edgesa], [histb, edgesb])

plt.figure()
width = edgesa[1]-edgesa[0]
plt.bar(edgesa[:-1], hista, width=width)
width = edgesb[1]-edgesb[0]
plt.bar(edgesb[:-1], histb, width=width)

plt.figure()
width = edgesc[1]-edgesc[0]
plt.bar(edgesc[:-1], histc, width=width)
plt.show()
I, however, am no statistician, so please let me know if the suggested approach is viable.

There is no unique solution to the problem of merging two different histograms. I propose here a simple and quick solution based on two design assumptions necessary to deal with the loss of information inherent in binning sequences:
Recovered values are represented by the start of the bin they belong to.
The merge shall keep the highest bin resolution to avoid further loss of information and shall completely encompass the intervals of the children histograms.
Here's the code:
import numpy as np

def merge(a, b):
    def extract_vals(hist):
        # Recover values based on assumption 1.
        values = [[y]*x for x, y in zip(hist[0], hist[1])]
        # Return flattened list.
        return [z for s in values for z in s]

    def extract_bin_resolution(hist):
        return hist[1][1] - hist[1][0]

    def generate_num_bins(minval, maxval, bin_resolution):
        # Generate number of bins necessary to satisfy assumption 2.
        return int(np.ceil((maxval - minval) / bin_resolution))

    vals = extract_vals(a) + extract_vals(b)
    bin_resolution = min(map(extract_bin_resolution, [a, b]))
    num_bins = generate_num_bins(min(vals), max(vals), bin_resolution)

    return np.histogram(vals, bins=num_bins)
Here's the example code:
import matplotlib.pyplot as plt
x = [1,2,2,3]
y = [4,5,5,6]
a = np.histogram(x, bins=10)
# a[0] = [1, 0, 0, 0, 0, 2, 0, 0, 0, 1]
# a[1] = [ 1. , 1.2, 1.4, 1.6, 1.8, 2. , 2.2, 2.4, 2.6, 2.8, 3. ]
b = np.histogram(y, bins=5)
# b[0] = [1, 0, 2, 0, 1]
# b[1] = [ 4. , 4.4, 4.8, 5.2, 5.6, 6. ]
# Merge and plot results
c = merge(a, b)
c_num_bins = c[1].size - 1
plt.hist(a[0], bins=5, label='a')
plt.hist(b[0], bins=10, label='b')
plt.hist(c[0], bins=c_num_bins, label='c')
plt.legend()
plt.show()

Numpy/Torch insert smallest value in case of collision

I have an empty numpy array, a list of indices, and list of values associated with the indices. The issue is that there may be duplicates in the indices. In all these "collision" cases, I'd like the smallest value to be picked. Just wondering what is the best way to go about it.
Eg:
array = [0,0,0,0,0,0,0]
indices = [0, 0, 2, 3, 2, 4]
values = [1.0, 3.0, 3.5, 1.5, 2.5, 8.0]
Result:
out = [1.0, 0, 2.5, 1.5, 8.0, 0.0, 0.0]
You can always implement something manually like:
import numpy as np

def index_reduce(arr, indices, out, reducer=min):
    touched = np.zeros_like(out, dtype=np.bool_)
    for i, x in enumerate(indices):
        if not touched[x]:
            out[x] = arr[i]
            touched[x] = True
        else:
            out[x] = reducer(out[x], arr[i])
    return out
which essentially loops through the indices and assigns the values of arr to out if not already touched (keeping track of this with the touched array), otherwise reducing the output with the specified reducer.
NOTE: The reducer function needs to be such that the final result can only depend on the current and previous value.
The usage of this would be:
indices = [0, 0, 2, 3, 2, 4]
values = [1.0, 3.0, 3.5, 1.5, 2.5, 8.0]
array = np.zeros(7)
index_reduce(values, indices, array)
# array([1. , 0. , 2.5, 1.5, 8. , 0. , 0. ])
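Since the reducer is just a parameter, other pairwise reductions work the same way; for example, keeping the largest value on a collision (a small sketch reusing the inputs above, not part of the original answer):
index_reduce(values, indices, np.zeros(7), reducer=max)
# array([3. , 0. , 3.5, 1.5, 8. , 0. , 0. ])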
If performance is a concern, you can also accelerate the above code with Numba with a simple decoration, provided that the values and indices inputs are also NumPy arrays:
import numba as nb
index_reduce_nb = nb.njit(index_reduce)
indices = np.array([0, 0, 2, 3, 2, 4])
values = np.array([1.0, 3.0, 3.5, 1.5, 2.5, 8.0])
array = np.zeros(7)
index_reduce_nb(values, indices, array)
# array([1. , 0. , 2.5, 1.5, 8. , 0. , 0. ])
Benchmarks
The above solutions can be compared to a Torch-based solution (reworked from @Shai's answer):
import torch

def index_reduce_torch(arr, indices, out, reduce_="amin"):
    arr = torch.from_numpy(arr)
    indices = torch.from_numpy(indices)
    out = torch.from_numpy(out)
    return out.index_reduce_(dim=0, index=indices, source=arr, reduce=reduce_, include_self=False).numpy()
or, with additional skipping of Torch gradients:
index_reduce_torch_ng = torch.no_grad()(index_reduce_torch)
index_reduce_torch_ng.__name__ = "index_reduce_torch_ng"
and a Pandas-based solution (reworked from @bpfrd's answer):
import pandas as pd

def index_reduce_pd(arr, indices, out, reducer=min):
    df = pd.DataFrame(data=zip(indices, arr))
    df1 = df.groupby(0, as_index=False).agg(reducer)
    out[df1[0]] = df1[1]
    return out
using the following code:
funcs = index_reduce, index_reduce_nb, index_reduce_pd, index_reduce_torch, index_reduce_torch_ng
timings = {}
for i in range(4, 18):
    n = 2 ** i
    print(f"n = {n}, i = {i}")
    extrema = 0, 2 * n
    indices = np.random.randint(*extrema, n)
    values = np.random.random(n)
    out = np.zeros(extrema[1] + 1)
    timings[n] = []
    base = funcs[0](values, indices, out)
    for func in funcs:
        res = func(values, indices, out)
        is_good = np.allclose(base, res)
        timed = %timeit -r 16 -n 16 -q -o func(values, indices, out)
        timing = timed.best * 1e6
        timings[n].append(timing if is_good else None)
        print(f"{func.__name__:>24} {is_good} {timing:10.3f} µs")
to produce, with the following additional lines:
import matplotlib.pyplot as plt
df = pd.DataFrame(data=timings, index=[func.__name__ for func in funcs]).transpose()
df.plot(marker='o', xlabel='Input size / #', ylabel='Best timing / µs', figsize=(6, 4))
df.plot(marker='o', xlabel='Input size / #', ylabel='Best timing / µs', ylim=[0, 500], figsize=(6, 4))
fig = plt.gcf()
fig.patch.set_facecolor('white')
these plots (the second is a zoomed-in version of the first).
These indicate that the Numba-accelerated solution could be the fastest, closely followed by the Torch-based solution, while the Pandas approach could be the slowest, even slower than the explicit loop without acceleration.
You are looking for index_reduce_, which was introduced in PyTorch 1.12.
import torch
array = torch.zeros(7)
indices = torch.tensor([0, 0, 2, 3, 2, 4])
values = torch.tensor([1.0, 3.0, 3.5, 1.5, 2.5, 8.0])
out = array.index_reduce_(dim=0, index=indices, source=values, reduce='amin', include_self=False)
You'll get your desired output:
tensor([1.0000, 0.0000, 2.5000, 1.5000, 8.0000, 0.0000, 0.0000])
Note that this method is in "beta" and its API may change in future PyTorch versions.
You can use pandas groupby/agg as follows:
import pandas as pd

indices = [0, 0, 2, 3, 2, 4]
values = [1.0, 3.0, 3.5, 1.5, 2.5, 8.0]
array = [0, 0, 0, 0, 0, 0, 0]

df = pd.DataFrame(zip(indices, values), columns=['indices', 'values'])
df1 = df.groupby('indices', as_index=False).agg(values=('values', min))
for i, j in zip(df1['indices'].tolist(), df1['values'].tolist()):
    array[i] = j
output:
array
>[1.0, 0, 2.5, 1.5, 8.0, 0, 0]
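For completeness, a vectorized NumPy-only alternative (a sketch, not taken from any of the answers above) could rely on np.minimum.at, which applies an unbuffered element-wise minimum per index and therefore handles duplicate indices correctly:
import numpy as np

indices = np.array([0, 0, 2, 3, 2, 4])
values = np.array([1.0, 3.0, 3.5, 1.5, 2.5, 8.0])

out = np.full(7, np.inf)             # start from +inf so the minimum always wins
np.minimum.at(out, indices, values)  # element-wise min per index, duplicates reduced
out[np.isinf(out)] = 0.0             # reset untouched slots to 0
# out -> array([1. , 0. , 2.5, 1.5, 8. , 0. , 0. ])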

Malfunction of the translated code InterX to python

I want to use the function InterX to find the intersection of two curves. However, the function does not return the expected result. The function is available here.
The function always returns the point of intersection as P = None, None when a valid point was expected.
import numpy as np
import pandas as pd
from InterX import InterX
x_t = np.linspace(0, 10, 10, True)
z_t = np.array((0, 0, 0, 0, 0, 0, 0.055, 0.41, 1.23, 4))
X_P = np.array((2,4))
Z_P = np.array((3,-1))
Line = pd.DataFrame(np.array((X_P,Z_P)))
Curve = pd.DataFrame(np.array([x_t,z_t]))
Curve = Curve.T
P = InterX(Line[0],Line[1],Curve[0],Curve[1])
In this script the expected result was P = [3.5, 0]. However, the resulting point is P = [None, None].
The short answer - use:
P = InterX(L1, L1, L2, L2)
or
P = InterX(L1.iloc[:,0].to_frame(),L1.iloc[:,1].to_frame(),L2.iloc[:,0].to_frame(),L2.iloc[:,1].to_frame())
For a detailed answer, see the following, which refers to the code of your original question.
You need to pass two dataframes with x and y values (it would of course be much more logical if InterX accepted 4 Series or 2 DataFrames, respectively).
InterX then gets the x and y values in a very convoluted way from these dataframes in lines 90 through 119 (which could be done far more easily). So the working solution is:
import numpy as np
import pandas as pd
from InterX import InterX

x_t = np.linspace(0, 10, 10, True)
z_t = np.array((0, 0, 0, 0, 0, 0, 0.055, 0.41, 1.23, 4))
x_P = np.array((2, 4))
z_P = np.array((3, -1))

curve_x = pd.DataFrame(x_t)
curve_z = pd.DataFrame(z_t)
line_x = pd.DataFrame(x_P)
line_z = pd.DataFrame(z_P)

p = InterX(line_x, line_z, curve_x, curve_z)
Output of print(p):
    xs   ys
0  3.5  0.0
Please note that according to the Python naming convention (PEP 8), function and variable names should be lowercase, with words separated by underscores.
I find the code of InterX very difficult to understand; a much cleaner solution (along with a nice plot) is this one.
With
import numpy as np
import matplotlib.pyplot as plt
# `intersection` is the function from the cleaner solution linked above

x_t = np.linspace(0, 10, 10, True)
z_t = np.array((0, 0, 0, 0, 0, 0, 0.055, 0.41, 1.23, 4))
X_P = np.array((2, 4))
Z_P = np.array((3, -1))

x, y = intersection(x_t, z_t, X_P, Z_P)
print(x, y)

plt.plot(x_t, z_t, c='r')
plt.plot(X_P, Z_P, c='g')
plt.plot(x, y, '*k')
plt.show()
we get [3.5] [-0.] together with a plot of the two curves and their intersection point.

Normalizing data to certain range of values

I am new to Python; is there any function that can normalize data?
For example, I have a set of values in the range 0 - 1, e.g. [0.92323, 0.7232322, 0.93832, 0.4344433].
I want to normalize all those values to the range 0.25 - 0.50.
Thank you,
The following function considers the generic case:
def normalize(values, bounds):
    return [bounds['desired']['lower'] + (x - bounds['actual']['lower']) *
            (bounds['desired']['upper'] - bounds['desired']['lower']) /
            (bounds['actual']['upper'] - bounds['actual']['lower'])
            for x in values]
Use:
normalize(
    [0.92323, 0.7232322, 0.93832, 0.4344433],
    {'actual': {'lower': 0, 'upper': 1}, 'desired': {'lower': 0.25, 'upper': 0.5}}
)  # [0.4808075, 0.43080805, 0.48458, 0.35861082499999997]

normalize(
    [5, 7.5, 10, 12.5, 15],
    {'actual': {'lower': 5, 'upper': 15}, 'desired': {'lower': 1, 'upper': 2}}
)  # [1.0, 1.25, 1.5, 1.75, 2.0]
I chose a two-level dict as the argument, but you could pass the bounds in multiple ways, for example as two separate tuples, one for the actual bounds and one for the desired, with the first element being the lower bound and the second the upper:
def normalize(values, actual_bounds, desired_bounds):
    return [desired_bounds[0] + (x - actual_bounds[0]) *
            (desired_bounds[1] - desired_bounds[0]) /
            (actual_bounds[1] - actual_bounds[0])
            for x in values]
Use:
normalize(
    [0.92323, 0.7232322, 0.93832, 0.4344433],
    (0, 1),
    (0.25, 0.5)
)  # [0.4808075, 0.43080805, 0.48458, 0.35861082499999997]

normalize(
    [5, 7.5, 10, 12.5, 15],
    (5, 15),
    (1, 2)
)  # [1.0, 1.25, 1.5, 1.75, 2.0]
You could do something along the following lines:
>>> l = [0.92323, 0.7232322, 0.93832, 0.4344433]
>>> lower, upper = 0.25, 0.5
>>> l_norm = [lower + (upper - lower) * x for x in l]
>>> l_norm
[0.4808075, 0.43080805, 0.48458, 0.35861082499999997]
You can use sklearn.preprocessing for a lot of types of pre-processing tasks including normalization.
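For instance, a rough sketch with sklearn's MinMaxScaler, which rescales to a given feature_range (note that it maps the observed minimum and maximum of the data, not assumed bounds of 0 and 1, and expects a 2-D array):
import numpy as np
from sklearn.preprocessing import MinMaxScaler

data = np.array([0.92323, 0.7232322, 0.93832, 0.4344433]).reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0.25, 0.5))
normalized = scaler.fit_transform(data).ravel()
# the observed minimum maps to 0.25, the observed maximum to 0.5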
Note that the above answers do not give you values in the range you are asking for. Here I present an alternative function that may be easier to follow and gives you the result in the range you are asking for.
import numpy as np

def normalize(x, newRange=(0, 1)):  # x is an array. Default range is between zero and one
    xmin, xmax = np.min(x), np.max(x)  # get max and min from input array
    norm = (x - xmin) / (xmax - xmin)  # scale between zero and one
    if newRange == (0, 1):
        return norm  # wanted range is the same as norm
    elif newRange != (0, 1):
        return norm * (newRange[1] - newRange[0]) + newRange[0]  # scale to a different range
    # add other conditions here. For example, an error message
Applying our new function to your problem:
x = np.array([0.92323, 0.7232322, 0.93832, 0.4344433]) #your input vector must be an array
normalize(x, newRange=(0.25, 0.5)) #set your range as a python tuple
#array([0.49251305, 0.39328352, 0.5 , 0.25 ])
#note that the new values have a maximum value of 0.5 (the third entry) and minimum of 0.25 (the last value).

How to sample in the matrix according to the probability in each cell

I tried to code a formula from pattern recognition but I cannot find a proper function to do the work. The problem is that I have a binary adjacency matrix A (M*N) and want to assign the value 1 or 0 to each cell. Every cell has a fixed probability P of being 1, and 0 otherwise. I searched for sampling methods in Python and it seems that most methods only support sampling several elements from a list without considering probability. I really need help with this and any idea is appreciated.
You could use
A = (P > numpy.random.rand(4, 5)).astype(int)
Where P is your matrix of probabilities.
To make sure the probabilities are right you can test it using
import numpy

P = numpy.ones((4, 5)) * 0.2
S = numpy.zeros((4, 5))
for i in range(100000):
    S += (P > numpy.random.rand(4, 5)).astype(int)

print(S)         # each element should be approximately 20000
print(S.mean())  # the average should be approximately 20000, too
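An equivalent sketch (not part of the original answer) treats each cell as a Bernoulli trial directly, letting numpy.random.binomial broadcast over the probability matrix:
import numpy

P = numpy.ones((4, 5)) * 0.2
A = numpy.random.binomial(1, P)  # one draw per cell; each cell is 1 with probability P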
Let's say you have your matrix of probabilities of adjacency as follows:
import numpy as np

# Create your matrix
matrix = np.random.randint(0, 10, (3, 3)) / 10.
# Returns:
array([[ 0. ,  0.4,  0.2],
       [ 0.9,  0.7,  0.4],
       [ 0.1,  0. ,  0.5]])

# Now you can use np.where
threshold = 0.5
np.where(matrix < threshold, 0, 1)  # you can set your threshold as you like.
                                    # Here set to 0.5
# Returns:
array([[0, 0, 0],
       [1, 1, 0],
       [0, 0, 1]])
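If the goal is to sample per cell rather than apply one fixed cutoff, the same np.where idiom can be combined with a uniform random draw per cell, in the spirit of the first answer (a sketch):
import numpy as np

matrix = np.random.randint(0, 10, (3, 3)) / 10.                  # per-cell probabilities of a 1
sample = np.where(np.random.rand(*matrix.shape) < matrix, 1, 0)  # one Bernoulli draw per cell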

Matching two grids for data analysis, is there a good algorithm for my problem?

I'd like to compare two differently spaced datasets in python. I always want to find the closest (nearest neighbour) match and regrid the data, see this example:
Dataset A:
ALTITUDE[m] VALUE
1. a1
2. a2
3. a3
4. a4
Dataset B:
ALTITUDE[m] VALUE
0.7 b1
0.9 b2
1.7 b3
2. b4
2.4 b5
2.9 b6
3.1 b7
3.2 b8
3.9 b9
4.1 b10
ai and bi contain double numbers, but also nan fields.
I'd like to transform dataset B to the altitude grid of dataset A, but since dataset A contains less altitude levels than dataset B, I'd like to average them.
ALTITUDE[m] VALUE
1. median(b1,b2)
2. median(b3,b4,b5)
3. median(b6,b7,b8)
4. median(b9,b10)
i.e. the closest altitude levels have been found and averaged over.
Conversely, if I want to match dataset A to the grid of dataset B, dataset A should look like this (nearest neighbour):
ALTITUDE[m] VALUE
0.7 a1
0.9 a1
1.7 a2
2. a2
2.4 a2
2.9 a3
3.1 a3
3.2 a3
3.9 a4
4.1 a4
Maybe this even has a name (I imagine it being a common problem), but I don't know it and thus cannot search for it. I believe there is an efficient way of doing this, apart from the obvious solution of coding it myself (but I'm afraid it won't be efficient and I'd introduce many bugs).
Preferably using numpy.
EDIT: Thanks for your input to all four contributors. I learned a bit and I apologize for not asking very clearly. I was myself in the process of understanding the problem. Your answers pointed me towards the usage of interp1d, and this answer allowed me to abuse it for my purposes. I will post the result shortly. I can accept only one answer, but any of them would do.
Two assumptions:
1: You are not looking for the nearest neighbour, but for all altitudes within some range. So, let's say for a1 you want all bn that are within 0.5 of a1 (giving you b1 and b2 as per your example). I would define the 'nearest neighbour' as something different.
2: You don't count nan in your medians (numpy counts them as infinity as per some IEEE convention, but this seems odd to me). As per your suggestion we thus use nanmedian from scipy.stats.
I would do the following:
from numpy import *
from pylab import *
from scipy.stats import nanmedian

A_Alt = array([1, 2, 3, 4])
A_Val = array([.33, .5, .6, .8])
B_Alt = array([.7, 0.9, 1.7, 2., 2.4, 2.9, 3.1, 3.2, 3.9, 4.1])
B_Val = array([.3, NaN, .8, .6, .7, .4, .3, NaN, .99, 1.3])

range = .5

B_Agrid = [nanmedian(B_Val[abs(B_Alt - k) < range]).item() for k in A_Alt]
A_Bgrid = [nanmedian(A_Val[abs(A_Alt - k) < range]).item() for k in B_Alt]
We find all indices where the distance of B_Alt to k in A_Alt is less than a specified range. Then we take the median of those B_Val.
The same works for A_Bgrid with the results as requested.
==Edit==
Different assumption as to your nearest neighbours:
Let's take the nearest neighbour to be the entry (or entries in case of a tie) with the smallest absolute altitude difference while not having nan as a value. N.B. these results do not match your example as b1 would not be the nearest neighbour of a1 on account of b2 being closer.
Under this assumption, the following code should work:
from numpy import *
from pylab import *
from scipy.stats import nanmedian

A_Alt = array([1, 2, 3, 4])
A_Val = array([.33, .5, .6, .8])
B_Alt = array([.7, 0.9, 1.7, 2., 2.4, 2.9, 3.1, 3.2, 3.9, 4.1])
B_Val = array([.3, NaN, .8, .6, .7, .4, .3, NaN, .99, 1.3])

def ReGridMedian(AltIn, ValIn, AltOut):
    part = isfinite(ValIn)
    q = [abs(AltIn[part] - k) for k in AltOut]
    q = [nonzero(abs(k - min(k)) < 3*finfo(k.dtype).eps) for k in q]
    q = [ValIn[part][k] for k in q]
    return [median(k) for k in q]

B_Agrid = ReGridMedian(B_Alt, B_Val, A_Alt)
A_Bgrid = ReGridMedian(A_Alt, A_Val, B_Alt)
I hacked together something that checks if two values are identical within machine precision, but I assume there's a better way of doing that. In any case, we first keep only the values that are not nan, then find the closest match, then check for duplicate minima, then take the median of those values.
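As an aside, that machine-precision tie check could probably be expressed with np.isclose; a small sketch (not part of the original answer):
import numpy as np

d = np.array([0.3, 0.1, 0.1, 0.5])            # absolute altitude differences
ties = np.nonzero(np.isclose(d, d.min()))[0]  # all indices tied for the minimum
# ties -> array([1, 2])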
====
Does this cover your question, or are my assumptions incorrect?
Have a look at numpy.interp:
http://docs.scipy.org/doc/numpy/reference/generated/numpy.interp.html
(EDIT: numpy.interp only provides linear interpolation which, evidently, is not what the OP is looking for. Instead use the scipy methods like interp1d using kind='nearest')
http://docs.scipy.org/doc/scipy/reference/interpolate.html
What it sounds like you want to do is use the altitude points of one data set to interpolate the values of the other. This can be done pretty easily with either the numpy method or one of the scipy interpolation methods.
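A minimal sketch of the nearest-neighbour variant (assuming the query altitudes lie within the range of the source altitudes; the extrapolation wrapper in a later answer handles points outside it):
import numpy as np
from scipy.interpolate import interp1d

A_Alt = np.array([1., 2., 3., 4.])
A_Val = np.array([.33, .5, .6, .8])

f = interp1d(A_Alt, A_Val, kind='nearest')
f(np.array([1.7, 2.0, 2.4, 2.9, 3.1, 3.2, 3.9]))
# array([0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.8])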
Here's one way:
import numpy as np

def regrid_op(x, y, xi, op=np.median):
    x, y, xi = np.atleast_1d(x, y, xi)
    if (x.ndim, y.ndim, xi.ndim) != (1, 1, 1):
        raise ValueError("works only for 1D data")

    yi = np.zeros(xi.shape, dtype=y.dtype)
    yi.fill(np.nan)

    # sort data
    j = np.argsort(x)
    x = x[j]
    y = y[j]

    # group items by nearest neighbour
    x0s = np.r_[xi, np.inf]
    xc = .5*(x0s[:-1] + x0s[1:])

    j0 = 0
    for i, j1 in enumerate(np.searchsorted(x, xc)):
        print("x =", xi[i], ", y =", y[j0:j1])  # print some debug info
        yi[i] = op(y[j0:j1])
        j0 = j1
    return yi

xi = np.array([1, 2, 3, 4])
x = np.array([0.7, 0.9, 1.7, 2.0, 2.4, 2.9, 3.1, 3.2, 3.9, 4.1])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10.])
print(regrid_op(x, y, xi))
I don't see a way to vectorize the loop over items in the xi array, so this should be efficient provided the number of points in grid A is not too large.
EDIT: This also assumes that the points in xi are sorted.
This is not exactly the answer you were looking for, but this is my 50c answer...
A = {1: 'a1', 2: 'a2', 3: 'a3', 4: 'a4'}
B = {0.7: 'b1', 0.9: 'b2', 1.7: 'b3', 2: 'b4', 2.4: 'b5'}
C = {}  # result

# find altitude in A that is the closest to altitude in B
def findAltitude(altB, A):
    toto = [((alt - altB)**2, alt) for alt in A.keys()]
    toto.sort()
    return toto[0][1]

# iterate over each altitude of B
for altB, valueB in B.items():
    altC = findAltitude(altB, A)
    if altC in C:
        C[altC].append(valueB)
    else:
        C[altC] = [valueB, ]

# then do the median operation
# for altC, valueC in C.items():
#     C[altC] = map(median, valueC)  # where median is your median function

print(C)
It is NOT the best solution at all (especially if you have a lot of values), but only the fastest to write...
In fact, it depends on how your data is stored. A dictionary is not the best choice.
It is more interesting/clever to use the fact that your altitudes are sorted.
You should provide more details on how your data is stored (a numpy array?).
==== Edit ====
I still do not know how your data is stored, but let's try something more "clever", based on the fact that your altitudes are sorted.
from numpy import *
from pylab import *
from scipy.stats import nanmedian

# add val into C at the end of C or in the last place (depending if alt already exists in C or not)
def addto(C, val, alt):
    if C and C[-1][0] == alt:
        C[-1][1].append(val)
    else:
        C.append((alt, [val, ]))

# values
A_Alt = array([1, 2, 3, 4])
A_Val = array([.33, .5, .6, .8])
B_Alt = array([.7, 0.9, 1.7, 2., 2.4, 2.9, 3.1, 3.2, 3.9, 4.1])
B_Val = array([.3, NaN, .8, .6, .7, .4, .3, NaN, .99, 1.3])

# intermediate list of tuples (altitude, list_of_values)
C = []

# iterator on A
Aa = iter(A_Alt)
ainf = next(Aa)
asup = next(Aa)  # two first values of A_Alt

# iterators on B
Ba = iter(B_Alt)
Bv = iter(B_Val)

# regrid
try:
    while True:
        altB = next(Ba)
        valB = next(Bv)
        # find ainf and asup in A_Alt such that ainf < altB < asup
        while asup < altB:
            try:
                ainf, asup = asup, next(Aa)
            except StopIteration:
                break
        # find closest
        if abs(ainf - altB) <= abs(asup - altB):
            addto(C, valB, ainf)
        else:
            addto(C, valB, asup)
except StopIteration:
    pass

# do the median
res = [nanmedian(k[1]) for k in C]
print(res)
The idea is then to iterate over the two vectors/lists of altitudes, and for each altitude of B, find the two altitudes of A that surround it. Then it is easy to find the closest...
This is less readable than Daan's solution, but it should be more efficient (linear in the size of your data).
You just need to modify it if your data is not stored like that.
One way to cover the second case (Grid B to A, i.e. from few altitudes to many altitudes) is this:
Extrapolation function (from here)
from numpy import array
from scipy.interpolate import interp1d

def extrap1d(interpolator):
    xs = interpolator.x
    ys = interpolator.y

    def pointwise(x):
        if x < xs[0]:
            return ys[0]
        elif x > xs[-1]:
            return ys[-1]
        else:
            return interpolator(x)

    def ufunclike(xs):
        return array([pointwise(x) for x in xs])

    return ufunclike
Values
A_Alt = array([1,2,3,4])
A_Val = array([.33, .5, .6, .8])
B_Alt = array([.7, 0.9, 1.7, 2., 2.4, 2.9, 3.1, 3.2, 3.9, 4.1])
Actual regridding:
f_i = interp1d(A_Alt, A_Val, kind='nearest')
f_x = extrap1d(f_i)
f_x(B_Alt)
Output:
array([ 0.33, 0.33, 0.5 , 0.5 , 0.5 , 0.6 , 0.6 , 0.6 , 0.8 , 0.8 ])
