Group by max or min in a numpy array - python

I have two equal-length 1D numpy arrays, id and data, where id is a sequence of repeating, ordered integers that define sub-windows on data. For example:
id   data
 1     2
 1     7
 1     3
 2     8
 2     9
 2    10
 3     1
 3   -10
I would like to aggregate data by grouping on id and taking either the max or the min.
In SQL, this would be a typical aggregation query like SELECT MAX(data) FROM tablename GROUP BY id ORDER BY id.
Is there a way I can avoid Python loops and do this in a vectorized manner?

I've been seeing some very similar questions on Stack Overflow over the last few days. The following code is very similar to the implementation of numpy.unique, and because it takes advantage of the underlying numpy machinery, it is most likely going to be faster than anything you can do in a Python loop.
import numpy as np

def group_min(groups, data):
    # sort with major key groups, minor key data
    order = np.lexsort((data, groups))
    groups = groups[order]  # this is only needed if groups is unsorted
    data = data[order]
    # construct an index which marks borders between groups
    index = np.empty(len(groups), 'bool')
    index[0] = True
    index[1:] = groups[1:] != groups[:-1]
    return data[index]

# max is very similar
def group_max(groups, data):
    order = np.lexsort((data, groups))
    groups = groups[order]  # this is only needed if groups is unsorted
    data = data[order]
    index = np.empty(len(groups), 'bool')
    index[-1] = True
    index[:-1] = groups[1:] != groups[:-1]
    return data[index]
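As a quick check (my addition, using the arrays from the question), both functions reproduce the expected per-group extremes:

groups = np.array([1, 1, 1, 2, 2, 2, 3, 3])
data = np.array([2, 7, 3, 8, 9, 10, 1, -10])
print(group_max(groups, data))  # expected: [7, 10, 1]
print(group_min(groups, data))  # expected: [2, 8, -10]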

In pure Python:
from itertools import groupby
from operator import itemgetter as ig

print([max(map(ig(1), g)) for k, g in groupby(zip(id, data), key=ig(0))])
# -> [7, 10, 1]
A variation:
print([data[id == i].max() for i, _ in groupby(id)])
# -> [7, 10, 1]
Based on @Bago's answer:
import numpy as np

# sort by `id` then by `data`
ndx = np.lexsort(keys=(data, id))
id, data = id[ndx], data[ndx]
# get max()
print(data[np.r_[np.diff(id), True].astype(bool)])
# -> [ 7 10  1]
If pandas is installed:
from pandas import DataFrame
df = DataFrame(dict(id=id, data=data))
print(df.groupby('id')['data'].max())
# id
# 1 7
# 2 10
# 3 1
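The min is the symmetric call (my addition, same DataFrame as above):

print(df.groupby('id')['data'].min())
# expected: 2, 8 and -10 for ids 1, 2, 3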

I'm fairly new to Python and numpy, but it seems like you can use the .at method of ufuncs rather than reduceat:
import numpy as np

data_id = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5])
data_val = np.random.rand(len(data_id))
ans = np.empty(data_id[-1] + 1)  # might want to use max(data_id) and zeros instead
np.maximum.at(ans, data_id, data_val)
For example:
data_val = array([ 0.65753453, 0.84279716, 0.88189818, 0.18987882, 0.49800668,
                   0.29656994, 0.39542769, 0.43155428, 0.77982853, 0.44955868,
                   0.22080219, 0.4807312 , 0.9288989 , 0.10956681, 0.73215416,
                   0.33184318, 0.10936647])
ans = array([ 0.98969952, 0.84044947, 0.63460516, 0.92042078, 0.75738113,
              0.37976055])
Of course this only makes sense if your data_id values are suitable for use as indices (i.e. non-negative integers and not huge; presumably, if they are large or sparse, you could initialize ans using np.unique(data_id) or something).
I should point out that data_id doesn't actually need to be sorted.
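One caveat (my addition): np.empty leaves whatever happened to be in memory, which is why the first entry of ans above is larger than any of the data_val values. Seeding the output with -inf gives well-defined results; a minimal sketch using the question's arrays:

import numpy as np

groups = np.array([1, 1, 1, 2, 2, 2, 3, 3])
data = np.array([2, 7, 3, 8, 9, 10, 1, -10], dtype=float)

ans = np.full(groups.max() + 1, -np.inf)  # slot 0 is unused for these ids
np.maximum.at(ans, groups, data)
print(ans[1:])  # expected: [7., 10., 1.]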

With only numpy and without loops:
import numpy as np
import pandas as pd

id = np.asarray([1, 1, 1, 2, 2, 2, 3, 3])
data = np.asarray([2, 7, 3, 8, 9, 10, 1, -10])

# max
_ndx = np.argsort(id)
_id, _pos = np.unique(id[_ndx], return_index=True)
g_max = np.maximum.reduceat(data[_ndx], _pos)

# min
_ndx = np.argsort(id)
_id, _pos = np.unique(id[_ndx], return_index=True)
g_min = np.minimum.reduceat(data[_ndx], _pos)

# compare results with pandas groupby
np_group = pd.DataFrame({'min': g_min, 'max': g_max}, index=_id)
pd_group = pd.DataFrame({'id': id, 'data': data}).groupby('id').agg(['min', 'max'])
(pd_group.values == np_group.values).all()  # True

I've packaged a version of my previous answer in the numpy_indexed package; it's nice to have this all wrapped up and tested in a neat interface, and it has a lot more functionality as well:
import numpy_indexed as npi
group_id, group_max_data = npi.group_by(id).max(data)
And so on.
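For instance (my addition, assuming the same group_by interface), the min and mean follow the same pattern:

group_id, group_min_data = npi.group_by(id).min(data)
group_id, group_mean_data = npi.group_by(id).mean(data)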

A slightly faster and more general answer than the already accepted one; like joeln's answer it avoids the more expensive lexsort, and it works for arbitrary ufuncs. Moreover, it only demands that the keys are sortable, rather than being ints in a specific range. The accepted answer may still be faster, considering the max/min isn't explicitly computed. The accepted solution's ability to ignore NaNs is neat, but one may also simply assign NaN values a dummy key.
import numpy as np

def group(key, value, operator=np.add):
    """
    Group the values by key.
    Any ufunc operator can be supplied to perform the reduction
    (np.maximum, np.minimum, np.subtract, and so on).
    Returns the unique keys, their corresponding per-key reduction
    over the operator, and the key counts.
    """
    # upcast to numpy arrays
    key = np.asarray(key)
    value = np.asarray(value)
    # first, sort by key
    I = np.argsort(key)
    key = key[I]
    value = value[I]
    # the slicing points of the bins to reduce over
    slices = np.concatenate(([0], np.where(key[:-1] != key[1:])[0] + 1))
    # first entry of each bin is a unique key
    unique_keys = key[slices]
    # reduce over the slices specified by index
    per_key_sum = operator.reduceat(value, slices)
    # number of counts per key is the difference of our slice points;
    # cap off with the number of keys for the last bin
    key_count = np.diff(np.append(slices, len(key)))
    return unique_keys, per_key_sum, key_count

names = ["a", "b", "b", "c", "d", "e", "e"]
values = [1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01]

unique_keys, reduced_values, key_count = group(names, values)
print('per group mean')
print(reduced_values / key_count)

unique_keys, reduced_values, key_count = group(names, values, np.minimum)
print('per group min')
print(reduced_values)

unique_keys, reduced_values, key_count = group(names, values, np.maximum)
print('per group max')
print(reduced_values)
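Applied to the arrays from the original question (my addition), this reproduces the grouped max:

id = np.array([1, 1, 1, 2, 2, 2, 3, 3])
data = np.array([2, 7, 3, 8, 9, 10, 1, -10])

keys, maxima, counts = group(id, data, np.maximum)
print(keys, maxima)  # expected: [1 2 3] [7 10 1]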

I think this accomplishes what you're looking for:
[max([val for idx,val in enumerate(data) if id[idx] == k]) for k in sorted(set(id))]
For the outer list comprehension, from right to left, set(id) groups the ids, sorted() sorts them, for k ... iterates over them, and max takes the max of, in this case, another list comprehension. So, moving to that inner list comprehension: enumerate(data) returns both the index and value from data, and if id[idx] == k picks out the data members corresponding to id k.
This iterates over the full data list for each id. With some preprocessing into sublists, it might be possible to speed it up, but it won't be a one-liner then.

The following solution only requires a sort on the data (not a lexsort) and does not require finding boundaries between groups. It relies on the fact that if o is an array of indices into r, then r[o] = x will fill r with the latest value x for each value of o, such that r[[0, 0]] = [1, 2] leaves r[0] equal to 2. It requires that your groups are integers from 0 to number of groups - 1, as for numpy.bincount, and that there is a value for every group:
def group_min(groups, data):
    n_groups = np.max(groups) + 1
    result = np.empty(n_groups)
    order = np.argsort(data)[::-1]
    result[groups.take(order)] = data.take(order)
    return result

def group_max(groups, data):
    n_groups = np.max(groups) + 1
    result = np.empty(n_groups)
    order = np.argsort(data)
    result[groups.take(order)] = data.take(order)
    return result
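A small check (my addition); note that the ids are shifted to start at 0, as this answer requires:

groups = np.array([0, 0, 0, 1, 1, 1, 2, 2])   # the question's ids minus 1
data = np.array([2, 7, 3, 8, 9, 10, 1, -10])

print(group_max(groups, data))  # expected: [7., 10., 1.]
print(group_min(groups, data))  # expected: [2., 8., -10.]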

Related

Finding similar numbers in a list and getting the average

I currently have the numbers above in a list. How would you go about adding together similar numbers (within 850 of each other) and finding their average to make the list smaller?
For example, I have the list
l = [2000, 2200, 5000, 2350]
In this list, I want to find numbers that are within 500 of each other.
So I want all the numbers within 500 of each other (2000, 2200, 2350) to be added and divided by their count (3) to find the mean. The mean then replaces those three numbers, so the list becomes l = [2183, 5000].
The image above shows the numbers in the list; there I would like all numbers within 850 of each other to be selected and their mean found.
It seems that you are looking for a clustering algorithm, something like k-means.
This algorithm is implemented in the scikit-learn package.
After you find your k means, you can count how many of your data points were clustered with each mean and make your computations.
However, it's not clear in your case what k is. You can try running the algorithm for several values of k until you satisfy your constraint (the 500 distance between the means).
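A minimal sketch of that idea (my addition, assuming scikit-learn is installed and picking k=2 for the example list):

import numpy as np
from sklearn.cluster import KMeans

l = [2000, 2200, 5000, 2350]
X = np.array(l, dtype=float).reshape(-1, 1)

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
for label in np.unique(km.labels_):
    members = X[km.labels_ == label].ravel()
    print(len(members), int(members.mean()))  # cluster size and its mean

For this list, the three close values should end up in one cluster (mean 2183) with 5000 on its own.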
You can use:
import numpy as np
l = np.array([2000,2200,5000,2350])
# find similar numbers (that are within each 500 fold)
similar = l // 500
# for each similar group get the average and convert it to integer (as in the desired output)
new_list = [np.average(l[similar == num]).astype(int) for num in np.unique(similar)]
print(new_list)
Output:
[2183, 5000]
Step 1:
values = [5620.77978515625,
7388.43017578125,
7683.580078125,
8296.6513671875,
8320.82421875,
8557.51953125,
8743.5,
9163.220703125,
9804.7939453125,
9913.86328125,
9940.1396484375,
9951.74609375,
10074.23828125,
10947.0419921875,
11048.662109375,
11704.099609375,
11958.5,
11964.8232421875,
12335.70703125,
13103.0,
13129.529296875,
16463.177734375,
16930.900390625,
17712.400390625,
18353.400390625,
19390.96484375,
20089.0,
34592.15625,
36542.109375,
39478.953125,
40782.078125,
41295.26953125,
42541.6796875,
42893.58203125,
44578.27734375,
45077.578125,
48022.2890625,
52535.13671875,
58330.5703125,
61597.91796875,
62757.12890625,
64242.79296875,
64863.09765625,
66930.390625]
Step 2:
seen = []      # to log used index pairs
diff_dic = {}  # to record index pairs and their diff
for i, a in enumerate(values):
    for j, b in enumerate(values):
        if i != j and (i, j)[::-1] not in seen:
            seen.append((i, j))
            diff_dic[(i, j)] = abs(a - b)

keys = []
for ind, diff in diff_dic.items():
    if diff <= 850:
        keys.append(ind)

uniques_k = []  # to record unique indices
for pair in keys:
    for key in pair:
        if key not in uniques_k:
            uniques_k.append(key)

import numpy as np
list_arr = np.array(values)
nearest_avg = np.mean(list_arr[uniques_k])
list_arr = np.delete(list_arr, uniques_k)
list_arr = np.append(list_arr, nearest_avg)
list_arr
output:
array([ 5620.77978516, 34592.15625, 36542.109375, 39478.953125, 48022.2890625, 52535.13671875, 58330.5703125 , 61597.91796875, 62757.12890625, 66930.390625 , 20566.00205365])
You just need a conditional list comprehension like this:
l = [2000,2200,5000,2350]
n = 2000
a = [x for x in l if (n - 250) < x < (n + 250)]
Then you can average with
np.mean(a)
or whatever method you prefer.

python interpolation of some datapoints in dataset / merging lists

In an .xlsx file, machine data is logged in a way that is not suitable for further calculations: I've got a file that contains depth data of a cutting tool. Each depth increment comes with several further pieces of information, like pressure, rotational speed, forces and many more.
As you can see, in some datapoints the resolution of the depth parameter (0.01) is insufficient, as other parameters are updated more often. So I want to interpolate between two consecutive depth datapoints.
What is important to know: this effect doesn't occur at every depth. When the cutting tool moves fast, everything is fine.
Here is also an example file.
So I just need to interpolate values of the depth when the difference between two consecutive depth datapoints is 0.01.
I've tried the following approach:
1. Open as dataframe, rename, drop NaN, convert to list
2. Count identical depths in the list and transfer them to a dataframe
3. Calculate the delta between depth i and depth i-1 (i.e. to the predecessor), replace NaN with 0
4. Divide delta depth by the number of time steps if 0.009 < delta depth < 0.011 --> interpolated depth
5. Empty list of lists, with the number of elements of each sublist corresponding to the duration
6. Pass values from the interpolated depth to the respective sublists --> List 1
7. Transfer elements from delta_depth to sublists --> List 2
8. Merge List 1 and List 2
9. Flatten the lists
10. Replace the original depth value by the interpolated values in the dataframe
It looks like this, but at point 8 (merging) I don't get what I need:
import pandas as pd
from itertools import groupby
from itertools import zip_longest
import matplotlib.pyplot as plt
import numpy as np
#open and rename of some columns
df_raw=pd.read_excel(open('---.xlsx', 'rb'), sheet_name='---')
df_raw=df_raw.rename(columns={"---"})
#drop NaN
df_1=df_raw.dropna(subset=['depth'])
#convert to list
li = df_1['depth'].tolist()
#count identical depths in list and transfer them to dataframe
df_count = pd.DataFrame.from_records([[i, len([*group])] for i, group in groupby(li)])
df_count = df_count.rename(columns={0: "depth", 1: "duration"})
#calculate Delta between depth i and depth i-1 (i.e. to the predecessor), replace NaN with "0".
df_count["delta_depth"] = df_count["depth"].diff()
df_count=df_count.fillna(0)
#Divide delta depth by number of time steps if 0.009 < delta depth < 0.011
df_count["inter_depth"] = np.where(np.logical_and(df_count['delta_depth'] > 0.009, df_count['delta_depth'] < 0.011),df_count["delta_depth"] / df_count["duration"],0)
li2=df_count.values.tolist()
li_depth = df_count['depth'].tolist()
li_delta = df_count['delta_depth'].tolist()
li_duration = df_count['duration'].tolist()
li_inter = df_count['inter_depth'].tolist()
# empty list of lists, with the number of elements of each sublist corresponding to the duration
out = []
for number in li_duration:
    out.append(li_inter[:number])
# pass values from interpolated depth to the respective sublists --> List 1
out = [[i]*j for i, j in zip(li_inter, [len(j) for j in out])]
# transfer elements from delta_depth to sublists --> List 2
def extractDigits(lst):
    return list(map(lambda el: [el], lst))
lst = extractDigits(li_delta)
# merge List 1 and List 2
list1 = out
list2 = lst
new_list = []
for l1, l2 in zip_longest(list1, list2, fillvalue=[]):
    new_list.append([y if y else x for x, y in zip_longest(l1, l2)])
new_list
After merging, the first elements of the sublists are the original depth values, followed by the interpolated values; but the sublists should contain only interpolated values.
Now I have the following questions:
Is there, in general, a better approach to this problem?
How could I solve the problem with merging, or...
...find a way to overwrite the wrong first elements in the sublists?
The desired result would look something like this.
Any help would be much appreciated, as I'm very inexperienced in Python and totally stuck.
I am sure someone could write something prettier, but I think this will work just fine.
It's somewhat messy scripting, but I think it will do what you need:
_list_helper1 = df["Depth [m]"].to_list()
_list_helper1.insert(0, 0)
_list_helper1.insert(0, 0)
_list_helper1 = _list_helper1[:-2]
df["helper1"] = _list_helper1
_list = df["Depth [m]"].to_list() # grab all depth values
_list.insert(0, 0) # insert a value at the beginning to offset from original col
_list = _list[0:-1] # Delete the very last item
df["helper"] = _list # add the list to a helper col which is now offset
df["delta depth"] = df["Depth [m]"] - df["helper"] # subtract helper col from original
_id = 0
for i in range(len(df)):
    if df.loc[i, "Depth [m]"] == df.loc[i, "helper"]:
        break_val = df.loc[i, "Depth [m]"]
        break_val_2 = df.loc[i+1, "Depth [m]"]
        if break_val_2 == break_val:
            df.loc[i, "IDcol"] = _id
            df.loc[i+1, "IDcol"] = _id
    else:
        _id += 1
depth = df["IDcol"].to_list()
depth = list(dict.fromkeys(depth))
depth = [x for x in depth if str(x) != 'nan']
increments = []
for i in depth:
    _df = df.copy()
    _df = _df[_df["IDcol"] == i]
    _df.reset_index(inplace=True, drop=True)
    div_by = len(_df)
    increment = _df.loc[0, "helper"] - _df.loc[0, "helper1"]
    _df["delta depth"] = increment / div_by
    _increment = increment / div_by
    base_value = _df.loc[0, "Depth [m]"]
    for y in range(div_by):
        _df.loc[y, "Depth [m]"] = base_value + ((y + 1) * _increment)
    increments.append(_df)
df["IDcol"] = df["IDcol"].fillna("KEEP")
df = df[df["IDcol"] == "KEEP"]
increments.append(df)
df = pd.concat(increments)
df = df.fillna(0)
df = df[["index", "Depth [m]", "delta depth", "IDcol"]] # and whatever other cols u want

Realign indexes to a changed python collection

I have a collection of data and a variable containing indexes to some of them.
A filtering operation is applied on the data that eliminates a subset of the data.
I want to shift the indexes so that they refer to the updated collection of data (eliminating indexes to deleted instances).
I'm using the implementation in the function below. I'm also posting the code I used to validate that it works.
Is there a quick way to do the index realignment via the core libraries, or a better approach in general?
import random

def align_index(wanted_idx, mask):
    """
    Align a set of indexes to a collection after deletions,
    indicated with a mask.
    Arguments:
        wanted_idx: List of desired integer indexes prior to deletion
        mask: Binary mask, where 1's indicate elements that survive deletion
    Returns:
        List of integer indexes to (surviving) desired elements, post-deletion
    """
    # rebuild indexes: remove dangling
    new_idx = [idx for (i, idx) in enumerate(wanted_idx) if mask[idx]]
    # mark deleted
    not_mask = [int(not m) for m in mask]
    # cumsum deleted regions
    realigned_idx = [k - sum(not_mask[:k+1]) for k in new_idx]
    return realigned_idx
# data
data = [random.randint(0, 500) for _ in range(1000)]
rng = list(range(len(data)))

for _ in range(1000):
    # random data deletion / request
    wanted_idx = random.sample(rng, random.randint(5, 100))
    del_index = random.sample(rng, random.randint(5, 100))
    # apply deletion
    mask = [int(i not in del_index) for i in range(len(data))]
    filtered_data = [data[i] for (i, m) in enumerate(mask) if m]
    realigned_index = align_index(wanted_idx, mask)
    # verify
    new_idx = [idx for (i, idx) in enumerate(wanted_idx) if mask[idx]]
    l1 = [data[k] for k in new_idx]
    l2 = [filtered_data[k] for k in realigned_index]
    assert l1 == l2
If you use numpy it's quite trivial:
import numpy as np

mask = np.array(mask, dtype=bool)
new_idx = np.cumsum(mask, dtype=np.int64) - 1  # position of each surviving element in the filtered data
new_idx[~mask] = -1                            # deleted elements map to -1
You shouldn't need to recompute new_idx unless more elements get deleted.
Then you can get the remapped index for old index i just by looking new_idx[i]. Or a whole array at once:
wanted_idx = np.array(wanted_idx, dtype=np.int64)
remapped_idx = new_idx[wanted_idx]
Note that deleted indices get assigned value -1. You can filter these out if you want:
remapped_idx = remapped_idx[remapped_idx >= 0]
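A tiny worked example (my addition) of the remapping:

import numpy as np

mask = np.array([1, 0, 1, 1, 0, 1], dtype=bool)
wanted_idx = np.array([0, 2, 4, 5])

new_idx = np.cumsum(mask, dtype=np.int64) - 1
new_idx[~mask] = -1                      # -> [ 0 -1  1  2 -1  3]

remapped_idx = new_idx[wanted_idx]       # -> [ 0  1 -1  3]
remapped_idx = remapped_idx[remapped_idx >= 0]
print(remapped_idx)                      # indexes into the filtered data: [0 1 3]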

Collapse sequences of numbers into ranges

Today I'm requesting help with a Python script that I'm writing; I'm using the CSV module to parse a large document with about 1,100 rows, and from each row it's pulling a Case_ID, a unique number that no other row has. For example:
['10215', '10216', '10277', '10278', '10279', '10280', '10281', '10282', '10292', '10293',
'10295', '10296', '10297', '10298', '10299', '10300', '10301', '10302', '10303', '10304',
'10305', '10306', '10307', '10308', '10309', '10310', '10311', '10312', '10313', '10314',
'10315', '10316', '10317', '10318', '10319', '10320', '10321', '10322', '10323', '10324',
'10325', '10326', '10344', '10399', '10400', '10401', '10402', '10403', '10404', '10405',
'10406', '10415', '10416', '10417', '10418', '10430', '10448', '10492', '10493', '10494',
'10495', '10574', '10575', '10576', '10577', '10578', '10579', '10580', '10581', '10582',
'10583', '10584', '10585', '10586', '10587', '10588', '10589', '10590', '10591', '10592',
'10593', '10594', '10595', '10596', '10597', '10598', '10599', '10600', '10601', '10602',
'10603', '10604', '10605', '10606', '10607', '10608', '10609', '10610', '10611', '10612',
'10613', '10614', '10615', '10616', '10617', '10618', '10619', '10620', '10621', '10622',
'10623', '10624', '10625', '10626', '10627', '10628', '10629', '10630', '10631', '10632',
'10633', '10634', '10635', '10636', '10637', '10638', '10639', '10640', '10641', '10642',
'10643', '10644', '10645', '10646', '10647', '10648', '10649', '10650', '10651', '10652',
'10653', '10654', '10655', '10656', '10657', '10658', '10659', '10707', '10708', '10709',
'10710', '10792', '10793', '10794', '10795', '10908', '10936', '10937', '10938', '10939',
'11108', '11109', '11110', '11111', '11112', '11113', '11114', '11115', '11116', '11117',
'11118', '11119', '11120', '11121', '11122', '11123', '11124', '11125', '11126', '11127',
'11128', '11129', '11130', '11131', '11132', '11133', '11134', '11135', '11136', '11137',
'11138', '11139', '11140', '11141', '11142', '11143', '11144', '11145', '11146', '11147',
'11148', '11149', '11150', '11151', '11152', '11153', '11154', '11155', '11194', '11195',
'11196', '11197', '11198', '11199', '11200', '11201', '11202', '11203', '11204', '11205',
'11206', '11207', '11208', '11209', '11210', '11211', '11212', '11213', '11214', '11215',
'11216', '11217', '11218', '11219', '11220', '11221', '11222', '11223', '11224', '11225',
'11226', '11227', '11228', '11229', '11230', '11231', '11232', '11233', '11234', '11235',
'10101', '10102', '10800', '11236']
As you can see, this list is quite an eyeful, so I'd like to include a small function in my script that can reduce all of the sequential ranges down to hyphenated bookends of a sort, for example 10277 - 10282.
Thanks to all for any help included! Have a great day.
Doable. Let's see if this can be done with pandas.
import pandas as pd

data = ['10215', '10216', '10277', ...]

# Load data as a series.
s = pd.Series(data)

# Find all consecutive rows with a difference of one
# and bin them into groups using `cumsum`.
v = s.astype(int).diff().bfill().ne(1).cumsum()

# Use `groupby` and `apply` to condense the consecutive numbers into ranges.
# This is only done if the group size is >1.
ranges = (
    s.groupby(v)
     .apply(lambda x: '-'.join(x.values[[0, -1]]) if len(x) > 1 else x.item())
     .tolist())

print(ranges)
['10215-10216',
'10277-10282',
'10292-10293',
'10295-10326',
'10344',
'10399-10406',
'10415-10418',
'10430',
'10448',
'10492-10495',
'10574-10659',
'10707-10710',
'10792-10795',
'10908',
'10936-10939',
'11108-11155',
'11194-11235',
'10101-10102',
'10800',
'11236']
Your data must be sorted for this to work.
You can just use a simple loop here with the following logic:
1. Create a list to store the ranges (ranges).
2. Iterate over the values in your list (l).
3. If ranges is empty, append a list with the first value in l to ranges.
4. Otherwise, if the difference between the current and previous value is 1, append the current value to the last list in ranges.
5. Otherwise, append a list with the current value to ranges.
Code:
l = ['10215', '10216', '10277', '10278', '10279', '10280', ...]

ranges = []
for x in l:
    if not ranges:
        ranges.append([x])
    elif int(x) - prev_x == 1:
        ranges[-1].append(x)
    else:
        ranges.append([x])
    prev_x = int(x)
Now you can compute your final ranges by concatenating the first and last element of each list in ranges (if there are at least 2 elements).
final_ranges = ["-".join([r[0], r[-1]] if len(r) > 1 else r) for r in ranges]
print(final_ranges)
#['10215-10216',
# '10277-10282',
# '10292-10293',
# '10295-10326',
# '10344',
# '10399-10406',
# '10415-10418',
# '10430',
# '10448',
# '10492-10495',
# '10574-10659',
# '10707-10710',
# '10792-10795',
# '10908',
# '10936-10939',
# '11108-11155',
# '11194-11235',
# '10101-10102',
# '10800',
# '11236']
This also assumes your data is sorted. You could simplify the code to combine items 3 and 5.
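For instance (my addition), items 3 and 5 collapse into a single else branch:

ranges = []
prev_x = None
for x in l:
    if ranges and int(x) - prev_x == 1:
        ranges[-1].append(x)
    else:
        ranges.append([x])
    prev_x = int(x)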
For purely educational purposes (this is much less efficient than the loop above), here's the same thing using map and reduce:
from functools import reduce

def myreducer(ranges, x):
    if not ranges:
        return [[x]]
    elif int(x) - int(ranges[-1][-1]) == 1:
        return ranges[:-1] + [ranges[-1] + [x]]
    else:
        return ranges + [[x]]

final_ranges = map(
    lambda r: "-".join([r[0], r[-1]] if len(r) > 1 else r),
    reduce(myreducer, l, [])
)
There is also the pynumparser package:
import pynumparser
pynumparser.NumberSequence().encode([1, 2, 3, 5, 6, 7, 8, 10])
# result: '1-3,5-8,10'
pynumparser.NumberSequence().parse('1-3,5-8,10')
# result: (1, 2, 3, 5, 6, 7, 8, 10)

Python- Selecting pairs of objects from a data frame

I have a data frame that contains information about the positions of various objects, and a unique index for each object (index in this case is not related to the data frame). Here is some example data:
               ind                                               pos
x    y    z
-1.0 7.0  0.0   21   [-2.76788330078, 217.786453247, 26.6822681427]
          0.0   22   [-7.23852539062, 217.274139404, 26.6758270264]
     0.0  1.0  152   [-0.868591308594, 2.48404550552, 48.4036369324]
     6.0  2.0  427   [-0.304443359375, 182.772140503, 79.4475860596]
The actual data frame is quite long. I have written a function that takes two vectors as inputs and outputs the distance between them:
import numpy as N

def dist(a, b):
    diff = N.array(a) - N.array(b)
    d = N.sqrt(N.dot(diff, diff))
    return d
and a function that, given two arrays, will output all the unique combinations of elements between these arrays:
def getPairs(a, b):
    if N.array_equal(a, b):
        pairs = [(a[i], b[j]) for i in range(len(a)) for j in range(i + 1, len(b))]
    else:
        pairs = [(a[i], b[j]) for i in range(len(a)) for j in range(len(b))]
    return pairs
I want to take my data frame and find all the pairs of elements whose distance between them is less than some value, say 30. For the pairs that meet this requirement, I also need to store the distance I calculated in some other data frame. Here is my attempt at solving this, but this turned out to be extremely slow.
pairs = [getPairs(list(group.ind), list(boxes.get_group((name[0]+i, name[1]+j, name[2]+k)).ind))
         for i in [0, 1] for j in [0, 1] for k in [0, 1]
         if name[0]+i != 34 and name[1]+j != 34 and name[2]+k != 34]
pairs = list(itertools.chain(*pairs))

subInfo = pandas.DataFrame()
subInfo['pairs'] = pairs
subInfo['r'] = subInfo.pairs.apply(lambda x: dist(df_yz.query('ind == @x[0]').pos[0], df_yz.query('ind == @x[1]').pos[0]))
Don't worry about what I'm iterating over in this for loop; it works for the system I'm dealing with and isn't where I'm getting slowed down. The step where I use .query() is where the major jam happens.
The output I am looking for is something like:
pair        distance
(21, 22)    22.59
(21, 152)   15.01
(22, 427)   19.22
I made the distances up, and the pair list would be much longer, but that's the basic idea.
Took me a while, but here are three possible solutions. I hope they are self-explanatory; they were written in Python 3.x in a Jupyter Notebook. One remark: if your coordinates are world coordinates, you may want to use the Haversine distance (great-circle distance) instead of the Euclidean distance, which is a straight line.
First, create your data
import pandas as pd
import numpy as np

values = [
    {'x': -1.0, 'y': 7.0, 'z': 0.0, 'ind': 21, 'pos': [-2.76788330078, 217.786453247, 26.6822681427]},
    {'z': 0.0, 'ind': 22, 'pos': [-7.23852539062, 217.274139404, 26.6758270264]},
    {'y': 0.0, 'z': 1.0, 'ind': 152, 'pos': [-0.868591308594, 2.48404550552, 48.4036369324]},
    {'y': 6.0, 'z': 2.0, 'ind': 427, 'pos': [-0.304443359375, 182.772140503, 79.4475860596]}
]

def dist(a, b):
    """
    Calculates the Euclidean distance between two 3D vectors.
    """
    diff = np.array(a) - np.array(b)
    d = np.sqrt(np.dot(diff, diff))
    return d

df_initial = pd.DataFrame(values)
The following three solutions will generate this output:
       pairs   distance
1   (21, 22)   4.499905
3  (21, 427)  63.373886
7  (22, 427)  63.429709
The first solution is based on a full join of the data with itself. The downside is that it may exceed your memory if the dataset is huge; the advantages are the easy readability of the code and the use of pandas only:
#%%time
df = df_initial.copy()
# join data with itself, each line will contain two geo-positions
df['tmp'] = 1
df = df.merge(df, on='tmp', suffixes=['1', '2']).drop('tmp', axis=1)
# remove rows with similar index
df = df[df['ind1'] != df['ind2']]
# calculate distance for all
df['distance'] = df.apply(lambda row: dist(row['pos1'], row['pos2']), axis=1)
# filter only those within a specific distance
df = df[df['distance'] < 70]
# combine original indices into a tuple
df['pairs'] = list(zip(df['ind1'], df['ind2']))
# select columns of interest
df = df[['pairs', 'distance']]
def sort_tuple(idx):
    x, y = idx
    if y < x:
        return y, x
    return x, y
# sort values of each tuple from low to high
df['pairs'] = df['pairs'].apply(sort_tuple)
# drop duplicates
df.drop_duplicates(subset=['pairs'], inplace=True)
# print result
df
The second solution tries to avoid the memory issue of the first version by iterating over the original data row by row and calculating the distance between the current row and the original data, keeping only values that satisfy the distance constraint. I was expecting poor performance, but it wasn't bad at all (see the summary at the end).
#%%time
df = df_initial.copy()
results = list()
for index, row1 in df.iterrows():
    # calculate distance between current coordinate and all original rows in the data
    df['distance'] = df.apply(lambda row2: dist(row1['pos'], row2['pos']), axis=1)
    # filter only those within a specific distance and drop rows with same index as current coordinate
    df_tmp = df[(df['distance'] < 70) & (df['ind'] != row1['ind'])].copy()
    # prepare final data
    df_tmp['ind2'] = row1['ind']
    df_tmp['pairs'] = list(zip(df_tmp['ind'], df_tmp['ind2']))
    # remember data
    results.append(df_tmp)
# combine all into one dataframe
df = pd.concat(results)
# select columns of interest
df = df[['pairs', 'distance']]
def sort_tuple(idx):
    x, y = idx
    if y < x:
        return y, x
    return x, y
# sort values of each tuple from low to high
df['pairs'] = df['pairs'].apply(sort_tuple)
# drop duplicates
df.drop_duplicates(subset=['pairs'], inplace=True)
# print result
df
The third solution is based on spatial operations using the KDTree from Scipy.
#%%time
from scipy import spatial
tree = spatial.KDTree(list(df_initial['pos']))
# calculate distances (returns a sparse matrix)
distances = tree.sparse_distance_matrix(tree, max_distance=70)
# convert the sparse result to COOrdinate (coo) format so row, column and data can be read off directly
coo = distances.tocoo(copy=False)
def get_cell_value(idx: int, column: str = 'ind'):
    return df_initial.iloc[idx][column]

def extract_indices(row):
    # look up the original 'ind' values for the two matrix indices of this entry
    return get_cell_value(int(row['idx1'])), get_cell_value(int(row['idx2']))
df = pd.DataFrame({'idx1': coo.row, 'idx2': coo.col, 'distance': coo.data})
df['pairs'] = df.apply(extract_indices, axis=1)
# select columns of interest
df = df[['pairs', 'distance']]
def sort_tuple(idx):
    x, y = idx
    if y < x:
        return y, x
    return x, y
# sort values of each tuple from low to high
df['pairs'] = df['pairs'].apply(sort_tuple)
# drop duplicates
df.drop_duplicates(subset=['pairs'], inplace=True)
# print result
df
So, what about performance? If you just want to know which rows of your original data are within the desired distance, then the KDTree version (the third version) is super fast: it took just 4 ms to generate the sparse matrix. But since I then used the indices from that matrix to extract the data from the original DataFrame, the performance dropped. Of course, this should be tested on your full dataset.
version 1: 93.4 ms
version 2: 42.2 ms
version 3: 52.3 ms (4 ms)
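As a variation on the third approach (my addition, a sketch only): KDTree.query_pairs returns the within-radius index pairs directly, which skips the sparse matrix when only the pairs are needed. It reuses dist and df_initial from above:

import numpy as np
import pandas as pd
from scipy import spatial

positions = np.array(list(df_initial['pos']))
tree = spatial.KDTree(positions)

# set of (i, j) row-index pairs with i < j and distance <= 70
pairs_idx = tree.query_pairs(r=70)

result = pd.DataFrame(
    [((df_initial.iloc[i]['ind'], df_initial.iloc[j]['ind']),
      dist(df_initial.iloc[i]['pos'], df_initial.iloc[j]['pos']))
     for i, j in sorted(pairs_idx)],
    columns=['pairs', 'distance'])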
