Finding overlapping segments in Pandas - python

I have two pandas DataFrames A and B, with columns ['start', 'end', 'value'] but not the same number of rows. I'd like to set the values for each row in A as follows:
A.iloc(i) = B['value'][B['start'] < A[i,'start'] & B['end'] > A[i,'end']]
There is a possibility of multiple rows of B satisfy this condition for each i, in that case max or sum of corresponding rows would be the result. In case if none satisfies the value of A.iloc[i] should not be updated or set to a default value of 0 (either way would be fine)
I'm interested to find the most efficient way of doing this.

import numpy as np
np.random.seed(1)
lenB = 10
lenA = 20
B_start = np.random.rand(lenB)
B_end = B_start + np.random.rand(lenB)
B_value = np.random.randint(100, 200, lenB)
A_start = np.random.rand(lenA)
A_end = A_start + np.random.rand(lenA)
#if you use dataframe
#B_start = B["start"].values
#B_end = ...
mask = (A_start[:, None ] > B_start) & (A_end[:, None] < B_end)
r, c = np.where(mask)
result = pd.Series(B_value[c]).groupby(r).max()
print result

Related

How to optimize following algorithm that iterates over a DataFrame of few million of rows?

I have the following algorithm that iterates over a DataFrame with few millions of rows.
It takes a lot of time for the algorithm to finish. Do you have any suggestions?
def k_nn_averaging(df: pd.DataFrame, k: int = 15, use_abs_value: bool = False) -> pd.DataFrame:
df_averaged = df.copy()
df[helper.modifiable_columns] = df[helper.modifiable_columns].astype(float)
df_averaged[helper.modifiable_columns] = df_averaged[helper.modifiable_columns].astype(float)
for i in range(0, df.shape[0]):
neighbours = list(range(i-k if i-k >= 0 else 0, i+k if i+k <= df_averaged.shape[0] else df_averaged.shape[0]))
neighbours.remove(i)
selectedNeighbourIndex = choice(neighbours)
factor = uniform(0,1)
currentSampleValues = df[helper.modifiable_columns].iloc[i]
neighbourSampleValues = df[helper.modifiable_columns].iloc[selectedNeighbourIndex]
average = 0
if not use_abs_value: average = factor*(currentSampleValues - neighbourSampleValues)
else: average = factor*(abs(currentSampleValues - neighbourSampleValues))
df_averaged.loc[i,helper.modifiable_columns] = currentSampleValues + average
return df_averaged
The first thing you should always want is to vectorize loops. Here is the modified code that avoids using Python loops and uses NumPy operations instead:
import pandas as pd
import numpy as np
def k_nn_averaging(df: pd.DataFrame, k: int = 15, use_abs_value: bool = False) -> pd.DataFrame:
df_averaged = df.copy()
df_averaged[helper.modifiable_columns] = df_averaged[helper.modifiable_columns].astype(float)
num_rows = df.shape[0]
modifiable_columns = helper.modifiable_columns
# create a matrix of the neighbour indices for each row
neighbour_indices = np.empty((num_rows, k*2+1), dtype=int)
neighbour_indices[:, k] = np.arange(num_rows) # set the current row index as the middle value
for i in range(k):
# set the left neighbours
neighbour_indices[i+1:, i] = neighbour_indices[i:-1, k] - 1
# set the right neighbours
neighbour_indices[:-i-1, k+i+1] = neighbour_indices[1:, k] + 1
# set the values outside the range of the DataFrame to -1
neighbour_indices[neighbour_indices < 0] = -1
neighbour_indices[neighbour_indices >= num_rows] = -1
# select the neighbour indices to use for each row
selected_neighbour_indices = neighbour_indices[:, neighbour_indices[0] >= 0]
# create a matrix of factors
factors = np.random.uniform(size=(num_rows, selected_neighbour_indices.shape[1]))
# select the neighbour values for each row
neighbour_values = df[modifiable_columns].values[selected_neighbour_indices]
# select the current values for each row
current_values = df[modifiable_columns].values[:, np.newaxis]
# calculate the average values
if not use_abs_value:
averages = factors * (current_values - neighbour_values)
else:
averages = factors * np.abs(current_values - neighbour_values)
# update the values in the output DataFrame
df_averaged[modifiable_columns] = current_values + averages
return df_averaged
I think this will be much faster than the original script.

Join rows in pandas, shift by one and create new data frame

I have a data frame as follows
I/P
date,low,high,close
d1,l1,h1,c1
d2,l2,h2,c2
d3,l3,h3,c3
d4,l4,h4,c4
d5,l5,h5,c5
d6,l6,h5,c5
d7,l7,h7,c7
O/P
d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
d2,l2,h2,c2,d3,l3,h3,c3,d4,l5,h4,c4
d3,l3,h3,c3,d4,l5,h4,c4,d5,l5,h5,c5
d4,l5,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
....
Basically join all rows, split into subarrays of 3 size each staring at each index, and create the op data frame.
Following code works. Buts its too verbose and slow. Does pandas have something inbuilt for this?
def flatten(df):
candles = []
i = 0
while i < len(df):
candles.append(df.iloc[i])
i= i+1
return candles
def slide_and_expand(candles, k):
return [candles[i:i+k] for i in range(len(candles) - k + 1)]
def candle_to_dict(col_name_prefix, candle_series):
candle_dict = {}
for index, val in candle_series.iteritems():
col_name = col_name_prefix+index
candle_dict[col_name] = val
return candle_dict
def candle_group_to_feature_vector(candle_group):
feature_vector_dict = {}
i = 0
for candle in candle_group:
col_name_prefix = f"c{i}_"
candle_dict = candle_to_dict(col_name_prefix, candle)
feature_vector_dict.update(candle_dict)
i= i+1
return feature_vector_dict
def candle_groups_to_feature_vectors(candle_groups):
feature_vectors = []
for candle_group in candle_groups:
feature_vector = candle_group_to_feature_vector(candle_group)
feature_vectors.append(feature_vector)
return feature_vectors
fv_len = 3
candles = flatten(data)
candle_groups = slide_and_expand(candles,fv_len)
feature_vectors = candle_groups_to_feature_vectors(candle_groups)
data_fv = pd.DataFrame.from_dict(feature_vectors, orient='columns')
data_fv
You could do something like this:
n = len(df.index) # number of rows in original dataframe 'df'
df_0 = df.loc[0:n-3]
df_1 = df.loc[1:n-2]
df_2 = df.loc[2:n-1]
df_final = pandas.concat([df_0, df_1, df_2], axis = 1)
You can save a few steps using Pandas rolling function using the windows size as the desired subarray length (window=SUBARR_SZ). Then, join each column with a ,, transform the result to a Series to be able to apply a join again, but now using each row in the Series (which contains the specific amount of subarrays).
import pandas as pd
df = pd.read_csv('sample.csv')
SUBARR_SZ = 3 # subarray size
df_list = []
for w in df.rolling(window=SUBARR_SZ):
if len(w) == SUBARR_SZ:
s = w.apply(','.join, axis=1).apply(pd.Series).apply(','.join)
df_list.append(s)
dff = pd.concat(df_list).reset_index(drop=True)
print(dff)
Output from dff
0 d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
1 d2,l2,h2,c2,d3,l3,h3,c3,d4,l4,h4,c4
2 d3,l3,h3,c3,d4,l4,h4,c4,d5,l5,h5,c5
3 d4,l4,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
4 d5,l5,h5,c5,d6,l6,h6,c6,d7,l7,h7,c7
dtype: object

Dataframes from arrays with different length - fill missing values by rmean of row

I'm want to create a dataframe, out of arrays with different size. I want to fill the missing values depending on similar values.
I've tried to stick the arrays together and do a sort and a split with numpy. I've then calculate the mean of the splits and decide wether its a value close to the mean or its better fill with nan.
def find_nearest(array, value):
array = np.asarray(array)
idx = (np.abs(array - value)).argmin()
return idx
#generate sample data
loa = [((np.arange(np.random.randint(1,3),np.random.randint(3,6)))*val).tolist()
for val in np.random.uniform(0.9,1.1,5)]
#reshape
flat_list = sum(loa,[])
#add some attributes
attributes = [np.random.randint(-3,-1) for x in range(len(flat_list))]
#sort and split on percentage change
flat_list.sort()
arr = np.array(flat_list)
arr_splits = np.split(arr, np.argwhere(np.diff(arr)/arr[1:]*100 > 12)[:,0])
#means of the splits
means = [np.mean(arr) for arr in arr_splits]
#create dataframe
i = 0
res = np.zeros((len(loa), len(means)*2))*np.nan
for row, l in enumerate(loa):
for val in l:
col = find_nearest(means, val)
res[row, col] = val
res[row, col+len(means)] = attributes[i]
i = i + 1
df = pd.DataFrame(res)
Is there another way, to do this stuff more directly with pandas? ... or something more elegant?

Pandas optimizing an interpolation/counting algorithm

I have a bunch of data (10M + records) that breaks down to an identifier, a location and a date. I want to find the number of times that any identifier moved from some locationA to some other locationB over the entire set of dates. Any identifier may not have a location for all possible dates. When an identifier does not have a location recorded, that should be treated as an actual 'unknown' location for that date.
Here is some reproducible fake data...
import numpy as np
import pandas as pd
import datetime
base = datetime.date.today()
num_days = 50
dates = np.array([base - datetime.timedelta(days=x) for x in range(num_days-1, -1, -1)])
ids = np.arange(50)
mi = pd.MultiIndex.from_product([ids, dates])
locations = np.array([chr(x) for x in 97 + np.random.randint(26, size=len(mi))])
s = pd.Series(locations, index=mi)
mask = np.random.rand(len(mi)) > .5
s[mask] = np.nan
s = s.dropna()
My initial thought was to create a dataframe and use boolean masking/vectorized operations to solve this
df = s.unstack(0).fillna('unknown')
Apparently my data is sparse enough to cause a MemoryError (from all the extra entries resulting from unstacking).
My current working solution is the following
def series_fn(s):
s = s.reindex(pd.date_range(s.index.levels[1].min(), s.index.levels[1].max()), level=-1).fillna('unknown')
mask_prev = (s != s.shift(-1))[:-1]
mask_next = (s != s.shift())[1:]
s_prev = s[:-1][mask_prev]
s_next = s[1:][mask_next]
s_tup = pd.Series(list(zip(s_prev, s_next)))
return s_tup.value_counts()
result_per_id = s.groupby(level=0).apply(series_fn)
result = result_per_id.sum(level=-1)
result looks like
(a, b) 1
(a, c) 5
(a, e) 3
(a, f) 3
(a, g) 3
(a, h) 3
(a, i) 1
(a, j) 1
(a, k) 2
(a, l) 2
...
This is going to take ~5 hours for all my data. Does anyone know any faster ways of doing this?
Thanks!
Hmmm, I guess I should have transposed the data... well that was a relatively simple fix. Instead of using groupby and apply,
s = s.reorder_levels(['date', 'id'])
s = s.sortlevel(0)
results = []
for i in range(len(s.index.levels[0])-1):
t = time.time()
s0 = s.loc[s.index.levels[0][i]]
s1 = s.loc[s.index.levels[0][i+1]]
df = pd.concat((s0, s1), axis=1)
# Note: this is slower than the line above
# df = s.loc[s.index.levels[0][0:2], :].unstack(0)
df = df.fillna('unknown')
mi = pd.MultiIndex.from_arrays((df.iloc[:, 0], df.iloc[:, 1]))
s2 = pd.Series(1, mi)
res = s2.groupby(level=[0, 1]).apply(np.sum)
results.append(res)
print(time.time() - t)
results = pd.concat(results, axis=1)
Still unclear on why the commented out section takes about three times as long as the three lines above it.

stratified sampling in numpy

In numpy I have a dataset like this. The first two columns are indices. I can divide my dataset into blocks via the indices, i.e. first block is 0 0 second block is 0 1 third block 0 2 then 1 0, 1 1, 1 2 and so on and so forth. Each block has at least two elements. The numbers in the indices columns can vary
I need to split the dataset along these blocks 80%-20% randomly such that after the split each block in both datasets has at least 1 element. How could I do that?
indices | real data
|
0 0 | 43.25 665.32 ... } 1st block
0 0 | 11.234 }
0 1 ... } 2nd block
0 1 }
0 2 } 3rd block
0 2 }
1 0 } 4th block
1 0 }
1 0 }
1 1 ...
1 1
1 2
1 2
2 0
2 0
2 1
2 1
2 1
...
See how do you like this. To introduce randomness, I am shuffling the entire dataset. It is the only way I have figured how to do the splitting vectorized. Maybe you could simply shuffle an indexing array, but that was one indirection too many for my brain today. I have also used a structured array, for ease in extracting the blocks. First, lets create a sample dataset:
from __future__ import division
import numpy as np
# Create a sample data set
c1, c2 = 10, 5
idx1, idx2 = np.arange(c1), np.arange(c2)
idx1, idx2 = np.repeat(idx1, c2), np.tile(idx2, c1)
items = 1000
i = np.random.randint(c1*c2, size=(items - 2*c1*c2,))
d = np.random.rand(items+5)
dataset = np.empty((items+5,), [('idx1', np.int), ('idx2', np.int),
('data', np.float)])
dataset['idx1'][:2*c1*c2] = np.tile(idx1, 2)
dataset['idx1'][2*c1*c2:-5] = idx1[i]
dataset['idx2'][:2*c1*c2] = np.tile(idx2, 2)
dataset['idx2'][2*c1*c2:-5] = idx2[i]
dataset['data'] = d
# Add blocks with only 2 and only 3 elements to test corner case
dataset['idx1'][-5:] = -1
dataset['idx2'][-5:] = [0] * 2 + [1]*3
And now the stratified sampling:
# For randomness, shuffle the entire array
np.random.shuffle(dataset)
blocks, _ = np.unique(dataset[['idx1', 'idx2']], return_inverse=True)
block_count = np.bincount(_)
where = np.argsort(_)
block_start = np.concatenate(([0], np.cumsum(block_count)[:-1]))
# If we have n elements in a block, and we assign 1 to each array, we
# are left with only n-2. If we randomly assign a fraction x of these
# to the first array, the expected ratio of items will be
# (x*(n-2) + 1) : ((1-x)*(n-2) + 1)
# Setting the ratio equal to 4 (80/20) and solving for x, we get
# x = 4/5 + 3/5/(n-2)
x = 4/5 + 3/5/(block_count - 2)
x = np.clip(x, 0, 1) # if n in (2, 3), the ratio is larger than 1
threshold = np.repeat(x, block_count)
threshold[block_start] = 1 # first item goes to A
threshold[block_start + 1] = 0 # seconf item goes to B
a_idx = threshold > np.random.rand(len(dataset))
A = dataset[where[a_idx]]
B = dataset[where[~a_idx]]
After running it, the split is roughly 80/20, and all blocks are represented in both arrays:
>>> len(A)
815
>>> len(B)
190
>>> np.all(np.unique(A[['idx1', 'idx2']]) == np.unique(B[['idx1', 'idx2']]))
True
Here's an alternative solution. I'm open for a code review if it is possible to implement this in a more numpy way (without for loops). #Jamie 's answer is really good, it's just that sometimes it produces skewed ratios within blocks of data.
ratio = 0.8
IDX1 = 0
IDX2 = 1
idx1s = np.arange(len(np.unique(self.data[:,IDX1])))
idx2s = np.arange(len(np.unique(self.data[:,IDX2])))
valid = None
train = None
for i1 in idx1s:
for i2 in idx2:
mask = np.nonzero((data[:,IDX1] == i1) & (data[:,IDX2] == i2))
curr_data = data[mask,:]
np.random.shuffle(curr_data)
start = np.min(mask)
end = np.max(mask)
thres = start + np.around((end - start) * ratio).astype(np.int)
selected = mask < thres
train_idx = mask[0][selected[0]]
valid_idx = mask[0][~selected[0]]
if train != None:
train = np.vstack((train,data[train_idx]))
valid = np.vstack((valid,data[valid_idx]))
else:
train = data[train_idx]
valid = data[valid_idx]
I'm assuming that each block has at least two entries and also that if it has more than two you want them assigned as closely as possible to 80/20. The easiest way to do this seems to be to assign a random number to all rows, and then choose based on percentiles within each stratified sample. Say this is the data in file strat_sample.csv:
Index_1,Index_2,Data_1,Data_2
0,0,0.614583182,0.677644482
0,0,0.321384981,0.598450854
0,0,0.303029607,0.300593782
0,0,0.646010758,0.612006715
0,0,0.484572883,0.30052535
0,1,0.010625416,0.118671475
0,1,0.428967984,0.23795173
0,1,0.523440618,0.457275922
0,1,0.379612652,0.337640868
0,1,0.338180659,0.206399031
1,0,0.079386,0.890939911
1,0,0.572864624,0.725615079
1,0,0.045891404,0.300128917
1,0,0.578792198,0.100698871
1,0,0.776485138,0.475135948
1,0,0.401850419,0.784835723
1,1,0.087660923,0.497299605
1,1,0.8460978,0.825774802
1,1,0.526015021,0.581905971
1,1,0.23324672,0.299475291
Then this code (using Pandas data structures) works as desired
import numpy as np
import random as rnd
import pandas as pd
#sample data strat_sample.csv, contents to follow
def TreatmentOneCount(n , *args):
#assign a minimum one to each group but as close as possible to fraction OptimalRatio in group 1.
OptimalRatio = args[0]
if n < 2:
print("N too small, assignment not defined.")
a = NaN
elif n == 2:
a = 1
else:
"""
There are one of two numbers that are close to the target ratio, one above, the other below
If the number above is N and it is closest to optimal, then you need to set things to N-1 to ensure both groups have at least one member (recall n>2)
If the number below is 0 and it is closest to optimal, then you need to set things to 1 to ensure both groups have at least one member (recall n>2)
"""
targetassigment = OptimalRatio * n
if targetassigment - floor(targetassigment) > 0.5:
a = min(ceil(targetassigment),n-1)
else:
a = max(floor(targetassigment),1)
return a
df = pd.read_csv('strat_sample.csv', sep=',' , header=0)
#assign a random number to each entry
df['RandScore'] = np.random.uniform(0,1,df.shape[0])
df.sort(columns= ['Index_1' ,'Index_2','RandScore'], inplace = True)
#Within each block assign a rank based on random number.
df['RandRank'] = df.groupby(['Index_1','Index_2'])['RandScore'].rank()
#make a group index
df['MasterIdx'] = df['Index_1'].apply(str) + df['Index_2'].apply(str)
#Store the counts for members of each block
seriestest = df.groupby('MasterIdx')['RandRank'].count()
seriestest.name = "Counts"
dftest = pd.DataFrame(seriestest)
#Add the block counts to the data
df = df.merge(dftest, how='left', left_on = 'MasterIdx', right_index= True)
#Make the actual assignments to the two groups
df['Assignment'] = (df['RandRank'] <= df['Counts'].apply(TreatmentOneCount, args = (0.8,))) * -1 + 2
df.drop(['MasterIdx', 'Counts', 'RandRank', 'RandScore'], axis=1)
from sklearn import cross_validation
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=0)

Categories

Resources