I have a user-defined metric to implement as follows:
import pandas as pd
from datetime import timedelta

def metric(pred: pd.DataFrame, valid: pd.DataFrame):
    date_begin = valid.dt.min()
    date_end = valid.dt.max()
    x = valid[valid.label == 1].dt.min()
    # p
    p_n_tpp_df = valid[(valid.dt >= x) &
                       (valid.dt <= x + timedelta(days=30)) &
                       (valid.label == 1)]
    p_n_pp_df = valid[(valid.dt >= date_begin + timedelta(days=30)) &
                      (valid.dt <= date_end + timedelta(days=30)) &
                      (valid.label == 1)]
    p_n_tpp = len([x for x in pred.serial_number.values
                   if x in p_n_tpp_df.serial_number.unique()])
    p_n_pp = len([x for x in pred.serial_number.values
                  if x in p_n_pp_df.serial_number.unique()])
    p = p_n_tpp / p_n_pp
    print('p: ', p)
    # r
    p_n_tpr_df = valid[(valid.dt >= date_begin - timedelta(days=30)) &
                       (valid.dt <= date_end - timedelta(days=30)) &
                       (valid.label == 1)]
    p_n_pr_df = valid[(valid.dt >= date_begin) &
                      (valid.dt <= date_end) &
                      (valid.label == 1)]
    p_n_tpr = len([x for x in pred.serial_number.values
                   if x in p_n_tpr_df.serial_number.unique()])
    p_n_pr = len([x for x in pred.serial_number.values
                  if x in p_n_pr_df.serial_number.unique()])
    r = p_n_tpr / p_n_pr
    print('r: ', r)
    m = 2 * p * r / (p + r)
    return m
The pred and valid DataFrames have the same columns, and their dt values do not intersect.
All the values of serial_number in valid are a subset of the values of serial_number in pred.
The label column has only 2 values: 0 or 1.
Samples of pred and valid are as follows:
print(pred.head(3))
serial_number dt label
0 123 2011-03-21 1
1 52 2011-03-22 0
2 12 2011-03-01 1
..., ...
print(pred.info())
Int64Index: 10000000 entries,
Data columns (total 3 columns):
serial_number int32
dt datetime64[ns]
label int8
..., ...
print(valid.head(3))
serial_number dt label
0 324 2011-04-22 1
1 52 2011-04-22 0
2 14 2011-04-01 1
..., ...
print(valid.info())
Int64Index: 10000000 entries,
Data columns (total 3 columns):
serial_number int32
dt datetime64[ns]
label int8
The input DataFrames each have about 10,000,000 rows and 3 columns.
When I use this function to calculate the metric, it is really slow, taking more than 2 hours on an Intel 9600KF.
So I am wondering how to optimize this code for time cost.
Thanks in advance.
Here is the biggest performance win in your code:
Numpy set logic
len([x for x in pred.serial_number.values\
if x in p_n_tpr_df.serial_number.unique()])
Any line that looks like this is getting the size of the set intersection of pred.serial_number and p_n_tpr_df.serial_number. Using numpy rather than the list comprehension and the unique call will save substantial compute time:
intersect_size = np.intersect1d(pred.serial_number.values,
p_n_tpr_df.serial_number.values).shape[0]
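For reference, here is a sketch of the whole metric with the list comprehensions swapped for np.intersect1d (the column names and 30-day windows are taken from the question). One assumption to note: np.intersect1d counts each matching serial number once, so if pred can contain repeated serial numbers, np.isin(pred.serial_number, df.serial_number).sum() reproduces the original count instead.

import numpy as np
import pandas as pd
from datetime import timedelta

def metric_fast(pred: pd.DataFrame, valid: pd.DataFrame) -> float:
    # sketch only: same filters as the original, set sizes via np.intersect1d
    date_begin = valid.dt.min()
    date_end = valid.dt.max()
    first_fail = valid[valid.label == 1].dt.min()
    pred_sn = pred.serial_number.values
    fail = valid.label == 1

    def hits(df):
        # size of the set intersection with the predicted serial numbers
        return np.intersect1d(pred_sn, df.serial_number.values).shape[0]

    # p
    p_n_tpp_df = valid[(valid.dt >= first_fail) & (valid.dt <= first_fail + timedelta(days=30)) & fail]
    p_n_pp_df = valid[(valid.dt >= date_begin + timedelta(days=30)) & (valid.dt <= date_end + timedelta(days=30)) & fail]
    p = hits(p_n_tpp_df) / hits(p_n_pp_df)
    # r
    p_n_tpr_df = valid[(valid.dt >= date_begin - timedelta(days=30)) & (valid.dt <= date_end - timedelta(days=30)) & fail]
    p_n_pr_df = valid[(valid.dt >= date_begin) & (valid.dt <= date_end) & fail]
    r = hits(p_n_tpr_df) / hits(p_n_pr_df)
    return 2 * p * r / (p + r)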
Related
I have an excel file with 2 columns titled Lat and Lon with some data such as:
Lat Lon
36.19553° N 95.90918° W
36.19550° N 95.93592° W
36.20277° N 95.94484° W
36.20277° N 95.95381° W
36.22436° N 95.98023° W
36.21005° N 95.94487° W
36.21006° N 95.93594° W
35.99968° N 96.09681° W
35.97043° N 95.98949° W
35.96317° N 95.98951° W
35.99968° N 96.11459° W
35.99967° N 96.10568° W
35.96318° N 95.99839° W
35.96315° N 96.00728° W
35.99239° N 96.13247° W
I am trying to read in the Excel file and group rows that are within 0.00004 of each other in either the Lat or the Lon column. It should start with the first row, check the remaining rows for any that are within 0.00004, and insert a number starting at one in a column called 'Drive' for each grouping.
The expected output is supposed to be:
Lat Lon Drive
0 36.19553 95.90918 1
1 36.19550 95.93592 1
2 36.20277 95.94484 2
3 36.20277 95.95381 2
4 36.22436 95.98023 3
5 36.21005 95.94487 2
6 36.21006 95.93594 1
7 35.99968 96.09681 4
8 35.97043 95.98949 5
9 35.96317 95.98951 5
10 35.99968 96.11459 4
11 35.99967 96.10568 4
12 35.96318 95.99839 5
13 35.96315 96.00728 5
14 35.99239 96.13247 6
I have made several attempts with no success.
Here is the latest attempt:
# Read the data into a pandas DataFrame
df = pd.read_excel('data.xlsx')
# Convert Lat and Lon to absolute values for easy comparison
df['Lat'] = df['Lat'].abs()
df['Lon'] = df['Lon'].abs()
# Initialize the counter and group column
counter = 1
df['Drive'] = 0
# Loop over the DataFrame rows
for i in range(len(df)):
    if df['Drive'][i] == 0:
        df.loc[(df['Lat'].between(df['Lat'][i] - 4, df['Lat'][i] + 4)) &
               (df['Lon'].between(df['Lon'][i] - 4, df['Lon'][i] + 4)), 'Drive'] = counter
        counter += 1
# Print the result
print(df)
I get the error: bad operand type for abs(): 'str'. Here is another attempt that first parses the degree strings into floats:
# Read the data into a pandas DataFrame
df = pd.read_excel(workbook_path)
# Extract the degrees from the string value
df['Lat'] = df['Lat'].str.extract(r'(\d+\.\d+)')
df['Lon'] = df['Lon'].str.extract(r'(\d+\.\d+)')
df['Lat'] = df['Lat'].astype(float)
df['Lon'] = df['Lon'].astype(float)
df['Drive'] = 0
drive = 1
for i in range(len(df)):
    if df.loc[i, 'Drive'] == 0:
        df.loc[i, 'Drive'] = drive
        for j in range(i + 1, len(df)):
            if (abs(df.loc[i, 'Lat'] - df.loc[j, 'Lat']) <= 0.00004) or (abs(df.loc[i, 'Lon'] - df.loc[j, 'Lon']) <= 0.00004):
                df.loc[j, 'Drive'] = drive
        drive += 1
print(df)
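For what it's worth, the expected output appears to come from a greedy, order-dependent grouping: each unassigned row starts a new Drive, and later unassigned rows join it if their Lat or Lon is within 0.00004 of any row already in that Drive (that is how row 6 ends up in Drive 1 via row 1, while row 5 ends up in Drive 2). A sketch under that assumption, run after the Lat/Lon strings have been parsed to floats as above:

import numpy as np

def assign_drives(df, tol=0.00004):
    # Greedy single pass: each unassigned row seeds a new Drive; later unassigned
    # rows join if Lat OR Lon is within tol of any current member of that Drive.
    lat = df['Lat'].to_numpy(dtype=float)
    lon = df['Lon'].to_numpy(dtype=float)
    drive = np.zeros(len(df), dtype=int)
    current = 0
    for i in range(len(df)):
        if drive[i]:
            continue
        current += 1
        drive[i] = current
        members = [i]
        for j in range(i + 1, len(df)):
            if drive[j]:
                continue
            if any(abs(lat[j] - lat[m]) <= tol or abs(lon[j] - lon[m]) <= tol
                   for m in members):
                drive[j] = current
                members.append(j)
    df['Drive'] = drive
    return df

On the sample data above, this reproduces the Drive column shown in the expected output.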
My toy example is as follows:
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd
### prepare data
Xy = np.c_[load_iris(return_X_y=True)]
mycol = ['x1','x2','x3','x4','group']
df = pd.DataFrame(data=Xy, columns=mycol)
dat = df.iloc[:100,:] #only consider two species
dat['group'] = dat.group.apply(lambda x: 1 if x ==0 else 2) #two species means two groups
dat.shape
dat.head()
### Linear discriminant analysis procedure
G1 = dat.iloc[:50,:-1]; x1_bar = G1.mean(); S1 = G1.cov(); n1 = G1.shape[0]
G2 = dat.iloc[50:,:-1]; x2_bar = G2.mean(); S2 = G2.cov(); n2 = G2.shape[0]
Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2
a = np.linalg.inv(Sp).dot(x1_bar-x2_bar); u_bar = (x1_bar + x2_bar)/2
m = a.T.dot(u_bar); print("Linear discriminant boundary is {} ".format(m))
def my_lda(x):
    y = a.T.dot(x)
    pred = 1 if y >= m else 2
    return y.round(4), pred
xx = dat.iloc[:,:-1]
xxa = xx.agg(my_lda, axis=1)
xxa.shape
type(xxa)
Now xxa is a pandas.core.series.Series with shape (100,). Note that each element of xxa is a pair of values in parentheses. I want to convert xxa to a pd.DataFrame with 100 rows x 2 columns, so I try
xxa_df1 = pd.DataFrame(data=xxa, columns=['y','pred'])
which gives ValueError: Shape of passed values is (100, 1), indices imply (100, 2).
Then I continue to try
xxa2 = xxa.to_frame()
# xxa2 = pd.DataFrame(xxa) #equals `xxa.to_frame()`
xxa_df2 = pd.DataFrame(data=xxa2, columns=['y','pred'])
and xxa_df2 is all NaN with 100 rows x 2 columns. What should I do next?
Let's try Series.tolist()
xxa_df1 = pd.DataFrame(data=xxa.tolist(), columns=['y','pred'])
print(xxa_df1)
y pred
0 42.0080 1
1 32.3859 1
2 37.5566 1
3 31.0958 1
4 43.5050 1
.. ... ...
95 -56.9613 2
96 -61.8481 2
97 -62.4983 2
98 -38.6006 2
99 -61.4737 2
[100 rows x 2 columns]
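As an alternative (my own suggestion, not required for the fix above): since my_lda returns a tuple per row, DataFrame.apply with result_type='expand' builds the two-column frame directly and skips the intermediate Series:

xxa_df3 = xx.apply(my_lda, axis=1, result_type='expand')
xxa_df3.columns = ['y', 'pred']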
This web page explains how to calculate the rank correlation between two ranks (Maths rank and English rank):
https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
In my case I want to calculate a rolling correlation between the order rank and the value rank of a series.
seriesData = [81,114,2,32,16,9,7,4,3,26,8,5,3,6,7] #15 length
corr = rolling_corr(seriesData, 4)
For example, I want to calculate the correlation with a period of 4, so I have to perform 12 windowings. For each window I have to get the order rank and the value rank and calculate the correlation.
I already built the code, but it is too slow, because I have to handle series of up to 500,000 values. This is an example of the calculations for the first two windows:
seriesValue order_rank value_rank delta
81 4 2 4
114 3 1 4
2 2 4 4
32 1 3 4
Rho Value for 1st window : -0.6000000000000001
seriesValue order_rank value_rank delta
114 4 1 9
2 3 4 1
32 2 2 0
16 1 3 4
Rho Value for 2nd window : -0.3999999999999999
Can you help me speed up this calculation? Thanks.
def rolling_corr(seriesData, window):
    corr = []
    for j in range(len(seriesData)):
        if j < window:
            corr.append(np.nan)
        else:
            data = pd.DataFrame()
            data['seriesValue'] = list(seriesData[j - window:j])
            ### Create value rank and order rank for this window / period
            data = data.reset_index()
            data = data.rename(columns={'index': 'original_index'})
            data = data.sort_values('seriesValue', ascending=False).reset_index(drop=True)
            data = data.reset_index()
            data['index'] = [i + 1 for i in data['index']]
            data = data.rename(columns={'index': 'value_rank'})
            data = data.set_index('original_index')
            data = data.sort_index()
            data['order_rank'] = np.arange(window, 0, -1)
            ####
            # Calculate
            data['delta'] = [(data.loc[ii, 'value_rank'] - data.loc[ii, 'order_rank']) ** 2 for ii in range(len(data))]
            d = data['delta'].sum()
            value = (1 - (6 * d) / (window ** 3 - window))
            #print(data[['seriesValue', 'order_rank', 'value_rank', 'delta']].to_string(index=False))
            #print("Rho Value for window " + str(j) + " : " + str(value))
            corr.append(value)
    return corr
corr = rolling_corr(seriesData, 4)
I found that pandas has a rolling apply function; this code is faster than my previous version, approximately 0.72 seconds for 250 data points (10 times faster). I would be happy if someone could speed this up further.
def spearmanCorr(seriesData, n):
    orderRank = pd.Series(range(n, 0, -1))
    def corr(dat):
        valueRank = pd.Series(dat).rank(method='first', ascending=False).reset_index(drop=True)
        deltaSum = ((orderRank - valueRank) ** 2).sum()
        value = (1 - (6 * deltaSum) / (n ** 3 - n)) * 100
        return value
    # wrap in a Series so the function also accepts a plain list as input
    return pd.Series(seriesData).rolling(window=n).apply(corr)
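If the rolling apply is still too slow for 500,000 values, the whole computation can be vectorised with NumPy. Below is a sketch (my own, assuming NumPy >= 1.20 for sliding_window_view); it returns rho itself (not scaled by 100) and pads the first n - 1 positions with NaN so it lines up with the rolling version:

import numpy as np

def rolling_spearman(seriesData, n):
    a = np.asarray(seriesData, dtype=float)
    windows = np.lib.stride_tricks.sliding_window_view(a, n)   # shape (len(a) - n + 1, n)
    # value rank: 1 for the largest value in each window, ties broken by position ('first')
    order = np.argsort(-windows, axis=1, kind='stable')
    value_rank = np.empty_like(order)
    rows = np.arange(windows.shape[0])[:, None]
    value_rank[rows, order] = np.arange(1, n + 1)
    order_rank = np.arange(n, 0, -1)                           # same order rank in every window
    d = ((value_rank - order_rank) ** 2).sum(axis=1)
    rho = 1 - 6 * d / (n ** 3 - n)
    return np.concatenate([np.full(n - 1, np.nan), rho])

For the sample data, rolling_spearman(seriesData, 4)[3] gives -0.6, matching the first window shown above.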
I currently have a dataframe in the following format:
step tag_id x_pos y_pos
1 1 5 3
1 2 3 4
2 1 2 2
2 3 1 6
.........................
.........................
N 1 5 7
For each row in the df, I am aiming to add an additional m rows, oversampling from a Gaussian distribution for the x and y values (independently). Thus, a df with N = 100 and m = 10 would result in a df of length 1,100, including the original and oversampled values.
The code I have for this works, but it is extremely slow over a large dataset (N > 100k). There are many operations (creating new arrays/dfs, use of itertuples, etc.) that I'm sure are hampering performance; I would appreciate any help on how to improve the performance so I can generate higher m values over the whole dataset. For instance: the input data is a pandas dataframe, but the multivariate normal function operates on numpy arrays. Is there a more natural way to implement this through pandas without the copying between numpy arrays and dataframes? Thanks!
Reproducible example:
import pandas as pd
import numpy as np
import random
def gaussianOversample2(row, n):
    sigma = 2
    mean_x = float(getattr(row, 'x_pos'))
    mean_y = float(getattr(row, 'y_pos'))
    step = getattr(row, 'step')
    tag_id = getattr(row, 'tag_id')
    sigma = np.array([1, 1])
    cov = np.diag(sigma ** 2)
    x, y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
    x = np.concatenate(([mean_x], x))
    y = np.concatenate(([mean_y], y))
    steps = np.empty(n + 1)
    tags = np.empty(n + 1)
    steps.fill(step)
    tags.fill(tag_id)
    return x, y, steps, tags

def oversampleDf(df, n):
    oversampled_arr = np.empty((0, 4), float)
    # with input df with step, tag_id, x_pos, y_pos
    data = pd.DataFrame(columns=df.columns)
    count = 0
    for row in df.itertuples(index=False):
        count = count + 1
        temp = np.zeros((len(row), n + 1))
        oversample_x, oversample_y, steps, tags = gaussianOversample2(row, n)
        temp[0] = steps
        temp[1] = tags
        temp[2] = oversample_x
        temp[3] = oversample_y
        temp = pd.DataFrame(temp.T, columns=df.columns)
        data = data.append(temp)
        if count % 1000 == 0:
            print("Row: ", count)
    return data

df = pd.DataFrame([[1, 1, 5, 3], [1, 2, 3, 4], [2, 1, 2, 2], [2, 3, 1, 6]], columns=['step', 'tag_id', 'x_pos', 'y_pos'])
res = oversampleDf(df, 20)
"""
# Result should be:
step tag_id x_pos y_pos
0 1.0 1.0 5.000000 3.000000
1 1.0 1.0 3.423492 3.886602
2 1.0 1.0 5.404581 2.177559
3 1.0 1.0 4.023274 2.883737
4 1.0 1.0 3.390710 3.038782
.. ... ... ... ...
16 2.0 3.0 1.894151 5.510321
17 2.0 3.0 1.110932 5.281578
18 2.0 3.0 1.623538 4.529825
19 2.0 3.0 -0.576756 7.476872
20 2.0 3.0 -0.866123 5.898048
"""
This is the solution I have found for myself; it is more of a workaround than a faster technique. I instead write out to a csv file, which I then read back in once complete, like so:
def gaussianOversample3(row, n):
    mean_x = float(getattr(row, 'x_pos'))
    mean_y = float(getattr(row, 'y_pos'))
    step = getattr(row, 'step')
    tag_id = getattr(row, 'tag_id')
    sigma = np.array([1, 1])
    cov = np.diag(sigma ** 2)
    x, y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
    x = np.concatenate(([mean_x], x))
    y = np.concatenate(([mean_y], y))
    steps = np.empty(n + 1)
    tags = np.empty(n + 1)
    steps.fill(step)
    tags.fill(tag_id)
    pd.DataFrame(data=np.column_stack((steps, tags, x, y))).to_csv("oversample.csv", mode='a', header=False)

def oversampleDf2(df, n):
    filename = "oversample.csv"
    d = pd.DataFrame(list())
    d.to_csv(filename)
    #count = 0
    for row in df.itertuples(index=False):
        #count = count + 1
        gaussianOversample3(row, n)
        #if count % 10000 == 0:
        #    print("Row: ", count)
Because of how it is reading the file, I have to do the following:
oversampleDf2(defensive_df2, num_oversamples)
oversampled_df = pd.read_csv("oversample_10.csv", sep= ' ')
oversampled_df.columns = ['col']
oversampled_df = oversampled_df.col.str.split(",",expand=True)
oversampled_df.columns = ['temp', 'step', 'tag_id', 'x_pos', 'y_pos']
oversampled_df = oversampled_df.drop(['temp'], axis = 1)
oversampled_df = oversampled_df.astype(float)
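For comparison, here is a fully vectorised sketch (my own, assuming the same four columns and unit variance as in the question) that avoids both the row loop and the csv round trip: repeat every row m times, add independent Gaussian noise to x_pos and y_pos in one shot, and keep the untouched originals alongside their samples:

import numpy as np
import pandas as pd

def oversample_vectorized(df, m, sigma=1.0):
    # Repeat each row m times and perturb x_pos/y_pos with N(0, sigma^2) noise.
    noisy = df.loc[df.index.repeat(m)].copy()
    noise = np.random.normal(0.0, sigma, size=(len(noisy), 2))
    noisy[['x_pos', 'y_pos']] = noisy[['x_pos', 'y_pos']].to_numpy() + noise
    # Stable sort on the original index keeps each original row just before its m noisy copies.
    return pd.concat([df, noisy]).sort_index(kind='stable').reset_index(drop=True)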
Suppose
s = pd.Series(range(50))
0 0
1 1
2 2
3 3
...
48 48
49 49
How can I get a new series consisting of the sum of every n rows?
The expected result when n = 5 is as below:
0 10
1 35
2 60
3 85
...
8 210
9 235
It can of course be accomplished by using loc or iloc and looping in Python, but I believe it could be done simply in a Pandas way.
Also, this is a very simplified example; I don't expect an explanation of the sequence :). The actual data series I'm working with has a time index and the number of events that occurred in each second as the values.
GroupBy.sum
N = 5
s.groupby(s.index // N).sum()
0 10
1 35
2 60
3 85
4 110
5 135
6 160
7 185
8 210
9 235
dtype: int64
Chunk the index into groups of 5 and group accordingly.
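Since you mention the real series has a time index with per-second event counts, resample may be even more direct. A sketch, assuming a DatetimeIndex at one-second frequency:

import numpy as np
import pandas as pd

# hypothetical per-second event counts
idx = pd.date_range('2021-01-01', periods=50, freq='s')
counts = pd.Series(np.arange(50), index=idx)

out = counts.resample('5s').sum()   # sum of every 5 seconds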
numpy.reshape + sum
If the size is a multiple of N (or 5), you can reshape and add:
s.values.reshape(-1, N).sum(1)
# array([ 10, 35, 60, 85, 110, 135, 160, 185, 210, 235])
numpy.add.at
b = np.zeros(len(s) // N)
np.add.at(b, s.index // N, s.values)
b
# array([ 10., 35., 60., 85., 110., 135., 160., 185., 210., 235.])
The most efficient solution I can think of is f1() in my example below. It is orders of magnitude faster than using the groupby in the other answer.
Note that f1() doesn't work when the length of the array is not an exact multiple, e.g. if you want to sum a 3-item array every 2 items.
For those cases, you can use f1v2():
f1v2( [0,1,2,3,4] ,2 ) = [1,5,4]
My code is below. I have used timeit for the comparisons:
import timeit
import numpy as np
import pandas as pd
def f1(a, x):
    if isinstance(a, pd.Series):
        a = a.to_numpy()
    return a.reshape((int(a.shape[0] / x), int(x))).sum(1)

def f2(myarray, x):
    return [sum(myarray[n: n + x]) for n in range(0, len(myarray), x)]

def f3(myarray, x):
    s = pd.Series(myarray)
    out = s.groupby(s.index // x).sum()
    return out

def f1v2(a, x):
    if isinstance(a, pd.Series):
        a = a.to_numpy()
    mod = a.shape[0] % x
    if mod != 0:
        excl = a[-mod:]
        keep = a[: len(a) - mod]
        out = keep.reshape((int(keep.shape[0] / x), int(x))).sum(1)
        out = np.hstack((out, excl.sum()))
    else:
        out = a.reshape((int(a.shape[0] / x), int(x))).sum(1)
    return out
a = np.arange(0,1e6)
out1 = f1(a,2)
out2 = f2(a,2)
out3 = f3(a,2)
t1 = timeit.Timer( "f1(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t1v2 = timeit.Timer( "f1v2(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t2 = timeit.Timer( "f2(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
t3 = timeit.Timer( "f3(a,2)" , globals = globals() ).repeat(repeat = 5, number = 2)
resdf = pd.DataFrame(index = ['min time'])
resdf['f1'] = [min(t1)]
resdf['f1v2'] = [min(t1v2)]
resdf['f2'] = [min(t2)]
resdf['f3'] = [min(t3)]
#the docs explain why it makes more sense to take the min than the avg
resdf = resdf.transpose()
resdf['% difference vs fastest'] = (resdf / resdf.min() - 1) * 100
b = np.array( [0,1,2,4,5,6,7] )
out1v2 = f1v2(b,2)