I currently have a dataframe in the following format:
step tag_id x_pos y_pos
1 1 5 3
1 2 3 4
2 1 2 2
2 3 1 6
.........................
.........................
N 1 5 7
For each row in the df, I am aiming to add an additional m rows oversampling from a Gaussian distribution for the x and y values (independent). Thus, a df of N = 100 and m = 10 would result in a df length 1010, including the original and oversampled values.
The code I have for this works, but it is extremely slow over a large dataset (N > 100k). There are many operations (creating new arrays/ dfs, use of itertuples, etc.) that I'm sure are hampering performance; I would appreciate any help as to how I can improve the performance so I can generate higher m values over the whole dataset. For instance: input data is from a pandas dataframe, but the multi-variate normal function operates on numpy arrays. Is there a more natural way to implement this through pandas without the copying between numpy arrays and dataframes? Thanks!
Reproducible example:
import pandas as pd
import numpy as np
import random
def gaussianOversample2(row, n):
sigma = 2
mean_x = float(getattr(row,'x_pos'))
mean_y = float(getattr(row,'y_pos'))
step = getattr(row, 'step')
tag_id = getattr(row, 'tag_id')
sigma = np.array([1,1])
cov = np.diag(sigma ** 2)
x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
x = np.concatenate(([mean_x], x))
y = np.concatenate(([mean_y], y))
steps = np.empty(n+1)
tags = np.empty(n+1)
steps.fill(step)
tags.fill(tag_id)
return x,y, steps, tags
def oversampleDf(df, n):
oversampled_arr = np.empty((0,4), float)
# with input df with step, tag_id, x_pos, y_pos
data = pd.DataFrame(columns = df.columns)
count = 0
for row in df.itertuples(index=False):
count = count + 1
temp = np.zeros((len(row), n+1))
oversample_x, oversample_y, steps, tags = gaussianOversample2(row, n)
temp[0] = steps
temp[1] = tags
temp[2] = oversample_x
temp[3] = oversample_y
temp = pd.DataFrame(temp.T, columns = df.columns)
data = data.append(temp)
if count % 1000 == 0:
print("Row: ", count)
return data
df = pd.DataFrame([[1, 1, 5, 3],[1, 2, 3, 4],[2, 1, 2, 2],[2, 3, 1, 6], columns = ['step', 'tag_id', 'x_pos', 'y_pos']])
res = oversampleDf(df, 20)
"""
# Result should be:
step tag_id x_pos y_pos
0 1.0 1.0 5.000000 3.000000
1 1.0 1.0 3.423492 3.886602
2 1.0 1.0 5.404581 2.177559
3 1.0 1.0 4.023274 2.883737
4 1.0 1.0 3.390710 3.038782
.. ... ... ... ...
16 2.0 3.0 1.894151 5.510321
17 2.0 3.0 1.110932 5.281578
18 2.0 3.0 1.623538 4.529825
19 2.0 3.0 -0.576756 7.476872
20 2.0 3.0 -0.866123 5.898048
"""
This is the solution I have found for myself; it is more of a workaround than a technique using quicker methods. I instead write out to a csv file, which I then read in once complete, as so:
def gaussianOversample3(row, n):
mean_x = float(getattr(row,'x_pos'))
mean_y = float(getattr(row,'y_pos'))
step = getattr(row, 'step')
tag_id = getattr(row, 'tag_id')
sigma = np.array([1,1])
cov = np.diag(sigma ** 2)
x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
x = np.concatenate(([mean_x], x))
y = np.concatenate(([mean_y], y))
steps = np.empty(n+1)
tags = np.empty(n+1)
steps.fill(step)
tags.fill(tag_id)
pd.DataFrame(data = np.column_stack((steps,tags,x,y))).to_csv("oversample.csv", mode = 'a', header = False)
def oversampleDf2(df, n):
filename = "oversample.csv"
d = pd.DataFrame(list())
d.to_csv(filename)
#count = 0
for row in df.itertuples(index=False):
#count = count + 1
gaussianOversample3(row, n)
#if count % 10000 == 0:
# print("Row: ", count)
Because of how it is reading the file, I have to do the following:
oversampleDf2(defensive_df2, num_oversamples)
oversampled_df = pd.read_csv("oversample_10.csv", sep= ' ')
oversampled_df.columns = ['col']
oversampled_df = oversampled_df.col.str.split(",",expand=True)
oversampled_df.columns = ['temp', 'step', 'tag_id', 'x_pos', 'y_pos']
oversampled_df = oversampled_df.drop(['temp'], axis = 1)
oversampled_df = oversampled_df.astype(float)
Related
My toy example is as follows:
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd
### prepare data
Xy = np.c_[load_iris(return_X_y=True)]
mycol = ['x1','x2','x3','x4','group']
df = pd.DataFrame(data=Xy, columns=mycol)
dat = df.iloc[:100,:] #only consider two species
dat['group'] = dat.group.apply(lambda x: 1 if x ==0 else 2) #two species means two groups
dat.shape
dat.head()
### Linear discriminant analysis procedure
G1 = dat.iloc[:50,:-1]; x1_bar = G1.mean(); S1 = G1.cov(); n1 = G1.shape[0]
G2 = dat.iloc[50:,:-1]; x2_bar = G2.mean(); S2 = G2.cov(); n2 = G2.shape[0]
Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2
a = np.linalg.inv(Sp).dot(x1_bar-x2_bar); u_bar = (x1_bar + x2_bar)/2
m = a.T.dot(u_bar); print("Linear discriminant boundary is {} ".format(m))
def my_lda(x):
y = a.T.dot(x)
pred = 1 if y >= m else 2
return y.round(4), pred
xx = dat.iloc[:,:-1]
xxa = xx.agg(my_lda, axis=1)
xxa.shape
type(xxa)
We have xxa is a pandas.core.series.Series with shape (100,). Note that there are two columns in parentheses of xxa, I want convert xxa to a pd.DataFrame with 100 rows x 2 columns and I try
xxa_df1 = pd.DataFrame(data=xxa, columns=['y','pred'])
which gives ValueError: Shape of passed values is (100, 1), indices imply (100, 2).
Then I continue to try
xxa2 = xxa.to_frame()
# xxa2 = pd.DataFrame(xxa) #equals `xxa.to_frame()`
xxa_df2 = pd.DataFrame(data=xxa2, columns=['y','pred'])
and xxa_df2 presents all NaN with 100 rows x 2 columns. What should I do next?
Let's try Series.tolist()
xxa_df1 = pd.DataFrame(data=xxa.tolist(), columns=['y','pred'])
print(xxa_df1)
y pred
0 42.0080 1
1 32.3859 1
2 37.5566 1
3 31.0958 1
4 43.5050 1
.. ... ...
95 -56.9613 2
96 -61.8481 2
97 -62.4983 2
98 -38.6006 2
99 -61.4737 2
[100 rows x 2 columns]
I'm looking to make it so that NaN values in a dataframe are filled in by the mean of all the values up to that point, as such:
A
0 1
1 2
2 3
3 4
4 5
5 NaN
6 NaN
7 11
8 NaN
Would become
A
0 1
1 2
2 3
3 4
4 5
5 3
6 3
7 11
8 4
You can solve it by running the following code
import numpy as np
import pandas as pd
df = pd.DataFrame({
"A": [ 1, 2, 3, 4, 5, pd.NA, pd.NA, 11, pd.NA ]
})
for idx in df[pd.isna(df["A"])].index:
df.loc[idx, "A"] = np.mean(df.loc[ : idx, "A" ])
It iterates on each NaN and fills it with the mean of the previous values, including those filled NaNs.
At the end you will have:
>>> df
A
0 1
1 2
2 3
3 4
4 5
5 3
6 3
7 11
8 4
EDIT
As stated by RichieV, performance may be an issue with this solution (its runtime complexity is O(N^2)) when there are many NaNs, but we also should avoid python iterations, since they are slow when compared to native pandas / numpy calls.
Here is an optimized version:
last_idx = None
cumsum = 0
cumnum = 0
for idx in df[pd.isna(df["A"])].index:
prev_values = df.loc[ last_idx : idx, "A" ]
# for some reason, pandas includes idx on the slice, so we remove it
prev_values = prev_values[ : -1 ]
cumsum += prev_values.sum()
cumnum += len(prev_values)
df.loc[idx, "A"] = int(cumsum / cumnum)
last_idx = idx
Result:
>>> df
A
0 1
1 2
2 3
3 4
4 5
5 3
6 3
7 11
8 4
Since in the worst case the script should pass on the dataframe twice, the runtime complexity is now O(N).
Marco's answer works fine but it can be optimized with incremental average formulas, from math.stackexchange.com
Here is an adaptation of that other question (not the exact formula, just the concept).
cumsum = 0
expanding_mean = []
for i, xi in enumerate(df['A']):
if pd.isna(xi):
mean = cumsum / i # divide by number of items up to previous row
expanding_mean.append(mean)
cumsum += mean
else:
cumsum += xi
df.loc[df['A'].isna(), 'A'] = expanding_mean
The main advantage with this code is not having to read all items up to the current index on each iteration to get the mean.
This option still uses a python loop--which is not the best choice with pandas--but there seems to be no way around it for this use case (hopefully someone will get inspired by this and find such method without a loop).
Performance tests
Three alternative functions were defined:
incremental: My answer.
from_origin: Marco's original answer.
incremental_pandas: Marco's updated answer.
Tests were done using timeit module with 3 repetitions on random samples with 0.4 probability of NaN.
Full code for testing
import pandas as pd
import numpy as np
import timeit
import collections
from matplotlib import pyplot as plt
def incremental(df: pd.DataFrame):
# error handling
if pd.isna(df.iloc[0, 0]):
df.iloc[0, 0] = 0
cumsum = 0
expanding_mean = []
for i, xi in enumerate(df['A']):
if pd.isna(xi):
mean = cumsum / i # divide by number of items up to previous row
expanding_mean.append(mean)
cumsum += mean
else:
cumsum += xi
df.loc[df['A'].isna(), 'A'] = expanding_mean
return df
def incremental_pandas(df: pd.DataFrame):
# error handling
if pd.isna(df.iloc[0, 0]):
df.iloc[0, 0] = 0
last_idx = None
cumsum = 0
cumnum = 0
for idx in df[pd.isna(df["A"])].index:
prev_values = df.loc[ last_idx : idx, "A" ]
# for some reason, pandas includes idx on the slice, so we remove it
prev_values = prev_values[ : -1 ]
cumsum += prev_values.sum()
cumnum += len(prev_values)
df.loc[idx, "A"] = cumsum / cumnum
last_idx = idx
return df
def from_origin(df: pd.DataFrame):
# error handling
if pd.isna(df.iloc[0, 0]):
df.iloc[0, 0] = 0
for idx in df[pd.isna(df["A"])].index:
df.loc[idx, "A"] = np.mean(df.loc[ : idx, "A" ])
return df
def get_random_sample(n, p):
np.random.seed(123)
return pd.DataFrame({'A':
np.random.choice(list(range(10)) + [np.nan],
size=n, p=[(1 - p) / 10] * 10 + [p])})
r = 3
p = 0.4 # portion of NaNs
# check result from all functions
results = []
for func in [from_origin, incremental, incremental_pandas]:
random_df = get_random_sample(1000, p)
new_df = random_df.copy(deep=True)
results.append(func(new_df))
print('Passed' if all(np.allclose(r, results[0]) for r in results[1:])
else 'Failed', 'implementation test')
timings = {}
for n in np.geomspace(10, 10000, 10):
random_df = get_random_sample(int(n), p)
timings[n] = collections.defaultdict(float)
results = {}
for func in ['incremental', 'from_origin', 'incremental_pandas']:
timings[n][func] = (
timeit.timeit(f'{func}(random_df.copy(deep=True))', number=r, globals=globals())
/ r
)
timings = pd.DataFrame(timings).T
print(timings)
timings.plot()
plt.xlabel('size of array')
plt.ylabel('avg runtime (s)')
plt.ylim(0)
plt.grid(True)
plt.tight_layout()
plt.show()
plt.close('all')
import numpy as np
import pandas as pd
from scipy.spatial.distance import directed_hausdorff
df:
1 1.1 2 2.1 3 3.1 4 4.1
45.13 7.98 45.10 7.75 45.16 7.73 NaN NaN
45.35 7.29 45.05 7.68 45.03 7.96 45.05 7.65
Calculated distance for 1 couple
x = df['3']
y = df['3.1']
P = np.array([x, y])
q = df['4']
w = df['4.1']
Q = np.array([q, w])
Q_final = list(zip(Q[0], Q[1]))
P_final = list(zip(P[0], P[1]))
directed_hausdorff(P_final, Q_final)[0]
Desired output:
Same process with for loop for the whole dataset
distance from a['0'], a['0']is 0
from a['0'], a['1'] is 0.234 (some number)
from a['0'], a['2'] is .. ...
From [0] to all, then to [1] to all and etc.
Finally I should get a matrix with 0s` in diagonal
I Have tried:
space = list(df.index)
dist = []
for j in space:
for k in space:
if k != j:
dist.append((j, k, directed_hausdorff(P_final, Q_final)[0]))
But getting same value of distance between [3] and [4]
I am not entirely sure what you are trying to do.. but based on how you calculated the first one, here is a possible solution:
import pandas as pd
import numpy as np
from scipy.spatial.distance import directed_hausdorff
df = pd.read_csv('something.csv')
groupby = lambda l, n: [tuple(l[i:i+n]) for i in range(0, len(l), n)]
values = groupby(df.columns.values, 2)
matrix = np.zeros((4, 4))
for Ps in values:
x = df[str(Ps[0])]
y = df[str(Ps[1])]
P = np.array([x, y])
for Qs in values:
q = df[str(Qs[0])]
w = df[str(Qs[1])]
Q = np.array([q, w])
Q_final = list(zip(Q[0], Q[1]))
P_final = list(zip(P[0], P[1]))
matrix[values.index(Ps), values.index(Qs)] = directed_hausdorff(P_final, Q_final)[0]
print(matrix)
Output:
[[0. 0.49203658 0.47927028 0.46861498]
[0.31048349 0. 0.12083046 0.1118034 ]
[0.25179357 0.22135944 0. 0.31064449]
[0.33955854 0.03 0.13601471 0. ]]
I want to create a loop that loads all the iterations of two variables into a dataframe in seperate columns. I want variable "a" to hold values between 0 and 1 in 0.1 increments, and the same for variable "b". In otherwords there should be 100 iterations when complete, starting with 0 & 0, and ending with 1 & 1.
I've tried the following code
data = [['Decile 1', 10], ['Decile_2', 15], ['Decile_3', 14]]
staging_table = pd.DataFrame(data, columns = ['Decile', 'Volume'])
profile_table = pd.DataFrame(columns = ['Decile', 'Volume'])
a = 0
b = 0
finished = False
while not finished:
if b != 1:
if a != 1:
a = a + 0.1
staging_table['CAM1_Modifier'] = a
staging_table['CAM2_Modifier'] = b
profile_table = profile_table.append(staging_table)
else:
b = b + 0.1
else:
finished = True
profile_table
You can use itertools.product to get all the combinations:
import itertools
import pandas as pd
x = [i / 10 for i in range(11)]
df = pd.DataFrame(
list(itertools.product(x, x)),
columns=["a", "b"]
)
# a b
# 0 0.0 0.0
# 1 0.0 0.1
# 2 0.0 0.2
# ... ... ...
# 118 1.0 0.8
# 119 1.0 0.9
# 120 1.0 1.0
#
# [121 rows x 2 columns]
itertools is your friend.
from itertools import product
for a, b in product(map(lambda x: x / 10, range(10)),
map(lambda x: x / 10, range(10))):
...
range(10) gives us the integers from 0 to 10 (regrettably, range fails on floats). Then we divide those values by 10 to get your range from 0 to 1. Then we take the Cartesian product of that iterable with itself to get every combination.
I am hoping to write a program that will run though multiple columns of data and create a new dataframe based on those that are found to be outliers and those that are blank. Currently, I have the below code that will replace the values with "Outlier" and "No Data" but I am struggling to convert this to a new dataframe.
Visual of request:
import pandas as pd
from pandas import ExcelWriter
# Remove Initial Data Quality
outl = ['.',0,' ']
# Pull in Data
path = r"C:\Users\robert.carmody\desktop\Python\PyTest\PyTGPS.xlsx"
sheet = 'Raw Data'
df = pd.read_excel(path,sheet_name=sheet)
data = pd.read_excel(path,sheet_name=sheet)
j = 0
while j < len(df.keys()): #run through total number of columns
list(df.iloc[:,j]) #create a list of all values within the 'j' column
if type(list(df.iloc[:,j])[0]) == float:
q1 = df.iloc[:,j].quantile(q=.25)
med = df.iloc[:,j].quantile(q=.50)
q3 = df.iloc[:,j].quantile(q=.75)
iqr = q3 - q1
ub = q3 + 1.5*iqr
lb = q1 - 1.5*iqr
mylist = [] #outlier list is defined
for i in df.iloc[:,j]: #identify outliers and add to the list
if i > ub or i < lb:
mylist.append(float(i))
else:
i
if mylist == []:
mylist = ['Outlier']
else:
mylist
else:
mylist = ['Outlier']
data.iloc[:,j].replace(mylist,'Outlier',inplace=True)
j = j + 1
data = data.fillna('No Data')
#Excel
path2 = r"C:\Users\robert.carmody\desktop\Python\PyTest\PyTGPS.xlsx"
writer = ExcelWriter(path2)
df.to_excel(writer,'Raw Data')
data.to_excel(writer,'Adjusted Data')
writer.save()
Suppose you data looks like this, and for simplicity upperbound is 2 and lowerbound is 0,
df = pd.DataFrame({'group':'A B C D E F'.split(' '), 'Q1':[1,1,5,2,2,2], 'Q2':[1,5,5,2,2,2],'Q3':[2,2,None,2,2,2]})
df.set_index('group', inplace=True)
i.e.:
Q1 Q2 Q3
group
A 1 1 2.0
B 1 5 2.0
C 5 5 NaN
D 2 2 2.0
E 2 2 2.0
F 2 2 2.0
Then the following might give what you want:
newData = []
for quest in df.columns: #run through the columns
q1 = df[quest].quantile(q=.25)
med = df[quest].quantile(q=.50)
q3 = df[quest].quantile(q=.75)
iqr = q3 - q1
#ub = q3 + 1.5*iqr
ub = 2 #my
#lb = q1 - 1.5*iqr
lb = 0 #my
for group in df.index:
i = df.loc[group, quest]
if i > ub or i < lb: #identify outliers and add to the list
newData += [[group, quest, 'Outlier', i]]
elif (i>0 or i<=0)==False:
newData += [[group, quest, 'None', None]]
creates a 2 dimensional list which can easily be transformed in a dataframe
by
pd.DataFrame(newData)