say I have:
df ={'animal' : [1, 1, 1, 1, 1, 1, 1, 2, 2],
'x':[76.551, 77.529, 78.336,77, 78.02, 79.23, 77.733, 79.249, 76.077],
'y': [151.933, 152.945, 153.970, 119.369, 120.615, 118.935, 119.115, 152.004, 153.027],
'time': [0, 1, 2, 0, 3,2,5, 0, 1]}
df = pd.DataFrame(df)
# get distance travelled between points
def get_diff(df):
dx = (df['x'] - df.groupby('animal')['x'].shift(1))
dy = (df['y'] - df.groupby('animal')['y'].shift(1))
df['distance'] = (dx**2 + dy**2)**0.5
return df
# get the start/end coordinates
def get_startend(df):
for i in range(len(df)):
df.loc[df['distance'] > 5, 'start'] = 'start' #if distance >5, assign 'start'
df.loc[df['distance'].isnull(), 'start'] = 'start' #if distance = NaN, assign 'start'
cond = df['start'].shift(-1).str.contains('start').fillna(False) #for every idx before row 'start', assign 'end'
df.loc[cond, 'start'] = 'end'
df['start'].iloc[-1] = 'end' #assign 'end' for last idx
return df
df = get_diff(df)
df = get_startend(df)
after some preprocessing, I end up with:
animal x y time distance start
0 1 76.551 151.933 0 NaN start
1 1 77.529 152.945 1 1.407348 NaN
2 1 78.336 153.970 2 1.304559 end
3 1 77.000 119.369 0 34.626783 start
4 1 76.020 120.615 3 1.585218 NaN
5 1 79.230 118.935 2 3.623051 NaN
6 1 77.733 119.115 5 1.507783 end
7 2 79.249 152.004 0 NaN start
8 2 76.077 153.027 1 3.332884 end
I want to artificially recenter the start coordinates at (0,0). So if the start column has 'start', then subtract the x,y values from all the following rows until reach the next start index, then subtract the new x,y values, etc.
output should look something like:
animal x y time distance start newX newY
0 1 76.551 151.933 0 NaN start 0 0 #(76.551-76.551, 151.993-151.933)
1 1 77.529 152.945 1 1.407348 NaN 0.978 1.012 #(77.529-76.551, 152.945-151.933)
2 1 78.336 153.970 2 1.304559 end 1.785 2.012 #(78.336-76.551, 153.970-151.933)
3 1 77.000 119.369 0 34.626783 start 0 0 #(77-77, 119.369-119.369)
4 1 76.020 120.615 3 1.610253 NaN -0.98 1.246 #(76.020-77, 120.615-119.363)
5 1 79.230 118.935 2 3.623051 NaN 2.23 -0.434 #(..., ...)
6 1 77.733 119.115 5 1.507783 end 0.733 -0.254
7 2 79.249 152.004 0 NaN start 0 0 #(79.249-79.249, 152.004-152.004)
8 2 76.077 153.027 1 3.332884 end -3.172 1.023 #(76.077-79.249,153.027-152.004)
You can create a boolean mask based on start, and then use cumsum to turn that into a perfect grouper. Group by it, and then get the first value of x and y for each group. Subtract x and y from those firsts and you have your new columns:
df[['newX', 'newY']] = df[['x', 'y']] - df.groupby(df['start'].eq('start').cumsum())[['x', 'y']].transform('first')
Output:
animal x y time distance start newX newY
0 1 76.551 151.933 0 NaN start 0.000 0.000
1 1 77.529 152.945 1 1.407348 NaN 0.978 1.012
2 1 78.336 153.970 2 1.304559 NaN 1.785 2.037
3 1 77.000 119.369 0 34.626783 start 0.000 0.000
4 1 76.020 120.615 3 1.585218 NaN -0.980 1.246
5 1 79.230 118.935 2 3.623051 NaN 2.230 -0.434
6 1 77.733 119.115 5 1.507783 NaN 0.733 -0.254
7 2 79.249 152.004 0 NaN start 0.000 0.000
8 2 76.077 153.027 1 3.332884 NaN -3.172 1.023
You can use diff to compute the difference between previous rows.
df[['new_x', 'new_y']] = \
df.groupby(df['start'].notna().cumsum())[['x', 'y']].diff().fillna(0)
print(df)
# Output
animal x y time distance start new_x new_y
0 1 76.551 151.933 0 NaN start 0.000 0.000
1 1 77.529 152.945 1 1.407348 NaN 0.978 1.012
2 1 78.336 153.970 2 1.304559 NaN 0.807 1.025
3 1 77.000 119.369 0 34.626783 start 0.000 0.000
4 1 78.020 120.615 3 1.610253 NaN 1.020 1.246
5 1 79.230 118.935 2 2.070386 NaN 1.210 -1.680
6 1 77.733 119.115 5 1.507783 NaN -1.497 0.180
7 2 79.249 152.004 0 NaN start 0.000 0.000
8 2 76.077 153.027 1 3.332884 NaN -3.172 1.023
Related
I have a pandas Dataframe:
np.random.seed(0)
df = pd.DataFrame({'Close': np.random.uniform(0, 100, size=10)})
lbound, ubound = 0, 1
change = df["Close"].diff()
df["Change"] = change
df["Result"] = np.select([ np.isclose(change, 1) | np.isclose(change, 0) | np.isclose(change, -1),
# The other conditions
(change > 0) & (change > ubound),
(change < 0) & (change < lbound),
change.between(lbound, ubound)],[0, 1, -1, 0])
Close Change Result
0 54.881350 NaN 0
1 71.518937 16.637586 1
2 60.276338 -11.242599 -1
3 54.488318 -5.788019 -1
4 42.365480 -12.122838 -1
5 64.589411 22.223931 1
6 43.758721 -20.830690 -1
7 89.177300 45.418579 1
8 96.366276 7.188976 1
9 38.344152 58.022124 -1
Problem statement - Now, I want the majority of voting for index 1,2,3,4 assigned to index 0, index 2,3,4,5 assigned to index 1 of result columns, and so on for all the subsequent indexes.
I tried:
df['Voting'] = df['Result'].rolling(window = 4,min_periods=1).apply(lambda x: x.mode()[0]).shift()
But,this doesn't give the result I intend. It takes the first 4 rolling window and applies the mode function.
Close Change Result Voting
0 54.881350 NaN 0 NaN
1 71.518937 16.637586 1 0.0
2 60.276338 -11.242599 -1 0.0
3 54.488318 -5.788019 -1 -1.0
4 42.36548 -12.122838 -1 -1.0
5 64.589411 22.223931 1 -1.0
6 43.758721 -20.830690 -1 -1.0
7 89.177300 45.418579 1 -1.0
8 96.366276 7.188976 1 -1.0
9 38.344152 -58.022124 -1 1.0
Result I Intend - Rolling window of 4(index 1,2,3,4) should be set and mode function be applied and result
should be assigned to index 0,then next rolling window(index 2,3,4,5) and result should
be assigned to index 1 and so on..
You have to reverse your list before then shift of 1 (because you don't want the current index in the result):
majority = lambda x: 0 if len((m := x.mode())) > 1 else m[0]
df['Voting'] = (df[::-1].rolling(4, min_periods=1)['Result']
.apply(majority).shift())
print(df)
# Output
Close Change Result Voting
0 54.881350 NaN 0 -1.0
1 71.518937 16.637586 1 -1.0
2 60.276338 -11.242599 -1 -1.0
3 54.488318 -5.788019 -1 0.0
4 42.365480 -12.122838 -1 1.0
5 64.589411 22.223931 1 0.0
6 43.758721 -20.830690 -1 1.0
7 89.177300 45.418579 1 0.0
8 96.366276 7.188976 1 -1.0
9 38.344152 58.022124 -1 NaN
I have a Dataframe that look like this:
df_1:
Phase_1 Phase_2 Phase_3
0 8 4 2
1 4 6 3
2 8 8 3
3 10 5 8
...
I'd like to add a column called "Coeff" that compute (Phase_max - Phase_min) / Phase_max
For the first row: Coeff= (Phase_1 - Phase_3)/ Phase_1 = (8-2)/8 = 0.75
Expected OUTPUT:
df_1
Phase_1 Phase_2 Phase_3 Coeff
0 8 4 2 0.75
1 4 6 3 0.5
2 8 8 3 0.625
3 10 5 8 0.5
What is the best way to compute this without using loop? I want to apply it on large dataset.
here is one way to do it
# list the columns, you like to use in calculations
cols=['Phase_1', 'Phase_2', 'Phase_3']
# using max and min across the axis to calculate, for the defined columns
df['coeff']=(df[cols].max(axis=1).sub(df[cols].min(axis=1))).div(df[cols].max(axis=1))
df
little performance optimization (credit Yevhen Kuzmovych)
df['coeff']= 1 - (df[cols].min(axis=1).div(df[cols].max(axis=1)))
df
Phase_1 Phase_2 Phase_3 coeff
0 8 4 2 0.750
1 4 6 3 0.500
2 8 8 3 0.625
3 10 5 8 0.500
As per OP specification
I only want the max or the min between Phase_1 Phase_2 and Phase_3 and not other columns
The following will do the work
columns = ['Phase_1', 'Phase_2', 'Phase_3']
max_phase = df[columns].max(axis = 1)
min_phase = df[columns].min(axis = 1)
df['Coeff'] = (max_phase - min_phase) / max_phase
# or
max_phase = df[['Phase_1', 'Phase_2', 'Phase_3']].max(axis = 1)
min_phase = df[['Phase_1', 'Phase_2', 'Phase_3']].min(axis = 1)
df['Coeff'] = (max_phase - min_phase) / max_phase
# or
df['Coeff'] = (df[['Phase_1', 'Phase_2', 'Phase_3']].max(axis = 1) - df[['Phase_1', 'Phase_2', 'Phase_3']].min(axis = 1)) / df[['Phase_1', 'Phase_2', 'Phase_3']].max(axis = 1)
[Out]:
Phase_1 Phase_2 Phase_3 Coeff
0 8 4 2 0.750
1 4 6 3 0.500
2 8 8 3 0.625
3 10 5 8 0.500
Another alternative would be to use numpy built-in modules, as follows
columns = ['Phase_1', 'Phase_2', 'Phase_3']
max_phase = np.max(df[columns], axis = 1)
min_phase = np.min(df[columns], axis = 1)
df['Coeff'] = (max_phase - min_phase) / max_phase
# or
max_phase = np.max(df[['Phase_1', 'Phase_2', 'Phase_3']], axis = 1)
min_phase = np.min(df[['Phase_1', 'Phase_2', 'Phase_3']], axis = 1)
df['Coeff'] = (max_phase - min_phase) / max_phase
# or
df['Coeff'] = (np.max(df[['Phase_1', 'Phase_2', 'Phase_3']], axis = 1) - np.min(df[['Phase_1', 'Phase_2', 'Phase_3']], axis = 1)) / np.max(df[['Phase_1', 'Phase_2', 'Phase_3']], axis = 1)
[Out]:
Phase_1 Phase_2 Phase_3 Coeff
0 8 4 2 0.750
1 4 6 3 0.500
2 8 8 3 0.625
3 10 5 8 0.500
I'm trying to convert kilometer values in one column of a dataframe to mile values. I've tried various things and this is what I have now:
def km_dist(column, dist):
length = len(column)
for dist in zip(range(length), column):
if (column == data["dist"] and dist in data.loc[(data["dist"] > 25)]):
return dist / 5820
else:
return dist
data = data.apply(lambda x: km_dist(data["dist"], x), axis=1)
The dataset I'm working with looks something like this:
past_score dist income lab score gender race income_bucket plays_sports student_id lat long
0 8.091553 11.586920 67111.784934 0 7.384394 male H 3 0 1 0.0 0.0
1 8.091553 11.586920 67111.784934 0 7.384394 male H 3 0 1 0.0 0.0
2 7.924539 7858.126614 93442.563796 1 10.219626 F W 4 0 2 0.0 0.0
3 7.924539 7858.126614 93442.563796 1 10.219626 F W 4 0 2 0.0 0.0
4 7.726480 11.057883 96508.386987 0 8.544586 M W 4 0 3 0.0 0.0
With my code above, I'm trying to loop through all the "dist" values and if those values are in the right column ("data["dist"]") and greater than 25, divide those values by 5820 (the number of feet in a kilometer). More generally, I'd like to find a way to operate on specific elements of dataframes. I'm sure this is at least a somewhat common question, I just haven't been able to find an answer for it. If someone could point me towards somewhere with an answer, I would be just as happy.
Instead your solution filter rows with mask and divide column dist by 5820:
data.loc[data["dist"] > 25, 'dist'] /= 5820
Working same like:
data.loc[data["dist"] > 25, 'dist'] = data.loc[data["dist"] > 25, 'dist'] / 5820
data.loc[data["dist"] > 25, 'dist'] /= 5820
print (data)
past_score dist income lab score gender race \
0 8.091553 11.586920 67111.784934 0 7.384394 male H
1 8.091553 11.586920 67111.784934 0 7.384394 male H
2 7.924539 1.350194 93442.563796 1 10.219626 F W
3 7.924539 1.350194 93442.563796 1 10.219626 F W
4 7.726480 11.057883 96508.386987 0 8.544586 M W
income_bucket plays_sports student_id lat long
0 3 0 1 0.0 0.0
1 3 0 1 0.0 0.0
2 4 0 2 0.0 0.0
3 4 0 2 0.0 0.0
4 4 0 3 0.0 0.0
I have two columns which I want to compare every nth row. If it comes across the nth row it will compare them and put the result of the if statement in a new column.
When I tried the enumerate function it always ends up in the true part of the if statement. Somehow this piece of the code is always thrue:
if (count % 3)== 0:
for count, factors in enumerate(df.index):
if (count % 3)== 0: #every 3th row
df['Signal']=np.where(df['Wind Ch']>=df['Rain Ch'],'1', '-1')
else:
df['Signal']=0
In column 'Signal' I am expecting a '1' or '-1' every 3rd row and '0' on all the other rows. However I am getting '1' or '-1' on each row
Now I am getting:
Date Wind CH Rain CH Signal
0 5/10/2005 -1.85% -3.79% 1
1 5/11/2005 1.51% -1.66% 1
2 5/12/2005 0.37% 0.88% -1
3 5/13/2005 -0.81% 3.83% -1
4 5/14/2005 -0.28% 4.05% -1
5 5/15/2005 3.93% 1.79% 1
6 5/16/2005 6.23% 0.94% 1
7 5/17/2005 -0.08% 4.43% -1
8 5/18/2005 -2.69% 4.02% -1
9 5/19/2005 6.40% 1.33% 1
10 5/20/2005 -3.41% 2.38% -1
11 5/21/2005 3.27% 5.46% -1
12 5/22/2005 -4.40% -4.15% -1
13 5/23/2005 3.27% 4.48% -1
But I want to get:
Date Wind CH Rain CH Signal
0 5/10/2005 -1.85% -3.79% 0.0
1 5/11/2005 1.51% -1.66% 0.0
2 5/12/2005 0.37% 0.88% -1.0
3 5/13/2005 -0.81% 3.83% 0.0
4 5/14/2005 -0.28% 4.05% 0.0
5 5/15/2005 3.93% 1.79% 1.0
6 5/16/2005 6.23% 0.94% 0.0
7 5/17/2005 -0.08% 4.43% 0.0
8 5/18/2005 -2.69% 4.02% -1.0
9 5/19/2005 6.40% 1.33% 0.0
10 5/20/2005 -3.41% 2.38% 0.0
11 5/21/2005 3.27% 5.46% -1.0
12 5/22/2005 -4.40% -4.15% 0.0
13 5/23/2005 3.27% 4.48% 0.0
What am I missing here?
You can go about it like this, using np.vectorize to avoid loops:
import numpy as np
def calcSignal(x, y, i):
return 0 if (i + 1) % 3 != 0 else 1 if x >= y else -1
func = np.vectorize(calcSignal)
df['Signal'] = func(df['Wind CH'], df['Rain CH'], df.index)
df
Date Wind CH Rain CH Signal
0 5/10/2005 -1.85% -3.79% 0
1 5/11/2005 1.51% -1.66% 0
2 5/12/2005 0.37% 0.88% -1
3 5/13/2005 -0.81% 3.83% 0
4 5/14/2005 -0.28% 4.05% 0
5 5/15/2005 3.93% 1.79% 1
6 5/16/2005 6.23% 0.94% 0
7 5/17/2005 -0.08% 4.43% 0
8 5/18/2005 -2.69% 4.02% -1
9 5/19/2005 6.40% 1.33% 0
10 5/20/2005 -3.41% 2.38% 0
11 5/21/2005 3.27% 5.46% -1
12 5/22/2005 -4.40% -4.15% 0
13 5/23/2005 3.27% 4.48% 0
In general you don't want to loop over pandas objects. This case is no exception.
In [12]: df = pd.DataFrame({'x': [1,2,3], 'y': [10, 20, 30]})
In [13]: df
Out[13]:
x y
0 1 10
1 2 20
2 3 30
In [14]: df.loc[df.index % 2 == 0, 'x'] = 5
In [15]: df
Out[15]:
x y
0 5 10
1 2 20
2 5 30
there is no need to use enumerate function as i see it.Also your logic is faulty. you are rewriting complete column in every iteration of loop instead of ith row of column. you could simply do this
for count in range(len(df.index)):
if (count % 3)== 0: #every 3th row
df['Signal'].iloc[count]=np.where(df['Wind Ch'].iloc[count]>=df['Rain Ch'].iloc[count],'1', '-1')
else:
df['Signal'].iloc[0]=0
I want to fill the missing data of gender in proportion in a data set.
i use boolean index and head or tail function to select the top data i want, but when i use fillna function, it doesn't work.but after i try, it only run without boolean index, how can i get the top 3 empty values in example and fill it with 0.
a = pd.DataFrame(np.random.randn(50).reshape((10,5)))
a[0][1,3,4,6,9] = np.nan
a[0][a[0].isnull()].head(3).fillna(value = '0', inplace = True)
the dataframe didn't fill the NaN
You should use the loc function, otherwise you will never attribute a value. Here is what you could do :
a.loc[a[0].isnull().index[0:3], 0] = 0
In [1] : print(a)
Out[1] : 0 1 2 3 4
0 0.786182 -0.474953 -0.285881 -0.285502 -0.541957
1 0.000000 0.648042 1.104871 1.237512 -0.156453
2 -1.327987 1.851947 -0.522366 0.631213 -0.960167
3 0.000000 0.561111 -0.945439 -1.414678 0.433246
4 0.000000 -1.463828 0.141122 1.468288 0.649452
5 1.554890 -0.411142 -1.162577 -0.186640 0.774959
6 0.000000 -0.061410 -0.312673 -1.324719 1.763257
7 0.587035 0.265302 -0.793746 -0.148613 0.059330
8 0.909685 1.169786 -1.289559 -0.090185 -0.024272
9 0.000000 0.606329 -0.806034 1.102597 0.820976
Starting with data:
a = pd.DataFrame(np.random.randn(50).reshape((10,5)))
a[0][1,3,4,6,9] = np.nan
gives
0 1 2 3 4
0 -0.388759 -0.660923 0.385984 0.933920 0.164083
1 NaN -0.996237 -0.384492 0.191026 -1.168100
2 -0.773971 0.453441 -0.543590 0.768267 -1.127085
3 NaN -1.051186 -2.251681 -0.575438 1.642082
4 NaN 0.123432 1.063412 -1.556765 0.839855
5 -1.678960 -1.617817 -1.344757 -1.469698 0.276604
6 NaN -0.813213 -0.077575 -0.064179 1.960611
7 1.256771 -0.541197 -1.577126 -1.723853 0.028666
8 0.236197 0.868503 -1.304098 -1.578005 -0.632721
9 NaN -0.227659 -0.857427 0.010257 -1.884986
Now you want to work on column zero so we use fillna with a limit of 3 and replace that column inplace
a[0].fillna(0, inplace=True, limit=3)
gives
0 1 2 3 4
0 -0.388759 -0.660923 0.385984 0.933920 0.164083
1 0.000000 -0.996237 -0.384492 0.191026 -1.168100
2 -0.773971 0.453441 -0.543590 0.768267 -1.127085
3 0.000000 -1.051186 -2.251681 -0.575438 1.642082
4 0.000000 0.123432 1.063412 -1.556765 0.839855
5 -1.678960 -1.617817 -1.344757 -1.469698 0.276604
6 NaN -0.813213 -0.077575 -0.064179 1.960611
7 1.256771 -0.541197 -1.577126 -1.723853 0.028666
8 0.236197 0.868503 -1.304098 -1.578005 -0.632721
9 NaN -0.227659 -0.857427 0.010257 -1.884986