Assign value to subset of rows in Pandas dataframe - python

I want to assign values based on a condition on index in Pandas DataFrame.
class test():
def __init__(self):
self.l = 1396633637830123000
self.dfa = pd.DataFrame(np.arange(20).reshape(10,2), columns = ['A', 'B'], index = arange(self.l,self.l+10))
self.dfb = pd.DataFrame([[self.l+1,self.l+3], [self.l+6,self.l+9]], columns = ['beg', 'end'])
def update(self):
self.dfa['true'] = False
self.dfa['idx'] = np.nan
for i, beg, end in zip(self.dfb.index, self.dfb['beg'], self.dfb['end']):
self.dfa.ix[beg:end]['true'] = True
self.dfa.ix[beg:end]['idx'] = i
def do(self):
self.update()
print self.dfa
t = test()
t.do()
Result:
A B true idx
1396633637830123000 0 1 False NaN
1396633637830123001 2 3 True NaN
1396633637830123002 4 5 True NaN
1396633637830123003 6 7 True NaN
1396633637830123004 8 9 False NaN
1396633637830123005 10 11 False NaN
1396633637830123006 12 13 True NaN
1396633637830123007 14 15 True NaN
1396633637830123008 16 17 True NaN
1396633637830123009 18 19 True NaN
The true column is correctly assigned, while the idx column is not. Futhermore, this seems to depend on how the columns are initialized because if I do:
def update(self):
self.dfa['true'] = False
self.dfa['idx'] = False
also the true column does not get properly assigned.
What am I doing wrong?
p.s. the expected result is:
A B true idx
1396633637830123000 0 1 False NaN
1396633637830123001 2 3 True 0
1396633637830123002 4 5 True 0
1396633637830123003 6 7 True 0
1396633637830123004 8 9 False NaN
1396633637830123005 10 11 False NaN
1396633637830123006 12 13 True 1
1396633637830123007 14 15 True 1
1396633637830123008 16 17 True 1
1396633637830123009 18 19 True 1
Edit: I tried assigning using both loc and iloc but it doesn't seem to work:
loc:
self.dfa.loc[beg:end]['true'] = True
self.dfa.loc[beg:end]['idx'] = i
iloc:
self.dfa.loc[self.dfa.index.get_loc(beg):self.dfa.index.get_loc(end)]['true'] = True
self.dfa.loc[self.dfa.index.get_loc(beg):self.dfa.index.get_loc(end)]['idx'] = i

You are chain indexing, see here. The warning is not guaranteed to happen.
You should prob just do this. No real need to actually track the index in b, btw.
In [44]: dfa = pd.DataFrame(np.arange(20).reshape(10,2), columns = ['A', 'B'], index = np.arange(l,l+10))
In [45]: dfb = pd.DataFrame([[l+1,l+3], [l+6,l+9]], columns = ['beg', 'end'])
In [46]: dfa['in_b'] = False
In [47]: for i, s in dfb.iterrows():
....: dfa.loc[s['beg']:s['end'],'in_b'] = True
....:
or this if you have non-integer dtypes
In [36]: for i, s in dfb.iterrows():
dfa.loc[(dfa.index>=s['beg']) & (dfa.index<=s['end']),'in_b'] = True
In [48]: dfa
Out[48]:
A B in_b
1396633637830123000 0 1 False
1396633637830123001 2 3 True
1396633637830123002 4 5 True
1396633637830123003 6 7 True
1396633637830123004 8 9 False
1396633637830123005 10 11 False
1396633637830123006 12 13 True
1396633637830123007 14 15 True
1396633637830123008 16 17 True
1396633637830123009 18 19 True
[10 rows x 3 columns
If b is HUGE this might not be THAT performant.
As an aside, these look like nanosecond times. Can be more friendly by converting them.
In [49]: pd.to_datetime(dfa.index)
Out[49]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2014-04-04 17:47:17.830123, ..., 2014-04-04 17:47:17.830123009]
Length: 10, Freq: None, Timezone: None

Related

Detect presence of inverse pairs in two columns of a DataFrame

I have a dataframe with two columns; source, and target. I would like to detect inverse rows, i.e. for a pair of values (source, target), if there exists a pair of values (target, source) then assign True to a new column.
My attempt:
cols = ['source', 'target']
_cols = ['target', 'source']
sub_edges = edges[cols]
sub_edges['oneway'] = sub_edges.apply(lambda x: True if x[x.isin(x[_cols])] else False, axis=1)
You can apply a lambda function using similar logic to that in your example. We check if there are any rows in the dataframe with a reversed source/target pair.
Incidentally, the column name 'oneway' indicates to me the opposite of the logic described in your question, but to change this we can just remove the not in the lambda function.
Code
import pandas as pd
import random
edges = {"source": random.sample(range(20), 20),
"target": random.sample(range(20), 20)}
df = pd.DataFrame(edges)
df["oneway"] = df.apply(
lambda x: not df[
(df["source"] == x["target"]) & (df["target"] == x["source"]) & (df.index != x.name)
].empty,
axis=1,
)
Output
source target oneway
0 9 11 False
1 16 1 True
2 1 16 True
3 11 14 False
4 4 13 False
5 18 15 False
6 14 17 False
7 13 12 False
8 19 19 False
9 12 3 False
10 10 6 False
11 15 5 False
12 3 18 False
13 17 0 False
14 6 7 False
15 5 10 False
16 7 2 False
17 8 9 False
18 0 4 False
19 2 8 False

Checking if values of a pandas Dataframe are between two lists. Adding a boolean column

I am trying to add a new column to a pandas Dataframe (False/True),which reflects if the Value is between two datapoints from another file.
I have a two files which give the following info:
File A:(x) File B:(y)
'time' 'time_A' 'time_B'
0 1 0 1 3
1 3 1 5 6
2 5 2 8 10
3 7
4 9
5 11
6 13
I tried to do it with the .map function, however it gives true and false for each event, not one column.
x['Event'] = x['time'].map((lamda x: x< y['time_A']),(lamda x: x> y['time_B']))
This would be the expected result
->
File A:
'time' 'Event'
0 1 True
1 3 True
2 5 True
3 7 False
4 9 True
5 11 False
6 13 False
However what i get is something like this
->
File A:
'time'
0 1 "0 True
1 True
2 True"
Name:1, dtype:bool"
2 3 "0 True
1 True
2 True
Name:1, dtype:bool"
This should do it:
(x.assign(key=1)
.merge(y.assign(key=1),
on='key')
.drop('key', 1)
.assign(Event=lambda v: (v['time_A'] <= v['time']) &
(v['time'] <= v['time_B']))
.groupby('time', as_index=False)['Event']
.any())
time Event
0 1 True
1 3 True
2 5 True
3 7 False
4 9 True
5 11 False
6 13 False
Use pd.IntervalIndex here:
idx=pd.IntervalIndex.from_arrays(B['time_A'],B['time_B'],closed='both')
#output-> IntervalIndex([[1, 3], [5, 6], [8, 10]],closed='both',dtype='interval[int64]')
A['Event']=B.set_index(idx).reindex(A['time']).notna().all(1).to_numpy()
print(A)
time Event
0 1 True
1 3 True
2 5 True
3 7 False
4 9 True
5 11 False
6 13 False
One liner:
A['Event'] = sum(A.time.between(b.time_A, b.time_B) for _, b in B.iterrows()) > 0
Explain:
For each row b of B dataframe, A.time.between(b.time_A, b.time_B) returns a boolean series whether time is between time_A and time_B
sum(list_of_boolean_series) > 0: Elementwise OR

Boolean slicing by comparing one to many columns in pandas

How can i compare a column to all other columns and obtain a boolean series to slice the dataframe by using i loc?
import numpy as np
import pandas as pd
a = np.random.normal(1,10,(10,1))
b = np.random.normal(1,5,(10,1))
c = np.random.normal(1,5,(10,1))
d = np.random.normal(1,5,(10,1))
e = np.append(a,b, axis = 1)
e = np.append(e,c, axis = 1)
e = np.append(e,d, axis = 1)
df = pd.DataFrame(data = e, columns=['a','b','c','d'])
a b c d
0 4.043832 -1.672865 -0.401864 3.073481
1 4.828796 -0.830688 3.652347 -1.780346
2 13.055145 5.730707 -2.305093 -4.566279
3 6.589498 -0.525029 -1.077942 -3.850963
4 5.273932 -1.003112 0.393002 -0.415573
5 -7.872004 -2.506250 1.725281 6.676886
6 -4.797119 6.448990 0.254142 -7.374601
7 8.610763 8.075350 13.043584 12.768633
8 -10.871154 2.152322 2.093089 11.570059
9 -22.148239 1.493870 3.649696 2.455621
df.loc[df.a > df.b]
Will give the desired result, but only for a 1:1 comparison
a b c d
0 4.043832 -1.672865 -0.401864 3.073481
1 4.828796 -0.830688 3.652347 -1.780346
2 13.055145 5.730707 -2.305093 -4.566279
3 6.589498 -0.525029 -1.077942 -3.850963
4 5.273932 -1.003112 0.393002 -0.415573
7 8.610763 8.075350 13.043584 12.768633
My Approach was like this:
S = ['b','c','d']
(df.a > df[S]).any(axis = 1)
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
dtype: bool
But unfortunately, the series is somehow False for all rows. How can i solve this Problem?
Using lt
df[S].lt(df.a,0).any(axis=1)
Out[808]:
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 False
8 True
9 True
dtype: bool
Given that you're using this for a mask, you could simply add another axis to the underlying ndarray to allow for broadcasting. This should be somewhat faster, depending on the size of your DataFrame.
(df[S].values < df.a.values[:,None]).any(1)

Find first 'True' value in blocks in pandas data frame

I have a dataframe, where one column contains only True or False values in blocks. For example:
df =
b
0 False
1 True
2 True
3 False
4 True
5 True
6 True
7 True
8 False
9 False
10 False
11 False
12 False
13 True
14 True
15 True
I need to find the beginning of block with True:
>> find_first_true(df)
>> array([1, 4, 13])
Any elegant solutions?
EDIT
Thanks for the proposed solution. I am wondering, what's the easiest way to extract blocks of a certain length, starting from the indices I found?
For example, I need to take blocks (number of rows) of length 4 before the indices. So, if my indices (found previously)
index = array([1, 4, 13])
then I need blocks:
[df.loc[0:4], df.loc[9:13]]
or
b
0 False
1 True
2 True
3 False
4 True
9 False
10 False
11 False
12 False
13 True
I am looping over indices, but wonder about more pandasian solution
In [2]: df = pd.read_clipboard()
In [3]: df
Out[3]:
b
0 False
1 True
2 True
3 False
4 True
5 True
6 True
7 True
8 False
9 False
10 False
11 False
12 False
13 True
14 True
15 True
In [11]: np.where(((df.b != df.b.shift(1)) & df.b).values)[0]
Out[11]: array([ 1, 4, 13], dtype=int64)
def find_first_true(df):
#finds indexes of true elements
a = list(map(lambda e: e[0] + 1 if e[1] else 0, enumerate(df)))
a = list(filter(bool, a))
a = list(map(lambda x: x - 1, a))
#removes consecutive elements
ta = [a[0]] + list(filter(lambda x: a[x] - a[x-1] != 1, range(1, len(a))))
a = list(map(lambda x: a[x], ta))
return a
find_first = []
for i in range(len(df)):
if (df.loc[i, 'b'] == False and df.loc[i+1, 'b'] == True):
find_first.append(i+1)

Adding a count to prior cell value in Pandas

in Pandas I am looking to add a value in one column 'B' depending on the boolean values from another column 'A'. So if 'A' is True then start counting (i.e. adding a one each new line) as long as 'A' is false. When 'A' is True reset and start counting again. I managed to do this with a 'for' loop but this is very time consuming. I am wondering if there is no more time efficient solution?
the result should look like this:
Date A B
01.2010 False 0
02.2010 True 1
03.2010 False 2
04.2010 False 3
05.2010 True 1
06.2010 False 2
You can use cumsum with groupby and cumcount:
print df
Date A
0 1.201 False
1 1.201 True
2 1.201 False
3 2.201 True
4 3.201 False
5 4.201 False
6 5.201 True
7 6.201 False
roll = df.A.cumsum()
print roll
0 0
1 1
2 1
3 2
4 2
5 2
6 3
7 3
Name: A, dtype: int32
df['B'] = df.groupby(roll).cumcount() + 1
#if in first values are False, output is 0
df.loc[roll == 0 , 'B'] = 0
print df
Date A B
0 1.201 False 0
1 1.201 True 1
2 1.201 False 2
3 2.201 True 1
4 3.201 False 2
5 4.201 False 3
6 5.201 True 1
7 6.201 False 2
thanks, I got the solution from another post similar to this:
rolling_count = 0
def set_counter(val):
if val == False:
global rolling_count
rolling_count +=1
else:
val == True
rolling_count = 1
return rolling_count
df['B'] = df['A'].map(set_counter)

Categories

Resources