from itertools import product
import pandas as pd
df = pd.DataFrame.from_records(product(range(10), range(10)))
df = df.sample(90)
df.columns = "c1 c2".split()
df = df.sort_values(df.columns.tolist()).reset_index(drop=True)
# c1 c2
# 0 0 0
# 1 0 1
# 2 0 2
# 3 0 3
# 4 0 4
# .. .. ..
# 85 9 4
# 86 9 5
# 87 9 7
# 88 9 8
# 89 9 9
#
# [90 rows x 2 columns]
How do I quickly find, identify, and remove the last duplicate of all symmetric pairs in this data frame?
An example of symmetric pair is that '(0, 1)' is equal to '(1, 0)'. The latter should be removed.
The algorithm must be fast, so it is recommended to use numpy. Converting to python object is not allowed.
You can sort the values, then groupby:
a= np.sort(df.to_numpy(), axis=1)
df.groupby([a[:,0], a[:,1]], as_index=False, sort=False).first()
Option 2: If you have a lot of pairs c1, c2, groupby can be slow. In that case, we can assign new values and filter by drop_duplicates:
a= np.sort(df.to_numpy(), axis=1)
(df.assign(one=a[:,0], two=a[:,1]) # one and two can be changed
.drop_duplicates(['one','two']) # taken from above
.reindex(df.columns, axis=1)
)
One way is using np.unique with return_index=True and use the result to index the dataframe:
a = np.sort(df.values)
_, ix = np.unique(a, return_index=True, axis=0)
print(df.iloc[ix, :])
c1 c2
0 0 0
1 0 1
20 2 0
3 0 3
40 4 0
50 5 0
6 0 6
70 7 0
8 0 8
9 0 9
11 1 1
21 2 1
13 1 3
41 4 1
51 5 1
16 1 6
71 7 1
...
frozenset
mask = pd.Series(map(frozenset, zip(df.c1, df.c2))).duplicated()
df[~mask]
I will do
df[~pd.DataFrame(np.sort(df.values,1)).duplicated().values]
From pandas and numpy tri
s=pd.crosstab(df.c1,df.c2)
s=s.mask(np.triu(np.ones(s.shape)).astype(np.bool) & s==0).stack().reset_index()
Here's one NumPy based one for integers -
def remove_symm_pairs(df):
a = df.to_numpy(copy=False)
b = np.sort(a,axis=1)
idx = np.ravel_multi_index(b.T,(b.max(0)+1))
sidx = idx.argsort(kind='mergesort')
p = idx[sidx]
m = np.r_[True,p[:-1]!=p[1:]]
a_out = a[np.sort(sidx[m])]
df_out = pd.DataFrame(a_out)
return df_out
If you want to keep the index data as it is, use return df.iloc[np.sort(sidx[m])].
For generic numbers (ints/floats, etc.), we will use a view-based one -
# https://stackoverflow.com/a/44999009/ #Divakar
def view1D(a): # a is array
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel()
and simply replace the step to get idx with idx = view1D(b) in remove_symm_pairs.
If this needs to be fast, and if your variables are integer, then the following trick may help: let v,w be the columns of your vector; construct [v+w, np.abs(v-w)] =: [x, y]; then sort this matrix lexicographically, remove duplicates, and finally map it back to [v, w] = [(x+y), (x-y)]/2.
Related
Python newbie here.
Imagine a csv file that looks something like this:
(...except that in real life, there are 20 distinct names in the Person column, and each Person has 300-500 rows. Also, there are multiple data columns, not just one.)
What I want to do is randomly flag 10% of each Person's rows and mark this in a new column. I came up with a ridiculously convoluted way to do this--it involved creating a helper column of random numbers and all sorts of unnecessarily complicated jiggery-pokery. It worked, but was crazy. More recently, I came up with this:
import pandas as pd
df = pd.read_csv('source.csv')
df['selected'] = ''
names= list(df['Person'].unique()) #gets list of unique names
for name in names:
df_temp = df[df['Person']== name]
samp = int(len(df_temp)/10) # I want to sample 10% for each name
df_temp = df_temp.sample(samp)
df_temp['selected'] = 'bingo!' #a new column to mark the rows I've randomly selected
df = df.merge(df_temp, how = 'left', on = ['Person','data'])
df['temp'] =[f"{a} {b}" for a,b in zip(df['selected_x'],df['selected_y'])]
#Note: initially instead of the line above, I tried the line below, but it didn't work too well:
#df['temp'] = df['selected_x'] + df['selected_y']
df = df[['Person','data','temp']]
df = df.rename(columns = {'temp':'selected'})
df['selected'] = df['selected'].str.replace('nan','').str.strip() #cleans up the column
As you can see, essentially I'm pulling out a temporary DataFrame for each Person, using DF.sample(number) to do the randomising, then using DF.merge to get the 'marked' rows back into the original DataFrame. And it involved iterating through a list to create each temporary DataFrame...and my understanding is that iterating is kind of lame.
There's got to be a more Pythonic, vectorising way to do this, right? Without iterating. Maybe something involving groupby? Any thoughts or advice much appreciated.
EDIT: Here's another way that avoids merge...but it's still pretty clunky:
import pandas as pd
import math
#SETUP TEST DATA:
y = ['Alex'] * 2321 + ['Doug'] * 34123 + ['Chuck'] * 2012 + ['Bob'] * 9281
z = ['xyz'] * len(y)
df = pd.DataFrame({'persons': y, 'data' : z})
df = df.sample(frac = 1) #shuffle (optional--just to show order doesn't matter)
percent = 10 #CHANGE AS NEEDED
#Add a 'helper' column with random numbers
df['rand'] = np.random.random(df.shape[0])
df = df.sample(frac=1) #this shuffles data, just to show order doesn't matter
#CREATE A HELPER LIST
helper = pd.DataFrame(df.groupby('persons'['rand'].count()).reset_index().values.tolist()
for row in helper:
df_temp = df[df['persons'] == row[0]][['persons','rand']]
lim = math.ceil(len(df_temp) * percent*0.01)
row.append(df_temp.nlargest(lim,'rand').iloc[-1][1])
def flag(name,num):
for row in helper:
if row[0] == name:
if num >= row[2]:
return 'yes'
else:
return 'no'
df['flag'] = df.apply(lambda x: flag(x['persons'], x['rand']), axis=1)
You could use groupby.sample, either to pick out a sample of the whole dataframe for further processing, or to identify rows of the dataframe to mark if that's more convenient.
import pandas as pd
percentage_to_flag = 0.5
# Toy data: 8 rows, persons A and B.
df = pd.DataFrame(data={'persons':['A']*4 + ['B']*4, 'data':range(8)})
# persons data
# 0 A 0
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 4
# 5 B 5
# 6 B 6
# 7 B 7
# Pick out random sample of dataframe.
random_state = 41 # Change to get different random values.
df_sample = df.groupby("persons").sample(frac=percentage_to_flag,
random_state=random_state)
# persons data
# 1 A 1
# 2 A 2
# 7 B 7
# 6 B 6
# Mark the random sample in the original dataframe.
df["marked"] = False
df.loc[df_sample.index, "marked"] = True
# persons data marked
# 0 A 0 False
# 1 A 1 True
# 2 A 2 True
# 3 A 3 False
# 4 B 4 False
# 5 B 5 False
# 6 B 6 True
# 7 B 7 True
If you really do not want the sub-sampled dataframe df_sample you can go straight to marking a sample of the original dataframe:
# Mark random sample in original dataframe with minimal intermediate data.
df["marked2"] = False
df.loc[df.groupby("persons")["data"].sample(frac=percentage_to_flag,
random_state=random_state).index,
"marked2"] = True
# persons data marked marked2
# 0 A 0 False False
# 1 A 1 True True
# 2 A 2 True True
# 3 A 3 False False
# 4 B 4 False False
# 5 B 5 False False
# 6 B 6 True True
# 7 B 7 True True
If I understood you correctly, you can achieve this using:
df = pd.DataFrame(data={'persons':['A']*10 + ['B']*10, 'col_1':[2]*20})
percentage_to_flag = 0.5
a = df.groupby(['persons'])['col_1'].apply(lambda x: pd.Series(x.index.isin(x.sample(frac=percentage_to_flag, random_state= 5, replace=False).index))).reset_index(drop=True)
df['flagged'] = a
Input:
persons col_1
0 A 2
1 A 2
2 A 2
3 A 2
4 A 2
5 A 2
6 A 2
7 A 2
8 A 2
9 A 2
10 B 2
11 B 2
12 B 2
13 B 2
14 B 2
15 B 2
16 B 2
17 B 2
18 B 2
19 B 2
Output with 50% flagged rows in each group:
persons col_1 flagged
0 A 2 False
1 A 2 False
2 A 2 True
3 A 2 False
4 A 2 True
5 A 2 True
6 A 2 False
7 A 2 True
8 A 2 False
9 A 2 True
10 B 2 False
11 B 2 False
12 B 2 True
13 B 2 False
14 B 2 True
15 B 2 True
16 B 2 False
17 B 2 True
18 B 2 False
19 B 2 True
This is TMBailey's answer, tweaked so it works in my Python version. (Didn't want to edit someone else's answer but if I'm doing it wrong I'll take this down.) This works really great and really fast!
EDIT: I've updated this based on additional suggestion by TMBailey to replace frac=percentage_to_flag with n=math.ceil(percentage_to_flag * len(x)). This ensures that rounding doesn't pull the sampled %age under the 'percentage_to_flag' threshhold. (For what it's worth, you can replace it with frac=(math.ceil(percentage_to_flag * len(x)))/len(x) too).
import pandas as pd
import math
percentage_to_flag = .10
# Toy data:
y = ['Alex'] * 2321 + ['Eddie'] * 876 + ['Doug'] * 34123 + ['Chuck'] * 2012 + ['Bob'] * 9281
z = ['xyz'] * len(y)
df = pd.DataFrame({'persons': y, 'data' : z})
df = df.sample(frac = 1) #optional shuffle, just to show order doesn't matter
# Pick out random sample of dataframe.
random_state = 41 # Change to get different random values.
df_sample = df.groupby("persons").apply(lambda x: x.sample(n=(math.ceil(percentage_to_flag * len(x))),random_state=random_state))
#had to use lambda in line above
df_sample = df_sample.reset_index(level=0, drop=True) #had to add this to simplify multi-index DF
# Mark the random sample in the original dataframe.
df["marked"] = False
df.loc[df_sample.index, "marked"] = True
And then to check:
pp = df.pivot_table(index="persons", columns="marked", values="data", aggfunc='count', fill_value=0)
pp.columns = ['no','yes']
pp = pp.append(pp.sum().rename('Total')).assign(Total=lambda d: d.sum(1))
pp['% selected'] = 100 * pp.yes/pp.Total
print(pp)
OUTPUT:
no yes Total % selected
persons
Alex 2088 233 2321 10.038776
Bob 8352 929 9281 10.009697
Chuck 1810 202 2012 10.039761
Doug 30710 3413 34123 10.002051
Eddie 788 88 876 10.045662
Total 43748 4865 48613 10.007611
Works like a charm.
I am trying to implement the 'Bottom-Up Computation' algorithm in data mining (https://www.aaai.org/Papers/FLAIRS/2003/Flairs03-050.pdf).
I need to use the 'pandas' library to create a dataframe and provide it to a recursive function, which should also return a dataframe as output. I am only able to return the final column as output, because I am unable to figure out how to dynamically build a data frame.
Here is the python program:
import pandas as pd
def project_data(df, d):
return df.iloc[:, d]
def select_data(df, d, val):
col_name = df.columns[d]
return df[df[col_name] == val]
def remove_first_dim(df):
return df.iloc[:, 1:]
def slice_data_dim0(df, v):
df_temp = select_data(df, 0, v)
return remove_first_dim(df_temp)
def buc(df):
dims = df.shape[1]
if dims == 1:
input_sum = sum(project_data(df, 0) )
print(input_sum)
else:
dim_vals = set(project_data(df, 0).values)
for dim_val in dim_vals:
sub_data = slice_data_dim0(df, dim_val)
buc(sub_data)
sub_data = remove_first_dim(df)
buc(sub_data)
data = {'A':[1,1,1,1,2],
'B':[1,1,2,3,1],
'M':[10,20,30,40,50]
}
df = pd.DataFrame(data, columns = ['A','B','M'])
buc(df)
I get the following output:
30
30
40
100
50
50
80
30
40
But what I need is a dataframe, like this (not necessarily formatted, but a data frame):
A B M
0 1 1 30
1 1 2 30
2 1 3 40
3 1 ALL 100
4 2 1 50
5 2 ALL 50
6 ALL 1 80
7 ALL 2 30
8 ALL 3 40
9 ALL ALL 150
How do I achieve this?
Unfortunately pandas doesn't have functionality to do subtotals - so the trick is to just calculate them on the side and concatenate together with original dataframe.
from itertools import combinations
import numpy as np
dim = ['A', 'B']
vals = ['M']
df = pd.concat(
[df]
# subtotals:
+ [df.groupby(list(gr), as_index=False)[vals].sum() for r in range(len(dim)-1) for gr in combinations(dim, r+1)]
# total:
+ [df.groupby(np.zeros(len(df)))[vals].sum()]
)\
.sort_values(dim)\
.reset_index(drop=True)\
.fillna("ALL")
Output:
A B M
0 1 1 10
1 1 1 20
2 1 2 30
3 1 3 40
4 1 ALL 100
5 2 1 50
6 2 ALL 50
7 ALL 1 80
8 ALL 2 30
9 ALL 3 40
10 ALL ALL 150
I have the following working code that sets 1 to "new_col" at the locations pointed by intervals dictated by starts and ends.
import pandas as pd
import numpy as np
df = pd.DataFrame({"a": np.arange(10)})
starts = [1, 5, 8]
ends = [1, 6, 10]
value = 1
df["new_col"] = 0
for s, e in zip(starts, ends):
df.loc[s:e, "new_col"] = value
print(df)
a new_col
0 0 0
1 1 1
2 2 0
3 3 0
4 4 0
5 5 1
6 6 1
7 7 0
8 8 1
9 9 1
I want these intervals to come from another dataframe pointer_df.
How to vectorize this?
pointer_df = pd.DataFrame({"starts": starts, "ends": ends})
Attempt:
df.loc[pointer_df["starts"]:pointer_df["ends"], "new_col"] = 2
print(df)
obviously doesn't work and gives
raise AssertionError("Start slice bound is non-scalar")
AssertionError: Start slice bound is non-scalar
EDIT:
it seems all answers use some kind of pythonic for loop.
the question was how to vectorize the operation above?
Is this not doable without for loops/list comprehentions?
You could do:
pointer_df = pd.DataFrame({"starts": starts, "ends": ends})
rang = np.arange(len(df))
indices = [i for s, e in pointer_df.to_numpy() for i in rang[slice(s, e + 1, None)]]
df.loc[indices, 'new_col'] = value
print(df)
Output
a new_col
0 0 0
1 1 1
2 2 0
3 3 0
4 4 0
5 5 1
6 6 1
7 7 0
8 8 1
9 9 1
If you want a method that do not uses uses any for loop or list comprehension, only relies on numpy, you could do:
def indices(start, end, ma=10):
limits = end + 1
lens = np.where(limits < ma, limits, end) - start
np.cumsum(lens, out=lens)
i = np.ones(lens[-1], dtype=int)
i[0] = start[0]
i[lens[:-1]] += start[1:]
i[lens[:-1]] -= limits[:-1]
np.cumsum(i, out=i)
return i
pointer_df = pd.DataFrame({"starts": starts, "ends": ends})
df.loc[indices(pointer_df.starts.values, pointer_df.ends.values, ma=len(df)), "new_col"] = value
print(df)
I adapted the method to your use case from the one in this answer.
for i,j in zip(pointer_df["starts"],pointer_df["ends"]):
print (i,j)
Apply same method but on your dictionary
I have a DataFrame as below:
len scores
5 [0.45814112124905954, 0.34974337172257086, 0.042586941883761324, 0.042586941883761324, 0.33509446692807404, 0.01202741856859997, 0.01202741856859997, 0.031149023579740857, 0.031149023579740857, 0.9382029832667171]
4 [0.1289882974831455, 0.17069367229950574, 0.03518847270370917, 0.3283517918439753, 0.41119171582425107, 0.5057528742869354]
3 [0.22345885572316307, 0.1366147609256035, 0.09309687010700848]
2 [0.4049920770888036]
I want to index the scores column based on len column value and get multiple rows
len scores
5 [0.45814112124905954, 0.34974337172257086, 0.042586941883761324, 0.042586941883761324]
5 [0.33509446692807404, 0.01202741856859997, 0.01202741856859997]
5 [0.031149023579740857, 0.031149023579740857]
5 [0.9382029832667171]
5
4 [0.1289882974831455, 0.17069367229950574, 0.03518847270370917]
4 [0.3283517918439753, 0.41119171582425107]
4 [0.9382029832667171]
4
3 [0.22345885572316307, 0.1366147609256035]
3 [0.09309687010700848]
3
2 [0.4049920770888036]
2
I tried this
d = []
for x in df['len']:
col = df['scores'][:(x-1)]
d.append(col)
but this would just give me first row of indexed row only
len scores
5 [0.45814112124905954, 0.34974337172257086, 0.042586941883761324, 0.042586941883761324]
4 [0.1289882974831455, 0.17069367229950574, 0.03518847270370917]
3 [0.22345885572316307, 0.1366147609256035]
2 [0.4049920770888036]
How to get the rest of the rows to index as per my requirement ?
Assuming that the column len is related to the length of the list in the column scores row wise as in your example, you can do it with apply to reshape the list to nested list with decreasing length and then explode like:
#define function to create nested list
def create_nested_list (x):
l_idx = [0]+np.cumsum(np.arange(x['len'])[::-1]).tolist()
return [x['scores'][i:j] for i, j in zip(l_idx[:-1], l_idx[1:])]
#apply row-wise
s = df.apply(create_nested_list, axis=1)
#change index to keep the value in len
s.index=df['len']
#explode and reset_index
df_f = s.explode().reset_index(name='scores')
print (df_f)
len scores
0 5 [0.45814112124905954, 0.34974337172257086, 0.0...
1 5 [0.33509446692807404, 0.01202741856859997, 0.0...
2 5 [0.031149023579740857, 0.031149023579740857]
3 5 [0.9382029832667171]
4 5 []
5 4 [0.1289882974831455, 0.17069367229950574, 0.03...
6 4 [0.3283517918439753, 0.41119171582425107]
7 4 [0.5057528742869354]
8 4 []
9 3 [0.22345885572316307, 0.1366147609256035]
10 3 [0.09309687010700848]
11 3 []
12 2 [0.4049920770888036]
13 2 []
EDIT: if you can't use explode, try like this:
#define function to create a series from nested lists
def create_nested_list_s (x):
l_idx = [0]+np.cumsum(np.arange(x['len'])[::-1]).tolist()
return pd.Series([x['scores'][i:j] for i, j in zip(l_idx[:-1], l_idx[1:])])
df_f = (df.apply(create_nested_list_s, axis=1)
.set_index(df['len'])
.stack()
.reset_index(name='scores')
.drop('level_1', axis=1))
print(df_f)
df.explode() does exactly what you want.
Example:
import pandas as pd
df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
df.explode('A')
#Output
# A B
# 0 1 1
# 0 2 1
# 0 3 1
# 1 foo 1
# 2 NaN 1
# 3 3 1
# 3 4 1
This question already has answers here:
Calculate new value based on decreasing value
(4 answers)
Closed 5 years ago.
Given the following table
vals
0 20
1 3
2 2
3 10
4 20
I'm trying to find a clean solution in pandas to subtract away a value, say 30 for example, to end with the following result.
vals
0 0
1 0
2 0
3 5
4 20
I was wondering if pandas had a solution to performing this that didn't require looping through all the rows in a dataframe, something that takes advantage of pandas's bulk operations.
identify where cumsum is greater than or equal to 30
mask the rows where it isn't
reassign the one row to be the cumsum less 30
c = df.vals.cumsum()
m = c.ge(30)
i = m.idxmax()
n = df.vals.where(m, 0)
n.loc[i] = c.loc[i] - 30
df.assign(vals=n)
vals
0 0
1 0
2 0
3 5
4 20
Same thing, but numpyfied
v = df.vals.values
c = v.cumsum()
m = c >= 30
i = m.argmax()
n = np.where(m, v, 0)
n[i] = c[i] - 30
df.assign(vals=n)
vals
0 0
1 0
2 0
3 5
4 20
Timing
%%timeit
v = df.vals.values
c = v.cumsum()
m = c >= 30
i = m.argmax()
n = np.where(m, v, 0)
n[i] = c[i] - 30
df.assign(vals=n)
10000 loops, best of 3: 168 µs per loop
%%timeit
c = df.vals.cumsum()
m = c.ge(30)
i = m.idxmax()
n = df.vals.where(m, 0)
n.loc[i] = c.loc[i] - 30
df.assign(vals=n)
1000 loops, best of 3: 853 µs per loop
Here's one using NumPy with four lines of code -
v = df.vals.values
a = v.cumsum()-30
idx = (a>0).argmax()+1
v[:idx] = a.clip(min=0)[:idx]
Sample run -
In [274]: df # Original df
Out[274]:
vals
0 20
1 3
2 2
3 10
4 20
In [275]: df.iloc[3,0] = 7 # Bringing in some variety
In [276]: df
Out[276]:
vals
0 20
1 3
2 2
3 7
4 20
In [277]: v = df.vals.values
...: a = v.cumsum()-30
...: idx = (a>0).argmax()+1
...: v[:idx] = a.clip(min=0)[:idx]
...:
In [278]: df
Out[278]:
vals
0 0
1 0
2 0
3 2
4 20
#A one-liner solution
df['vals'] = df.assign(res = 30-df.vals.cumsum()).apply(lambda x: 0 if x.res>0 else x.vals if abs(x.res)>x.vals else x.vals-abs(x.res), axis=1)
df
Out[96]:
vals
0 0
1 0
2 0
3 5
4 20