Consider the following dataframes d1 and d2
import pandas as pd

d1 = pd.DataFrame([
    [1, 2, 3],
    [2, 3, 4],
    [3, 4, 5],
    [1, 2, 3],
    [2, 3, 4],
    [3, 4, 5]
], columns=list('ABC'))
d2 = pd.get_dummies(list('XYZZXY'))
d1
A B C
0 1 2 3
1 2 3 4
2 3 4 5
3 1 2 3
4 2 3 4
5 3 4 5
d2
X Y Z
0 1 0 0
1 0 1 0
2 0 0 1
3 0 0 1
4 1 0 0
5 0 1 0
I need to get a new dataframe with a MultiIndex columns object that holds the product of every combination of columns from d1 and d2.
So far I've done this...
from itertools import product
pd.concat({(x, y): d1[x] * d2[y] for x, y in product(d1, d2)}, axis=1)
A B C
X Y Z X Y Z X Y Z
0 1 0 0 2 0 0 3 0 0
1 0 2 0 0 3 0 0 4 0
2 0 0 3 0 0 4 0 0 5
3 0 0 1 0 0 2 0 0 3
4 2 0 0 3 0 0 4 0 0
5 0 3 0 0 4 0 0 5 0
There is nothing wrong with this method. But I'm looking for alternatives to evaluate.
Inspired by Yakym Pirozhenko
import numpy as np

m, n = len(d1.columns), len(d2.columns)
lvl0 = np.repeat(np.arange(m), n)  # d1 column positions: 0 0 0 1 1 1 2 2 2
lvl1 = np.tile(np.arange(n), m)    # d2 column positions: 0 1 2 0 1 2 0 1 2
v1, v2 = d1.values, d2.values
pd.DataFrame(
    v1[:, lvl0] * v2[:, lvl1],
    d1.index,
    pd.MultiIndex.from_tuples(list(zip(d1.columns[lvl0], d2.columns[lvl1])))
)
However, this is a clumsier implementation of numpy broadcasting, which is covered better by Divakar's answer.
Timing
All of these answers are good and demonstrate different aspects of pandas and numpy. Please consider up-voting them if you found them useful and informative.
%%timeit
m, n = len(d1.columns), len(d2.columns)
lvl0 = np.repeat(np.arange(m), n)
lvl1 = np.tile(np.arange(n), m)
v1, v2 = d1.values, d2.values
pd.DataFrame(
    v1[:, lvl0] * v2[:, lvl1],
    d1.index,
    pd.MultiIndex.from_tuples(list(zip(d1.columns[lvl0], d2.columns[lvl1])))
)
%%timeit
vals = (d2.values[:,None,:] * d1.values[:,:,None]).reshape(d1.shape[0],-1)
cols = pd.MultiIndex.from_product([d1.columns, d2.columns])
pd.DataFrame(vals, columns=cols, index=d1.index)
%timeit d1.apply(lambda x: d2.mul(x, axis=0).stack()).unstack()
%timeit pd.concat({x : d2.mul(d1[x], axis=0) for x in d1.columns}, axis=1)
%timeit pd.concat({(x, y): d1[x] * d2[y] for x, y in product(d1, d2)}, axis=1)
1000 loops, best of 3: 663 µs per loop
1000 loops, best of 3: 624 µs per loop
100 loops, best of 3: 3.38 ms per loop
1000 loops, best of 3: 860 µs per loop
100 loops, best of 3: 2.01 ms per loop
Here is a one-liner that uses the pandas stack and unstack methods.
The "trick" is to use stack so that the result of each computation within apply is a Series; unstack then recovers the MultiIndex column form.
d1.apply(lambda x: d2.mul(x, axis=0).stack()).unstack()
Which gives:
A B C
X Y Z X Y Z X Y Z
0 1.0 0.0 0.0 2.0 0.0 0.0 3.0 0.0 0.0
1 0.0 2.0 0.0 0.0 3.0 0.0 0.0 4.0 0.0
2 0.0 0.0 3.0 0.0 0.0 4.0 0.0 0.0 5.0
3 0.0 0.0 1.0 0.0 0.0 2.0 0.0 0.0 3.0
4 2.0 0.0 0.0 3.0 0.0 0.0 4.0 0.0 0.0
5 0.0 3.0 0.0 0.0 4.0 0.0 0.0 5.0 0.0
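For reference, here is the intermediate result for a single column - this illustration is mine, not part of the original answer:
# The product of d2 with one column of d1, stacked into a Series whose
# MultiIndex pairs each row label with a d2 column label:
d2.mul(d1['A'], axis=0).stack()
# e.g. (0, 'X') -> 1, (0, 'Y') -> 0, ... one entry per (row, d2-column) pair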
Here's one approach with NumPy broadcasting -
vals = (d2.values[:,None,:] * d1.values[:,:,None]).reshape(d1.shape[0],-1)
cols = pd.MultiIndex.from_product([d1.columns, d2.columns])
df_out = pd.DataFrame(vals, columns=cols, index=d1.index)
Sample run -
In [92]: d1
Out[92]:
A B C
0 1 2 3
1 2 3 4
2 3 4 5
3 1 2 3
4 2 3 4
5 3 4 5
In [93]: d2
Out[93]:
X Y Z
0 1 0 0
1 0 1 0
2 0 0 1
3 0 0 1
4 1 0 0
5 0 1 0
In [110]: vals = (d2.values[:,None,:] * d1.values[:,:,None]).reshape(d1.shape[0],-1)
...: cols = pd.MultiIndex.from_product([d1.columns, d2.columns])
...: df_out = pd.DataFrame(vals, columns=cols, index=d1.index)
...:
In [111]: df_out
Out[111]:
A B C
X Y Z X Y Z X Y Z
0 1 0 0 2 0 0 3 0 0
1 0 2 0 0 3 0 0 4 0
2 0 0 3 0 0 4 0 0 5
3 0 0 1 0 0 2 0 0 3
4 2 0 0 3 0 0 4 0 0
5 0 3 0 0 4 0 0 5 0
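For reference, the shape bookkeeping behind the broadcast (my annotation, assuming the same d1 and d2 as above):
# d2.values[:, None, :] has shape (6, 1, 3): rows x 1 x d2-columns
# d1.values[:, :, None] has shape (6, 3, 1): rows x d1-columns x 1
# The product broadcasts to (6, 3, 3); reshape(d1.shape[0], -1) flattens the
# last two axes in (d1-column, d2-column) order - exactly the order that
# pd.MultiIndex.from_product generates.
out = d2.values[:, None, :] * d1.values[:, :, None]
assert out.shape == (d1.shape[0], d1.shape[1], d2.shape[1])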
Here's a somewhat vectorized version. There could be a better way.
In [846]: pd.concat({x : d2.mul(d1[x], axis=0) for x in d1.columns}, axis=1)
Out[846]:
A B C
X Y Z X Y Z X Y Z
0 1 0 0 2 0 0 3 0 0
1 0 2 0 0 3 0 0 4 0
2 0 0 3 0 0 4 0 0 5
3 0 0 1 0 0 2 0 0 3
4 2 0 0 3 0 0 4 0 0
5 0 3 0 0 4 0 0 5 0
You could get the multi-index first, use it to obtain the shapes and multiply directly.
cols = pd.MultiIndex.from_tuples(
    [(c1, c2) for c1 in d1.columns for c2 in d2.columns])

# get_level_values repeats each column label, so a and b end up as two
# frames of identical shape that can be multiplied elementwise.
a = d1.loc[:, cols.get_level_values(0)]
b = d2.loc[:, cols.get_level_values(1)]
a.columns = b.columns = cols
res = a * b
Related
Basic Example:
# Given params such as:
params = {
'cols': 8,
'rows': 4,
'n': 4
}
# I'd like to produce (or equivalent):
col0 col1 col2 col3 col4 col5 col6 col7
row_0 0 1 2 3 0 1 2 3
row_1 1 2 3 0 1 2 3 0
row_2 2 3 0 1 2 3 0 1
row_3 3 0 1 2 3 0 1 2
Axis Value Counts:
Where both axes have an equal distribution of values
df.apply(lambda x: x.value_counts(), axis=1)
0 1 2 3
row_0 2 2 2 2
row_1 2 2 2 2
row_2 2 2 2 2
row_3 2 2 2 2
df.apply(lambda x: x.value_counts())
col0 col1 col2 col3 col4 col5 col6 col7
0 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1 1
3 1 1 1 1 1 1 1 1
My attempt thus far:
import itertools
import numpy as np
import pandas as pd

def create_df(cols, rows, n):
    # cycle through the permutations of range(n) and tile them into the frame
    x = itertools.cycle(list(itertools.permutations(range(n))))
    df = pd.DataFrame(index=range(rows), columns=range(cols))
    df[:] = np.reshape([next(x) for _ in range((rows*cols)//n)], (rows, cols))
    #df = df.T.add_prefix('row_').T
    #df = df.add_prefix('col_')
    return df
params = {
'cols': 8,
'rows': 4,
'n': 4
}
df = create_df(**params)
Output:
0 1 2 3 4 5 6 7
0 0 1 2 3 0 1 3 2
1 0 2 1 3 0 2 3 1
2 0 3 1 2 0 3 2 1
3 1 0 2 3 1 0 3 2
# Correct on this Axis:
>>> df.apply(lambda x: x.value_counts(), axis=1)
0 1 2 3
0 2 2 2 2
1 2 2 2 2
2 2 2 2 2
3 2 2 2 2
# Incorrect on this Axis:
>>> df.apply(lambda x: x.value_counts())
0 1 2 3 4 5 6 7
0 3.0 1 NaN NaN 3.0 1 NaN NaN
1 1.0 1 2.0 NaN 1.0 1 NaN 2.0
2 NaN 1 2.0 1.0 NaN 1 1.0 2.0
3 NaN 1 NaN 3.0 NaN 1 3.0 NaN
So, I have the conditions I need on one axis, but not on the other.
How can I update my method/create a method to meet both conditions?
You can use numpy.roll:
def create_df(cols, rows, n):
    x = itertools.cycle(range(n))
    arr = [np.roll([next(x) for _ in range(cols)], -i) for i in range(rows)]
    return pd.DataFrame(arr)
Output (with given test case):
0 1 2 3 4 5 6 7
0 0 1 2 3 0 1 2 3
1 1 2 3 0 1 2 3 0
2 2 3 0 1 2 3 0 1
3 3 0 1 2 3 0 1 2
Edit: In Python 3.8+ you can use the := operator (which is significantly faster than my answer above):
def create_df(cols, rows, n):
    x = itertools.cycle(range(n))
    n = [next(x) for _ in range(cols)]
    arr = [n := n[1:] + n[:1] for _ in range(rows)]
    return pd.DataFrame(arr)
Output (again with given test case):
0 1 2 3 4 5 6 7
0 1 2 3 0 1 2 3 0
1 2 3 0 1 2 3 0 1
2 3 0 1 2 3 0 1 2
3 0 1 2 3 0 1 2 3
You can tile your input and use a custom roll to shift each row independently:
c = params['cols']
r = params['rows']
n = params['n']

a = np.arange(n)  # or any 1-D input of length n
b = np.tile(a, (r, c//n))
# array([[0, 1, 2, 3, 0, 1, 2, 3],
#        [0, 1, 2, 3, 0, 1, 2, 3],
#        [0, 1, 2, 3, 0, 1, 2, 3],
#        [0, 1, 2, 3, 0, 1, 2, 3]])

idx = np.arange(r)[:, None]
shift = np.tile(np.arange(c), (r, 1)) - np.arange(r)[:, None]
df = pd.DataFrame(b[idx, shift])  # negative indices wrap, producing the roll
Output:
0 1 2 3 4 5 6 7
0 0 1 2 3 0 1 2 3
1 3 0 1 2 3 0 1 2
2 2 3 0 1 2 3 0 1
3 1 2 3 0 1 2 3 0
Alternative order:
idx = np.arange(r)[:, None]
shift = (np.tile(np.arange(c), (r, 1)) + np.arange(r)[:, None]) % c
df = pd.DataFrame(b[idx, shift])
Output:
0 1 2 3 4 5 6 7
0 0 1 2 3 0 1 2 3
1 1 2 3 0 1 2 3 0
2 2 3 0 1 2 3 0 1
3 3 0 1 2 3 0 1 2
Another alternative: use a custom strided_indexing_roll function, sketched below.
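For completeness, here is a minimal sketch of what such a helper could look like - my own illustration, assuming NumPy 1.20+ for sliding_window_view, not the exact function referenced:
import numpy as np

def strided_indexing_roll(a, shifts):
    # Roll each row of the 2-D array `a` right by its own shift, loop-free.
    n = a.shape[1]
    # Extend each row so every wrap-around window is contiguous.
    a_ext = np.concatenate((a, a[:, :-1]), axis=1)
    # All length-n windows per row; pick the one starting at (n - shift) % n.
    windows = np.lib.stride_tricks.sliding_window_view(a_ext, n, axis=1)
    return windows[np.arange(a.shape[0]), (n - np.asarray(shifts)) % n]

# e.g. strided_indexing_roll(b, np.arange(r)) shifts row i right by i,
# which should reproduce the first output above.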
What is the best way to add the contents of two dataframes, which have mostly equivalent indices:
df1:
A B C
A 0 3 1
B 3 0 2
C 1 2 0
df2:
A B C D
A 0 1 1 0
B 1 0 3 2
C 1 3 0 0
D 0 2 0 0
df1 + df2 =
A B C D
A 0 4 2 0
B 4 0 5 2
C 2 5 0 0
D 0 2 0 0
You can also concat the two dataframes, since concatenation (by default) aligns on the index; see the note after the example for handling overlapping indices.
# sample dataframe
df1 = pd.DataFrame({'a': [1,2,3], 'b':[2,3,4]}, index=['a','c','e'])
df2 = pd.DataFrame({'a': [10,20], 'b':[11,22]}, index=['b','d'])
new_df= pd.concat([df1, df2]).sort_index()
print(new_df)
a b
a 1 2
b 10 11
c 2 3
d 20 22
e 3 4
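Note that concat appends rows rather than adding overlapping cells. If the indices do overlap, as in the original question, a grouped sum after the concat gives elementwise addition - a small follow-up sketch of mine:
# Rows sharing an index label are summed; cells missing from one frame
# (e.g. column D of df1) become NaN on concat and are skipped by sum,
# which is equivalent to treating them as 0 here.
pd.concat([df1, df2]).groupby(level=0).sum()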
I think you can just add:
In [625]: df1.add(df2,fill_value=0)
Out[625]:
A B C D
A 0.0 4.0 2.0 0.0
B 4.0 0.0 5.0 2.0
C 2.0 5.0 0.0 0.0
D 0.0 2.0 0.0 0.0
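One small follow-up (mine, not part of the answer): the fill_value alignment upcasts to float, as the output above shows; a cast restores integers once no NaNs remain:
df1.add(df2, fill_value=0).astype(int)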
I'm trying to transform a dataframe
df = pd.DataFrame({
    'c1': ['x', 'y', 'z'],
    'c2': [[1, 2, 3], [1, 3], [2, 4]]})
which looks like
c1 c2
0 x [1, 2, 3]
1 y [1, 3]
2 z [2, 4]
into
p = pd.DataFrame({
    'c1': ['x', 'y', 'z'],
    1: [1, 1, 0],
    2: [1, 0, 1],
    3: [1, 1, 0],
    4: [0, 0, 1]
})
which looks like
c1 1 2 3 4
0 x 1 1 1 0
1 y 1 0 1 0
2 z 0 1 0 1
The 1s and 0s are supposed to represent True and False. I'm still learning pivots; please point me in the right direction.
You can use:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
df1 = pd.DataFrame(mlb.fit_transform(df['c2']), columns=mlb.classes_, index=df.index)
df = df.drop('c2', axis=1).join(df1)
print (df)
c1 1 2 3 4
0 x 1 1 1 0
1 y 1 0 1 0
2 z 0 1 0 1
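Since the question says the 1s and 0s should represent True and False, an optional extra cast (my addition) turns the indicator frame into genuine booleans before the join:
df1 = df1.astype(bool)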
Another solution:
df1 = df['c2'].apply(lambda x: '|'.join(str(y) for y in x)).str.get_dummies()
df = df.drop('c2', axis=1).join(df1)
print (df)
c1 1 2 3 4
0 x 1 1 1 0
1 y 1 0 1 0
2 z 0 1 0 1
EDIT:
Thanks, MaxU for nice suggestion:
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('c2')),
                          columns=mlb.classes_,
                          index=df.index))
You can use
In [235]: df.join(pd.DataFrame([{x: 1 for x in r} for r in df.c2]).fillna(0))
Out[235]:
c1 c2 1 2 3 4
0 x [1, 2, 3] 1.0 1.0 1.0 0.0
1 y [1, 3] 1.0 0.0 1.0 0.0
2 z [2, 4] 0.0 1.0 0.0 1.0
Details
In [236]: pd.DataFrame([{x: 1 for x in r} for r in df.c2]).fillna(0)
Out[236]:
1 2 3 4
0 1.0 1.0 1.0 0.0
1 1.0 0.0 1.0 0.0
2 0.0 1.0 0.0 1.0
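As the Details output shows, fillna upcasts to float; an optional extra cast (my addition) keeps integer dummies:
pd.DataFrame([{x: 1 for x in r} for r in df.c2]).fillna(0).astype(int)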
I have the following dataframe
a b c d e
0 0 0 -1 1 -1
1 0 1 -1 1 -1
2 -1 0 -1 1 1
3 -1 1 1 -1 1
4 1 0 1 -1 1
5 1 0 0 0 1
6 1 1 0 0 -1
7 1 1 -1 0 0
For each number that appears in a,b,c,d,e, I want to count how many times it appears in each row and store that count in a column, so the result should be something like this:
a b c d e Sum1 Sum0 Sum_1
0 0 0 -1 1 -1 1 2 2
1 0 1 -1 1 -1 2 1 2
2 -1 0 -1 1 1 2 1 2
3 -1 1 1 -1 1 3 0 2
4 1 0 1 -1 1 3 1 1
5 1 0 0 0 1 2 3 0
6 1 1 0 0 -1 2 2 1
7 1 1 -1 0 0 2 2 1
So in the first row, the number 1 appears once across a,b,c,d,e, and we store that in the Sum1 column; the number 0 appears two times, stored in Sum0; and the number -1 appears two times, stored in Sum_1.
How could these columns be calculated without using lambda functions (to get better performance)? I guess that numpy is involved here, but I don't see how to do it.
Using get_dummies
df = df.astype(str)
pd.get_dummies(df.stack()).sum(level=0)
Out[667]:
-1 0 1
0 2 2 1
1 2 1 2
2 2 1 2
3 2 0 3
4 1 1 3
5 0 3 2
6 1 2 2
7 1 2 2
More info
pd.concat([df, pd.get_dummies(df.stack()).sum(level=0).add_prefix('Sum')], axis=1)
Out[669]:
a b c d e Sum-1 Sum0 Sum1
0 0 0 -1 1 -1 2 2 1
1 0 1 -1 1 -1 2 1 2
2 -1 0 -1 1 1 2 1 2
3 -1 1 1 -1 1 2 0 3
4 1 0 1 -1 1 1 1 3
5 1 0 0 0 1 0 3 2
6 1 1 0 0 -1 1 2 2
7 1 1 -1 0 0 1 2 2
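A note for newer pandas: the sum(level=...) form used above was removed in pandas 2.0; the groupby equivalent is:
# Same logic, modern spelling of sum(level=0):
pd.get_dummies(df.astype(str).stack()).groupby(level=0).sum()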
Another method that may work and does not need the conversion to str:
df.apply(lambda x: x.value_counts(), axis=1).fillna(0)
Out[674]:
-1 0 1
0 2.0 2.0 1.0
1 2.0 1.0 2.0
2 2.0 1.0 2.0
3 2.0 0.0 3.0
4 1.0 1.0 3.0
5 0.0 3.0 2.0
6 1.0 2.0 2.0
7 1.0 2.0 2.0
Use
In [62]: df.assign(**{'Sum{}'.format(v):df.eq(v).sum(1) for v in [1, 0, -1]})
Out[62]:
a b c d e Sum-1 Sum0 Sum1
0 0 0 -1 1 -1 2 2 1
1 0 1 -1 1 -1 2 1 2
2 -1 0 -1 1 1 2 1 2
3 -1 1 1 -1 1 2 0 3
4 1 0 1 -1 1 1 1 3
5 1 0 0 0 1 0 3 2
6 1 1 0 0 -1 1 2 2
7 1 1 -1 0 0 1 2 2
Same as
In [72]: df.join(pd.DataFrame({'Sum{}'.format(v):df.eq(v).sum(1) for v in [1, 0, -1]}))
Out[72]:
a b c d e Sum-1 Sum0 Sum1
0 0 0 -1 1 -1 2 2 1
1 0 1 -1 1 -1 2 1 2
2 -1 0 -1 1 1 2 1 2
3 -1 1 1 -1 1 2 0 3
4 1 0 1 -1 1 1 1 3
5 1 0 0 0 1 0 3 2
6 1 1 0 0 -1 1 2 2
7 1 1 -1 0 0 1 2 2
Create boolean masks and count them - True values are counted as 1:
m1 = df == 1
m0 = df == 0
m_1 = df == -1
df['Sum1'] = m1.sum(1)
df['Sum0'] = m0.sum(1)
df['Sum_1'] = m_1.sum(1)
print (df)
a b c d e Sum1 Sum0 Sum_1
0 0 0 -1 1 -1 1 2 2
1 0 1 -1 1 -1 2 1 2
2 -1 0 -1 1 1 2 1 2
3 -1 1 1 -1 1 3 0 2
4 1 0 1 -1 1 3 1 1
5 1 0 0 0 1 2 3 0
6 1 1 0 0 -1 2 2 1
7 1 1 -1 0 0 2 2 1
General solution with get_dummies:
df1 = (pd.get_dummies(df.astype(str), prefix='', prefix_sep='')
         .sum(level=0, axis=1)
         .add_prefix('Sum'))
print (df1)
Sum-1 Sum0 Sum1
0 2 2 1
1 2 1 2
2 2 1 2
3 2 0 3
4 1 1 3
5 0 3 2
6 1 2 2
7 1 2 2
df = df.join(df1)
print (df)
a b c d e Sum-1 Sum0 Sum1
0 0 0 -1 1 -1 2 2 1
1 0 1 -1 1 -1 2 1 2
2 -1 0 -1 1 1 2 1 2
3 -1 1 1 -1 1 2 0 3
4 1 0 1 -1 1 1 1 3
5 1 0 0 0 1 0 3 2
6 1 1 0 0 -1 1 2 2
7 1 1 -1 0 0 1 2 2
An idea for better performance of Zero's solution - compare against the underlying numpy array, and instead of static values use the unique values from numpy.unique:
all_vals = np.unique(df.values)
arr = df.values
df1 = df.join(pd.DataFrame({'Sum{}'.format(v):(arr == v).sum(1) for v in all_vals}))
print (df1)
a b c d e Sum-1 Sum0 Sum1
0 0 0 -1 1 -1 2 2 1
1 0 1 -1 1 -1 2 1 2
2 -1 0 -1 1 1 2 1 2
3 -1 1 1 -1 1 2 0 3
4 1 0 1 -1 1 1 1 3
5 1 0 0 0 1 0 3 2
6 1 1 0 0 -1 1 2 2
7 1 1 -1 0 0 1 2 2
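Pushing the same idea one step further - this variation is mine, not from the original answers - a single broadcasted comparison counts every value at once and avoids the Python-level loop over values:
arr = df[list('abcde')].values
all_vals = np.unique(arr)
# (rows, 5, 1) == (n_vals,) broadcasts to (rows, 5, n_vals); summing over
# axis 1 counts each value per row in one shot.
counts = (arr[:, :, None] == all_vals).sum(axis=1)
df1 = df.join(pd.DataFrame(counts,
                           columns=['Sum{}'.format(v) for v in all_vals],
                           index=df.index))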
Timings
np.random.seed(234)
N = 100000
df = pd.DataFrame(np.random.randint(3, size=(N,5)), columns=list('abcde')) - 1
print (df)
#wen's solution 1
In [49]: %timeit pd.concat([df,pd.get_dummies(df.astype(str).stack()).sum(level=0).add_prefix('Sum')],1)
1 loop, best of 3: 2.21 s per loop
#wen's solution 2
In [56]: %timeit df.apply(lambda x : x.value_counts(),1).fillna(0)
1 loop, best of 3: 1min 35s per loop
#jezrael's solution 2
In [50]: %timeit df.join((pd.get_dummies(df.astype(str), prefix='', prefix_sep='').sum(level=0, axis=1).add_prefix('Sum')))
1 loop, best of 3: 2.14 s per loop
#jezrael's solution 1
In [55]: %%timeit
...: m1 = df == 1
...: m0 = df == 0
...: m_1 = df == -1
...: df['Sum1'] = m1.sum(1)
...: df['Sum0'] = m0.sum(1)
...: df['Sum_1'] = m_1.sum(1)
...:
10 loops, best of 3: 50.6 ms per loop
#zero's solution1
In [51]: %timeit df.assign(**{'Sum{}'.format(v):df.eq(v).sum(1) for v in [1, 0, -1]})
10 loops, best of 3: 39.8 ms per loop
#zero's solution2
In [52]: %timeit df.join(pd.DataFrame({'Sum{}'.format(v):df.eq(v).sum(1) for v in [1, 0, -1]}))
10 loops, best of 3: 39.6 ms per loop
#zero&jezrael's solution1
In [53]: %timeit df.join(pd.DataFrame({'Sum{}'.format(v):(df.values == v).sum(1) for v in np.unique(df.values)}))
10 loops, best of 3: 23.8 ms per loop
#zero&jezrael's solution2
In [54]: %timeit df.join(pd.DataFrame({'Sum{}'.format(v):(df.values == v).sum(1) for v in [0, 1, -1]}))
100 loops, best of 3: 12.8 ms per loop
# With many columns and more unique values, it is possible to convert
# to a numpy array outside the loop:
def f1(df):
    all_vals = np.unique(df.values)
    arr = df.values
    return df.join(pd.DataFrame({'Sum{}'.format(v): (arr == v).sum(1) for v in all_vals}))

def f2(df):
    arr = df.values
    return df.join(pd.DataFrame({'Sum{}'.format(v): (arr == v).sum(1) for v in [0, 1, -1]}))

print (f1(df))
print (f2(df))
#zero&jezrael's solution3
In [58]: %timeit (f1(df))
10 loops, best of 3: 25.8 ms per loop
#zero&jezrael's solution4
In [59]: %timeit (f2(df))
100 loops, best of 3: 13 ms per loop
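One more speculative variant for this specific -1/0/1 data (my own sketch, not benchmarked above): offset the values to 0..2 and let np.bincount count all rows at once through flat indices:
arr = df[list('abcde')].values
offset = arr + 1                                   # map -1, 0, 1 -> 0, 1, 2
flat = offset + 3 * np.arange(len(arr))[:, None]   # give each row its own bins
counts = np.bincount(flat.ravel(), minlength=3 * len(arr)).reshape(-1, 3)
# counts columns correspond to the values -1, 0, 1 in that order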
I would like to transform the below pandas dataframe:
dd = pd.DataFrame({"zz": [1, 3], "y": ["a", "b"], "x": [[1, 2], [1]]})
        x  y  zz
0  [1, 2]  a   1
1     [1]  b   3
into :
   x  y  zz
0  1  a   1
1  1  b   3
2  2  a   1
As you can see, the lists in column x are expanded into their individual elements while the other columns y and zz are repeated. Can I do this without using a for loop?
Use:
# get the lengths of the lists in x
l = dd['x'].str.len()
df = (dd.loc[dd.index.repeat(l)]
        .assign(x=np.concatenate(dd['x']))
        .reset_index(drop=True))
print (df)
x y zz
0 1 a 1
1 2 a 1
2 1 b 3
But if order is important:
df1 = (pd.DataFrame(dd['x'].values.tolist())
         .stack()
         .sort_index(level=[1, 0])
         .reset_index(name='x'))
print (df1)
level_0 level_1 x
0 0 0 1.0
1 1 0 1.0
2 0 1 2.0
df = df1.join(dd.drop('x', axis=1), on='level_0').drop(['level_0', 'level_1'], axis=1)
print (df)
x y zz
0 1.0 a 1
1 1.0 b 3
2 2.0 a 1
Using join and stack, you can do:
In [655]: dd.drop('x', axis=1).join(
     ...:     dd.apply(lambda x: pd.Series(x.x), axis=1)
     ...:       .stack().reset_index(level=1, drop=True).to_frame('x'))
Out[655]:
   y  zz    x
0  a   1  1.0
0  a   1  2.0
1  b   3  1.0
Details
In [656]: dd.apply(lambda x: pd.Series(x.x), axis=1).stack().reset_index(level=1,drop=True)
Out[656]:
0 1.0
0 2.0
1 1.0
dtype: float64
In [657]: dd
Out[657]:
        x  y  zz
0  [1, 2]  a   1
1     [1]  b   3
new_dd = pd.DataFrame(dd.apply(lambda x: pd.Series(x['x']), axis=1)
                        .stack().reset_index(level=1, drop=True))
new_dd.columns = ['x']
new_dd.merge(dd[['y', 'zz']], left_index=True, right_index=True)
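Finally, a present-day note: since pandas 0.25, DataFrame.explode does this directly (row order follows the original rows, unlike the reordered output shown in the question):
# Each list element in x gets its own row; y and zz are repeated alongside.
dd.explode('x').reset_index(drop=True)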