I have been trying to get a crosstab of data specified by slices, but something is wrong with my syntax.
data.csv looks like the following:
ia,ib,ic,id,ie,if,ig
a,0,0,0,e,0,g
0,b,0,0,e,f,0
0,0,c,d,0,f,g
Then I run python3 test.py with the following code:
import pandas as pd
import enum

df = pd.read_csv('data.csv')

class Slices(enum.Enum):
    first = slice(0, 2)
    second = slice(4, 6)

def getCrosstab(*args):
    cols1 = []
    cols1.append(df.iloc[:, args[0].value])
    cols2 = []
    cols2.append(df.iloc[:, args[1].value])
    print(pd.crosstab(cols1, cols2))

if __name__ == '__main__':
    getCrosstab(Slices.first, Slices.second)
Expected result:
col2 ie if ig
col1
ia 1 0 1
ib 1 1 0
ic 0 1 1
But I got this error:
ValueError: Shape of passed values is (2, 2), indices imply (2, 3)
I cannot fully understand the meaning of this error. Please give me your guidance.
The error comes from passing pd.crosstab something it does not expect: it works with one-dimensional array-likes (or lists of them), while cols1 and cols2 each hold an entire two-column DataFrame slice, so the values crosstab assembles do not line up with the index it infers. Instead, melt twice, once for each set of columns, and then call crosstab:
u = (df.melt(['ia', 'ib', 'ic'], var_name='C', value_name='D')
       .melt(['C', 'D'], var_name='A', value_name='B')
       .query("B != '0' and D != '0'"))

pd.crosstab(u.A, u.C)
C id ie if ig
A
ia 0 1 0 1
ib 0 1 1 0
ic 1 0 1 1
To generalize this for arbitrary slices (using Index.union rather than |, which newer pandas versions no longer treat as a set operation on Indexes):

def crosstab_for(df, sliceA, sliceB):
    cols = df.columns[sliceA].union(df.columns[sliceB], sort=False)
    u = (df.reindex(cols, axis=1)
           .melt(df.columns[sliceA], var_name='C', value_name='D')
           .melt(['C', 'D'], var_name='A', value_name='B')
           .query("B != '0' and D != '0'"))
    return pd.crosstab(u.A, u.C)
crosstab_for(df, slice(0, 3), slice(4, 7))
C ie if ig
A
ia 1 0 1
ib 1 1 0
ic 0 1 1
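For completeness, here is a sketch of how this plugs back into the question's setup; it assumes the Slices enum is widened to cover all three columns on each side (the original slice(0, 2) and slice(4, 6) only take two columns each, so they could not produce the expected 3x3 table anyway):

import enum

class Slices(enum.Enum):
    first = slice(0, 3)    # ia, ib, ic
    second = slice(4, 7)   # ie, if, ig

# pass the raw slice values to the generalized function
print(crosstab_for(df, Slices.first.value, Slices.second.value))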
Related
I want to count the number of elements in a row that fall within a range and then print a new column with the results. After looking around, I came up with the following solution; however, the results are not consistent. Is the solution too simplistic for what I want to accomplish?
I have the following DataFrame.
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(0,10,size=(10, 3)), columns=list('ABC'))
def val_1(row):
    val_1 = 0
    if row.loc['A'] and row.loc['B'] and row.loc['C'] in range(1, 3):
        val_1 = 3
    elif row.loc['A'] and row.loc['B'] in range(1, 3):
        val_1 = 2
    elif row.loc['A'] in range(1, 3):
        val_1 = 1
    return val_1

def val_2(row):
    val_2 = 0
    if row.loc['A'] and row.loc['B'] and row.loc['C'] in range(3, 6):
        val_2 = 3
    elif row.loc['A'] and row.loc['B'] in range(3, 6):
        val_2 = 2
    elif row.loc['A'] in range(3, 6):
        val_2 = 1
    return val_2

def val_3(row):
    val_3 = 0
    if row.loc['A'] and row.loc['B'] and row.loc['C'] in range(6, 10):
        val_3 = 3
    elif row.loc['A'] and row.loc['B'] in range(6, 10):
        val_3 = 2
    elif row.loc['A'] in range(6, 10):
        val_3 = 1
    return val_3

def results():
    df['Val_1'] = df.apply(val_1, axis=1)
    df['Val_2'] = df.apply(val_2, axis=1)
    df['Val_3'] = df.apply(val_3, axis=1)
    print(df)

results()
A B C Val_1 Val_2 Val_3
0 9 0 0 0 0 1
1 6 1 0 2 0 1
2 8 5 5 0 3 1
3 9 7 0 0 0 2
4 4 6 2 3 1 2
5 1 5 5 1 3 0
6 8 1 7 2 0 3
7 4 8 5 0 3 2
8 0 6 0 0 0 0
9 3 0 3 0 1 0
Thanks for your help.
This is a shorter version of your code. The problem in your code is that the range check applies only to the last element:
row.loc['A'] and row.loc['B'] and row.loc['C'] in range(start, end)
only checks the range for column C; A and B are merely tested for truthiness, not for membership in the range.
def get_val(row, start, end):
    val = 0
    if row.loc['A'] and row.loc['B'] and row.loc['C'] in range(start, end):
        val = 3
    elif row.loc['A'] and row.loc['B'] in range(start, end):
        val = 2
    elif row.loc['A'] in range(start, end):
        val = 1
    return val

df = pd.DataFrame(np.random.randint(0, 10, size=(10, 3)), columns=list('ABC'))
df['Val_1'] = df.apply(lambda x: get_val(x, 1, 3), axis=1)
df['Val_2'] = df.apply(lambda x: get_val(x, 3, 6), axis=1)
df['Val_3'] = df.apply(lambda x: get_val(x, 6, 10), axis=1)
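Note that get_val above only deduplicates the original logic and keeps the faulty condition it describes. A corrected version (a sketch that checks each column's membership explicitly) could look like this:

def get_val_fixed(row, start, end):
    # test range membership for each column individually
    in_range = [row[c] in range(start, end) for c in ('A', 'B', 'C')]
    if all(in_range):
        return 3
    elif in_range[0] and in_range[1]:
        return 2
    elif in_range[0]:
        return 1
    return 0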
My Solution

def query_build(start, end):
    query1 = f'A>={start} and A<={end} and B>={start} and B<={end} and C>={start} and C<={end}'
    query2 = f'A>={start} and A<={end} and B>={start} and B<={end}'
    query3 = f'A>={start} and A<={end}'
    # return the loosest query first so that stricter matches,
    # applied later, overwrite its value (each query is a superset
    # of the next stricter one)
    return {query3: 1,
            query2: 2,
            query1: 3}

df = pd.DataFrame(np.random.randint(0, 10, size=(10, 3)), columns=list('ABC'))
df['val1'] = 0
df['val2'] = 0
df['val3'] = 0
val_range = {'val1': (1, 2), 'val2': (3, 5), 'val3': (6, 9)}
for name, r_range in val_range.items():
    query_set = query_build(*r_range)
    for query, val in query_set.items():
        idx = df.query(query).index
        if len(idx):
            # use .loc instead of chained indexing (df[name][idx] = val),
            # which triggers SettingWithCopyWarning and may not assign
            df.loc[idx, name] = val
print(df)
Another approach is to use numpy.select (recommended in the Pandas User Guide here):
import numpy as np

for n, (start, stop) in enumerate([(1, 3), (3, 6), (6, 10)], start=1):
    m = df.isin(range(start, stop))
    condlist = [m.all(axis=1), m[["A", "B"]].all(axis=1), m["A"]]
    df[f"Val_{n}"] = np.select(condlist, [3, 2, 1])
For each iteration, build a mask m on df that checks whether the df values are in the respective range(start, stop).
Based on m, build a condition list condlist: the first entry checks if all values are in the range, the second checks if the values in columns A and B are in the range, and the third checks if the value in column A is in the range; all checks are done row-wise.
Based on condlist, set the corresponding values from [3, 2, 1] (called choicelist in the numpy.select docs) in the new column, and 0 (the standard default) if no condition is met.
The selection follows the precedence in your code; see the numpy.select documentation:
When multiple conditions are satisfied, the first one encountered in condlist is used.
Your question actually sounds a bit different to what you are doing:
I want to count the number of elements in a row that fall within a range and then print a new column with the results.
If that is your real goal, then you could try something simpler:
for n, (start, stop) in enumerate([(1, 3), (3, 6), (6, 10)], start=1):
    df[f"Val_{n}"] = df.isin(range(start, stop)).sum(axis=1)
Why your approach fails (in addition to #Mazhar's explanation): This
if row.loc['A'] and row.loc['B'] and row.loc["C"] in range(1,3):
isn't how logical operators like and work: You have to fully specify each part of the condition, like:
if (row.loc['A'] in range(1,3)) and (row.loc['B'] in range(1,3)) and (row.loc["C"] in range(1,3)):
The way you have used it actually resolves to
if bool(row.loc['A']) and bool(row.loc['B']) and row.loc["C"] in range(1,3):
which here means (since the values are numbers)
if (row.loc['A'] != 0) and (row.loc['B'] != 0) and (row.loc["C"] in range(1,3)):
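A quick way to see the pitfall in isolation (a minimal sketch with made-up values):

a, b, c = 5, 7, 2
print(a and b and c in range(1, 3))               # True: only c is range-checked
print(all(v in range(1, 3) for v in (a, b, c)))   # False: every value is checked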
Here is the code:
import pandas as pd
import numpy as np

df1 = pd.DataFrame({'0': [1, 0, 11, 0], '1': [0, 11, 4, 0]})
print(df1.head(5))

df2 = df1.copy()
columns = list(df2.columns)
print(columns)

for i in columns:
    idx1 = np.where((df2[i] > 0) & (df2[i] < 10))
    df2.loc[idx1] = 1
    idx3 = np.where(df2[i] == 0)
    df2.loc[idx3] = 0
    idx2 = np.where(df2[i] > 10)
    df2.loc[idx2] = 0

print(df2.head(5))
output:
0 1
0 1 0
1 0 11
2 11 4
3 0 0
['0', '1']
0 1
0 1 1
1 0 0
2 0 0
3 0 0
The part I am concerned about is:
idx1 = np.where((df2[i] > 0) & (df2[i] < 10))
df2.loc[idx1] = 1
Why isn't this logic working? According to this logic, this should be my output:
expected:
0 1
0 1 1
1 0 0
2 0 1
3 0 0
This can be done much more simply. You can operate directly on the dataframe as a whole; there is no need to cycle through the columns individually.
Also, you don't need numpy.where to grab indices; you can use the boolean dataframe from the selection directly.
sel = (df2 > 0) & (df2 < 10)
df2[sel] = 1
df2[df2 == 0] = 0
df2[df2 > 10] = 0
(The first line is only to make the second line not overly complicated to the eye.)
Given your conditions however, the result is
0 1
0 1 0
1 0 0
2 0 1
3 0 0
This is because you only set numbers between 0 and 10 (exclusive) to 1. A number like 11 is set to 0, yet your expected output somehow shows 1 in places; and 0 is also set to 0, not to 1 (the latter is what your expected output shows). Your expected output does not seem to align with your logic: it looks like anything between 0 and 10 (exclusive) should be 1 and everything else should be 0.
If so, try this:
df2 = pd.DataFrame(np.where((0 < df1) & (df1 < 10), 1, 0))
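If you also want to keep df1's index and column labels (np.where returns a bare array), a small variation of the same idea (a sketch) is:

df2 = pd.DataFrame(np.where((0 < df1) & (df1 < 10), 1, 0),
                   index=df1.index, columns=df1.columns)

# or, staying entirely in pandas:
df2 = ((0 < df1) & (df1 < 10)).astype(int)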
I have the following dataframe:

NAME  SIGNAL
a     0
b     0
c     0
d     0
e     1
f     1
g     1
h     0
i     0
j     0
k     0
l     0
m     0
n     1
o     1
p     1
q     1
r     0
s     0
t     0
I need to write a function that will allow me to extract another dataframe, or just modify the existing one, based on a condition:
get all columns (in my case NAME) where the SIGNAL column is 1 for that row, but also extract 2 extra rows from above and 2 extra rows from below.
In my example, the function should return the following table:
NAME  SIGNAL
c     0
d     0
e     1
f     1
g     1
h     0
i     0
j     0
l     0
m     0
n     1
o     1
p     1
q     1
r     0
s     0
Thanks!
UPDATE:
This is the code I have so far:
# Import pandas library
import pandas as pd
# initialize list of lists
data = [['a', 0], ['b', 0], ['c', 1], ['d', 1], ['e', 0], ['f', 0], ['g', 0], ['h', 1], ['i', 0], ['j', 0], ['k', 0]]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['NAME', 'SIGNAL'])
# print dataframe.
print(df)
print("----------------")
for index, row in df.iterrows():
    # print(row['Name'], row['Age'])
    if (df.iloc[index]['SIGNAL'] == 1) & (df.iloc[index - 1]['SIGNAL'] == 0):  # check when the signal changes from 0 to 1
        print(df.iloc[index]['NAME'])    # first line with signal 1 after it was 0
        # print the 2 lines above
        print(df.iloc[index - 1]['NAME'])
        print(df.iloc[index - 2]['NAME'])
My dataframe is like:
NAME SIGNAL
0 a 0
1 b 0
2 c 1
3 d 1
4 e 0
5 f 0
6 g 0
7 h 1
8 i 0
9 j 0
10 k 0
My code is returning:
c
b
a
h
g
f
The problem here is that I cannot return the values of "d" and "e" + "f", or "i" and "j", because I get the error "IndexError: single positional indexer is out-of-bounds" if I try the condition:
(df.iloc[index]['SIGNAL'] == 1) & (df.iloc[index + 1]['SIGNAL'] == 0)
Also, the extended bounds will be variable: sometimes I will work with 2 extra rows from the top and bottom, sometimes with more.
I am looking for a solution using dataframe functions, not iteration.
thanks!
This will return the desired data frame:
df[(df.shift(periods=-2, axis="rows").SIGNAL == 1)
   | (df.shift(periods=-1, axis="rows").SIGNAL == 1)
   | (df.SIGNAL == 1)
   | (df.shift(periods=1, axis="rows").SIGNAL == 1)
   | (df.shift(periods=2, axis="rows").SIGNAL == 1)]
Output:

NAME  SIGNAL
c     0
d     0
e     1
f     1
g     1
h     0
i     0
l     0
m     0
n     1
o     1
p     1
q     1
r     0
s     0
Add .NAME to the end to get your series of names
2 c
3 d
4 e
5 f
6 g
7 h
8 i
11 l
12 m
13 n
14 o
15 p
16 q
17 r
18 s
Name: NAME, dtype: object
Update: for an arbitrarily large span:

m = (df.shift(periods=-400, axis="rows").SIGNAL == 1)
for i in range(-399, 401):
    m = m | (df.shift(periods=i, axis="rows").SIGNAL == 1)
print(df[m])
Disclaimer:
This method may be inefficient for large spans
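A vectorized alternative that scales with the span (a sketch; span is the number of extra rows to keep on each side): a centered rolling maximum over a window of 2 * span + 1 rows flags every row that has a 1 within span rows of it.

span = 2
near_signal = (df.SIGNAL
                 .rolling(window=2 * span + 1, center=True, min_periods=1)
                 .max()
                 .eq(1))
print(df[near_signal])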
I have a Pandas dataframe that looks like this:
df = pd.DataFrame({'gp_id': [1, 2, 1, 2], 'A': [1, 2, 3, 4]})
gp_id A
0 1 1
1 2 2
2 1 3
3 2 4
I want to assign the value -1 to the first row of the group with the id 2 (gp_id = 2), to get the following output:
gp_id A
0 1 1
1 2 -1
2 1 3
3 2 4
To do this, I've tried the following code:
df[df.gp_id == 2].A.iloc[0] = -1
But this doesn't do anything as I'm assigning a value in the sub-dataframe df[df.gp_id == 2] and I'm not modifying the original dataframe df.
Is there an easy way to solve this problem?
You could do:
df.loc[(df.gp_id == 2).argmax(), 'A'] = -1
since pd.Series.argmax returns the position of the first occurrence of the maximum (here, the first True).
If you are not sure that the value is present in the dataframe, you could do:

cond = (df.gp_id == 2)
if cond.sum():
    df.loc[cond.argmax(), 'A'] = -1
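One caveat (an assumption worth checking): argmax returns a position, so df.loc only lines up with it when the dataframe has the default RangeIndex, as it does here. With an arbitrary index, a label-based sketch would be:

cond = (df.gp_id == 2)
if cond.any():
    # idxmax returns the index label of the first True value
    df.loc[cond.idxmax(), 'A'] = -1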
A general solution, for the case where the mask may match no rows, is to chain a second mask built from the cumulative sum of the first (combined with & for bitwise AND) and set the values with DataFrame.loc:
m = df.gp_id == 2
df.loc[m & (m.cumsum() == 1), 'A'] = -1
This works well if there is no match: nothing is assigned, with no error and no incorrect assignment:
m = df.gp_id == 7
df.loc[m & (m.cumsum() == 1), 'A'] = -1
If the mask is guaranteed to match at least one row, a solution is:

idx = df[df.gp_id == 2].index[0]
df.loc[idx, 'A'] = -1
print(df)
gp_id A
0 1 1
1 2 -1
2 1 3
3 2 4
If there is no match, this solution raises an error rather than assigning incorrectly.
I'm trying to multiply two pandas dataframes with each other. Specifically, I want to multiply every column with every column of the other df.
The dataframes are one-hot encoded, so they look like this:
col_1, col_2, col_3, ...
0 1 0
1 0 0
0 0 1
...
I could just iterate through each of the columns using a for loop, but in python that is computationally expensive, and I'm hoping there's an easier way.
One of the dataframes has 500 columns, the other has 100 columns.
This is the fastest version that I've been able to write so far:
interact_pd = pd.DataFrame(index=df_1.index)
df1_columns = [column for column in df_1]
for column in df_2:
    col_pd = df_1[df1_columns].multiply(df_2[column], axis="index")
    interact_pd = interact_pd.join(col_pd, lsuffix='_' + column)
I iterate over each column in df_2, multiply all of df_1 by that column, and append the result to interact_pd. I would rather not do it with a for loop, however, as this is very computationally costly. Is there a faster way of doing it?
EDIT: example
df_1:
1col_1, 1col_2, 1col_3
0 1 0
1 0 0
0 0 1
df_2:
2col_1, 2col_2
0 1
1 0
0 0
interact_pd:
1col_1_2col_1, 1col_2_2col_1,1col_3_2col_1, 1col_1_2col_2, 1col_2_2col_2,1col_3_2col_2
0 0 0 0 1 0
1 0 0 0 0 0
0 0 0 0 0 0
# use numpy to get a pair of indices that map out every
# combination of columns from df_1 and columns of df_2
pidx = np.indices((df_1.shape[1], df_2.shape[1])).reshape(2, -1)

# use pandas MultiIndex to create a nice MultiIndex for
# the final output
lcol = pd.MultiIndex.from_product([df_1.columns, df_2.columns],
                                  names=[df_1.columns.name, df_2.columns.name])

# df_1.values[:, pidx[0]] slices df_1 values for every combination,
# likewise with df_2.values[:, pidx[1]];
# finally, I marry up the product of arrays with the MultiIndex
pd.DataFrame(df_1.values[:, pidx[0]] * df_2.values[:, pidx[1]],
             columns=lcol)
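Equivalently, the index-array trick can be written with plain NumPy broadcasting; this sketch reuses the lcol MultiIndex built above:

# (n, a, 1) * (n, 1, b) broadcasts to (n, a, b); reshaping to (n, a*b)
# flattens it in the same order as MultiIndex.from_product
vals = (df_1.to_numpy()[:, :, None] * df_2.to_numpy()[:, None, :]).reshape(len(df_1), -1)
pd.DataFrame(vals, columns=lcol)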
Timing
code
from string import ascii_letters

df_1 = pd.DataFrame(np.random.randint(0, 2, (1000, 26)), columns=list(ascii_letters[:26]))
df_2 = pd.DataFrame(np.random.randint(0, 2, (1000, 52)), columns=list(ascii_letters))

def pir1(df_1, df_2):
    pidx = np.indices((df_1.shape[1], df_2.shape[1])).reshape(2, -1)
    lcol = pd.MultiIndex.from_product([df_1.columns, df_2.columns],
                                      names=[df_1.columns.name, df_2.columns.name])
    return pd.DataFrame(df_1.values[:, pidx[0]] * df_2.values[:, pidx[1]],
                        columns=lcol)

def Test2(DA, DB):
    MA = DA.to_numpy()   # as_matrix() was removed in pandas 1.0
    MB = DB.to_numpy()
    MM = np.zeros((len(MA), len(MA[0]) * len(MB[0])))
    Col = []
    for i in range(len(MB[0])):
        for j in range(len(MA[0])):
            MM[:, i * len(MA[0]) + j] = MA[:, j] * MB[:, i]
            Col.append('1col_' + str(i + 1) + '_2col_' + str(j + 1))
    return pd.DataFrame(MM, dtype=int, columns=Col)
results
You can multiply your first df along the index axis by each column of the second df; this is the fastest method for big datasets (see the benchmark below):

df = pd.concat([df_1.mul(col, axis="index") for name, col in df_2.items()], axis=1)
# change the names of the columns (iteritems() was removed in pandas 2.0; use items())
df.columns = ["_".join([i, j]) for j in df_2.columns for i in df_1.columns]
df
1col_1_2col_1 1col_2_2col_1 1col_3_2col_1 1col_1_2col_2 \
0 0 0 0 0
1 1 0 0 0
2 0 0 0 0
1col_2_2col_2 1col_3_2col_2
0 1 0
1 0 0
2 0 0
--> See benchmark for comparisons with other answers to choose the best option for your dataset.
Benchmark
Functions:
def Test2(DA, DB):
    MA = DA.to_numpy()   # as_matrix() was removed in pandas 1.0
    MB = DB.to_numpy()
    MM = np.zeros((len(MA), len(MA[0]) * len(MB[0])))
    Col = []
    for i in range(len(MB[0])):
        for j in range(len(MA[0])):
            MM[:, i * len(MA[0]) + j] = MA[:, j] * MB[:, i]
            Col.append('1col_' + str(i + 1) + '_2col_' + str(j + 1))
    return pd.DataFrame(MM, dtype=int, columns=Col)

def Test3(df_1, df_2):
    df = pd.concat([df_1.mul(col, axis="index") for name, col in df_2.items()], axis=1)
    df.columns = ["_".join([i, j]) for j in df_2.columns for i in df_1.columns]
    return df

def Test4(df_1, df_2):
    pidx = np.indices((df_1.shape[1], df_2.shape[1])).reshape(2, -1)
    lcol = pd.MultiIndex.from_product([df_1.columns, df_2.columns],
                                      names=[df_1.columns.name, df_2.columns.name])
    return pd.DataFrame(df_1.values[:, pidx[0]] * df_2.values[:, pidx[1]],
                        columns=lcol)

def jeanrjc_imp(df_1, df_2):
    df = pd.concat([df_1.mul(col, axis="index") for name, col in df_2.items()],
                   axis=1, keys=df_2.columns)
    return df
Code (sorry for the ugly code; the plot at the end is what matters):
# run in IPython/Jupyter: %timeit is an IPython magic
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

df_1 = pd.DataFrame(np.random.randint(0, 2, (1000, 600)))
df_2 = pd.DataFrame(np.random.randint(0, 2, (1000, 600)))
df_1.columns = ["1col_" + str(i) for i in range(len(df_1.columns))]
df_2.columns = ["2col_" + str(i) for i in range(len(df_2.columns))]

resa = {}
resb = {}
resc = {}
for f, r in zip([Test2, Test3, Test4, jeanrjc_imp], ["T2", "T3", "T4", "T3bis"]):
    resa[r] = []
    resb[r] = []
    resc[r] = []
    for i in [5, 10, 30, 50, 150, 200]:
        a = %timeit -o f(df_1.iloc[:, :i], df_2.iloc[:, :10])
        b = %timeit -o f(df_1.iloc[:, :i], df_2.iloc[:, :50])
        c = %timeit -o f(df_1.iloc[:, :i], df_2.iloc[:, :200])
        resa[r].append(a.best)
        resb[r].append(b.best)
        resc[r].append(c.best)

X = [5, 10, 30, 50, 150, 200]
fig, ax = plt.subplots(1, 3, figsize=[16, 5])
for j, (a, r) in enumerate(zip(ax, [resa, resb, resc])):
    for i in r:
        a.plot(X, r[i], label=i)
    a.set_xlabel("df_1 columns #")
    a.set_title("df_2 columns # = {}".format(["10", "50", "200"][j]))
ax[0].set_ylabel("time(s)")
plt.legend(loc=0)
plt.tight_layout()
With T3bis <=> jeanrjc_imp, which is a bit faster than Test3.
Conclusion:
Depending on your dataset size, pick the right function between Test4 and Test3 (bis). Given the OP's dataset, Test3 or jeanrjc_imp should be the fastest, and also the shortest to write!
HTH
You can use numpy.
Consider this example code. I modified the variable names, but Test1() is essentially your code. I didn't bother to create the correct column names in that function, though:

import pandas as pd
import numpy as np

A = [[1, 0, 1, 1], [0, 1, 1, 0], [0, 1, 0, 1]]
B = [[0, 0, 1, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 0, 0, 1], [1, 0, 0, 0]]
DA = pd.DataFrame(A).T
DB = pd.DataFrame(B).T

def Test1(DA, DB):
    E = pd.DataFrame(index=DA.index)
    DAC = [column for column in DA]
    for column in DB:
        C = DA[DAC].multiply(DB[column], axis="index")
        E = E.join(C, lsuffix='_' + str(column))
    return E

def Test2(DA, DB):
    MA = DA.to_numpy()   # as_matrix() was removed in pandas 1.0
    MB = DB.to_numpy()
    MM = np.zeros((len(MA), len(MA[0]) * len(MB[0])))
    Col = []
    for i in range(len(MB[0])):
        for j in range(len(MA[0])):
            MM[:, i * len(MA[0]) + j] = MA[:, j] * MB[:, i]
            Col.append('1col_' + str(i + 1) + '_2col_' + str(j + 1))
    return pd.DataFrame(MM, dtype=int, columns=Col)

print(Test1(DA, DB))
print(Test2(DA, DB))
Output:
0_1 1_1 2_1 0 1 2 0_3 1_3 2_3 0 1 2 0 1 2
0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0
1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
2 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
1col_1_2col_1 1col_1_2col_2 1col_1_2col_3 1col_2_2col_1 1col_2_2col_2 \
0 0 0 0 1 0
1 0 0 0 0 0
2 1 1 0 1 1
3 0 0 0 0 0
1col_2_2col_3 1col_3_2col_1 1col_3_2col_2 1col_3_2col_3 1col_4_2col_1 \
0 0 1 0 0 1
1 0 0 1 1 0
2 0 0 0 0 0
3 0 0 0 0 1
1col_4_2col_2 1col_4_2col_3 1col_5_2col_1 1col_5_2col_2 1col_5_2col_3
0 0 0 1 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 1 0 0 0
Performance of your function:
%timeit(Test1(DA,DB))
100 loops, best of 3: 11.1 ms per loop
Performance of my function:
%timeit(Test2(DA,DB))
1000 loops, best of 3: 464 µs per loop
It's not beautiful, but it's efficient.