Given the following two dataframes:
df1 = pd.DataFrame(data={'unicorn': ['blue', 'red', 'piNk'], 'size': [3, 4, 6]})
df2 = pd.DataFrame(data={'unicorn': ['red'], 'size': [2]})
df1:
  unicorn  size
0    blue     3
1     red     4
2    piNk     6
df2 (always has one row):
  unicorn  size
0     red     2
How can I compare the rows of both dataframes column-wise using custom comparison functions like this (simplified):
def unicorn_comparison(str1, str2) -> float:
    return 100.0 if str1 == str2 else 0.0

and

def size_comparison(nr1, nr2) -> float:
    return 100.0 if nr1 < nr2 else 0.0
Expected result:
   unicorn  size
0      0.0   0.0
1    100.0   0.0
2      0.0   0.0
As you always have a single row in df2, don't use a DataFrame (2D), but a Series (1D):
ser = df2.loc[0]
Then, assuming you just want an element-wise comparison, use vectorized code (not a custom function):
out = df1.eq(ser)*100
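For the sample data this yields:

   unicorn  size
0        0     0
1      100     0
2        0     0

Note that this is a plain equality check on every column; it matches unicorn_comparison but not the less-than logic of size_comparison.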
If you really need to use non-vectorized functions and a different comparison per column, use:
def unicorn_comparison(str1, str2) -> float:
    return 100.0 if str1 == str2 else 0.0

def size_comparison(nr1, nr2) -> float:
    return 100.0 if nr1 < nr2 else 0.0

funcs = {'unicorn': unicorn_comparison,
         'size': size_comparison,
         }

out = df1.apply(lambda c: c.apply(lambda s: funcs[c.name](s, ser[c.name])))
output:
   unicorn  size
0      0.0   0.0
1    100.0   0.0
2      0.0   0.0
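An equivalent construction (a sketch, using a dict comprehension instead of nested apply) builds the result column by column:

out = pd.DataFrame({col: df1[col].map(lambda v: funcs[col](v, ser[col]))
                    for col in funcs})

Because map consumes each lambda eagerly within its own loop iteration, the usual late-binding pitfall with col does not arise here.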
Another way: first, add the df2 column you want to compare against. Since df2 has a single row, take the scalar so it broadcasts to every row of df1:

df1['unicorn2'] = df2['unicorn'].iloc[0]

Then you can use an apply loop and run whatever comparison logic you want inside it:

def function(x):
    # your logic
    return x

df1_result = df1.apply(function)
for col in df1:
    df1[col] = (df1[col] == df2[col].loc[0]).replace({True: 100, False: 0})
This will overwrite your df1; you can make a copy of it first (see the sketch below).
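For example, a minimal sketch that keeps df1 intact:

result = df1.copy()
for col in result:
    result[col] = (result[col] == df2[col].loc[0]).replace({True: 100, False: 0})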
Suppose we have this simple pandas.DataFrame:
import pandas as pd
df = pd.DataFrame(
    columns=['quantity', 'value'],
    data=[[1, 12.5], [3, 18.0]]
)
>>> print(df)
   quantity  value
0         1   12.5
1         3   18.0
I would like to create a new column, say modified_value, that applies a function N times to the value column, N being the quantity column.
Supposing that function is new_value = round(value/2, 1), the expected result would be:
   quantity  value  modified_value
0         1   12.5             6.2  # applied 1 time
1         3   18.0             2.2  # applied 3 times: 18.0 -> 9.0 -> 4.5 -> 2.2
What would be an elegant/vectorized way to do so?
You can write a custom repeat function, then use apply:
def repeat(func, x, n):
    ret = x
    for i in range(int(n)):
        ret = func(ret)
    return ret

def my_func(val):
    return round(val/2, 1)

df['new_col'] = df.apply(lambda x: repeat(my_func, x['value'], x['quantity']),
                         axis=1)
# or without apply
# df['new_col'] = [repeat(my_func, v, n) for v, n in zip(df['value'], df['quantity'])]
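With the sample frame this produces:

   quantity  value  new_col
0         1   12.5      6.2
1         3   18.0      2.2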
Use reduce:
from functools import reduce

def repeated(f, n):
    def rfun(p):
        return reduce(lambda x, _: f(x), range(n), p)
    return rfun

def myfunc(value):
    return round(value/2, 1)

df['modified_value'] = df.apply(lambda x: repeated(myfunc, int(x['quantity']))(x['value']),
                                axis=1)

We can also use a list comprehension instead of apply:

df['modified_value'] = [repeated(myfunc, int(quantity))(value)
                        for quantity, value in zip(df['quantity'], df['value'])]
Output

   quantity  value  modified_value
0         1   12.5             6.2
1         3   18.0             2.2
I want to create a loop that loads all the iterations of two variables into a dataframe in separate columns. I want variable "a" to hold values between 0 and 1 in 0.1 increments, and the same for variable "b". In other words, there should be 121 combinations when complete, starting with 0 & 0 and ending with 1 & 1.
I've tried the following code
data = [['Decile 1', 10], ['Decile_2', 15], ['Decile_3', 14]]
staging_table = pd.DataFrame(data, columns=['Decile', 'Volume'])
profile_table = pd.DataFrame(columns=['Decile', 'Volume'])
a = 0
b = 0
finished = False
while not finished:
    if b != 1:
        if a != 1:
            a = a + 0.1
            staging_table['CAM1_Modifier'] = a
            staging_table['CAM2_Modifier'] = b
            profile_table = profile_table.append(staging_table)
        else:
            b = b + 0.1
    else:
        finished = True
profile_table
You can use itertools.product to get all the combinations:
import itertools
import pandas as pd
x = [i / 10 for i in range(11)]
df = pd.DataFrame(
    list(itertools.product(x, x)),
    columns=["a", "b"]
)
#        a    b
# 0    0.0  0.0
# 1    0.0  0.1
# 2    0.0  0.2
# ..   ...  ...
# 118  1.0  0.8
# 119  1.0  0.9
# 120  1.0  1.0
#
# [121 rows x 2 columns]
itertools is your friend.
from itertools import product
for a, b in product(map(lambda x: x / 10, range(11)),
                    map(lambda x: x / 10, range(11))):
    ...

range(11) gives us the integers from 0 to 10 (regrettably, range does not work with floats). Then we divide those values by 10 to get your range from 0 to 1. Then we take the Cartesian product of that iterable with itself to get every combination.
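If you already depend on numpy, an alternative sketch (not part of the original answers) builds the grid with numpy.linspace, which includes the endpoint directly:

import itertools
import numpy as np
import pandas as pd

x = np.linspace(0, 1, 11).round(1)  # 0.0, 0.1, ..., 1.0; round(1) guards against float artifacts
df = pd.DataFrame(list(itertools.product(x, x)), columns=["a", "b"])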
How do I write a function that checks whether two input dataframes are equal as long as the rows in both dataframes are the same? It should disregard index positions and column order. I can't use df.equals(), since it enforces equal data types, which is not what I need.
import pandas as pd
from io import StringIO

canonical_in_csv = """,c,a,b
2,hat,x,1
0,rat,y,4
3,cat,x,2
1,bat,x,2"""
with StringIO(canonical_in_csv) as fp:
    df1 = pd.read_csv(fp, index_col=0)

canonical_soln_csv = """,a,b,c
0,x,1,hat
1,x,2,bat
2,x,2,cat
3,y,4,rat"""
with StringIO(canonical_soln_csv) as fp:
    df2 = pd.read_csv(fp, index_col=0)
df1:
     c  a  b
2  hat  x  1
0  rat  y  4
3  cat  x  2
1  bat  x  2
df2:
   a  b    c
0  x  1  hat
1  x  2  bat
2  x  2  cat
3  y  4  rat
My attempt:

temp1 = (df1 == df2).all()
temp2 = temp1.all()
temp2

ValueError: Can only compare identically-labeled DataFrame objects
You can sort by index and column labels first with sort_index, then merge and compare with eq (==) or equals:
df11 = df1.sort_index().sort_index(axis=1)
df22 = df2.sort_index().sort_index(axis=1)
print (df11.merge(df22))

   a  b    c
0  y  4  rat
1  x  2  bat
2  x  1  hat
3  x  2  cat

print (df11.merge(df22).eq(df11))

      a     b     c
0  True  True  True
1  True  True  True
2  True  True  True
3  True  True  True

a = df11.merge(df22).eq(df11).values.all()
# alternative
# a = df11.merge(df22).equals(df11)
print (a)
True
Your function should be rewritten:
def checkequality(A, B):
    df11 = A.sort_index(axis=1)
    df11 = df11.sort_values(df11.columns.tolist()).reset_index(drop=True)
    df22 = B.sort_index(axis=1)
    df22 = df22.sort_values(df22.columns.tolist()).reset_index(drop=True)
    return (df11 == df22).values.all()
a = checkequality(df1, df2)
print (a)
True
Your request to disregard the row index is pretty difficult to satisfy, as this data type is not optimized for such an operation. The column-order issue, fortunately, is easier; this will help:
df1.values == df2[df1.columns].values
where df1.columns syncs the column order and .values converts to numpy arrays for the comparison. I still recommend against re-ordering and matching rows, as that can be very taxing for bigger datasets.
If matching based on the index is acceptable, this may be what you are looking for:
df1.values==df2.reindex(df1.index.values.tolist())[df1.columns].values
Update
As pointed out by @Dark, a cleaner, index-aligned comparison can be done like this:
df1.loc[df2.index,df2.columns] == df2
I figured it out,
def checkequality(A, B):
    var_names = sorted(A.columns)
    Y = A[var_names].copy()
    Y.sort_values(by=var_names, inplace=True)
    Y.set_index([list(range(0, len(Y)))], inplace=True)

    var_names2 = sorted(B.columns)
    Y2 = B[var_names2].copy()
    Y2.sort_values(by=var_names2, inplace=True)
    Y2.set_index([list(range(0, len(Y2)))], inplace=True)

    return (Y == Y2).all().all()
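For the df1 and df2 above, this version also returns True:

print(checkequality(df1, df2))
True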
I am trying to create a new groupid based on the original groupid, which has the values 0 and 1. I used the following code, but it failed to encode the NaN rows as 2.
final['groupid2'] = final['groupid'].apply(lambda x: 2 if x == np.nan else x)
I also tried the following code, but it gave an AttributeError:
final['groupid2'] = final['groupid'].apply(lambda x: 2 if x.isnull() else x)
Could someone please explain why this is the case? Thanks
Use pd.isnull to check scalars if you need apply:
import numpy as np
import pandas as pd

final = pd.DataFrame({'groupid': [1, 0, np.nan],
                      'B': [400, 500, 600]})
final['groupid2'] = final['groupid'].apply(lambda x: 2 if pd.isnull(x) else x)
print (final)

   groupid    B  groupid2
0      1.0  400       1.0
1      0.0  500       0.0
2      NaN  600       2.0
Details:
The value x in the lambda function is a scalar, because Series.apply loops over each value of the column. So the method pd.Series.isnull() fails there.
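The underlying reason the original x == np.nan check failed is that NaN never compares equal to anything, including itself:

import numpy as np
import pandas as pd

print(np.nan == np.nan)   # False - NaN is not equal to itself
print(pd.isnull(np.nan))  # True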
For better testing, it is possible to rewrite the lambda function as a named function that prints what it sees:

def f(x):
    print (x)
    print (pd.isnull(x))
    return 2 if pd.isnull(x) else x

final['groupid2'] = final['groupid'].apply(f)
1.0
False
0.0
False
nan
True
But Series.fillna is better:
final['groupid2'] = final['groupid'].fillna(2)
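Another vectorized sketch (not in the original answer) uses numpy.where with Series.isna:

import numpy as np

final['groupid2'] = np.where(final['groupid'].isna(), 2, final['groupid'])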
How to compare values to next or previous items in loop?
I need to summarize consecutive repetitions of occurrences in columns.
After that I need to create a "frequency table", so the dfoutput should look like the desired output at the bottom.
This code doesn't work because I can't compare to another item.
Maybe there is another, simple way to do this without looping?
sumrep = 0
df = pd.DataFrame(data={'1': [0,0,1,0,1,1,0,1,1,0,1,1,1,1,0],
                        '2': [0,0,1,1,1,1,0,0,1,0,1,1,0,1,0]})
# index equal to the number of repetitions makes it easier to assign counts in the output df
df.index = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
dfoutput = pd.DataFrame(0, index=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], columns=['1','2'])

# example for column 1
for val1 in df.columns[1]:
    if val1 == 1 and val1 == 0:  # can't find a way to check the NEXT val1 (one row below) in column 1 :/
        if sumrep == 0:
            dfoutput.loc[1,1] = dfoutput.loc[1,1] + 1  # count only SINGLE occurrences and assign them to row 1 in dfoutput
        if sumrep > 0:
            dfoutput.loc[sumrep,1] = dfoutput.loc[sumrep,1] + 1  # count repeated occurrences greater than 1 and assign them to the proper row in dfoutput
            sumrep = 0
    elif val1 == 1 and df[val1+1] == 1:
        sumrep = sumrep + 1
Desired output table for column 1 - dfoutput:
I don't understand why there is no simple method to move around a dataframe, like the OFFSET function in VBA in Excel. :/
You can use the function defined here to perform fast run-length-encoding:
import numpy as np
def rlencode(x, dropna=False):
    """
    Run length encoding.
    Based on http://stackoverflow.com/a/32681075, which is based on the rle
    function from R.

    Parameters
    ----------
    x : 1D array_like
        Input array to encode
    dropna : bool, optional
        Drop all runs of NaNs.

    Returns
    -------
    start positions, run lengths, run values
    """
    where = np.flatnonzero
    x = np.asarray(x)
    n = len(x)
    if n == 0:
        return (np.array([], dtype=int),
                np.array([], dtype=int),
                np.array([], dtype=x.dtype))
    starts = np.r_[0, where(~np.isclose(x[1:], x[:-1], equal_nan=True)) + 1]
    lengths = np.diff(np.r_[starts, n])
    values = x[starts]
    if dropna:
        mask = ~np.isnan(values)
        starts, lengths, values = starts[mask], lengths[mask], values[mask]
    return starts, lengths, values
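A quick check of what it returns:

starts, lengths, values = rlencode([0, 0, 1, 1, 1, 0])
# starts  -> array([0, 2, 5])
# lengths -> array([2, 3, 1])
# values  -> array([0, 1, 0])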
With this function your task becomes a lot easier:
import pandas as pd
from collections import Counter
from functools import partial

def get_frequency_of_runs(col, value=1, index=None):
    _, lengths, values = rlencode(col)
    return pd.Series(Counter(lengths[np.where(values == value)]), index=index)

df = pd.DataFrame(data={'1': [0,0,1,0,1,1,0,1,1,0,1,1,1,1,0],
                        '2': [0,0,1,1,1,1,0,0,1,0,1,1,0,1,0]})

df.apply(partial(get_frequency_of_runs, index=df.index)).fillna(0)
#        1    2
# 0    0.0  0.0
# 1    1.0  2.0
# 2    2.0  1.0
# 3    0.0  0.0
# 4    1.0  1.0
# 5    0.0  0.0
# 6    0.0  0.0
# 7    0.0  0.0
# 8    0.0  0.0
# 9    0.0  0.0
# 10   0.0  0.0
# 11   0.0  0.0
# 12   0.0  0.0
# 13   0.0  0.0
# 14   0.0  0.0
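A pandas-only alternative sketch (not part of the original answer): label consecutive runs with a cumulative sum over value changes, then count the lengths of the runs of 1s per column:

def run_length_counts(col, value=1):
    run_id = col.ne(col.shift()).cumsum()               # new id whenever the value changes
    runs = col.groupby(run_id).agg(['first', 'size'])   # value and length of each run
    lengths = runs.loc[runs['first'] == value, 'size']  # keep runs of `value` only
    return lengths.value_counts().reindex(df.index, fill_value=0)

df.apply(run_length_counts)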