Update a function in Python when the first two columns don't exist

Update a function in Python when the first two columns don't exist - python

I have created a function that checks three columns and applies the conditions defined inside it. The first column (col0) defaults to None. This is what my columns look like:
rule_id col0 col1 col2
50378 2 0 0
50402 12 9 6
52879 0 4 3
Here the 'rule_id' column is the index.
This is my code:
# Handle each 'tx_id' group on its own; `j` is the sub-frame for that group.
for i, j, in dframe.groupby('tx_id'):
df1 = pd.DataFrame(j)
# One row per rule_id, one column per date; 'rid_fc' is summed and missing cells become 0.
df = df1.pivot_table(index = 'rule_id' , columns = ['date'], values =
'rid_fc', aggfunc = np.sum, fill_value = 0)
# Work on the transpose so that each rule_id becomes a column that can be walked value by value.
coeff = df.T
# compute the coefficients
for name, s in coeff.items():
top = 100 # start at 100
r = []
# NOTE(review): this inner `i` shadows the group key `i` of the outer loop and is not used in the body.
for i, v in enumerate(s):
if v == 0: # reset to 100 on a 0 value
top=100
else:
top = top/2 # else half the previous value
r.append(top)
coeff.loc[:, name] = r # set the whole column in one operation
# transpose back to have a companion dataframe for df
coeff = coeff.T
def build_comp(col1, col2, i, col0 = None):
    """Write the ``comp{i}`` and ``comp{i+1}`` columns into ``df``.

    col1 is the previous column, col2 the current column and col0 (optional)
    the column before col1.  ``df`` and its companion frame ``coeff`` are taken
    from the surrounding scope.
    """
    prev, cur = df[col1], df[col2]
    prev_set, cur_set = prev != 0, cur != 0
    prev_zero, cur_zero = prev == 0, cur == 0
    scaled = cur / prev * coeff[col1]

    # comp{i}: rules are tried in order, the first match wins, NaN if none match.
    conditions = [prev_zero & cur_zero]
    choices = [np.nan]
    if col0 is not None:
        # Extra rule, second in priority, only available when col0 exists.
        conditions.append(prev_set & cur_zero & (df[col0] != 0))
        choices.append(25)
    conditions += [
        prev == cur,
        prev_set & cur_set & (prev > cur),
        prev_set & cur_set & (prev < cur),
        prev_set & cur_zero,
    ]
    choices += [coeff[col1], scaled, scaled, 100]
    df['comp{}'.format(i)] = np.select(conditions, choices, default = np.nan)

    # comp{i+1}: the "both non-zero" rule is tried first whether or not col0 exists.
    df['comp{}'.format(i + 1)] = np.select(
        [cur_set & prev_set, cur_set, cur_zero],
        [25, 100, np.nan],
    )
# Walk adjacent column pairs (col_prev, col); col_ref trails one column further back
# and is None for the first pair.
col_ref = None
col_prev = df.columns[0]
for i, col in enumerate(df.columns[1:], 1):
build_comp(col_prev, col, i, col_ref)
col_ref = col_prev
col_prev = col
# With a single column the loop above never runs, so comp1 is filled with 100 directly.
if len(df.columns) == 1:
df['comp1'] = [100] * len(df)
`df` is the dataframe that has these columns. As you can see, the function involves multiple conditions. I want to add one more, for the case where both col0 and col1 are None, but I don't know how. I tried adding a condition inside `if col0 is None:` like this:
if col1 is None:
    # Only the current column exists: a non-zero value gets 100.
    conditions.insert(0, (df[col2] != 0))
    choices.insert(0, 100)
But it's not working. Suppose I have only one column (col2) and both col0 and col1 are not there, then the result should be like this as per my condition:
rule_id col2 comp1
50378 2 100
51183 3 100
But comp column is not getting created. If you guys could help me achieve that , I'd greatly appreciate it.
Current code (edit): after using the code @Joël suggested, I made some alterations. This is the code:
def build_comp(col2, i, col0 = None, col1 = None):
    """Add the comparison columns for ``col2`` to the surrounding ``df``.

    col2 -- current column (always required).
    i    -- 1-based position, used to name the output columns.
    col0 -- column two steps back, or None when it does not exist.
    col1 -- previous column, or None when ``col2`` is the only column.

    With a previous column, ``comp{i}`` and ``comp{i+1}`` are written.
    Without one, only ``comp{i}`` is written.
    """
    cur = df[col2]
    if col1 is None:
        # Nothing to compare against: 100 for a non-zero value, NaN otherwise.
        # This must be decided before any df[col1] lookup, because df[None]
        # raises a KeyError.
        df['comp{}'.format(i)] = np.select([cur != 0], [100], default = np.nan)
        return

    prev = df[col1]
    both_set = (prev != 0) & (cur != 0)

    # comp{i}: rules are tried in order, first match wins, NaN if none match.
    conditions = [
        (prev == cur) & both_set,
        both_set & (prev > cur),
        both_set & (prev < cur),
        (prev != 0) & (cur == 0),
    ]
    choices = [50, cur / prev * 50, cur / prev * 25, 100]
    if col0 is not None:
        conditions.insert(1, (prev != 0) & (cur == 0) & (df[col0] != 0))
        choices.insert(1, 25)
    df['comp{}'.format(i)] = np.select(conditions, choices, default = np.nan)

    # comp{i+1}: "both non-zero" is tried first whether or not col0 exists.
    condition = [(cur != 0) & (prev != 0), cur != 0, cur == 0]
    choice = [25, 100, np.nan]
    df['comp{}'.format(i + 1)] = np.select(condition, choice)


# Snapshot the column labels first: build_comp adds columns to df while we iterate.
columns = list(df.columns)
col_ref = None
col_prev = columns[0]
if len(columns) == 1:
    # The pairwise loop below is empty for a single column, so call the
    # single-column path explicitly; this creates comp1.
    build_comp(col_prev, 1)
for i, col in enumerate(columns[1:], 1):
    build_comp(col, i, col_ref, col_prev)
    col_ref = col_prev
    col_prev = col
When I run this code , I am still not getting the comp column. This is what I am getting:
rule_id col2
50378 2
51183 3
But I should get this as per my logic:
rule_id col2 comp1
50378 2 100
51183 3 100
I know there is something wrong with the for loop and col_prev logic but I don't know what.
Edit: For more simplification , this is how my df looks like:
This is what my `df` looks like after applying my code:
But now suppose only one timestamp column is present, such as this:
Then I want the result to be this:
date 2018-12-11 13:41:51 comp1
rule_id
51183 1 100
52368 1 100

When df has a single column, the for loop gets skipped (i.e. the code in the loop does not get executed).
In order to add a column for the case where df has a single column, add the following code to the end:
# A frame with exactly one column gets comp1 = 100 on every row.
n_rows, n_cols = df.shape
if n_cols == 1:
    df['comp1'] = [100] * n_rows
This assumes that rule_id is the row index (so it is not counted in `df.columns`). If rule_id is a regular column instead, compare with 2 instead of 1.

Testing whether col1 is None works exactly like the existing test for col0; what is missing is a default value for col1, so that the argument can be omitted.
Therefore, your code should be something like this:
def build_comp(col2, i, col0 = None, col1 = None): # <== changing here
    """Add the comparison columns for ``col2`` to the surrounding ``df``.

    col2 -- current column (always required).
    i    -- 1-based position, used to name the output columns.
    col0 -- column two steps back; may be omitted.
    col1 -- previous column; may be omitted when ``col2`` is the only column.

    With a previous column, ``comp{i}`` and ``comp{i+1}`` are written.
    Without one, only ``comp{i}`` is written.
    """
    cur = df[col2]
    if col1 is None:
        # No previous column: nothing can be compared, and df[None] would raise
        # a KeyError, so handle this case before touching df[col1].
        df['comp{}'.format(i)] = np.select([cur != 0], [100], default = np.nan)
        return

    prev = df[col1] # we can compare

    # Rules for comp{i}; the first matching rule wins, NaN when none match.
    comp_conditions = [(prev == 0) & (cur == 0),
                       (prev == cur),
                       (prev != 0) & (cur != 0) & (prev > cur),
                       (prev != 0) & (cur != 0) & (prev < cur),
                       (prev != 0) & (cur == 0)]
    comp_values = [np.nan,
                   50,
                   cur / prev * 50,
                   cur / prev * 25,
                   100]
    if col0 is not None:
        comp_conditions.insert(1, (prev != 0) & (cur == 0) & (df[col0] != 0))
        # NOTE(review): the question's own code uses 25 for this rule - confirm which value is intended.
        comp_values.insert(1, 50)

    # Rules for comp{i+1}; "both non-zero" comes first whether or not col0 is given,
    # so no if/else on col0 is needed here.
    next_conditions = [(cur != 0) & (prev != 0),
                       (cur != 0),
                       (cur == 0)]
    next_values = [25,
                   100,
                   np.nan]

    df['comp{}'.format(i)] = np.select(comp_conditions, comp_values, default = np.nan)
    df['comp{}'.format(i+1)] = np.select(next_conditions, next_values)
Beware: you use the near-identical names `choices` and `choice` (and likewise `conditions` and `condition`) for different things, which makes the code harder to follow.

Why are you using None?
IMO it's better to use NaN.

Related

Python Numpy: Is there a faster way to modify multiple arrays of the same shape when the changes are the same?

I have multiple numpy arrays that are all the same shape. I have two masks that are also that same shape.
Right now I'm changing each array one at a time. Is there a faster way to do this?
This is a smaller scale example of what I am doing.
# Baseline timing script: the arrays are updated one at a time and the combined
# mask is rebuilt for every assignment.
start = time.time()

mask1 = np.array([[1, 0, 1], [0, 1, 1], [1, 0, 1]])
mask2 = np.array([[0, 0, 0], [1, 0, 1], [0, 0, 1]])

arr1 = np.array([[20, 10, 51], [21, 1, 2], [25, 23, 38]])
arr2 = np.array([[99, 1, 6], [66, 54, 11], [22, 21, 1]])
arr3 = np.array([[23, 2, 3], [55, 2, 16], [90, 37, 1]])
arr4 = np.array([[81, 25, 22], [1, 63, 24], [47, 58, 1]])

# Overwrite with 9999 wherever mask1 is set and mask2 is clear.
for arr in (arr1, arr2, arr3, arr4):
    arr[(mask1 == 1) & (mask2 == 0)] = 9999

print(time.time() - start)

At @hpaulj's suggestion, I did the following and saw some improvement!
import timeit
def func1():
    """Baseline variant: rebuild the two-mask condition for each of the four arrays."""
    mask1 = np.array([[1, 0, 1], [0, 1, 1], [1, 0, 1]])
    mask2 = np.array([[0, 0, 0], [1, 0, 1], [0, 0, 1]])
    arr1 = np.array([[20, 10, 51], [21, 1, 2], [25, 23, 38]])
    arr2 = np.array([[99, 1, 6], [66, 54, 11], [22, 21, 1]])
    arr3 = np.array([[23, 2, 3], [55, 2, 16], [90, 37, 1]])
    arr4 = np.array([[81, 25, 22], [1, 63, 24], [47, 58, 1]])
    for target in (arr1, arr2, arr3, arr4):
        # The combined condition is deliberately recomputed per array.
        target[(mask1 == 1) & (mask2 == 0)] = 9999
def func2():
    """Variant that folds mask2 into mask1 once, then tests only mask1 per array."""
    mask1 = np.array([[1, 0, 1], [0, 1, 1], [1, 0, 1]])
    mask2 = np.array([[0, 0, 0], [1, 0, 1], [0, 0, 1]])
    # Clear every mask1 entry that mask2 would have excluded.
    mask1[(mask1 == 1) & (mask2 != 0)] = 0
    arr1 = np.array([[20, 10, 51], [21, 1, 2], [25, 23, 38]])
    arr2 = np.array([[99, 1, 6], [66, 54, 11], [22, 21, 1]])
    arr3 = np.array([[23, 2, 3], [55, 2, 16], [90, 37, 1]])
    arr4 = np.array([[81, 25, 22], [1, 63, 24], [47, 58, 1]])
    for target in (arr1, arr2, arr3, arr4):
        target[(mask1 == 1)] = 9999
# Time one million calls of each variant; the comment under each print is
# presumably the poster's measured total in seconds.
print(timeit.timeit(func1, number=1000000))
#17.574998669995693
print(timeit.timeit(func2, number=1000000))
#15.318040108977584

What I was trying to suggest was:
# Build the combined boolean mask once and reuse it for every array.
mask3 = (mask1 == 1) & (mask2 == 0)
for arr in (arr1, arr2, arr3, arr4):
    arr[mask3] = 9999

Filter spark dataframe with multiple conditions on multiple columns in Pyspark

I would like to implement the below SQL conditions in Pyspark
SELECT *
FROM table
WHERE NOT ( ID = 1
AND Event = 1
)
AND NOT ( ID = 2
AND Event = 2
)
AND NOT ( ID = 1
AND Event = 0
)
AND NOT ( ID = 2
AND Event = 0
)
What would be the clean way to do this?

You can use the `filter` function (or `where`) of the DataFrame API.
The equivalent code would be as follows:
# Rows to drop: any of the four (ID, Event) combinations.
excluded = (
    ((df.ID == 1) & (df.Event == 1))
    | ((df.ID == 2) & (df.Event == 2))
    | ((df.ID == 1) & (df.Event == 0))
    | ((df.ID == 2) & (df.Event == 0))
)
# NOT (a OR b OR c OR d) is the same as NOT a AND NOT b AND NOT c AND NOT d.
df.filter(~excluded)

If you're lazy, you can just copy and paste the SQL filter expression into the pyspark filter:
# Pass the body of the SQL WHERE clause to filter() unchanged, as a string.
df.filter("""
NOT ( ID = 1
AND Event = 1
)
AND NOT ( ID = 2
AND Event = 2
)
AND NOT ( ID = 1
AND Event = 0
)
AND NOT ( ID = 2
AND Event = 0
)
""")

making a Pandas while loop faster

I have a loop that runs through a data frame A of 30000 rows, updates another data frame B, and uses data frame B in later iterations. It is taking too much time and I want to make it faster. Any ideas?
# Process dataframeA one row at a time; each row may append rows to dataframeB,
# and the grown dataframeB is filtered again later in the same iteration.
for x in range(0, dataframeA.shape[0]):
# Pull the fields of row x into scalars.
AuthorizationID_Temp = dataframeA["AuthorizationID"].iloc[x]
Auth_BeginDate = dataframeA["BeginDate"].iloc[x]
# NOTE(review): Auth_EndDate is assigned but not read anywhere in this snippet.
Auth_EndDate = dataframeA["EndDate"].iloc[x]
BeginDate_Temp = pd.to_datetime(Auth_BeginDate).date()
ScriptsFlag = dataframeA["ScriptsFlag"].iloc[x]
Legacy_PlacementID = dataframeA["Legacy_PlacementID"].iloc[x]
Legacy_AncillaryServicesID = dataframeA["Legacy_AncillaryServicesID"].iloc[x]
ProviderID_Temp = dataframeA["ProviderID"].iloc[x]
SRSProcode_Temp = dataframeA["SRSProcode"].iloc[x]
Rate_Temp = dataframeA["Rate"].iloc[x]
# NOTE(review): these two conversions read only Scripts2 and do not depend on x, so the
# same result is recomputed on every iteration - candidates for hoisting above the loop.
Scripts2["BeginDate1_SC"] = pd.to_datetime(Scripts2["BeginDate_SC"]).dt.date
Scripts2["EndDate1_SC"] = pd.to_datetime(Scripts2["EndDate_SC"]).dt.date
# BeginDate_Temp = BeginDate_Temp.date()
# EndDate_Temp = EndDate_Temp.date()
# NOTE(review): EndDate_Temp, EndDate_Temp_DO and Units_Temp are used below but never
# assigned in this snippet; presumably they are defined elsewhere - confirm.
# Scripts2 rows flagged "N" or "M" that match this row's authorization, provider and procedure code.
Scripts_New_Modified1 = Scripts2.loc[
((Scripts2["ScriptsFlag_SC"].isin(["N", "M"])) & (Scripts2["AuthorizationID_SC"] == AuthorizationID_Temp))
& ((Scripts2["ProviderID_SC"] == ProviderID_Temp) & (Scripts2["SRSProcode_SC"] == SRSProcode_Temp)),
:,
]
# Narrow further to the same begin date, end date and rate; keep only the authorization ids.
Scripts_New_Modified = Scripts_New_Modified1.loc[
(Scripts_New_Modified1["BeginDate1_SC"] == BeginDate_Temp)
& ((Scripts_New_Modified1["EndDate1_SC"] == EndDate_Temp) & (Scripts_New_Modified1["Rate_SC"] == Rate_Temp)),
"AuthorizationID_SC",
]
if ScriptsFlag == "M":
if Legacy_PlacementID is not None:
InsertA = insertA(AuthorizationID_Temp, BeginDate_Temp, EndDate_Temp, Units_Temp, EndDate_Temp_DO)
# NOTE(review): DataFrame.append returns a new frame on each call, so dataframeB is
# copied every time it grows - likely a large share of the runtime; confirm by profiling.
dataframeB = dataframeB.append(InsertA)
print("ScriptsTemp6 shape is {}".format(dataframeB.shape))
# else:
# ScriptsTemp6 = ScriptsTemp5.copy()
# print('ScriptsTemp6 shape is {}'.format(ScriptsTemp6.shape))
if Legacy_AncillaryServicesID is not None:
InsertB = insertB(AuthorizationID_Temp, BeginDate_Temp, EndDate_Temp, Units_Temp, EndDate_Temp_DO)
dataframeB = dataframeB.append(InsertB)
print("ScriptsTemp7 shape is {}".format(dataframeB.shape))
# Rows already in dataframeB with flag "N" for the same authorization, provider and procedure code.
dataframe_New = dataframeB.loc[
((dataframeB["ScriptsFlag"] == "N") & (dataframeB["AuthorizationID"] == AuthorizationID_Temp))
& ((dataframeB["ProviderID"] == ProviderID_Temp) & (dataframeB["SRSProcode"] == SRSProcode_Temp)),
:,
]
dataframe_New1 = dataframe_New.loc[
(pd.to_datetime(dataframe_New["BeginDate"]).dt.date == BeginDate_Temp)
& ((pd.to_datetime(dataframe_New["EndDate"]).dt.date == EndDate_Temp_DO) & (dataframe_New["Rate"] == Rate_Temp)),
"AuthorizationID",
]
# PLAATN = dataframeA.copy()
Insert1 = insert1(dataframe_New1, BeginDate_Temp, AuthorizationID_Temp, EndDate_Temp, Units_Temp, EndDate_Temp_DO)
# Only the first row returned by insert1 is appended.
if Insert1.shape[0] > 0:
dataframeB = dataframeB.append(Insert1.iloc[0])
# else:
# ScriptsTemp8 = ScriptsTemp7
print("ScriptsTemp8 shape is {}".format(dataframeB.shape))
# Same lookup as above, this time for rows flagged "M".
dataframe_modified1 = dataframeB.loc[
((dataframeB["ScriptsFlag"] == "M") & (dataframeB["AuthorizationID"] == AuthorizationID_Temp))
& ((dataframeB["ProviderID"] == ProviderID_Temp) & (dataframeB["SRSProcode"] == SRSProcode_Temp)),
:,
]
# NOTE(review): unlike the "N" lookup above, the date columns are compared here without
# pd.to_datetime(...).dt.date - confirm that both sides have the same type.
dataframe_modified = dataframe_modified1.loc[
(dataframe_modified1["BeginDate"] == BeginDate_Temp)
& ((dataframe_modified1["EndDate"] == EndDate_Temp_DO) & (dataframe_modified1["Rate"] == Rate_Temp)),
"AuthorizationID",
]
Insert2 = insert2(
dataframe_modified,
Scripts_New_Modified,
AuthorizationID_Temp,
BeginDate_Temp,
EndDate_Temp,
Units_Temp,
EndDate_Temp_DO,
)
# Only the first row returned by insert2 is appended.
if Insert2.shape[0] > 0:
dataframeB = dataframeB.append(Insert2.iloc[0])
dataframeA has 30000 rows.
On every iteration (30000 in total), new rows derived from dataframeA should be inserted into dataframeB.
The updated dataframeB has to be used in the middle of each iteration for the filtering conditions.
insertA and insertB are two functions that apply additional filtering.
Because of all this, the loop takes too much time to run for 30000 rows.
Please suggest ways to make the loop faster in terms of execution time.

Sort QTableView QDateTime Column instead of string sort

In my QTableView there are four columns. Column 0 holds a date in dd-MM-yyyy format, and the other three columns contain strings, so sorting them is not a problem (it can be done with the QSortFilterProxyModel class). For column 0, however, I want the date to be compared from right to left (i.e. by year, then month, then day), for both ascending and descending order.

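Here is a simple example of a custom sorting model. It sorts column 2 as a number and column 3 as a "d/M/yy" date; for the table in the question, use column 0 and the format "dd-MM-yyyy" instead.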
# The view class is QTableView; QtGui has no class named TableView.
self.tableView = QtGui.QTableView(self)
self.table_model = QtGui.QStandardItemModel(0, 0)
# Put the custom proxy between the source model and the view so that the view
# sorts through CustomSortingModel.lessThan.
self.proxyModel = CustomSortingModel(self)
self.proxyModel.setSourceModel(self.table_model)
self.tableView.setModel(self.proxyModel)
class CustomSortingModel(QtGui.QSortFilterProxyModel):
    """Proxy model that compares column 2 as floats and column 3 as dates."""

    def lessThan(self, left, right):
        """Return True when the item at ``left`` sorts before the item at ``right``."""
        lhs, rhs = left.data(), right.data()
        column = left.column()
        if column == 2:
            return float(lhs) < float(rhs)
        if column == 3:
            # Parse "d/M/yy" text, then shift by 100 years; presumably this moves
            # two-digit years parsed as 19xx into 20xx - confirm.
            parse = QtCore.QDateTime.fromString
            return parse(lhs, "d/M/yy").addYears(100) < parse(rhs, "d/M/yy").addYears(100)
        # Every other column: compare the raw item data.
        return lhs < rhs

Concise way updating values based on column values

Background: I have a DataFrame whose values I need to update using some very specific conditions. The original implementation I inherited used a lot of nested if statements wrapped in a for loop, obfuscating what was going on. With readability primarily in mind, I rewrote it into this:
# NOTE: the column is read as df['product'] because df.product resolves to the
# DataFrame.product() method, not the column; comparing that method with 0 is
# simply False, so none of the updates below would ever match.

# Other Widgets
df.loc[(
    (df['product'] == 0) &
    (df.prod_type == 'OtherWidget') &
    (df.region == 'US')
), 'product'] = 5
# Supplier X - All clients
df.loc[(
    (df['product'] == 0) &
    (df.region.isin(['UK','US'])) &
    (df.supplier == 'X')
), 'product'] = 6
# Supplier Y - Client A
df.loc[(
    (df['product'] == 0) &
    (df.region.isin(['UK','US'])) &
    (df.supplier == 'Y') &
    (df.client == 'A')
), 'product'] = 1
# Supplier Y - Client B
df.loc[(
    (df['product'] == 0) &
    (df.region.isin(['UK','US'])) &
    (df.supplier == 'Y') &
    (df.client == 'B')
), 'product'] = 3
# Supplier Y - Client C
df.loc[(
    (df['product'] == 0) &
    (df.region.isin(['UK','US'])) &
    (df.supplier == 'Y') &
    (df.client == 'C')
), 'product'] = 4
Problem: This works well, and makes the conditions clear (in my opinion), but I'm not entirely happy because it takes up a lot of space. Is there any way to improve this from a readability/conciseness perspective?

Per EdChum's recommendation, I created a mask for the conditions. The code below goes a bit overboard in terms of masking, but it gives the general sense.
# One named boolean mask per elementary condition.
ptype_OW = df.prod_type == 'OtherWidget'
rgn_UKUS = df.region.isin(['UK', 'US'])  # isin is a method: call it, do not subscript it
rgn_US = df.region == 'US'
supp_X = df.supplier == 'X'
supp_Y = df.supplier == 'Y'
clnt_A = df.client == 'A'
clnt_B = df.client == 'B'
clnt_C = df.client == 'C'

# (mask, new product value) pairs, in priority order.
rules = [
    (ptype_OW & rgn_US, 5),
    (rgn_UKUS & supp_X, 6),
    (rgn_UKUS & supp_Y & clnt_A, 1),
    (rgn_UKUS & supp_Y & clnt_B, 3),
    (rgn_UKUS & supp_Y & clnt_C, 4),
]
for mask, value in rules:
    # Re-test product == 0 on every pass so that a row already updated by an
    # earlier rule is not overwritten by a later one. The column is read as
    # df['product'] because df.product is the DataFrame.product() method, and
    # the write goes to the existing 'product' column.
    df.loc[(df['product'] == 0) & mask, 'product'] = value

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Update a function in Python when the first two columns don't exist - python

Why are you using None? IMO it's better to use NaN.

Related

Python Numpy: Is there a faster way to modify multiple arrays of the same shape when the changes are the same?

Filter spark dataframe with multiple conditions on multiple columns in Pyspark

making a Pandas while loop faster

Sort QTableView QDateTime Column instead of string sort

Concise way updating values based on column values

Categories

Resources