I would like to implement the SQL conditions below in PySpark:
-- Keep every row except the (ID, Event) pairs (1, 1), (2, 2), (1, 0) and (2, 0).
SELECT *
FROM table
WHERE NOT (ID = 1 AND Event = 1)
  AND NOT (ID = 2 AND Event = 2)
  AND NOT (ID = 1 AND Event = 0)
  AND NOT (ID = 2 AND Event = 0)
What would be the clean way to do this?
You can use the `filter` function (or `where`) of the DataFrame API.
the equivalent code would be as follows :
# One negated condition per excluded (ID, Event) pair; a row is kept only
# when it matches none of the pairs.
not_id1_event1 = ~((df.ID == 1) & (df.Event == 1))
not_id2_event2 = ~((df.ID == 2) & (df.Event == 2))
not_id1_event0 = ~((df.ID == 1) & (df.Event == 0))
not_id2_event0 = ~((df.ID == 2) & (df.Event == 0))
df.filter(not_id1_event1 & not_id2_event2 & not_id1_event0 & not_id2_event0)
If you're lazy, you can just copy and paste the SQL filter expression into the pyspark filter:
# The condition is passed to filter() as one SQL expression string: the four
# NOT (...) groups below are the WHERE clause of the query, copied unchanged.
df.filter("""
NOT ( ID = 1
AND Event = 1
)
AND NOT ( ID = 2
AND Event = 2
)
AND NOT ( ID = 1
AND Event = 0
)
AND NOT ( ID = 2
AND Event = 0
)
""")
The function I am using keeps applying the red filter condition to rows where it should not apply.
Here is the function I am using:
tolerance = 5


def rag(data):
    """Label every row of ``data`` with a RAG status in a new ``RAG`` column.

    The frame is modified in place and nothing is returned.  The statuses are
    tried in the order green ('G'), amber ('A'), red ('R'); a row matching
    none of them gets an empty string.  VESSEL checks add the module-level
    ``tolerance`` (in days) to ``latedeliverydate`` before comparing it with
    ``m6p``; AIR checks compare the two dates directly.
    """
    is_vessel = data.SHIPMENT_MOT_x == 'VESSEL'
    is_air = data.SHIPMENT_MOT_x == 'AIR'
    late_with_slack = data.latedeliverydate + pd.to_timedelta(tolerance, unit='D')

    vessel_on_time = late_with_slack >= data.m6p
    air_on_time = is_air & (data.latedeliverydate >= data.m6p)

    red_filter = (is_vessel & (late_with_slack < data.m6p)) | (
        is_air & (data.latedeliverydate < data.m6p)
    )
    # ``&`` binds tighter than ``|``: the AIR clause is OR-ed with the whole
    # VESSEL conjunction.  The brackets only make that grouping visible.
    green_filter = (
        is_vessel & (data.M6_proposed == data.m6p) & vessel_on_time
    ) | air_on_time
    # The AIR clause is the same one used for green, and green is tested
    # first, so an AIR row is never labelled amber.
    amber_filter = (
        is_vessel & (data.M6_proposed != data.m6p) & vessel_on_time
    ) | air_on_time

    # First matching condition wins: green, then amber, then red.
    data['RAG'] = np.select(
        [green_filter, amber_filter, red_filter],
        ['G', 'A', 'R'],
        default='',
    )
Here is the solution if you guys are interested.
np.where is useful, but I would not recommend it when there are multiple conditions.
def pmm_rag(data, tolerance_days=None):
    """Return the RAG status ('R', 'G', 'A' or '') for a single row.

    ``data`` is one record exposing ``MOT``, ``m0p``, ``m6p`` and
    ``M6_proposed`` as attributes (presumably a row handed over by
    ``DataFrame.apply(..., axis=1)`` -- confirm against the caller).

    Parameters
    ----------
    data : row-like object
        The record to classify.
    tolerance_days : number, optional
        Days of slack added to ``m0p`` for VESSEL shipments.  When omitted,
        the module-level ``tolerance`` is used, as before.

    Returns
    -------
    str
        'R' when the (slack-adjusted) date is before ``m6p``; 'G' when a
        VESSEL row is in time and ``M6_proposed`` equals ``m6p``, or an AIR
        row is in time; 'A' when a VESSEL row is in time but ``M6_proposed``
        differs from ``m6p``; '' otherwise.
    """
    if tolerance_days is None:
        tolerance_days = tolerance

    # Every test reads the same transport-mode column.  The original red
    # test read SHIPMENT_MOT_x for AIR while all other tests read MOT.
    is_vessel = data.MOT == 'VESSEL'
    is_air = data.MOT == 'AIR'
    slack_date = data.m0p + pd.to_timedelta(tolerance_days, unit='D')

    # Plain boolean operators: these are scalar comparisons, not masks.
    if (is_vessel and slack_date < data.m6p) or (is_air and data.m0p < data.m6p):
        return 'R'
    if (
        is_vessel and data.M6_proposed == data.m6p and slack_date >= data.m6p
    ) or (is_air and data.m0p >= data.m6p):
        return 'G'
    # An in-time AIR row has already returned 'G' above, so only the VESSEL
    # case can reach amber.
    if is_vessel and data.M6_proposed != data.m6p and slack_date >= data.m6p:
        return 'A'
    return ''
I have created a function which checks three columns and applies the conditions I have mentioned in the function. I have set first column(col0) as None. This is how my columns look like:
rule_id col0 col1 col2
50378 2 0 0
50402 12 9 6
52879 0 4 3
Here 'rule_id' column is the index
This is my code:
# For each tx_id group, pivot the rows into a rule_id x date table holding the
# summed rid_fc values (cells with no data are filled with 0).
for i, j, in dframe.groupby('tx_id'):
df1 = pd.DataFrame(j)
df = df1.pivot_table(index = 'rule_id' , columns = ['date'], values =
'rid_fc', aggfunc = np.sum, fill_value = 0)
# Work on the transpose so that each column is one rule_id and rows are dates.
coeff = df.T
# compute the coefficients
# Each column is walked in row order: the running value starts at 100, goes
# back to 100 on a 0 cell and is halved on any other cell.
for name, s in coeff.items():
top = 100 # start at 100
r = []
# NOTE(review): this `i` reuses the name bound by the groupby loop above.
for i, v in enumerate(s):
if v == 0: # reset to 100 on a 0 value
top=100
else:
top = top/2 # else half the previous value
r.append(top)
coeff.loc[:, name] = r # set the whole column in one operation
# transpose back to have a companion dataframe for df
coeff = coeff.T
def build_comp(col1, col2, i, col0 = None):
    """Write the ``comp{i}`` and ``comp{i+1}`` columns of the module-level ``df``.

    ``col1`` is the previous column and ``col2`` the current one; ``col0``,
    when given, is the column before ``col1``.  ``coeff`` is the companion
    frame read for the coefficient of ``col1``.  Both result columns are
    produced with ``np.select``, so the first matching condition wins.
    """
    prev, cur = df[col1], df[col2]
    prev_set = prev != 0
    cur_set = cur != 0
    both_set = prev_set & cur_set
    scaled = cur / prev * coeff[col1]

    # comp{i}: NaN when both are 0, the coefficient when equal, the scaled
    # ratio when both are non-zero and differ, 100 when only `cur` is 0.
    comp_conditions = [(prev == 0) & (cur == 0)]
    comp_choices = [np.nan]
    if col0 is not None:
        # Tested before the equality case, exactly where the original
        # inserted it (position 1).
        comp_conditions.append(prev_set & (cur == 0) & (df[col0] != 0))
        comp_choices.append(25)
    comp_conditions += [
        prev == cur,
        both_set & (prev > cur),
        both_set & (prev < cur),
        prev_set & (cur == 0),
    ]
    comp_choices += [coeff[col1], scaled, scaled, 100]

    # comp{i+1}: the same three cases whether or not col0 was supplied.
    next_conditions = [both_set, cur_set, cur == 0]
    next_choices = [25, 100, np.nan]

    df['comp{}'.format(i)] = np.select(comp_conditions, comp_choices, default = np.nan)
    df['comp{}'.format(i+1)] = np.select(next_conditions, next_choices)
# Compare each column with the one before it.  col_prev is the previous
# column and col_ref the one before that (None on the first pass).
col_ref = None
col_prev = df.columns[0]
# NOTE(review): build_comp writes both comp{i} and comp{i+1}, so pass i+1
# overwrites the comp{i+1} column written by pass i -- confirm this is intended.
for i, col in enumerate(df.columns[1:], 1):
build_comp(col_prev, col, i, col_ref)
col_ref = col_prev
col_prev = col
# With a single column the loop above has nothing to iterate over, so the
# constant comp1 column is created here instead.
if len(df.columns) == 1:
df['comp1'] = [100] * len(df)
'df' is the dataframe which has these columns.There are multiple conditions involved in this function as you can see. I want to add one more , which is both col0 and col1 are None but I don't know how. I tried adding a condition inside if col0 is None: like:
if col1 is None:
conditions.insert(0, (df[col2] != 0)
choices.insert(0, 100)
But it's not working. Suppose I have only one column (col2) and both col0 and col1 are not there, then the result should be like this as per my condition:
rule_id col2 comp1
50378 2 100
51183 3 100
But comp column is not getting created. If you guys could help me achieve that , I'd greatly appreciate it.
Current code(Edit): After using the code #Joël suggested. I made the alterations. This is the code:
def build_comp(col2, i, col0 = None, col1 = None):
# Builds comp{i} and comp{i+1} on the module-level df from the current column
# (col2), the previous one (col1) and, optionally, the one before that (col0).
# NOTE(review): df[col1] is evaluated in the lists below before the
# `col1 is None` test further down, so a call with col1=None would index
# df[None] before that test is reached.
conditions = [(df[col1] == df[col2]) & (df[col1] != 0) & (df[col2] != 0) , (df[col1] != 0) & (df[col2] != 0) & (df[col1] > df[col2]) ,
(df[col1] != 0) & (df[col2] != 0) & (df[col1] < df[col2]) ,(df[col1] != 0) & (df[col2] == 0)]
choices = [50 , df[col2]/df[col1]*50,df[col2]/df[col1]* 25,100]
condition = [(df[col2] != 0) , (df[col2] == 0)]
choice = [100 , np.nan]
if col0 is not None:
conditions.insert(1, (df[col1] != 0) & (df[col2] == 0) &
(df[col0]!= 0))
choices.insert(1, 25)
condition.insert(0,(df[col2] != 0) & (df[col1] != 0))
choice.insert(0, 25)
else:
# Same insert as in the branch above: only `conditions`/`choices` differ
# between the two cases.
condition.insert(0,(df[col2] != 0) & (df[col1] != 0))
choice.insert(0, 25)
if col1 is None:
# Intended single-column rule: 100 where col2 is non-zero, NaN where it is 0.
conditions.insert(0, (df[col2] != 0))
choices.insert(0, 100)
conditions.insert(0, (df[col2] == 0))
choices.insert(0, np.nan)
df['comp{}'.format(i)] = np.select(conditions , choices , default = np.nan)
df['comp{}'.format(i+1)] = np.select(condition , choice)
# Pairwise driver: col_prev is the previous column, col_ref the one before it.
# col1 always receives a real column here (col_prev starts as df.columns[0]),
# and the loop body does not run when df has a single column, so the
# `col1 is None` branch of build_comp is never reached from this loop.
col_ref = None
col_prev = df.columns[0]
for i, col in enumerate(df.columns[1:], 1):
build_comp(col,i, col_ref , col_prev)
col_ref = col_prev
col_prev = col
When I run this code , I am still not getting the comp column. This is what I am getting:
rule_id col2
50378 2
51183 3
But I should get this as per my logic:
rule_id col2 comp1
50378 2 100
51183 3 100
I know there is something wrong with the for loop and col_prev logic but I don't know what.
Edit: For more simplification , this is how my df looks like:
This is what my `df` looks like after applying my code:
But now suppose there is only one timestamp column is present such as this:
Then I want the result to be this:
date 2018-12-11 13:41:51 comp1
rule_id
51183 1 100
52368 1 100
When df has a single column, the for loop gets skipped (i.e. the code in the loop does not get executed).
In order to add a column for the case where df has a single column, add the following code to the end:
# The pairwise loop is skipped for a one-column frame, so give it a
# constant comp1 column here.
if df.shape[1] == 1:
    df['comp1'] = [100 for _ in range(df.shape[0])]
This assumes that rule_id is the row labels. If not, then compare with 2 instead of 1.
Your condition about testing col1 is None is exactly the same as for col0; therefore, this is about setting a default value for col1 so that it may not be provided.
Therefore, your code should be something like this:
def build_comp(col2, i, col0=None, col1=None):
    """Write comparison columns for ``col2`` into the module-level ``df``.

    Parameters
    ----------
    col2 : column label
        The current column.
    i : int
        Index used to name the result columns ``comp{i}`` and ``comp{i+1}``.
    col0 : column label, optional
        The column before ``col1``; adds one extra case to ``comp{i}``.
    col1 : column label, optional
        The previous column.  When omitted there is nothing to compare
        against: only ``comp{i}`` is written (100 where ``col2`` is non-zero,
        NaN where it is 0) and ``col0`` is ignored.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    cur = df[col2]
    comp_name = 'comp{}'.format(i)

    if col1 is None:
        # No previous column.  The earlier version still indexed df[col1]
        # and used condition lists that had never been built on this path.
        df[comp_name] = np.where(cur != 0, 100, np.nan)
        return

    prev = df[col1]
    prev_set = prev != 0
    cur_set = cur != 0
    both_set = prev_set & cur_set

    # np.select takes the first matching condition, so order matters.
    conditions = [(prev == 0) & (cur == 0),
                  prev == cur,
                  both_set & (prev > cur),
                  both_set & (prev < cur),
                  prev_set & (cur == 0)]
    choices = [np.nan,
               50,
               cur / prev * 50,
               cur / prev * 25,
               100]
    if col0 is not None:
        # Checked right after the both-zero case, ahead of plain equality.
        conditions.insert(1, prev_set & (cur == 0) & (df[col0] != 0))
        choices.insert(1, 50)

    # The follow-up column uses the same three cases with or without col0
    # (both branches of the earlier version inserted the identical entry).
    condition = [both_set, cur_set, cur == 0]
    choice = [25, 100, np.nan]

    df[comp_name] = np.select(conditions, choices, default=np.nan)
    df['comp{}'.format(i + 1)] = np.select(condition, choice)
Beware: you use the near-identical names `choices` and `choice` for different things, which makes the code harder to follow.
Why are you using None?
IMO it's better to use NaN.
I have a loop that runs through data frame A (30,000 rows), updates another data frame B, and uses the updated data frame B in later iterations. It is taking too much time and I want to make it faster. Any ideas?
# Walk dataframeA one row at a time.  Each pass may append rows to dataframeB,
# and later filters in the same pass read the updated dataframeB.
# NOTE(review): EndDate_Temp, Units_Temp and EndDate_Temp_DO are used below but
# not assigned anywhere in this snippet (Auth_EndDate is assigned and never
# used) -- presumably they are set in code that is not shown; confirm.
for x in range(0, dataframeA.shape[0]):
AuthorizationID_Temp = dataframeA["AuthorizationID"].iloc[x]
Auth_BeginDate = dataframeA["BeginDate"].iloc[x]
Auth_EndDate = dataframeA["EndDate"].iloc[x]
BeginDate_Temp = pd.to_datetime(Auth_BeginDate).date()
ScriptsFlag = dataframeA["ScriptsFlag"].iloc[x]
Legacy_PlacementID = dataframeA["Legacy_PlacementID"].iloc[x]
Legacy_AncillaryServicesID = dataframeA["Legacy_AncillaryServicesID"].iloc[x]
ProviderID_Temp = dataframeA["ProviderID"].iloc[x]
SRSProcode_Temp = dataframeA["SRSProcode"].iloc[x]
Rate_Temp = dataframeA["Rate"].iloc[x]
# NOTE(review): these two conversions use neither x nor any per-row value.
# If nothing else modifies Scripts2 inside the loop, they could presumably
# be done once before the loop -- confirm.
Scripts2["BeginDate1_SC"] = pd.to_datetime(Scripts2["BeginDate_SC"]).dt.date
Scripts2["EndDate1_SC"] = pd.to_datetime(Scripts2["EndDate_SC"]).dt.date
# BeginDate_Temp = BeginDate_Temp.date()
# EndDate_Temp = EndDate_Temp.date()
# Scripts2 rows flagged N or M with this row's authorization, provider and
# procedure code ...
Scripts_New_Modified1 = Scripts2.loc[
((Scripts2["ScriptsFlag_SC"].isin(["N", "M"])) & (Scripts2["AuthorizationID_SC"] == AuthorizationID_Temp))
& ((Scripts2["ProviderID_SC"] == ProviderID_Temp) & (Scripts2["SRSProcode_SC"] == SRSProcode_Temp)),
:,
]
# ... narrowed to the same begin date, end date and rate; only the
# AuthorizationID_SC column is kept.
Scripts_New_Modified = Scripts_New_Modified1.loc[
(Scripts_New_Modified1["BeginDate1_SC"] == BeginDate_Temp)
& ((Scripts_New_Modified1["EndDate1_SC"] == EndDate_Temp) & (Scripts_New_Modified1["Rate_SC"] == Rate_Temp)),
"AuthorizationID_SC",
]
if ScriptsFlag == "M":
if Legacy_PlacementID is not None:
InsertA = insertA(AuthorizationID_Temp, BeginDate_Temp, EndDate_Temp, Units_Temp, EndDate_Temp_DO)
# NOTE(review): dataframeB is rebound to the result of append() on every
# insert; see the pandas documentation on DataFrame.append / concat for
# what that costs inside a loop.
dataframeB = dataframeB.append(InsertA)
print("ScriptsTemp6 shape is {}".format(dataframeB.shape))
# else:
# ScriptsTemp6 = ScriptsTemp5.copy()
# print('ScriptsTemp6 shape is {}'.format(ScriptsTemp6.shape))
if Legacy_AncillaryServicesID is not None:
InsertB = insertB(AuthorizationID_Temp, BeginDate_Temp, EndDate_Temp, Units_Temp, EndDate_Temp_DO)
dataframeB = dataframeB.append(InsertB)
print("ScriptsTemp7 shape is {}".format(dataframeB.shape))
# Rows already in dataframeB flagged N for the same authorization, provider
# and procedure code, then narrowed by dates and rate.
dataframe_New = dataframeB.loc[
((dataframeB["ScriptsFlag"] == "N") & (dataframeB["AuthorizationID"] == AuthorizationID_Temp))
& ((dataframeB["ProviderID"] == ProviderID_Temp) & (dataframeB["SRSProcode"] == SRSProcode_Temp)),
:,
]
dataframe_New1 = dataframe_New.loc[
(pd.to_datetime(dataframe_New["BeginDate"]).dt.date == BeginDate_Temp)
& ((pd.to_datetime(dataframe_New["EndDate"]).dt.date == EndDate_Temp_DO) & (dataframe_New["Rate"] == Rate_Temp)),
"AuthorizationID",
]
# PLAATN = dataframeA.copy()
Insert1 = insert1(dataframe_New1, BeginDate_Temp, AuthorizationID_Temp, EndDate_Temp, Units_Temp, EndDate_Temp_DO)
# Only the first row returned by insert1 is appended.
if Insert1.shape[0] > 0:
dataframeB = dataframeB.append(Insert1.iloc[0])
# else:
# ScriptsTemp8 = ScriptsTemp7
print("ScriptsTemp8 shape is {}".format(dataframeB.shape))
# Same lookup for rows flagged M.  Here BeginDate/EndDate are compared
# without the to_datetime(...).dt.date conversion used for the N rows above.
dataframe_modified1 = dataframeB.loc[
((dataframeB["ScriptsFlag"] == "M") & (dataframeB["AuthorizationID"] == AuthorizationID_Temp))
& ((dataframeB["ProviderID"] == ProviderID_Temp) & (dataframeB["SRSProcode"] == SRSProcode_Temp)),
:,
]
dataframe_modified = dataframe_modified1.loc[
(dataframe_modified1["BeginDate"] == BeginDate_Temp)
& ((dataframe_modified1["EndDate"] == EndDate_Temp_DO) & (dataframe_modified1["Rate"] == Rate_Temp)),
"AuthorizationID",
]
Insert2 = insert2(
dataframe_modified,
Scripts_New_Modified,
AuthorizationID_Temp,
BeginDate_Temp,
EndDate_Temp,
Units_Temp,
EndDate_Temp_DO,
)
# Only the first row returned by insert2 is appended.
if Insert2.shape[0] > 0:
dataframeB = dataframeB.append(Insert2.iloc[0])
dataframeA has 30,000 rows.
New rows derived from dataframeA should be inserted into dataframeB on every iteration (30,000 iterations).
The updated dataframeB must be used in the middle of each iteration for the filtering conditions.
insertA and insertB are two functions that perform additional filtering.
The loop takes too much time to run for 30,000 rows.
Please suggest ways to make the loop faster in terms of execution time.
I have a dataframe named output -
RAW_ENTITY_NAME ENTITY_TYPE ENTITY_NAME IS_MAIN
01-03-2017 TNRMATDT 01 03 2017 1
04-02-2017 TNRSTRTDT 04 02 2017 1
documents TNRTYPE SIGHT 1
documents TNRDOCSBY NOT FOUND 1
accept TNRDTL accept 1
23 TNRDAYS 23 1
# dtypes is a property, not a method: calling it would try to call the
# returned Series and raise a TypeError.
print(df.dtypes)
RAW_ENTITY_NAME object
ENTITY_TYPE object
ENTITY_NAME object
IS_MAIN object
Note - ENTITY_TYPE = TNRTYPE, ENTITY_NAME = SIGHT AND IS_MAIN = 1 will only come once in the dataframe.
I want to update some values if ENTITY_TYPE is TNRTYPE, ENTITY_NAME = SIGHT AND IS_MAIN = 1.
# IS_MAIN may hold the integer 1 or the string '1'.  Normalise it to text
# once so that every mask below uses the same comparison (the original
# compared against 1 in the lookup and against '1' in the updates).
is_main = output['IS_MAIN'].astype(str).str.strip() == '1'
entity_type = output['ENTITY_TYPE']

# ENTITY_NAME of the main TNRTYPE row, or None when there is no such row
# (indexing [0] on an empty selection would raise).
temp = output.loc[is_main & (entity_type == 'TNRTYPE'), 'ENTITY_NAME']
temp = temp.reset_index(drop=True)
temp = temp[0] if len(temp) else None

if temp == 'SIGHT':
    output.loc[is_main & (entity_type == 'TNRDOCSBY'), 'ENTITY_NAME'] = 'PAYMENT'
    output.loc[is_main & (entity_type == 'TNRDTL'),
               ['ENTITY_NAME', 'RAW_ENTITY_NAME']] = 'NOT APPLICABLE'
    output.loc[is_main & (entity_type == 'TNRDAYS'), 'ENTITY_NAME'] = '0'
    output.loc[is_main & (entity_type == 'TNRDAYS'), 'RAW_ENTITY_NAME'] = ''
    # Both date rows are blanked in the same way, so they share one mask.
    output.loc[is_main & entity_type.isin(['TNRSTRTDT', 'TNRMATDT']),
               ['ENTITY_NAME', 'RAW_ENTITY_NAME']] = ''
The final output is -
RAW_ENTITY_NAME ENTITY_TYPE ENTITY_NAME IS_MAIN
01-03-2017 TNRMATDT 01 03 2017 1
04-02-2017 TNRSTRTDT 04 02 2017 1
documents TNRTYPE SIGHT 1
documents TNRDOCSBY PAYMENT 1
NOT APPLICABLE TNRDTL NOT APPLICABLE 1
TNRDAYS 0 1
As you can see, everything is getting updated except the first two rows, i.e. ENTITY_TYPE = TNRMATDT and TNRSTRTDT.
I want to know why the below code is not giving the desired results.
output.loc[(output['IS_MAIN'] == '1') & (output['ENTITY_TYPE']=='TNRSTRTDT'),
['ENTITY_NAME', 'RAW_ENTITY_NAME']] = ''
output.loc[(output['IS_MAIN'] == '1') & (output['ENTITY_TYPE']=='TNRMATDT'),
['ENTITY_NAME', 'RAW_ENTITY_NAME']] = ''
I would be happy if someone could find the mistake I'm making or suggest a workaround.
thanks a lot.
I had the same problem. All you have to do is make the IS_MAIN column numeric:
df['IS_MAIN'] = df['IS_MAIN'].astype(int)
This should make it work.
For me your solution works fine. I tried to rewrite it to be more readable and to avoid repeating the same conditions:
# ENTITY_NAME of the row(s) where IS_MAIN is '1' and ENTITY_TYPE is TNRTYPE.
temp = output.loc[(output['IS_MAIN'] == '1') &
(output['ENTITY_TYPE'] == 'TNRTYPE'), 'ENTITY_NAME']
# If the values in IS_MAIN are integers, use this lookup instead:
#temp = output.loc[(output['IS_MAIN'] == 1) &
# (output['ENTITY_TYPE'] == 'TNRTYPE'), 'ENTITY_NAME']
if (temp.iat[0] == 'SIGHT'):
# More general alternative that also works when no row matched (temp is empty):
#if (next(iter(temp), 'not match') == 'SIGHT'):
# Build each boolean mask once and combine them below, so that no
# condition is repeated.
m1 = output['IS_MAIN'] == '1'
# If the values in IS_MAIN are integers, use this mask instead:
#m1 = output['IS_MAIN'] == 1
m2 = output['ENTITY_TYPE'] == 'TNRDOCSBY'
m3 = output['ENTITY_TYPE'] == 'TNRDTL'
m4 = output['ENTITY_TYPE'] == 'TNRDAYS'
m5 = output['ENTITY_TYPE'].isin(['TNRMATDT','TNRSTRTDT'])
output.loc[m1 & m2, 'ENTITY_NAME'] = 'PAYMENT'
output.loc[m1 & m3, ['ENTITY_NAME', 'RAW_ENTITY_NAME']] = 'NOT APPLICABLE'
output.loc[m1 & m4, ['ENTITY_NAME']] = '0'
output.loc[m1 & m4, ['RAW_ENTITY_NAME']] = ''
# Both date rows (TNRMATDT and TNRSTRTDT) are blanked with one assignment.
output.loc[m1 & m5, ['ENTITY_NAME', 'RAW_ENTITY_NAME']] = ''
print (output)
RAW_ENTITY_NAME ENTITY_TYPE ENTITY_NAME IS_MAIN
0 TNRMATDT 1
1 TNRSTRTDT 1
2 documents TNRTYPE SIGHT 1
3 documents TNRDOCSBY PAYMENT 1
4 NOT APPLICABLE TNRDTL NOT APPLICABLE 1
5 TNRDAYS 0 1