I wrote this np.select but the AND operators don't work!
df = pd.DataFrame({'A': [2107], 'B': [76380700]})
cond = [(df["A"]==2107)|(df["A"]==6316)&(df['B']>=10000000)&(df['B']<=19969999),
(df["A"]==2107)|(df["A"]==6316)&(df['B']>=1000000)&(df['B']<=99999999)]
choices =["Return 1", "Return 2"]
df["C"] = np.select(cond, choices, default = df["A"])
np.select returns "Return 1", but the correct option is "Return 2":
>>df["C"]
0    Return 1
because this line returns False:
>>df["B"]<=19969999
0    False
How can I solve this problem?
It's an operator precedence issue. Here's what you wrote:
cond = [
    (df["A"]==2107) |
    (df["A"]==6316) &
    (df['B']>=10000000) &
    (df['B']<=19969999),

    (df["A"]==2107) |
    (df["A"]==6316) &
    (df['B']>=1000000) &
    (df['B']<=99999999)
]
Here's how that is interpreted:
cond = [
    (df["A"]==2107) |
    (
        (df["A"]==6316) &
        (df['B']>=10000000) &
        (df['B']<=19969999)
    ),

    (df["A"]==2107) |
    (
        (df["A"]==6316) &
        (df['B']>=1000000) &
        (df['B']<=99999999)
    )
]
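That grouping happens because & binds more tightly than | in Python. You can see the same precedence with plain booleans (a quick sketch, nothing pandas-specific):
print(True | False & False)    # True, parsed as True | (False & False)
print((True | False) & False)  # False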
You need parens around the "or" clause:
cond = [
    ( (df["A"]==2107) | (df["A"]==6316) ) &
    (df['B']>=10000000) &
    (df['B']<=19969999),

    ( (df["A"]==2107) | (df["A"]==6316) ) &
    (df['B']>=1000000) &
    (df['B']<=99999999)
]
And, by the way, there is absolutely nothing wrong with writing the expressions the way I did there. Isn't it much clearer what's going on when it's spaced out like that?
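As a quick check, here's a minimal sketch with the one-row frame from the question; with the parentheses in place, np.select picks "Return 2" because B = 76380700 falls outside 10,000,000-19,969,999 but inside 1,000,000-99,999,999:
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [2107], 'B': [76380700]})
cond = [
    ((df["A"] == 2107) | (df["A"] == 6316)) & (df['B'] >= 10000000) & (df['B'] <= 19969999),
    ((df["A"] == 2107) | (df["A"] == 6316)) & (df['B'] >= 1000000) & (df['B'] <= 99999999),
]
df["C"] = np.select(cond, ["Return 1", "Return 2"], default=df["A"])
print(df["C"])  # 0    Return 2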
I think you were missing parentheses around (df["A"]==2107)|(df["A"]==6316). In your script, the condition for "Return 1" was evaluated as (df["A"]==2107)|((df["A"]==6316)&(df['B']>=10000000)&(df['B']<=19969999)), which means A==2107 OR (A==6316 AND B... AND B...). That's why np.select returns "Return 1": that expression is True.
df = pd.DataFrame({'A': [2107], 'B': [76380700]})
cond = [((df["A"]==2107)|(df["A"]==6316))&(df['B']>=10000000)&(df['B']<=19969999),
(df["A"]==2107)|(df["A"]==6316)&(df['B']>=1000000)&(df['B']<=99999999)]
choices =["Return 1", "Return 2"]
df["C"] = np.select(cond, choices, default = df["A"])
BEGIN
open v_refcur for
SELECT A.LGCY_LNDR_NO
, A.LGCY_LNDR_BR_NO
, A.LNDR_NM
, B.ADDR_LINE1_TXT
, B.ADDR_LINE2_TXT
, B.CITY_NM
, B.ST_CD
, B.POSTAL_CD
, C.FAX_NO
FROM LNDR_CUST_XREF A
LEFT OUTER JOIN LNDR_CUST_ADDR B
ON A.LNDR_ID = B.LNDR_ID
AND B.ADDR_TYP_CD = 'MAIL'
LEFT OUTER JOIN LNDR_CUST_ADDR C
ON A.LNDR_ID = C.LNDR_ID
AND C.ADDR_TYP_CD = 'SITE'
WHERE A.LGCY_LNDR_NO = LNDR_NO
AND A.LGCY_LNDR_BR_NO = BRN_NO
AND A.TA_CUST_FLG = 'Y';
SQL_CD := SWV_SQLCODE;
END;
What would be the line-by-line conversion of the above code? I don't have the databases at hand, so what would be the most appropriate gist of this PL/SQL code in PySpark?
This statement can be rewritten to something like the below:
df = (df_LNDR_CUST_XREF.alias('A')
      .join(df_LNDR_CUST_ADDR.alias('B'),
            (F.col("A.LNDR_ID") == F.col("B.LNDR_ID")) & (F.col("B.ADDR_TYP_CD") == 'MAIL'),
            "left")
      .join(df_LNDR_CUST_ADDR.alias('C'),
            (F.col("A.LNDR_ID") == F.col("C.LNDR_ID")) & (F.col("C.ADDR_TYP_CD") == 'SITE'),
            "left")
      .where((F.col("A.LGCY_LNDR_NO") == LNDR_NO)
             & (F.col("A.LGCY_LNDR_BR_NO") == BRN_NO)
             & (F.col("A.TA_CUST_FLG") == 'Y'))
      .select(F.col("A.LGCY_LNDR_NO"),
              F.col("A.LGCY_LNDR_BR_NO"),
              F.col("A.LNDR_NM"),
              F.col("B.ADDR_LINE1_TXT"),
              F.col("B.ADDR_LINE2_TXT"),
              F.col("B.CITY_NM"),
              F.col("B.ST_CD"),
              F.col("B.POSTAL_CD"),
              F.col("C.FAX_NO"))
      )
I haven't tested it, though.
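As a usage sketch (untested; the table names and bind-variable values below are assumptions standing in for whatever your environment actually provides), the DataFrames and the LNDR_NO/BRN_NO parameters could be set up like this:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Assumed sources; replace with however these tables are exposed to Spark in your setup
df_LNDR_CUST_XREF = spark.read.table("LNDR_CUST_XREF")
df_LNDR_CUST_ADDR = spark.read.table("LNDR_CUST_ADDR")

# Stand-ins for the PL/SQL bind variables (hypothetical values)
LNDR_NO = "12345"
BRN_NO = "001"
With those in place, the join/where/select expression above should return the same rows the ref cursor would, and df.show() takes the place of opening v_refcur.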
I'm getting the following error while running the DirectRunner:
can't pickle generator objects [while running 'Assign Side Columns']
The error pops up when Beam writes the "mapped rows" to BigQuery.
Any idea where this error is coming from?
def assign_columns(row, mapping):
    main_account = row['filepath_platform_id']
    main_report = row['filepath_report_name']
    for _mapping in mapping:
        _side_account = _mapping['account_id']
        _side_report = _mapping['report_name']
        if main_account == _side_account and main_report == _side_report:
            _cols = json.loads(_mapping['mapping'])
            row = renames_columns(row, _cols)
    yield row
def run(argv=None, save_main_session=True):
    options = PipelineOptions(flags=argv)
    options.view_as(SetupOptions).save_main_session = save_main_session
    table_spec = 'SELECT * FROM `nextwork-staging.datalake.v_mapping_reports`'
    with beam.Pipeline(options=options) as p:
        common_options = options.view_as(CommonOptions)
        file_metadata = (
            p
            | 'Create empty PCollection' >> beam.Create(['Start'])
            | 'Load Input Location' >> beam.ParDo(CreateFilesFromGlobByDate(common_options.input, None, None, None, common_options.starting_date, common_options.ending_date, common_options.only_last_file_from_input))
        )
        rows = (
            file_metadata | 'GetIncomeAccessFn' >> beam.ParDo(GetIncomeAccessFn())
            | 'Map to regular dictionary' >> beam.Map(lambda x: dict(x))
            | 'TransformDate' >> beam.ParDo(TransformDateFromStringFn())
        )
        side_input = (
            p
            | 'ReadTable' >> beam.io.ReadFromBigQuery(query=table_spec, use_standard_sql=True)
            | 'Map Columns' >> beam.Map(lambda x: dict(x))
        )
        mapped_rows = (
            rows | 'Assign Side Columns' >> beam.Map(assign_columns, mapping=beam.pvalue.AsIter(side_input))
            | 'Rename column' >> beam.Map(renames_columns, names={'site': 'campaign'})
            | 'TransformForBigQuery' >> beam.ParDo(TransformForBigQueryFn())
        )
        gcp_options = options.view_as(GoogleCloudOptions)
        (
            mapped_rows
            | 'WriteToBigQueryUsingParameter' >> bigquery_file_loads.BigQueryBatchFileLoads(
                destination=_get_table_name,
                schema=_get_schema,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_TRUNCATE',
                additional_bq_parameters=_get_additional_bq_parameters,
                custom_gcs_temp_location=gcp_options.temp_location
            )
        )
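For what it's worth, a likely source of that message is visible in the code itself: assign_columns contains yield, so calling it returns a generator object, and beam.Map emits that generator as a single element that later fails to pickle. A minimal sketch of the two shapes (toy dict elements, not your pipeline):
import apache_beam as beam

def emit_rows(row, mapping):
    # Same shape as assign_columns above: because of `yield`, calling this returns a generator
    for _ in mapping:
        yield row

with beam.Pipeline() as p:
    rows = p | beam.Create([{'filepath_platform_id': 1}])
    # beam.Map(emit_rows, ...) would emit the generator itself as one (unpicklable) element;
    # beam.FlatMap iterates it and emits each yielded dict instead.
    out = rows | beam.FlatMap(emit_rows, mapping=[{'account_id': 1}])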
I once wrote this code for work in a hurry. The assumption was that it would only need to run once, but now it has to run often. Unfortunately, I can't provide the source data because it is confidential. Which parts of the code need to be corrected to improve performance? The current version runs for about 2 hours, over about 25K iterations.
dfs = []
for x in tqdm_notebook(range(len(group_furn))):
    good_id = group_furn.iloc[x, 0]
    name_promo = group_furn.iloc[x, 1]
    unique_stores = set(stores.loc[stores.store_type_id != 4]['store_id'].unique()) - \
        set(promo_list.loc[(promo_list['good_id'] == good_id) &
                           (promo_list['name_promo_mech'] == name_promo)].store_id.unique())
    bu_stores = list(unique_stores.intersection(set(bu_furn.store_id.unique())))
    main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) & (bu_store.store_type_id != 4)].main_store_id.unique()
    df = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
                        (promo_list.store_id.isin(main_stores))]
    bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
    df = pd.merge(df, bu, how='inner', left_on='store_id', right_on='main_store_id')
    dfs.append(df)
    main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) & (bu_store.store_type_id == 4)].main_store_id.unique()
    owners = bu_store.loc[bu_store.main_store_id.isin(main_stores)].main_owner_id.unique()
    df_2 = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
                          (promo_list.owner_id.isin(owners))]
    bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
    df_2 = pd.merge(df_2, bu, how='inner', left_on='store_id', right_on='main_store_id')
    dfs.append(df_2)
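Without the data I can only sketch, but the clearest cost is that promo_list, stores and bu_furn are re-filtered from scratch on every one of the ~25K iterations. Hoisting the loop-invariant pieces out and grouping promo_list once should help (column names taken from the code above; untested):
# Loop-invariant pieces, computed once before the loop
non_type4_store_ids = set(stores.loc[stores.store_type_id != 4]['store_id'].unique())
bu_furn_store_ids = set(bu_furn.store_id.unique())

# One groupby instead of three promo_list.loc[...] scans per iteration
promo_by_key = {key: grp for key, grp in promo_list.groupby(['good_id', 'name_promo_mech'])}
empty_promo = promo_list.iloc[0:0]

# Inside the loop, look the slice up instead of re-filtering, e.g.:
# promo_subset = promo_by_key.get((good_id, name_promo), empty_promo)
# unique_stores = non_type4_store_ids - set(promo_subset.store_id.unique())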
I'm doing some work with pandas and Python. I have the following code:
df = pd.read_csv("Request.csv", keep_default_na=False)
df1 = df.loc[(df["Request Status"] == "Closed")]
df1["Request Close-Down Actual"] = pd.to_datetime(df1["Request Close-Down Actual"], errors = 'coerce' )
df3 = df1.loc[(df1["Request Close-Down Actual"] < '2016-11-01') | (df1["Request Close-Down Actual"].isnull())]
df3.set_index("Request ID", inplace = True)
df3.to_csv("Request1.csv")
The issue is that when I run the code I receive the following warning:
A value is trying to be set on a copy of a slice from a DataFrame
df1.loc["Request Close-Down Actual"] = pd.to_datetime(df1["Request Close-Down Actual"], errors = 'coerce' )
Can someone give me a hand with this, please? Thanks.
I tested it and for me it works fine.
The problem should be in the row above:
df1 = df.loc[(df["Request Status"] == "Closed")]
And the solution is to make a copy:
#loc is not necessary
df1 = df[df["Request Status"] == "Closed"].copy()
The error message shows loc - try removing it if you need to select a column, i.e. change:
df1.loc["Request Close-Down Actual"] = pd.to_datetime(df1["Request Close-Down Actual"], errors = 'coerce' )
to:
df1["Request Close-Down Actual"] = pd.to_datetime(df1["Request Close-Down Actual"], errors = 'coerce' )
category = request.GET.get('cat')
from_p = request.GET.get('from')
to_p = request.GET.get('to')
q = Q()
if category:
    q |= Q(category=category)
if from_p:
    q |= Q(price__gt=from_p)
if to_p:
    q |= Q(price__lt=to_p)
s = Sample.objects.filter(q)
www.example.com/?cat=0&from=300&to=600
If the first criterion is category, the from_p and to_p criteria are not applied. How can I fix it?
It works for only one criterion. If the user sets, for example, category and from_p, I need the search to use both criteria.
EDITED
Here it is:
q_cat = Q(category=category) if category else Q()
q_from = Q(price__gt=from_p) if from_p else Q()
q_to = Q(price__lt=to_p) if to_p else Q()
q = q_cat | ( q_from & q_to )
s = Sample.objects.filter( q )
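This works because an empty Q() is a no-op when combined: Q() | x and Q() & x both behave like x alone, so any parameter the user leaves blank simply drops out of the filter. A small sketch (query values hard-coded for illustration):
from django.db.models import Q

# Only "from" is supplied: q_to stays empty, so q_from & q_to reduces to the price constraint
q_from = Q(price__gt=300)
q_to = Q()
combined = q_from & q_to      # equivalent to Q(price__gt=300)
q = Q(category=0) | combined  # category OR the price range, as in the answer above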