How to access xcom_pull outside of a task function in Airflow? - python

Code:
import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world(ti, execution_date, **context):
    logging.info("Hello World")
    return "Gorgeous"


def addition(ti, **context):
    # Want these two lines to log the same value
    logging.info(context['params']["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))


dag = DAG(
    "test",
    schedule_interval="@hourly",
    start_date=datetime.datetime.now() - datetime.timedelta(days=1),
)

t1 = PythonOperator(
    task_id="hello_world", python_callable=hello_world, dag=dag, provide_context=True
)

t2 = PythonOperator(
    task_id="abc",
    python_callable=addition,
    dag=dag,
    params={"please1": "{{{{ ti.xcom_pull(task_ids='{}') }}}}".format(t1.task_id)},
    provide_context=True,
)

t1 >> t2
I want the two lines in addition() to show the same result:
    # Want these two lines to log the same value
    logging.info(context['params']["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))
But the result is:
[2021-05-17 23:47:15,286] {test_dag.py:14} INFO - {{ ti.xcom_pull(task_ids='hello_world') }}
[2021-05-17 23:47:15,291] {test_dag.py:15} INFO - Gorgeous
What I want to know: is it possible to access xcom_pull outside of the task function, e.g. when passing a value from XCom into a PythonOperator?
Thanks!

Jinja-templated args for an operator can only be used for those fields that are listed as template_fields in the operator class. For the PythonOperator that is op_args, op_kwargs, and templates_dict. First, replace your params parameter with op_kwargs and remove the extra curly brackets for Jinja -- only 2 on either side of the expression. Second, and unfortunately, you need to explicitly list the task_id in the ti.xcom_pull(task_ids='<task_id>') call.
Revised code:
import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world(ti, execution_date, **context):
    logging.info("Hello World")
    return "Gorgeous"


def addition(ti, **context):
    logging.info(context["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))


dag = DAG(
    "test",
    schedule_interval=None,
    start_date=datetime.datetime(2021, 5, 17),
    catchup=False,
)

with dag:
    t1 = PythonOperator(
        task_id="hello_world",
        python_callable=hello_world,
        provide_context=True,
    )
    t2 = PythonOperator(
        task_id="abc",
        python_callable=addition,
        op_kwargs={
            "please1": "{{ ti.xcom_pull(task_ids='hello_world') }}",
        },
        provide_context=True,
    )

    t1 >> t2
Logging from "t2":
If you are using Airflow 2.0, the code can actually be simplified to use the new XComArg feature. This feature allows you to access the output of tasks using a simple task.output expression.
Revised code with 2.0 and XComArg used to access the output of "t1" as the "please1" arg:
import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world(ti, execution_date, **context):
    logging.info("Hello World")
    return "Gorgeous"


def addition(ti, **context):
    logging.info(context["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))


dag = DAG(
    "test",
    schedule_interval=None,
    start_date=datetime.datetime(2021, 5, 17),
    catchup=False,
)

with dag:
    t1 = PythonOperator(
        task_id="hello_world",
        python_callable=hello_world,
    )
    t2 = PythonOperator(
        task_id="abc",
        python_callable=addition,
        op_kwargs={"please1": t1.output},
    )

    t1 >> t2
More about DAG authoring with 2.0 here.
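For reference, here is a minimal sketch of the same two tasks written with the 2.0 TaskFlow API; the DAG id and function names are only illustrative, and passing the return value of hello_world() into addition() wires up the XCom automatically:
import datetime
import logging

from airflow.decorators import dag, task


@dag(schedule_interval=None, start_date=datetime.datetime(2021, 5, 17), catchup=False)
def test_taskflow():
    @task
    def hello_world():
        logging.info("Hello World")
        return "Gorgeous"

    @task
    def addition(please1):
        # please1 receives the XCom value returned by hello_world()
        logging.info(please1)

    addition(hello_world())


taskflow_dag = test_taskflow()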

Related

Apache Airflow: create tasks using a for loop in one DAG; I want each task made in the for loop to respond to its own xcom

Tasks that perform the same work in one DAG were created using a for loop. The flow should then split into two branches depending on the result of each such task. However, all branch tasks created in the for loop pull the xcom of the last task. How can each task created in the for loop use its own xcom?
Each task a, b, c returns xcom_a, xcom_b, and xcom_c respectively. However, the branch tasks all get the same xcom_c. What should I do?
default_args = {'start_date': days_ago(1)}

dag = DAG(
    dag_id='batch_test',
    default_args=default_args,
    schedule_interval=None)


def count(**context):
    name = context['params']['name']
    dict = {'a': 50,
            'b': 100,
            'c': 150}
    if dict[name] < 100:
        task_id = f'add_{name}'
        return task_id
    elif dict[name] >= 100:
        task_id = f'times_{name}'
        return task_id


def branch(**context):
    task_id = context['ti'].xcom_pull(task_ids=f'count_task_{name}')
    return task_id


def add(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'add_a': 50 + 100,
                'add_b': 100 + 100,
                'add_c': 150 + 100}
    ans = ans_dict[ans]
    return print(ans)


def times(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'times_a': 50 * 100,
                'times_b': 100 * 100,
                'times_c': 150 * 100}
    ans = ans_dict[ans]
    return print(ans)


name_list = ['a', 'b', 'c']

for name in name_list:
    exec_count_task = PythonOperator(
        task_id=f'count_task_{name}',
        python_callable=count,
        provide_context=True,
        params={'name': name},
        dag=dag
    )
    exec_branch_task = BranchPythonOperator(
        task_id=f'branch_task_{name}',
        python_callable=branch,
        provide_context=True,
        dag=dag
    )
    exec_add_count = PythonOperator(
        task_id=f'add_{name}',
        python_callable=add,
        provide_context=True,
        dag=dag
    )
    exec_times_count = PythonOperator(
        task_id=f'times_{name}',
        python_callable=times,
        provide_context=True,
        dag=dag
    )

    exec_count_task >> exec_branch_task >> [exec_add_count, exec_times_count]
I want this:
task_a >> branch_a (branch python operator, pulls the xcom returned by task_a) >> [task_a1, task_a2]
task_b >> branch_b (branch python operator, pulls the xcom returned by task_b) >> [task_b1, task_b2]
task_c >> branch_c (branch python operator, pulls the xcom returned by task_c) >> [task_c1, task_c2]
but I get:
task_a >> branch_a (branch python operator, pulls the xcom returned by task_c) >> [task_a1, task_a2]
task_b >> branch_b (branch python operator, pulls the xcom returned by task_c) >> [task_b1, task_b2]
task_c >> branch_c (branch python operator, pulls the xcom returned by task_c) >> [task_c1, task_c2]
I'm unable to reproduce the behavior you describe using either classic operators or the TaskFlow API. If you are able to add more context and the code you are actually executing, that would be most helpful.
In the meantime, here are the examples I used, in case they give you some guidance for troubleshooting. I added a task at the end of each stream to check that the first task does indeed push its expected value.
Classic Operators
from pendulum import datetime

from airflow.models import DAG
from airflow.operators.python import BranchPythonOperator, PythonOperator
from airflow.utils.trigger_rule import TriggerRule


with DAG(dag_id="multiple_branch_loop", start_date=datetime(2023, 1, 1), schedule=None):

    def xcom_push(val):
        return val

    def func():
        ...

    def choose(val):
        return f"task_{val}"

    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
        branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
        second = PythonOperator(task_id=f"task_{i}", python_callable=func)
        third = PythonOperator(task_id=f"task_{i}a", python_callable=func)
        check = PythonOperator(
            task_id=f"check_{i}",
            trigger_rule=TriggerRule.ALL_DONE,
            python_callable=check_xcom_output_from_first,
            op_kwargs={"val": first.output, "expected_val": i},
        )

        first >> branch >> [second, third] >> check
The check* tasks succeed, meaning the first task in a given stream does push its own value and not the last stream's.
TaskFlow API
from pendulum import datetime

from airflow.decorators import dag, task
from airflow.utils.trigger_rule import TriggerRule


@dag(start_date=datetime(2023, 1, 1), schedule=None)
def multiple_branch_loop():
    @task()
    def xcom_push(val):
        return val

    @task()
    def func():
        ...

    @task.branch()
    def choose(val):
        return f"task_{val}"

    @task(trigger_rule=TriggerRule.ALL_DONE)
    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = xcom_push.override(task_id=f"first_task_{i}")(val=i)
        branch = choose.override(task_id=f"branch_{i}")(val=first)
        second = func.override(task_id=f"task_{i}")()
        third = func.override(task_id=f"task_{i}a")()
        check = check_xcom_output_from_first.override(task_id=f"check_{i}")(val=first, expected_val=i)

        first >> branch >> [second, third] >> check


multiple_branch_loop()
The same expected behavior is confirmed in the check* tasks:
Your functions branch, add, and times don't define name themselves, so it is taken from the global scope, and at the time the functions execute it holds the last value of for name in name_list. This is a common trap, explained e.g. here: tkinter creating buttons in for loop passing command arguments
To fix it, you can either pull name from the context as in count, or provide it via op_args or op_kwargs when you create the respective operator, as in the answer by Josh Fell:
first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
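The same trap can be seen in plain Python, independent of Airflow: the loop variable is looked up when the callable runs, not when it is defined, so every late-bound callable sees the final value unless you bind it explicitly (a minimal illustration, not Airflow code):
def make_callables_late_binding():
    callables = []
    for name in ["a", "b", "c"]:
        # 'name' is resolved at call time, so every lambda sees the last value "c"
        callables.append(lambda: name)
    return [f() for f in callables]


def make_callables_bound():
    callables = []
    for name in ["a", "b", "c"]:
        # binding 'name' as a default argument captures the current value
        callables.append(lambda name=name: name)
    return [f() for f in callables]


print(make_callables_late_binding())  # ['c', 'c', 'c']
print(make_callables_bound())         # ['a', 'b', 'c']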

Issue running a lambda two times in sequence from Airflow in AWS

I am very new to Airflow. I have a single lambda function that needs to be executed two times in sequence. I am passing the payload from the dag file (say a date); based on this date my lambda fetches records from an API.
The lambda works fine when testing it manually for different dates, but when I create a dag file to invoke them in sequence, the second lambda is executed more than once. Its second execution starts even before the first execution of the lambda has completed.
I am not using any retries in my dag; I am not sure if the issue is on the lambda end or the dag end, or whether it is configuration related.
No errors are noted in the dag, but the lambda log shows multiple invocations of the second task.
Any help with this appreciated!
Thanks!
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta, timezone
import boto3
import json
from airflow.utils.dates import days_ago

args = {'owner': 'airflow', 'start_date': days_ago(1), 'catchup': False, 'provide_context': True}

# 50 13 * * *
dag = DAG('my_dag', schedule_interval=None, default_args=args)


def lambda1_trigger(ds, **kwargs):
    pld = json.dumps({"date": "27/06/2022"})
    lambda_client = boto3.client('lambda', region_name=aws_region, aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_access_key)
    response_1 = lambda_client.invoke(FunctionName='my_lambda_function', InvocationType='RequestResponse', Payload=pld)
    print('Response--->', response_1)


def lambda2_trigger(ds, **kwargs):
    pld = json.dumps({"date": "28/06/2022"})
    lambda_client = boto3.client('lambda', region_name=aws_region, aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_access_key)
    response_1 = lambda_client.invoke(FunctionName='my_lambda_function', InvocationType='RequestResponse', Payload=pld)
    print('Response--->', response_1)


start = DummyOperator(task_id='Begin_execution', dag=dag)
invoke_lambda1 = PythonOperator(task_id="task1", python_callable=lambda1_trigger, provide_context=True, execution_timeout=timedelta(hours=1), dag=dag)
invoke_lambda2 = PythonOperator(task_id="task2", python_callable=lambda2_trigger, provide_context=True, execution_timeout=timedelta(hours=1), dag=dag)
end = DummyOperator(task_id='stop_execution', dag=dag)

start >> invoke_lambda1 >> invoke_lambda2 >> end

airflow.exceptions.DuplicateTaskIdFound exception

Hi, I need to define a DAG with a task that has to be invoked 4 times every day. But when I publish the DAG, I get an airflow.exceptions.DuplicateTaskIdFound error; see my DAG definition below:
import sys
import time
from airflow.models import DAG, Variable
from airflow.operators.bash import BashOperator
from datetime import datetime, timedelta
from airflow.operators.dummy import DummyOperator
from airflow.providers.ssh.hooks.ssh import SSHHook
from airflow.providers.ssh.operators.ssh import SSHOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor
from airflow.utils.dates import days_ago
import pendulum

sys.path.append("../..")
from common.airflow_dep_coordinator import airfow_coordinator
from common import airflow_utli
from common import airflow_config

default_args = airflow_utli.default_args
default_args['owner'] = 'bi'
default_args['sla'] = timedelta(hours=6)
default_args['retries'] = 3
default_args['start_date'] = datetime(2022, 4, 29, tzinfo=airflow_utli.local_tz)  # TODO please override start date of this dag

airflow_config.EMAIL_RECIEVER_LIST = Variable.get("bi_email_reciever_list", deserialize_json=True, default_var=None)


# ---------------------------------------------------------------------------
# Get date string
#   beforeOfDay: the day before current day
#   with_dash: whether the result string contains dashes, default False
# ---------------------------------------------------------------------------
def getdate(beforeOfDay, with_dash=False):
    today = datetime.now()
    offset = timedelta(days=-beforeOfDay)
    str_date_format = '%Y%m%d'
    if with_dash:
        str_date_format = '%Y-%m-%d'
    date_str = (today + offset).strftime(str_date_format)
    return date_str


# ---------------------------------------------------------------------------
# Get etl task, return SSHOperator
# ---------------------------------------------------------------------------
def batch_etl_task(dag, target_table_name, param=None, ssh_conn_id=SSHHOOK_NAME):
    sh_command = COMMAND_BASE + COMMAND.format(target_table_name=target_table_name, param=param)
    print(sh_command)
    task = SSHOperator(
        dag=dag, task_id=target_table_name, ssh_conn_id=ssh_conn_id, command=sh_command
    )
    return task


# =========================================================================
# DAG definition
# =========================================================================
dag = DAG(
    dag_id='lakehouse_dws_otp_app_user_daily',
    default_args=default_args,
    tags=['bi', 'dws', 'otp', 'app'],
    schedule_interval='00 04 * * *',
    dagrun_timeout=timedelta(hours=6),
    concurrency=12,
    catchup=False,
    params={"pday": getdate(2), "fday": getdate(1),
            "pdaym1": getdate(3), "fdaym1": getdate(2),
            "pdaym7": getdate(9), "fdaym7": getdate(8),
            "pdaym30": getdate(32), "fdaym30": getdate(31)
            },
    sla_miss_callback=airflow_utli.default_sla_callback
)

# etl tasks
task_dws_fact_com_otp_app_visitor_snp_t = batch_etl_task(dag=dag, target_table_name='AnotherTargetTable', param='{{ params.pday }} {{ params.fday }}')
task_dws_fact_com_otp_app_active_retention_snp_t = batch_etl_task(dag=dag, target_table_name='target_table', param='{{ params.pday }} {{ params.fday }}')
task_dws_fact_com_otp_app_active_retention_snp_t_m1 = batch_etl_task(dag=dag, target_table_name='target_table', param='{{ params.pdaym1 }} {{ params.fdaym1 }}')
task_dws_fact_com_otp_app_active_retention_snp_t_m7 = batch_etl_task(dag=dag, target_table_name='target_table', param='{{ params.pdaym7 }} {{ params.fdaym7 }}')
task_dws_fact_com_otp_app_active_retention_snp_t_m30 = batch_etl_task(dag=dag, target_table_name='target_table', param='{{ params.pdaym30 }} {{ params.fdaym30 }}')

### End
end_dws_otp_register_daily = DummyOperator(
    task_id='end_dws_otp_register_daily',
    dag=dag)

### Dependencies, TODO please set up the dependency tree
[task_dws_fact_com_otp_app_new_visitor_t, task_dws_fact_com_otp_app_active_au_snp_t] >> task_dws_fact_com_otp_app_active_retention_snp_t >> task_dws_fact_com_otp_app_active_retention_snp_t_m1
As you can see, I need to invoke the etl to update target_table 4 times with different parameters, but this does not work since the DAG doesn't allow me to define the task 4 times with the same target table. Is there any best practice to achieve this in one DAG?
You have implemented batch_etl_task so that the task_id of the SSHOperator is set to target_table_name.
You must pass unique names for the task_id; every task registered in a DAG needs a distinct task_id.
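One common pattern, shown below as a rough sketch reusing the constants from your snippet, is to pass an explicit suffix into the helper so the generated task_id stays unique even when the target table repeats (the task_suffix argument is illustrative, not from the original code):
def batch_etl_task(dag, target_table_name, param=None, ssh_conn_id=SSHHOOK_NAME, task_suffix=None):
    sh_command = COMMAND_BASE + COMMAND.format(target_table_name=target_table_name, param=param)
    # Build a unique task_id, e.g. "target_table_m1", "target_table_m7", ...
    task_id = target_table_name if task_suffix is None else f"{target_table_name}_{task_suffix}"
    return SSHOperator(dag=dag, task_id=task_id, ssh_conn_id=ssh_conn_id, command=sh_command)


task_m1 = batch_etl_task(dag=dag, target_table_name='target_table',
                         param='{{ params.pdaym1 }} {{ params.fdaym1 }}', task_suffix='m1')
task_m7 = batch_etl_task(dag=dag, target_table_name='target_table',
                         param='{{ params.pdaym7 }} {{ params.fdaym7 }}', task_suffix='m7')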

Pass arguments to function from BranchPythonOperator in Airflow

I am running the code below to create a DAG. The DAG is created but the choose_best_model task is failing. The error is: ERROR - _choose_best_model() missing 1 required positional argument: 'ti'. My Airflow version is 1.10.3. How can I resolve this error?
my_dag.py
from airflow import DAG
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.bash_operator import BashOperator
from random import randint
from datetime import datetime


def _choose_best_model(ti):
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    best_accuracy = max(accuracies)
    if (best_accuracy > 8):
        return 'accurate'
    return 'inaccurate'


def _training_model():
    return randint(1, 10)


with DAG("my_dag", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:

    training_model_A = PythonOperator(
        task_id="training_model_A",
        python_callable=_training_model
    )
    training_model_B = PythonOperator(
        task_id="training_model_B",
        python_callable=_training_model
    )
    training_model_C = PythonOperator(
        task_id="training_model_C",
        python_callable=_training_model
    )
    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model
    )
    accurate = BashOperator(
        task_id="accurate",
        bash_command="echo 'accurate'"
    )
    inaccurate = BashOperator(
        task_id="inaccurate",
        bash_command="echo 'inaccurate'"
    )

    [training_model_A, training_model_B, training_model_C] >> choose_best_model >> [accurate, inaccurate]
You need to pass the provide_context parameter to your operator (it extends the PythonOperator, which defines it). You also need to add the kwargs to your function's signature.
The full list of parameters in the context which can be passed to your python_callable can be found here (v1.10.15).
Once you do this, you can also pass additional custom parameters to your function using the op_kwargs parameter.
PythonOperator Airflow docs
[...]

def _choose_best_model(ti, **kwargs):  # <-- here
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    [...]

with DAG("my_dag", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:

    [...]

    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model,
        provide_context=True,  # <-- here
    )

    [...]
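If you also want custom arguments alongside the context, a minimal sketch (the threshold parameter here is purely illustrative, not part of the original question) combines op_kwargs with provide_context:
def _choose_best_model(threshold, ti, **kwargs):
    # 'threshold' comes from op_kwargs, 'ti' comes from the task context
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    return 'accurate' if max(accuracies) > threshold else 'inaccurate'


choose_best_model = BranchPythonOperator(
    task_id="choose_best_model",
    python_callable=_choose_best_model,
    provide_context=True,
    op_kwargs={"threshold": 8},
)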

Use kwargs and context simultaneously in Airflow

I'm using Airflow 1.10.11.
Can I use a TriggerDagRunOperator to pass a parameter to the triggered DAG? From a previous question I know that I can send a parameter using a TriggerDagRunOperator.
But my new question is: can I use the parameter from the dag_run in a function that also uses **kwargs, so that I can retrieve both the xcom values and the dag_run values?
I tried def new_op_fun(**kwargs, **context):, but that is invalid syntax.
Please help, thanks in advance!
dag.py
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id='my_dag',
    schedule_interval='@once',
    start_date=datetime(2021, 1, 1)
)

previous_op = BashOperator(
    task_id="previous_op",
    bash_command='echo "{{ params.week }}"',
    params={'week': '$(date +%V -d \"1 week ago\")'},
    provide_context=True,
    xcom_push=True,
    dag=dag
)


def run_this_func(**context):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='previous_op')
    print(xcom_value)
    print(context["dag_run"].conf)


def new_op_fun(**kwargs):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='previous_op')
    print(xcom_value)
    print(context["dag_run"].conf)
    return "hello"


new_op = PythonOperator(
    task_id='new_op',
    provide_context=True,
    python_callable=new_op_fun,
    xcom_push=True,
    dag=dag)

previous_op >> new_op
trigger.py
from datetime import datetime
from airflow.models import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator

dag = DAG(
    dag_id='trigger',
    schedule_interval='@once',
    start_date=datetime(2021, 1, 1)
)


def modify_dro(context, dagrun_order):
    print(context)
    print(dagrun_order)
    dagrun_order.payload = {
        "message": "This is my conf message"
    }
    return dagrun_order


run_this = TriggerDagRunOperator(
    task_id='run_this',
    trigger_dag_id='my_dag',
    python_callable=modify_dro,
    dag=dag
)

run_this
This works for me.
Instead of using kwargs, pass them in as arguments. For example:
def func_my_task(my_param, **context):
    print(my_param)


t1 = PythonOperator(
    task_id='my_task',
    python_callable=func_my_task,
    op_kwargs={'my_param': 'hello'},
    dag=dag
)
Your limitation is basically that you cannot declare two dynamic dict parameters (**kwargs and **context) in one signature; if you turn one of these values into a string or list, the problem goes away.
1. You can define two functions: in one of them return the value you need from **kwargs as a list or string, then pass it to the function that takes **context as its first parameter (either as a string or a list).
2. Alternatively, use a global variable, fill it each time with the content returned by your first and second functions, and then use those results in your final function.
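For the concrete case in the question, a minimal sketch, assuming provide_context=True on Airflow 1.10.11, reads both the XCom value and the dag_run conf from the single **context dict, so no separate **kwargs is needed:
def new_op_fun(**context):
    # the task instance is available in the context dict
    ti = context['ti']
    xcom_value = ti.xcom_pull(task_ids='previous_op')
    print(xcom_value)

    # dag_run.conf is only populated when the DAG was triggered with a payload
    dag_run = context.get('dag_run')
    conf = dag_run.conf if dag_run else {}
    print(conf.get('message'))
    return "hello"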
