Does Airflow cache a global variable when tasks are rerun? - python

I have an Airflow job like the one below:
import time
job_id = int(time.time())
airflow_job1 = PythonOperator(op_kwargs={"job_id" : job_id}, ...)
airflow_job2 = BashOperator(op_kwargs={"job_id" : job_id}, ...)
airflow_job1 >> airflow_job2
I know that every time the script is parsed I get a new job_id, which is used in each Airflow task. But I wonder what happens if I rerun from the middle, e.g. airflow_job1 failed, I fix the problem and rerun from airflow_job1 in the UI: is a new job_id generated for the rerun, or does Airflow use the previous job_id?

Actually, after checking with a simple case:
# global parameter
job_id = int(time.time())

def airflow_job1(job_id, **context):
    print("in airflow_job1, current timestamp: %s" % job_id)

def airflow_job2(job_id, **context):
    print("in airflow_job2, current timestamp: %s" % job_id)

airflow_job1 = PythonOperator(
    task_id='airflow_job1',
    provide_context=True,
    python_callable=airflow_job1,
    op_kwargs={'job_id': job_id},
    dag=globals()[dag_name]
)
airflow_job2 = PythonOperator(
    task_id='airflow_job2',
    provide_context=True,
    python_callable=airflow_job2,
    op_kwargs={'job_id': job_id},
    dag=globals()[dag_name]
)
airflow_job1 >> airflow_job2
I find that job_id in airflow_job1 and airflow_job2 are different even within the same run.
So the conclusion is that we shouldn't set a global parameter this way; using xcom_push / xcom_pull may be the way to solve it.
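For example, here is a minimal sketch of the XCom approach (not from the original post; it assumes a dag object named dag is already defined and uses the Airflow 2 import path, and the task names generate_job_id / use_job_id are made up for illustration). The job_id is generated inside a task, so rerunning the downstream task pulls the value already pushed for that DAG run instead of re-evaluating top-level code:

import time
from airflow.operators.python import PythonOperator

def _generate_job_id(**context):
    # the return value is pushed to XCom automatically (key "return_value")
    return int(time.time())

def _use_job_id(**context):
    # pull the value pushed by the upstream task for this DAG run
    job_id = context["ti"].xcom_pull(task_ids="generate_job_id")
    print("using job_id: %s" % job_id)

generate_job_id = PythonOperator(task_id="generate_job_id", python_callable=_generate_job_id, dag=dag)
use_job_id = PythonOperator(task_id="use_job_id", python_callable=_use_job_id, dag=dag)
generate_job_id >> use_job_id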

Related

Apache Airflow: create tasks using a for loop in one DAG; I want each task made by the for loop to use its own XCom

A task that performs the same work is created several times in one DAG using a for loop, and each one should branch into two downstream tasks depending on its own result. However, all the branch tasks created in the loop read the XCom of the last task. How can the tasks created in the for loop each read their own XCom?
Each task a, b, c returns xcom_a, xcom_b, and xcom_c, but the branch tasks all get the same xcom_c. What should I do?
default_args = {'start_date': days_ago(1)}
dag = DAG(
    dag_id='batch_test',
    default_args=default_args,
    schedule_interval=None)

def count(**context):
    name = context['params']['name']
    dict = {'a': 50,
            'b': 100,
            'c': 150}
    if dict[name] < 100:
        task_id = f'add_{name}'
        return task_id
    elif dict[name] >= 100:
        task_id = f'times_{name}'
        return task_id

def branch(**context):
    task_id = context['ti'].xcom_pull(task_ids=f'count_task_{name}')
    return task_id

def add(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'add_a': 50 + 100,
                'add_b': 100 + 100,
                'add_c': 150 + 100}
    ans = ans_dict[ans]
    return print(ans)

def times(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'times_a': 50 * 100,
                'times_b': 100 * 100,
                'times_c': 150 * 100}
    ans = ans_dict[ans]
    return print(ans)

name_list = ['a', 'b', 'c']
for name in name_list:
    exec_count_task = PythonOperator(
        task_id=f'count_task_{name}',
        python_callable=count,
        provide_context=True,
        params={'name': name},
        dag=dag
    )
    exec_branch_task = BranchPythonOperator(
        task_id=f'branch_task_{name}',
        python_callable=branch,
        provide_context=True,
        dag=dag
    )
    exec_add_count = PythonOperator(
        task_id=f'add_{name}',
        python_callable=add,
        provide_context=True,
        dag=dag
    )
    exec_times_count = PythonOperator(
        task_id=f'times_{name}',
        python_callable=times,
        provide_context=True,
        dag=dag
    )
    exec_count_task >> exec_branch_task >> [exec_add_count, exec_times_count]
I want this:
task_a >> branch_a (BranchPythonOperator, xcom pull of the value returned by task_a) >> [task_a1, task_a2]
task_b >> branch_b (BranchPythonOperator, xcom pull of the value returned by task_b) >> [task_b1, task_b2]
task_c >> branch_c (BranchPythonOperator, xcom pull of the value returned by task_c) >> [task_c1, task_c2]
but I get this:
task_a >> branch_a (BranchPythonOperator, xcom pull of the value returned by task_c) >> [task_a1, task_a2]
task_b >> branch_b (BranchPythonOperator, xcom pull of the value returned by task_c) >> [task_b1, task_b2]
task_c >> branch_c (BranchPythonOperator, xcom pull of the value returned by task_c) >> [task_c1, task_c2]
I'm unable to reproduce the behavior you describe using classic operators and the TaskFlow API. If you are able to add more context and code of what you are actually executing that would be most helpful.
In the meantime, here are the examples I used, in case they give you some guidance for troubleshooting. I added a task at the end of the streams to check that the first task indeed pushes its expected value.
Classic Operators
from pendulum import datetime
from airflow.models import DAG
from airflow.operators.python import BranchPythonOperator, PythonOperator
from airflow.utils.trigger_rule import TriggerRule

with DAG(dag_id="multiple_branch_loop", start_date=datetime(2023, 1, 1), schedule=None):

    def xcom_push(val):
        return val

    def func():
        ...

    def choose(val):
        return f"task_{val}"

    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
        branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
        second = PythonOperator(task_id=f"task_{i}", python_callable=func)
        third = PythonOperator(task_id=f"task_{i}a", python_callable=func)
        check = PythonOperator(
            task_id=f"check_{i}",
            trigger_rule=TriggerRule.ALL_DONE,
            python_callable=check_xcom_output_from_first,
            op_kwargs={"val": first.output, "expected_val": i},
        )
        first >> branch >> [second, third] >> check
The check* tasks succeed, meaning the first task in a given stream does push its own value and not the last stream's.
TaskFlow API
from pendulum import datetime
from airflow.decorators import dag, task
from airflow.utils.trigger_rule import TriggerRule

@dag(start_date=datetime(2023, 1, 1), schedule=None)
def multiple_branch_loop():

    @task()
    def xcom_push(val):
        return val

    @task()
    def func():
        ...

    @task.branch()
    def choose(val):
        return f"task_{val}"

    @task(trigger_rule=TriggerRule.ALL_DONE)
    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = xcom_push.override(task_id=f"first_task_{i}")(val=i)
        branch = choose.override(task_id=f"branch_{i}")(val=first)
        second = func.override(task_id=f"task_{i}")()
        third = func.override(task_id=f"task_{i}a")()
        check = check_xcom_output_from_first.override(task_id=f"check_{i}")(val=first, expected_val=i)
        first >> branch >> [second, third] >> check

multiple_branch_loop()
The same expected behavior is confirmed by the check* tasks.
Your functions branch, add, and times don't define name themselves, so it is taken from the global context, which at the time of function execution holds the last value of for name in name_list. This is a common trap, explained e.g. here: tkinter creating buttons in for loop passing command arguments.
To fix it, you can either pull name from the context, as in count, or provide it via op_args or op_kwargs when you create the respective operator, as in the answer by Josh Fell:
first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
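Applied to the question's DAG, that fix could look like the sketch below (an adaptation for illustration, not the original answer's code): each callable takes name as an explicit argument supplied through op_kwargs, so it no longer reads the loop variable from the global scope.

def branch(name, **context):
    # pull the task_id returned by this stream's count task
    return context['ti'].xcom_pull(task_ids=f'count_task_{name}')

def add(name, **context):
    chosen = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    print(chosen)

for name in name_list:
    exec_branch_task = BranchPythonOperator(
        task_id=f'branch_task_{name}',
        python_callable=branch,
        op_kwargs={'name': name},
        provide_context=True,
        dag=dag
    )
    exec_add_count = PythonOperator(
        task_id=f'add_{name}',
        python_callable=add,
        op_kwargs={'name': name},
        provide_context=True,
        dag=dag
    )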

Airflow variables getting updated even if the DAG is not running

I am reading an integer variable from Airflow Variables, incrementing it by one each time the DAG runs, and setting it back on the Variable.
But with the code below, the variable in the UI changes every time the page is refreshed or so.
I don't know what is causing this behavior.
counter = Variable.get('counter')

s = BashOperator(
    task_id='echo_start_variable',
    bash_command='echo ' + counter,
    dag=dag,
)

Variable.set("counter", int(counter) + 1)

sql_query = "SELECT * FROM UNNEST(SEQUENCE({start}, {end}))"
sql_query = sql_query.replace('{start}', start).replace('{end}', end)

submit_query = PythonOperator(
    task_id='submit_athena_query',
    python_callable=run_athena_query,
    op_kwargs={'query': sql_query, 'db': 'db',
               's3_output': 's3://s3-path/rohan/date=' + current_date + '/'},
    dag=dag)

e = BashOperator(
    task_id='echo_end_variable',
    bash_command='echo ' + counter,
    dag=dag,
)
s >> submit_query >> e
Airflow processes the DAG file every 30 seconds (the default of the min_file_process_interval setting). This means any top-level code you have runs every 30 seconds, so Variable.set("counter", int(counter) + 1)
will cause the Variable counter to be increased by 1 every 30 seconds.
It's bad practice to interact with Variables in top-level code (regardless of the increasing-value issue). It opens a connection to the metastore database every 30 seconds, which may cause serious problems and overwhelm the database.
To get the value of the Variable you can use Jinja:
e = BashOperator(
    task_id='echo_end_variable',
    bash_command='echo {{ var.value.counter }}',
    dag=dag,
)
This is a safe way to use variables as the value is being retrieved only when the operator is executed.
If you want to increase the value of the variable by 1, then do it with a PythonOperator:
def increase():
    counter = Variable.get('counter')
    Variable.set("counter", int(counter) + 1)

increase_op = PythonOperator(
    task_id='increase_task',
    python_callable=increase,
    dag=dag)
The python callable will be executed only when the operator runs.

Use kwargs and context simultaneously in Airflow

I'm using Airflow 1.10.11.
Can I use a TriggerDagRunOperator to pass a parameter to the triggered DAG? From a previous question I know that I can send a parameter using a TriggerDagRunOperator.
But my new question is: can I use the parameter from the dag_run in a function that also takes **kwargs, so that I can retrieve both the XCom values and the dag_run values?
I tried def new_op_fun(**kwargs, **context):, but that is invalid syntax.
Please help, thanks in advance!
dag.py
from datetime import datetime
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id='my_dag',
    schedule_interval='@once',
    start_date=datetime(2021, 1, 1)
)

previous_op = BashOperator(
    task_id="previous_op",
    bash_command='echo "{{ params.week }}"',
    params={'week': '$(date +%V -d \"1 week ago\")'},
    provide_context=True,
    xcom_push=True,
    dag=dag
)

def run_this_func(**context):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='previous_op')
    print(xcom_value)
    print(context["dag_run"].conf)

def new_op_fun(**kwargs):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='previous_op')
    print(xcom_value)
    print(context["dag_run"].conf)
    return "hello"

new_op = PythonOperator(
    task_id='new_op',
    provide_context=True,
    python_callable=new_op_fun,
    xcom_push=True,
    dag=dag)
previous_op >> new_op
trigger.py
from datetime import datetime
from airflow.models import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator

dag = DAG(
    dag_id='trigger',
    schedule_interval='@once',
    start_date=datetime(2021, 1, 1)
)

def modify_dro(context, dagrun_order):
    print(context)
    print(dagrun_order)
    dagrun_order.payload = {
        "message": "This is my conf message"
    }
    return dagrun_order

run_this = TriggerDagRunOperator(
    task_id='run_this',
    trigger_dag_id='my_dag',
    python_callable=modify_dro,
    dag=dag
)
run_this
This works for me.
Instead of using kwargs, pass them in as arguments. For example:
def func_my_task(my_param, **context):
    print(my_param)

t1 = PythonOperator(
    task_id='my_task',
    python_callable=func_my_task,
    op_kwargs={'my_param': 'hello'},
    dag=dag
)
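If you also need the dag_run conf from the triggering DAG, a single **context signature already exposes both the TaskInstance (for XCom) and dag_run alongside any explicit op_kwargs arguments. A rough sketch under the question's Airflow 1.10.11 setup (the parameter name my_param is only illustrative):

def new_op_fun(my_param, **context):
    ti = context['ti']
    xcom_value = ti.xcom_pull(task_ids='previous_op')
    conf = context['dag_run'].conf   # payload set by the TriggerDagRunOperator
    print(my_param, xcom_value, conf)

new_op = PythonOperator(
    task_id='new_op',
    provide_context=True,            # required on Airflow 1.10.x to receive **context
    python_callable=new_op_fun,
    op_kwargs={'my_param': 'hello'},
    dag=dag)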
Your limitation is basically that you cannot accept two dynamic dicts as input parameters; if you turn one of them into a string or a list, the problem goes away.
1. You can define two functions: one returns the value you want from **kwargs as a list or string, and you then pass that to the function that takes **context as its first parameter (either as a string or a list).
2. Use a global variable, fill it each time with the contents returned by your first and second functions, and then use those results in your final function.

Airflow - xcom returns the value with a trailing comma

I just tried xcom_pull to print the return value of a task.
Basically, I ran a SQL task, select 123123123, and it returns 123123123. And my DAG is:
<<args and DAG details goes here>>
def puller(**kwargs):
    ti = kwargs['ti']
    pulled_value_1 = ti.xcom_pull(task_ids='push_result')
    print("VALUE IN PULLER : ", pulled_value_1)

def get_dag_ids(**kwargs):
    postgres_hook = PostgresHook(postgres_conn_id='cloudsqlpg')
    records = postgres_hook.get_records(sql='select 123123123')
    return records

t1 = PythonOperator(
    task_id="push_result",
    python_callable=get_dag_ids,
    provide_context=True,
    dag=dag
)

pull = PythonOperator(
    task_id='pullee',
    dag=dag,
    python_callable=puller,
    provide_context=True,
)
t1 >> pull
I got the output from task t1 as below:
INFO - Done. Returned value was: [(123123123,)]
I just need the value 123123123. Is it an array? How do I extract the correct value from this result?
You may want to unpack it into a single flat list/tuple, i.e. [123123123, ..., n]:
list(itertools.chain(*records))
tuple(itertools.chain(*records))
Then it'll be easier to access each result, e.g. records[0].
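In other words, get_records() returns a list of row tuples, so the pulled value here is [(123123123,)]. A small sketch of both options (plain Python, no Airflow needed):

import itertools

records = [(123123123,)]                       # shape returned by get_records / xcom_pull
first_value = records[0][0]                    # -> 123123123 (first column of first row)
flattened = list(itertools.chain(*records))    # -> [123123123]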

How do you call tasks conditionally in Airflow?

I am looping over a series of status tests running in Apache Airflow. Under certain conditions I want to publish a message. Something like this is what I have now:
for count, test in enumerate(test_list):
    test = StatusTest(test['_id'], test['status'])
    check_status_task = PythonOperator(
        task_id='run_status_checker_' + str(count),
        python_callable=run_status_checker,
        op_kwargs={'status_test': test},
        provide_context=True,
        xcom_push=True,
        retries=0,
        dag=dag)
    pub_results_task = PythonOperator(
        task_id='pub_results_' + str(count),
        python_callable=pub_result,
        #op_kwargs={'task_id':'run_status_checker_'+str(count)},
        provide_context=True,
        trigger_rule='all_done',
        dag=dag
    )
    check_status_task >> pub_results_task
Code that calls status checker, gets responses, etc:
def run_status_test(ti, **kwargs):
    status_test_conn = MongoHook(conn_id='test_selector_mongo')
    status_test = kwargs.pop('status_test', None)
    is_up = check_test_status(status_test)
    status, response = is_up
    if status:
        if status_test.already_failed(status_test_conn):
            status_test.status = 'true'
            status_test.update_status(status_test_conn)
        message = {"Test {0} passed".format(status_test.uid)}
        ti.xcom_push(XCOM_CHECK_STATUS_KEY, message)
    else:
        if status_test.already_down(status_test_conn) and 'false' in status_test.status:
            status_test.update_sty_down(status_test_conn, response=response)
        else:
            status_test.status = 'false'
            status_test.update_status(status_test_conn)
        message = {status_test.uid}
        ti.xcom_push(XCOM_CHECK_STATUS_KEY, message)
    status_test_conn.close_conn()
Code that would do message publishing:
def pub_result(dag, ti, **context):
    message = ti.xcom_pull(
        task_ids=context['task_id'],
        key=XCOM_CHECK_STATUS_KEY
    )
    message_con = pika.BlockingConnection(pika.URLParameters(os.getenv('BROKERURL')))
    channel = message_con.channel()
    channel.queue_declare(queue='status_test', durable=True, auto_delete=False, exclusive=False)
    channel.basic_publish(exchange='', routing_key='outage', body=json.dumps(message), properties=pika.BasicProperties(delivery_mode=2))
    message_con.close()
How do I tell Airflow to only do the publishing part of the workflow if certain conditions are met, such as:
If there is a message, publish it (i.e. run the publish task).
If not, don't do anything.
I was thinking I could just check the value in the XCOM and publish if there is something or do nothing if it is empty. However, I wanted to see if there was a proper way to do it in Airflow.
You could use the BranchPythonOperator to create a task that determines whether to call pub_result or a do-nothing dummy task. Here is an example from the Airflow GitHub repo.
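As a rough sketch of that idea, adapted to the question's loop (the branch and skip task names are invented here; count, XCOM_CHECK_STATUS_KEY, and the existing tasks come from the question's code): a BranchPythonOperator pulls the XCom pushed by the status-check task and routes to either the publish task or a do-nothing DummyOperator.

from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator

def choose_branch(count, **context):
    message = context['ti'].xcom_pull(
        task_ids='run_status_checker_' + str(count),
        key=XCOM_CHECK_STATUS_KEY
    )
    # return the task_id of the branch to follow
    return 'pub_results_' + str(count) if message else 'skip_publish_' + str(count)

branch_task = BranchPythonOperator(
    task_id='branch_' + str(count),
    python_callable=choose_branch,
    op_kwargs={'count': count},
    provide_context=True,
    dag=dag)

skip_publish_task = DummyOperator(task_id='skip_publish_' + str(count), dag=dag)

check_status_task >> branch_task >> [pub_results_task, skip_publish_task]

Note that pub_results_task in the question uses trigger_rule='all_done', which would make it run even when the branch skips it; with branching, the default trigger rule (all_success) respects the skip.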
