I have this Python file:
from datetime import datetime, timedelta

class Get:
    def __init__(self, i):
        self.i = self.get_date(i)
        self.df = self.get_file()

    def get_file(self):
        try:
            ...
            return df
        except Exception as e:
            return ...

    def get_date(self, i):
        dt = datetime.now() - timedelta(days=i)
        return dt.strftime("%Y-%m-%d")

    def put(self, df):
        ...

class Fix:
    def __init__(self, df):
        ...

if __name__ == '__main__':
    for i in range(4, 0, -1):
        get = Get(i)
        fix = Fix(get.df)
        get.put(fix.df)
Basically, this code generates the last 4 dates and runs the functions over those dates (updating statistics, etc.).
At first I wanted to convert each function into a PythonOperator and then schedule it, but I don't think that will work. I don't know how to convert the classes and the parameters that are passed between them.
This is what the code does if I run it on 2018-Jun-12, and below is what it should be with Airflow:
Is there a template I can use, or any suggestion on how to do it?
You can either execute your script with a BashOperator, without any changes to the script:
dag = DAG('{NAME_OF_THE_DAG}', schedule_interval='@daily',
          default_args=default_args)

t1 = BashOperator(
    task_id='{NAME_OF_TASK}',
    dag=dag,
    bash_command='python {NAME_OF_THE_FILE_TO_EXECUTE}.py')
Or use a PythonOperator. Update your code to create a main function in your script:
def main():
    for i in range(4, 0, -1):
        get = Get(i)
        fix = Fix(get.df)
        get.put(fix.df)
Then define the DAG and its task:
dag = DAG('{NAME_OF_THE_DAG}', schedule_interval='@daily',
          default_args=default_args)

t1 = PythonOperator(
    task_id='{NAME_OF_TASK}',
    dag=dag,
    python_callable=main)
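For completeness, here is a minimal sketch of what the full DAG file could look like. The default_args values, the DAG id daily_stats_update, and the module name your_script (holding Get, Fix and main) are assumptions for illustration, not part of the original question:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

from your_script import main  # hypothetical module containing Get, Fix and main()

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2018, 6, 1),   # assumed start date
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('daily_stats_update', schedule_interval='@daily',
          default_args=default_args)

t1 = PythonOperator(
    task_id='run_main',
    dag=dag,
    python_callable=main)

Each scheduled run then simply calls main() exactly as the standalone script did before.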
I created several tasks that perform the same work in one DAG using a for loop. Each task should then split into two branches that depend on that task's result. However, the branch tasks created in the for loop all read the XCom of the last task. How can tasks created in a for loop each return their own XCom?
Each task a, b, c returns xcom_a, xcom_b, and xcom_c respectively. However, the branch tasks all get the same xcom_c. What should I do?
default_args = {'start_date': days_ago(1)}

dag = DAG(
    dag_id='batch_test',
    default_args=default_args,
    schedule_interval=None)

def count(**context):
    name = context['params']['name']
    dict = {'a': 50,
            'b': 100,
            'c': 150}
    if dict[name] < 100:
        task_id = f'add_{name}'
        return task_id
    elif dict[name] >= 100:
        task_id = f'times_{name}'
        return task_id

def branch(**context):
    task_id = context['ti'].xcom_pull(task_ids=f'count_task_{name}')
    return task_id

def add(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'add_a': 50 + 100,
                'add_b': 100 + 100,
                'add_c': 150 + 100}
    ans = ans_dict[ans]
    return print(ans)

def times(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'times_a': 50 * 100,
                'times_b': 100 * 100,
                'times_c': 150 * 100}
    ans = ans_dict[ans]
    return print(ans)

name_list = ['a', 'b', 'c']

for name in name_list:
    exec_count_task = PythonOperator(
        task_id=f'count_task_{name}',
        python_callable=count,
        provide_context=True,
        params={'name': name},
        dag=dag
    )
    exec_branch_task = BranchPythonOperator(
        task_id=f'branch_task_{name}',
        python_callable=branch,
        provide_context=True,
        dag=dag
    )
    exec_add_count = PythonOperator(
        task_id=f'add_{name}',
        python_callable=add,
        provide_context=True,
        dag=dag
    )
    exec_times_count = PythonOperator(
        task_id=f'times_{name}',
        python_callable=times,
        provide_context=True,
        dag=dag
    )
    exec_count_task >> exec_branch_task >> [exec_add_count, exec_times_count]
I want this:
task_a >> branch_a (BranchPythonOperator, pulls the XCom returned by task_a) >> [task_a1, task_a2]
task_b >> branch_b (BranchPythonOperator, pulls the XCom returned by task_b) >> [task_b1, task_b2]
task_c >> branch_c (BranchPythonOperator, pulls the XCom returned by task_c) >> [task_c1, task_c2]
but I get this:
task_a >> branch_a (BranchPythonOperator, pulls the XCom returned by task_c) >> [task_a1, task_a2]
task_b >> branch_b (BranchPythonOperator, pulls the XCom returned by task_c) >> [task_b1, task_b2]
task_c >> branch_c (BranchPythonOperator, pulls the XCom returned by task_c) >> [task_c1, task_c2]
I'm unable to reproduce the behavior you describe using classic operators or the TaskFlow API. If you can add more context and the code you are actually executing, that would be most helpful.
In the meantime, here are the examples I used, should they give you some guidance for troubleshooting. I added a task at the end of each stream to check that the first task indeed pushes its expected value.
Classic Operators
from pendulum import datetime

from airflow.models import DAG
from airflow.operators.python import BranchPythonOperator, PythonOperator
from airflow.utils.trigger_rule import TriggerRule


with DAG(dag_id="multiple_branch_loop", start_date=datetime(2023, 1, 1), schedule=None):

    def xcom_push(val):
        return val

    def func():
        ...

    def choose(val):
        return f"task_{val}"

    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
        branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
        second = PythonOperator(task_id=f"task_{i}", python_callable=func)
        third = PythonOperator(task_id=f"task_{i}a", python_callable=func)
        check = PythonOperator(
            task_id=f"check_{i}",
            trigger_rule=TriggerRule.ALL_DONE,
            python_callable=check_xcom_output_from_first,
            op_kwargs={"val": first.output, "expected_val": i},
        )

        first >> branch >> [second, third] >> check
The check* tasks succeed meaning the first task in a given stream does push its value and not the last stream's.
TaskFlow API
from pendulum import datetime

from airflow.decorators import dag, task
from airflow.utils.trigger_rule import TriggerRule


@dag(start_date=datetime(2023, 1, 1), schedule=None)
def multiple_branch_loop():
    @task()
    def xcom_push(val):
        return val

    @task()
    def func():
        ...

    @task.branch()
    def choose(val):
        return f"task_{val}"

    @task(trigger_rule=TriggerRule.ALL_DONE)
    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = xcom_push.override(task_id=f"first_task_{i}")(val=i)
        branch = choose.override(task_id=f"branch_{i}")(val=first)
        second = func.override(task_id=f"task_{i}")()
        third = func.override(task_id=f"task_{i}a")()
        check = check_xcom_output_from_first.override(task_id=f"check_{i}")(val=first, expected_val=i)

        first >> branch >> [second, third] >> check


multiple_branch_loop()
The same expected behavior is confirmed in the check* tasks:
Your functions branch, add, and times don't define name themselves, so it is taken from the global scope, and at the time the functions execute that global holds the last value of for name in name_list. This is a common trap, explained e.g. here: tkinter creating buttons in for loop passing command arguments
To fix it, you can either pull name from the context as count does, or provide it via op_args or op_kwargs when you create the respective operator, as in the answer by Josh Fell:
first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
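To see why this happens, here is a minimal, Airflow-free sketch of the same trap: the functions read the loop variable only when they are called, long after the loop has finished.

# Minimal illustration of the late-binding trap (no Airflow involved).
funcs = []
for name in ["a", "b", "c"]:
    # report() does not take `name` as a parameter, so it looks it up
    # in the global scope at call time.
    def report():
        return f"branch_task_{name}"
    funcs.append(report)

print([f() for f in funcs])        # ['branch_task_c', 'branch_task_c', 'branch_task_c']

# Binding the current value as a default argument fixes it, which is the
# same idea as passing it through op_kwargs in Airflow.
funcs_fixed = []
for name in ["a", "b", "c"]:
    def report(name=name):
        return f"branch_task_{name}"
    funcs_fixed.append(report)

print([f() for f in funcs_fixed])  # ['branch_task_a', 'branch_task_b', 'branch_task_c']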
I am running the code below to create a DAG. The DAG is created, but the choose_best_model task is failing with the error: ERROR - _choose_best_model() missing 1 required positional argument: 'ti'. My Airflow version is 1.10.3. How can I resolve this error?
my_dag.py
from airflow import DAG
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.bash_operator import BashOperator

from random import randint
from datetime import datetime


def _choose_best_model(ti):
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    best_accuracy = max(accuracies)
    if (best_accuracy > 8):
        return 'accurate'
    return 'inaccurate'


def _training_model():
    return randint(1, 10)


with DAG("my_dag", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:

    training_model_A = PythonOperator(
        task_id="training_model_A",
        python_callable=_training_model
    )

    training_model_B = PythonOperator(
        task_id="training_model_B",
        python_callable=_training_model
    )

    training_model_C = PythonOperator(
        task_id="training_model_C",
        python_callable=_training_model
    )

    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model
    )

    accurate = BashOperator(
        task_id="accurate",
        bash_command="echo 'accurate'"
    )

    inaccurate = BashOperator(
        task_id="inaccurate",
        bash_command="echo 'inaccurate'"
    )

    [training_model_A, training_model_B, training_model_C] >> choose_best_model >> [accurate, inaccurate]
You need to pass the provide_context parameter to your operator (BranchPythonOperator extends PythonOperator, which defines it). You also need to add the kwargs to your function's signature.
The full list of parameters in the context which can be passed to your python_callable can be found here (v.1.10.15).
Once you do this, you can also pass additional custom parameters to your function using the op_kwargs parameter.
PythonOperator Airflow docs
[...]

def _choose_best_model(ti, **kwargs):  # <-- here
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    [...]

with DAG("my_dag", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:

    [...]

    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model,
        provide_context=True,  # <-- here
    )

    [...]
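Once that is in place, op_kwargs lets you pass extra custom parameters alongside the context. Here is a small sketch of that idea; the threshold parameter is made up for illustration and is not part of the original code:

def _choose_best_model(ti, threshold, **kwargs):
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    return 'accurate' if max(accuracies) > threshold else 'inaccurate'

choose_best_model = BranchPythonOperator(
    task_id="choose_best_model",
    python_callable=_choose_best_model,
    provide_context=True,
    op_kwargs={"threshold": 8},  # hypothetical custom parameter
)

Note that in Airflow 2.x provide_context was removed and the context is passed to the callable automatically, so only the **kwargs in the function signature is needed there.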
I am trying to use a PythonOperator to fetch a list of filenames that contain the run date string, and then download those files using the SFTP-to-S3 operator. Is there a better way to do this? With the following code I get the error: name 'ti' not found.
def get_files(**kwargs):
    sftp_hook = SFTPHook(ftp_conn_id='conn')
    str_date = kwargs["date"]
    files = []
    with sftp_hook.get_conn() as conn:
        for entry in conn.listdir_attr():
            mode = entry.st_mode
            if S_ISREG(mode) and str_date in entry.filename:
                files.append(entry.filename)
    return files  # list of files to download


with dag:
    date = '{{ next_ds_nodash }}'

    source_files = PythonOperator(task_id="get_files",
                                  python_callable=get_files,
                                  op_kwargs={'date': {date}},
                                  provide_context=True,
                                  dag=dag)

    file_list = ti.xcom_pull(task_ids='get_files', key='files')

    collect = []
    for file in file_list:
        op = SFTPToS3Operator(task_id=f"download_{file}",
                              sftp_conn_id="conn",
                              sftp_path=f"path1/{file}" if 'key' in file else f"path2/{file}",
                              s3_conn_id=aws_conn_id,
                              s3_bucket=s3_bucket,
                              s3_key=f"/temp/{date}/{file}",
                              dag=dag)
        collect.append(op)

    collect.set_upstream(source_files)
According to the XCom documentation, XComs are similar to Variables and therefore must be serialized and deserialized; use json.dumps() and json.loads() to do so.
Additionally, you should be using xcom_pull inside another task instead of in the DAG definition itself.
The DAG definition should only wire up tasks by their task_ids; perform all operations within tasks and chain them together in the DAG.
Here is an example of the proper use of XComs.
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

dag = DAG(
    'example_xcom',
    schedule_interval="@once",
    start_date=days_ago(2),
    default_args={'owner': 'airflow'},
    tags=['example'],
)

value_1 = [1, 2, 3]
value_2 = {'a': 'b'}


def push(**kwargs):
    """Pushes an XCom without a specific target"""
    kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)


def push_by_returning(**kwargs):
    """Pushes an XCom without a specific target, just by returning it"""
    return value_2


def puller(**kwargs):
    """Pull all previously pushed XComs and check if the pushed values match the pulled values."""
    ti = kwargs['ti']

    # get value_1
    pulled_value_1 = ti.xcom_pull(key=None, task_ids='push')
    if pulled_value_1 != value_1:
        raise ValueError(f'The two values differ {pulled_value_1} and {value_1}')

    # get value_2
    pulled_value_2 = ti.xcom_pull(task_ids='push_by_returning')
    if pulled_value_2 != value_2:
        raise ValueError(f'The two values differ {pulled_value_2} and {value_2}')

    # get both value_1 and value_2
    pulled_value_1, pulled_value_2 = ti.xcom_pull(key=None, task_ids=['push', 'push_by_returning'])
    if pulled_value_1 != value_1:
        raise ValueError(f'The two values differ {pulled_value_1} and {value_1}')
    if pulled_value_2 != value_2:
        raise ValueError(f'The two values differ {pulled_value_2} and {value_2}')


push1 = PythonOperator(
    task_id='push',
    dag=dag,
    python_callable=push,
)

push2 = PythonOperator(
    task_id='push_by_returning',
    dag=dag,
    python_callable=push_by_returning,
)

pull = PythonOperator(
    task_id='puller',
    dag=dag,
    python_callable=puller,
)

pull << [push1, push2]
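Applied to the original SFTP-to-S3 use case, that means doing the downloads inside a downstream task that pulls the file list itself, rather than building operators from an XCom at DAG-parse time. Here is a rough sketch, assuming the SFTP and Amazon provider hooks are installed and using made-up connection ids and bucket name (older Airflow versions use ftp_conn_id instead of ssh_conn_id and need provide_context=True):

from airflow.providers.sftp.hooks.sftp import SFTPHook
from airflow.providers.amazon.aws.hooks.s3 import S3Hook


def download_files(date, **kwargs):
    ti = kwargs['ti']
    # The list returned by get_files is stored under the default 'return_value' key.
    file_list = ti.xcom_pull(task_ids='get_files')

    sftp_hook = SFTPHook(ssh_conn_id='conn')      # assumed connection id
    s3_hook = S3Hook(aws_conn_id='aws_default')   # assumed connection id

    for file in file_list:
        local_path = f"/tmp/{file}"
        # apply the original path1/path2 conditional here if needed
        sftp_hook.retrieve_file(f"path1/{file}", local_path)
        s3_hook.load_file(filename=local_path,
                          key=f"temp/{date}/{file}",
                          bucket_name='my-bucket',  # assumed bucket
                          replace=True)


download = PythonOperator(task_id='download_files',
                          python_callable=download_files,
                          op_kwargs={'date': '{{ next_ds_nodash }}'},
                          dag=dag)

source_files >> download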
I have an Airflow job like the one below:
import time
job_id = int(time.time())
airflow_job1 = PythonOperator(op_kwargs={"job_id" : job_id}, ...)
airflow_job2 = BashOperator(op_kwargs={"job_id" : job_id}, ...)
airflow_job1 >> airflow_job2
I know that every time the script is parsed, a new job_id is generated and used in each Airflow task. But what happens if I rerun from the middle, e.g. airflow_job1 failed, I fix the problem and rerun airflow_job1 from the UI? Is a new job_id generated for the rerun, or does Airflow reuse the previous job_id?
Actually, after checking with a simple case:
# global parameter
job_id = int(time.time())


def airflow_job1(job_id, **context):
    print("in airflow_job1, current timestamp: %s" % job_id)


def airflow_job2(job_id, **context):
    print("in airflow_job2, current timestamp: %s" % job_id)


airflow_job1 = PythonOperator(
    task_id='airflow_job1',
    provide_context=True,
    python_callable=airflow_job1,
    op_kwargs={'job_id': job_id},
    dag=globals()[dag_name]
)

airflow_job2 = PythonOperator(
    task_id='airflow_job2',
    provide_context=True,
    python_callable=airflow_job2,
    op_kwargs={'job_id': job_id},
    dag=globals()[dag_name]
)

airflow_job1 >> airflow_job2
I found that job_id in airflow_job1 and airflow_job2 are different even within the same run, because each task re-parses the DAG file in its own process, so the module-level int(time.time()) is evaluated again.
So the conclusion is that we shouldn't set a global parameter this way; use xcom_push / xcom_pull to solve it instead.
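Here is a minimal sketch of that XCom approach, generating the id inside the first task and pulling it in the second. It uses the classic PythonOperator and assumes a dag object already exists; on Airflow 1.10 you would also add provide_context=True:

import time

from airflow.operators.python import PythonOperator


def generate_job_id(**context):
    # Evaluated at task run time, not at DAG-parse time.
    job_id = int(time.time())
    print("in airflow_job1, job_id: %s" % job_id)
    return job_id  # the returned value is pushed to XCom automatically


def use_job_id(**context):
    # Pull the id pushed by the upstream task; rerunning only this task
    # reuses the stored XCom value instead of generating a new one.
    job_id = context['ti'].xcom_pull(task_ids='airflow_job1')
    print("in airflow_job2, job_id: %s" % job_id)


airflow_job1 = PythonOperator(task_id='airflow_job1', python_callable=generate_job_id, dag=dag)
airflow_job2 = PythonOperator(task_id='airflow_job2', python_callable=use_job_id, dag=dag)

airflow_job1 >> airflow_job2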
I have a list of tables I want to run my script through. It works successfully when I do one table at a time, but when I put a for loop above the tasks, it runs all the tables at once, giving me multiple errors.
Here is my code:
def create_tunnel_postgres():
    psql_host = ''
    psql_port = 5432
    ssh_host = ''
    ssh_port = 22
    ssh_username = ''
    pkf = paramiko.RSAKey.from_private_key(StringIO(Variable.get('my_key')))

    server = SSHTunnelForwarder(
        (ssh_host, 22),
        ssh_username=ssh_username,
        ssh_private_key=pkf,
        remote_bind_address=(psql_host, 5432))

    return server


def conn_postgres_internal(server):
    """
    Using the server connect to the internal postgres
    """
    conn = psycopg2.connect(
        database='pricing',
        user=Variable.get('postgres_db_user'),
        password=Variable.get('postgres_db_key'),
        host=server.local_bind_host,
        port=server.local_bind_port,
    )
    return conn


def gzip_postgres_table(table_name, **kwargs):
    """
    Dump the first rows of the given table to a gzipped CSV.
    """
    path = '/path/{}.csv'.format(table_name)
    server_postgres = create_tunnel_postgres()
    server_postgres.start()
    etl_conn = conn_postgres_internal(server_postgres)
    cur = etl_conn.cursor()
    cur.execute("""
        select * from schema.db.{} limit 100;
    """.format(table_name))
    result = cur.fetchall()
    column_names = [i[0] for i in cur.description]
    fp = gzip.open(path, 'wt')
    myFile = csv.writer(fp, delimiter=',')
    myFile.writerow(column_names)
    myFile.writerows(result)
    fp.close()
    etl_conn.close()
    server_postgres.stop()


# ----------------------------------------------------------------------------

default_args = {
    'owner': 'mae',
    'depends_on_past': False,
    'start_date': datetime(2020, 1, 1),
    'email': ['maom@aol.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=1)
}

tables = ['table1', 'table2']
s3_folder = 'de'
current_timestamp = datetime.now()

# Element's variables
dag = DAG('dag1',
          description='O',
          default_args=default_args,
          max_active_runs=1,
          schedule_interval='@once',
          # schedule_interval='@hourly'
          catchup=False)

for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name, 's3_folder': s3_folder, 'current_timestamp': current_timestamp},
        dag=dag)
Is there a way to run table1 first, let it finish, and then run table2? I tried doing that with the for table_name in tables: loop, but to no avail. Any ideas or suggestions would help.
Your for loop is creating multiple tasks for your table processing, and Airflow will run those tasks in parallel by default.
You can either set the number of workers in the Airflow config file to 1, or create only one task and run your loop inside that task (see the sketch below), which will then process the tables synchronously.
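A minimal sketch of that single-task option, reusing the gzip_postgres_table function and the tables, s3_folder and current_timestamp variables from the question:

def gzip_all_tables(**kwargs):
    # Process the tables one after another inside a single task.
    for table_name in tables:
        gzip_postgres_table(table_name, **kwargs)


t1 = PythonOperator(
    task_id='gzip_all_tables',
    python_callable=gzip_all_tables,
    provide_context=True,
    op_kwargs={'s3_folder': s3_folder, 'current_timestamp': current_timestamp},
    dag=dag)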
I saw your code, and it seems like you're creating multiple DAG tasks using a looping statement, which runs the tasks in parallel.
There are a few ways to achieve your requirement:
1. Use the SequentialExecutor.
airflow.executors.sequential_executor.SequentialExecutor will run task instances sequentially, one at a time.
https://airflow.apache.org/docs/stable/start.html#quick-start
2. Create a script that works according to your need.
Write a Python function that repeats your current function for each of the tables and run it as a single PythonOperator.
3. Limit the Airflow executor's parallelism to 1.
You can limit your Airflow workers to 1 in its airflow.cfg config file.
Steps:
open airflow.cfg from your Airflow root (AIRFLOW_HOME),
set/update parallelism = 1,
restart Airflow.
This should work.
I see 3 ways of solving this:
1. Limit parallelism = 1 in the airflow.cfg file.
2. Write a Python function that loops through your tables and call it from a single PythonOperator.
3. Create a pool and assign 1 slot to it (see the sketch below).
https://airflow.apache.org/docs/stable/concepts.html?highlight=pool#pools
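As a sketch of the pool option (the pool name table_pool is an assumption): create a pool with a single slot via the UI (Admin -> Pools) or the CLI, then assign every table task to it so only one of them can run at a time.

# Create the pool once, e.g. with the CLI:
#   Airflow 1.10:  airflow pool -s table_pool 1 "serialize table exports"
#   Airflow 2.x:   airflow pools set table_pool 1 "serialize table exports"

for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name,
                   's3_folder': s3_folder,
                   'current_timestamp': current_timestamp},
        pool='table_pool',  # all tasks share the single slot
        dag=dag)

Note that a 1-slot pool serializes the tasks but does not guarantee their order; if table1 must strictly run before table2, chaining the tasks with >> inside the loop is another option.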
I think you need a DAG like this:
Code for it:
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

import sys
sys.path.append('../')

from mssql_loader import core    # program code, which starts the load
from mssql_loader import locals  # local variables, contains dictionaries with names


def contact_load(typ, db):
    core.starter(typ=typ, db=db)
    return 'MSSQL LOADED ' + db['DBpseudo'] + '.' + typ


dag = DAG('contact_loader', description='MSSQL sqlcontact.uka.local loader to GBQ',
          schedule_interval='0 7 * * *',
          start_date=datetime(2017, 3, 20), catchup=False)

start_operator = DummyOperator(task_id='ROBO_task', retries=3, dag=dag)

for v in locals.TABLES:
    for db in locals.DB:
        task = PythonOperator(
            task_id=db['DBpseudo'] + '_mssql_' + v,  # creates Express_mssql_fast, UKA_mssql_important, etc.
            python_callable=contact_load,
            op_kwargs={'typ': v, 'db': db},
            retries=3,
            dag=dag,
        )

        start_operator >> task  # create parent-child connection from the first task to the others
dag = DAG(dag_id='your_DAG',
          default_args=default_args,
          schedule_interval='10 6 * * *',
          max_active_runs=1)  # <-- HERE: only 1 active DAG run at a time