airflow.exceptions.DuplicateTaskIdFound exception - python

Hi, I need to define a DAG with a task that has to be invoked 4 times every day. But when I publish the DAG, I get an airflow.exceptions.DuplicateTaskIdFound error. See my DAG definition below:
import sys
import time
from airflow.models import DAG,Variable
from airflow.operators.bash import BashOperator
from datetime import datetime,timedelta
from airflow.operators.dummy import DummyOperator
from airflow.providers.ssh.hooks.ssh import SSHHook
from airflow.providers.ssh.operators.ssh import SSHOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor
from airflow.utils.dates import days_ago
import pendulum
sys.path.append("../..")
from common.airflow_dep_coordinator import airfow_coordinator
from common import airflow_utli
from common import airflow_config
default_args = airflow_utli.default_args
default_args['owner'] = 'bi'
default_args['sla'] = timedelta(hours=6)
default_args['retries'] = 3
default_args['start_date'] = datetime(2022,4,29, tzinfo=airflow_utli.local_tz) #TODO please override start date of this dag
airflow_config.EMAIL_RECIEVER_LIST=Variable.get("bi_email_reciever_list",deserialize_json=True , default_var=None)
# ---------------------------------------------------------------------------
# Get date string
#   beforeOfDay: number of days before the current day
#   with_dash: whether the result string contains dashes, default False
# ---------------------------------------------------------------------------
def getdate(beforeOfDay, with_dash=False):
    today = datetime.now()
    offset = timedelta(days=-beforeOfDay)
    str_date_format = '%Y%m%d'
    if with_dash:
        str_date_format = '%Y-%m-%d'
    date_str = (today + offset).strftime(str_date_format)
    return date_str

# ---------------------------------------------------------------------------
# Get etl task, return SSHOperator
# ---------------------------------------------------------------------------
def batch_etl_task(dag, target_table_name, param=None, ssh_conn_id=SSHHOOK_NAME):
    sh_command = COMMAND_BASE + COMMAND.format(target_table_name=target_table_name, param=param)
    print(sh_command)
    task = SSHOperator(
        dag=dag, task_id=target_table_name, ssh_conn_id=ssh_conn_id, command=sh_command
    )
    return task
# =========================================================================
# DAG definition
# =========================================================================
dag = DAG(
    dag_id='lakehouse_dws_otp_app_user_daily',
    default_args=default_args,
    tags=['bi', 'dws', 'otp', 'app'],
    schedule_interval='00 04 * * *',
    dagrun_timeout=timedelta(hours=6),
    concurrency=12,
    catchup=False,
    params={"pday": getdate(2), "fday": getdate(1),
            "pdaym1": getdate(3), "fdaym1": getdate(2),
            "pdaym7": getdate(9), "fdaym7": getdate(8),
            "pdaym30": getdate(32), "fdaym30": getdate(31)
            },
    sla_miss_callback=airflow_utli.default_sla_callback
)
#etl task
task_dws_fact_com_otp_app_visitor_snp_t=batch_etl_task(dag=dag,target_table_name='AnotherTargetTable',param='{{ params.pday }} {{params.fday}}')
task_dws_fact_com_otp_app_active_retention_snp_t=batch_etl_task(dag=dag,target_table_name='target_table',param='{{ params.pday }} {{params.fday}}')
task_dws_fact_com_otp_app_active_retention_snp_t_m1=batch_etl_task(dag=dag,target_table_name='target_table',param='{{ params.pdaym1 }} {{params.fdaym1}}')
task_dws_fact_com_otp_app_active_retention_snp_t_m7=batch_etl_task(dag=dag,target_table_name='target_table',param='{{ params.pdaym7 }} {{params.fdaym7}}')
task_dws_fact_com_otp_app_active_retention_snp_t_m30=batch_etl_task(dag=dag,target_table_name='target_table',param='{{ params.pdaym30 }} {{params.fdaym30}}')
### End
end_dws_otp_register_daily = DummyOperator(
    task_id='end_dws_otp_register_daily',
    dag=dag)
### Dependence, TODO please setup the dependency tree
[task_dws_fact_com_otp_app_new_visitor_t,task_dws_fact_com_otp_app_active_au_snp_t]>>task_dws_fact_com_otp_app_active_retention_snp_t>>task_dws_fact_com_otp_app_active_retention_snp_t_m1
As you can see, I need to invoke the ETL to update target_table 4 times with different parameters, but this does not work; it seems the DAG doesn't allow me to define the task 4 times against the same target table. Is there any best practice to achieve this in one DAG?

Your batch_etl_task sets the SSHOperator's task_id to target_table_name, so every call for the same target table tries to register the same task_id.
Every task you register in a DAG must have a unique task_id.
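For example, one way to keep a single helper is to build the task_id from the table name plus an explicit suffix. This is only a sketch; SSHHOOK_NAME, COMMAND_BASE and COMMAND stand in for the constants coming from your common config:

def batch_etl_task(dag, target_table_name, task_suffix=None, param=None, ssh_conn_id=SSHHOOK_NAME):
    # task_id must be unique inside the DAG, so append a suffix when the same
    # target table is loaded several times with different parameters
    task_id = target_table_name if task_suffix is None else '{}_{}'.format(target_table_name, task_suffix)
    sh_command = COMMAND_BASE + COMMAND.format(target_table_name=target_table_name, param=param)
    return SSHOperator(dag=dag, task_id=task_id, ssh_conn_id=ssh_conn_id, command=sh_command)

# each load of target_table now gets its own task_id: target_table, target_table_m1, ...
task_retention = batch_etl_task(dag=dag, target_table_name='target_table',
                                param='{{ params.pday }} {{ params.fday }}')
task_retention_m1 = batch_etl_task(dag=dag, target_table_name='target_table', task_suffix='m1',
                                   param='{{ params.pdaym1 }} {{ params.fdaym1 }}')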

Related

Issue running a lambda two times in sequence from Airflow in AWS

I am very new to Airflow. I have a single lambda function that needs to be executed two times in sequence. I am passing a payload (say a date) from the DAG file; based on this date my lambda fetches records from an API.
The lambda works fine when I test it manually for different dates, but when I create a DAG file to invoke the two calls in sequence, the second lambda is executed more than once; its second execution starts even before the first lambda invocation is complete.
I am not using any retries in my DAG, and I am not sure whether the issue is at the lambda end, the DAG end, or something configuration related.
No errors are reported by the DAG, but the lambda log shows multiple invocations of the second task.
Any help with this appreciated!
Thanks!
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta, timezone
import boto3
import json
from airflow.utils.dates import days_ago
args = { 'owner': 'airflow', 'start_date': days_ago(1), 'catchup': False, 'provide_context': True }
#50 13 * * *
dag = DAG( 'my_dag', schedule_interval = None, default_args = args )
def lambda1_trigger(ds, **kwargs):
    pld = json.dumps({"date": "27/06/2022"})
    lambda_client = boto3.client('lambda', region_name=aws_region, aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_access_key)
    response_1 = lambda_client.invoke(FunctionName='my_lambda_function', InvocationType='RequestResponse', Payload=pld)
    print('Response--->', response_1)

def lambda2_trigger(ds, **kwargs):
    pld = json.dumps({"date": "28/06/2022"})
    lambda_client = boto3.client('lambda', region_name=aws_region, aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_access_key)
    response_1 = lambda_client.invoke(FunctionName='my_lambda_function', InvocationType='RequestResponse', Payload=pld)
    print('Response--->', response_1)

start = DummyOperator(task_id='Begin_execution', dag=dag)
invoke_lambda1 = PythonOperator(task_id="task1", python_callable=lambda1_trigger, provide_context=True, execution_timeout=timedelta(hours=1), dag=dag)
invoke_lambda2 = PythonOperator(task_id="task2", python_callable=lambda2_trigger, provide_context=True, execution_timeout=timedelta(hours=1), dag=dag)
end = DummyOperator(task_id='stop_execution', dag=dag)
start >> invoke_lambda1 >> invoke_lambda2 >> end

Pass arguments to function from BranchPythonOperator in Airflow

I am running the code below to create a DAG. The DAG is created, but the choose_best_model task is failing. The error is: ERROR - _choose_best_model() missing 1 required positional argument: 'ti'. My Airflow version is 1.10.3. How can I resolve this error?
my_dag.py
from airflow import DAG
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.bash_operator import BashOperator
from random import randint
from datetime import datetime

def _choose_best_model(ti):
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    best_accuracy = max(accuracies)
    if (best_accuracy > 8):
        return 'accurate'
    return 'inaccurate'

def _training_model():
    return randint(1, 10)

with DAG("my_dag", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:
    training_model_A = PythonOperator(
        task_id="training_model_A",
        python_callable=_training_model
    )
    training_model_B = PythonOperator(
        task_id="training_model_B",
        python_callable=_training_model
    )
    training_model_C = PythonOperator(
        task_id="training_model_C",
        python_callable=_training_model
    )
    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model
    )
    accurate = BashOperator(
        task_id="accurate",
        bash_command="echo 'accurate'"
    )
    inaccurate = BashOperator(
        task_id="inaccurate",
        bash_command="echo 'inaccurate'"
    )
    [training_model_A, training_model_B, training_model_C] >> choose_best_model >> [accurate, inaccurate]
You need to pass the provide_context parameter to your operator (it's extending the PythonOperator which defines it). You also need to add the kwargs to your function's signature.
The full list of parameters in the context which can be passed to your python_callable can be found here (v.1.10.15).
Once you do this, you can also pass additional custom parameters to your function using the op_kwargs parameter.
PythonOperator Airflow docs
[...]
def _choose_best_model(ti, **kwargs):  # <-- here
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
[...]
with DAG("my_dag", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:
    [...]
    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model,
        provide_context=True,  # <-- here
    )
    [...]
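For completeness, a small, self-contained sketch of the op_kwargs option mentioned above (Airflow 1.10.x style); the threshold argument is made up here purely to illustrate passing a custom parameter alongside the context:

from datetime import datetime
from random import randint

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator

def _training_model():
    return randint(1, 10)

def _choose_best_model(ti, threshold, **kwargs):
    # ti comes from the context (provide_context=True), threshold from op_kwargs
    accuracy = ti.xcom_pull(task_ids='training_model_A')
    return 'accurate' if accuracy > threshold else 'inaccurate'

with DAG("op_kwargs_demo", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:
    training_model_A = PythonOperator(
        task_id="training_model_A",
        python_callable=_training_model,
    )
    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model,
        provide_context=True,
        op_kwargs={"threshold": 8},  # extra custom argument for the callable
    )
    accurate = BashOperator(task_id="accurate", bash_command="echo 'accurate'")
    inaccurate = BashOperator(task_id="inaccurate", bash_command="echo 'inaccurate'")

    training_model_A >> choose_best_model >> [accurate, inaccurate]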

Need to backup the data but it's not working in Airflow

I am trying to back up data in Airflow. It does not give any error, but the backup never happens because everything is getting skipped. The code I have written is:
import os
from airflow import DAG
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import (
    BigQueryToGCSOperator,
)
from composer_plugins import get_list_to_backup
from datetime import datetime, timedelta, date
from airflow.exceptions import AirflowFailException, AirflowSkipException
from airflow.operators.python import PythonOperator

"""the function validates if the schedule_day parameter is a valid day to execute the task
schedule_day is a number and corresponds to day of the week
    1 - Monday
    2 - Tuesday
    3 - Wednesday
    4 - Thursday
    5 - Friday
    6 - Saturday
    7 - Sunday """
def _check_valid_day(**kwargs):
    today = datetime.today()
    if today.isoweekday() == kwargs["schedule_day"]:
        return True
    else:
        raise AirflowSkipException("does not correspond to the backup day")

today = datetime.today()
today_str = today.strftime("%Y-%m-%d")
#start_date = get_next_weekday(today_str, 5)  # 5 = Saturday
start_date = datetime(2022, 5, 2)
dag_id = "data_bq_weekly_backups_dag"
event_collection_project_id = os.environ["EVENT_COLLECTION_PROJECT_ID"]
tables_to_backup = os.environ["TABLES_TO_BACKUP"]
destination_bucket = os.environ["WEEKLY_BQ_BACKUP_BUCKET"]
schedule_day = os.environ["BACKUP_SCHEDULE_DAY"]

default_dag_args = {
    # Setting start date for next Saturday in order to maintain the scheduler
    # in a consistent state
    "start_date": start_date,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    "email_on_failure": False,
    "email_on_retry": False,
    # If a task fails, retry it once after waiting at least what's specified in retry_delay
    "retries": 1,
    "retry_delay": timedelta(seconds=10),
    "project_id": event_collection_project_id,
    "schedule_interval": "0 2 * * *",
}

tables_to_backup_list = get_list_to_backup(tables_to_backup)

with DAG(dag_id=dag_id, default_args=default_dag_args, catchup=False) as dag:
    check_valid_day = PythonOperator(
        task_id='check_valid_day',
        python_callable=_check_valid_day,
        op_kwargs={
            "schedule_day": schedule_day
        },
    )

    task_dict = dict()
    for table_to_backup in tables_to_backup_list:
        dataset = table_to_backup.split(".")[0]
        table = table_to_backup.split(".")[1]
        task_name = f"{dataset}_{table}_table_weekly_backup"
        task_dict[task_name] = BigQueryToGCSOperator(
            task_id=task_name,
            trigger_rule="all_success",
            dag=dag,
            source_project_dataset_table=table_to_backup,
            destination_cloud_storage_uris=[
                f"gs://{destination_bucket}/{dataset}/{table}/{today.year}/{today.month}/{today.day}/{table}-*.avro"
            ],
            export_format="AVRO",  # OPTIONS: AVRO, CSV, JSON
            compression="NONE",  # OPTIONS: NONE, DEFLATE, GZIP, SNAPPY
            labels=None,
        )
        check_valid_day >> task_dict[task_name]
When I execute this DAG there is no error, but everything gets skipped (see the Airflow DAG tree view).
BACKUP_SCHEDULE_DAY=3 is set in the environment variable file. I don't know what is wrong here and why it's not working.
I tried your code and was able to reproduce your issue; see the run history below.
NOTE: Prior to running your code, I hardcoded values like your environment variables and tables_to_backup_list to make it work in my environment.
The main problem is in _check_valid_day(). The comparison today.isoweekday() == kwargs["schedule_day"] is always false because of a data type mismatch:
print(today.isoweekday()) # <class 'int'>
print(kwargs["schedule_day"]) # <class 'str'>
The fix is to make their data types match. Fix below is to convert kwargs["schedule_day"] to type int:
def _check_valid_day(**kwargs):
    today = datetime.today()
    if today.isoweekday() == int(kwargs["schedule_day"]):  # convert to int
        print("inside the if statement")
        return True
    else:
        raise AirflowSkipException("does not correspond to the backup day")
Graph view:
check_valid_day Logs:

How to access xcom_pull outside of task function in Airflow?

Code:
import datetime
import logging
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

def hello_world(ti, execution_date, **context):
    logging.info("Hello World")
    return "Gorgeous"

def addition(ti, **context):
    # Want the two lines below to log the same value
    logging.info(context['params']["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))

dag = DAG(
    "test",
    schedule_interval="@hourly",
    start_date=datetime.datetime.now() - datetime.timedelta(days=1),
)

t1 = PythonOperator(
    task_id="hello_world", python_callable=hello_world, dag=dag, provide_context=True
)

t2 = PythonOperator(
    task_id="abc",
    python_callable=addition,
    dag=dag,
    params={"please1": "{{{{ ti.xcom_pull(task_ids='{}') }}}}".format(t1.task_id)},
    provide_context=True,
)

t1 >> t2
I want both lines inside addition() to log the same value:
    # Want the two lines below to log the same value
    logging.info(context['params']["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))
But the result is:
[2021-05-17 23:47:15,286] {test_dag.py:14} INFO - {{ ti.xcom_pull(task_ids='hello_world') }}
[2021-05-17 23:47:15,291] {test_dag.py:15} INFO - Gorgeous
What I want to know: Is it possible to access xcom_pull outside of the task function? e.g. When passing the value from the xcom to PythonOperator?
Thanks!
Jinja-templated args for an operator can only be used for those fields that are listed as template_fields in the operator class. For the PythonOperator that is op_args, op_kwargs, and templates_dict. First, replace your params parameter with op_kwargs and remove the extra curly brackets for Jinja -- only 2 on either side of the expression. Second, and unfortunately, you need to explicitly list the task_id in the ti.xcom_pull(task_ids='<task_id>') call.
Revised code:
import datetime
import logging
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

def hello_world(ti, execution_date, **context):
    logging.info("Hello World")
    return "Gorgeous"

def addition(ti, **context):
    logging.info(context["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))

dag = DAG(
    "test",
    schedule_interval=None,
    start_date=datetime.datetime(2021, 5, 17),
    catchup=False,
)

with dag:
    t1 = PythonOperator(
        task_id="hello_world",
        python_callable=hello_world,
        provide_context=True,
    )
    t2 = PythonOperator(
        task_id="abc",
        python_callable=addition,
        op_kwargs={
            "please1": "{{ ti.xcom_pull(task_ids='hello_world') }}",
        },
        provide_context=True,
    )
    t1 >> t2
Logging from "t2":
If you are using Airflow 2.0, the code can actually be simplified to use the new XComArg feature. This feature allows you to access the output of a task with a simple task.output expression.
Revised code with 2.0 and XComArg use to access the output of "t1" as the "please1" arg:
import datetime
import logging
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

def hello_world(ti, execution_date, **context):
    logging.info("Hello World")
    return "Gorgeous"

def addition(ti, **context):
    logging.info(context["please1"])
    logging.info(ti.xcom_pull(task_ids="hello_world"))

dag = DAG(
    "test",
    schedule_interval=None,
    start_date=datetime.datetime(2021, 5, 17),
    catchup=False,
)

with dag:
    t1 = PythonOperator(
        task_id="hello_world",
        python_callable=hello_world,
    )
    t2 = PythonOperator(
        task_id="abc",
        python_callable=addition,
        op_kwargs={"please1": t1.output},
    )
    t1 >> t2
More about DAG authoring with 2.0 here.

How to individually run task separately in airflow?

I have a list of tables I want to run my script through. It works successfully when I do one table at a time, but when I put a for loop above the tasks it runs all the tables at once, giving me multiple errors.
Here is my code:
import csv
import gzip
from io import StringIO
from datetime import datetime, timedelta

import paramiko
import psycopg2
from sshtunnel import SSHTunnelForwarder

from airflow import DAG
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator

def create_tunnel_postgres():
    psql_host = ''
    psql_port = 5432
    ssh_host = ''
    ssh_port = 22
    ssh_username = ''
    pkf = paramiko.RSAKey.from_private_key(StringIO(Variable.get('my_key')))
    server = SSHTunnelForwarder(
        (ssh_host, 22),
        ssh_username=ssh_username,
        ssh_private_key=pkf,
        remote_bind_address=(psql_host, 5432))
    return server

def conn_postgres_internal(server):
    """
    Using the server connect to the internal postgres
    """
    conn = psycopg2.connect(
        database='pricing',
        user=Variable.get('postgres_db_user'),
        password=Variable.get('postgres_db_key'),
        host=server.local_bind_host,
        port=server.local_bind_port,
    )
    return conn

def gzip_postgres_table(**kwargs):
    table_name = kwargs['table_name']  # passed in via op_kwargs
    path = '/path/{}.csv'.format(table_name)
    server_postgres = create_tunnel_postgres()
    server_postgres.start()
    etl_conn = conn_postgres_internal(server_postgres)
    cur = etl_conn.cursor()
    cur.execute("""
        select * from schema.db.{} limit 100;
    """.format(table_name))
    result = cur.fetchall()
    column_names = [i[0] for i in cur.description]
    fp = gzip.open(path, 'wt')
    myFile = csv.writer(fp, delimiter=',')
    myFile.writerow(column_names)
    myFile.writerows(result)
    fp.close()
    etl_conn.close()
    server_postgres.stop()
# ------------------------------------------------------------------------------------------------------------------------------------------------
default_args = {
    'owner': 'mae',
    'depends_on_past': False,
    'start_date': datetime(2020, 1, 1),
    'email': ['maom@aol.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=1)
}

tables = ['table1', 'table2']
s3_folder = 'de'
current_timestamp = datetime.now()

# Element's VARIABLES
dag = DAG('dag1',
          description='O',
          default_args=default_args,
          max_active_runs=1,
          schedule_interval='@once',
          # schedule_interval='@hourly'
          catchup=False)

for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name, 's3_folder': s3_folder, 'current_timestamp': current_timestamp},
        dag=dag)
Is there a way to run table1 first, let it finish, and then run table2? I tried doing that with for table_name in tables: but to no avail. Any ideas or suggestions would help.
Your for loop creates multiple tasks for your table processing, and by default Airflow parallelizes the execution of those tasks.
You can either set the number of workers in the Airflow config file to 1, or create only one task and run your loop inside that task, which will then process the tables synchronously. A sketch of the second option follows.
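A minimal sketch of the "one task, loop inside" option, reusing the gzip_postgres_table helper, tables list, s3_folder and current_timestamp from the question:

def gzip_all_tables(**kwargs):
    # the tables are processed strictly one after another inside a single task instance
    for table_name in tables:
        gzip_postgres_table(table_name=table_name,
                            s3_folder=s3_folder,
                            current_timestamp=current_timestamp)

gzip_all = PythonOperator(
    task_id='gzip_all_tables',
    python_callable=gzip_all_tables,
    dag=dag,
)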
I saw your code, and it seems you are creating multiple DAG tasks with a looping statement, which runs the tasks in parallel.
There are a few ways to achieve your requirement.
1. Use the sequential executor.
airflow.executors.sequential_executor.SequentialExecutor will only run task instances sequentially.
https://airflow.apache.org/docs/stable/start.html#quick-start
2. Create a script that works according to your need.
Create a Python script and use it as a single PythonOperator that repeats your current function for the number of tables.
3. Limit Airflow's executors (parallelism) to 1.
You can limit your Airflow workers to 1 in its airflow.cfg config file.
Steps:
open airflow.cfg from your Airflow root (AIRFLOW_HOME).
set/update parallelism = 1
restart Airflow.
This should work.
I see 3 ways of solving this:
1. Limit parallelism = 1 in the airflow.cfg file.
2. Create a Python function that loops through your tables and call it from a single PythonOperator.
3. Create a pool, assign 1 slot to it, and put the tasks in that pool (see the sketch after the link below).
https://airflow.apache.org/docs/stable/concepts.html?highlight=pool#pools
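A minimal sketch of the pool option, reusing the loop from the question; it assumes a pool named single_slot_pool with a single slot has already been created (Admin -> Pools in the UI, or the airflow pools CLI):

for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name,
                   's3_folder': s3_folder,
                   'current_timestamp': current_timestamp},
        pool='single_slot_pool',  # 1 slot, so only one of these tasks runs at a time
        dag=dag)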
I think you need a DAG like this:
Code for it:
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
import sys
sys.path.append('../')
from mssql_loader import core    # program code, which starts the load
from mssql_loader import locals  # local variables, contains dictionaries with names

def contact_load(typ, db):
    core.starter(typ=typ, db=db)
    return 'MSSQL LOADED ' + db['DBpseudo'] + '.' + typ

dag = DAG('contact_loader', description='MSSQL sqlcontact.uka.local loader to GBQ',
          schedule_interval='0 7 * * *',
          start_date=datetime(2017, 3, 20), catchup=False)

start_operator = DummyOperator(task_id='ROBO_task', retries=3, dag=dag)

for v in locals.TABLES:
    for db in locals.DB:
        task = PythonOperator(
            task_id=db['DBpseudo'] + '_mssql_' + v,  # creates Express_mssql_fast, UKA_mssql_important, etc.
            python_callable=contact_load,
            op_kwargs={'typ': v, 'db': db},
            retries=3,
            dag=dag,
        )
        start_operator >> task  # create a parent-child connection from the first task to the others
dag = DAG(dag_id='you_DAG',
          default_args=default_args,
          schedule_interval='10 6 * * *',
          max_active_runs=1)  # <-- HERE: only one active DAG run at a time
