Writing an Airflow 2 DAG - Python

I have been on Airflow 1.10.14 for a long time, and now I'm trying to upgrade to Airflow 2.4.3 (latest?). I have built this DAG in the new format in the hope of learning the new syntax and understanding how the new format works. Below is my DAG:
from airflow.decorators import dag, task
from airflow.models import Variable
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from airflow.providers.microsoft.mssql.operators.mssql import MsSqlOperator
from airflow.operators.bash import BashOperator
from datetime import datetime
import glob
path = '~/airflow/staging/gcs/offrs2/'
clear_Staging_Folders = """
rm -rf {}OFFRS2/LEADS*.*
""".format(Variable.get("temp_directory"))
@dag(
    schedule_interval='@daily',
    start_date=datetime(2022, 11, 1),
    catchup=False,
    tags=['offrs2', 'LEADS']
)
def taskflow():
    CLEAR_STAGING = BashOperator(
        task_id='Clear_Folders',
        bash_command=clear_Staging_Folders,
        dag=dag,
    )
    BQ_Output = BigQueryInsertJobOperator(
        task_id='BQ_Output',
        configuration={
            "query": {
                "query": '~/airflow/sql/Leads/Leads_Export.sql',
                "useLegacySql": False
            }
        }
    )
    Prep_MSSQL = MsSqlOperator(
        task_id='Prep_DB3_Table',
        mssql_conn_id='db.offrs.com',
        sql='truncate table offrs_staging..LEADS;'
    )
    @task
    def Load_Staging_Table():
        for files in glob.glob(path + 'LEADS*.csv'):
            print(files)
    CLEAR_STAGING >> BQ_Output >> Load_Staging_Table()

dag = taskflow()
When I deploy this, I get the error below:
Broken DAG: [/home/airflow/airflow/dags/BQ_OFFRS2_Leads.py] Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/baseoperator.py", line 376, in apply_defaults
task_group = TaskGroupContext.get_current_task_group(dag)
File "/home/airflow/.local/lib/python3.10/site-packages/airflow/utils/task_group.py", line 490, in get_current_task_group
return dag.task_group
AttributeError: 'function' object has no attribute 'task_group'
As I look at my code, I don't have a specified task_group.
Where am I going wrong here?
Thank you!

You forgot to remove the undefined dag variable in CLEAR_STAGING. When you are using the @dag decorator, remove dag=dag.
CLEAR_STAGING = BashOperator(
    task_id='Clear_Folders',
    bash_command=clear_Staging_Folders,
    # dag=dag  <== Remove this
)
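For reference, here is a minimal sketch of what the corrected TaskFlow layout could look like, trimmed to the Bash task and the @task function from the question (the bash_command here is a placeholder). Inside the @dag-decorated function, operators pick up the DAG from the active context, so no dag= argument is needed anywhere:

from datetime import datetime
from airflow.decorators import dag, task
from airflow.operators.bash import BashOperator

@dag(
    schedule_interval='@daily',
    start_date=datetime(2022, 11, 1),
    catchup=False,
    tags=['offrs2', 'LEADS']
)
def taskflow():
    # No dag= kwarg: the decorator provides the DAG context.
    clear_staging = BashOperator(
        task_id='Clear_Folders',
        bash_command='echo "clear staging folders"',  # placeholder command
    )

    @task
    def load_staging_table():
        print("loading staging table")

    clear_staging >> load_staging_table()

dag = taskflow()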

How to use imported function from a module that creates and returns a DAG for UI to see?

I've created a module named dag_template_module.py that returns a DAG built from specified arguments. I want to use this definition for multiple DAGs that do the same thing but from different sources (hence the parameters). A simplified version of dag_template_module.py:
from airflow.decorators import dag, task
from airflow.operators.bash import BashOperator
def dag_template(
    dag_id: str,
    echo_message_1: str,
    echo_message_2: str
):
    @dag(
        dag_id=dag_id,
        schedule_interval="0 6 2 * *"
    )
    def dag_example():
        echo_1 = BashOperator(
            task_id='echo_1',
            bash_command=f'echo {echo_message_1}'
        )
        echo_2 = BashOperator(
            task_id='echo_2',
            bash_command=f'echo {echo_message_2}'
        )
        echo_1 >> echo_2
    dag = dag_example()
    return dag
Now I've created a hello_world_dag.py that imports the dag_template() function from dag_template_module.py and uses it to create a DAG:
from dag_template import dag_template
hello_world_dag = dag_template(
    dag_id='hello_world_dag',
    echo_message_1='Hello',
    echo_message_2='World'
)
I expected this DAG to be discovered by the Airflow UI, but that's not the case.
I've also tried using globals() in hello_world_dag.py, following the documentation, but that doesn't work for me either:
from dag_template import dag_template
hello_world_dag = 'hello_word_dag'
globals()[hello_world_dag] = dag_template(
    dag_id='hello_world_dag',
    echo_message_1='Hello',
    echo_message_2='World'
)
A couple things:
The DAG you are attempting to create is missing the start_date param
There is a nuance to how Airflow determines which Python files might contain a DAG definition: it looks for the strings "dag" and "airflow" in the file contents. hello_world_dag.py is missing these keywords, so the DagFileProcessor won't attempt to parse the file and, therefore, never calls the dag_template() function.
After adding these small tweaks and running with Airflow 2.5.0:
dag_template_module.py
from pendulum import datetime
from airflow.decorators import dag
from airflow.operators.bash import BashOperator
def dag_template(dag_id: str, echo_message_1: str, echo_message_2: str):
    @dag(dag_id, start_date=datetime(2023, 1, 22), schedule=None)
    def dag_example():
        echo_1 = BashOperator(task_id="echo_1", bash_command=f"echo {echo_message_1}")
        echo_2 = BashOperator(task_id="echo_2", bash_command=f"echo {echo_message_2}")
        echo_1 >> echo_2
    return dag_example()
hello_world_dag.py
# airflow dag <- Make sure these words appear _somewhere_ in the file.
from dag_template_module import dag_template
dag_template(dag_id="dag_example", echo_message_1="Hello", echo_message_2="World")
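If you want several DAGs from the same template in one file, one way (a sketch, assuming the fixed dag_template_module above; the parameter sets below are hypothetical) is to assign each returned DAG to a unique module-level name via globals(), as the question already attempted:

# airflow dag <- keyword hint for the DagFileProcessor, as above
from dag_template_module import dag_template

# Hypothetical parameter sets; replace with your real sources.
for dag_id, msg1, msg2 in [
    ("hello_world_dag", "Hello", "World"),
    ("goodbye_world_dag", "Goodbye", "World"),
]:
    globals()[dag_id] = dag_template(dag_id=dag_id, echo_message_1=msg1, echo_message_2=msg2)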

Problems connecting Redshift to Airflow (MWAA)

I am learning Airflow and, as a practice exercise, I'm trying to create a table in Redshift through an Airflow DAG on MWAA. I created the connection to Redshift in the UI (specifying host, port, etc.) and ran the following DAG, but it fails at the "sql_query" task. Any idea how I can solve this problem or what might be causing it?
Script:
import os
from datetime import timedelta
from airflow import DAG
from airflow.models import Variable
from airflow.models.baseoperator import chain
from airflow.operators.dummy import DummyOperator
from airflow.providers.amazon.aws.operators.redshift import RedshiftSQLOperator
from airflow.utils.dates import days_ago
DEFAULT_ARGS = {
    "owner": "username",
    "depends_on_past": False,
    "retries": 0,
    "email_on_failure": False,
    "email_on_retry": False,
    "redshift_conn_id": "redshift_default",
}

with DAG(
    dag_id="new_table_dag",
    description="",
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(minutes=15),
    start_date=days_ago(1),
    schedule_interval=None,
    tags=[""],
) as dag:
    begin = DummyOperator(task_id="begin")
    end = DummyOperator(task_id="end")
    sql_query = RedshiftSQLOperator(
        task_id="sql_query",
        sql="CREATE TABLE schema_name.table_a AS (SELECT * FROM table_b)")
    chain(begin, sql_query, end)

Poke the Specified Extension file in the server directory using the Airflow SFTPSensor

My use case is quite simple:
When a file is dropped into the FTP server directory, the SFTPSensor task should pick up the files with the specified txt extension and process their content.
path="/test_dir/sample.txt" - this case is working.
My requirement is to read dynamic filenames with only the specified extension (text files).
path="/test_dir/*.txt" - in this case, the file poking is not working.
# Sample Code
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.providers.ssh.hooks.ssh import SSHHook
from datetime import datetime

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2022, 4, 16)
}

with DAG(
    'sftp_sensor_test',
    schedule_interval=None,
    default_args=default_args
) as dag:
    waiting_for_file = SFTPSensor(
        task_id="check_for_file",
        sftp_conn_id="sftp_default",
        path="/test_dir/*.txt",  # NOTE: Poking for the txt extension files
        mode="reschedule",
        poke_interval=30
    )
    waiting_for_file
waiting_for_file
To achieve what you want, I think you should use the file_pattern argument as follows:
waiting_for_file = SFTPSensor(
    task_id="check_for_file",
    sftp_conn_id="sftp_default",
    path="test_dir",
    file_pattern="*.txt",
    mode="reschedule",
    poke_interval=30
)
However, there is currently a bug for this feature → https://github.com/apache/airflow/issues/28121
Until this gets solved, you can easily create a local, fixed version of the sensor in your project by following the issue's explanations.
Here is the file with the current fix: https://github.com/RishuGuru/airflow/blob/ac0457a51b885459bc5ae527878a50feb5dcadfa/airflow/providers/sftp/sensors/sftp.py
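As a sketch of that workaround (assuming you copy the fixed sftp.py into your DAGs folder as, say, local_sftp_sensor.py - a hypothetical name and path), the DAG from the question only needs its import changed to point at the local copy:

# Minimal sketch; local_sftp_sensor.py is a hypothetical local copy of the fixed sensor.
from datetime import datetime
from airflow.models import DAG
from local_sftp_sensor import SFTPSensor  # instead of airflow.providers.sftp.sensors.sftp

with DAG(
    'sftp_sensor_test',
    schedule_interval=None,
    start_date=datetime(2022, 4, 16),
) as dag:
    waiting_for_file = SFTPSensor(
        task_id="check_for_file",
        sftp_conn_id="sftp_default",
        path="test_dir",
        file_pattern="*.txt",
        mode="reschedule",
        poke_interval=30,
    )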

How to run a local script running Airflow in the docker?

I installed Airflow in Docker, pointing it to local folders, and configured my DAG under the real file path "C:\Users\Rod\airflow-docker".
So far so good. I can run my DAGs without any problems.
The problem is when I try to run a script via a BashOperator task. It returns an error. What am I doing wrong?
the error:
Broken DAG: [/opt/airflow/dags/invetory_sap.py] Traceback (most recent call last):
  File "", line 219, in _call_with_frames_removed
  File "/opt/airflow/dags/invetory_sap.py", line 34, in etl_invetory_sap
NameError: name 'etl_invetory_sap' is not defined
The DAG:
from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.python import PythonOperator
from airflow.operators.bash import BashOperator
from datetime import datetime, timedelta
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG("invetory_sap",
         default_args=args,
         schedule_interval='30 * * * *',
         dagrun_timeout=timedelta(minutes=60),
         catchup=False) as dag:
    etl_inventory_sap = BashOperator(
        task_id='etl_invetory_sap',
        bash_command='python /opt/airflow/plugins/ler_txt_convert_todataframe_v5.py'
    )
    etl_invetory_sap
Spelling error. You declared it as "etl_inventory_sap" and then wrote "etl_invetory_sap" on the next line. Put back the n and you should be fine.
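For clarity, a minimal sketch of the corrected last lines (only the reference changes; the task_id string can stay as it is):

    etl_inventory_sap = BashOperator(
        task_id='etl_invetory_sap',
        bash_command='python /opt/airflow/plugins/ler_txt_convert_todataframe_v5.py'
    )
    etl_inventory_sap  # now matches the variable declared above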

ModuleNotFoundError: No module named 'airflow'

I'm using the Airflow PythonOperator to execute a python Beam job using the Dataflow runner.
The Dataflow job returns the error "ModuleNotFoundError: No module named 'airflow'"
In the Dataflow UI, the SDK version used when the job is called via the PythonOperator is 2.15.0. If the job is executed from Cloud Shell, the SDK version used is 2.23.0. The job works when initiated from the shell.
The Environment details for Composer are:
Image version = composer-1.10.3-airflow-1.10.3
Python version= 3
A previous post suggested using the PythonVirtualenvOperator. I tried this using the settings:
requirements=['apache-beam[gcp]'],
python_version=3
Composer returns the error "'install', 'apache-beam[gcp]']' returned non-zero exit status 2."
Any advice would be greatly appreciated.
This is the DAG that calls the Dataflow job. I have not shown all the functions that are used in the DAG but have kept the imports in:
import logging
import pprint
import json
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.dataflow_operator import DataflowTemplateOperator
from airflow.models import DAG
import google.cloud.logging
from datetime import timedelta
from airflow.utils.dates import days_ago
from deps import utils
from google.cloud import storage
from airflow.exceptions import AirflowException
from deps import logger_montr
from deps import dataflow_clean_csv

dag = DAG(dag_id='clean_data_file',
          default_args=args,
          description='Runs Dataflow to clean csv files',
          schedule_interval=None)

def get_values_from_previous_dag(**context):
    var_dict = {}
    for key, val in context['dag_run'].conf.items():
        context['ti'].xcom_push(key, val)
        var_dict[key] = val

populate_ti_xcom = PythonOperator(
    task_id='get_values_from_previous_dag',
    python_callable=get_values_from_previous_dag,
    provide_context=True,
    dag=dag,
)

dataflow_clean_csv = PythonOperator(
    task_id="dataflow_clean_csv",
    python_callable=dataflow_clean_csv.clean_csv_dataflow,
    op_kwargs={
        'project':
        'zone':
        'region':
        'stagingLocation':
        'inputDirectory':
        'filename':
        'outputDirectory':
    },
    provide_context=True,
    dag=dag,
)

populate_ti_xcom >> dataflow_clean_csv
I use the ti.xcom_pull(task_ids = 'get_values_from_previous_dag') method to assign the op_kwargs.
This is the Dataflow job that is being called:
import apache_beam as beam
import csv
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText

def parse_file(element):
    for line in csv.reader([element], quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL):
        line = [s.replace('\"', '') for s in line]
        clean_line = '","'.join(line)
        final_line = '"' + clean_line + '"'
        return final_line

def clean_csv_dataflow(**kwargs):
    argv = [
        # Dataflow pipeline options
        "--region={}".format(kwargs["region"]),
        "--project={}".format(kwargs["project"]),
        "--temp_location={}".format(kwargs["stagingLocation"]),
        # Setting Dataflow pipeline options
        '--save_main_session',
        '--max_num_workers=8',
        '--autoscaling_algorithm=THROUGHPUT_BASED',
        # Mandatory constants
        '--job_name=cleancsvdataflow',
        '--runner=DataflowRunner'
    ]
    options = PipelineOptions(flags=argv)
    pipeline = beam.Pipeline(options=options)

    inputDirectory = kwargs["inputDirectory"]
    filename = kwargs["filename"]
    outputDirectory = kwargs["outputDirectory"]
    outputfile_temp = filename
    outputfile_temp = outputfile_temp.split(".")
    outputfile = "_CLEANED.".join(outputfile_temp)
    in_path_and_filename = "{}{}".format(inputDirectory, filename)
    out_path_and_filename = "{}{}".format(outputDirectory, outputfile)

    pipeline = beam.Pipeline(options=options)
    clean_csv = (pipeline
                 | "Read input file" >> beam.io.ReadFromText(in_path_and_filename)
                 | "Parse file" >> beam.Map(parse_file)
                 | "writecsv" >> beam.io.WriteToText(out_path_and_filename, num_shards=1)
                 )
    pipeline.run()
This answer was provided by @BSpinoza in the comment section:
What I did was move all imports from the global namespace and place
them into the function definitions. Then, from the calling DAG I used
the BashOperator. It worked.
Also, one of the recommended ways is to use the DataFlowPythonOperator.
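A minimal sketch of that workaround, under the stated assumptions (the module name, file path, and argument values below are placeholders, not the asker's real ones): the Beam imports live inside the function rather than at module level, so the pickled main session no longer references airflow, and the DAG launches the script through a BashOperator instead of a PythonOperator:

# dataflow_clean_csv_standalone.py - hypothetical standalone version of the pipeline.
def clean_csv_dataflow(project, region, staging_location, input_path, output_path):
    # Imports kept out of the global namespace, per the workaround above.
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    options = PipelineOptions(flags=[
        "--project={}".format(project),
        "--region={}".format(region),
        "--temp_location={}".format(staging_location),
        "--job_name=cleancsvdataflow",
        "--runner=DataflowRunner",
        "--save_main_session",
    ])
    with beam.Pipeline(options=options) as pipeline:
        (pipeline
         | "Read input file" >> beam.io.ReadFromText(input_path)
         | "writecsv" >> beam.io.WriteToText(output_path, num_shards=1))

if __name__ == "__main__":
    # Placeholder values; in practice these would come from sys.argv or argparse.
    clean_csv_dataflow("my-project", "us-central1",
                       "gs://my-bucket/staging",
                       "gs://my-bucket/input/file.csv",
                       "gs://my-bucket/output/file_CLEANED.csv")

In the calling DAG, the PythonOperator task could then be replaced with something like:

dataflow_clean_csv = BashOperator(
    task_id="dataflow_clean_csv",
    bash_command="python /home/airflow/gcs/dags/deps/dataflow_clean_csv_standalone.py",  # hypothetical path
    dag=dag,
)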
