How to run a local script with Airflow running in Docker? - python

I set up Airflow in Docker, pointing the container at local folders where I keep my DAGs (real file path: "C:\Users\Rod\airflow-docker").
So far so good. I can run my DAGs without any problems.
The problem is when I try to run a script via a BashOperator task; it returns an error. What am I doing wrong?
The error:
Broken DAG: [/opt/airflow/dags/invetory_sap.py] Traceback (most recent call last):
  File "", line 219, in _call_with_frames_removed
  File "/opt/airflow/dags/invetory_sap.py", line 34, in etl_invetory_sap
NameError: name 'etl_invetory_sap' is not defined
The DAG:
from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.python import PythonOperator
from airflow.operators.bash import BashOperator
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())

args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG("invetory_sap",
         default_args=args,
         schedule_interval='30 * * * *',
         dagrun_timeout=timedelta(minutes=60),
         catchup=False) as dag:

    etl_inventory_sap = BashOperator(
        task_id='etl_invetory_sap',
        bash_command='python /opt/airflow/plugins/ler_txt_convert_todataframe_v5.py'
    )

    etl_invetory_sap

Spelling error. You declared the task variable as "etl_inventory_sap" and then wrote "etl_invetory_sap" on the last line. Put back the n and you should be fine.
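In other words, the final reference should use the name that was actually declared:

    etl_inventory_sap = BashOperator(
        task_id='etl_invetory_sap',
        bash_command='python /opt/airflow/plugins/ler_txt_convert_todataframe_v5.py'
    )

    etl_inventory_sap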

Related

Writing an Airflow 2 DAG

I have been on Airflow 1.10.14 for a long time, and now I'm trying to upgrade to Airflow 2.4.3 (the latest?). I have built this DAG in the new format in hopes of assimilating the language and understanding how the new format works. Below is my DAG:
from airflow.decorators import dag, task
from airflow.models import Variable
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from airflow.providers.microsoft.mssql.operators.mssql import MsSqlOperator
from airflow.operators.bash import BashOperator
from datetime import datetime
import glob

path = '~/airflow/staging/gcs/offrs2/'

clear_Staging_Folders = """
rm -rf {}OFFRS2/LEADS*.*
""".format(Variable.get("temp_directory"))

@dag(
    schedule_interval='@daily',
    start_date=datetime(2022, 11, 1),
    catchup=False,
    tags=['offrs2', 'LEADS']
)
def taskflow():
    CLEAR_STAGING = BashOperator(
        task_id='Clear_Folders',
        bash_command=clear_Staging_Folders,
        dag=dag,
    )

    BQ_Output = BigQueryInsertJobOperator(
        task_id='BQ_Output',
        configuration={
            "query": {
                "query": '~/airflow/sql/Leads/Leads_Export.sql',
                "useLegacySql": False
            }
        }
    )

    Prep_MSSQL = MsSqlOperator(
        task_id='Prep_DB3_Table',
        mssql_conn_id='db.offrs.com',
        sql='truncate table offrs_staging..LEADS;'
    )

    @task
    def Load_Staging_Table():
        for files in glob.glob(path + 'LEADS*.csv'):
            print(files)

    CLEAR_STAGING >> BQ_Output >> Load_Staging_Table()

dag = taskflow()
When I send this up, I'm getting the below error:
Broken DAG: [/home/airflow/airflow/dags/BQ_OFFRS2_Leads.py] Traceback (most recent call last):
  File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/baseoperator.py", line 376, in apply_defaults
    task_group = TaskGroupContext.get_current_task_group(dag)
  File "/home/airflow/.local/lib/python3.10/site-packages/airflow/utils/task_group.py", line 490, in get_current_task_group
    return dag.task_group
AttributeError: 'function' object has no attribute 'task_group'
As I look at my code, I don't have a specified task_group.
Where am I going wrong here?
Thank you!
You forgot to remove the dag=dag argument in CLEAR_STAGING. Inside the @dag-decorated function, the name dag refers to the decorator you imported (a function), which is why the error complains that a 'function' object has no attribute 'task_group'. When you use the decorator, remove dag=dag:
CLEAR_STAGING = BashOperator(
    task_id='Clear_Folders',
    bash_command=clear_Staging_Folders,
    # dag=dag  <== Remove this
)

Problems connecting Redshift to Airflow (MWAA)

I am learning Airflow and, as a practice exercise, I'm trying to create a table in Redshift through an Airflow DAG on MWAA. I created the connection to Redshift in the UI (specifying host, port, etc.) and ran the following DAG, but it fails at the "sql_query" task. Any idea how I can solve this problem or what could be causing it?
Script:
import os
from datetime import timedelta

from airflow import DAG
from airflow.models import Variable
from airflow.models.baseoperator import chain
from airflow.operators.dummy import DummyOperator
from airflow.providers.amazon.aws.operators.redshift import RedshiftSQLOperator
from airflow.utils.dates import days_ago

DEFAULT_ARGS = {
    "owner": "username",
    "depends_on_past": False,
    "retries": 0,
    "email_on_failure": False,
    "email_on_retry": False,
    "redshift_conn_id": "redshift_default",
}

with DAG(
    dag_id="new_table_dag",
    description="",
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(minutes=15),
    start_date=days_ago(1),
    schedule_interval=None,
    tags=[""],
) as dag:
    begin = DummyOperator(task_id="begin")
    end = DummyOperator(task_id="end")

    sql_query = RedshiftSQLOperator(
        task_id="sql_query",
        sql="CREATE TABLE schema_name.table_a AS (SELECT * FROM table_b)")

    chain(begin, sql_query, end)
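One thing worth ruling out is whether the connection ID is actually reaching the operator. A minimal variant that passes redshift_conn_id directly to the operator instead of through default_args (a sketch, assuming the connection really was saved as redshift_default in the MWAA UI and that schema_name and table_b exist):

    sql_query = RedshiftSQLOperator(
        task_id="sql_query",
        redshift_conn_id="redshift_default",  # explicit, instead of via default_args
        sql="CREATE TABLE schema_name.table_a AS (SELECT * FROM table_b)",
    )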

ImportError: cannot import name 'DAG' from 'airflow' (unknown location)

I've installed Airflow on Docker and I'm trying to create my first DAG, but when I use from airflow import DAG and try to execute it, I get an import error. The file isn't named airflow.py, to avoid import problems. Also, I can't import from airflow.operators.python_operator import PythonOperator; it says that airflow.operators.python_operator could not be resolved.
Here's the code that i've used to create my first DAG:
import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'eike',
    'depends_on_past': False,
    'start-date': airflow.utils.dates.days_ago(2),
    'email': ['eike@gmail.com.br'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=3),
}

dag = DAG(
    'anonimização',
    default_args=default_args,
    description='Realização da anonimzação do banco de dados propesq',
    schedule_interval=timedelta(None),
    catchup=False,
)
[Screenshot: code of the DAG in VS Code]
[Screenshot: Airflow home page with DAG import error]
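If the container is running Airflow 2, note that the operator import path has moved; a minimal sketch of the Airflow 2 style imports for the code above (the old airflow.operators.python_operator path is only a deprecated Airflow 1 compatibility shim, and the snippet above also uses timedelta without importing it):

from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.python import PythonOperator  # Airflow 2 import path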

Poke for files with a specified extension in the server directory using the Airflow SFTPSensor

My use case is quite simple:
When a file is dropped in the FTP server directory, the SFTPSensor task should pick up files with the specified txt extension and process the file content.
path="/test_dir/sample.txt" — this case is working.
My requirement is to read dynamic filenames with only the specified extension (text files).
path="/test_dir/*.txt" — in this case, file poking is not working.
# Sample Code
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.sftp.sensors.sftp import SFTPSensor
from airflow.providers.ssh.hooks.ssh import SSHHook
from datetime import datetime

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2022, 4, 16)
}

with DAG(
    'sftp_sensor_test',
    schedule_interval=None,
    default_args=default_args
) as dag:
    waiting_for_file = SFTPSensor(
        task_id="check_for_file",
        sftp_conn_id="sftp_default",
        path="/test_dir/*.txt",  # NOTE: Poking for the txt extension files
        mode="reschedule",
        poke_interval=30
    )

    waiting_for_file
To achieve what you want, I think you should use the file_pattern argument as follows:
waiting_for_file = SFTPSensor(
    task_id="check_for_file",
    sftp_conn_id="sftp_default",
    path="test_dir",
    file_pattern="*.txt",
    mode="reschedule",
    poke_interval=30
)
However, there is currently a bug for this feature → https://github.com/apache/airflow/issues/28121
Until this gets solved, you can easily create a local fixed version of the sensor in your project, following the issue's explanations.
Here is the file with the current fix: https://github.com/RishuGuru/airflow/blob/ac0457a51b885459bc5ae527878a50feb5dcadfa/airflow/providers/sftp/sensors/sftp.py
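If you take that route, the patched class can live next to your DAGs and be imported in place of the provider's sensor; the module name below is just an illustration (sftp_sensor_fixed.py would be your local copy of the provider file with the fix applied):

from sftp_sensor_fixed import SFTPSensor  # local fixed copy, hypothetical module name

waiting_for_file = SFTPSensor(
    task_id="check_for_file",
    sftp_conn_id="sftp_default",
    path="test_dir",
    file_pattern="*.txt",
    mode="reschedule",
    poke_interval=30
)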

Airflow is unable to import a custom Python package

I want to call a script from a custom Python project through Airflow.
My directory structure is:
/home/user/
├── airflow/
│   ├── dags/
│   │   └── my_dag.py
│   └── .venv_airflow/  (virtual environment for airflow)
└── my_project/
    ├── .venv/  (virtual environment for my_project)
    └── folderA/
        ├── __init__.py
        └── folderB/
            ├── call_me.py  (has a line "from my_project.folderA.folderB import import_me")
            └── import_me.py
My dag file looks like:
from airflow import DAG
import datetime as dt
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'arpita',
    'start_date': dt.datetime(2019, 11, 20),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
    'depends_on_past': False,
    'email': ['example@abc.com'],
    'email_on_failure': True,
    'email_on_retry': True,
}

with DAG('sample',
         default_args=default_args,
         schedule_interval='30 * * * *',
         ) as dag:

    enter_project = BashOperator(task_id='enter_project',
                                 bash_command='cd /home/user/my_project',
                                 retries=2)

    setup_environment = BashOperator(task_id='setup_environment',
                                     bash_command='source /home/user/my_project/.venv/bin/activate',
                                     retries=2)

    call_script = BashOperator(task_id='call_script',
                               bash_command='python -m my_project.folderA.folderB.call_me',
                               retries=2)

    enter_project >> setup_environment >> call_script
But I am getting this error
[2019-11-22 11:56:49,311] {bash_operator.py:115} INFO - Running command: python -m my_project.folderA.folderB.call_me
[2019-11-22 11:56:49,315] {bash_operator.py:124} INFO - Output:
[2019-11-22 11:56:49,349] {bash_operator.py:128} INFO - /home/user/airflow/.venv/bin/python: Error while finding spec for 'my_project.folderA.folderB.call_me' (ImportError: No module named 'my_project')
The project and the script work outside Airflow. Inside Airflow, it imports other packages like pandas and tensorflow, but not my custom packages. I tried inserting the path with sys.path.insert, but that is not working. Thank you for reading :)
Your bash commands run in three separate BashOperator tasks, and each task runs in its own shell (possibly even on a different worker), so the cd and source from the first two tasks have no effect on call_script. Run all three commands in one task:
call_script = BashOperator(
    task_id='call_script',
    bash_command='cd /home/user/my_project;'
                 'source /home/user/my_project/.venv/bin/activate;'
                 'python -m my_project.folderA.folderB.call_me',
    retries=2)
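A variant of the same idea (not from the original answer) joins the commands with && instead of ;, so the task fails immediately if the cd or the activate step fails rather than running the next command anyway:

call_script = BashOperator(
    task_id='call_script',
    bash_command=(
        'cd /home/user/my_project && '
        'source /home/user/my_project/.venv/bin/activate && '
        'python -m my_project.folderA.folderB.call_me'
    ),
    retries=2)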
