I am not sure if this is normal behavior, but I am trying to run a gsutil command from a python_callable in a BranchPythonOperator. The command works fine when I run it explicitly in my terminal with hardcoded GCS paths, but once I run it within my DAG using {{ds_nodash}} and {{run_id}} (Airflow macros), Airflow does not interpret them, as you can see in the logs below.
Here is the code within my DAG definition
with DAG("DAG_NAME", default_args=default_args, schedule_interval="#hourly", catchup=False) as dag:
# Buckets
airflow_bucket = "XXXXX" # Hidden on purpose
archive_bucket = "YYYYY" # Hidden on purpose
# Paths
raw_data_path = "raw_data/tc_export/raw/{{ds_nodash}}/{{run_id}}/*"
airflow_local_dir = "/home/airflow/gcs/data/tc_data/"
# SFTP & dirs
sftp_key = "KEY" # Hidden on purpose
sftp_remote_directory_root = '/data/from_tc/'
op_check_if_files_in_sftp = BranchPythonOperator(
task_id='check_if_files_in_sftp',
provide_context=True,
python_callable=check_if_files_in_sftp,
op_kwargs={'remote_directory_root': sftp_remote_directory_root},
templates_dict={"sftp_key": sftp_key})
op_check_if_files_in_bucket = BranchPythonOperator(
task_id='check_if_files_in_bucket',
provide_context=True,
python_callable=check_if_files_in_bucket,
op_kwargs={'bucket': archive_bucket, 'subdir': raw_data_path})
And here is the function that executes the gsutil command:
def check_if_files_in_bucket(bucket: str, subdir: str, **kwargs) -> str:
"""
Check if files already exist in the archives' bucket.
:param bucket: bucket in which to search
:param subdir: directory within the bucket
:param kwargs: additional context parameters.
:return: id of the next DAG operator
"""
try:
logging.info(f"Executing command : gsutil -q stat gs://{bucket}/{subdir}")
command = subprocess.run(["gsutil", "-q", "stat", f"gs://{bucket}/{subdir}"])
if command.returncode:
logging.info(f"Command return code : {command.returncode}. Ending process.")
return "end_process"
logging.info(f"There are files within the {bucket}/{subdir}. Proceeding with the next step.")
return "transfer_to_other_bucket"
except OSError as os_err:
logging.exception(os_err)
exit(1)
except ValueError as val_err:
logging.exception(val_err)
exit(1)
So my questions are:
When does Airflow interpret the Macros?
How do I fix this?
The problem here is that the templated value is never passed through a templated field. Airflow renders macros only when the task instance actually runs (right before execute is called), and only for the fields listed in the operator's template_fields. For the PythonOperator, and therefore the BranchPythonOperator, templates_dict is such a field, while op_kwargs was not templated in the Airflow version used here, so the rendered path has to be passed through templates_dict and read from kwargs["templates_dict"] inside the callable. Here is the corrected code:
op_check_if_files_in_bucket = BranchPythonOperator(task_id='check_if_files_in_bucket',
provide_context=True,
python_callable=check_if_files_in_bucket,
op_kwargs={'bucket': archive_bucket},
templates_dict={'subdir': raw_data_path})
And the python_callable function:
def check_if_files_in_bucket(bucket: str, **kwargs) -> str:
"""
Check if files already exist in the archives' bucket.
:param bucket: bucket in which to search
:param kwargs: additional context parameters; the subdirectory in the bucket is passed via templates_dict.
:return: id of the next task
"""
try:
subdir = kwargs["templates_dict"]["subdir"]
cmd_check_files = ["gsutil", "-q", "stat", f"gs://{bucket}/{subdir}"]
logging.info(f"Executing command : {' '.join(cmd_check_files)}")
command = subprocess.run(cmd_check_files)
if command.returncode:
logging.info(f"Command return code : {command.returncode}. Ending process.")
return "end_process"
logging.info(f"There are files within the {bucket}/{subdir}. Proceeding with the next step.")
return "transfer_to_other_bucket"
except OSError as os_err:
logging.exception(os_err)
exit(1)
except ValueError as val_err:
logging.exception(val_err)
exit(1)
N.B.: Since BranchPythonOperator extends PythonOperator, the same rule applies.
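As a side note, the same branch can be implemented without shelling out to gsutil at all. Below is a minimal, untested sketch assuming the google-cloud-storage client library is available on the Airflow workers; it keeps the templates_dict pattern and simply lists objects under the rendered prefix:
import logging

from google.cloud import storage  # assumes the google-cloud-storage package is installed on the workers


def check_if_files_in_bucket(bucket: str, **kwargs) -> str:
    """Branch depending on whether any object exists under the rendered prefix."""
    # The rendered path still arrives through templates_dict, as in the corrected code above.
    # Drop the trailing gsutil-style wildcard: list_blobs works on a plain prefix.
    subdir = kwargs["templates_dict"]["subdir"].rstrip("*")
    client = storage.Client()
    blobs = client.list_blobs(bucket, prefix=subdir, max_results=1)
    if any(True for _ in blobs):
        logging.info(f"There are files within {bucket}/{subdir}. Proceeding with the next step.")
        return "transfer_to_other_bucket"
    logging.info(f"No files found under {bucket}/{subdir}. Ending process.")
    return "end_process"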
Related
I have a sensor task that listens to files being created in S3.
After a poke I may have 3 files, after another poke I might have another 5 files.
I want to create a DAG (or multiple DAGs) that listens for work requests and creates other tasks or DAGs to handle that amount of work.
I wish I could access the xcom or dag_run variable from the DAG definition (see the pseudo-code below):
def wait_for_s3_data(ti, **kwargs):
s3_wrapper = S3Wrapper()
work_load = s3_wrapper.work()
# work_load: {"filename1.json": "s3/key/filename1.json", ....}
ti.xcom_push(key="work_load", value=work_load)
return len(work_load) > 0
def get_work(self, dag_run, ti, **_):
s3_wrapper = S3Wrapper()
work_load = ti.xcom_pull(key="work_load")
dag_run.conf['work_load'] = work_load
s3_wrapper.move_messages_from_waiting_to_processing(work_load)
with DAG(
"ListenAndCallWorkers",
description="This DAG waits for work request from s3",
schedule_interval="#once",
max_active_runs=1,
) as dag:
wait_for_s3_data: PythonSensor = PythonSensor(
task_id="wait_for_s3_data",
python_callable=wait_for_s3_data,
timeout=60,
poke_interval=30,
retries=2,
mode="reschedule",
)
get_data_task = PythonOperator(
task_id="GetData",
python_callable=query.get_work,
provide_context=True,
)
work_load = "{{ dag_run.conf['work_load'] }}" # <--- I WISH I COULD DO THIS
do_work_tasks = [
TriggerDagRunOperator(
task_id=f"TriggerDoWork_{work}",
trigger_dag_id="Work", # Ensure this equals the dag_id of the DAG to trigger
conf={"work":keypath},
)
for work, keypath in work_load.items()
]
wait_for_s3_data >> get_data_task >> do_work_tasks
I know I cannot do that.
I also tried to define my own custom MultiTriggerDAG object (as in this https://stackoverflow.com/a/51790697/1494511). But at that step I still don't have access to the amount of work that needs to be done.
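For reference, as far as I understand it, that kind of operator boils down to triggering the worker DAG programmatically at runtime, once the XCom is available. A rough, untested sketch (the import path shown is the Airflow 2.x one; in 1.10 the function lives under airflow.api.common.experimental.trigger_dag, and the run_id format here is my own):
from airflow.api.common.trigger_dag import trigger_dag  # Airflow 2.x module path
from airflow.utils import timezone


def trigger_do_work(ti, **_):
    # Pull the mapping pushed by wait_for_s3_data and start one "Work" DAG run per file.
    work_load = ti.xcom_pull(key="work_load") or {}
    for work, keypath in work_load.items():
        trigger_dag(
            dag_id="Work",
            run_id=f"work_{work}_{timezone.utcnow().isoformat()}",
            conf={"work": keypath},
        )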
Another idea:
I am considering building a DAG with N doWork tasks and passing up to N pieces of work to them via XCom:
def get_work(self, dag_run, ti, **_):
s3_wrapper = S3Wrapper()
work_load = ti.xcom_pull(key="work_load")
i = 1
for work, keypath in work_load.items():
dag_run.conf[f'work_{i}'] = keypath
i += 1
if i > N:
break
s3_wrapper.move_messages_from_waiting_to_processing(dict(list(work_load.items())[:N]))  # a dict can't be sliced directly
This idea would get the job done, but it sounds very inefficient. A sketch of what I mean is below.
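For completeness, a rough sketch of the consumer side of that idea: N pre-declared doWork tasks, each pulling its own slot and skipping when nothing was assigned. It assumes Airflow 2 (context passed automatically) and that get_work pushes each slot with ti.xcom_push(key=f"work_{i}", value=keypath) rather than mutating dag_run.conf (which is not guaranteed to stick); N, the task ids, and the processing body are placeholders of mine:
from airflow.exceptions import AirflowSkipException
from airflow.operators.python import PythonOperator

N = 10  # fixed upper bound on parallel work slots (placeholder)


def do_work(slot, ti, **_):
    # Each slot task pulls the keypath that get_work assigned to it, if any.
    keypath = ti.xcom_pull(task_ids="GetData", key=f"work_{slot}")
    if not keypath:
        raise AirflowSkipException(f"No work assigned to slot {slot}")
    # ... process the file at `keypath` ...


do_work_tasks = [
    PythonOperator(task_id=f"DoWork_{i}", python_callable=do_work, op_kwargs={"slot": i})
    for i in range(1, N + 1)
]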
Related questions:
This is the same question as I have, but no code is presented on how to solve it:
Airflow: Proper way to run DAG for each file
This answer looks like it would solve the problem, but it seems to be related to Airflow versions lower than 2.2.2
How do we trigger multiple airflow dags using TriggerDagRunOperator?
I have a dag called my_dag.py that utilizes the S3KeySensor in Airflow 2 to check if an S3 key exists. When I use the sensor directly inside the dag, it works:
with TaskGroup('check_exists') as check_exists:
path = 's3://my-bucket/data/my_file'
poke_interval = 30
timeout = 60*60
mode = 'reschedule'
dependency_name = 'my_file'
S3KeySensor(
task_id = 'check_' + dependency_name + '_exists',
bucket_key = path,
poke_interval = poke_interval,
timeout = timeout,
mode = mode
)
The log of the above looks like:
[2022-05-03, 19:51:26 UTC] {s3.py:105} INFO - Poking for key : s3://my-bucket/data/my_file
[2022-05-03, 19:51:26 UTC] {base_aws.py:90} INFO - Retrieving region_name from Connection.extra_config['region_name']
[2022-05-03, 19:51:27 UTC] {taskinstance.py:1701} INFO - Rescheduling task, marking task as UP_FOR_RESCHEDULE
This is correct. The reschedule is expected, because the file does not exist yet.
However, I want to check any number of paths in other dags, so I moved the sensor into a function called test in another file called helpers.py. I use a python operator in my_dag.py within the task group that calls test. It looks like this:
with TaskGroup('check_exists') as check_exists:
path = 's3://my-bucket/data/my_file'
dependency_name = 'my_file'
wait_for_dependencies = PythonOperator(
task_id = 'wait_for_my_file',
python_callable = test,
op_kwargs = {
'dependency_name': dependency_name,
'path': path
},
dag = dag
)
wait_for_dependencies
The function test in helpers.py looks like:
def test(dependency_name, path, poke_interval = 30, timeout = 60 * 60, mode = 'reschedule'):
S3KeySensor(
task_id = 'check_' + dependency_name + '_exists',
bucket_key = path,
poke_interval = poke_interval,
timeout = timeout,
mode = mode
)
However, when I run the dag, the step is marked as success even though the file is not there. The logs show:
[2022-05-03, 20:07:54 UTC] {python.py:175} INFO - Done. Returned value was: None
[2022-05-03, 20:07:54 UTC] {taskinstance.py:1282} INFO - Marking task as SUCCESS.
It seems airflow doesn't like using a sensor via a python operator. Is this true? Or am I doing something wrong?
My goal is to loop through multiple paths and check if each one exists. However, I do this in other dags, which is why I'm putting the sensor in a function that resides in another file.
If there are alternative ideas to doing this, I'm open!
Thanks for your help!
This will not work as you expect.
You have created a case of an operator inside an operator. See this answer for information about what that means.
In your case you wrapped the S3KeySensor with a PythonOperator. This means that when the PythonOperator runs, it only executes the __init__ function of S3KeySensor; it doesn't invoke the logic of the operator itself.
Using an operator inside an operator is bad practice.
Your case is even more extreme, as you are trying to use a sensor inside an operator. Sensors need to invoke the poke() function on every poking cycle.
To simplify: you cannot get the benefit of a sensor with mode='reschedule' when you set it up this way, because reschedule means you want to release the worker slot while the condition is not yet met, and the PythonOperator doesn't know how to do that.
How to solve your issue:
Option 1:
From the code you showed you can simply do:
with TaskGroup('check_exists') as check_exists:
path = 's3://my-bucket/data/my_file'
dependency_name = 'my_file'
S3KeySensor(
task_id='check_' + dependency_name + '_exists',
bucket_key=path,
poke_interval=30,
timeout=60 * 60,
mode='reschedule'
)
I don't see a reason why this can't work for you.
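If the reason for helpers.py was reusability across DAGs, you can keep that: just make the helper return the sensor so it is instantiated at DAG-parse time, instead of constructing it inside a PythonOperator. A minimal, untested sketch, assuming Airflow 2 with the Amazon provider installed (the exact import module, s3 vs s3_key, depends on the provider version):
# helpers.py
from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor  # may be ...sensors.s3_key in older provider versions


def make_s3_key_sensor(dependency_name, path, poke_interval=30, timeout=60 * 60, mode='reschedule'):
    # Return the sensor; the calling DAG file (or TaskGroup) picks it up as a real task.
    return S3KeySensor(
        task_id='check_' + dependency_name + '_exists',
        bucket_key=path,
        poke_interval=poke_interval,
        timeout=timeout,
        mode=mode,
    )
Calling make_s3_key_sensor(...) for each path inside the check_exists TaskGroup then creates one real sensor task per path, each of which can poke and reschedule on its own.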
Option 2:
If for some reason option 1 is not good for you, then create a custom sensor that also accepts dependency_name and path, and use it like any other operator.
I didn't test it but something like the following should work:
class MyS3KeySensor(S3KeySensor):
    def __init__(
        self,
        *,
        dependency_name: str = None,
        path: str = None,
        **kwargs,
    ):
        # Derive task_id and bucket_key from the custom arguments and pass them to the
        # parent constructor, which requires both.
        super().__init__(
            task_id='check_' + dependency_name + '_exists',
            bucket_key=path,
            **kwargs,
        )
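And an untested sketch of how that custom sensor could then be used to loop over several paths (the paths mapping is a placeholder of mine):
from airflow.utils.task_group import TaskGroup

paths = {'my_file': 's3://my-bucket/data/my_file'}  # placeholder: dependency_name -> S3 key

with TaskGroup('check_exists') as check_exists:
    for dependency_name, path in paths.items():
        MyS3KeySensor(
            dependency_name=dependency_name,
            path=path,
            poke_interval=30,
            timeout=60 * 60,
            mode='reschedule',
        )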
I'm having issues with the Airflow 1.10 BranchPythonOperator. I have a dag that scans a cloud bucket and processes files if found. If the file is missing it hits the no_file_found dummy operator and completes; otherwise it moves forward to some parsing steps.
With a single file this workflow works great. My issue arises when I add the same logic for a second file. Currently check_for_Post_Performance returns the clean_headers_for_GCM task, and I'm at a total loss how that happens. From the outline below it should have only two paths forward: clean_headers_for_Post_Performance or no_file_found.
I create these tasks dynamically from a list of file names. I loop through each filename and build the following operators:
def build_check(filename):
return BranchPythonOperator(
task_id=f'check_for_{file_name}'.replace(' ', '_'),
python_callable=check_file_exists,
op_kwargs={'filename': filename},
provide_context=True,
dag=dag
)
def check_file_exists(filename, **context):
xcom_value = context['ti'].xcom_pull(task_ids=f'list_files')
if any(filename in s for s in xcom_value):
return f'clean_headers_for_{file_name}'.replace(' ', '_')
else:
return 'no_file_found'
I've checked the rendered task template to confirm 'Post Performance' is passed for the filename variable, but when looking at the logs I see the following:
[2021-12-02 20:15:56,742] {logging_mixin.py:120} INFO - Running <TaskInstance: example_dag.check_for_Post_Performance 2021-12-02T20:14:50.724084+00:00 [running]> on host 21d0393eb686
[2021-12-02 20:15:56,766] {python_operator.py:114} INFO - Done. Returned value was: clean_headers_for_GCM
[2021-12-02 20:15:56,767] {skipmixin.py:122} INFO - Following branch clean_headers_for_GCM
[2021-12-02 20:15:56,773] {skipmixin.py:158} INFO - Skipping tasks ['no_file_found', 'clean_headers_for_Post_Performance']
My best guess is that the function isn't re-created on each loop iteration the way I think it is, or some trigger rule is tripping me up. How can I have each file in my source list reach either no_file_found or its clean_headers task independently of the others?
EDIT
Here is the code I use to build the tasks from a static list:
for file_name, table_name in FILES().items():
import_to_bq = import_file(file_name, table_name)
clean_headers_task = clean_headers(file_name)
start_import >> list_files >> build_check(file_name) >> [clean_headers_task, no_file]
clean_headers_task >> import_to_bq >> archive_file(file_name)
Perhaps it's the difference between file_name and filename? Looks like the task IDs use file_name while the arg is filename. Should these functions both use filename? That would also explain the log above: check_file_exists only runs at task-execution time, so its f-string resolves the module-level loop variable file_name, which by then holds the last value from the loop (GCM), whereas build_check reads the loop variable while the loop is still running, so the task ids come out right.
def build_check(filename):
return BranchPythonOperator(
task_id=f'check_for_{filename}'.replace(' ', '_'),
python_callable=check_file_exists,
op_kwargs={'filename': filename},
provide_context=True,
dag=dag
)
def check_file_exists(filename, **context):
xcom_value = context['ti'].xcom_pull(task_ids=f'list_files')
if any(filename in s for s in xcom_value):
return f'clean_headers_for_{filename}'.replace(' ', '_')
else:
return 'no_file_found'
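An alternative that sidesteps the runtime name lookup entirely is to compute the downstream task id once, when the check task is built, and hand it to the callable through op_kwargs; a sketch based on the code above:
def build_check(filename):
    clean_task_id = f'clean_headers_for_{filename}'.replace(' ', '_')
    return BranchPythonOperator(
        task_id=f'check_for_{filename}'.replace(' ', '_'),
        python_callable=check_file_exists,
        op_kwargs={'filename': filename, 'clean_task_id': clean_task_id},
        provide_context=True,
        dag=dag
    )


def check_file_exists(filename, clean_task_id, **context):
    xcom_value = context['ti'].xcom_pull(task_ids='list_files')
    if any(filename in s for s in xcom_value):
        return clean_task_id
    return 'no_file_found'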
I use the AzureML SDK for Python to define a Run and assign log parameters as shown below.
run = Run.get_context()
run.parent.log("param1", 25)
run.parent.log("param2", 100)
run.parent.log("param3", 10)
run.parent.log("param4", 40)
The problem is that I can only see param1 and param2 in Machine Learning Service Workspace. Is there any limitation on the number of variables?
The short answer is NO. I reviewed the source code of azureml-core, which I obtained by decompressing the azureml_core-1.0.85-py2.py3-none-any.whl file.
The key source code is here:
# azureml_core-1.0.85-py2.py3-none-any.whl\azureml\core\run.py
class Run(_RunBase):
.......
@classmethod
def get_context(cls, allow_offline=True, used_for_context_manager=False, **kwargs):
"""Return current service context.
Use this method to retrieve the current service context for logging metrics and uploading files. If
``allow_offline`` is True (the default), actions against the Run object will be printed to standard
out.
.. remarks::
This function is commonly used to retrieve the authenticated Run object
inside of a script to be submitted for execution via experiment.submit(). This run object is both
an authenticated context to communicate with Azure Machine Learning services and a conceptual container
within which metrics, files (artifacts), and models are contained.
.. code-block:: python
run = Run.get_context() # allow_offline=True by default, so can be run locally as well
...
run.log("Accuracy", 0.98)
run.log_row("Performance", epoch=e, error=err)
:param cls: Indicates class method.
:param allow_offline: Allow the service context to fall back to offline mode so that the training script
can be tested locally without submitting a job with the SDK. True by default.
:type allow_offline: bool
:param kwargs: A dictionary of additional parameters.
:type kwargs: dict
:return: The submitted run.
:rtype: azureml.core.run.Run
"""
try:
experiment, run_id = cls._load_scope()
# Querying for the run instead of initializing to load current state
if used_for_context_manager:
return _SubmittedRun(experiment, run_id, **kwargs)
return _SubmittedRun._get_instance(experiment, run_id, **kwargs)
except RunEnvironmentException as ex:
module_logger.debug("Could not load run context %s, switching offline: %s", ex, allow_offline)
if allow_offline:
module_logger.info("Could not load the run context. Logging offline")
return _OfflineRun(**kwargs)
else:
module_logger.debug("Could not load the run context and allow_offline set to False")
raise RunEnvironmentException(inner_exception=ex)
class _OfflineRun(ChainedIdentity):
def __init__(self, parent_logger=None, run_id=None, **kwargs):
self._run_id = "OfflineRun_{}".format(uuid4()) if run_id is None else run_id
super(_OfflineRun, self).__init__(
_ident=self._run_id,
_parent_logger=parent_logger if parent_logger is not None else module_logger)
....
def log(self, name, value, description=""):
self._emit("scalar", name, value)
....
def _emit(self, type, name, value):
print("Attempted to log {0} metric {1}:\n{2}".format(type, name, value))
The run object returned by the get_context function is running in offline mode, so its log function is effectively just an alias of print.
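Based on the _OfflineRun snippet above, a quick way to confirm this from inside the training script is to look at the run id, since offline runs are created with an id of the form "OfflineRun_<uuid>". A small sketch, assuming the offline run exposes .id like a regular Run:
from azureml.core import Run

run = Run.get_context()
print("Run id:", run.id)
if str(run.id).startswith("OfflineRun"):
    # get_context() fell back to offline mode, so run.log / run.parent.log only print to stdout.
    # Submit the script through Experiment.submit() for the metrics to show up in the workspace.
    print("Running offline: logged metrics will not appear in the workspace.")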
How do I retrieve the yarn_application_id from the SparkSubmitHook?
I tried using a custom operator and the task_instance property, but I guess I missed something...
def task_failure_callback(context):
task_instance = context.get('task_instance') # Need to access yarn_application_id here
operator = task_instance.operator
application_id = operator.yarn_application_id
return ...
default_args = {
'start_date': ...,
'on_failure_callback': task_failure_callback
}
with DAG(DAG_ID, default_args=default_args, catchup=CATCHUP, schedule_interval=SCHEDULE_INTERVAL) as dag:
...
So I tried adding it as a new key-value pair in the context dict, but without success...
class CustomSparkSubmitHook(SparkSubmitHook, LoggingMixin):
def __init__(self, ...):
super().__init__(...)
def submit_with_context(self, context, application="", **kwargs):
# Build spark submit cmd
...
# Run cmd as subprocess
...
# Process spark submit log
...
# Check spark-submit return code. In Kubernetes mode, also check the value
# of exit code in the log, as it may differ.
...
# We want the Airflow job to wait until the Spark driver is finished
if self._should_track_driver_status:
if self._driver_id is None:
raise AirflowException(
"No driver id is known: something went wrong when executing " +
"the spark submit command"
)
# We start with the SUBMITTED status as initial status
self._driver_status = "SUBMITTED"
# Trying to export yarn_application_id unsuccessfully
context['yarn_application_id'] = self.yarn_application_id
# Start tracking the driver status (blocking function)
...
@property
def yarn_application_id(self):
return self._yarn_application_id
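For what it's worth, one pattern that avoids mutating the context dict is to push the id to XCom from a thin custom operator and pull it back in the failure callback. This is only a sketch: it assumes the stock SparkSubmitOperator keeps its hook on self._hook and that the hook stores the parsed id in _yarn_application_id (both private, version-dependent details), so adapt the attribute names to your setup:
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator  # Airflow 1.10 import path


class TrackedSparkSubmitOperator(SparkSubmitOperator):
    def execute(self, context):
        try:
            super().execute(context)
        finally:
            # Assumption: the base operator keeps its SparkSubmitHook on self._hook, and the hook
            # records the YARN application id parsed from the spark-submit log on _yarn_application_id.
            yarn_id = getattr(getattr(self, "_hook", None), "_yarn_application_id", None)
            if yarn_id:
                context["ti"].xcom_push(key="yarn_application_id", value=yarn_id)


def task_failure_callback(context):
    ti = context["task_instance"]
    application_id = ti.xcom_pull(task_ids=ti.task_id, key="yarn_application_id")
    # ... use application_id, e.g. to look up or kill the YARN application ...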