How to retrieve the yarn_application_id from the SparkSubmitHook?
I tried using a custom operator and the task_instance property, but I guess I missed something...
def task_failure_callback(context):
    task_instance = context.get('task_instance')  # Need to access yarn_application_id here
    operator = task_instance.operator
    application_id = operator.yarn_application_id
    return ...

default_args = {
    'start_date': ...,
    'on_failure_callback': task_failure_callback
}

with DAG(DAG_ID, default_args=default_args, catchup=CATCHUP, schedule_interval=SCHEDULE_INTERVAL) as dag:
    ...
So I tried adding it as a new key-value pair in the context dict, but without success...
class CustomSparkSubmitHook(SparkSubmitHook, LoggingMixin):
    def __init__(self, ...):
        super().__init__(...)

    def submit_with_context(self, context, application="", **kwargs):
        # Build spark submit cmd
        ...
        # Run cmd as subprocess
        ...
        # Process spark submit log
        ...
        # Check spark-submit return code. In Kubernetes mode, also check the value
        # of exit code in the log, as it may differ.
        ...
        # We want the Airflow job to wait until the Spark driver is finished
        if self._should_track_driver_status:
            if self._driver_id is None:
                raise AirflowException(
                    "No driver id is known: something went wrong when executing " +
                    "the spark submit command"
                )
            # We start with the SUBMITTED status as initial status
            self._driver_status = "SUBMITTED"
            # Trying to export yarn_application_id unsuccessfully
            context['yarn_application_id'] = self.yarn_application_id
            # Start tracking the driver status (blocking function)
            ...

    @property
    def yarn_application_id(self):
        return self._yarn_application_id
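One approach that might work instead (a hedged sketch, not a confirmed solution): push the id to XCom from a custom operator and pull it back in the failure callback. The _hook and _yarn_application_id attributes below are private internals of SparkSubmitOperator/SparkSubmitHook and may differ between Airflow versions, so treat them as assumptions.

# Hedged sketch: expose the YARN application id via XCom instead of the context dict.
# Assumes SparkSubmitOperator keeps its hook on self._hook and that the hook tracks
# the id in the private attribute _yarn_application_id (version-dependent internals).
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
# On Airflow 2.x: from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator


class CustomSparkSubmitOperator(SparkSubmitOperator):
    def execute(self, context):
        try:
            super().execute(context)
        finally:
            # Push whatever id we have, even if the submit ultimately failed.
            app_id = getattr(getattr(self, '_hook', None), '_yarn_application_id', None)
            context['task_instance'].xcom_push(key='yarn_application_id', value=app_id)


def task_failure_callback(context):
    ti = context['task_instance']
    application_id = ti.xcom_pull(task_ids=ti.task_id, key='yarn_application_id')
    # ... application_id may be None if the submit never reached YARN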
Related
I'm trying to customize SSHOperator as CustomSSHOperator because I need to assign dynamic values to the ssh_conn_id and pool fields of SSHOperator. However, these two are not in template_fields, so I've created a custom class like the one below
class CustomSSHOperator(SSHOperator):
    template_fields: Sequence[str] = ('command', 'remote_host', 'ssh_conn_id', 'pool')
    template_fields_renderers = {"command": "bash", "remote_host": "str", "ssh_conn_id": "str", "pool": "str"}

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
And I'm creating the DAG like below
VM_CONN_ID = "vm-{vm_name}"
VM_POOL = "vm-{vm_name}"

with DAG(dag_id="my_dag", tags=["Project", "Team"],
         start_date=datetime(2022, 9, 27), schedule_interval=None,
         ) as dag:
    tasks = []
    vm1_task = CustomSSHOperator(task_id='vm1_task',
                                 # ssh_conn_id='vm-112',
                                 # pool='vm-112',
                                 ssh_conn_id=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm1']}}"),
                                 pool=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm1']}}"),
                                 get_pty=True,
                                 command="d=$(date) && echo $d > my_file.txt"
                                 )
    vm2_task = CustomSSHOperator(task_id='vm2_task',
                                 # ssh_conn_id='vm-140',
                                 # pool='vm-140',
                                 ssh_conn_id=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm2']}}"),
                                 pool=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm2']}}"),
                                 get_pty=True,
                                 command="d=$(date) && echo $d > my_file.txt"
                                 )
Basically, I can see the rendered values in the UI. However, my tasks keep waiting, as shown in the image.
I should also mention that if I change the DAG as below (populating the pool variable with a static value while ssh_conn_id stays dynamic), it works.
VM_CONN_ID = "vm-{vm_name}"
VM_POOL = "vm-{vm_name}"

with DAG(dag_id="my_dag", tags=["Project", "Team"], start_date=datetime(2022, 9, 27), schedule_interval=None,) as dag:
    tasks = []
    vm1_task = CustomSSHOperator(task_id='vm1_task',
                                 # ssh_conn_id='vm-112',
                                 pool='vm-112',
                                 ssh_conn_id=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm1']}}"),
                                 # pool=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm1']}}"),
                                 get_pty=True,
                                 command="d=$(date) && echo $d > my_file.txt"
                                 )
    vm2_task = CustomSSHOperator(task_id='vm2_task',
                                 # ssh_conn_id='vm-140',
                                 pool='vm-140',
                                 ssh_conn_id=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm2']}}"),
                                 # pool=VM_CONN_ID.format(vm_name="{{dag_run.conf['vm2']}}"),
                                 get_pty=True,
                                 command="d=$(date) && echo $d > my_file.txt"
                                 )
dag_run.conf parameter is {"vm1": "112", "vm2": "140"}
I couldn't find the reason. I'd appreciate any suggestions.
Template fields are rendered after the task has been scheduled, while the task pool field is used before the task is scheduled (by the Airflow scheduler itself).
This is the reason why a template cannot be used for the pool field. See also this discussion.
What is happening in your case is that the task remains stuck in the scheduled state because it is associated with a non-existent pool (its name is literally vm-{{dag_run.conf['vm1']}}, i.e. the value before rendering).
You should have evidence of this in the scheduler logs:
Tasks using non-existent pool 'vm-{{dag_run.conf['vm1']}}' will not be scheduled
As proof, you can create a new pool named exactly vm-{{dag_run.conf['vm1']}} and you will see that the task gets executed.
The pool field is only rendered later, which is why you see the expected rendered values in the UI. But that's not what the scheduler saw.
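To verify the diagnosis programmatically, here is a hedged sketch of creating that literal pool (Pool.create_or_update_pool exists in Airflow 2.x, but its exact signature may vary between releases; the CLI equivalent would be airflow pools set with the same literal name):

# Sketch: create a pool whose name is the *unrendered* template, purely to confirm
# that this is what the scheduler is looking for. The helper below exists in
# Airflow 2.x; its exact signature may differ between versions (assumption).
from airflow.models import Pool

Pool.create_or_update_pool(
    name="vm-{{dag_run.conf['vm1']}}",  # the literal, pre-rendering pool name
    slots=1,
    description="Proof that the scheduler uses the unrendered pool name",
)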
I am not sure if this is normal behavior, but I'm trying to run a command using gsutil within a python_callable of a BranchPythonOperator. The command works fine when I run it explicitly in my terminal with hardcoded GCS paths, but once I try to run it within my DAG using {{ds_nodash}} and {{run_id}} (Airflow macros), Airflow does not interpret them, as you can see in the logs below.
Here is the code within my DAG definition
with DAG("DAG_NAME", default_args=default_args, schedule_interval="@hourly", catchup=False) as dag:
    # Buckets
    airflow_bucket = "XXXXX"  # Hidden on purpose
    archive_bucket = "YYYYY"  # Hidden on purpose

    # Paths
    raw_data_path = "raw_data/tc_export/raw/{{ds_nodash}}/{{run_id}}/*"
    airflow_local_dir = "/home/airflow/gcs/data/tc_data/"

    # SFTP & dirs
    sftp_key = "KEY"  # Hidden on purpose
    sftp_remote_directory_root = '/data/from_tc/'

    op_check_if_files_in_sftp = BranchPythonOperator(
        task_id='check_if_files_in_sftp',
        provide_context=True,
        python_callable=check_if_files_in_sftp,
        op_kwargs={'remote_directory_root': sftp_remote_directory_root},
        templates_dict={"sftp_key": sftp_key})

    op_check_if_files_in_bucket = BranchPythonOperator(
        task_id='check_if_files_in_bucket',
        provide_context=True,
        python_callable=check_if_files_in_bucket,
        op_kwargs={'bucket': archive_bucket, 'subdir': raw_data_path})
And here is the function that executes the gsutil command:
def check_if_files_in_bucket(bucket: str, subdir: str, **kwargs) -> str:
    """
    Check if files already exist in the archives' bucket.
    :param bucket: bucket in which to search
    :param subdir: directory within the bucket
    :param kwargs: additional context parameters.
    :return: id of the next DAG operator
    """
    try:
        logging.info(f"Executing command : gsutil -q stat gs://{bucket}/{subdir}")
        command = subprocess.run(["gsutil", "-q", "stat", f"gs://{bucket}/{subdir}"])
        if command.returncode:
            logging.info(f"Command return code : {command.returncode}. Ending process.")
            return "end_process"
        logging.info(f"There are files within the {bucket}/{subdir}. Proceeding with the next step.")
        return "transfer_to_other_bucket"
    except OSError as os_err:
        logging.exception(os_err)
        exit(1)
    except ValueError as val_err:
        logging.exception(val_err)
        exit(1)
So my questions are:
When does Airflow interpret the Macros?
How do I fix this?
The problem here is that the templated value was not passed via the templates_dict argument of the BranchPythonOperator. Here is the corrected code:
op_check_if_files_in_bucket = BranchPythonOperator(task_id='check_if_files_in_bucket',
                                                   provide_context=True,
                                                   python_callable=check_if_files_in_bucket,
                                                   op_kwargs={'bucket': archive_bucket},
                                                   templates_dict={'subdir': raw_data_path})
And the python_callable function:
def check_if_files_in_bucket(bucket: str, **kwargs) -> str:
    """
    Check if files already exist in the archives' bucket.
    :param bucket: bucket in which to search
    :param kwargs: additional context parameters, including the subdirectory in the bucket (via templates_dict).
    :return: id of the next DAG operator
    """
    try:
        subdir = kwargs["templates_dict"]["subdir"]
        cmd_check_files = ["gsutil", "-q", "stat", f"gs://{bucket}/{subdir}"]
        logging.info(f"Executing command : {' '.join(cmd_check_files)}")
        command = subprocess.run(cmd_check_files)
        if command.returncode:
            logging.info(f"Command return code : {command.returncode}. Ending process.")
            return "end_process"
        logging.info(f"There are files within the {bucket}/{subdir}. Proceeding with the next step.")
        return "transfer_to_other_bucket"
    except OSError as os_err:
        logging.exception(os_err)
        exit(1)
    except ValueError as val_err:
        logging.exception(val_err)
        exit(1)
N.B.: Since BranchPythonOperator extends PythonOperator, the same rule applies.
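To make the rendering rule concrete, here is a minimal, hypothetical sketch (Airflow 2.x style): Jinja is only rendered, at task run time, for attributes whose names appear in the operator's template_fields; everything else is passed through as a literal string.

# Minimal sketch of how template rendering works (hypothetical operator, not from the original post).
from airflow.models import BaseOperator


class EchoOperator(BaseOperator):
    template_fields = ('rendered_arg',)  # 'raw_arg' is deliberately left out

    def __init__(self, rendered_arg, raw_arg, **kwargs):
        super().__init__(**kwargs)
        self.rendered_arg = rendered_arg
        self.raw_arg = raw_arg

    def execute(self, context):
        # With rendered_arg="{{ ds_nodash }}" and raw_arg="{{ ds_nodash }}",
        # this logs e.g. "20200101" for the first and the literal template for the second.
        self.log.info("rendered_arg=%s raw_arg=%s", self.rendered_arg, self.raw_arg)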
I use the AzureML SDK for Python to define a Run and log parameters as shown below.
run = Run.get_context()
run.parent.log("param1", 25)
run.parent.log("param2", 100)
run.parent.log("param3", 10)
run.parent.log("param4", 40)
The problem is that I can only see param1 and param2 in Machine Learning Service Workspace. Is there any limitation on the number of variables?
The short answer is NO. I reviewed the source code of azureml-core, which I got by decompressing the azureml_core-1.0.85-py2.py3-none-any.whl file.
The key source code is here:
# azureml_core-1.0.85-py2.py3-none-any.whl\azureml\core\run.py

class Run(_RunBase):
    .......

    @classmethod
    def get_context(cls, allow_offline=True, used_for_context_manager=False, **kwargs):
        """Return current service context.

        Use this method to retrieve the current service context for logging metrics and uploading files. If
        ``allow_offline`` is True (the default), actions against the Run object will be printed to standard
        out.

        .. remarks::

            This function is commonly used to retrieve the authenticated Run object
            inside of a script to be submitted for execution via experiment.submit(). This run object is both
            an authenticated context to communicate with Azure Machine Learning services and a conceptual container
            within which metrics, files (artifacts), and models are contained.

            .. code-block:: python

                run = Run.get_context()  # allow_offline=True by default, so can be run locally as well
                ...
                run.log("Accuracy", 0.98)
                run.log_row("Performance", epoch=e, error=err)

        :param cls: Indicates class method.
        :param allow_offline: Allow the service context to fall back to offline mode so that the training script
            can be tested locally without submitting a job with the SDK. True by default.
        :type allow_offline: bool
        :param kwargs: A dictionary of additional parameters.
        :type kwargs: dict
        :return: The submitted run.
        :rtype: azureml.core.run.Run
        """
        try:
            experiment, run_id = cls._load_scope()
            # Querying for the run instead of initializing to load current state
            if used_for_context_manager:
                return _SubmittedRun(experiment, run_id, **kwargs)
            return _SubmittedRun._get_instance(experiment, run_id, **kwargs)
        except RunEnvironmentException as ex:
            module_logger.debug("Could not load run context %s, switching offline: %s", ex, allow_offline)
            if allow_offline:
                module_logger.info("Could not load the run context. Logging offline")
                return _OfflineRun(**kwargs)
            else:
                module_logger.debug("Could not load the run context and allow_offline set to False")
                raise RunEnvironmentException(inner_exception=ex)


class _OfflineRun(ChainedIdentity):
    def __init__(self, parent_logger=None, run_id=None, **kwargs):
        self._run_id = "OfflineRun_{}".format(uuid4()) if run_id is None else run_id
        super(_OfflineRun, self).__init__(
            _ident=self._run_id,
            _parent_logger=parent_logger if parent_logger is not None else module_logger)

    ....

    def log(self, name, value, description=""):
        self._emit("scalar", name, value)

    ....

    def _emit(self, type, name, value):
        print("Attempted to log {0} metric {1}:\n{2}".format(type, name, value))
The run object returned by the get_context function is running in offline mode, so its log function is effectively just an alias for print.
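A quick way to check this in your own script (a sketch based on the _OfflineRun id format shown above, which is an implementation detail rather than a documented API):

# Sketch: detect whether get_context() fell back to an offline run.
# Relies on the "OfflineRun_<uuid>" id format seen in the source above (assumption).
from azureml.core import Run

run = Run.get_context()
if run.id.startswith("OfflineRun"):
    print("Not attached to a submitted run: log() calls are only printed locally.")
else:
    run.log("param1", 25)  # or run.parent.log(...) if the run has a parent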
We typically start Airflow DAGs with the trigger_dag CLI command. For example:
airflow trigger_dag my_dag --conf '{"field1": 1, "field2": 2}'
We access this conf in our operators using context['dag_run'].conf
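For example, a minimal sketch (hypothetical task, Airflow 1.10-style imports) of reading the trigger conf inside a python_callable:

from airflow.operators.python_operator import PythonOperator


# Hypothetical task illustrating the access pattern; assumes an existing `dag` object.
def print_fields(**context):
    conf = context['dag_run'].conf or {}  # conf is None for scheduled runs without conf
    print(conf.get('field1'), conf.get('field2'))


print_fields_task = PythonOperator(
    task_id='print_fields',
    python_callable=print_fields,
    provide_context=True,  # required on 1.10.x; implicit on 2.x
    dag=dag,
)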
Sometimes when the DAG breaks at some task, we'd like to "update" the conf and restart the broken task (and downstream dependencies) with this new conf. For example:
new conf --> {"field1": 3, "field2": 4}
Is it possible to "update" the dag_run conf with a new JSON string like this?
Would be interested in hearing thoughts on this, other solutions, or potentially ways to avoid this situation to begin with.
Working with Apache Airflow v1.10.3
Thank you very much in advance.
Updating conf after a dag run has been created isn't as straightforward as reading from it, because once the run exists, conf is read from the dag_run metadata table whenever it's used. While Variables have methods to both write to and read from a metadata table, dag runs only let you read.
I agree that Variables are a useful tool, but when you have k=v pairs that you only want to use for a single run, it gets complicated and messy.
Below is an operator that will let you update a dag_run's conf after instantiation (tested in v1.10.10):
#! /usr/bin/env python3
"""Operator to overwrite a dag run's conf after creation."""

import os
from typing import Dict

from airflow.models import BaseOperator
from airflow.utils.db import provide_session
from airflow.utils.decorators import apply_defaults
from airflow.utils.operator_helpers import context_to_airflow_vars


class UpdateConfOperator(BaseOperator):
    """Updates an existing DagRun's conf with `given_conf`.

    Args:
        given_conf: A dictionary of k:v values to update a DagRun's conf with. Templated.
        replace: Whether or not `given_conf` should replace conf (True)
            or be used to update the existing conf (False).
            Defaults to True.
    """

    template_fields = ("given_conf",)
    ui_color = "#ffefeb"

    @apply_defaults
    def __init__(self, given_conf: Dict, replace: bool = True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.given_conf = given_conf
        self.replace = replace

    @staticmethod
    def update_conf(given_conf: Dict, replace: bool = True, **context) -> None:
        @provide_session
        def save_to_db(dag_run, session):
            session.add(dag_run)
            session.commit()
            dag_run.refresh_from_db()

        dag_run = context["dag_run"]
        # When there's no conf provided,
        # conf will be None if scheduled or {} if manually triggered
        if replace or not dag_run.conf:
            dag_run.conf = given_conf
        elif dag_run.conf:
            # Note: dag_run.conf.update(given_conf) doesn't work
            dag_run.conf = {**dag_run.conf, **given_conf}
        save_to_db(dag_run)

    def execute(self, context):
        # Export context to make it available for callables to use.
        airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
        self.log.debug(
            "Exporting the following env vars:\n%s",
            "\n".join(["{}={}".format(k, v) for k, v in airflow_context_vars.items()]),
        )
        os.environ.update(airflow_context_vars)
        self.update_conf(given_conf=self.given_conf, replace=self.replace, **context)
Example usage:
CONF = {"field1": 3, "field2": 4}

with DAG(
    "some_dag",
    # schedule_interval="*/1 * * * *",
    schedule_interval=None,
    max_active_runs=1,
    catchup=False,
) as dag:
    t_update_conf = UpdateConfOperator(
        task_id="update_conf", given_conf=CONF,
    )
    t_print_conf = BashOperator(
        task_id="print_conf",
        bash_command="echo {{ dag_run['conf'] }}",
    )

    t_update_conf >> t_print_conf
This seems like a good use case for Airflow Variables. If you were to read your configs from Variables, you could easily see and modify the configuration inputs from the Airflow UI itself.
You can even get creative and automate updating the config (which is now stored in a Variable) before re-running a task / DAG via another Airflow task itself. See: With code, how do you update an Airflow variable?
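For reference, a minimal sketch of the Variable read/write pattern (the key name my_dag_conf is just an example):

# Sketch of the Variable-based approach; the key "my_dag_conf" is illustrative.
from airflow.models import Variable

# Write / update the config, e.g. from another task, the CLI, or the UI.
Variable.set("my_dag_conf", {"field1": 3, "field2": 4}, serialize_json=True)

# Read it back inside any task of the DAG.
conf = Variable.get("my_dag_conf", deserialize_json=True, default_var={})
print(conf["field1"], conf["field2"])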
When I run my tests that include calling a @classmethod using setuptools and nose2, the test suite doesn't finish; it just keeps on running. I have checked that the test does indeed pass and reach the end of the function, but the suite never exits. If I remove the tests using decode_auth_token it works fine. I was able to narrow it down to the class methods, because other class methods I tested cause the same problem.
Does anyone have any idea why this might be happening? Below are the relevant pieces of code, without posting too much of my codebase.
Code in my User Model
@classmethod
def decode_auth_token(cls, auth_token):
    try:
        payload = jwt.decode(auth_token, config.SECRET_KEY, algorithms=['HS256'])
        # check the hash of what we expect the token to be and token we got to be the same
        if bcrypt.check_password_hash(User.by_id(payload['sub']).api_token_hash, auth_token):
            return payload['sub']
        else:
            return 'Token does not match Api Token.'
    except jwt.ExpiredSignatureError:
        return 'Signature expired. Please log in again.'
    except jwt.InvalidTokenError:
        return 'Invalid Token. Please log in again.'
The following two functions also cause the problem when called
@classmethod
def is_username_taken(cls, username):
    return db.session.query(db.exists().where(User.username == username)).scalar()

@classmethod
def is_email_taken(cls, email):
    return db.session.query(db.exists().where(User.email == email)).scalar()
This function does not cause the problem when called though
@classmethod
def by_username(cls, username):
    return User.query.filter(User.username == username).first()
Here are the tests
import unittest
import sys

from . import AppTestCase, API_ROOT
from app.extensions import db, bcrypt
from app.models import User, UserSchema, Location, Company


class TestUserModel(AppTestCase):
    def test_encode_auth_token(self):
        user = User.by_username('jdoe')
        auth_token = user.encode_auth_token(user.id)
        self.assertTrue(isinstance(auth_token, bytes))

    def test_decode_auth_token(self):
        user = User.by_username('jdoe')
        auth_token = user.encode_auth_token(user.id)
        self.assertTrue(isinstance(auth_token, bytes))
        self.assertEqual(User.decode_auth_token(auth_token), user.id)
        print('DONE')
The first test works fine. The second test prints out DONE and properly decodes the auth_token, returning the correct user id, but the test suite does not finish. It just keeps running after printing DONE.
And here is the setup script; I run the tests using python setup.py test
import os
from setuptools import setup, find_packages, Command


# Thanks http://stackoverflow.com/questions/3779915/why-does-python-setup-py-sdist-create-unwanted-project-egg-info-in-project-r
class CleanCommand(Command):
    """Custom clean command to tidy up the project root."""
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        os.system('rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info')


with open('requirements.txt') as f:
    requirements = f.read().splitlines()

setup(
    name="XXX",
    description="XXX",
    version=1.0,
    packages=find_packages(),
    install_requires=requirements,
    include_package_data=True,
    test_suite='nose2.collector.collector',
    tests_require=['nose2'],
    cmdclass={
        'clean': CleanCommand,
    }
)
Output when Running and Not Stopping
running test
Searching for nose2
Best match: nose2 0.6.5
Processing nose2-0.6.5-py3.6.egg
Using XXX/.eggs/nose2-0.6.5-py3.6.egg
running egg_info
writing doomfist.egg-info/PKG-INFO
writing dependency_links to XXX.egg-info/dependency_links.txt
writing requirements to XXX.egg-info/requires.txt
writing top-level names to XXX.egg-info/top_level.txt
reading manifest file 'XXX.egg-info/SOURCES.txt'
writing manifest file 'XXX.egg-info/SOURCES.txt'
running build_ext
/Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/python_dateutil-2.6.0-py3.6.egg/dateutil/parser.py:50: DeprecationWarning: invalid escape sequence \.
/Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/python_dateutil-2.6.0-py3.6.egg/dateutil/parser.py:50: DeprecationWarning: invalid escape sequence \.
/Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/python_dateutil-2.6.0-py3.6.egg/dateutil/tz/win.py:197: DeprecationWarning: invalid escape sequence \{
/Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/python_dateutil-2.6.0-py3.6.egg/dateutil/tz/win.py:247: DeprecationWarning: invalid escape sequence \{
/Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/python_dateutil-2.6.0-py3.6.egg/dateutil/tz/win.py:197: DeprecationWarning: invalid escape sequence \{
/Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/python_dateutil-2.6.0-py3.6.egg/dateutil/tz/win.py:247: DeprecationWarning: invalid escape sequence \{
NOT running in debug mode
DONE
^]^\[1] 35752 quit python setup.py test
EDIT ----- Sorry for the huge post now.
Following someone's advice in the comments, I used a debugger to determine that the tests do indeed finish. Where it actually gets stuck is during tearDown(). The following function of mine is where it hangs.
def tearDown(self):
    """Clean db session and drop all tables."""
    db.drop_all()
Following the debugger further down, I determined it ultimately gets stuck here:
for table, fkcs in collection:
    if table is not None:
        self.traverse_single(table, drop_ok=True, _is_metadata_operation=True)
    else:
        for fkc in fkcs:
            ...
More specifically, on the call self.traverse_single(table, drop_ok=True, _is_metadata_operation=True). I'm assuming it gets stuck waiting for the generator to return? I'm unsure, but below are the last lines I got before it hangs again.
> /Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/SQLAlchemy-1.1.11-py3.6-macosx-10.7-x86_64.egg/sqlalchemy/sql/ddl.py(929)visit_table()->None
-> _is_metadata_operation=_is_metadata_operation)
(Pdb) n
--Call--
> /Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/SQLAlchemy-1.1.11-py3.6-macosx-10.7-x86_64.egg/sqlalchemy/sql/visitors.py(150)_visitor_iterator()-><sqlalchemy.s...t 0x112045630>
-> yield v
(Pdb) n
GeneratorExit
> /Users/XXX/anaconda3/envs/XXX/lib/python3.6/site-packages/SQLAlchemy-1.1.11-py3.6-macosx-10.7-x86_64.egg/sqlalchemy/sql/visitors.py(150)_visitor_iterator()-><sqlalchemy.s...t 0x112045630>
-> yield v
(Pdb) l
145 def _visitor_iterator(self):
146 """iterate through this visitor and each 'chained' visitor."""
147
148 v = self
149 while v:
150 -> yield v
151 v = getattr(v, '_next', None)
152
153 def chain(self, visitor):
154 """'chain' an additional ClauseVisitor onto this ClauseVisitor.
155
(Pdb) n
I believe it gets stuck on the following table of mine
from ..helpers import get_current_time
from ..extensions import db, ma
from ..constants import STRING_LEN, DESCRIPTION_LEN
from .worker import WorkerSchema


class Injury(db.Model):
    __tablename__ = "injuries"

    def __repr__(self):
        return '<Injury %r>' % (self.id)

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(STRING_LEN), nullable=False)
    description = db.Column(db.String(DESCRIPTION_LEN), nullable=False)
    worker_id = db.Column(db.Integer, db.ForeignKey('workers.id'))
    created_at = db.Column(db.DateTime, nullable=False, default=get_current_time)
    updated_at = db.Column(db.DateTime, nullable=False, default=get_current_time, onupdate=get_current_time)

    # Relationships
    worker = db.relationship('Worker', back_populates='injuries')

    # ================================================================
    # ================================================================
    # methods
    # ================================================================
    # Class methods
    @classmethod
    def by_id(cls, id):
        return cls.query.filter(Injury.id == id).first()


class InjurySchema(ma.Schema):
    class Meta:
        fields = ('id', 'title', 'description', 'worker')

    worker = ma.Nested(WorkerSchema)
I was able to get it to work by adding db.session.close() before my drop_all call, based on this post: SQLAlchemy blocked on dropping tables
def tearDown(self):
    """Clean db session and drop all tables."""
    db.session.close()
    db.drop_all()
I still need to figure out why the session is open and where I need to close it, though.
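For what it's worth, a common pattern with Flask-SQLAlchemy test cases (a sketch, assuming db is the Flask-SQLAlchemy instance used by the app) is to dispose of the scoped session with db.session.remove() in tearDown, which returns the connection to the pool and releases any locks it still holds before drop_all():

# Sketch of a Flask-SQLAlchemy tearDown, assuming `db` is the Flask-SQLAlchemy
# instance and the test case pushed an app context in setUp().
def tearDown(self):
    """Release the scoped session and drop all tables."""
    db.session.remove()  # releases the connection (and its locks) held by the scoped session
    db.drop_all()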