I am trying to create a dynamic task list to check whether the previous batch runs for the day have completed. To achieve that, I have the timings (HHMM) stored in an Airflow Variable, and I use datetime.now() to get the current HHMM and build a list of previous runs. But since the Airflow DAG gets validated (parsed) repeatedly, it picks up the latest date and time each time and generates a new previous-task list based on that.
To avoid this, I tried using the {{ ds }} and {{ ts }} default Airflow variables instead of comparing against datetime.now(). But they are either treated as plain strings or not recognized as variables at all, throwing a "ts/ds variable not defined" error.
Is there a way or workaround to access these variables outside of the operators? The logic above builds the list of dynamic tasks to run based on checking previous batch run completion.
Thanks in Advance.
from datetime import datetime, timedelta, date
from pytz import timezone, utc
import pendulum

## Below would come from an Airflow Variable.
dag_times = ["0700", "0715", "0730", "0730", "0930", "1130", "1330", "1630", "2000"]

## This is the code to get the current time; it keeps changing as Airflow validates the DAG.
current_dag_time = datetime.now().astimezone(timezone('US/Pacific')).strftime('%H%M')
schedule_run_time = min(dag_times, key=lambda x: abs(int(x) - int(current_dag_time)))
current_run = dag_times.index(schedule_run_time)
print("current_run", current_run)
intra_day_time = dag_times[dag_times.index(schedule_run_time) - 1] if current_run > 0 else schedule_run_time

previous_runs = []
if current_run > 0:
    # print(dag_times.index(schedule_run_time))
    previous_runs = dag_times[0:dag_times.index(schedule_run_time)]
else:
    previous_runs.append(dag_times[-1])

previous_run_tasks = []
for dag_name in previous_runs:
    item = {}
    if int(dag_name) == 0:
        if date.today().weekday() == 0:
            start_time = -52
            end_time = 4
        else:
            start_time = -24
            end_time = 24
        # poke_task_name = "SAMPLE_BOX_%s" % dag_name
        item = {"poke_task_name": "SAMPLE_BOX_%s" % dag_name, "start_time": start_time, "end_time": end_time}
    elif int(dag_name) > 0:
        start_time = 0
        end_time = 24
        poke_task_name = "SAMPLE_BOX_%s" % dag_name
        item = {"poke_task_name": "SAMPLE_BOX_%s" % dag_name, "start_time": start_time, "end_time": end_time}
    else:
        print("error")
    previous_run_tasks.append(item)

print(previous_run_tasks)

if int(schedule_run_time) == 0:
    if date.today().weekday() == 0:
        start_time = -52
        end_time = 4
    else:
        start_time = -24
        end_time = 24
    poke_task_name = "SAMPLE_BOX_%s" % dag_times[-1]
    generate_task_name = "SAMPLE_BOX_%s" % schedule_run_time
elif int(schedule_run_time) > 0:
    start_time = 0
    end_time = 24
    poke_task_name = "SAMPLE_BOX_%s" % intra_day_time
    generate_task_name = "SAMPLE_BOX_%s" % schedule_run_time
else:
    print("error")

print("start_time::::", start_time)
print("end_time::::", end_time)
print("generate_task_name::::", generate_task_name)
print("poke_task_name::::", poke_task_name)
These Airflow default variables are only instantiated in the context of a task instance for a given DAG run, and thus they are only available in the templated fields of each operator. Trying to use them outside of this context will not work.
I have prepared a simple DAG with a task that displays the execution date (ds):
from airflow import macros
from airflow import models
from airflow.operators import bash_operator
import datetime

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_args = {
    "start_date": yesterday,
    "retries": 1,
    "email_on_failure": False,
    "email_on_retry": False,
    "email": "youremail@host.com"
}

with models.DAG(
        'printing_the_execution_date_ts',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_args) as dag:

    printing_the_execution_date = bash_operator.BashOperator(
        task_id="display",
        bash_command="echo {{ ds }}"
    )

    printing_the_execution_date
The {{ }} brackets tell Airflow that this is a Jinja template.
You may also use the ts variable, which is the execution date in ISO 8601 format. Thus, for the DAG run stamped 2020-05-10, these commands would render to:
'echo {{ ds }}'
echo 2020-05-10
'echo {{ ts }}'
echo 2020-05-10T00:00:00+00:00
I recommend you take a look at this Stack Overflow thread, where you can find an example using PythonOperator.
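For completeness, here is a minimal sketch of reading ds inside a PythonOperator, in the same Airflow 1.x style as the BashOperator example above (the task id and callable name are illustrative, and it assumes a dag object is in scope):

from airflow.operators import python_operator

def print_execution_date(ds, **kwargs):
    # ds is injected from the task instance context when the task runs
    print("execution date: %s" % ds)

printing_ds_from_python = python_operator.PythonOperator(
    task_id="display_from_python",
    python_callable=print_execution_date,
    provide_context=True,  # needed on Airflow 1.x; not needed on 2.x
    dag=dag,
)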
Related
I am trying to back up data in Airflow. It's not giving any error, but the backup never happens because everything gets skipped. The code I have written is:
import os
from airflow import DAG
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import (
    BigQueryToGCSOperator,
)
from composer_plugins import get_list_to_backup
from datetime import datetime, timedelta, date
from airflow.exceptions import AirflowFailException, AirflowSkipException
from airflow.operators.python import PythonOperator

"""the function validates if the schedule_day parameter is a valid day to execute the task
schedule_day is a number and corresponds to day of the week
1 - Monday
2 - Tuesday
3 - Wednesday
4 - Thursday
5 - Friday
6 - Saturday
7 - Sunday """
def _check_valid_day(**kwargs):
    today = datetime.today()
    if today.isoweekday() == kwargs["schedule_day"]:
        return True
    else:
        raise AirflowSkipException("does not correspond to the backup day")

today = datetime.today()
today_str = today.strftime("%Y-%m-%d")
#start_date = get_next_weekday(today_str, 5) # 5 = Saturday
start_date = datetime(2022, 5, 2)

dag_id = "data_bq_weekly_backups_dag"
event_collection_project_id = os.environ["EVENT_COLLECTION_PROJECT_ID"]
tables_to_backup = os.environ["TABLES_TO_BACKUP"]
destination_bucket = os.environ["WEEKLY_BQ_BACKUP_BUCKET"]
schedule_day = os.environ["BACKUP_SCHEDULE_DAY"]

default_dag_args = {
    # Setting start date for next Saturday in order to maintain the scheduler
    # in a consistent state
    "start_date": start_date,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    "email_on_failure": False,
    "email_on_retry": False,
    # If a task fails, retry it once after waiting at least what's specified in retry_delay
    "retries": 1,
    "retry_delay": timedelta(seconds=10),
    "project_id": event_collection_project_id,
    "schedule_interval": "0 2 * * *",
}

tables_to_backup_list = get_list_to_backup(tables_to_backup)

with DAG(dag_id=dag_id, default_args=default_dag_args, catchup=False) as dag:
    check_valid_day = PythonOperator(
        task_id='check_valid_day',
        python_callable=_check_valid_day,
        op_kwargs={
            "schedule_day": schedule_day
        },
    )

    task_dict = dict()
    for table_to_backup in tables_to_backup_list:
        dataset = table_to_backup.split(".")[0]
        table = table_to_backup.split(".")[1]
        task_name = f"{dataset}_{table}_table_weekly_backup"
        task_dict[task_name] = BigQueryToGCSOperator(
            task_id=task_name,
            trigger_rule="all_success",
            dag=dag,
            source_project_dataset_table=table_to_backup,
            destination_cloud_storage_uris=[
                f"gs://{destination_bucket}/{dataset}/{table}/{today.year}/{today.month}/{today.day}/{table}-*.avro"
            ],
            export_format="AVRO",  # OPTIONS: AVRO, CSV, JSON
            compression="NONE",  # OPTIONS: NONE, DEFLATE, GZIP, SNAPPY
            labels=None,
        )
        check_valid_day >> task_dict[task_name]
When I execute this DAG, there is no error, but every task is skipped:
(screenshot: Airflow DAG tree view)
BACKUP_SCHEDULE_DAY=3 is set in the environment variable file. I don't know what's wrong here or why it's not working.
I tried your code and was able to reproduce your issue. See the run history below:
NOTE: Prior to running your code, I hardcoded values like your environment variables and tables_to_backup_list to make it work on my environment.
The main problem is in _check_valid_day(). When the line if today.isoweekday() == kwargs["schedule_day"]: is executed, the comparison is always False because of a data type mismatch:
print(today.isoweekday()) # <class 'int'>
print(kwargs["schedule_day"]) # <class 'str'>
The fix is to make the data types match. The fix below converts kwargs["schedule_day"] to int:
def _check_valid_day(**kwargs):
    today = datetime.today()
    if today.isoweekday() == int(kwargs["schedule_day"]):  # convert to int
        print("inside the if statement")
        return True
    else:
        raise AirflowSkipException("does not correspond to the backup day")
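Alternatively, you could cast once where the environment variable is read, so every downstream comparison already works with an int:

schedule_day = int(os.environ["BACKUP_SCHEDULE_DAY"])  # environment variables are always strings, so cast once here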
(screenshots: Graph view and check_valid_day logs)
I am reading an integer variable from Airflow Variables, incrementing the value by one each time the DAG runs, and setting it back to the Variable.
But with the code below, the variable in the UI changes every time the page is refreshed or so.
I don't know what is causing this behavior.
counter = Variable.get('counter')

s = BashOperator(
    task_id='echo_start_variable',
    bash_command='echo ' + counter,
    dag=dag,
)

Variable.set("counter", int(counter) + 1)

sql_query = "SELECT * FROM UNNEST(SEQUENCE({start}, {end}))"
sql_query = sql_query.replace('{start}', start).replace('{end}', end)

submit_query = PythonOperator(
    task_id='submit_athena_query',
    python_callable=run_athena_query,
    op_kwargs={'query': sql_query, 'db': 'db',
               's3_output': 's3://s3-path/rohan/date=' + current_date + '/'},
    dag=dag)

e = BashOperator(
    task_id='echo_end_variable',
    bash_command='echo ' + counter,
    dag=dag,
)

s >> submit_query >> e
Airflow processes that DAG file every 30 seconds (the default of the min_file_process_interval setting). This means any top-level code you have runs every 30 seconds, so Variable.set("counter", int(counter) + 1)
will cause the Variable counter to be increased by 1 every 30 seconds.
It's bad practice to interact with Variables in top-level code (regardless of the increasing-value issue). It opens a connection to the metastore database every 30 seconds, which may cause serious problems and overwhelm the database.
To get the value of Variable you can use Jinja:
e = BashOperator(
    task_id='echo_end_variable',
    bash_command='echo {{ var.value.counter }}',
    dag=dag,
)
This is a safe way to use variables as the value is being retrieved only when the operator is executed.
If you want to increase the value of the variable by 1, do it with a PythonOperator:
from airflow.models import Variable  # Variable must be importable inside the DAG file

def increase():
    counter = Variable.get('counter')
    Variable.set("counter", int(counter) + 1)

increase_op = PythonOperator(
    task_id='increase_task',
    python_callable=increase,
    dag=dag)
The python callable will be executed only when the operator runs.
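If the intent is to bump the counter exactly once per DAG run, one possible placement in the original chain (reusing the s, submit_query, and e tasks from the question) is:

s >> increase_op >> submit_query >> e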
I am trying to subtract the 'end time' of a ride from its 'start time'. The start time is fetched directly from the database (models.py); the line start = n[0].driverStartTime does that. I use the current datetime as the 'end time' of the ride, and the variable diff holds the result of subtracting the start time from the end time. But it gives this error:
TypeError at /driver_panel/endtrip
can't subtract offset-naive and offset-aware datetimes
Here driver_panel is my application in the project. The Driverbooking table is used to fetch the start time, and DateTimeField is used to store the start and end times. Here is the code...
def endtrip(request):
    if request.method == 'GET':
        dbid = request.GET.get('driverBookID')
        if dbid:
            n = Driverbooking.objects.all().filter(driverBookID=dbid)
            name = n[0].customerID
            start = n[0].driverStartTime
            end = datetime.datetime.now()
            diff = end - start
            total = diff * 10
            a = Driverbooking.objects.get(driverBookID=dbid)
            a.driverStatus = "end"
            a.driverEndTime = end
            a.driverAmount = total
            a.save()
            did = request.session['uid']
            x = Driverside.objects.all().filter(driverID=did)
            rate = x[0].driverFPH
            d = Driverside.objects.get(driverID=did)
            d.driverIsAvailable = "yes"
            d.save()
            context = {"name": name, "start": start, "end": end, "rate": rate, "t": total}
            return render(request, "driverbill.html", context)
    return redirect('driverhome')
The problem arises because you are subtracting an offset-aware datetime (the "start time", which carries timezone info) from an offset-naive one (the "end time", which has none).
The simple solution is to use the current time with timezone information, as the Django timezone docs suggest.
Instead of this line:
end = datetime.datetime.now()
try this:
from django.utils import timezone as tz
end = tz.now()
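Alternatively, if you need to keep datetime.datetime.now() for some reason, here is a minimal sketch (assuming USE_TZ = True and the project's default timezone) that makes the naive value aware before subtracting:

import datetime
from django.utils import timezone as tz

naive_end = datetime.datetime.now()
end = tz.make_aware(naive_end, tz.get_current_timezone())  # attach the current timezone

diff = end - start  # start is the aware value from the database; both sides are aware now, so this yields a timedelta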
I have an Airflow job like the one below:
import time
job_id = int(time.time())
airflow_job1 = PythonOperator(op_kwargs={"job_id" : job_id}, ...)
airflow_job2 = BashOperator(op_kwargs={"job_id" : job_id}, ...)
airflow_job1 >> airflow_job2
I know that every time the script is parsed, I get a new job_id, which is used in each Airflow task. But I wonder what happens if I rerun from the middle: say airflow_job1 failed, I fix the problem and rerun from airflow_job1 in the UI. Is a new job_id generated for the rerun, or does Airflow reuse the previous job_id?
Actually, after checking with a simple case:
# global parameter
job_id = int(time.time())

def airflow_job1(job_id, **context):
    print("in airflow_job1, current timestamp: %s" % job_id)

def airflow_job2(job_id, **context):
    print("in airflow_job2, current timestamp: %s" % job_id)

airflow_job1 = PythonOperator(
    task_id='airflow_job1',
    provide_context=True,
    python_callable=airflow_job1,
    op_kwargs={'job_id': job_id},
    dag=globals()[dag_name]
)

airflow_job2 = PythonOperator(
    task_id='airflow_job2',
    provide_context=True,
    python_callable=airflow_job2,
    op_kwargs={'job_id': job_id},
    dag=globals()[dag_name]
)

airflow_job1 >> airflow_job2
I find that the job_id in airflow_job1 and airflow_job2 are different, even within the same run.
So the conclusion is that we shouldn't set a global parameter this way; use xcom_push / xcom_pull instead to share the value between tasks.
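A minimal sketch of the XCom approach (assuming Airflow 2.x; the task ids, callables, and the dag object are illustrative):

import time
from airflow.operators.python import PythonOperator

def generate_job_id(**context):
    # generated once when this task actually runs, then stored in XCom
    job_id = int(time.time())
    context["ti"].xcom_push(key="job_id", value=job_id)

def use_job_id(**context):
    # pulls the value pushed upstream, so both tasks see the same job_id
    job_id = context["ti"].xcom_pull(task_ids="generate_job_id", key="job_id")
    print("in use_job_id, job_id: %s" % job_id)

generate_job_id_task = PythonOperator(
    task_id="generate_job_id",
    python_callable=generate_job_id,
    dag=dag,  # assumes a DAG object named dag is in scope
)

use_job_id_task = PythonOperator(
    task_id="use_job_id",
    python_callable=use_job_id,
    dag=dag,
)

generate_job_id_task >> use_job_id_task

Rerunning only the downstream task reuses whatever was last pushed; a new job_id is generated only when the upstream task itself runs again.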
When I measure the time manually, it is less than the time that I got through this script:
import time
import os

def getTimes():
    try:
        times = []
        if(exists("1472205483589.png",60)):
            click("1472192774056.png")
            wait("1472040968178.png",10)
            click("1472036591623.png")
            click("1472036834091.png")
            click("1472036868986.png")
            if(exists("1472192829443.png",5)):
                click("1472192829443.png")
            u = time.time()
            click("1472539655695.png")
            wait("1472042542247.png",120)
            v = time.time()
            print("Open File to when views list appear (sec) : ", int(v-u))
            times.append(int(v-u))
            u = time.time()
            click("1472042542247.png")
            wait("1472108424071.png",120)
            mouseMove("1472108424071.png")
            wait("1472108486171.png",120)
            v = time.time()
            print("Opening view (sec) : ", int(v-u))
            times.append(int(v-u))
            u = time.time()
            click("1472109163884.png")
            wait("1472042181291.png",120)
            v = time.time()
            print("Clicking element (sec) : ", float(v-u))
            times.append(int(v-u))
        return times
    except FindFailed as ex:
        print("Failed. Navigator might have stopped working")
        if(exists("1472204045678.png",10)):
            click("1472204045678.png")
        return -1

file = open(r"C:\BSW\SikulixScripts\NavigatorAutoTesting\log.txt", 'w')
ret = getTimes()
if (ret == -1):
    file.write("-1")
    exit()
str = " ".join(str(x) for x in ret)
file.write(str)
file.close()
By using time.time(), you are working with a raw number of seconds: the difference between "the epoch" and now (the epoch is the same as gmtime(0)). Instead, try using datetime.now(), which gives you a datetime object. You can add and subtract datetime objects freely, resulting in a timedelta object, as per the Python docs:
from datetime import datetime  # needed, since the original script only imports time and os

u = datetime.now()
click("1472539655695.png")
wait("1472042542247.png",120)
v = datetime.now()
tdelta = v - u
seconds = tdelta.total_seconds()  # the elapsed time as a floating point number of seconds (available in Python 2.7 and up)
times.append(seconds)
This should yield more accuracy for you.