Cannot Create a Dataproc cluster - python

I tried creating a Dataproc cluster both through Airflow and through the Google Cloud UI, and the cluster creation always fails at the end. Here is the Airflow code I am using to create the cluster:
# STEP 1: Libraries needed
from datetime import timedelta, datetime
from airflow import models
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators import dataproc_operator
from airflow.utils import trigger_rule
from poc.utils.transform import main
from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
from airflow.operators.python_operator import BranchPythonOperator
import os
YESTERDAY = datetime.combine(
    datetime.today() - timedelta(1),
    datetime.min.time())

project_name = os.environ['GCP_PROJECT']

# Can pull in spark code from a gcs bucket
# SPARK_CODE = ('gs://us-central1-cl-composer-tes-fa29d311-bucket/spark_files/transformation.py')
dataproc_job_name = 'spark_job_dataproc'

default_dag_args = {
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'start_date': YESTERDAY,
    'retry_delay': timedelta(minutes=5),
    'project_id': project_name,
    'owner': 'DataProc',
}

with models.DAG(
        'dataproc-poc',
        description='Dag to run a simple dataproc job',
        schedule_interval=timedelta(days=1),
        default_args=default_dag_args) as dag:

    CLUSTER_NAME = 'dataproc-cluster'

    def ensure_cluster_exists(ds, **kwargs):
        cluster = DataProcHook().get_conn().projects().regions().clusters().get(
            projectId=project_name,
            region='us-east1',
            clusterName=CLUSTER_NAME
        ).execute(num_retries=5)
        print(cluster)
        if cluster is None or len(cluster) == 0 or 'clusterName' not in cluster:
            return 'create_dataproc'
        else:
            return 'run_spark'

    # start = BranchPythonOperator(
    #     task_id='start',
    #     provide_context=True,
    #     python_callable=ensure_cluster_exists,
    # )

    print_date = BashOperator(
        task_id='print_date',
        bash_command='date'
    )

    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name=CLUSTER_NAME,
        num_workers=2,
        use_if_exists='true',
        zone='us-east1-b',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the PySpark job
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=main,
        cluster_name=CLUSTER_NAME,
        job_name=dataproc_job_name
    )

    # Delete Cloud Dataproc cluster.
    # delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
    #     task_id='delete_dataproc',
    #     cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    #     trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # STEP 6: Set DAG dependencies
    # Each task should run after the one before it has finished.
    print_date >> create_dataproc >> run_spark
    # print_date >> start >> create_dataproc >> run_spark
    # start >> run_spark
I checked the cluster logs and saw the following errors -
Unable to store master key 1
Unable to store master key 2
Initialization failed. Exiting 125 to prevent restart
Cannot start master: Timed out waiting for 2 datanodes and nodemanagers.
Operation timed out: Only 0 out of 2 minimum required datanodes running.
Operation timed out: Only 0 out of 2 minimum required node managers running.

Cannot start master: Timed out waiting for 2 datanodes and nodemanagers. Operation timed out: Only 0 out of 2 minimum required datanodes running. Operation timed out: Only 0 out of 2 minimum required node managers running.
This error suggests that the worker nodes are not able to communicate with the master node. When the worker nodes are unable to report to the master node within the given timeframe, cluster creation fails.
Please check whether you have set up the correct firewall rules to allow communication among the VMs.
You can refer to the following for network configuration best practices: https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/network#overview
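To make the network explicit from the DAG side, here is a minimal sketch (not the poster's code) showing how the same contrib operator can pin the cluster to a specific subnetwork and network tag so that an existing allow-internal firewall rule applies. The subnetwork URI and tag name are placeholders, and the subnetwork_uri / tags / internal_ip_only parameters are only available if your version of the contrib operator supports them:
# Hypothetical sketch: the project, subnetwork, and tag values are placeholders.
create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc',
    cluster_name=CLUSTER_NAME,
    num_workers=2,
    zone='us-east1-b',
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1',
    # Put the cluster on a subnetwork whose firewall rules allow all
    # intra-cluster traffic (ingress from the same tag or CIDR range).
    subnetwork_uri='projects/my-project/regions/us-east1/subnetworks/my-subnet',
    tags=['dataproc-cluster'],   # match this tag in your allow-internal firewall rule
    internal_ip_only=False,      # True requires Private Google Access on the subnetwork
)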

Related

Apache Airflow: create tasks using a for loop in one DAG; I want each task made by the for loop to respond to its own XCom

I created several tasks that do the same thing in one DAG using a for loop. Each of them should then split into two branches that depend on the result of that task. However, all the branch tasks created in the loop read the XCom of the last task. How can each task created in the loop return its own XCom?
Tasks a, b, and c return xcom_a, xcom_b, and xcom_c respectively. However, the branch tasks all get the same xcom_c. What should I do?
# Imports assumed for this snippet (not shown in the original post)
from airflow import DAG
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.utils.dates import days_ago

default_args = {'start_date': days_ago(1)}

dag = DAG(
    dag_id='batch_test',
    default_args=default_args,
    schedule_interval=None)

def count(**context):
    name = context['params']['name']
    dict = {'a': 50,
            'b': 100,
            'c': 150}
    if dict[name] < 100:
        task_id = f'add_{name}'
        return task_id
    elif dict[name] >= 100:
        task_id = f'times_{name}'
        return task_id

def branch(**context):
    task_id = context['ti'].xcom_pull(task_ids=f'count_task_{name}')
    return task_id

def add(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'add_a': 50 + 100,
                'add_b': 100 + 100,
                'add_c': 150 + 100}
    ans = ans_dict[ans]
    return print(ans)

def times(**context):
    ans = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    ans_dict = {'times_a': 50 * 100,
                'times_b': 100 * 100,
                'times_c': 150 * 100}
    ans = ans_dict[ans]
    return print(ans)

name_list = ['a', 'b', 'c']

for name in name_list:
    exec_count_task = PythonOperator(
        task_id=f'count_task_{name}',
        python_callable=count,
        provide_context=True,
        params={'name': name},
        dag=dag
    )
    exec_branch_task = BranchPythonOperator(
        task_id=f'branch_task_{name}',
        python_callable=branch,
        provide_context=True,
        dag=dag
    )
    exec_add_count = PythonOperator(
        task_id=f'add_{name}',
        python_callable=add,
        provide_context=True,
        dag=dag
    )
    exec_times_count = PythonOperator(
        task_id=f'times_{name}',
        python_callable=times,
        provide_context=True,
        dag=dag
    )
    exec_count_task >> exec_branch_task >> [exec_add_count, exec_times_count]
I want this:
task_a >> branch_a (branch python operator, xcom pull returned by task_a) >> [task_a1, task_a2]
task_b >> branch_b (branch python operator, xcom pull returned by task_b) >> [task_b1, task_b2]
task_c >> branch_c (branch python operator, xcom pull returned by task_c) >> [task_c1, task_c2]
but I get this:
task_a >> branch_a (branch python operator, xcom pull returned by task_c) >> [task_a1, task_a2]
task_b >> branch_b (branch python operator, xcom pull returned by task_c) >> [task_b1, task_b2]
task_c >> branch_c (branch python operator, xcom pull returned by task_c) >> [task_c1, task_c2]
I'm unable to reproduce the behavior you describe using classic operators or the TaskFlow API. If you are able to add more context and the code you are actually executing, that would be most helpful.
In the meantime, here are the examples I used, in case they give you some guidance for troubleshooting. I added a task at the end of each stream to check that the first task indeed pushes its expected value.
Classic Operators
from pendulum import datetime
from airflow.models import DAG
from airflow.operators.python import BranchPythonOperator, PythonOperator
from airflow.utils.trigger_rule import TriggerRule

with DAG(dag_id="multiple_branch_loop", start_date=datetime(2023, 1, 1), schedule=None):

    def xcom_push(val):
        return val

    def func():
        ...

    def choose(val):
        return f"task_{val}"

    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
        branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
        second = PythonOperator(task_id=f"task_{i}", python_callable=func)
        third = PythonOperator(task_id=f"task_{i}a", python_callable=func)
        check = PythonOperator(
            task_id=f"check_{i}",
            trigger_rule=TriggerRule.ALL_DONE,
            python_callable=check_xcom_output_from_first,
            op_kwargs={"val": first.output, "expected_val": i},
        )
        first >> branch >> [second, third] >> check
The check* tasks succeed meaning the first task in a given stream does push its value and not the last stream's.
TaskFlow API
from pendulum import datetime
from airflow.decorators import dag, task
from airflow.utils.trigger_rule import TriggerRule

@dag(start_date=datetime(2023, 1, 1), schedule=None)
def multiple_branch_loop():

    @task()
    def xcom_push(val):
        return val

    @task()
    def func():
        ...

    @task.branch()
    def choose(val):
        return f"task_{val}"

    @task(trigger_rule=TriggerRule.ALL_DONE)
    def check_xcom_output_from_first(val, expected_val):
        assert val == expected_val

    stuff = ["a", "b", "c"]
    for i in stuff:
        first = xcom_push.override(task_id=f"first_task_{i}")(val=i)
        branch = choose.override(task_id=f"branch_{i}")(val=first)
        second = func.override(task_id=f"task_{i}")()
        third = func.override(task_id=f"task_{i}a")()
        check = check_xcom_output_from_first.override(task_id=f"check_{i}")(val=first, expected_val=i)
        first >> branch >> [second, third] >> check

multiple_branch_loop()
The same expected behavior is confirmed in the check* tasks:
Your functions branch, add, and times don't define name themselves, so it is taken from the global context, which at the time of task execution holds the last value of for name in name_list. This is a common trap, explained e.g. here: tkinter creating buttons in for loop passing command arguments.
To fix it, you can either pull name from the context as in count, or provide it via op_args or op_kwargs when you create the respective operator, as in the answer by Josh Fell:
first = PythonOperator(task_id=f"first_task_{i}", python_callable=xcom_push, op_kwargs={"val": i})
branch = BranchPythonOperator(task_id=f"branch_{i}", python_callable=choose, op_kwargs={"val": i})
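Applied to the original DAG, the same fix would look roughly like the sketch below (untested against the full code in the question): bind name at task-creation time with op_kwargs, so each callable sees its own value rather than the loop variable's final value.
# Sketch: `name` is passed explicitly, so each task pulls its own upstream XCom.
def branch(name, **context):
    return context['ti'].xcom_pull(task_ids=f'count_task_{name}')

def add(name, **context):
    chosen = context['ti'].xcom_pull(task_ids=f'branch_task_{name}')
    print({'add_a': 150, 'add_b': 200, 'add_c': 250}[chosen])

for name in name_list:
    exec_branch_task = BranchPythonOperator(
        task_id=f'branch_task_{name}',
        python_callable=branch,
        op_kwargs={'name': name},  # binds the current loop value, not the last one
        provide_context=True,
        dag=dag,
    )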

Airflow DAG executes middle tasks after the stop_dag task completes; how to stop downstream tasks even though their parent task did not run

Here is the image after the stop-execution step; the highlighted tasks are running without any dependency. Please help me stop the other tasks after the stop_dag task has completed.
I'm not completely sure I understand you correctly, but you want the red-marked tasks not to run when stop_dag is successful? This can be accomplished using trigger rules and one extra task.
I tried to recreate your setup with EmptyOperators:
Then I add a set of dependencies and an additional task that fails if stop_task is successful due to the trigger rule all_failed:
fail_if_stop_successful = EmptyOperator(
    task_id="fail_if_stop_successful",
    trigger_rule="all_failed"
)

stop_task >> fail_if_stop_successful >> [t3_1, t3_2, t3_3]
This creates the following pattern:
Full DAG:
from airflow.decorators import dag
from airflow.operators.empty import EmptyOperator
from pendulum import datetime

@dag(
    start_date=datetime(2023, 1, 1),
    schedule=None,
    catchup=False
)
def test_dag():
    start = EmptyOperator(task_id="start")
    t1 = EmptyOperator(task_id="t1")
    stop_task = EmptyOperator(task_id="stop_task")
    t2_1 = EmptyOperator(task_id="t2_1")
    t2_2 = EmptyOperator(task_id="t2_2")
    t2_3 = EmptyOperator(task_id="t2_3")
    t3_1 = EmptyOperator(task_id="t3_1")
    t3_2 = EmptyOperator(task_id="t3_2")
    t3_3 = EmptyOperator(task_id="t3_3")
    t4 = EmptyOperator(task_id="t4")
    end = EmptyOperator(task_id="end")

    # rebuilding the dependencies from your drawing
    start >> t1 >> [t2_1, t2_2, t2_3]
    start >> stop_task
    t2_1 >> t3_1
    t2_2 >> t3_2
    t2_3 >> t3_3
    [t3_1, t3_2, t3_3] >> t4 >> end

    # making a successful run of stop_task stop t3_1, t3_2, t3_3
    fail_if_stop_successful = EmptyOperator(
        task_id="fail_if_stop_successful", trigger_rule="all_failed"
    )
    stop_task >> fail_if_stop_successful >> [t3_1, t3_2, t3_3]

test_dag()

Issue running a lambda two times in sequence from Airflow in AWS

I am very new to Airflow. I have a single Lambda function that needs to be executed twice in sequence. I pass a payload (say a date) from the DAG file, and based on this date my Lambda fetches records from an API.
The Lambda works fine when I test it manually for different dates, but when I create a DAG file to invoke it twice in sequence, the second invocation is executed more than once; its execution even starts before the first invocation of the Lambda is complete.
I am not using any retries in my DAG, and I'm not sure whether the issue is on the Lambda end, the DAG end, or something configuration related.
No errors are noted in the DAG, but the Lambda log shows multiple invocations for the second task.
Any help with this appreciated!
Thanks!
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta, timezone
import boto3
import json
from airflow.utils.dates import days_ago

args = {'owner': 'airflow', 'start_date': days_ago(1), 'catchup': False, 'provide_context': True}

# 50 13 * * *
dag = DAG('my_dag', schedule_interval=None, default_args=args)

def lambda1_trigger(ds, **kwargs):
    pld = json.dumps({"date": "27/06/2022"})
    lambda_client = boto3.client('lambda', region_name=aws_region,
                                 aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_access_key)
    response_1 = lambda_client.invoke(FunctionName='my_lambda_function',
                                      InvocationType='RequestResponse', Payload=pld)
    print('Response--->', response_1)

def lambda2_trigger(ds, **kwargs):
    pld = json.dumps({"date": "28/06/2022"})
    lambda_client = boto3.client('lambda', region_name=aws_region,
                                 aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_access_key)
    response_1 = lambda_client.invoke(FunctionName='my_lambda_function',
                                      InvocationType='RequestResponse', Payload=pld)
    print('Response--->', response_1)

start = DummyOperator(task_id='Begin_execution', dag=dag)
invoke_lambda1 = PythonOperator(task_id="task1", python_callable=lambda1_trigger,
                                provide_context=True, execution_timeout=timedelta(hours=1), dag=dag)
invoke_lambda2 = PythonOperator(task_id="task2", python_callable=lambda2_trigger,
                                provide_context=True, execution_timeout=timedelta(hours=1), dag=dag)
end = DummyOperator(task_id='stop_execution', dag=dag)

start >> invoke_lambda1 >> invoke_lambda2 >> end
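One frequent cause of this symptom (not confirmed from the code above, so treat it as a guess) is botocore's client-side retry: with InvocationType='RequestResponse', a Lambda that runs longer than the client's read timeout (60 seconds by default) makes boto3 silently resend the request, which shows up in the Lambda logs as an extra invocation. A sketch of building the client with retries disabled and a longer read timeout:
import boto3
from botocore.config import Config

# Sketch: avoid duplicate synchronous invocations caused by client-side retries.
boto_cfg = Config(
    read_timeout=900,              # at least as long as the Lambda's own timeout
    connect_timeout=30,
    retries={'max_attempts': 0},   # do not resend the request after a timeout
)
# aws_region and the key variables are the same ones referenced in the DAG above.
lambda_client = boto3.client('lambda', region_name=aws_region, config=boto_cfg)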

Need to back up data but it's not working in Airflow

I am trying to back up data with Airflow. It doesn't give any error, but the backup never happens because everything gets skipped. Here is the code I have written:
import os
from airflow import DAG
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import (
    BigQueryToGCSOperator,
)
from composer_plugins import get_list_to_backup
from datetime import datetime, timedelta, date
from airflow.exceptions import AirflowFailException, AirflowSkipException
from airflow.operators.python import PythonOperator

"""the function validates if the schedule_day parameter is a valid day to execute the task
schedule_day is a number and corresponds to day of the week
1 - Monday
2 - Tuesday
3 - Wednesday
4 - Thursday
5 - Friday
6 - Saturday
7 - Sunday """

def _check_valid_day(**kwargs):
    today = datetime.today()
    if today.isoweekday() == kwargs["schedule_day"]:
        return True
    else:
        raise AirflowSkipException("does not correspond to the backup day")

today = datetime.today()
today_str = today.strftime("%Y-%m-%d")

# start_date = get_next_weekday(today_str, 5)  # 5 = Saturday
start_date = datetime(2022, 5, 2)

dag_id = "data_bq_weekly_backups_dag"
event_collection_project_id = os.environ["EVENT_COLLECTION_PROJECT_ID"]
tables_to_backup = os.environ["TABLES_TO_BACKUP"]
destination_bucket = os.environ["WEEKLY_BQ_BACKUP_BUCKET"]
schedule_day = os.environ["BACKUP_SCHEDULE_DAY"]

default_dag_args = {
    # Setting start date for next Saturday in order to maintain the scheduler
    # in a consistent state
    "start_date": start_date,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    "email_on_failure": False,
    "email_on_retry": False,
    # If a task fails, retry it once after waiting at least what's specified in retry_delay
    "retries": 1,
    "retry_delay": timedelta(seconds=10),
    "project_id": event_collection_project_id,
    "schedule_interval": "0 2 * * *",
}

tables_to_backup_list = get_list_to_backup(tables_to_backup)

with DAG(dag_id=dag_id, default_args=default_dag_args, catchup=False) as dag:
    check_valid_day = PythonOperator(
        task_id='check_valid_day',
        python_callable=_check_valid_day,
        op_kwargs={
            "schedule_day": schedule_day
        },
    )

    task_dict = dict()
    for table_to_backup in tables_to_backup_list:
        dataset = table_to_backup.split(".")[0]
        table = table_to_backup.split(".")[1]
        task_name = f"{dataset}_{table}_table_weekly_backup"
        task_dict[task_name] = BigQueryToGCSOperator(
            task_id=task_name,
            trigger_rule="all_success",
            dag=dag,
            source_project_dataset_table=table_to_backup,
            destination_cloud_storage_uris=[
                f"gs://{destination_bucket}/{dataset}/{table}/{today.year}/{today.month}/{today.day}/{table}-*.avro"
            ],
            export_format="AVRO",  # OPTIONS: AVRO, CSV, JSON
            compression="NONE",  # OPTIONS: NONE, DEFLATE, GZIP, SNAPPY
            labels=None,
        )
        check_valid_day >> task_dict[task_name]
When I execute this DAG, there is no error, but everything gets skipped:
Airflow DAG tree view
BACKUP_SCHEDULE_DAY=3 is set in the environment variable file. I don't know what's wrong with this or why it's not working.
I tried your code and was able to reproduce your issue. See the run history below:
NOTE: Prior to running your code, I hardcoded values like your environment variables and tables_to_backup_list to make it work in my environment.
The main problem is in _check_valid_day(). The comparison today.isoweekday() == kwargs["schedule_day"] is always False because of a data type mismatch:
print(type(today.isoweekday()))       # <class 'int'>
print(type(kwargs["schedule_day"]))   # <class 'str'>
The fix is to make the data types match. The fix below converts kwargs["schedule_day"] to int:
def _check_valid_day(**kwargs):
    today = datetime.today()
    if today.isoweekday() == int(kwargs["schedule_day"]):  # convert to int
        print("inside the if statement")
        return True
    else:
        raise AirflowSkipException("does not correspond to the backup day")
Graph view:
check_valid_day Logs:
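A small variation on the same fix (my suggestion, not from the original answer) is to do the conversion once, where the environment variable is read, so that every consumer of schedule_day already gets an int:
# Variation: cast at read time instead of inside the callable.
schedule_day = int(os.environ["BACKUP_SCHEDULE_DAY"])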

How to run each task individually (separately) in Airflow?

I have a list of tables I want to run my script through. It works successfully when I do one table at a time, but when I add a for loop above the tasks, it runs all the tables at once, giving me multiple errors.
Here is my code:
# Imports assumed for this snippet (not shown in the original post)
import csv
import gzip
from datetime import datetime, timedelta
from io import StringIO

import paramiko
import psycopg2
from sshtunnel import SSHTunnelForwarder

from airflow import DAG
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator

def create_tunnel_postgres():
    psql_host = ''
    psql_port = 5432
    ssh_host = ''
    ssh_port = 22
    ssh_username = ''
    pkf = paramiko.RSAKey.from_private_key(StringIO(Variable.get('my_key')))
    server = SSHTunnelForwarder(
        (ssh_host, 22),
        ssh_username=ssh_username,
        ssh_private_key=pkf,
        remote_bind_address=(psql_host, 5432))
    return server

def conn_postgres_internal(server):
    """
    Using the server connect to the internal postgres
    """
    conn = psycopg2.connect(
        database='pricing',
        user=Variable.get('postgres_db_user'),
        password=Variable.get('postgres_db_key'),
        host=server.local_bind_host,
        port=server.local_bind_port,
    )
    return conn

def gzip_postgres_table(**kwargs):
    table_name = kwargs['table_name']  # passed in via op_kwargs
    path = '/path/{}.csv'.format(table_name)
    server_postgres = create_tunnel_postgres()
    server_postgres.start()
    etl_conn = conn_postgres_internal(server_postgres)
    cur = etl_conn.cursor()
    cur.execute("""
        select * from schema.db.{} limit 100;
    """.format(table_name))
    result = cur.fetchall()
    column_names = [i[0] for i in cur.description]
    fp = gzip.open(path, 'wt')
    myFile = csv.writer(fp, delimiter=',')
    myFile.writerow(column_names)
    myFile.writerows(result)
    fp.close()
    etl_conn.close()
    server_postgres.stop()

# ------------------------------------------------------------------------------------------------------------------------------------------------
default_args = {
    'owner': 'mae',
    'depends_on_past': False,
    'start_date': datetime(2020, 1, 1),
    'email': ['maom@aol.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=1)
}

tables = ['table1', 'table2']
s3_folder = 'de'
current_timestamp = datetime.now()

# Element'S VARIABLES
dag = DAG('dag1',
          description='O',
          default_args=default_args,
          max_active_runs=1,
          schedule_interval='@once',
          # schedule_interval='hourly'
          catchup=False)

for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name, 's3_folder': s3_folder, 'current_timestamp': current_timestamp},
        dag=dag)
Is there a way to run table1 first, let it finish, and then run table2? I tried doing that with the for table_name in tables: loop, but to no avail. Any ideas or suggestions would help.
Your for loop creates multiple tasks for your table processing, and by default Airflow runs those tasks in parallel.
You can either set the number of workers in the Airflow config file to 1, or create only one task and run your loop inside that task, which will then process the tables sequentially (a sketch of this second option follows below).
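A minimal sketch of that second option, assuming gzip_postgres_table is the function from the question and that it accepts its parameters as keyword arguments:
# Sketch: a single task processes the tables one after another.
def gzip_all_tables(**kwargs):
    for table_name in ['table1', 'table2']:
        gzip_postgres_table(table_name=table_name,
                            s3_folder=kwargs['s3_folder'],
                            current_timestamp=kwargs['current_timestamp'])

gzip_tables_sequentially = PythonOperator(
    task_id='gzip_tables_sequentially',
    python_callable=gzip_all_tables,
    provide_context=True,
    op_kwargs={'s3_folder': s3_folder, 'current_timestamp': current_timestamp},
    dag=dag,
)
Because the loop now lives inside one task, the tables are processed strictly in list order, at the cost of losing per-table retries and per-table visibility in the UI.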
I saw your code, and it seems like you're creating multiple DAG tasks with a looping statement, which runs the tasks in parallel.
There are a few ways to achieve your requirement.
Use the sequential executor.
airflow.executors.sequential_executor.SequentialExecutor will only run task instances sequentially.
https://airflow.apache.org/docs/stable/start.html#quick-start
Create a script that works according to your need.
Create a Python script and use it as a PythonOperator that repeats your current function for the number of tables.
Limit Airflow's parallelism to 1.
You can limit your Airflow workers to 1 in the airflow.cfg config file.
Steps:
Open airflow.cfg from your Airflow root (AIRFLOW_HOME).
Set/update parallelism = 1.
Restart Airflow.
This should work.
I see 3 ways of solving this:
Limit parallelism = 1 in the airflow.cfg file.
Create Python code that loops through your tables and call it from a single PythonOperator.
Create a pool and assign 1 slot to it (see the sketch after this list).
https://airflow.apache.org/docs/stable/concepts.html?highlight=pool#pools
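For the pool option, a sketch of what the task definition could look like, assuming a pool named single_slot_pool has already been created with 1 slot (Admin > Pools in the UI, or the airflow CLI):
# Sketch: every table task shares a 1-slot pool, so only one runs at a time.
for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name},
        pool='single_slot_pool',  # at most one of these tasks runs concurrently
        dag=dag,
    )
Note that a 1-slot pool limits concurrency but does not guarantee the order in which the tasks are picked up; if table1 must strictly precede table2, chaining the tasks with >> inside the loop is the more direct route.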
I think you need a DAG like this.
Code for it:
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
import sys
sys.path.append('../')
from mssql_loader import core    # program code, which starts the load
from mssql_loader import locals  # local variables, contains dictionaries with names

def contact_load(typ, db):
    core.starter(typ=typ, db=db)
    return 'MSSQL LOADED ' + db['DBpseudo'] + '.' + typ

dag = DAG('contact_loader', description='MSSQL sqlcontact.uka.local loader to GBQ',
          schedule_interval='0 7 * * *',
          start_date=datetime(2017, 3, 20), catchup=False)

start_operator = DummyOperator(task_id='ROBO_task', retries=3, dag=dag)

for v in locals.TABLES:
    for db in locals.DB:
        task = PythonOperator(
            task_id=db['DBpseudo'] + '_mssql_' + v,  # creates Express_mssql_fast, UKA_mssql_important, etc.
            python_callable=contact_load,
            op_kwargs={'typ': v, 'db': db},
            retries=3,
            dag=dag,
        )
        start_operator >> task  # create a parent-child connection from the first task to each of the others
dag = DAG(
    dag_id='you_DAG',
    default_args=default_args,
    schedule_interval='10 6 * * *',
    max_active_runs=1  # HERE: execute only 1 at a time
)
