I have set up a cron job as below:
In settings.py:
CRON_CLASSES = [
    "kiteconnect_source.source.cron.MyCronJob",
]

CRONJOBS = [
    ('*/1 * * * *', 'kiteconnect_source.source.cron.MyCronJob')
]
In the cron job Python file:
from django_cron import CronJobBase, Schedule
import datetime

class MyCronJob(CronJobBase):
    ALLOW_PARALLEL_RUNS = False
    RUN_EVERY_MINS = 1

    schedule = Schedule(run_every_mins=RUN_EVERY_MINS)
    code = 'kiteconnect_source.my_cron_job'  # a unique code

    def do(self):
        # time = datetime.datetime.now()
        val = "This is cron job function new testing:"
        f = open('/dummy.txt', 'a')
        f.write(val)
        f.close()
        print("demo")
The issue is that it gets executed only once instead of at every one-minute interval.
Can anyone explain what I've missed? Thanks.
I have a list of tables I want to run my script through. It works successfully when I do one table at a time, but when I add a for loop above the tasks, it runs all the tables at once, giving me multiple errors.
Here is my code:
import gzip
import csv
from io import StringIO
from datetime import datetime, timedelta

import paramiko
import psycopg2
from sshtunnel import SSHTunnelForwarder

from airflow import DAG
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator

def create_tunnel_postgres():
    psql_host = ''
    psql_port = 5432
    ssh_host = ''
    ssh_port = 22
    ssh_username = ''
    pkf = paramiko.RSAKey.from_private_key(StringIO(Variable.get('my_key')))
    server = SSHTunnelForwarder(
        (ssh_host, 22),
        ssh_username=ssh_username,
        ssh_private_key=pkf,
        remote_bind_address=(psql_host, 5432))
    return server

def conn_postgres_internal(server):
    """
    Using the server, connect to the internal Postgres.
    """
    conn = psycopg2.connect(
        database='pricing',
        user=Variable.get('postgres_db_user'),
        password=Variable.get('postgres_db_key'),
        host=server.local_bind_host,
        port=server.local_bind_port,
    )
    return conn
def gzip_postgres_table(**kwargs):
    """
    Dump the first 100 rows of a table to a gzipped CSV file.
    """
    table_name = kwargs['table_name']
    path = '/path/{}.csv'.format(table_name)
    server_postgres = create_tunnel_postgres()
    server_postgres.start()
    etl_conn = conn_postgres_internal(server_postgres)
    cur = etl_conn.cursor()
    cur.execute("""
        select * from schema.db.{} limit 100;
    """.format(table_name))
    result = cur.fetchall()
    column_names = [i[0] for i in cur.description]
    fp = gzip.open(path, 'wt')
    myFile = csv.writer(fp, delimiter=',')
    myFile.writerow(column_names)
    myFile.writerows(result)
    fp.close()
    etl_conn.close()
    server_postgres.stop()
#------------------------------------------------------------------------------------------------------------------------------------------------
default_args = {
    'owner': 'mae',
    'depends_on_past': False,
    'start_date': datetime(2020, 1, 1),
    'email': ['maom@aol.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=1)
}

tables = ['table1', 'table2']
s3_folder = 'de'
current_timestamp = datetime.now()

# Element's variables
dag = DAG('dag1',
          description='O',
          default_args=default_args,
          max_active_runs=1,
          schedule_interval='@once',
          # schedule_interval='@hourly',
          catchup=False)

for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name, 's3_folder': s3_folder, 'current_timestamp': current_timestamp},
        dag=dag)
Is there a way to run table1 first, let it finish, and then run table2? I tried doing that with the for table_name in tables: loop, but to no avail. Any ideas or suggestions would help.
Your for loop is creating multiple tasks for your table processing, and by default Airflow will parallelize the execution of those tasks.
You can either set the number of workers in the Airflow config file to 1, or create only one task and run your loop inside that task, so the tables are then processed synchronously, as sketched below.
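A minimal sketch of that single-task approach, reusing tables, s3_folder, current_timestamp, dag and gzip_postgres_table from the question; the loop moves inside one callable so the tables are processed one after the other:

def gzip_all_tables(**kwargs):
    # Process the tables one by one inside a single task.
    for table_name in tables:
        gzip_postgres_table(table_name=table_name,
                            s3_folder=s3_folder,
                            current_timestamp=current_timestamp)

t_all = PythonOperator(
    task_id='gzip_all_tables',
    python_callable=gzip_all_tables,
    provide_context=True,
    dag=dag)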
I saw your code, and it seems like you're creating multiple DAG tasks using a looping statement, which runs the tasks in parallel.
There are a few ways to achieve your requirement.

Use the SequentialExecutor.
airflow.executors.sequential_executor.SequentialExecutor will only run task instances sequentially.
https://airflow.apache.org/docs/stable/start.html#quick-start

Create a script that works according to your need.
Create a Python script and use it in a PythonOperator that repeats your current function for the number of tables.

Limit Airflow executors (parallelism) to 1.
You can limit your Airflow workers to 1 in the airflow.cfg config file.
Steps:
open airflow.cfg from your Airflow root (AIRFLOW_HOME).
set/update parallelism = 1
restart Airflow.
This should work.
I see 3 ways of solving this:
Limit parallelism = 1 in the airflow.cfg file.
Create a Python function that loops through your tables and call it from a single PythonOperator.
Create a pool and assign 1 slot to it (see the sketch after the link).
https://airflow.apache.org/docs/stable/concepts.html?highlight=pool#pools
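A minimal sketch of the pool option, assuming a pool named single_slot_pool has already been created with 1 slot (Admin -> Pools in the Airflow UI); tables, gzip_postgres_table and dag are taken from the question:

for table_name in tables:
    t1 = PythonOperator(
        task_id='{}_gzip_table'.format(table_name),
        python_callable=gzip_postgres_table,
        provide_context=True,
        op_kwargs={'table_name': table_name},
        pool='single_slot_pool',  # a pool with a single slot serializes these tasks
        dag=dag)

With only one slot in the pool, the scheduler starts the table2 task only after the table1 task has released the slot.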
I think you need a DAG like this.
Code for it:
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
import sys
sys.path.append('../')
from mssql_loader import core    # program code, which starts the load
from mssql_loader import locals  # local variables, contains dictionaries with names

def contact_load(typ, db):
    core.starter(typ=typ, db=db)
    return 'MSSQL LOADED ' + db['DBpseudo'] + '.' + typ

dag = DAG('contact_loader', description='MSSQL sqlcontact.uka.local loader to GBQ',
          schedule_interval='0 7 * * *',
          start_date=datetime(2017, 3, 20), catchup=False)

start_operator = DummyOperator(task_id='ROBO_task', retries=3, dag=dag)

for v in locals.TABLES:
    for db in locals.DB:
        task = PythonOperator(
            task_id=db['DBpseudo'] + '_mssql_' + v,  # creates Express_mssql_fast, UKA_mssql_important, etc.
            python_callable=contact_load,
            op_kwargs={'typ': v, 'db': db},
            retries=3,
            dag=dag,
        )
        start_operator >> task  # create a parent-child connection from the first task to the others
dag = DAG(dag_id='you_DAG',
          default_args=default_args,
          schedule_interval='10 6 * * *',
          max_active_runs=1)  # <--- HERE: only 1 active run at a time
import subprocess as sub
import re
import os
from datetime import datetime as influx_timestap
from influxdb import InfluxDBClient
from collections import OrderedDict

insert_json = []
hostname = str(sub.check_output('hostname')).strip()
location = str(sub.check_output(['ps -ef | grep mgr'], shell=True)).split()
current_dir = os.getcwd()
print("script executed")

gg_location_pattern = re.compile(r'mgr\.prm$')
gg_process_pattertn = re.compile(r'^REPLICAT|^EXTRACT')

for index in location:
    if gg_location_pattern.search(index) != None:
        gg_location = index[:-14]
        os.chdir(gg_location)
        print("checkpoint1")
        get_lag = sub.check_output(str(current_dir) + '/ggsci_test.sh', shell=True)
        print("checkpoint2")
        processes = get_lag.split("\n")
        for process in processes:
            if gg_process_pattertn.search(process) != None:
                lag_at_chkpnt = int((process.split()[3]).split(":")[0]) * 3600 + int((process.split()[3]).split(":")[1]) * 60 + int((process.split()[3]).split(":")[2])
                time_since_chkpnt = int((process.split()[4]).split(":")[0]) * 3600 + int((process.split()[4]).split(":")[1]) * 60 + int((process.split()[4]).split(":")[2])
                process_dict = OrderedDict({"measurement": "GoldenGate_Mon_" + str(hostname) + "_Graph",
                                            "tags": {"hostname": hostname, "process_name": process.split()[2]},
                                            "time": influx_timestap.now().isoformat('T'),
                                            "fields": {"process_type": process.split()[0], "process_status": process.split()[1],
                                                       "lag_at_chkpnt": lag_at_chkpnt, "time_since_chkpnt": time_since_chkpnt}})
                insert_json.append(process_dict)

host = 'xxxxxxxx'
port = 'x'
user = 'x'
password = 'x'
dbname = 'x'

print("before client")
client = InfluxDBClient(host, port, user, password, dbname)
client.write_points(insert_json)
print("after client")
This code works perfectly when run manually, but it does not work from crontab. After searching on the internet I found advice to change or set the "PATH" variable in the crontab. I changed my "PATH" variable and it is still not working.
The crontab log file shows "checkpoint1" and after that there is nothing, so the line that is not working is get_lag = sub.check_output(str(current_dir) + '/ggsci_test.sh', shell=True).
What can I do from here?
Take care,
It looks like your external script (ggsci_test.sh) has some issue with its paths or a general failure.
From the Python subprocess documentation about subprocess.check_output:
If the return code was non-zero it raises a CalledProcessError. The CalledProcessError object will have the return code in the returncode attribute and any output in the output attribute.
So that's the reason why you see the error when catching it, but are not able to continue.
You should therefore check whether your shell script has any issues that need to be solved first.
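One way to see what ggsci_test.sh is returning is to catch the CalledProcessError and print its details; a minimal sketch using current_dir and the script name from the question:

import subprocess as sub

try:
    get_lag = sub.check_output(str(current_dir) + '/ggsci_test.sh',
                               shell=True, stderr=sub.STDOUT)
except sub.CalledProcessError as e:
    # The non-zero exit status and any captured output end up in the cron log.
    print("ggsci_test.sh failed with return code %s" % e.returncode)
    print(e.output)
    raise

Redirecting the cron job's output to a log file would capture the same information.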
I have this Python file:
from datetime import datetime, timedelta

class Get:
    def __init__(self, i):
        self.i = self.get_date(i)
        self.df = self.get_file()

    def get_file(self):
        try:
            ...
            return df
        except Exception as e:
            return ...

    def get_date(self, i):
        dt = datetime.now() - timedelta(days=i)
        return dt.strftime("%Y-%m-%d")

    def put(self, df):
        ....

class Fix:
    def __init__(self, df):
        ....

if __name__ == '__main__':
    for i in range(4, 0, -1):
        get = Get(i)
        fix = Fix(get.df)
        get.put(fix.df)
Basically this code generates the last 4 dates and runs the functions over those dates (update statistics etc.).
At first I wanted to convert each function into a PythonOperator and then schedule it, but I don't think this will work. I don't know how to convert the classes and the parameters that are passed between them.
This is what the code does if I run it on 2018-Jun-12, and below is what it should look like with Airflow:
Is there a template that I can use, or any suggestion on how to do it?
You can either execute your script using a BashOperator, without any changes to your script:
dag = DAG('{NAME_OF_THE_DAG}', schedule_interval='@daily',
          default_args=default_args)

t1 = BashOperator(
    task_id='{NAME_OF_TASK}',
    dag=dag,
    bash_command='python {NAME_OF_THE_FILE_TO_EXECUTE}.py')
or use a PythonOperator:
Update your code to create a main function in your script:
def main():
    for i in range(4, 0, -1):
        get = Get(i)
        fix = Fix(get.df)
        get.put(fix.df)
Define and execute the DAG:

dag = DAG('{NAME_OF_THE_DAG}', schedule_interval='@daily',
          default_args=default_args)

t1 = PythonOperator(
    task_id='{NAME_OF_TASK}',
    dag=dag,
    python_callable=main)
I'm trying to run a unit test on a celery task that I have set to run daily.
I have tried importing the function and calling it in my test, but this doesn't work.
The task is:
@shared_task
def create_a_notification_if_a_product_is_in_or_out_of_season():
    """
    Send a notification if a product is now in or out of season
    """
    julian_date = date.today().timetuple().tm_yday + 1
    active_products = Product.objects.filter(status='ACTIVE')
    for products in active_products:
        in_season_prd = ProductDescription.objects.filter(
            product=products,
            early_start_julian=julian_date
        )
        for prd in in_season_prd:
            notification = Notification()
            notification.type = notification_choices.PRODUCT_IN_SEASON
            notification.description = str(prd.product.name) + " will be in season from tomorrow."
            notification.save()
and here is an example of one of my tests:
def test_when_product_is_about_to_come_in_to_seasonality(self):
    """
    Make a notification when a product is due to come in to seasonality tomorrow
    """
    p = Product.objects.first()
    p.status = "ACTIVE"
    today = date.today().timetuple().tm_yday
    p.early_start_julian = today + 1
    create_a_notification_if_a_product_is_in_or_out_of_season()
    updated_notifications = Notification.objects.all().count()
    self.assertNotEqual(self.current_notifications, updated_notifications)
Any help would be appreciated!
Thanks
You can apply() your celery task to execute it synchronously:
def test_when_product_is_about_to_come_in_to_seasonality(self):
    """
    Make a notification when a product is due to come in to seasonality tomorrow
    """
    p = Product.objects.first()
    p.status = "ACTIVE"
    today = date.today().timetuple().tm_yday
    p.early_start_julian = today + 1
    create_a_notification_if_a_product_is_in_or_out_of_season.apply()
    updated_notifications = Notification.objects.all().count()
    self.assertNotEqual(self.current_notifications, updated_notifications)
I think you're looking for the CELERY_ALWAYS_EAGER setting. If set to True it will run your tasks synchronously. You can set it in your test settings, or you can decorate only that test with @override_settings(CELERY_ALWAYS_EAGER=True), as sketched below.
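A minimal sketch of the decorator approach, reusing the task and assertions from the question; the test class name is an assumption, and the imports for the task and the Notification model are assumed to be in place:

from django.test import TestCase, override_settings

class NotificationTests(TestCase):

    @override_settings(CELERY_ALWAYS_EAGER=True)
    def test_when_product_is_about_to_come_in_to_seasonality(self):
        # With CELERY_ALWAYS_EAGER=True, .delay() runs the task locally
        # and synchronously instead of sending it to a worker.
        create_a_notification_if_a_product_is_in_or_out_of_season.delay()
        updated_notifications = Notification.objects.all().count()
        self.assertNotEqual(self.current_notifications, updated_notifications)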
I have a Python script, followback.py, which I am trying to run using cron.
The script runs fine on its own, i.e. when run with the command python followback.py.
But the script is never run when using cron.
My crontab file:
* * * * * python /home/ubuntu/./followback.py
* * * * * python /home/ubuntu/./test.py
I am using test.py as a simple testing measure; it writes to a file to let me know that it has been run.
followback.py:
import io, json
import twitter

def save_json(filename, data):
    with io.open('{0}.json'.format(filename),
                 'w', encoding='utf-8') as f:
        f.write(unicode(json.dumps(data, ensure_ascii=False)))

def load_json(filename):
    with io.open('{0}.json'.format(filename),
                 encoding='utf-8') as f:
        return f.read()

CONSUMER_KEY = xx
CONSUMER_SECRET = xx
OAUTH_TOKEN = xx
OAUTH_TOKEN_SECRET = xx

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)

q = 'followback'
count = 20
page = 1
results = []
maxResults = 50
filename = 'attempted_accounts'

try:
    usedUsers = json.loads(load_json(filename))
except IOError:
    usedUsers = []

usedList = [used['id'] for used in usedUsers]

# search for 'followback' and follow the ones with 'followback' in the description
while len(results) < maxResults:
    users = twitter_api.users.search(q=q, count=count, page=page)
    results += [user for user in users if 'followback' in user['description'] and user['id'] not in usedList]
    page += 1

[twitter_api.friendships.create(user_id=user['id'], follow='true') for user in results]

out = usedUsers + [{'id': e['id']} for e in results]
save_json(filename, out)
The script above simply searches Twitter for users with "followback" in the description and follows them.
The test.py script runs fine through cron, but followback.py does not, and I have no clue as to what could be wrong.
Any suggestions?
Check if the followback.py file is executable; if not, use chmod +x and it should work. That's the most common issue with this. Look at a similar case here.
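If cron is meant to execute the file directly (rather than via the python command as in the crontab above), the script also needs a shebang as its first line; a minimal sketch of the top of followback.py under that assumption:

#!/usr/bin/env python
# With the executable bit set (chmod +x /home/ubuntu/followback.py), cron can
# invoke the file directly, e.g. "* * * * * /home/ubuntu/followback.py".
import io, json
import twitter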