I am trying to schedule tasks within my Flask app to fire every now and then. This works great with the Flask development server, but once I run the same app with Gunicorn it does not work as intended: no errors, but no tasks are fired.
from flask import Flask
from flask_apscheduler import APScheduler


class Config(object):
    JOBS = [
        {
            'id': 'job1',
            'func': 'myapp:job1',
            'args': (1, 2),
            'trigger': 'interval',
            'seconds': 5
        }
    ]

    SCHEDULER_API_ENABLED = True


def job1(a, b):
    print(str(a) + ' ' + str(b))


app = Flask(__name__)

if __name__ == '__main__':
    app.config.from_object(Config())

    scheduler = APScheduler()
    scheduler.init_app(app)
    scheduler.start()

    app.run()
For the purposes of this sample, python myapp.py works fine and prints 1 2 every five seconds. However, running gunicorn -w 1 -b 0.0.0.0:5000 myapp:app does not print 1 2 every five seconds. Why?
So, two things:
1) Use logging; you get a LOT more info.
2) You need to declare the scheduler outside your if __name__ == '__main__': block:
from flask import Flask
from flask_apscheduler import APScheduler
import logging

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s')

logger = logging.getLogger(__name__)


class Config(object):
    JOBS = [
        {
            'id': 'job1',
            'func': 'scratch:job1',
            'args': (1, 2),
            'trigger': 'interval',
            'seconds': 5
        }
    ]

    SCHEDULER_API_ENABLED = True


def job1(a, b):
    logger.info(str(a) + ' ' + str(b))


app = Flask(__name__)
app.config.from_object(Config())

scheduler = APScheduler()
scheduler.init_app(app)
scheduler.start()


if __name__ == '__main__':
    app.run()
output:
[2017-02-15 14:29:39 +0000] [25122] [INFO] Booting worker with pid: 25122
2017-02-15 14:29:39,817 INFO Adding job tentatively -- it will be properly scheduled when the scheduler starts
2017-02-15 14:29:39,819 INFO Added job "job1" to job store "default"
2017-02-15 14:29:39,819 INFO Scheduler started
2017-02-15 14:29:39,820 DEBUG Looking for jobs to run
2017-02-15 14:29:39,820 DEBUG Next wakeup is due at 2017-02-15 14:29:44.816072-05:00 (in 4.995362 seconds)
2017-02-15 14:29:44,816 DEBUG Looking for jobs to run
2017-02-15 14:29:44,817 INFO Running job "job1 (trigger: interval[0:00:05], next run at: 2017-02-15 14:29:44 EST)" (scheduled at 2017-02-15 14:29:44.816072-05:00)
2017-02-15 14:29:44,817 INFO 1 2
The issue:
The job didn't fire originally because your scheduler definition was inside your if __name__ == '__main__': block. That code only runs when you call the Python file directly (python myapp.py).
When you launched it with Gunicorn, __name__ was no longer '__main__' (it was the module name, 'myapp'), so it never got to the part of the code that created and started the scheduler :)
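If you want to see the difference for yourself, here is a quick check (my addition, not part of the original answer): print __name__ at import time and compare the two ways of starting the app.

import sys

# temporary debug line near the top of myapp.py (remove afterwards)
sys.stderr.write('myapp imported as %r\n' % __name__)

# python myapp.py                           -> myapp imported as '__main__'
# gunicorn -w 1 -b 0.0.0.0:5000 myapp:app   -> myapp imported as 'myapp'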
Related
I tried creating a Dataproc cluster both through Airflow and through the Google cloud UI, and the cluster creation always fails at the end. Following is the airflow code I am using to create the cluster -
# STEP 1: Libraries needed
from datetime import timedelta, datetime
from airflow import models
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators import dataproc_operator
from airflow.utils import trigger_rule
from poc.utils.transform import main
from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
from airflow.operators.python_operator import BranchPythonOperator
import os

YESTERDAY = datetime.combine(
    datetime.today() - timedelta(1),
    datetime.min.time())

project_name = os.environ['GCP_PROJECT']

# Can pull in spark code from a gcs bucket
# SPARK_CODE = ('gs://us-central1-cl-composer-tes-fa29d311-bucket/spark_files/transformation.py')
dataproc_job_name = 'spark_job_dataproc'

default_dag_args = {
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'start_date': YESTERDAY,
    'retry_delay': timedelta(minutes=5),
    'project_id': project_name,
    'owner': 'DataProc',
}

with models.DAG(
        'dataproc-poc',
        description='Dag to run a simple dataproc job',
        schedule_interval=timedelta(days=1),
        default_args=default_dag_args) as dag:

    CLUSTER_NAME = 'dataproc-cluster'

    def ensure_cluster_exists(ds, **kwargs):
        cluster = DataProcHook().get_conn().projects().regions().clusters().get(
            projectId=project_name,
            region='us-east1',
            clusterName=CLUSTER_NAME
        ).execute(num_retries=5)
        print(cluster)
        if cluster is None or len(cluster) == 0 or 'clusterName' not in cluster:
            return 'create_dataproc'
        else:
            return 'run_spark'

    # start = BranchPythonOperator(
    #     task_id='start',
    #     provide_context=True,
    #     python_callable=ensure_cluster_exists,
    # )

    print_date = BashOperator(
        task_id='print_date',
        bash_command='date'
    )

    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name=CLUSTER_NAME,
        num_workers=2,
        use_if_exists='true',
        zone='us-east1-b',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the PySpark job
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=main,
        cluster_name=CLUSTER_NAME,
        job_name=dataproc_job_name
    )

    # dataproc_operator
    # Delete Cloud Dataproc cluster.
    # delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
    #     task_id='delete_dataproc',
    #     cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    #     trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # STEP 6: Set DAG dependencies
    # Each task should run after the one before it has finished.
    print_date >> create_dataproc >> run_spark
    # print_date >> start >> create_dataproc >> run_spark
    # start >> run_spark
I checked the cluster logs and saw the following errors -
Unable to store master key 1
Unable to store master key 2
Initialization failed. Exiting 125 to prevent restart
Cannot start master: Timed out waiting for 2 datanodes and nodemanagers.
Operation timed out: Only 0 out of 2 minimum required datanodes running.
Operation timed out: Only 0 out of 2 minimum required node managers running.
This error suggests that the worker nodes are not able to communicate with the master node. When the worker nodes cannot report to the master node within the given timeframe, cluster creation fails.
Please check whether you have set up the correct firewall rules to allow communication among the VMs.
You can refer to the following for network configuration best practices: https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/network#overview
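If the cluster runs on the default auto-mode network, a rule that allows all internal traffic between the VMs is usually enough. As a rough sketch only (the rule name, project id, network and source range below are my assumptions, not taken from the question), such a rule could be created with the Compute Engine API from Python:

# Hypothetical sketch: allow all internal traffic so Dataproc workers can reach the master.
from googleapiclient import discovery

project = 'my-gcp-project'  # assumed project id
compute = discovery.build('compute', 'v1')

firewall_body = {
    'name': 'allow-internal-dataproc',     # hypothetical rule name
    'network': 'global/networks/default',  # assumes the default network
    'sourceRanges': ['10.128.0.0/9'],      # default auto-mode internal range
    'allowed': [
        {'IPProtocol': 'tcp', 'ports': ['0-65535']},
        {'IPProtocol': 'udp', 'ports': ['0-65535']},
        {'IPProtocol': 'icmp'},
    ],
}

compute.firewalls().insert(project=project, body=firewall_body).execute()

The same rule can of course be created in the Cloud Console or with gcloud; the point is simply that the cluster's VMs must be able to reach each other on the network the cluster uses.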
I keep getting the same error from a scheduled BashOperator that is currently back-filling (it's over a month "behind").
[2018-06-10 22:06:33,558] {base_task_runner.py:115} INFO - Running: ['bash', '-c', u'airflow run dag_name task_name 2018-03-14T00:00:00 --job_id 50 --raw -sd DAGS_FOLDER/dag_file.py']
Traceback (most recent call last):
File "/anaconda/bin//airflow", line 27, in <module>
args.func(args)
File "/anaconda/lib/python2.7/site-packages/airflow/bin/cli.py", line 387, in run
run_job.run()
File "/anaconda/lib/python2.7/site-packages/airflow/jobs.py", line 198, in run
self._execute()
File "/anaconda/lib/python2.7/site-packages/airflow/jobs.py", line 2512, in _execute
self.task_runner.start()
File "/anaconda/lib/python2.7/site-packages/airflow/task_runner/bash_task_runner.py", line 29, in start
self.process = self.run_command(['bash', '-c'], join_args=True)
File "/anaconda/lib/python2.7/site-packages/airflow/task_runner/base_task_runner.py", line 120, in run_command
universal_newlines=True
File "/anaconda/lib/python2.7/subprocess.py", line 394, in __init__
errread, errwrite)
File "/anaconda/lib/python2.7/subprocess.py", line 1047, in _execute_child
raise child_exception
OSError: [Errno 2] No such file or directory
[2018-06-10 22:06:33,633] {sequential_executor.py:47} ERROR - Failed to execute task Command 'airflow run dag_name task_name 2018-03-14T00:00:00 --local -sd /var/lib/airflow/dags/dag_file.py' returned non-zero exit status 1.
I remember seeing something that suggested this might be a permissions issue, but I can't figure out which permissions might be involved.
I'm using a systemd configuration--and at my wit's end--I've taken to running the airflow webserver and scheduler as root.
I can take the list in the first line and enter it verbatim in an ipython shell as args to a subprocess.Popen instance (as it is in airflow/task_runner/base_task_runner.py, except with no env passed), and not only does it run, it correctly informs the airflow db that the task is complete. I can do this as user airflow, root, or ubuntu.
I've added /anaconda/bin to the PATH in .bashrc for Airflow, root, ubuntu, and /etc/bash.bashrc in addition to the value for AIRFLOW_HOME which is also in my env file /etc/airflow.
This is what my systemd entry looks like:
[Unit]
Description=Airflow scheduler daemon
After=network.target postgresql.service mysql.service redis.service rabbitmq-server.service
Wants=postgresql.service mysql.service redis.service rabbitmq-server.service
[Service]
EnvironmentFile=/etc/airflow
User=root
Group=root
Type=simple
ExecStart=/anaconda/bin/airflow scheduler
Restart=always
RestartSec=5s
[Install]
WantedBy=multi-user.target
My env file:
PATH=$PATH:/anaconda/bin/
AIRFLOW_HOME=/var/lib/airflow
AIRFLOW_CONFIG=$AIRFLOW_HOME/airflow.cfg
Using apache-airflow==1.9.0 and desperate for a solution. Thanks in advance.
Airflow.cfg:
[core]
airflow_home = /var/lib/airflow
dags_folder = /var/lib/airflow/dags
base_log_folder = /var/lib/airflow/logs
remote_log_conn_id =
encrypt_s3_logs = False
logging_level = INFO
logging_config_class =
log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
executor = SequentialExecutor
sql_alchemy_conn = {actual value hidden}
sql_alchemy_pool_size = 5
sql_alchemy_pool_recycle = 3600
parallelism = 4
dag_concurrency = 2
dags_are_paused_at_creation = True
non_pooled_task_slot_count = 16
max_active_runs_per_dag = 1
load_examples = False
plugins_folder = /var/lib/airflow/plugins
fernet_key = {actual value hidden}
donot_pickle = False
dagbag_import_timeout = 30
task_runner = BashTaskRunner
default_impersonation =
security =
unit_test_mode = False
task_log_reader = file.task
enable_xcom_pickling = True
killed_task_cleanup_time = 60
[cli]
api_client = airflow.api.client.local_client
endpoint_url = http://localhost:8080
[api]
auth_backend = airflow.api.auth.backend.default
[operators]
default_owner = root
default_cpus = 1
default_ram = 512
default_disk = 512
default_gpus = 0
[webserver]
base_url = http://localhost:8080
web_server_host = 0.0.0.0
web_server_port = 8080
web_server_ssl_cert =
web_server_ssl_key =
web_server_worker_timeout = 120
worker_refresh_batch_size = 1
worker_refresh_interval = 60
secret_key = temporary_key
workers = 1
worker_class = sync
access_logfile = -
error_logfile = -
expose_config = False
authenticate = False
filter_by_owner = False
owner_mode = user
dag_default_view = tree
dag_orientation = LR
demo_mode = False
log_fetch_timeout_sec = 5
hide_paused_dags_by_default = False
page_size = 100
[email]
email_backend = airflow.utils.email.send_email_smtp
[smtp]
smtp_host = localhost
smtp_starttls = True
smtp_ssl = False
smtp_port = 25
smtp_mail_from = airflow@example.com
[celery]
...
[dask]
cluster_address = 127.0.0.1:8786
[scheduler]
job_heartbeat_sec = 120
scheduler_heartbeat_sec = 120
run_duration = -1
min_file_process_interval = 0
dag_dir_list_interval = 300
print_stats_interval = 300
child_process_log_directory = /var/lib/airflow/logs/scheduler
scheduler_zombie_task_threshold = 900
catchup_by_default = True
max_tis_per_query = 0
statsd_on = False
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow
max_threads = 1
authenticate = False
[ldap]
...
[mesos]
...
[kerberos]
...
[github_enterprise]
...
[admin]
hide_sensitive_variable_fields = True
Adding ls -hal
root@ubuntu:/var/lib/airflow# ls -hal /var
total 52K
drwxr-xr-x 13 root root 4.0K Jun 3 11:58 .
root@ubuntu:/var/lib/airflow# ls -hal /var/lib
total 164K
drwxr-xr-x 42 root root 4.0K Jun 10 19:00 .
root@ubuntu:/var/lib/airflow# ls -hal
total 40K
drwxr-xr-x 4 airflow airflow 4.0K Jun 11 06:41 .
drwxr-xr-x 42 root root 4.0K Jun 10 19:00 ..
-rw-r--r-- 1 airflow airflow 13K Jun 11 06:41 airflow.cfg
-rw-r--r-- 1 airflow airflow 579 Jun 10 19:00 airflow.conf
drwxr-xr-x 2 airflow airflow 4.0K Jun 10 21:27 dags
drwxr-xr-x 4 airflow airflow 4.0K Jun 10 20:31 logs
-rw-r--r-- 1 airflow airflow 1.7K Jun 10 19:00 unittests.cfg
root@ubuntu:/var/lib/airflow# ls -hal dags/
total 16K
drwxr-xr-x 2 airflow airflow 4.0K Jun 10 21:27 .
drwxr-xr-x 4 airflow airflow 4.0K Jun 11 06:41 ..
-rw-r--r-- 1 airflow airflow 3.4K Jun 10 21:26 dag_file.py
-rw-r--r-- 1 airflow airflow 1.7K Jun 10 21:27 dag_file.pyc
and contents of dag_file.py:
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'root',
    'run_as': 'root',
    'depends_on_past': True,
    'start_date': datetime(2018, 2, 20),
    'email': ['myemail@gmail.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'end_date': datetime(2018, 11, 15),
}

env = {
    'PSQL': '{obscured}',
    'PATH': '/anaconda/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin',
    'PWD': '/home/ubuntu/{obs1}/',
    'HOME': '/home/ubuntu',
    'PYTHONPATH': '/home/ubuntu/{obs1}',
}

dag = DAG(
    'dag_name',
    default_args=default_args,
    description='',
    schedule_interval=timedelta(days=1))

t1 = BashOperator(
    env=env,
    task_id='dag_file',
    bash_command='export PYTHONPATH=/home/ubuntu/{obs1} && /anaconda/bin/ipython $PYTHONPATH/{obs2}/{obs3}.py {{ ds }}',
    dag=dag)
And I remind you that this runs correctly as airflow, root, and ubuntu: airflow run dag_name dag_file 2018-03-17T00:00:00 --job_id 55 --raw -sd DAGS_FOLDER/dag_file.py
It looks like a Python version mismatch. Edit your .bashrc with the proper Python version and run:
source .bashrc
This will resolve your issue.
In my case we are using export PATH="/opt/miniconda3/bin":$PATH
Also, as a check, I can do this:
/opt/miniconda3/bin/python /opt/miniconda3/bin/airflow
This is how I used to run airflow.
On Airflow v1.10.0 you just specify the filepath, without the space at the end anymore.
Example:
compact_output_task = BashOperator(**{
    'task_id': 'compact_output',
    'bash_command': './compact_output.sh',
    'xcom_push': True,
})
Systemd's EnvironmentFile won't expand the variables inside it, so your PATH will only contain /anaconda/bin. If you just want to extend your PATH, it's better to use:
ExecStart=/bin/bash -c 'PATH=/path/to/venv/bin/:$PATH exec /path/to/airflow scheduler'
This solved my issue with "No such file or directory": airflow couldn't find the binary I was calling inside my BashOperator.
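Alternatively (a sketch that assumes the same paths as in the question; note that $AIRFLOW_HOME in the AIRFLOW_CONFIG line won't be expanded either), you can write the values out literally in /etc/airflow:

PATH=/anaconda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
AIRFLOW_HOME=/var/lib/airflow
AIRFLOW_CONFIG=/var/lib/airflow/airflow.cfg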
I am trying to cache the result of a time-consuming request.
First, I have a Flask app as follows:
@app.route("/")
@app.route("/tabs", methods=['GET', 'POST'])
def tab():
    return render_template("tabs.html")


@app.route("/graph", methods=['GET', 'POST'])
def graph():
    # Some code
    return render_template("chart.html", the_div=div, the_script=script,
                           form=form, tables=table, titles=testyear)


@app.route("/prices", methods=['GET', 'POST'])
def prices():
    # Some other code
    return render_template("prices.html", PlotGroup=PlotGroup,
                           ScriptGroup=ScriptGroup, DivGroup=DivGroup)
At the top of my code I have initialized the app, the cache and a timeout:
# Checking is prod to change server from 5000 to 5001
IS_PROD = sys.argv[1] == "prod"
# Setting up cache timer
CACHE_TIMEOUT = 20
# Defining the Flask App
app = Flask(__name__, template_folder='Template')
# define the cache config :
app.config['CACHE_TYPE'] = 'simple'
app.cache = Cache(app)
I have also created a config class:
class Config(object):
    JOBS = [
        {
            'id': 'refresh_cache',
            'func': 'main:get_my_cache',
            'trigger': 'interval',
            'seconds': 5
        }
    ]

    SCHEDULER_API_ENABLED = True
With a function get_my_cache() defined as below:
@app.cache.cached(timeout=CACHE_TIMEOUT, key_prefix='my-cache')
def get_my_cache():
    cacheval = app.cache.get('my-cache')
    print(cacheval)
    if cacheval is None:
        # cacheval1, cacheval2 = DataHandling.extract_full_table()
        cacheval1, cacheval2 = DataHandling.offlinedata()
        cacheval = [cacheval1, cacheval2]
        print("Cache updated at : " + time.strftime("%b %d %Y - %H:%M:%S"))
        app.cache.set('my-cache', [cacheval1, cacheval2])
    return cacheval[0], cacheval[1]
In the main section I load everything:
if __name__ == '__main__':
    app.config.from_object(Config())

    scheduler = APScheduler()
    scheduler.init_app(app)
    scheduler.start()

    if IS_PROD:
        app.run(host='0.0.0.0', debug=False, port=5000)
    else:
        app.run(debug=True, port=5001)
So, if I understand correctly from the timeline below:
None
Cache updated at : Jun 19 2017 - 11:25:58
None
Cache updated at : Jun 19 2017 - 11:26:23
None
Cache updated at : Jun 19 2017 - 11:26:25
127.0.0.1 - - [19/Jun/2017 11:26:25] "GET /graph HTTP/1.1" 200 -
My scheduler checks my cache every 5 seconds (the timing is for testing; it will be longer in reality), and I do indeed see a cache update every 25 seconds.
My problem is that when I refresh the page, I see a cache update only 2 seconds after the last one... From my understanding it seems there are two kinds of cache: one for the page (localhost/graph) and another set up by the scheduler, even though both are tied to the same key_prefix...
I understand this could be related to different threads? Could that be the issue?
def task1(a, b, app):
    with app.app_context():
        # cache.set("t1", "123")
        x = cache.get("t1")
        print(x)


class Config(object):
    JOBS = [{  # add task1
        'id': 'job1',
        'func': '__main__:task1',
        'args': (3, 4, app),
        'trigger': 'interval',
        'seconds': 5,
    }]
I hit a problem with my nginx+uwsgi+django site.
I know it's nothing specific to django+uwsgi; it should be something with the logging module itself.
Within my site, I use RotatingFileHandler to log special entries, but when uwsgi is running with multiple worker processes, I found today that
multiple log files are changing at the same time. For example, here is a file listing:
[root@speed logs]# ls -lth
total 18M
-rw-rw-rw- 1 root root 2.1M Sep 14 19:44 backend.log.7
-rw-rw-rw- 1 root root 1.3M Sep 14 19:43 backend.log.6
-rw-rw-rw- 1 root root 738K Sep 14 19:43 backend.log.3
-rw-rw-rw- 1 root root 554K Sep 14 19:43 backend.log.1
-rw-rw-rw- 1 root root 1013K Sep 14 19:42 backend.log.4
-rw-rw-rw- 1 root root 837K Sep 14 19:41 backend.log.5
-rw-rw-rw- 1 root root 650K Sep 14 19:40 backend.log.2
-rw-rw-rw- 1 root root 656K Sep 14 19:40 backend.log
-rw-r--r-- 1 root root 10M Sep 13 10:11 backend.log.8
-rw-r--r-- 1 root root 0 Aug 21 15:53 general.log
[root@speed-app logs]#
Actually, I set the rotation to 10M per file and up to 10 files.
I googled a lot and many people have hit this before; it seems the logging module itself cannot support this.
And I found someone mentioned ConcurrentLogHandler (https://pypi.python.org/pypi/ConcurrentLogHandler/0.9.1).
Has anybody used this before? I see it's based on a file lock; I don't know whether its performance is good enough.
Or does anyone have a better idea for logging multiple uwsgi instances to the same rotated file?
Thanks.
Wesley
Just for the heck of it, here is a complete solution example which uses python StreamHandler, uWSGI "daemonized file logging", and logrotate daemon to log to file with rotation.
As you will see, uWSGI logging captures stdout/stderr from your app and redirects it either to stdout/stderr (by default) or to other logger/handlers as defined.
Setup Django/uWSGI
Your Django settings.py
LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'default': {
            'format': '%(asctime)s - %(process)s - %(levelname)s - %(name)s : %(message)s',
        },
    },
    'handlers': {
        'console': {
            'level': 'DEBUG',
            'class': 'logging.StreamHandler',
        },
    },
    'root': {
        'handlers': ['console'],
        'level': 'DEBUG',
    },
}
Somewhere in your code
log = logging.getLogger(__name__)
log.info("test log!")
Run uWSGI with some logging params
$ uwsgi --http :9090 --chdir=`pwd -P` --wsgi-file=wsgi.py \
    --daemonize=test.log \
    --log-maxsize=10000 \
    --workers=4
Here --daemonize both daemonizes uWSGI and sets the log file, --log-maxsize rotates the log at roughly 10 KB, and --workers starts 4 workers.
Output
Excerpt of test.log
*** uWSGI is running in multiple interpreter mode ***
spawned uWSGI master process (pid: 79755)
spawned uWSGI worker 1 (pid: 79813, cores: 1)
spawned uWSGI worker 2 (pid: 79814, cores: 1)
spawned uWSGI worker 3 (pid: 79815, cores: 1)
spawned uWSGI worker 4 (pid: 79816, cores: 1)
spawned uWSGI http 1 (pid: 79817)
2015-10-12 07:55:48,458 - 79816 - INFO - testapp.views : test log!
2015-10-12 07:55:51,440 - 79813 - INFO - testapp.views : test log!
2015-10-12 07:55:51,965 - 79814 - INFO - testapp.views : test log!
2015-10-12 07:55:52,810 - 79815 - INFO - testapp.views : test log!
In the same dir, after a while:
-rw-r----- 1 big staff 1.0K Oct 12 09:56 test.log
-rw-r----- 1 big staff 11K Oct 12 09:55 test.log.1444636554
Logrotate
Alternatively, to handle rotating the files yourself, omit the --log-maxsize parameter and use a logrotate config file (/etc/logrotate.d/uwsgi-test-app):
/home/demo/test_django/*log {
    rotate 10
    size 10k
    daily
    compress
    delaycompress
}
Please note, the above values are for example's sake; you probably don't want the rotate size at 10k. For more info on the logrotate format, see an example blog post.
If you have to use Python's log rotation (when multiple gunicorn processes are pointing to the same log file), then you should make sure that the main log file is only edited, never renamed or moved, during rotation. For this, you copy the main log file and then clear it out!
Snippet for the rollover method (an edit to logging.handlers.RotatingFileHandler's code):
def doRollover(self):
    # note: "import shutil" is needed at the top of the module
    self.stream.close()
    if self.backupCount > 0:
        for i in range(self.backupCount - 1, 0, -1):
            sfn = "%s.%d" % (self.baseFilename, i)
            dfn = "%s.%d" % (self.baseFilename, i + 1)
            if os.path.exists(sfn):
                if os.path.exists(dfn):
                    os.remove(dfn)
                os.rename(sfn, dfn)
        dfn = self.baseFilename + ".1"
        if os.path.exists(dfn):
            os.remove(dfn)
        # os.rename(self.baseFilename, dfn)  # Instead of this
        # Do this
        shutil.copyfile(self.baseFilename, dfn)
        open(self.baseFilename, 'w').close()
    if self.encoding:
        self.stream = codecs.open(self.baseFilename, "w", self.encoding)
    else:
        self.stream = open(self.baseFilename, "w")
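If you'd rather not patch the standard library in place, the same copy-and-truncate behaviour can live in a small subclass (a sketch of my own; the class name is not from the original answer):

import logging.handlers
import os
import shutil


class CopyTruncateRotatingFileHandler(logging.handlers.RotatingFileHandler):
    """Rotate by copying the live file and truncating it in place, so the file
    that other worker processes hold open is never renamed or moved."""

    def doRollover(self):
        if self.stream:
            self.stream.close()
            self.stream = None
        if self.backupCount > 0:
            for i in range(self.backupCount - 1, 0, -1):
                sfn = "%s.%d" % (self.baseFilename, i)
                dfn = "%s.%d" % (self.baseFilename, i + 1)
                if os.path.exists(sfn):
                    if os.path.exists(dfn):
                        os.remove(dfn)
                    os.rename(sfn, dfn)
            dfn = self.baseFilename + ".1"
            if os.path.exists(dfn):
                os.remove(dfn)
            shutil.copyfile(self.baseFilename, dfn)  # copy the live file...
            open(self.baseFilename, 'w').close()     # ...then truncate it in place
        self.stream = self._open()                   # reopen with the handler's own mode/encoding

Either handler then plugs into the logger setup shown next.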
Then you can create your logger like this:
logger = logging.getLogger(logfile_name)
logfile = '{}/{}.log'.format(logfile_folder, logfile_name)
handler = RotatingFileHandler(
    logfile, maxBytes=maxBytes, backupCount=10
)
formatter = logging.Formatter(format, "%Y-%m-%d_%H:%M:%S")
formatter.converter = time.gmtime
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.isEnabledFor = lambda level: True
logger.propagate = 0
logger.warning("This is a log")
I'm running a simple task, triggered from a Django view:
task = mock_deploy.delay()
mock_deploy is defined as:
from celery.decorators import task as ctask
from project.fabscripts.task.mock import *


@ctask(name="mock_deploy")
def mock_deploy():
    print "hi form celery task b4 mockdeploy 1234"
    output = execute(mock_deploy2)
    return "out: %s" % (output)
And the fabric task itself is defined as:
@task
def mock_deploy2():
    lrun("ls -l /")
    lrun("ifconfig eth0")
    # i need to get the full output from those commands and save them to db
And now... I was trying to substitute stdout by overriding fabric's execute function:
def execute(task):
    output = StringIO()
    error = StringIO()
    sys.stdout = output
    sys.stderr = error
    task()
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__
    return (output.getvalue(), error.getvalue())
And I was also trying to substitute stdout within the fabric task itself. No matter what I did, the only output I was getting was the first line of "what fabric wants to do":
out: [localhost] local: ls -l /
Then the whole output of the ls command was printed perfectly fine in the celery log, except for that one missing line, out: [localhost] local: ls -l / (the one I managed to get as output).
[2012-06-14 21:33:56,587: DEBUG/MainProcess] TaskPool: Apply <function execute_and_trace at 0x36710c8> (args:('mock_deploy', '2a90d920-130a-4942-829b-87f4d5ebe80f', [], {}) kwargs:{'hostname': 's16079364', 'request': {'retries': 0, 'task': 'mock_deploy', 'utc': False, 'loglevel': 10, 'delivery_info': {'routing_key': u'celery', 'exchange': u'celery'}, 'args': [], 'expires': None, 'is_eager': False, 'eta': None, 'hostname': 's16079364', 'kwargs': {}, 'logfile': None, 'id': '2a90d920-130a-4942-829b-87f4d5ebe80f'}})
[2012-06-14 21:33:56,591: DEBUG/MainProcess] Task accepted: mock_deploy[2a90d920-130a-4942-829b-87f4d5ebe80f] pid:22214
hi form celery task b4 mockdeploy 1234
total 3231728
-rw-r--r-- 1 root root 3305551148 2012-06-13 14:43 dumpling.sql
drwxr-xr-x 2 root root 4096 2012-05-09 17:42 bin
drwxr-xr-x 4 root root 4096 2012-02-14 15:21 boot
drwxr-xr-x 2 root root 4096 2012-03-09 14:10 build
drwxr-xr-x 2 root root 4096 2010-05-11 19:58 cdrom
-rw------- 1 root root 2174976 2012-05-23 11:23 core
drwxr-xr-x 15 root root 4080 2012-06-11 12:55 dev
drwxr-xr-x 135 root root 12288 2012-06-14 21:15 etc
drwxr-xr-x 6 root root 77 2012-05-21 14:41 home
...
A horrible, horrible workaround is wrapping the fabric run command to add "> /tmp/logfile.log" to each command; then, when the task is finished, I'd retrieve the file with scp...
My question, in short, is: how do I get the full output of a fabric task when it's triggered with celery?
The following did the trick:
@ctask(name="mock_deploy")
def mock_deploy():
    env.roledefs.update({'remote': ['root@1.1.1.1', ]})
    output = StringIO()
    sys.stdout = output
    execute(mock_deploy2)
    sys.stdout = sys.__stdout__
    return output.getvalue()
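A slightly more defensive variant (my own sketch, not part of the original answer) restores stdout in a finally block, so a failing fabric task doesn't leave the Celery worker's stdout redirected:

@ctask(name="mock_deploy")
def mock_deploy():
    env.roledefs.update({'remote': ['root@1.1.1.1']})
    output = StringIO()
    old_stdout = sys.stdout
    sys.stdout = output              # capture everything fabric prints
    try:
        execute(mock_deploy2)
    finally:
        sys.stdout = old_stdout      # always restore, even if the task raises
    return output.getvalue()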