211 lines
7.3 KiB
Python
211 lines
7.3 KiB
Python
import collections
|
|
import logging
|
|
import shelve
|
|
import threading
|
|
import time
|
|
from collections import Counter
|
|
from functools import partial
|
|
|
|
from celery.events import EventReceiver
|
|
from celery.events.state import State
|
|
from prometheus_client import Counter as PrometheusCounter
|
|
from prometheus_client import Gauge, Histogram
|
|
from tornado.ioloop import PeriodicCallback
|
|
from tornado.options import options
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
# Cached PrometheusMetrics instance; stays None until get_prometheus_metrics()
# creates it on first call.
PROMETHEUS_METRICS = None
|
|
|
|
|
|
def get_prometheus_metrics():
    """Return the module-wide PrometheusMetrics instance.

    The instance is built on the first call and stored in the
    PROMETHEUS_METRICS global; every later call returns that same object.
    """
    global PROMETHEUS_METRICS  # pylint: disable=global-statement
    if PROMETHEUS_METRICS is not None:
        return PROMETHEUS_METRICS
    PROMETHEUS_METRICS = PrometheusMetrics()
    return PROMETHEUS_METRICS
|
|
|
|
|
|
class PrometheusMetrics:
    """Container for the Prometheus metric objects updated from Celery events."""

    def __init__(self):
        """Create every metric; histogram buckets are read from tornado options."""
        # Event count, labelled by worker, event type and task name.
        self.events = PrometheusCounter(
            'flower_events_total',
            "Number of events",
            labelnames=['worker', 'type', 'task'],
        )

        # Task runtime distribution per (worker, task).
        self.runtime = Histogram(
            'flower_task_runtime_seconds',
            "Task runtime",
            labelnames=['worker', 'task'],
            buckets=options.task_runtime_metric_buckets,
        )

        # Prefetch-related gauges per (worker, task).
        self.prefetch_time = Gauge(
            'flower_task_prefetch_time_seconds',
            "The time the task spent waiting at the celery worker to be executed.",
            labelnames=['worker', 'task'],
        )
        self.number_of_prefetched_tasks = Gauge(
            'flower_worker_prefetched_tasks',
            'Number of tasks of given type prefetched at a worker',
            labelnames=['worker', 'task'],
        )

        # Per-worker gauges.
        self.worker_online = Gauge(
            'flower_worker_online',
            "Worker online status",
            labelnames=['worker'],
        )
        self.worker_number_of_currently_executing_tasks = Gauge(
            'flower_worker_number_of_currently_executing_tasks',
            "Number of tasks currently executing at a worker",
            labelnames=['worker'],
        )
|
|
|
|
|
|
class EventsState(State):
    """Celery ``State`` that also counts events and updates Prometheus metrics.

    Every event passed to :meth:`event` is first stored by the parent class,
    then tallied in ``self.counter`` and reflected in the metric objects
    returned by ``get_prometheus_metrics()``.
    """
    # EventsState object is created and accessed only from ioloop thread

    def __init__(self, *args, **kwargs):
        """Initialise the parent state, the event counter and the metrics handle."""
        super().__init__(*args, **kwargs)
        # counter[worker_name][event_type] -> number of events seen
        self.counter = collections.defaultdict(Counter)
        self.metrics = get_prometheus_metrics()

    def event(self, event):
        """Store ``event`` and update counters and metrics from it.

        ``event`` is a mapping that must contain ``'hostname'`` and ``'type'``.
        Task events (type starting with ``'task-'``) must also contain
        ``'uuid'`` and may contain ``'name'`` and ``'runtime'``. Worker
        heartbeats may contain ``'active'``.
        """
        # Save the event
        super().event(event)

        worker_name = event['hostname']
        event_type = event['type']

        self.counter[worker_name][event_type] += 1

        if event_type.startswith('task-'):
            self._record_task_event(event, worker_name, event_type)
        else:
            self._record_worker_event(event, worker_name, event_type)

    def _record_task_event(self, event, worker_name, event_type):
        """Update the event counter, runtime and prefetch metrics for a task event."""
        task_id = event['uuid']
        task = self.tasks.get(task_id)

        # Prefer the name carried by the event; fall back to the stored task.
        task_name = event.get('name', '')
        if not task_name and task is not None:
            task_name = task.name or ''

        metrics = self.metrics
        metrics.events.labels(worker_name, event_type, task_name).inc()

        runtime = event.get('runtime', 0)
        if runtime:
            metrics.runtime.labels(worker_name, task_name).observe(runtime)

        # Prefetch metrics need the stored task. If it is missing there is
        # nothing to compute, so return instead of dereferencing None.
        # Tasks with an ETA, or without a receive timestamp, are skipped too.
        if task is None or task.eta or not task.received:
            return

        task_started = task.started
        task_received = task.received

        if event_type == 'task-received':
            metrics.number_of_prefetched_tasks.labels(worker_name, task_name).inc()
        elif event_type == 'task-started' and task_started:
            metrics.prefetch_time.labels(worker_name, task_name).set(task_started - task_received)
            metrics.number_of_prefetched_tasks.labels(worker_name, task_name).dec()
        elif event_type in ('task-succeeded', 'task-failed') and task_started:
            # Task finished: reset the prefetch time gauge.
            metrics.prefetch_time.labels(worker_name, task_name).set(0)

    def _record_worker_event(self, event, worker_name, event_type):
        """Update the online and executing-task gauges for a worker event."""
        metrics = self.metrics

        if event_type == 'worker-online':
            metrics.worker_online.labels(worker_name).set(1)
        elif event_type == 'worker-heartbeat':
            metrics.worker_online.labels(worker_name).set(1)

            num_executing_tasks = event.get('active')
            if num_executing_tasks is not None:
                metrics.worker_number_of_currently_executing_tasks.labels(worker_name).set(num_executing_tasks)
        elif event_type == 'worker-offline':
            metrics.worker_online.labels(worker_name).set(0)
|
|
|
|
|
|
class Events(threading.Thread):
    """Daemon thread that captures Celery events and feeds them to an EventsState.

    Events are received on this thread and handed to the state object through
    ``io_loop.add_callback``. Optionally the state is loaded from, and saved
    to, a ``shelve`` database.
    """

    # Interval passed to PeriodicCallback for re-enabling worker events
    # (tornado's PeriodicCallback takes milliseconds).
    events_enable_interval = 5000

    # pylint: disable=too-many-arguments
    def __init__(self, capp, io_loop, db=None, persistent=False,
                 enable_events=True, state_save_interval=0,
                 **kwargs):
        """Set up timers and the events state.

        Args:
            capp: Celery application used for connections and control commands.
            io_loop: tornado IOLoop on which state updates are scheduled.
            db: path of the shelve database used when ``persistent`` is true.
            persistent: load state from ``db`` now and save it on ``stop()``.
            enable_events: periodically send ``enable_events`` to workers.
            state_save_interval: if non-zero (and ``persistent``), interval
                passed to PeriodicCallback for periodic state saves.
            **kwargs: forwarded to ``EventsState`` when no state was loaded.
        """
        threading.Thread.__init__(self)
        self.daemon = True

        self.io_loop = io_loop
        self.capp = capp

        self.db = db
        self.persistent = persistent
        self.enable_events = enable_events
        self.state = None
        self.state_save_timer = None

        if self.persistent:
            logger.debug("Loading state from '%s'...", self.db)
            # NOTE: shelve unpickles stored data; only open trusted db files.
            # The with-block closes the shelf even if reading 'events' raises.
            with shelve.open(self.db) as state:
                if state:
                    self.state = state['events']

            if state_save_interval:
                self.state_save_timer = PeriodicCallback(self.save_state,
                                                         state_save_interval)

        if not self.state:
            self.state = EventsState(**kwargs)

        self.timer = PeriodicCallback(self.on_enable_events,
                                      self.events_enable_interval)

    def start(self):
        """Start the capture thread and whichever periodic timers are configured."""
        threading.Thread.start(self)
        if self.enable_events:
            logger.debug("Starting enable events timer...")
            self.timer.start()

        if self.state_save_timer:
            logger.debug("Starting state save timer...")
            self.state_save_timer.start()

    def stop(self):
        """Stop the periodic timers and, when persistent, save the state once more."""
        if self.enable_events:
            logger.debug("Stopping enable events timer...")
            self.timer.stop()

        if self.state_save_timer:
            logger.debug("Stopping state save timer...")
            self.state_save_timer.stop()

        if self.persistent:
            self.save_state()

    def run(self):
        """Capture events forever, reconnecting with a doubling delay on failure."""
        try_interval = 1
        while True:
            try:
                # Doubled before each attempt; reset to 1 once a receiver
                # has been created on an open connection.
                try_interval *= 2

                with self.capp.connection() as conn:
                    recv = EventReceiver(conn,
                                         handlers={"*": self.on_event},
                                         app=self.capp)
                    try_interval = 1
                    logger.debug("Capturing events...")
                    recv.capture(limit=None, timeout=None, wakeup=True)
            except (KeyboardInterrupt, SystemExit):
                # Forward the interrupt to the main thread.
                import _thread
                _thread.interrupt_main()
            except Exception as e:  # pylint: disable=broad-except
                logger.error("Failed to capture events: '%s', "
                             "trying again in %s seconds.",
                             e, try_interval)
                logger.debug(e, exc_info=True)
                time.sleep(try_interval)

    def save_state(self):
        """Write the current state to a freshly created shelve database at ``self.db``."""
        logger.debug("Saving state to '%s'...", self.db)
        # flag='n' always creates a new, empty database; the with-block
        # closes the shelf even if pickling the state raises.
        with shelve.open(self.db, flag='n') as state:
            state['events'] = self.state

    def on_enable_events(self):
        """Send ``enable_events`` to workers from the loop's default executor."""
        # Periodically enable events for workers
        # launched after flower
        self.io_loop.run_in_executor(None, self.capp.control.enable_events)

    def on_event(self, event):
        """Schedule ``self.state.event(event)`` on the ioloop."""
        # Call EventsState.event in ioloop thread to avoid synchronization
        self.io_loop.add_callback(partial(self.state.event, event))
|