# ==== file: zmon_worker_monitor/__main__.py ====

import zmon_worker_monitor.web

if __name__ == '__main__':
    zmon_worker_monitor.web.main()


# ==== file: zmon_worker_monitor/worker.py ====

# -*- coding: utf-8 -*-
"""
Execution script
"""

import sys
import os
import json
import settings
import logging


def _set_logging(log_conf):
    # substitute {pid} in log handlers
    for handler_name, handler_dict in log_conf['handlers'].items():
        if '{pid}' in handler_dict.get('filename', ''):
            log_conf['handlers'][handler_name]['filename'] = \
                log_conf['handlers'][handler_name]['filename'].format(pid=os.getpid())
    import logging.config
    logging.config.dictConfig(log_conf)


def start_worker(**kwargs):
    """
    A simple wrapper around workflow.start_worker_for_queue(), needed to solve the logger
    import problem with multiprocessing
    :param kwargs: keyword arguments forwarded to workflow.start_worker_for_queue()
    :return:
    """
    _set_logging(settings.LOGGING)
    import workflow
    workflow.start_worker_for_queue(**kwargs)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print >> sys.stderr, "\nWrong command line parameters:\n " \
                             "usage: {0} ".format(sys.argv[0])
        sys.exit(1)
    kwargs = json.loads(sys.argv[1])
    print "unmanaged worker started with kwargs: {}".format(kwargs)
    start_worker(**kwargs)


# ==== file: zmon_worker_monitor/redis_context_manager.py ====

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from emu_kombu import parse_redis_conn
import redis
import os
import logging
import time
from threading import current_thread
from threading import local as thread_local
import collections
import math
from traceback import format_exception

logger = logging.getLogger(__name__)

WAIT_RECONNECT_MIN = 0.1
WAIT_RECONNECT_MAX = 20


class _ThreadLocal(thread_local):
    can_init = False
    instance = None


class RedisConnHandler(object):
    """
    This is a connection manager for redis implemented as a context handler.
    When used inside a with statement it intercepts redis connection exceptions as well as its own
    IdleLoopException in order to keep score of failures and idle cycles.
    Based on these counters it reacts to connection errors by introducing small exponential time
    delays and making several attempts to regain the connection; if t_wait_per_server seconds pass
    without success it switches to the next redis server from the list it was given when configured.
    It also switches to the next server after t_wait_no_tasks seconds without getting any task.
    You also get thread safety (connections are not shared among threads), and an easy way to get
    the connection without passing the reference around.
    """

    # Constants
    __CONS_SUPPRESS_EXCEPTION = True
    __CONS_PROPAGATE_EXCEPTION = False

    STATUS_ERROR = 'STATUS_ERROR'
    STATUS_IDLE = 'STATUS_IDLE'
    STATUS_OK = 'STATUS_OK'

    # class variables
    servers = []
    t_wait0 = WAIT_RECONNECT_MIN
    reties_per_server = 5
    t_wait_per_server = 30      # if 30 seconds pass and we have connection errors we switch server
    t_wait_no_tasks = 5 * 60    # if 5 minutes pass without getting any message we switch server
    #t_reconnect_master = 10 * 60  # in 10 minutes it will attempt to connect to server 0 again

    _pid = None
    _max_wait_step = 15         # a top value for our exponential increase in waiting time

    _thread_local = _ThreadLocal()

    # Counters and date markers for connection errors
    _active_index = 0
    _retries_count = -1
    _idle_count = -1
    message_count = 0
    _last_failure_tstamp = 0
    _last_success_tstamp = time.time()
    _last_message_tstamp = time.time()

    class IdleLoopException(Exception):
        pass

    @classmethod
    def configure(cls, **config):
        cls._pid = os.getpid()
        cls.t_wait0 = float(config.get('t_wait0', cls.t_wait0))
        cls.t_wait_no_tasks = float(config.get('t_wait_no_tasks', cls.t_wait_no_tasks))
        cls.t_wait_per_server = float(config.get('t_wait_per_server', cls.t_wait_per_server))

        # estimate the reties_per_server from the wait_time_per_server
        cls.reties_per_server = cls.calculate_reties_per_server(cls.t_wait_per_server, cls.t_wait0)
        #cls.t_reconnect_master = int(config.get('t_reconnect_master', cls.t_reconnect_master))

        servers = config.get('redis_servers')
        if servers:
            if isinstance(servers, basestring):
                servers = [s.strip() for s in servers.split(',')]
            elif not isinstance(servers, collections.Iterable):
                raise Exception("wrong servers parameter")
            cls.servers = list(servers)
            # parse all server urls to detect config errors beforehand
            [parse_redis_conn(s) for s in cls.servers]

        logger.warn('Pid=%s ==> RedisConnHandler configured with reties_per_server(estimated)=%s, '
                    't_wait_per_server=%s, t_wait_no_tasks=%s, servers=%s', cls._pid, cls.reties_per_server,
                    cls.t_wait_per_server, cls.t_wait_no_tasks, cls.servers)

    def __init__(self):
        # we could use a metaclass or some trick on __new__ for enforcing the use of get_instance()
        if not self._thread_local.can_init:
            raise AssertionError('You must use get_instance() to get an instance')
        assert len(self.servers) > 0, 'Fatal Error: No servers have been configured'
        self._pid = os.getpid() if not self._pid else self._pid
        self._conn = None
        self._parsed_redis = parse_redis_conn(self.servers[self._active_index])

    @classmethod
    def get_instance(cls):
        if cls._thread_local.instance is None:
            cls._thread_local.can_init = True
            cls._thread_local.instance = cls()
            cls._thread_local.can_init = False
        return cls._thread_local.instance

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None:
            if issubclass(exc_type, redis.ConnectionError):
                self.mark(self.STATUS_ERROR)
                logger.error('Pid=%s, thread_id=%s ==> Lost connection to redis server: %s. Waiting %s seconds. '
                             'Exception detail follows:\n%s', self._pid, current_thread().name,
                             self.get_active_server(), self.get_wait_time(),
                             ''.join(format_exception(exc_type, exc_val, exc_tb)))
                self.wait_on_error()
                return self.__CONS_PROPAGATE_EXCEPTION
            if issubclass(exc_type, self.IdleLoopException):
                self.mark(self.STATUS_IDLE)
                logger.info("IdleLoop: %s... pid=%s, count: %s", exc_val, self._pid, self.get_message_count())
                return self.__CONS_SUPPRESS_EXCEPTION
        self.mark(self.STATUS_OK)
        return self.__CONS_PROPAGATE_EXCEPTION

    @staticmethod
    def calculate_wait_time_per_server(reties_per_server, t_wait0):
        return t_wait0 * (2 ** (reties_per_server + 1) - 1)

    @staticmethod
    def calculate_reties_per_server(wait_time_per_server, t_wait0):
        return int(round(math.log(wait_time_per_server * 1.0 / t_wait0 + 1, 2) - 1))

    def get_active_server(self):
        if self.should_switch_server():  # or self.should_reconnect_master():
            self.switch_active_server()  # (force_master=self.should_reconnect_master())
        return self.servers[self._active_index]

    def get_parsed_redis(self):
        return self._parsed_redis

    def should_switch_server(self):
        cur_time = time.time()
        return (self.is_previous_error() and cur_time - self._last_success_tstamp > self.t_wait_per_server) or \
               (self.is_previous_idle() and cur_time - self._last_message_tstamp > self.t_wait_no_tasks)

    def is_previous_ok(self):
        return self._retries_count == -1

    def is_previous_error(self):
        return self._retries_count > -1

    def is_previous_idle(self):
        return self._idle_count > -1

    def switch_active_server(self, force_master=False):
        self._active_index = (0 if force_master or self._active_index >= len(self.servers) - 1
                              else self._active_index + 1)
        self._parsed_redis = parse_redis_conn(self.servers[self._active_index])
        self.mark(self.STATUS_OK)  # mark a fresh status OK for the new server
        logger.warn('Pid=%s, thread_id=%s ==> Redis Active server switched to %s, force_master=%s',
                    self._pid, current_thread().name, self.servers[self._active_index], force_master)

    def get_wait_time(self):
        return min(self.t_wait0 * (2 ** self._retries_count)
                   if self._retries_count >= 0 and not self.should_switch_server() else 0,
                   self._max_wait_step)

    def get_message_count(self):
        return self.message_count

    def wait_on_error(self):
        time.sleep(self.get_wait_time())

    def mark(self, status):
        if status == self.STATUS_ERROR:
            self._retries_count += 1
            self._last_failure_tstamp = time.time()
            self._idle_count = -1
            self._conn = None  # force the recreation of the thread local connection in any case
        elif status == self.STATUS_IDLE:
            self._idle_count += 1
            self._retries_count = -1  # an idle loop is still a success, so clear previous errors
            self._last_success_tstamp = time.time()
        elif status == self.STATUS_OK:
            self._retries_count = -1
            self._idle_count = -1
            self._last_success_tstamp = time.time()
            self.message_count += 1
            self._last_message_tstamp = time.time()
        else:
            raise Exception('Non valid status: {}'.format(status))

    def get_healthy_conn(self):
        return self.get_conn()

    def get_conn(self):
        if self._conn is not None and not self.should_switch_server():
            return self._conn
        else:
            self._conn = None
            active_server = self.get_active_server()
            c = parse_redis_conn(active_server)
            logger.warn('Pid=%s, thread_id=%s ==> Opening new redis connection to host=%s, port=%s, db=%s',
                        self._pid, current_thread().name, c.hostname, c.port, c.virtual_host)
            self._conn = redis.StrictRedis(host=c.hostname, port=c.port, db=c.virtual_host)
        return self._conn
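# ---- usage sketch (not part of the original file) ----
# A minimal example of how RedisConnHandler is meant to be driven; it mirrors the consumer loop in
# workflow.py further down in this archive. The server list, timeouts and queue name are
# illustrative values only.

from zmon_worker_monitor.redis_context_manager import RedisConnHandler

RedisConnHandler.configure(redis_servers='redis://localhost:6379/0,redis://backup:6379/0',
                           t_wait_per_server=30, t_wait_no_tasks=300)
handler = RedisConnHandler.get_instance()   # thread-local singleton

while True:
    with handler as ch:
        r_conn = ch.get_healthy_conn()
        task = r_conn.blpop('zmon:queue:default', 5)
        if task is None:
            # counted as an idle cycle and suppressed by __exit__
            raise ch.IdleLoopException('No task received')
        # ... process the task; a redis.ConnectionError raised here triggers backoff / server switch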
# ==== file: zmon_worker_monitor/rpc_utils.py ====

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Server module for exposing an rpc interface for clients to remotely control a local ProcessManager
"""

import inspect
import json
import xmlrpclib
from SimpleXMLRPCServer import SimpleXMLRPCServer
from SimpleXMLRPCServer import SimpleXMLRPCRequestHandler
import logging

logger = logging.getLogger(__name__)


class RpcProxy(object):
    """
    This is a base class to subclass in order to expose an instance object through remote RPC.
    It serves as a container for some idiosyncrasies of Python XML-RPC, like the private methods
    _listMethods, _methodHelp and _dispatch, whose purpose isn't obvious at first.
    Here we try to take advantage of these idiosyncrasies.
    """

    exposed_obj_class = object  # override with the class of the object to expose
    valid_methods = []          # override with the list of methods you want to call from RPC

    def __init__(self, exposed_obj):
        assert type(exposed_obj) is self.exposed_obj_class, \
            "Error in RpcProxy: exposed_obj is not of the declared class"
        self.exposed_obj = exposed_obj

    def _listMethods(self):
        # this method must be present for system.listMethods to work
        return self.valid_methods

    def _methodHelp(self, method):
        # Override this method for system.methodHelp to work
        if method == 'example_method':
            return "example_method(2,3) => 5"
        else:
            # By convention, return empty string if no help is available
            return ""

    def get_exposed_obj(self):
        # Never add this method to valid_methods
        return self.exposed_obj

    def on_exit(self):
        # Override this to provide logic to be executed when the server is finishing
        pass

    def signal_termination(self, terminate):
        self._signal_terminate_and_exit = bool(terminate)

    def _dispatch(self, method, params):
        # This method is automatically called by Python's SimpleXMLRPCServer for every incoming rpc call
        if method in self.valid_methods:
            obj = self if hasattr(self, method) else self.exposed_obj
            try:
                kw = {}
                m = getattr(obj, method)
                if len(params) and str(params[-1]).startswith('js:'):
                    # let's try to interpret the last argument as keyword args in json format
                    _kw = json.loads(str(params[-1])[len('js:'):])
                    aspec = inspect.getargspec(m)
                    if isinstance(_kw, dict) and _kw and all(k in aspec.args for k in _kw):
                        params = params[:-1]
                        kw = _kw
                return getattr(obj, method)(*params, **kw)
            except Exception:
                logger.exception("Exception encountered in rpc_server while attempting to call: %s with params: %s ",
                                 method, params)
                raise
        else:
            raise Exception('method "%s" is not supported' % method)


def get_rpc_client(endpoint):
    """
    Get an rpc client object to remote server listening at endpoint
    :param endpoint: http://host:port/rpc_path
    :return: rpc_client object
    """
    return xmlrpclib.ServerProxy(endpoint)


# TODO: move to a method in RpcProxy
def start_RPC_server(host, port, rpc_path, rpc_proxy):
    """
    Starts the RPC server and exposes some methods of rpc_proxy
    :param host:
    :param port:
    :param rpc_proxy:
    :return:
    """
    # Restrict to a particular path.
    class RequestHandler(SimpleXMLRPCRequestHandler):
        #rpc_paths = ('/RPC2',)
        rpc_paths = ('/' + rpc_path.lstrip('/'), )

    # Create server
    server = SimpleXMLRPCServer((host, port), requestHandler=RequestHandler, allow_none=True)
    server.register_introspection_functions()
    server.register_instance(rpc_proxy)

    try:
        # Run the server's main loop
        server.serve_forever()
    except (KeyboardInterrupt, SystemExit):
        logger.info("Server interrupted: Exiting!")
        rpc_proxy.on_exit()
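# ---- usage sketch (not part of the original file) ----
# The intended pattern is to subclass RpcProxy for the object you want to expose; rpc_server.py
# further down does exactly this for ProcessController. The Counter class and the endpoint used
# here are made up for illustration.

from zmon_worker_monitor.rpc_utils import RpcProxy, start_RPC_server, get_rpc_client


class Counter(object):
    """Hypothetical object to expose over XML-RPC."""
    def __init__(self):
        self.value = 0

    def increment(self, n=1):
        self.value += n
        return self.value


class CounterProxy(RpcProxy):
    exposed_obj_class = Counter       # RpcProxy.__init__ enforces this type
    valid_methods = ['increment']     # only these names are reachable through _dispatch


# server side (blocks, serving requests):
#   start_RPC_server('localhost', 8500, '/zmon_rpc', CounterProxy(Counter()))
# client side:
#   client = get_rpc_client('http://localhost:8500/zmon_rpc')
#   client.increment(2)               # positional args
#   client.increment('js:{"n": 5}')   # keyword args via the 'js:' json convention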
# ==== file: zmon_worker_monitor/settings_pro.py ====

# -*- coding: utf-8 -*-
"""
Project settings for development:
To customize the settings for a local environment please create another module called
settings_local.py and change there the values you want; they will override the ones in this file.
"""

import os

app_home = os.path.abspath(os.environ['APP_HOME'] if 'APP_HOME' in os.environ else './')
data_dir = os.path.abspath(os.path.join(app_home, 'zmon_worker_data'))

# application data folder needs to be created by the application itself
for d in (data_dir, ):
    if not os.path.isdir(d):
        os.mkdir(d)

LOGGING = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'verbose': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(process)d - %(thread)d - %(message)s'
        },
        'custom': {
            'format': '%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s'
        },
    },
    'handlers': {
        'console': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
            'formatter': 'custom'
        },
    },
    'loggers': {
        '': {
            'handlers': ['console'],
            'propagate': True,
            'level': 'DEBUG',
        },
    }
}

RPC_SERVER_CONF = dict(
    HOST='localhost',
    PORT=8500,
    RPC_PATH='/zmon_rpc',
    LOGGING={
        'version': 1,
        'disable_existing_loggers': True,
        'formatters': {
            'verbose': {
                'format': '%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(process)d - %(thread)d - %(message)s'
            },
            'custom': {
                'format': '%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s'
            },
        },
        'handlers': {
            'console': {
                'level': 'INFO',
                'class': 'logging.StreamHandler',
                'formatter': 'custom'
            },
        },
        'loggers': {
            '': {
                'handlers': ['console'],
                'propagate': True,
                'level': 'DEBUG',
            },
        }
    }
)


# ==== file: zmon_worker_monitor/__init__.py ====

__version__ = "0.1"


# ==== file: zmon_worker_monitor/settings.py ====

# -*- coding: utf-8 -*-
"""
Project settings:
To customize the settings for a local environment please create another module called
settings_local.py and change there the values you want. You don't have to duplicate all the logic
of the default settings module inside settings_local.py; instead you import it and modify only the
values you need.

For example, to modify only the log level:

##### content of file: settings_local.py
from settings_pro import *

LOGGING['loggers']['']['level'] = 'INFO'
RPC_SERVER_CONF['LOGGING']['loggers']['']['level'] = 'INFO'
#### END of settings_local.py
"""

EXTERNAL_CONFIG = {}

try:
    from settings_local import *
except ImportError:
    # no settings_local.py found, falling back to settings_pro.py
    from settings_pro import *


def set_workers_log_level(level):
    if level:
        LOGGING['loggers']['']['level'] = level


def set_rpc_server_port(port):
    if port and str(port).isdigit():
        RPC_SERVER_CONF['PORT'] = int(port)


def set_external_config(external_config):
    global EXTERNAL_CONFIG
    EXTERNAL_CONFIG = dict(external_config)


def get_external_config():
    return EXTERNAL_CONFIG
# ==== file: zmon_worker_monitor/web.py ====

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import cherrypy
import argparse
import settings
import logging

if __name__ == '__main__':
    import logging.config
    logging.config.dictConfig(settings.RPC_SERVER_CONF['LOGGING'])

logger = logging.getLogger(__name__)

# env vars get dropped via zompy startup
os.environ["ORACLE_HOME"] = "/opt/oracle/instantclient_12_1/"
os.environ["LD_LIBRARY_PATH"] = os.environ.get("LD_LIBRARY_PATH", '') + ":/opt/oracle/instantclient_12_1/"

import rpc_server

DEFAULT_NUM_PROC = 16


def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config-file", help="path to config file")
    parser.add_argument('--no-rpc', action='store_true', help='Do not start XML-RPC server')
    return parser.parse_args(args)


def main(args=None):
    # add src dir to sys.path
    # src_dir = os.path.abspath(os.path.dirname(__file__))
    # if src_dir not in sys.path:
    #     sys.path.append(src_dir)

    args = parse_args(args)

    main_proc = rpc_server.MainProcess()

    # load cherrypy configuration
    if args.config_file and os.path.exists(args.config_file):
        cherrypy.config.update(args.config_file)
    elif os.path.exists('/app/web.conf'):
        cherrypy.config.update('/app/web.conf')
    else:
        cherrypy.config.update('web.conf')

    for key in cherrypy.config.keys():
        env_key = key.upper().replace('.', '_')
        if env_key in os.environ:
            cherrypy.config[key] = os.environ[env_key]

    # save cherrypy config in our settings module
    settings.set_workers_log_level(cherrypy.config.get('loglevel', 'INFO'))
    settings.set_external_config(cherrypy.config)
    settings.set_rpc_server_port('2{}'.format('3500'))

    # start the process controller
    main_proc.start_proc_control()

    # start some processes per queue according to the config
    queues = cherrypy.config['zmon.queues']['local']
    for qn in queues.split(','):
        queue, N = (qn.rsplit('/', 1) + [DEFAULT_NUM_PROC])[:2]
        main_proc.proc_control.spawn_many(int(N), kwargs={"queue": queue, "flow": "simple_queue_processor"})

    if not args.no_rpc:
        main_proc.start_rpc_server()

    return main_proc


if __name__ == '__main__':
    main()
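# ---- usage sketch (not part of the original file) ----
# The 'zmon.queues'/'local' value read in main() above is a comma-separated list of
# queue[/num_workers] entries; when the /num_workers suffix is missing, DEFAULT_NUM_PROC is used.
# A small standalone illustration of that parsing rule (queue names are examples):

def _parse_queue_conf(queues, default_num_proc=16):
    # 'zmon:queue:default/8,zmon:queue:snmp' -> [('zmon:queue:default', 8), ('zmon:queue:snmp', 16)]
    parsed = []
    for qn in queues.split(','):
        queue, n = (qn.rsplit('/', 1) + [default_num_proc])[:2]
        parsed.append((queue, int(n)))
    return parsed

assert _parse_queue_conf('zmon:queue:default/8,zmon:queue:snmp') == \
    [('zmon:queue:default', 8), ('zmon:queue:snmp', 16)]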
# ==== file: zmon_worker_monitor/plugin_manager.py ====

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Our thin layer on top of yapsy to load external plugin code.
Folders to be explored are taken from the environment variable ZMON_PLUGINS.

TODO: Add examples and point to the tests
"""

import logging
import os
import sys

from yapsy.PluginManager import PluginManagerSingleton
import pkg_resources

from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin

logger = logging.getLogger(__name__)

# Some constants to define global behaviour
PLUGIN_INFO_EXT = 'worker_plugin'
PLUGIN_BUILTINS = ('zmon_worker_monitor.builtins.plugins',)
PLUGIN_ENV_VAR = 'ZMON_PLUGINS'
PLUGIN_CATEGORIES_FILTER = {
    'Function': IFunctionFactoryPlugin,
}
GLOBAL_CONFIG_PREFIX = 'plugin.{plugin_name}.'


class PluginError(Exception):
    pass


class PluginRecoverableError(PluginError):
    pass


class PluginFatalError(PluginError):
    pass


def _builtins_paths(subpackages, raise_errors=True):
    if not subpackages:
        return []
    folders = []
    for subpkg in (subpackages if isinstance(subpackages, (tuple, list)) else [subpackages]):
        try:
            parts = subpkg.split('.')
            path = pkg_resources.resource_filename('.'.join(parts[:-1]), parts[-1])
            if not os.path.isdir(path):
                raise Exception('path is not a directory: {}'.format(path))
        except Exception:
            logger.exception('erroneous plugins package: %s. Exception: ', subpkg)
            if raise_errors:
                _, ev, tb = sys.exc_info()
                raise PluginFatalError('Builtins plugins error in {}. Reason: {}'.format(subpkg, ev)), None, tb
        else:
            folders.append(path)
    return folders


def _env_dirs(env_var, raise_errors=True):
    env_value = os.environ.get(env_var)
    if not env_value:
        return []
    folders = []
    for d in env_value.split(os.pathsep):
        if not os.path.isdir(d):
            logger.warn('Wrong path %s in env variable %s', d, env_var)
            if raise_errors:
                raise PluginFatalError('Env plugins error in path: {}, from env_var: {}'.format(d, env_var))
            continue
        folders.append(d)
    return folders


def _filter_additional_dirs(path_list, raise_errors=True):
    if not path_list:
        return []
    folders = []
    for path in path_list:
        if os.path.isdir(path):
            folders.append(path)
        elif raise_errors:
            raise PluginFatalError('Additional dirs contains erroneous path: {}'.format(path))
    return folders


_initialized = {}


def init_plugin_manager(category_filter=None, info_ext=PLUGIN_INFO_EXT, builtins_pkg=PLUGIN_BUILTINS,
                        env_var=PLUGIN_ENV_VAR):
    """
    Initialize the plugin manager and set some behaviour options

    :param category_filter: dict mapping category names to plugin base classes
    :param info_ext: file extension of the plugin info files
    :param builtins_pkg: package(s) containing the builtin plugins
    :param env_var: environment variable listing additional plugin folders
    :return:
    """
    global _initialized

    # default category_filter is PLUGIN_CATEGORIES_FILTER (dict)
    category_filter = PLUGIN_CATEGORIES_FILTER if category_filter is None else category_filter

    logger.info('init plugin manager')
    manager = PluginManagerSingleton.get()
    manager.setCategoriesFilter(category_filter)
    manager.setPluginInfoExtension(info_ext)

    # save parameters used to initialize the module
    _initialized = dict(category_filter=category_filter, info_ext=info_ext, builtins_pkg=builtins_pkg,
                        env_var=env_var)


def get_plugin_manager():
    """ Get the plugin manager object (singleton) """
    return PluginManagerSingleton.get()


_collected = False


def collect_plugins(load_builtins=True, load_env=True, additional_dirs=None, global_config=None, raise_errors=True):
    """
    Collect plugins from the builtin package, the folders in the environment variable and the
    additional_dirs param.

    :param load_builtins: whether to load the builtin plugins
    :param load_env: whether to load plugins from the folders in the environment variable (shell $PATH style)
    :param additional_dirs: additional locations to search plugins in
    :param global_config: dict whose keys with prefix plugin.{plugin_name}. configure the plugins
    :param raise_errors: raise PluginFatalError on problems instead of only logging them
    :return:
    """
    global _collected

    if not _initialized:
        raise PluginFatalError('You must invoke init_plugin_manager() before collect_plugins()!')
    if _collected:
        raise PluginFatalError('Plugins should be collected only once!')

    try:
        # load the plugins
        global_config = {} if global_config is None else global_config

        builtins = _builtins_paths(_initialized.get('builtins_pkg')) if load_builtins else []
        paths_env = _env_dirs(_initialized.get('env_var')) if load_env else []
        path_list = paths_env + _filter_additional_dirs(additional_dirs)

        # not necessary and may cause module name clashes... remove?
        # for entry in path_list:
        #     if entry not in sys.path:
        #         sys.path.append(entry)  # so the plugins can relatively import their submodules

        # check plugin dependencies declared in {plugin_dir}/requirements.txt are installed
        for path in path_list:
            miss_deps = _check_dependencies(path)
            if miss_deps:
                logger.error('Dependencies missing for plugin %s: %s', path, ','.join(miss_deps))
                if raise_errors:
                    raise PluginFatalError('Dependencies missing for plugin {}: {}'.format(path, ','.join(miss_deps)))

        manager = get_plugin_manager()
        manager.setPluginPlaces(builtins + path_list)

        # explore the provided locations and identify plugin candidates
        manager.locatePlugins()

        # save list of all plugin candidates: [(info file path, python file path, plugin info instance), ...]
        candidates = manager.getPluginCandidates()
        logger.debug('Recognized plugin candidates: %s', candidates)

        # trigger the loading of all plugin python modules
        manager.loadPlugins()

        all_plugins = manager.getAllPlugins()
        if len(all_plugins) != len(candidates):
            plugin_paths = map(_path_source_to_plugin, [p.path for p in all_plugins])
            dropped = [c for c in candidates if c[0] not in plugin_paths]
            logger.error('These plugin candidates have errors: %s', dropped)
            if raise_errors:
                raise PluginFatalError('Plugin candidates have errors: {}'.format(dropped))

        # configure and activate plugins
        for plugin in all_plugins:
            config_prefix = GLOBAL_CONFIG_PREFIX.format(plugin_name=plugin.name)
            conf_global = {}
            try:
                conf_global = {str(c)[len(config_prefix):]: v for c, v in global_config.iteritems()
                               if str(c).startswith(config_prefix)}
                logger.debug('Plugin %s received global conf keys: %s', plugin.name, conf_global.keys())
            except Exception:
                logger.exception('Failed to parse global configuration. Reason: ')
                if raise_errors:
                    raise

            conf = {}
            try:
                if plugin.details.has_section('Configuration'):
                    # plugin.details holds the config parser object of the plugin info file
                    conf = {c: v for c, v in plugin.details.items('Configuration')}
                logger.debug('Plugin %s received local conf keys: %s', plugin.name, conf.keys())
            except Exception:
                logger.exception('Failed to load local configuration from plugin: %s. Reason: ', plugin.name)
                if raise_errors:
                    raise

            # for security reasons our global config takes precedence over the local config
            conf.update(conf_global)

            try:
                plugin.plugin_object.configure(conf)
            except Exception:
                logger.exception('Failed configuration of plugin: %s. Reason: ', plugin.name)
                if raise_errors:
                    raise
                plugin.plugin_object.deactivate()
                continue

            plugin.plugin_object.activate()

        _collected = True

    except PluginFatalError:
        raise
    except Exception:
        logger.exception('Unexpected error during plugin collection: ')
        if raise_errors:
            _, ev, tb = sys.exc_info()
            raise PluginFatalError("Error while loading plugins. Reason: {}".format(ev)), None, tb


def get_plugins_of_category(category, active=True, raise_errors=True):
    """ Get plugins (plugin_info) of a given category """
    try:
        plugins = get_plugin_manager().getPluginsOfCategory(category)
    except KeyError:
        if raise_errors:
            raise PluginRecoverableError('Category {} not known to the plugin system'.format(category))
        return []
    if plugins and isinstance(active, bool):
        plugins = [p for p in plugins if p.is_activated == active]
    return plugins


def get_plugin_objs_of_category(category, active=True, raise_errors=True):
    """ Get plugin objects of a given category """
    return [p.plugin_object for p in get_plugins_of_category(category, active, raise_errors)]


def get_plugin_by_name(name, category, not_found_is_error=True):
    """ Get a plugin by name and category """
    plugin = get_plugin_manager().getPluginByName(name, category)
    if not plugin and not_found_is_error:
        raise PluginRecoverableError('Plugin by name {} not found under category {}'.format(name, category))
    return plugin


def get_plugin_obj_by_name(name, category, not_found_is_error=True):
    """ Get a plugin object by name and category """
    plugin = get_plugin_by_name(name, category, not_found_is_error)
    return None if plugin is None else plugin.plugin_object


def get_all_plugins():
    """ Get list of all loaded plugins """
    return get_plugin_manager().getAllPlugins()


def get_all_plugin_names():
    """ Get list of names of all loaded plugins """
    plugins = get_all_plugins()
    if not plugins:
        return []
    return [p.name for p in plugins]


def get_all_categories():
    """ Return list of categories provided in the category_filter """
    manager = get_plugin_manager()
    return manager.getCategories()


def get_loaded_plugins_categories():
    """ Return list of categories that have one or more discovered plugins """
    return list(set([p.category for p in get_all_plugins()]))


def _check_dependencies(path):
    req_path = path + os.sep + 'requirements.txt'
    if not os.path.isfile(req_path):
        logger.debug('%s has no requirements.txt file' % path)
        return None
    missing_pkg = []
    with open(req_path) as f:
        for line in f:
            stripped = line.strip()
            if stripped and not stripped.startswith('#'):
                try:
                    pkg_resources.get_distribution(stripped)
                except Exception as _:
                    missing_pkg.append(stripped)
    return missing_pkg


def _path_source_to_plugin(source_path):
    source_no_py = source_path[:-3] if source_path.lower().endswith('.py') else source_path
    plugin_path = source_no_py + '.' + _initialized['info_ext']
    return plugin_path
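# ---- usage sketch (not part of the original file) ----
# Expected call sequence for the plugin system; workflow.py further down in this archive uses it in
# the same order. The global config key and the plugin name 'http' are placeholders.

from zmon_worker_monitor import plugin_manager

plugin_manager.init_plugin_manager()                     # must be called once, before collecting
plugin_manager.collect_plugins(global_config={'plugin.http.timeout': 10},
                               load_builtins=True, load_env=True)

names = plugin_manager.get_all_plugin_names()            # e.g. ['http', 'ping', ...]
factory = plugin_manager.get_plugin_obj_by_name('http', 'Function')  # an IFunctionFactoryPlugin instance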
""" action_policies = ('report', 'dynamic_num', 'dynamic_throughput') proc_stat_element = {'begin_time': -1, 'end_time': -1, 'alive': False, 'rebel': False, 'pid': 0, 'exitcode': 0, 'mem': -1, 'abnormal_termination': False} def __init__(self, default_target=None, default_args=None, default_kwargs=None, always_add_kwargs=None, action_policy='report', min_processes=2, max_processes=1000, start_action_loop=True): self.default_target = default_target self.default_args = default_args if isinstance(default_args, (tuple, list)) else () self.default_kwargs = default_kwargs if isinstance(default_kwargs, dict) else {} self.always_add_kwargs = always_add_kwargs if isinstance(always_add_kwargs, dict) else {} self.proc_dict = {} # { proc_name : proc} self.proc_rebel = {} # { proc_name : proc} -> for processes that refuse to die self.proc_stats = {} # { proc_name : {'begin_time':T0, 'end_time': T1, 'alive' : bool1, ... } } self.proc_args = {} # { proc_name : {'target':None, 'args':[...], 'kwargs'={...} }} self.pid_to_pname = {} # {pid: proc_name} self.pids_for_termination = [] # [ pid1, pid2, ....] self.limbo_proc_dict = {} # {proc_name : proc} self.max_killed_stats = 5000 # max number of dead proc stats to keep around in memory self.min_processes = min_processes self.max_processes = max_processes self.count_stop_condition = 0 # counter of consecutive stop conditions found self.consecutive_stop_condition = 5 # counter of consecutive stop conditions found self.gracetime_stop_condition = 60 # number of seconds to wait before a final stop condition check self._thread_action_loop = None self.stop_action = True self.action_loop_interval = 2 # seconds between each actions pass self.set_action_policy(action_policy) self.set_dynamic_num_processes(5) # number of process to maintain alive when action_policy == 'dynamic_num' self._tstamp_clean_old_proc_stats = -1 # timestamp of the last execution of _clean_old_proc_stats() self._tdelta_clean_old_proc_stats = 300 # frequency of __clean_old_proc_stats() if start_action_loop: self.start_action_loop() def spawn_process(self, target=None, args=None, kwargs=None): args = args if isinstance(args, (tuple, list)) else () kwargs = kwargs if isinstance(kwargs, dict) else {} if self.max_processes == len(self.proc_dict): raise Exception("maximum number of processes reached!!!") target = target or self.default_target args = args or self.default_args kwargs = dict(kwargs if kwargs else self.default_kwargs) kwargs.update(self.always_add_kwargs) try: proc = Process(target=target, args=args, kwargs=kwargs) proc.start() pname = proc.name # creating entry in running process table self.proc_dict[pname] = proc # mapping pid -> pname self.pid_to_pname[proc.pid] = pname # store process arguments to relaunch it if it dies self.proc_args[pname] = dict(target=target, args=args, kwargs=kwargs) # creating entry in stats table self.proc_stats[pname] = dict(self.proc_stat_element) self.proc_stats[pname]['pid'] = proc.pid self.proc_stats[pname]['alive'] = proc.is_alive() self.proc_stats[pname]['begin_time'] = time.time() # use self._format_time() to get datetime format except Exception: logger.exception("Spawn of process failed. 
Caught exception with details: ") raise return pname def spawn_many(self, N, target=None, args=None, kwargs=None): logger.info('>>>>>>> spawn_many: %d, %s, %s', N, args, kwargs) args = args if isinstance(args, (tuple, list)) else () kwargs = kwargs if isinstance(kwargs, dict) else {} n_success = 0 for i in range(N): try: self.spawn_process(target=target, args=args, kwargs=kwargs) except Exception: logger.exception('Failed to start process. Reason: ') else: n_success += 1 return n_success == N def terminate_process(self, proc_name, kill_wait=0.5): proc = self.proc_dict.get(proc_name) if not proc: logger.warn('process: %s not found!!!!!!!!!!!!!!!!!', proc_name) return False if proc.is_alive(): logger.warn('terminating process: %s', proc_name) proc.terminate() time.sleep(kill_wait) if proc.is_alive(): logger.warn('Sending SIGKILL to process with pid=%s', proc.pid) os.kill(proc.pid, signal.SIGKILL) abnormal_termination = False else: logger.warn('process: %s is not alive!!!!!!!!!!!!!!!!!', proc_name) abnormal_termination = True # move proc to limbo and record end time in stats self.proc_dict.pop(proc_name, None) self.limbo_proc_dict[proc_name] = proc self._close_proc_stats(proc, abnormal_termination) return True def terminate_all_processes(self): self.stop_action_loop() # very important: stop action loop before starting to terminate child processes all_pnames = list(self.proc_dict.keys()) for proc_name in all_pnames: self.terminate_process(proc_name, kill_wait=0.1) logger.info("proc_stats after terminate_all_processes() : %s", self.list_stats()) return True def _close_proc_stats(self, proc, abnormal_termination=False): # Update proc_stats {'proc_name' : {'begin_time':T0, 'end_time': T1, 'alive' : bool1,... } } pn = proc.name if proc.is_alive(): self.proc_stats[pn]['alive'] = True self.proc_stats[pn]['rebel'] = True else: self.proc_stats[pn]['abnormal_termination'] = abnormal_termination self.proc_stats[pn]['end_time'] = time.time() self.proc_stats[pn]['alive'] = False self.proc_stats[pn]['exitcode'] = proc.exitcode def get_info(self, proc_name): """ Get all the info I can of this process, for example: 1. How long has it been running? *(Do I need an extra pid table for statistics?) 2. How much memory does it use? 
""" raise NotImplementedError('Method get_info not implemented yet') def list_running(self): return [(proc_name, proc.pid) for proc_name, proc in self.proc_dict.items()] def list_stats(self): proc_stats = copy.deepcopy(self.proc_stats) for proc_name, stats in proc_stats.items(): stats['begin_time'] = self._format_time(stats['begin_time']) stats['end_time'] = self._format_time(stats['end_time']) return proc_stats def _format_time(self, seconds): return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(seconds)) if seconds else '--' def get_action_policy(self): return self.action_policy def set_action_policy(self, action_policy): if action_policy not in self.action_policies: raise Exception('Invalid action policy, possible values are: %s' % ', '.join(self.action_policies)) self.action_policy = action_policy def available_action_policies(self): return self.action_policies def is_action_loop_running(self): return not self.stop_action def get_dynamic_num_processes(self): return self.dynamic_num_processes def set_dynamic_num_processes(self, dynamic_num_processes): try: assert type(dynamic_num_processes) is int and \ self.min_processes <= dynamic_num_processes <= self.max_processes except AssertionError: raise Exception('dynamic_num_processes passed is not in correct range') self.dynamic_num_processes = dynamic_num_processes def _clean_old_proc_stats(self): """ Remove old stats from dead processes to avoid high memory usage """ if time.time() - self._tstamp_clean_old_proc_stats > self._tdelta_clean_old_proc_stats: self._tstamp_clean_old_proc_stats = time.time() et_pn = sorted([(stats['end_time'], pn) for pn, stats in self.proc_stats.copy().items() if stats['end_time'] > 0]) del_et_pn = et_pn[:len(et_pn)-self.max_killed_stats] if len(et_pn) > self.max_killed_stats else [] for end_time, pn in del_et_pn: stats = self.proc_stats.pop(pn, None) logger.warn('Deleting stats of killed process %s to preserve memory: %s', pn, stats) def _clean_limbo_procs(self): limbo_dict = dict(self.limbo_proc_dict) for pname, proc in limbo_dict.items(): if proc.is_alive(): logger.error('Fatal: process in limbo in undead state!!!!!') else: self.pid_to_pname.pop(proc.pid, None) self.proc_args.pop(pname, None) self.limbo_proc_dict.pop(pname, None) def mark_for_termination(self, pid): """Given pid will be stored in local variable that marks them for termination in the next action pass""" self.pids_for_termination.append(pid) def _respawn(self, proc_name): # terminate process and spawn another process with same arguments pargs = self.proc_args.get(proc_name, {}) proc = self.proc_dict.get(proc_name) pid = proc.pid if proc else '???' was_alive = proc.is_alive() if proc else '???' self.terminate_process(proc_name, kill_wait=1.0) proc_name2 = self.spawn_process(**pargs) proc2 = self.proc_dict.get(proc_name2) pid2 = proc2.pid if proc2 else '???' 
# ==== file: zmon_worker_monitor/emu_kombu.py ====

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This module is intended as a quick and dirty drop-in replacement of the kombu module, covering only
the functionality we use in zmon. In essence we are trying to cover these use cases:

    from kombu.connection import Connection

    try:
        c = Connection(config['backend'])
    except KeyError:
        raise Exception('"backend" property missing in config object')

    r_conn = redis.StrictRedis(host=c.hostname, port=c.port, db=c.virtual_host)

And:

    from kombu import Queue

    app.conf.update(CELERY_QUEUES=(Queue('zmon:queue:default', routing_key='default'),
                                   Queue('zmon:queue:secure', routing_key='secure'),
                                   Queue('zmon:queue:snmp', routing_key='snmp')))
"""

import re
from collections import namedtuple


class Queue(object):
    '''
    Used to emulate kombu Queue
    '''
    def __init__(self, queue, routing_key=None):
        self.queue = queue
        self.routing_key = routing_key


def parse_redis_conn(conn_str):
    '''
    Emulates kombu.connection.Connection that we were using only to parse redis connection strings
    :param conn_str: example 'redis://localhost:6379/0'
    :return: namedtuple(hostname, port, virtual_host)
    '''
    Connection = namedtuple('Connection', 'hostname port virtual_host')
    conn_regex = r'redis://([-.a-zA-Z0-9_]+):([0-9]+)/([0-9]+)'
    m = re.match(conn_regex, conn_str)
    if not m:
        raise Exception('unable to parse redis connection string: {}'.format(conn_str))
    return Connection(m.group(1), int(m.group(2)), m.group(3))


# ==== file: zmon_worker_monitor/rpc_server.py ====

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Server module for exposing an rpc interface for clients to remotely control a local ProcessManager
"""

import os
import sys
import signal

import settings
import logging

if __name__ == '__main__':
    import logging.config
    logging.config.dictConfig(settings.RPC_SERVER_CONF['LOGGING'])

logger = logging.getLogger(__name__)

from pprint import pformat
from process_controller import ProcessController
import worker
import rpc_utils


def save_pid(abort_pidfile=False):
    pid = os.getpid()
    pid_file = os.path.join(settings.data_dir, 'rpc_server.pid')
    if abort_pidfile and
os.path.isfile(pid_file): print >>sys.stderr, 'pid file {} already exists. Is another process running? Aborting!'.format(pid_file) sys.exit(1) with open(pid_file, 'w') as f: f.write(str(pid)) def sigterm_handler(signum, frame): # this will propagate the SystemExit exception all around, so we can quit listening loops, cleanup and exit sys.exit(0) class ProcessControllerProxy(rpc_utils.RpcProxy): """ A proxy class to expose some methods of multiprocess manager server to listen to remote requests, possible request are: 1. start N more child processes 2. terminate processes with pid1, pid2, ... 3. report running statistics 4. report status of process with pid1 5. terminate all child processes 6. terminate yourself """ exposed_obj_class = ProcessController valid_methods = ['spawn_many', 'list_running', 'list_stats', 'start_action_loop', 'stop_action_loop', 'is_action_loop_running', 'get_dynamic_num_processes', 'set_dynamic_num_processes', 'get_action_policy', 'set_action_policy', 'available_action_policies', 'terminate_all_processes', 'terminate_process', 'mark_for_termination'] def list_running(self): return pformat(self.get_exposed_obj().list_running()) def list_stats(self): return pformat(self.get_exposed_obj().list_stats()) def on_exit(self): self.get_exposed_obj().terminate_all_processes() # TODO: Think why exit codes are sometimes -15 and others 0 class MainProcess(object): def __init__(self): save_pid() signal.signal(signal.SIGTERM, sigterm_handler) def start_proc_control(self): self.proc_control = ProcessController(default_target=worker.start_worker, action_policy='report') def start_rpc_server(self): rpc_proxy = ProcessControllerProxy(self.proc_control) rpc_utils.start_RPC_server(settings.RPC_SERVER_CONF['HOST'], settings.RPC_SERVER_CONF['PORT'], settings.RPC_SERVER_CONF['RPC_PATH'], rpc_proxy) def main(config=None): save_pid(abort_pidfile=True) signal.signal(signal.SIGTERM, sigterm_handler) proc_control = ProcessController(default_target=worker.start_worker, action_policy='report', always_add_kwargs={'external_config': config}) rpc_proxy = ProcessControllerProxy(proc_control) rpc_utils.start_RPC_server(settings.RPC_SERVER_CONF['HOST'], settings.RPC_SERVER_CONF['PORT'], settings.RPC_SERVER_CONF['RPC_PATH'], rpc_proxy) if __name__ == '__main__': main() PK,uG+!)$/$/zmon_worker_monitor/workflow.py#!/usr/bin/env python # -*- coding: utf-8 -*- import os import sys from datetime import datetime, timedelta import base64 import json import logging import time import threading from rpc_client import get_rpc_client from contextlib import contextmanager import settings import eventloghttp import snappy import plugin_manager from redis_context_manager import RedisConnHandler from tasks import load_config_from_file, configure_tasks from tasks import check_and_notify, trial_run, cleanup logger = logging.getLogger(__name__) TASK_POP_TIMEOUT = 5 __config = None def get_config(): global __config if __config is None: __config = settings.get_external_config() or load_config_from_file() return __config def flow_simple_queue_processor(queue='', **execution_context): ''' Simple logic to connect to a redis queue, listen to messages, decode them and execute the tasks :param queue: (str) queue to connect to :param execution_context: (dict) other kwargs that may have been passed when worker was spawn :return: Some info to understand celery messages: 1. 
An example of a celery message as first received (base64-encoded body shortened): ('zmon:queue:default', '{ "body": "eyJleHBpcm...t9fQ==", "headers": {}, "content-type": "application/json", "properties": { "body_encoding": "base64", "correlation_id": "check-277-de_zalando:access-control-kit-1409826332.92", "reply_to": "abc5c87f-74eb-3570-a1cf-e426eaf91ca7", "delivery_info": { "priority": 0, "routing_key": "default", "exchange": "zmon" }, "delivery_mode": 2, "delivery_tag": "94288433-cb4e-4d33-be29-c63e2bbce39a" }, "content-encoding": "utf-8"}' ) 2. An example of the message['body'] after being base64-decoded (args list shortened): { u'utc': True, u'chord': None, u'args': [{u'check_id': 277, u'interval': 60, u'entity': {u'instance_type': u'zomcat', ...}, u'condition': u'>100', ...}], u'retries': 0, u'expires': u'2014-09-04T10:27:32.919152+00:00', u'task': u'check_and_notify', u'callbacks': None, u'errbacks': None, u'timelimit': [90, 60], u'taskset': None, u'kwargs': {}, u'eta': None, u'id': u'check-277-de_zalando:access-control-kit-1409826332.92' } ''' known_tasks = {'check_and_notify': check_and_notify, 'trial_run': trial_run, 'cleanup': cleanup} #get configuration and configure tasks config = get_config() configure_tasks(config) logger.info('Connecting simple_queue_consumer to queue=%s, execution_context=%s', queue, execution_context) RedisConnHandler.configure(**dict(config)) eventloghttp.set_target_host(config.get('eventlog.host','localhost'), config.get('eventlog.port', 8081)) eventloghttp.enable_http(config.get('eventlog.http', True)) reactor = FlowControlReactor.get_instance() conn_handler = RedisConnHandler.get_instance() # try cleanup captures queue in aws context r = conn_handler.get_healthy_conn() r.delete('zmon:captures2graphite') expired_count = 0 count = 0 while True: try: with conn_handler as ch: r_conn = ch.get_healthy_conn() encoded_task = r_conn.blpop(queue, TASK_POP_TIMEOUT) if encoded_task is None: raise ch.IdleLoopException('No task received') queue, msg = encoded_task if not msg[:1] == '{': msg = snappy.decompress(msg) msg_obj = json.loads(msg) msg_body = None body_encoding = msg_obj.get("properties", {}).get("body_encoding") if body_encoding == "nested": msg_body = msg_obj["body"] elif body_encoding == "base64": msg_body = json.loads(base64.b64decode(msg_obj['body'])) elif body_encoding == "snappy": msg_body = json.loads(snappy.decompress(base64.b64decode(msg_obj['body']))) taskname = msg_body['task'] func_args = msg_body['args'] func_kwargs = msg_body['kwargs'] timelimit = msg_body.get('timelimit') # [90, 60] t_hard, t_soft = timelimit # we pass task metadata as a kwargs right now, later will be put in the function context by our decorator task_context = { 'queue': queue, 'taskname': taskname, 'delivery_info': msg_obj.get('properties', {}).get('delivery_info', {}), 'task_properties': { 'task': taskname, 'id': msg_body.get('id', ''), 'expires': msg_body.get('expires'), # '2014-09-04T10:27:32.919152+00:00' 'timelimit': timelimit, # [90, 60] 'utc': msg_body.get('utc', True) }, } # discard tasks that are expired if expire metadata comes with the message cur_time = datetime.utcnow() if task_context['task_properties']['utc'] else datetime.now() expire_time = datetime.strptime(msg_body.get('expires').replace("Z", "").rsplit('+', 1)[0], '%Y-%m-%dT%H:%M:%S.%f') \ if msg_body.get('expires') else cur_time + timedelta(seconds=10) check_id = (msg_body['args'][0].get('check_id', 'xx') if len(msg_body['args']) > 0 and isinstance(msg_body['args'][0], dict) else 'XX') 
logger.debug('task loop analyzing time: check_id=%s, cur_time: %s , expire_time: %s, msg_body["expires"]=%s', check_id, cur_time, expire_time, msg_body.get('expires')) if cur_time < expire_time: with reactor.enter_task_context(taskname, t_hard, t_soft): known_tasks[taskname](*func_args, task_context=task_context, **func_kwargs) else: logger.warn('Discarding task due to time expiration. cur_time: %s , expire_time: %s, msg_body["expires"]=%s ---- msg_body=%s', cur_time, expire_time, msg_body.get('expires'), msg_body) expired_count += 1 if expired_count % 500 == 0: logger.warning("expired tasks count: %s", expired_count) count += 1 except Exception: logger.exception('Exception in redis loop. Details: ') time.sleep(5) # avoid heavy log spam here # some exit condition on failure: maybe when number of consecutive failures > n ? # TODO: Clean redis connection... very important!!!! # disconnect_all() def flow_forked_child(queue='', **kwargs): """ Implement forking a work horse process per message as seen in python_rq """ #TODO: implement pass def flow_forked_childs(queue='', **kwargs): """ Implement forking several work horses process per message? """ #TODO: implement pass def flow_multiprocessing_pool(queue='', **kwargs): """ Implement spawning a pool of workers of multiprocessing process """ #TODO: implement pass class FlowControlReactor(object): """ Implements a singleton object with a permanently running action loop, that can communicate with the parent process (ProcessController) to request certain actions or submit information about the health of this worker. Only implemented capability till now is a "Hard Kill" functionality that kicks in when a task is taking too long to complete. We use a context manager to signal when we enter or leave this mode of operations. Future capabilities may include periodical reports to the parent process about number of processed tasks, mean time spent by the N slowest running tasks. Also a soft kill feature. 
""" _initialized = False _can_init = False _instance = None t_wait = 0.2 def __init__(self): #self.task_agg_info = {} # we could aggregate some info about how tasks are running in this worker assert not self._initialized and self._can_init, 'Call get_instance() to instantiate' self._initialized = True self._pid = os.getpid() self._rpc_client = get_rpc_client('http://{}:{}{}'.format(settings.RPC_SERVER_CONF['HOST'], settings.RPC_SERVER_CONF['PORT'], settings.RPC_SERVER_CONF['RPC_PATH'])) self._current_task_by_thread = {} # {thread_id: (taskname, t_hard, t_soft, tstart)} self.action_on = False self._thread = threading.Thread(target=self.action_loop) self._thread.daemon = True @classmethod def get_instance(cls): if cls._instance is None: cls._can_init = True cls._instance = cls() return cls._instance @contextmanager def enter_task_context(self, taskname, t_hard, t_soft): self.task_received(taskname, t_hard, t_soft) try: yield self except Exception as e: self.task_ended(exc=e) raise else: self.task_ended() def action_loop(self): while self.action_on: try: # hard kill logic for th_name, (taskname, t_hard, t_soft, ts) in self._current_task_by_thread.copy().items(): if time.time() > ts + t_hard: logger.warn('Hard Kill request started for worker pid=%s, task: %s, t_hard=%d', self._pid, taskname, t_hard) self._rpc_client.mark_for_termination(self._pid) # rpc call to parent asking for a kill self._current_task_by_thread.pop(th_name, {}) except Exception: logger.exception('Scary Error in FlowControlReactor.action_loop(): ') time.sleep(self.t_wait) def start(self): self.action_on = True self._thread.start() def stop(self): self.action_on = False def task_received(self, taskname, t_hard, t_soft): # this sets a timer for this task, there is only one task per thread, and right now only main thread produce self._current_task_by_thread[threading.currentThread().getName()] = (taskname, t_hard, t_soft, time.time()) def task_ended(self, exc=None): # delete the task from the list self._current_task_by_thread.pop(threading.currentThread().getName(), {}) def start_worker_for_queue(flow='simple_queue_processor', queue='zmon:queue:default', **execution_context): """ Starting execution point to the workflows """ known_flows = {'simple_queue_processor': flow_simple_queue_processor} if flow not in known_flows: logger.exception("Bad role: %s" % flow) sys.exit(1) logger.info("Starting worker with pid=%s, flow type: %s, queue: %s, execution_context: %s", os.getpid(), flow, queue, execution_context) # init the plugin manager plugin_manager.init_plugin_manager() # load external plugins (should be run only once) plugin_manager.collect_plugins(global_config=get_config(), load_builtins=True, load_env=True) # start Flow Reactor here FlowControlReactor.get_instance().start() exit_code = 0 try: known_flows[flow](queue=queue, **execution_context) except (KeyboardInterrupt, SystemExit): logger.warning("Caught user signal to stop consumer: finishing!") except Exception: logger.exception("Exception in start_worker(). 
Details: ") exit_code = 2 finally: FlowControlReactor.get_instance().stop() sys.exit(exit_code) PKhG1  !zmon_worker_monitor/rpc_client.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ Client module for connecting to an rpc_server in a remote machine Test of execution: python rpc_client.py http://localhost:8000/patata popo_method int:3.5 float:5.6 """ import sys import xmlrpclib DEBUG = True _cmd_struct = { 'endpoint': None, 'method_name': None, 'args': [] } def parse_cmd_line(args): admitted_types = ('int', 'float', 'str') cmd_parts = dict(_cmd_struct) cmd_parts['endpoint'] = args[1] cmd_parts['method_name'] = args[2] cmd_parts['args'] = [] raw_method_args = args[3:] for raw_arg in raw_method_args: #arg_parts = raw_arg.strip('"\'').split(':') arg_parts = raw_arg.split(':') if len(arg_parts) == 1 or arg_parts[0] not in admitted_types: arg_type, arg_value = 'str', ':'.join(arg_parts[0:]) if arg_value.isdigit(): arg_type = 'int' elif not (arg_value.startswith('.') or arg_value.endswith('.')) and arg_value.replace('.', '', 1).isdigit(): arg_type = 'float' else: arg_type, arg_value = arg_parts[0], ':'.join(arg_parts[1:]) try: value = eval('{0}({1})'.format(arg_type, arg_value)) if arg_type != 'str' else arg_value except Exception: print >> sys.stderr, "\n Error: Detected argument with wrong format" sys.exit(3) cmd_parts['args'].append(value) return cmd_parts def get_rpc_client(endpoint): """ Get an rpc client object to remote server listening at endpoint :param endpoint: http://host:port/rpc_path :return: rpc_client object """ return xmlrpclib.ServerProxy(endpoint) if __name__ == '__main__': if len(sys.argv) <= 2: print >>sys.stderr, 'usage: {0} http://:/ [ [int|float|str]:arg1 ' \ '[int|float|str]:arg2 ...[int|float|str]:argN ...]'.format(sys.argv[0]) sys.exit(1) cmd_line = parse_cmd_line(sys.argv[:]) if DEBUG: print 'Parsed cmd_line: ', cmd_line client = get_rpc_client(cmd_line['endpoint']) #Executing now the remote method result = getattr(client, cmd_line['method_name'])(*cmd_line['args']) if result is not None: print ">>Result:\n", result PKhG#zmon_worker_monitor/eventloghttp.pyimport eventlog import json import requests import datetime from eventlog import Event _target_host = 'localhost' _target_port = 8081 _enable_http = True def set_target_host(host='localhost', port='8081'): global _target_host, _target_port _target_host = host _target_port = port def enable_http(enable=True): global _enable_http _enable_http = enable def register_all(events, path=None): eventlog.register_all(events, path) def log(e_id, **kwargs): # for now forward everything eventlog.log(e_id, **kwargs) if not _enable_http: return now = datetime.datetime.now() headers = {'content-type': 'application/json'} event = {'typeId': e_id, 'attributes': kwargs, 'time': now.strftime("%Y-%m-%dT%H:%M:%S.")+now.strftime("%f")[:3]} try: res = requests.put('http://{}:{}/'.format(_target_host, _target_port), data=json.dumps([event]), headers=headers) except Exception as e: pass PKhG>zmon_worker_monitor/tasks.py#!/usr/bin/env python # -*- coding: utf-8 -*- import logging from ConfigParser import ConfigParser import settings from zmon_worker.tasks.notacelery_task import NotaZmonTask from zmon_worker.notifications.mail import Mail from zmon_worker.notifications.sms import Sms logger = logging.getLogger(__name__) def load_config_from_file(): # load the global section of the configuration as a dict (eval to get values as python objects) c = ConfigParser() c.readfp(open(settings.zmon_worker_config_file)) config = dict((key, eval(val)) 
for key, val in c.items('global')) logger.info('loaded worker config is: %s', config) return config def configure_tasks(config): #Pass configuration to zmon classes NotaZmonTask.configure(config) Mail.update_config(config) Sms.update_config(config) zmontask = NotaZmonTask() def check_and_notify(req, alerts, task_context=None, **kwargs): logger.debug('check_and_notify received req=%s, alerts=%s, task_context=%s, ', req, alerts, task_context) zmontask.check_and_notify(req, alerts, task_context=task_context) def trial_run(req, alerts, task_context=None, **kwargs): logger.info('trial_run received <== check_id=%s', req['check_id']) logger.debug('trial_run received req=%s, alerts=%s, task_context=%s, ', req, alerts, task_context) zmontask.trial_run(req, alerts, task_context=task_context) def cleanup(*args, **kwargs): logger.info('cleanup task received with args=%s, kwargs=%s', args, kwargs) zmontask.cleanup(*args, **kwargs) PKhGo;(zmon_worker_monitor/builtins/__init__.py__author__ = 'avalles' PKGf:zmon_worker_monitor/builtins/plugins/history.worker_plugin[Core] Name = history Module = history [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY kairosdb_host = cassandra01 kairosdb_port = 37629 kairosdb_history_enabled = False PKGX8zmon_worker_monitor/builtins/plugins/ping_.worker_plugin[Core] Name = ping Module = ping_ [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
;; key = value PK0}GxW-zmon_worker_monitor/builtins/plugins/zmon_.py#!/usr/bin/env python # -*- coding: utf-8 -*- import logging from zmon_worker_monitor.zmon_worker.errors import CheckError from functools import partial from suds.client import Client from zmon_worker_monitor.zmon_worker.common.time_ import parse_timedelta from timeperiod import in_period, InvalidFormat from zmon_worker_monitor.zmon_worker.common.utils import async_memory_cache import sys import json import redis import time from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial logger = logging.getLogger(__name__) CHECK_REFRESH_TIME = 240 ALERT_REFRESH_TIME = 120 class ZmonFactory(IFunctionFactoryPlugin): def __init__(self): super(ZmonFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(ZmonWrapper, factory_ctx['zmon_url'], factory_ctx['redis_host'], factory_ctx['redis_port']) class ZmonWrapper(object): ZMON_ALERTS_ENTITIES_PATTERN = 'zmon:alerts:*:entities' def __init__(self, wsdl, host, port): # TODO: ZMON Controller no longer provides a SOAP endpoint try: self.__ws_client = Client(url=wsdl) self.__ws_client.set_options(cache=None) except Exception: raise CheckError('ZmonWrapper Error: failed to connect to zmon-controller') self.__redis = redis.StrictRedis(host, port) self.__checks = {} self.__alerts = [] self.logger = logger self.__checks = self.__load_check_definitions() self.__alerts = self.__load_alert_definitions() @async_memory_cache.cache_on_arguments(namespace='zmon-worker', expiration_time=ALERT_REFRESH_TIME) def __load_alert_definitions(self): try: response = self.__ws_client.service.getAllActiveAlertDefinitions() except Exception: self.logger.exception('ZmonWrapper Error: failed to load alert definitions') raise CheckError('ZmonWrapper Error: failed to load alert definitions'), None, sys.exc_info()[2] else: return [{ 'id': a.id, 'team': a.team, 'responsible_team': a.responsibleTeam, 'check_id': a.checkDefinitionId, 'period': (a.period or '' if hasattr(a, 'period') else ''), } for a in response[1]] @async_memory_cache.cache_on_arguments(namespace='zmon-worker', expiration_time=CHECK_REFRESH_TIME) def __load_check_definitions(self): try: response = self.__ws_client.service.getAllActiveCheckDefinitions() except Exception: self.logger.exception('ZmonWrapper Error: failed to load check definitions') raise CheckError('ZmonWrapper Error: failed to load check definitions'), None, sys.exc_info()[2] else: return dict((c.id, {'interval': c.interval}) for c in response[1]) @staticmethod def _is_entity_alert_stale(last_run, period): ''' Checks whether check's last run is within given period. 
>>> ZmonWrapper._is_entity_alert_stale(None, 60) False >>> ZmonWrapper._is_entity_alert_stale(time.time(), 10) False >>> ZmonWrapper._is_entity_alert_stale(time.time() - 20, 10) True ''' return (False if last_run is None else time.time() - last_run > period) def __is_alert_stale(self, alert, evaluated_alerts, check_results, multiplier, offset): a_id = alert['id'] # alert id c_id = alert['check_id'] # check id r_id = partial('{}:{}'.format, c_id) # helper function used in iterator to generate result id try: is_in_period = in_period(alert.get('period', '')) except InvalidFormat: self.logger.warn('Alert with id %s has malformed time period.', a_id) is_in_period = True if is_in_period: return a_id not in evaluated_alerts or any(self._is_entity_alert_stale(check_results.get(r_id(entity)), multiplier * self.__checks[c_id]['interval'] + offset) for entity in evaluated_alerts[a_id]) else: return False def stale_active_alerts(self, multiplier=2, offset='5m'): ''' Returns a list of alerts that weren't executed in a given period of time. The period is calculated using multiplier and offset: check's interval * multiplier + offset. Parameters ---------- multiplier: int Multiplier for check's interval. offset: str Time offset, for details see parse_timedelta function in zmon-worker/src/function/time_.py. Returns ------- list A list of stale active alerts. ''' alert_entities = self.__redis.keys(self.ZMON_ALERTS_ENTITIES_PATTERN) # Load evaluated alerts and their entities from redis. p = self.__redis.pipeline() for key in alert_entities: p.hkeys(key) entities = p.execute() evaluated_alerts = dict((int(key.split(':')[2]), entities[i]) for (i, key) in enumerate(alert_entities)) # Load check results for previously loaded alerts and entities. check_ids = [] for alert in self.__alerts: if alert['id'] in evaluated_alerts: for entity in evaluated_alerts[alert['id']]: p.lindex('zmon:checks:{}:{}'.format(alert['check_id'], entity), 0) check_ids.append('{}:{}'.format(alert['check_id'], entity)) results = p.execute() check_results = dict((check_id, json.loads(results[i])['ts']) for (i, check_id) in enumerate(check_ids) if results[i]) return [{'id': alert['id'], 'team': alert['team'], 'responsible_team': alert['responsible_team']} for alert in self.__alerts if self.__is_alert_stale(alert, evaluated_alerts, check_results, multiplier, parse_timedelta(offset).total_seconds())] def check_entities_total(self): ''' Returns total number of checked entities. 
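The total is obtained by summing the entity counts of all zmon:alerts:*:entities
hashes in redis; roughly equivalent to (illustration only):
sum(len(self.__redis.hkeys(k)) for k in self.__redis.keys(self.ZMON_ALERTS_ENTITIES_PATTERN))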
''' alert_entities = self.__redis.keys(self.ZMON_ALERTS_ENTITIES_PATTERN) p = self.__redis.pipeline() for key in alert_entities: p.hkeys(key) entities = p.execute() return sum(len(e) for e in entities) PK,uGT>-{-{.zmon_worker_monitor/builtins/plugins/nagios.py#!/usr/bin/env python # -*- coding: utf-8 -*- import sys import json import logging import re import shlex import subprocess32 from functools import partial, wraps from zmon_worker_monitor.zmon_worker.errors import CheckError, NagiosError from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial logger = logging.getLogger(__name__) # only return 95% of diskspace because of http://unix.stackexchange.com/questions/7950/reserved-space-for-root-on-a-filesystem-why USABLE_DISKSPACE_FACTOR = 0.95 class NagiosFactory(IFunctionFactoryPlugin): def __init__(self): super(NagiosFactory, self).__init__() # fields from config self._exarpc_user = None self._exarpc_pass = None self._loungemysql_user = None self._loungemysql_pass = None self._hetcrawler_proxy_user = None self._hetcrawler_proxy_pass = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._exarpc_user = conf['exarpc_user'] self._exarpc_pass = conf['exarpc_pass'] self._loungemysql_user = conf['loungemysql_user'] self._loungemysql_pass = conf['loungemysql_pass'] self._hetcrawler_proxy_user = conf['hetcrawler_proxy_user'] self._hetcrawler_proxy_pass = conf['hetcrawler_proxy_pass'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(NagiosWrapper, factory_ctx['host'], exasol_user=self._exarpc_user, exasol_password=self._exarpc_pass, lounge_mysql_user=self._loungemysql_user, lounge_mysql_password=self._loungemysql_pass, hetcrawler_proxy_user=self._hetcrawler_proxy_user, hetcrawler_proxy_pass=self._hetcrawler_proxy_pass) def error_wrapped(parser): @wraps(parser) def wrapper(*args, **kwargs): try: result = parser(*args, **kwargs) except Exception: raise NagiosError(args[0]) else: return result return wrapper def fix_sub32_exc(e): '''Quick and dirty way to deal with subprocess32 bugs. 
See PF-4190''' if isinstance(e, subprocess32.TimeoutExpired): for field in ['output', 'timeout', 'cmd']: if not hasattr(e, field): setattr(e, field, '--') if isinstance(e, subprocess32.CalledProcessError): for field in ['returncode', 'output', 'cmd']: if not hasattr(e, field): setattr(e, field, '--') return e class NagiosWrapper(object): def __init__(self, host, exasol_user='nagios', exasol_password='', lounge_mysql_user='nagios', lounge_mysql_password='', hetcrawler_proxy_user='', hetcrawler_proxy_pass=''): self.host = host self.__nrpe_config = { # example to check non-default memcached: # nagios().nrpe('check_memcachestatus', port=11212) # example to check non-default logwatch: # requires NRPE: command[check_all_disks]=/usr/lib/nagios/plugins/check_disk -w "$ARG1$" -c "$ARG2$" -u "$ARG3$" 'check_diff_reverse': {'args': '-a /proc/meminfo CommitLimit Committed_AS kB 1048576 524288', 'parser': self._to_dict_commitdiff}, 'check_disk': {'args': '-a 15% 7% /', 'parser': self._parse_memory}, 'check_all_disks': {'args': '-a 15% 7% MB', 'parser': self._parse_disks}, 'check_fiege-avis-file': {'args': '', 'parser': self._to_dict_from_text}, 'check_findfiles': {'args': '-a 20,20,20 20,20,20 {directory} {epoch} found', 'parser': partial(self._to_dict, func=int), 'parameters': {'directory': '', 'epoch': 1}}, 'check_findfiles_names': {'args': '-a 20,20,20 20,20,20 {directory} {epoch} found {name}', 'parser': partial(self._to_dict, func=int), 'parameters': {'directory': '', 'epoch': 1, 'name': ''}}, 'check_findfiles_names_exclude': {'args': '-a 20,20,20 20,20,20 {directory} {epoch} found {name}', 'parser': partial(self._to_dict, func=int), 'parameters': {'directory': '', 'epoch': 1, 'name': ''}}, 'check_hpacucli_py': {'args': '', 'parser': json.loads}, 'check_hpacucli': {'args': '', 'parser': self._to_dict_hpraid}, 'check_hpasm_dl380p_gen8_fix': {'args': '-a 14:60 15:60', 'parser': self._to_dict_hpasm}, 'check_hpasm_fix_power_supply': {'args': '-a 14:60 15:60', 'parser': self._to_dict_hpasm}, 'check_hpasm_gen8': {'args': '-a 14:60 15:60', 'parser': self._to_dict_hpasm}, 'check_inodes': {'args': '', 'parser': json.loads}, 'check_iostat': {'args': '-a 32000,30000 64000,40000 {disk}', 'parser': self._to_dict_iostat, 'parameters': {'disk': 'sda'}}, 'check_list_timeout': { 'args': '-a "ls {path}" {timeout}', 'parameters': {'path': '/data/production/', 'timeout': 10}, 'parser': self._to_dict, 'pre_run_hook': self._check_path_chars, }, 'check_load': {'args': '-a 15,14,12 20,17,15', 'parser': self._to_dict}, 'check_mailq_postfix': {'args': '-a 10 5000', 'parser': partial(self._to_dict, func=int)}, 'check_postfix_queue.sh': {'parser': partial(self._to_dict, func=int)}, 'check_memcachestatus': {'args': '-a 9000000,550,10000,100,6000,5000,20481024,20481024 99000000,1000,12000,200,8000,7000,40961024,40961024 127.0.0.1 {port}', 'parser': self._to_dict, 'parameters': {'port': 11211}}, 'check_ntp_time': {'args': '-a 1 2 10 {ntp_server}', 'parser': self._to_dict}, 'check_openmanage': {'args': '', 'parser': self._to_dict_hpasm}, 'check_subdomain_redirect': {'args': '', 'parser': self._to_dict_from_text}, 'check_icmp': {'args': '-a {targethost} {num_of_packets} {timeout}', 'parser': self._to_dict, 'parameters': {'targethost': 'default', 'num_of_packets': 5, 'timeout': 10}}, 'check_tcp': {'args': '-a {targethost} {port} {timeout}', 'parser': self._to_dict, 'parameters': {'targethost': 'default', 'port': 22, 'timeout': 10}}, 'check_tcp_str': {'args': '-a {targethost} {port} {timeout} {expected}', 'parser': self._to_dict, 
'parameters': { 'targethost': 'default', 'port': 22, 'timeout': 10, 'expected': 'SSH-2.0-OpenSSH', }}, 'check_ssl': {'args': '-a {targethost} {port} {timeout}', 'parser': self._to_dict, 'parameters': {'targethost': 'default', 'port': 443, 'timeout': 10}}, 'check_statistics.pl': {'args': '', 'parser': self._to_dict}, 'check_oracle': {'args': '{user_args}', 'parser': self._to_dict, 'parameters': {'user_args': ''}}, 'check_dbus': {'args': '', 'parser': self._to_dict_win_text,}, 'check_flocked_file': {'args': '-a {lockfile}', 'parser': self._to_dict_from_text}, 'check_apachestatus_uri': {'args': '-a 16000,10000,48 32000,20000,64 {url}', 'parser': self._to_dict, 'parameters': {'url': 'http://127.0.0.1/server-status?auto'}}, 'check_command_procs': {'args': '-a 250 500 {process}', 'parser': self._to_dict_procs, 'parameters': {'process': 'httpd'}}, 'check_http_expect_port_header': {'args': '-a 2 8 60 {ip} {url} {redirect} {size} {expect} {port} {hostname}', 'parser': self._to_dict, 'parameters': { 'ip': 'localhost', 'url': '/', 'redirect': 'warning', 'size': '9000:90000', 'expect': '200', 'port': '88', 'hostname': 'www.example.com', }}, 'check_mysql_processes': {'args': '-a 30 60 {host} {port} {user} {password}', 'parser': self._to_dict_mysql_procs, 'parameters': { 'host': 'localhost', 'port': '/var/lib/mysql/mysql.sock', 'user': lounge_mysql_user, 'password': lounge_mysql_password, }}, 'check_mysqlperformance': {'args': '-a 10000,1500,5000,500,750,100,100,1,5000,30,60,500,10,30 15000,3000,10000,750,1000,250,250,5,7500,60,300,1000,20,60 {host} {port} Questions,Com_select,Qcache_hits,Com_update,Com_insert,Com_delete,Com_replace,Aborted_clients,Com_change_db,Created_tmp_disk_tables,Created_tmp_tables,Qcache_not_cached,Table_locks_waited,Select_scan {user} {password}', 'parser': self._to_dict, 'parameters': { 'host': 'localhost', 'port': '/var/lib/mysql/mysql.sock', 'user': lounge_mysql_user, 'password': lounge_mysql_password, }}, 'check_mysql_slave': {'args': '-a 3 60 {host} {port} {database} {user} {password}', 'parser': self._to_dict_mysql_slave, 'parameters': { 'host': 'localhost', 'port': '/var/lib/mysql/mysql.sock', 'database': 'zlr_live_global', 'user': lounge_mysql_user, 'password': lounge_mysql_password, }}, 'check_stunnel_target': { 'args': '-a {target} {user} {password}', 'parser': self._to_dict, 'parameters': { 'target': 'www.example.com', 'user': hetcrawler_proxy_user, 'password': hetcrawler_proxy_pass, }, }, 'check_lounge_queries': {'args': '', 'parser': self._to_dict_lounge_queries}, 'check_newsletter': {'args': '-p {port}', 'parser': self._to_dict_newsletter, 'parameters': {'port': '5666'}}, 'check_nfs_mounts': {'args': '', 'parser': self._to_dict_list}, 'check_kdc': {'args': '', 'parser': json.loads}, 'check_kadmin': {'args': '', 'parser': json.loads}, 'check_ssl_cert': {'args': '-a 60 30 {host_ip} {domain_name}', 'parser': partial(self._to_dict, func=int), 'parameters': {'host_ip': '127.0.0.1', 'domain_name': 'www.example.com'}}, } self.__local_config = { 'check_subdomain_redirect.py': {'args': '', 'parser': self._to_dict_from_text}, 'check_ping': {'args': '-H {} -w 5000,100% -c 5000,100% -p 1'.format(self.host), 'parser': self._to_dict}, 'check_snmp_mem_used-cached.pl': {'args': '-H {} -w 100,100,100 -c 100,100,100 -C public -f'.format(self.host), 'parser': self._to_dict}, 'check_icmp': {'args': '-H {} -n {{num_of_packets}} -t {{timeout}}'.format(self.host), 'parser': self._to_dict, 'parameters': {'num_of_packets': 5, 'timeout': 10}}, 'check_tcp': {'args': '-H {} -p {{port}} -t 
{{timeout}}'.format(self.host), 'parser': self._to_dict, 'parameters': {'port': 22, 'timeout': 10}}, 'check_tcp_str': {'args': '-H {} -p {{port}} -t {{timeout}} -e {{expected}}'.format(self.host), 'parser': self._to_dict, 'parameters': {'port': 22, 'timeout': 10, 'expected': 'SSH-2.0-OpenSSH'}}, 'check_ssl': {'args': '-H {} -p {{port}} -t {{timeout}} -S'.format(self.host), 'parser': self._to_dict, 'parameters': {'port': 443, 'timeout': 10}}, 'check_dns': {'args': '-H {host} -s {dns_server} -t {timeout}', 'parser': self._to_dict, 'parameters': {'timeout': 5}}, 'check_snmp_process.pl': {'args': '-H {} -C {{community}} -F -n {{name}} -c {{critical}} -w {{warn}} -o {{octets}} {{extra}}'.format(self.host), 'parser': self._to_dict, 'parameters': { 'timeout': 5, 'octets': 2400, 'warn': 1, 'critical': 1, 'community': 'public', 'extra': '-r -2', }}, # check_xmlrpc.rb: BI checks for PF-3558 # possible user_args: # Exasol: backup state -> '-check-backup' (default) # Exasol: DB-Status -> '--rpc getDatabaseState --ok' # Exasol: Node status -> '--check-node-states -w 0 -c 1' # Exasol: Verbindungs-Status -> '--rpc getDatabaseConnectionState --ok yes' 'check_xmlrpc.rb': {'args': '--url http://{user}:{password}@{targethost}/cluster1/db_exa_db1 {user_args}', 'parser': self._to_dict_newsletter, 'parameters': { 'targethost': '10.229.12.212', 'user': exasol_user, 'password': exasol_password, 'user_args': '-check-backup', }}, 'check_ssl_cert': {'args': '-w 60 -c 30 -H {host_ip} -n {domain_name} -r /etc/ssl/certs --altnames', 'parser': partial(self._to_dict, func=int), 'parameters': {'host_ip': '127.0.0.1', 'domain_name': 'www.example.com'}}, 'check-ldap-sync.pl': {'args': '', 'parser': json.loads}, } self.__win_config = { 'CheckCounter': {'args': '-a "Counter:ProcUsedMem=\\Process({process})\\Working Set" ShowAll MaxWarn=1073741824 MaxCrit=1073741824', 'parser': partial(self._to_dict_win, func=int), 'parameters': {'process': 'eo_server'}}, 'CheckCPU': {'args': '-a warn=100 crit=100 time=1 warn=100 crit=100 time=5 warn=100 crit=100 time=10', 'parser': partial(self._to_dict_win, func=int)}, 'CheckDriveSize': {'args': '-a CheckAll ShowAll perf-unit=M', 'parser': self._to_dict_win}, 'CheckEventLog': {'args': '-a file="{log}" MaxWarn=1 MaxCrit=1 "filter={query}" truncate=800 unique "syntax=%source% (%count%)"', 'parser': partial(self._to_dict_win, func=int), 'parameters': {'log': 'application', 'query': 'generated gt -7d AND type=\'error\''}}, 'CheckFiles': {'args': '-a "path={path}" "pattern={pattern}" "filter={query}" MaxCrit=1', 'parser': partial(self._to_dict_win, func=int), 'parameters': {'path': 'C:\\Import\\Exchange2Clearing', 'pattern': '*.*', 'query': 'creation lt -1h'}}, 'CheckLogFile': {'args': '-a file="{logfile}" column-split="{seperator}" "filter={query}"', 'parser': self._to_dict_win_log, 'parameters': {'logfile': 'c:\Temp\log\maxflow_portal.log', 'seperator': ' ', 'query': 'column4 = \'ERROR\' OR column4 = \'FATAL\''}}, 'CheckMEM': {'args': '-a MaxWarn=15G MaxCrit=15G ShowAll perf-unit=M type=physical type=page type=virtual', 'parser': self._to_dict_win}, 'CheckProcState': {'args': '-a ShowAll {process}', 'parser': self._to_dict_win_text, 'parameters': {'process': 'check_mk_agent.exe'}}, 'CheckServiceState': {'args': '-a ShowAll {service}', 'parser': self._to_dict_win_text, 'parameters': {'service': 'ENAIO_server'}}, 'CheckUpTime': {'args': '-a MinWarn=1000d MinCrit=1000d', 'parser': partial(self._to_dict_win, func=int)}, } def nrpe(self, check, timeout=60, **kwargs): config = 
self.__nrpe_config[check] parameters = {} parameters.update(config.get('parameters', {})) parameters.update(kwargs) pre_run_hook_ok = config.get('pre_run_hook', self._check_ok) if not pre_run_hook_ok(config.get('parameters', {})): raise CheckError('Pre run hook does not accept your parameters') cmd_args = config['args'].format(**parameters) cmd = shlex.split('/usr/lib/nagios/plugins/check_nrpe -u -H {h} -t {t} -c {c} {a}'.format(h=self.host, t=timeout, c=check, a=cmd_args)) try: output = subprocess32.check_output(cmd, shell=False, timeout=timeout) except subprocess32.CalledProcessError, e: e = fix_sub32_exc(e) if e.returncode < 3: output = str(e.output) else: output = str(e.output) return output except subprocess32.TimeoutExpired, e: e = fix_sub32_exc(e) raise NagiosError(str(e)), None, sys.exc_info()[2] logger.debug('output for cmd (%s): %s', cmd, output) return self.__nrpe_config[check]['parser'](output) def local(self, check, timeout=60, **kwargs): config = self.__local_config[check] parameters = {} parameters.update(config.get('parameters', {})) parameters.update(kwargs) pre_run_hook_ok = config.get('pre_run_hook', self._check_ok) if not pre_run_hook_ok(config.get('parameters', {})): raise CheckError('Pre run hook does not accept your parameters') cmd_args = config['args'].format(**parameters) cmd = shlex.split('/usr/lib/nagios/plugins/{c} {a}'.format(c=check, a=cmd_args)) try: output = subprocess32.check_output(cmd, shell=False, timeout=timeout) except subprocess32.CalledProcessError, e: e = fix_sub32_exc(e) if e.returncode < 3: output = str(e.output) else: output = str(e.output) return output except subprocess32.TimeoutExpired, e: e = fix_sub32_exc(e) raise NagiosError(str(e)), None, sys.exc_info()[2] logger.debug('output for cmd (%s): %s', cmd, output) return self.__local_config[check]['parser'](output) def win(self, check, timeout=60, **kwargs): config = self.__win_config[check] parameters = {} parameters.update(config.get('parameters', {})) parameters.update(kwargs) pre_run_hook_ok = config.get('pre_run_hook', self._check_ok) if not pre_run_hook_ok(config.get('parameters', {})): raise CheckError('Pre run hook does not accept your parameters') cmd_args = config['args'].format(**parameters) cmd = shlex.split('/usr/lib/nagios/plugins/check_nrpe -u -H {h} -t {t} -c {c} {a}'.format(h=self.host, t=timeout, c=check, a=cmd_args)) try: output = subprocess32.check_output(cmd, shell=False, timeout=timeout) except subprocess32.CalledProcessError, e: e = fix_sub32_exc(e) if e.returncode < 3: output = str(e.output) else: output = str(e.output) return output except subprocess32.TimeoutExpired, e: e = fix_sub32_exc(e) raise NagiosError(str(e)), None, sys.exc_info()[2] logger.debug('output for cmd (%s): %s', cmd, output) return self.__win_config[check]['parser'](output) @staticmethod def _check_ok(config): '''Returns always True (ok) as result.''' return True @staticmethod def _check_path_chars(config): return re.match("^[a-zA-Z0-9\/_\-]+$", config['path']) @staticmethod @error_wrapped def _to_dict(output, func=float): return dict((a.split('=')[0], func(re.sub(r'.*?(-?[0-9]*\.[0-9]+|-?[0-9]+).*', r'\1', a.split('=')[1].split(';')[0]))) for a in output.split('|')[-1].split()) @staticmethod @error_wrapped def _to_dict_list(output): return dict((a, 1) for a in output.split('|')[-1].split()) @staticmethod @error_wrapped def _to_dict_olderfiles(output): '''try to parse this output: OK - 0 files older as 600 min -- 0 files older as 540 min -- total 762 files -- older: >>> import json; 
json.dumps(NagiosWrapper._to_dict_olderfiles('OK - 0 files older as 600 min -- 112 files older as 60 min -- total 831 files -- older:'), sort_keys=True) '{"files older than time01": 112, "files older than time02": 0, "total files": 831}' ''' return {'files older than time01': int(output.split(' -- ')[1].split()[0]), 'files older than time02': int(output.split(' -- ')[0].split()[2]), 'total files': int(output.split(' -- ')[2].split()[1])} @staticmethod @error_wrapped def _to_dict_win(output, func=float): '''try to parse this output: OK: physical memory: 4.8G, page file: 5.92G, virtual memory: 254M|'physical memory %'=29%;6;6 'physical memory'=5028644K;15728640;15728640;0;16776760 'page file %'=18%;53;53 'page file'=6206652K;15728640;15728640;0;33472700 'virtual memory %'=0%;99;99 'virtual memory'=259704K;15728640;15728640;0;8589934464 ''' return dict((a.split('=')[0], func(re.sub('[^0-9.]', '', a.split('=')[1].split(';')[0]))) for a in shlex.split(output.split('|')[-1])) @staticmethod @error_wrapped def _to_dict_win_text(output): '''try to parse this output: OK: ENAIO_server: started ''' po = {'status': output.split(':')[0], 'message': ' '.join(output.split(' ')[1:]).strip(' \n').split('|')[0]} if not po['status'] or not po['message']: raise NagiosError('Unable to parse {}'.format(output)) return po @staticmethod @error_wrapped def _to_dict_win_log(output): '''try to parse this output: c:\Temp\log\maxflow_portal.log2014.04.29: 1 (2014-04-29 15:44:00,741 [5] WARN BeckIT.MPO.... ) ''' if 'Nothing matched' in output: return {'count': 0} else: return {'count': int(output.split(' ')[1].strip(' '))} @staticmethod @error_wrapped def _to_dict_commitdiff(output): ''' try to parse this output: CheckDiff OK - CommitLimit-Committed_AS: 24801200 | 24801200;1048576;524288 ''' return {(output.split(' ')[-4])[:-1]: int(output.split(' ')[-3])} @staticmethod @error_wrapped def _to_dict_logwatch(output): ''' try to parse this output: WARNING - 0 new of 109 Lines on /access.log|newlines=0;100;5000;0; or OK - 0 new of 109 Lines on /access.log|newlines=0;1000;5000;0; ''' return {'last': int(output.split(' ')[2]), 'total': int(output.split(' ')[5])} @staticmethod @error_wrapped def _to_dict_from_text(output): ''' try to parse this output: Status ERROR : Avis-File for Fiege is missing or Status OK : file is ... ''' return {'status': output.split(' ')[1].strip('\n'), 'message': ' '.join(output.split(' ')[3:]).strip('\n')} @staticmethod @error_wrapped def _to_dict_procs(output): ''' try to parse this output: PROCS OK: 33 processes with command name 'httpd' ''' return {'procs': int(output.split(' ')[2])} @staticmethod @error_wrapped def _to_dict_mysql_procs(output): ''' try to parse this output: 1 threads running. 
0 seconds avg ''' return {'threads': int(output.split(' ')[0]), 'avg': int(output.split(' ')[3])} @staticmethod @error_wrapped def _to_dict_mysql_slave(output): '''try to parse this output: Uptime: 38127526 Threads: 2 Questions: 42076974272 Slow queries: 1081545 Opens: 913119 Flush tables: 889 Open tables: 438 Queries per second avg: 1103.585 Slave IO: Yes Slave SQL: Yes Seconds Behind Master: 0 ''' po = dict(re.findall('([a-z][a-z0-9 ]+): ([a-z0-9.()]+)', output, re.IGNORECASE)) for k, v in po.items(): if not re.match('[a-z()]', v, re.IGNORECASE): po[k] = float(v) return po @staticmethod @error_wrapped def _to_dict_lounge_queries(output): '''try to parse this output: QUERY OK: 'SELECT COUNT(*) FROM global_orders WHERE ( billing_address LIKE '%Rollesbroich%' OR shipping_address LIKE '%Rollesbroich%' OR email LIKE '%Rollesbroich%' OR billing_address LIKE '%Süddeutsche TV%' OR shipping_address LIKE '%Süddeutsche TV%' OR email='sparfuchs-galileo@gmx.de' ) AND order_date >= DATE_SUB(CURDATE(),INTERVAL 1 DAY);' returned 0.000000 ''' return {'status': ' '.join(output.split()[:2]).strip(':'), 'query': ' '.join(output.split()[2:-2]), 'result': float(output.split()[-1])} @staticmethod @error_wrapped def _to_dict_iostat(output): ''' try to parse this output: OK - IOread 0.00 kB/s, IOwrite 214.80 kB/s on sda with 31.10 tps |ioread=0.00;32000;64000;0;iowrite=214.80;30000;40000;0; ''' return {'ioread': float(output.split(' ')[3]), 'iowrite': float(output.split(' ')[6]), 'tps': float(output.split(' ')[13])} @staticmethod @error_wrapped def _to_dict_hpraid(output): ''' try to parse this output: logicaldrive 1 (68.3 GB, RAID 1, OK) -- physicaldrive 1I:1:1 (port 1I:box 1:bay 1, SAS, 146 GB, OK) -- physicaldrive 1I:1:2 (port 1I:box 1:bay 2, SAS, 72 GB, OK) -- logicaldrive 2 (279.4 GB, RAID 1, OK) -- physicaldrive 1I:1:3 (port 1I:box 1:bay 3, SAS, 300 GB, OK) -- physicaldrive 1I:1:4 (port 1I:box 1:bay 4, SAS, 300 GB, OK) -- logicaldrive 3 (279.4 GB, RAID 1, OK) -- physicaldrive 2I:1:5 (port 2I:box 1:bay 5, SAS, 300 GB, OK) -- physicaldrive 2I:1:6 (port 2I:box 1:bay 6, SAS, 300 GB, OK) -- ''' return dict((b.split(' ')[0] + '_' + b.split(' ')[1], b.split(',')[-1].strip(' )')) for b in [a.strip(' --\n') for a in output.split(' -- ')]) @staticmethod @error_wrapped def _to_dict_newsletter(output): ''' try to parse this output: OK: Not in timeframe (02:25:00 - 09:00:00) or CRITICAL: NL not processed for appdomain 17, Not processed for appdomain 18 ''' return {'status': output.split(': ')[0], 'messages': output.split(': ')[-1].strip('\n').split(',')} @staticmethod @error_wrapped def _to_dict_hpasm(output): ''' try to parse this output: OK - System: 'proliant dl360 g6', S/N: 'CZJ947016M', ROM: 'P64 05/05/2011', hardware working fine, da: 3 logical drives, 6 physical drives cpu_0=ok cpu_1=ok ps_2=ok fan_1=46% fan_2=46% fan_3=46% fan_4=46% temp_1=21 temp_2=40 temp_3=40 temp_4=35 temp_5=34 temp_6=37 temp_7=32 temp_8=36 temp_9=32 temp_10=36 temp_11=32 temp_12=33 temp_13=48 temp_14=29 temp_15=32 temp_16=30 temp_17=29 temp_18=39 temp_19=37 temp_20=38 temp_21=45 temp_22=42 temp_23=39 temp_24=48 temp_25=35 temp_26=46 temp_27=35 temp_28=71 | fan_1=46%;0;0 fan_2=46%;0;0 fan_3=46%;0;0 fan_4=46%;0;0 'temp_1_ambient'=21;42;42 'temp_2_cpu#1'=40;82;82 'temp_3_cpu#2'=40;82;82 'temp_4_memory_bd'=35;87;87 'temp_5_memory_bd'=34;78;78 'temp_6_memory_bd'=37;87;87 'temp_7_memory_bd'=32;78;78 'temp_8_memory_bd'=36;87;87 'temp_9_memory_bd'=32;78;78 'temp_10_memory_bd'=36;87;87 'temp_11_memory_bd'=32;78;78 
'temp_12_power_supply_bay'=33;59;59 'temp_13_power_supply_bay'=48;73;73 'temp_14_memory_bd'=29;60;60 'temp_15_processor_zone'=32;60;60 'temp_16_processor_zone'=3 or OK - ignoring 16 dimms with status 'n/a' , System: 'proliant dl360p gen8', S/N: 'CZJ2340R6C', ROM: 'P71 08/20/2012', hardware working fine, da: 1 logical drives, 4 physical drives ''' return {'status': output.split(' - ')[0], 'message': output.split(' - ')[-1].strip('\n')} @staticmethod @error_wrapped def _parse_memory(output): result = dict((a.split('=')[0], a.split('=')[1].split(';')[0]) for a in output.split('|')[-1].split()) conv = { 't': 1000000000, 'tb': 1000000000, 'tib': 1073741824, 'g': 1000000, 'gb': 1000000, 'gib': 1048576, 'm': 1000, 'mb': 1000, 'mib': 1024, 'k': 1, 'kb': 1, 'kib': 1.024, } p = re.compile(r'^(\d+(?:\.\d+)?)(?:\s+)?(\w+)?$', re.I) for k in result: parts = p.findall(result[k]) if len(parts): v, u = parts[0] result[k] = (float(v) * conv[u.lower()] if u and u.lower() in conv else float(v)) return result @staticmethod @error_wrapped def _parse_disks(output): '''try to parse output of /usr/lib/nagios/plugins/check_disk -w 10% -c 5% -u MB >>> import json; json.dumps(NagiosWrapper._parse_disks('DISK OK - free space: / 80879 MB (69% inode=99%); /dev 64452 MB (99% inode=99%); /selinux 0 MB (100% inode=99%); | /=35303MB;110160;116280;0;122401 /dev=0MB;58006;61229;0;64452 /selinux=0MB;0;0;0;0'), sort_keys=True) '{"/": {"total_mb": 116280, "used_mb": 35303}, "/dev": {"total_mb": 61229, "used_mb": 0}}' ''' r = re.compile('[^0-9.]') performance_data = output.split('|')[-1] result = dict((m[0], {'used_mb': int(r.sub('', f[0])), 'total_mb': int(int(f[4]) * USABLE_DISKSPACE_FACTOR)}) for (m, f) in [[s.split(';') for s in a.split('=')] for a in performance_data.split()] if int(f[4]) != 0) return result if __name__ == '__main__': if len(sys.argv) == 3: checkname = sys.argv[2] check = NagiosWrapper(sys.argv[1]) print json.dumps(check.nrpe(checkname), indent=4) elif len(sys.argv) == 4: checkname = sys.argv[2] query = sys.argv[3] check = NagiosWrapper(sys.argv[1]) print json.dumps(check.win(checkname, query=query), indent=4) elif len(sys.argv) == 5: checkname = sys.argv[2] path = sys.argv[3] query = sys.argv[4] check = NagiosWrapper(sys.argv[1]) print json.dumps(check.win(checkname, path=path, query=query), indent=4) elif len(sys.argv) == 6: checkname = sys.argv[2] directory = sys.argv[3] epoch = sys.argv[4] name = sys.argv[5] check = NagiosWrapper(sys.argv[1]) print json.dumps(check.nrpe(checkname, directory=directory, epoch=epoch, name=name), indent=4) PKGvi8zmon_worker_monitor/builtins/plugins/time_.worker_plugin[Core] Name = time Module = time_ [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an time functions. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY PKhGB[9zmon_worker_monitor/builtins/plugins/cassandra_wrapper.py#!/usr/bin/env python # -*- coding: utf-8 -*- import logging #from cassandra.io.libevreactor import LibevConnection from cassandra.cluster import Cluster from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial logger = logging.getLogger('zmon-worker.cassandra-function') class CassandraFactory(IFunctionFactoryPlugin): def __init__(self): super(CassandraFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(CassandraWrapper, node=factory_ctx.get('host')) class CassandraWrapper(object): def __init__(self, node, keyspace, port=9042, connect_timeout=1): # for now using a single host / node should be seed nodes or at least available nodes self.node = node self.port = port self.keyspace = keyspace self.connect_timeout = connect_timeout def execute(self, stmt): cl = Cluster([self.node], connect_timeout=self.connect_timeout) #cl.connection_class = LibevConnection session = None try: session = cl.connect() session.set_keyspace(self.keyspace) rs = session.execute(stmt) result = [] for r in rs: result.append(r) return result finally: cl.shutdown(); return {}PKG`Dzmon_worker_monitor/builtins/plugins/cassandra_wrapper.worker_plugin[Core] Name = cassandra Module = cassandra_wrapper [Documentation] Author = Jan Mussler Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides Cassandra query capabilities [Configuration] PKhG5`DD,zmon_worker_monitor/builtins/plugins/snmp.py#!/usr/bin/env python # -*- coding: utf-8 -*- from zmon_worker_monitor.zmon_worker.errors import SnmpError from pysnmp.entity.rfc3413.oneliner import cmdgen from pysnmp.proto.rfc1902 import Integer, OctetString, Counter32, Counter64 import re import subprocess32 from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial class SnmpFactory(IFunctionFactoryPlugin): def __init__(self): super(SnmpFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(SnmpWrapper, host=factory_ctx['host']) class SnmpWrapper(object): def __init__(self, host, community='public', version='v2c', timeout=5): if re.match(r'^[0-9]+$', str(timeout)): self.timeout = timeout if re.match(r'^[\w\-.]+$', host): self.host = host self.generator = cmdgen.CommandGenerator() self.transport = cmdgen.UdpTransportTarget((host, 161), timeout=timeout) if version == 'v2c': # self.comm_data = cmdgen.CommunityData('server', community, 1) # 1 means version SNMP v2c self.comm_data = cmdgen.CommunityData(community) def memory(self): '''return {'free': 3, ..} # 3 UCD-SNMP-MIB::memTotalSwap.0 = INTEGER: 4194300 kB # 4 
UCD-SNMP-MIB::memAvailSwap.0 = INTEGER: 4194300 kB # 5 UCD-SNMP-MIB::memTotalReal.0 = INTEGER: 32816032 kB # 6 UCD-SNMP-MIB::memAvailReal.0 = INTEGER: 9392600 kB # 11 UCD-SNMP-MIB::memTotalFree.0 = INTEGER: 13586900 kB # 12 UCD-SNMP-MIB::memMinimumSwap.0 = INTEGER: 16000 kB # 13 UCD-SNMP-MIB::memShared.0 = INTEGER: 0 kB # 14 UCD-SNMP-MIB::memBuffer.0 = INTEGER: 299992 kB # 15 UCD-SNMP-MIB::memCached.0 = INTEGER: 6115656 kB ''' result = {} oids = { 'swap_total': '1.3.6.1.4.1.2021.4.3.0', 'swap_free': '1.3.6.1.4.1.2021.4.4.0', 'ram_total': '1.3.6.1.4.1.2021.4.5.0', 'ram_free': '1.3.6.1.4.1.2021.4.6.0', 'ram_total_free': '1.3.6.1.4.1.2021.4.11.0', 'swap_min': '1.3.6.1.4.1.2021.4.12.0', 'ram_shared': '1.3.6.1.4.1.2021.4.13.0', 'ram_buffer': '1.3.6.1.4.1.2021.4.14.0', 'ram_cache': '1.3.6.1.4.1.2021.4.15.0', } for k, oid in oids.items(): val = self._get_cmd(oid) result[k] = self.parse(Integer, int, val) return result def load(self): ''' Return CPU Load Averages Example result: {"load1":0.95,"load5":0.69,"load15":0.72} ''' result = {} oids = {'load1': '1.3.6.1.4.1.2021.10.1.3.1', 'load5': '1.3.6.1.4.1.2021.10.1.3.2', 'load15': '1.3.6.1.4.1.2021.10.1.3.3'} for k, oid in oids.items(): val = self._get_cmd(oid) result[k] = self.parse(OctetString, lambda x: float(str(x)), val) return result def cpu(self): ''' Returns CPU Percentage Example result: {"cpu_system":0,"cpu_user":19,"cpu_idle":79} ''' result = {} oids = {'cpu_idle': '1.3.6.1.4.1.2021.11.11.0', 'cpu_user': '1.3.6.1.4.1.2021.11.9.0', 'cpu_system': '1.3.6.1.4.1.2021.11.10.0'} for k, oid in oids.items(): val = self._get_cmd(oid) result[k] = self.parse(Integer, int, val) return result def cpu_raw(self): result = {} oids = { 'raw_cpu_user': '1.3.6.1.4.1.2021.11.50.0', 'raw_cpu_system': '.1.3.6.1.4.1.2021.11.52.0', 'raw_cpu_nice': '.1.3.6.1.4.1.2021.11.51.0', 'raw_cpu_idle': '.1.3.6.1.4.1.2021.11.53.0', } for k, oid in oids.items(): val = self._get_cmd(oid) result[k] = self.parse(Counter32, int, val) return result def df(self): ''' Return disk usage information Example result: {"/data/postgres-wal-nfs-itr":{"percentage_space_used":0,"used_space":160,"total_size":524288000,"device":"zala0-1-stp-02:/vol/zal_pgwal","available_space":524287840,"percentage_inodes_used":0}} ''' # disk_table = '.1.3.6.1.4.1.2021.9' # base = '1.3.6.1.4.1.2021.13.15.1.1' base = '1.3.6.1.4.1.2021.9.1' base_idx = base + '.1.' base_name = base + '.2.' results = {} idx2name = {} result_all = self._get_walk(base) for oid in sorted(result_all.keys()): if base_idx in oid: val = str(result_all[oid]) idx2name[val] = None elif base_name in oid: idx = oid.split('.')[-1] val = result_all[oid] idx2name[idx] = val for oid in sorted(result_all.keys()): if oid in base_idx or oid in base_name: continue idx = oid.split('.')[-1] name = str(idx2name[idx]) if 'loop' in name or 'ram' in name: continue results[name] = results.get(name, {}) tname = oid kind = oid.split('.')[-2] if kind == '3': tname = 'device' elif kind == '6': tname = 'total_size' # kBytes elif kind == '7': tname = 'available_space' # kBytes elif kind == '8': tname = 'used_space' # kBytes elif kind == '9': tname = 'percentage_space_used' elif kind == '10': tname = 'percentage_inodes_used' elif kind == '11': tname = 'total_size' # Gauge32 elif kind == '13': tname = 'available_space' # Gauge32 elif kind == '15': tname = 'used_space' # Gauge32 else: continue results[name][tname] = result_all[oid] return results def logmatch(self): # logmatch_table = '.1.3.6.1.4.1.2021.16' base = '1.3.6.1.4.1.2021.16.2.1' base_idx = base + '.1.' 
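# logMatch table walk: column .1. is the row index and .2. the configured match name;
# the loop below maps the remaining columns to file (3), regex (4), global_count (5),
# current_count (7) and last_count (9).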
base_name = base + '.2.' results = {} idx2name = {} result_all = self._get_walk(base) for oid in sorted(result_all.keys()): if base_idx in oid: val = str(result_all[oid]) idx2name[val] = None elif base_name in oid: idx = oid.split('.')[-1] val = result_all[oid] idx2name[idx] = val for oid in sorted(result_all.keys()): if oid in base_idx or oid in base_name: continue idx = oid.split('.')[-1] name = str(idx2name[idx]) results[name] = results.get(name, {}) tname = oid kind = oid.split('.')[-2] if kind == '3': tname = 'file' # file name elif kind == '4': tname = 'regex' # regex string elif kind == '5': tname = 'global_count' # counter32 elif kind == '7': tname = 'current_count' # counter32 since last logrotation elif kind == '9': tname = 'last_count' # counter32 since last read else: continue results[name][tname] = result_all[oid] return results def interfaces(self): # IF-MIB::interfaces_table = '1.3.6.1.2.1.2.' # IF-MIB::ifMIB_table = '1.3.6.1.2.1.31' base = '1.3.6.1.2.1.2.2.1' base_idx = base + '.1.' base_name = base + '.2.' results = {} idx2name = {} result_all = self._get_walk(base) for oid in sorted(result_all.keys()): if base_idx in oid: val = str(result_all[oid]) idx2name[val] = None elif base_name in oid: idx = oid.split('.')[-1] val = result_all[oid] idx2name[idx] = val for oid in sorted(result_all.keys()): if oid in base_idx or oid in base_name: continue idx = oid.split('.')[-1] name = str(idx2name[idx]) results[name] = results.get(name, {}) tname = oid kind = oid.split('.')[-2] if kind == '7': tname = 'opStatus' # operational status elif kind == '8': tname = 'adStatus' # administratal status elif kind == '13': tname = 'in_discards' # counter32 elif kind == '14': tname = 'in_error' # counter32 elif kind == '19': tname = 'out_discards' # counter32 elif kind == '20': tname = 'out_error' # counter32 else: continue results[name][tname] = result_all[oid] # IF-MIB::ifMIB_table = '1.3.6.1.2.1.31' base = '1.3.6.1.2.1.31.1.1.1' base_idx = '1.3.6.1.2.1.2.2.1.1.' base_name = base + '.1.' 
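# Second walk over the IF-MIB extension table (1.3.6.1.2.1.31.1.1.1): results is
# deliberately not reset, so the 64-bit octet counters (columns .6/.10) and the
# interface speed (.15) are merged into the per-interface dicts built from ifTable above.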
# results = {} idx2name = {} result_all = self._get_walk(base) for oid in sorted(result_all.keys()): if base_idx in oid: val = str(result_all[oid]) idx2name[val] = None elif base_name in oid: idx = oid.split('.')[-1] val = result_all[oid] idx2name[idx] = val for oid in sorted(result_all.keys()): if oid in base_idx or oid in base_name: continue idx = oid.split('.')[-1] name = str(idx2name[idx]) results[name] = results.get(name, {}) tname = oid kind = oid.split('.')[-2] if kind == '6': tname = 'in_octets' # counter64 elif kind == '10': tname = 'out_octets' # counter64 elif kind == '15': tname = 'speed' # gauge32 else: continue results[name][tname] = result_all[oid] return results def postgres_backup(self): # val = self._get_mib('public', 'NET-SNMP-EXTEND-MIB', 'check_postgres_backup') # res = self.parse(OctetString, str, val) # Workaround for a too large check response from the script(too large udp packets are blocked by the fw, so we use tcp) cmd = \ '/usr/bin/snmpget -v2c -c public -t {timeout} tcp:{host} \'NET-SNMP-EXTEND-MIB::nsExtendOutputFull."check_postgres_backup"\''.format(timeout=self.timeout, host=self.host) try: output = subprocess32.check_output(cmd, shell=True, timeout=self.timeout, stderr=subprocess32.STDOUT) except subprocess32.CalledProcessError, e: database_backup_size, warnings, check_duration = {}, [str(e.output)], 0 else: res = output.partition(': ')[2] s = res.split('|') warning = s[0] perfdata = s[-1] database_backup_size = {} check_duration = 0 for pd in perfdata.split(' '): k, _, v = pd.partition('=') v = v.split(';')[0] if k == 'time': check_duration = v else: database_backup_size[k] = v warnings = warning.split(';') return {'database_backup_size': database_backup_size, 'warnings': warnings, 'check_duration': check_duration} def disk_pgxlog(self): result = {} output = str(self._get_mib('public', 'NET-SNMP-EXTEND-MIB', 'disk_pgxlog')) output = [i for i in re.split('\s{1,}|\t|\n', output) if i.isdigit() or i.startswith('/')] output = [output[i:i + 7] for i in range(0, len(output), 7)] output = [[ i[1], int(int(i[0]) / 1024), i[2], int(i[3]), int(i[4]), int(i[5]), i[6], ] for i in output] for i in output: name = str(i[0]) result[name] = {} for index in range(len(i)): val = i[index] if val == i[1]: tname = 'du_dir' elif val == i[2]: tname = 'filesystem' elif val == i[3]: tname = 'totalspace' elif val == i[4]: tname = 'used_space' elif val == i[5]: tname = 'available_space' elif val == i[6]: tname = 'mounted_on' else: continue result[name][tname] = i[index] return result def conntrackstats(self): output = str(self._get_mib('public', 'NET-SNMP-EXTEND-MIB', 'conntrackstats')) return dict((a.split('=')[0].strip(), int(a.split('=')[1])) for a in output.split('|')) # # snmp functions # def get_list(self, prefix='get', convert=int, *oids): h = {} for idx, oid in enumerate(oids): k = '{0}{1}'.format(prefix, idx) h[k] = self.get(oid, k, convert) return h def get(self, oid, name, convert): '''Example: get('1.3.6.1.4.1.2021.4.6.0', 'foo', int) -> {'foo': 42} ''' val = self._get_cmd(oid) result = convert(val) return {name: result} def _get_mib(self, community, prefix, extend, mib_result='nsExtendOutputFull', path=None): '''Parameter: mib_result can be 'nsExtendOutputFull', 'nsExtendResult', .. 
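The extend argument names the script configured via snmpd's 'extend' mechanism, and
path, if given, is added as an extra MIB source directory for resolving the symbolic name.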
Example: _get_mib("public", "NET-SNMP-EXTEND-MIB", "check_postgres_backup") is the same like: % snmpget -v2c -c public z-storage02 'NET-SNMP-EXTEND-MIB::nsExtendOutputFull."check_postgres_backup"' ''' mib = None if path: mib = cmdgen.MibVariable(prefix, mib_result, extend).addMibSource(path) else: mib = cmdgen.MibVariable(prefix, mib_result, extend) real_fun = getattr(self.generator, 'getCmd') # SNMPGET res = errorIndication, errorStatus, errorIndex, varBinds = real_fun(self.comm_data, self.transport, mib) if not errorIndication is None or errorStatus is True: msg = 'Error: %s %s %s %s' % res raise SnmpError(msg) else: _, val = varBinds[0] return val def _get_mib_bulkwalk(self, community, prefix, table, path=None): ''' % snmpbulkwalk -v2c -c public myhost123 TCP-MIB::tcpConnTable ''' mib = None if path: mib = cmdgen.MibVariable(prefix, table).addMibSource(path) else: mib = cmdgen.MibVariable(prefix, table) real_fun = getattr(self.generator, 'bulkCmd') # SNMPBULKWALK res = errorIndication, errorStatus, errorIndex, varBinds = real_fun(self.comm_data, self.transport, 0, 50, mib, max_rows=100, ignore_non_increasing_oid=True) if not errorIndication is None or errorStatus is True: msg = 'Error: %s %s %s %s' % res raise SnmpError(msg) else: res = {} for items in varBinds: oid = str(items[0][0]) val = items[0][1] if isinstance(val, Counter64) or isinstance(val, Counter32) or isinstance(val, Integer): res[oid] = int(val) else: res[oid] = str(val) return res def _get_walk(self, oid): '''Returns a dictionary of oid -> value''' real_fun = getattr(self.generator, 'nextCmd') # SNMPWALK res = errorIndication, errorStatus, errorIndex, varBinds = real_fun(self.comm_data, self.transport, oid) if not errorIndication is None or errorStatus is True: msg = 'Error: %s %s %s %s' % res raise SnmpError(msg) else: res = {} for items in varBinds: oid = str(items[0][0]) val = items[0][1] if isinstance(val, Counter64) or isinstance(val, Counter32) or isinstance(val, Integer): res[oid] = int(val) else: res[oid] = str(val) return res def _get_cmd(self, oid): real_fun = getattr(self.generator, 'getCmd') # SNMPGET res = errorIndication, errorStatus, errorIndex, varBinds = real_fun(self.comm_data, self.transport, oid) if not errorIndication is None or errorStatus is True: msg = 'Error: %s %s %s %s' % res raise SnmpError(msg) else: _, val = varBinds[0] return val # Convert SNMP data to python data def parse(self, clazz, convert, val): '''Example: self.parse(Integer, int, Integer(11040956))''' if not val: return None if isinstance(val, clazz): return convert(val) raise SnmpError('Could not convert [{}] with {} into {}'.format(val, convert, clazz)) PK,uGy;+zmon_worker_monitor/builtins/plugins/jmx.py#!/usr/bin/env python # -*- coding: utf-8 -*- from zmon_worker_monitor.zmon_worker.errors import JmxQueryError import json import requests import time from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial class JmxFactory(IFunctionFactoryPlugin): def __init__(self): super(JmxFactory, self).__init__() self._jmxquery_host = None self._jmxquery_port = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._jmxquery_host = conf['jmxquery.host'] self._jmxquery_port = conf['jmxquery.port'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that 
implements a check function """ return propartial(JmxWrapper, jmxqueryhost=self._jmxquery_host, jmxqueryport=self._jmxquery_port, host=factory_ctx['host'], port=factory_ctx['jmx_port']) class JmxWrapper(object): def __init__(self, jmxqueryhost, jmxqueryport, host, port, timeout=5): #jmxquery running where? self.jmxquery_host = jmxqueryhost self.jmxquery_port = jmxqueryport self.host = host self.port = port self.timeout = timeout self._queries = [] @staticmethod def _transform_results(data): '''Transform JSON returned from JMX Query to a reasonable dict >>> JmxWrapper._transform_results({'results':[{'beanName':'mybean','attributes':{'HeapMemoryUsage':1}}]}) {'HeapMemoryUsage': 1} >>> JmxWrapper._transform_results({'results':[{'beanName':'a','attributes':{'x':1}}, {'beanName': 'b', 'attributes': {'y': 2}}]}) {'a': {'x': 1}, 'b': {'y': 2}} >>> JmxWrapper._transform_results({'results':[{'beanName':'a','attributes':{'x':{'compositeType': {}, 'contents': {'y':7}}}}]}) {'x': {'y': 7}} ''' results = data['results'] d = {} for result in results: attr = result['attributes'] for key, val in attr.items(): if 'password' in key.lower(): attr[key] = '' continue if isinstance(val, dict) and 'compositeType' in val and 'contents' in val: # special unpacking of JMX CompositeType objects (e.g. "HeapMemoryUsage") # we do not want all the CompositeType meta information => just return the actual values attr[key] = val['contents'] d[result['beanName']] = attr if len(d) == 1: # strip the top-level "bean name" keys return d.values()[0] else: return d def query(self, bean, *attributes): self._queries.append((bean, attributes)) return self def _jmxquery_queries(self): for bean, attributes in self._queries: query = bean if attributes: query += '@' + ','.join(attributes) yield query def results(self): if not self._queries: raise ValueError('No query to execute') try: r = requests.get('http://{}:{}'.format(self.jmxquery_host, self.jmxquery_port), params={'host': self.host, 'port': self.port, 'query': self._jmxquery_queries()}, timeout=2) if r.status_code == 500: raise Exception('-do-one-try-') except: time.sleep(1) r = requests.get('http://{}:{}'.format(self.jmxquery_host, self.jmxquery_port), params={'host': self.host, 'port': self.port, 'query': self._jmxquery_queries()}, timeout=2) output = r.text try: data = json.loads(output) except: raise JmxQueryError(output) return self._transform_results(data) if __name__ == '__main__': # example call: # JAVA_HOME=/opt/jdk1.7.0_21/ python jmx.py restsn03 49600 jmxremote.password java.lang:type=Memory HeapMemoryUsage import sys jmx = JmxWrapper(*sys.argv[1:4]) print jmx.query(*sys.argv[4:]).results() PK,uG9C5zmon_worker_monitor/builtins/plugins/redis_wrapper.py#!/usr/bin/env python # -*- coding: utf-8 -*- import redis from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial from zmon_worker_monitor import plugin_manager STATISTIC_GAUGE_KEYS = frozenset([ 'blocked_clients', 'connected_clients', 'connected_slaves', 'instantaneous_ops_per_sec', 'used_memory', 'used_memory_rss', ]) STATISTIC_COUNTER_KEYS = frozenset([ 'evicted_keys', 'expired_keys', 'keyspace_hits', 'keyspace_misses', 'total_commands_processed', 'total_connections_received', ]) class RedisFactory(IFunctionFactoryPlugin): def __init__(self): super(RedisFactory, self).__init__() # fields to store dependencies: plugin depends on 1 other plugin self.counter_factory = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their 
plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ # load plugins dependencies and store them locally for efficiency if not self.counter_factory: self.counter_factory = plugin_manager.get_plugin_obj_by_name('counter', 'Function') return propartial(RedisWrapper, counter=self.counter_factory.create(factory_ctx), host=factory_ctx['host']) class RedisWrapper(object): '''Class to allow only readonly access to underlying redis connection''' def __init__(self, counter, host, port=6379, db=0, socket_connect_timeout=1): self._counter = counter('') self.__con = redis.StrictRedis(host, port, db, socket_connect_timeout=socket_connect_timeout) def llen(self, key): return self.__con.llen(key) def lrange(self, key, start, stop): return self.__con.lrange(key, start, stop) def get(self, key): return self.__con.get(key) def hget(self, key, field): return self.__con.hget(key, field) def hgetall(self, key): return self.__con.hgetall(key) def statistics(self): ''' Return general Redis statistics such as operations/s Example result:: { "blocked_clients": 2, "commands_processed_per_sec": 15946.48, "connected_clients": 162, "connected_slaves": 0, "connections_received_per_sec": 0.5, "dbsize": 27351, "evicted_keys_per_sec": 0.0, "expired_keys_per_sec": 0.0, "instantaneous_ops_per_sec": 29626, "keyspace_hits_per_sec": 1195.43, "keyspace_misses_per_sec": 1237.99, "used_memory": 50781216, "used_memory_rss": 63475712 } ''' data = self.__con.info() stats = {} for key in STATISTIC_GAUGE_KEYS: stats[key] = data.get(key) for key in STATISTIC_COUNTER_KEYS: stats['{}_per_sec'.format(key).replace('total_', '')] = \ round(self._counter.key(key).per_second(data.get(key, 0)), 2) stats['dbsize'] = self.__con.dbsize() return stats if __name__ == '__main__': import sys import json # init plugin manager and collect plugins, as done by Zmon when worker is starting plugin_manager.init_plugin_manager() plugin_manager.collect_plugins(load_builtins=True, load_env=True) factory_ctx = { 'redis_host': 'localhost', } counter = plugin_manager.get_plugin_obj_by_name('counter', 'Function').create(factory_ctx) wrapper = RedisWrapper(counter, sys.argv[1]) print json.dumps(wrapper.statistics(), indent=4, sort_keys=True) PKGeq>zmon_worker_monitor/builtins/plugins/exceptions_.worker_plugin[Core] Name = exceptions Module = exceptions_ [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = value PKG1R7zmon_worker_monitor/builtins/plugins/snmp.worker_plugin[Core] Name = snmp Module = snmp [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
;; key = value PKG):zmon_worker_monitor/builtins/plugins/counter.worker_plugin[Core] Name = counter Module = counter [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = value PKHG@@.zmon_worker_monitor/builtins/plugins/scalyr.py#!/usr/bin/env python # -*- coding: utf-8 -*- import json import requests import logging from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial logger = logging.getLogger('zmon-worker.scalyr-function') class ScalyrWrapperFactory(IFunctionFactoryPlugin): def __init__(self): super(ScalyrWrapperFactory, self).__init__() def configure(self, conf): self.read_key = conf.get('read.key', '') return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(ScalyrWrapper, read_key=self.read_key) class ScalyrWrapper(object): def __init__(self, read_key): self.numeric_url = 'https://www.scalyr.com/api/numericQuery' self.timeseries_url = 'https://www.scalyr.com/api/timeseriesQuery' self.facet_url = 'https://www.scalyr.com/api/facetQuery' self.read_key = read_key def count(self, query, minutes=5): val = { 'token': self.read_key, 'queryType': 'numeric', 'filter': query, 'function': 'count', 'startTime': str(minutes)+'m', 'priority': 'low', 'buckets': 1 } r = requests.post(self.numeric_url, data=json.dumps(val), headers={"Content-Type": "application/json"}) j = r.json() if 'values' in j: return j['values'][0] else: return j def function(self, function, query, minutes=5): val = { 'token': self.read_key, 'queryType': 'numeric', 'filter': query, 'function': function, 'startTime': str(minutes)+'m', 'priority': 'low', 'buckets': 1 } r = requests.post(self.numeric_url, data=json.dumps(val), headers={"Content-Type": "application/json"}) j = r.json() if 'values' in j: return j['values'][0] else: return j def facets(self, filter, field, max_count=5, minutes=30, prio="low"): val = { 'token': self.read_key, 'queryType': 'facet', 'filter': filter, 'field': field, 'maxCount': max_count, "startTime": str(minutes)+"m", "priority": prio } r = requests.post(self.facet_url, data=json.dumps(val), headers={"Content-Type": "application/json"}) j = r.json() return j def timeseries(self, filter, function="count", minutes=30, buckets=1, prio="low"): val = { 'token': self.read_key, 'queries': [ { "filter": filter, "function": function, "startTime": str(minutes)+"m", "buckets": buckets, "priority": prio } ] } r = requests.post(self.timeseries_url, data=json.dumps(val), headers={"Content-Type": "application/json"}) j = r.json() if j['status'] == 'success': if len(j['results'][0]['values'])==1: return j['results'][0]['values'][0] return map(lambda x: x * minutes / buckets, j['results'][0]['values']) return j if __name__ == '__main__': import os s = ScalyrWrapper(read_key=os.getenv('SCALYR_READ_KEY')) print s.count(query="$application_id='zmon-scheduler'") PKhG EHH1zmon_worker_monitor/builtins/plugins/sql_mysql.py#!/usr/bin/env python # -*- coding: utf-8 -*- import sys import re from zmon_worker_monitor.zmon_worker.errors import DbError from 
zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial DEFAULT_PORT = 3306 MAX_RESULTS = 100 CONNECTION_RE = \ re.compile(r''' ^(?P[^:/]+) # host - either IP o hostname (:(?P\d+))? # port - integer, optional /(?P\w+) # database name $ ''' , re.X) class MySqlFactory(IFunctionFactoryPlugin): def __init__(self): super(MySqlFactory, self).__init__() # fields from config self._user = None self._pass = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._user = conf['user'] self._pass = conf['pass'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(MySqlWrapper, shards=factory_ctx['shards'], user=self._user, password=self._pass, timeout=factory_ctx['soft_time_limit'] * 1000) def _import_db_driver(): module_alternatives = 'MySQLdb', 'pymysql' for module in module_alternatives: try: return __import__(module, globals(), locals(), [], -1) except Exception, e: if module == module_alternatives[-1]: raise else: # print is well supported by celery, this will end up as a log entry print 'Warning: Import of module {} failed: {}'.format(module, e) class MySqlWrapper(object): '''Shard-aware SQL adapter sql().execute('SELECT 1').result() ''' def __init__(self, shards, user='nagios', password='', timeout=60000, shard=None): ''' Parameters ---------- shards: dict A dict of shard definitions where key is the shard's name and value is the host/database string. user: str password: str timeout: int Statement timeout in milliseconds. shard: str Optional shard name. If provided, the check will be run on only one shard matching given name. ''' if not shards: raise ValueError('SqlWrapper: No shards defined') if shard and not shards.get(shard): raise ValueError('SqlWrapper: Shard {} not found in shards definition'.format(shard)) self._cursors = [] self._stmt = None mdb = _import_db_driver() for shard_def in ([shards[shard]] if shard else shards.values()): m = CONNECTION_RE.match(shard_def) if not m: raise ValueError('Invalid shard connection: {}'.format(shard_def)) try: conn = mdb.connect(host=m.group('host'), user=user, passwd=password, db=m.group('dbname'), port=(int(m.group('port')) if int(m.group('port')) > 0 else DEFAULT_PORT), connect_timeout=timeout) except Exception, e: raise DbError(str(e), operation='Connect to {}'.format(shard_def)), None, sys.exc_info()[2] # TODO: Find a way to enforce readonly=True as it is done in postgres Wrapper # TODO: Do we need to set charset="utf8" and use_unicode=True in connection? conn.autocommit(True) self._cursors.append(conn.cursor(mdb.cursors.DictCursor)) def execute(self, stmt): self._stmt = stmt return self def result(self, agg=sum): '''return single row result, will result primitive value if only one column is selected''' result = {} try: for cur in self._cursors: try: cur.execute(self._stmt) row = cur.fetchone() if row: for k, v in row.items(): result[k] = result.get(k, []) result[k].append(v) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] for k, v in result.items(): try: result[k] = agg(v) except: # just use list if aggregation function fails # (e.g. 
if we try to sum strings) result[k] = v if len(result) == 1: return result.values()[0] else: return result def results(self): '''return many rows''' results = [] try: for cur in self._cursors: try: cur.execute(self._stmt) rows = cur.fetchmany(MAX_RESULTS) for row in rows: results.append(dict(row)) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] return results if __name__ == '__main__': default_dbname = 'mysql' default_sql_stmt = 'SELECT VERSION()' # SELECT Host,User FROM user if len(sys.argv) in (6, 7): if len(sys.argv) == 7: shards = {'test': sys.argv[1] + ':' + sys.argv[2] + '/' + sys.argv[3]} sql_stmt = sys.argv[6] else: print 'executing default statement:', default_sql_stmt shards = {'test': sys.argv[1] + ':' + sys.argv[2] + '/' + default_dbname} sql_stmt = default_sql_stmt check = MySqlWrapper(shards, user=sys.argv[4], password=sys.argv[5]) # print '>>> many results:\n', check.execute(sql_stmt).results() print '>>> one result:\n', check.execute(sql_stmt).result() else: print '{} [sql_stmt]'.format(sys.argv[0]) PKGӷ{9zmon_worker_monitor/builtins/plugins/scalyr.worker_plugin[Core] Name = scalyr Module = scalyr [Documentation] Author = Jan Mussler Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an Scalyr functions [Configuration] PKGwR(7zmon_worker_monitor/builtins/plugins/http.worker_plugin[Core] Name = http Module = http [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
;; key = value PKGFҏ:zmon_worker_monitor/builtins/plugins/mongodb.worker_plugin[Core] Name = mongodb Module = mongodb [Documentation] Author = Jan Mussler Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides MongoDB query capabilities [Configuration] PKhGo;0zmon_worker_monitor/builtins/plugins/__init__.py__author__ = 'avalles' PKhGτ<0zmon_worker_monitor/builtins/plugins/kairosdb.py#!/usr/bin/env python # -*- coding: utf-8 -*- import logging import requests import json import sys from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial logger = logging.getLogger('zmon-worker.kairosdb-function') class KairosdbFactory(IFunctionFactoryPlugin): def __init__(self): super(KairosdbFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(KairosdbWrapper, url=factory_ctx.get('entity_url')) class KairosdbWrapper(object): def __init__(self, url): self.url = url def query(self, name, group_by = [], tags = None, start = -5, end = 0, time_unit='seconds', aggregators = None): url = self.url + '/api/v1/datapoints/query' q = { "start_relative": { "value": start, "unit": time_unit }, "metrics": [{ "name": name, }] } if aggregators is not None: q["metrics"][0]["aggregators"] = aggregators if tags is not None: q["metrics"][0]["tags"] = tags try: response = requests.post(url, json=q) if response.status_code == requests.codes.ok: return response.json()["queries"][0] else: raise Exception("KairosDB Query failed: " + json.dumps(q)) except requests.Timeout, e: raise HttpError('timeout', self.url), None, sys.exc_info()[2] except requests.ConnectionError, e: raise HttpError('connection failed', self.url), None, sys.exc_info()[2] def tagnames(self): return [] def tagnames(self): return [] def metric_tags(self): return {}PKhGm1zmon_worker_monitor/builtins/plugins/sql_mssql.py#!/usr/bin/env python # -*- coding: utf-8 -*- import sys import logging from zmon_worker_monitor.zmon_worker.errors import DbError from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial logger = logging.getLogger(__name__) # requires: # sudo apt-get install freetds-dev # sudo pip install pymssql # default port DEFAULT_PORT = 1433 MAX_RESULTS = 100 class MsSqlFactory(IFunctionFactoryPlugin): def __init__(self): super(MsSqlFactory, self).__init__() # fields from config self._user = None self._pass = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._user = conf['user'] self._pass = conf['pass'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(MsSqlWrapper, factory_ctx['host'], factory_ctx['port'], factory_ctx['database'], user=self._user, password=self._pass, timeout=factory_ctx['soft_time_limit']) def _import_db_driver(): try: _cx_MsSql = __import__('pymssql', globals(), locals(), [], -1) except Exception, e: 
logger.exception('Import of module pymssql failed') raise return _cx_MsSql class MsSqlWrapper(object): # Note: Timeout must be in seconds def __init__(self, host, port, database, user='robot_zmon', password='', enabled=True, timeout=60): self.__stmt = None self.__enabled = enabled if self.__enabled: self.__cx_MsSql = _import_db_driver() port = (int(port) if port and str(port).isdigit() else DEFAULT_PORT) try: self.__conn = self.__cx_MsSql.connect('{}:{}'.format(host, port), user, password, database, timeout, as_dict=True) self.__cursor = self.__conn.cursor() except Exception, e: raise DbError(str(e), operation='Connect to {}:{}'.format(host, port)), None, sys.exc_info()[2] def execute(self, stmt): self.__stmt = stmt return self def result(self): # return single row result, will result primitive value if only one column is selected result = {} try: if self.__enabled and self.__cx_MsSql: cur = self.__cursor try: cur.execute(self.__stmt) row = cur.fetchone() if row: result = row finally: cur.close() except Exception, e: raise DbError(str(e), operation=self.__stmt), None, sys.exc_info()[2] if len(result) == 1: return result.values()[0] else: return result def results(self): # return many rows results = [] try: if self.__enabled and self.__cx_MsSql: cur = self.__cursor try: cur.execute(self.__stmt) rows = cur.fetchmany(MAX_RESULTS) for row in rows: results.append(row) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self.__stmt), None, sys.exc_info()[2] return results if __name__ == '__main__': logging.basicConfig(level=logging.DEBUG) default_sql_stmt = "SELECT 'ONE ROW' AS COLUMN_NAME" if len(sys.argv) in (6, 7): check = MsSqlWrapper(sys.argv[1], sys.argv[2], sys.argv[3], user=sys.argv[4], password=sys.argv[5]) if len(sys.argv) == 7: sql_stmt = sys.argv[6] else: print 'executing default statement:', default_sql_stmt sql_stmt = default_sql_stmt print '>>> one result:\n', check.execute(sql_stmt).result() # print '>>> many results:\n', check.execute(sql_stmt).results() else: print '{} [sql_stmt]'.format(sys.argv[0])PKGVD6zmon_worker_monitor/builtins/plugins/jmx.worker_plugin[Core] Name = jmx Module = jmx [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
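;; Illustrative example (not in the original file, values assumed): following the plugin.{plugin_name}.{key}
;; convention noted in the other plugin files, a global config entry plugin.jmx.jmxquery.port = 8075
;; would override the jmxquery.port default defined below.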
;; key = value jmxquery.host = localhost jmxquery.port = 8074 PKhG .zmon_worker_monitor/builtins/plugins/exasol.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ Query Exasol """ import tempfile import subprocess import os from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial class ExaplusFactory(IFunctionFactoryPlugin): def __init__(self): super(ExaplusFactory, self).__init__() # fields from config self._exacrm_cluster = None self._exacrm_user = None self._exacrm_pass = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._exacrm_cluster = conf['exacrm_cluster'] self._exacrm_user = conf['exacrm_user'] self._exacrm_pass = conf['exacrm_pass'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(ExaplusWrapper, cluster=self._exacrm_cluster, password=self._exacrm_pass, user=self._exacrm_user) class ExaplusWrapper(object): def __init__(self, cluster, user='ZALANDO_NAGIOS', password='', schema=False): self._err = None self._out = None self.user = user self.__password = password self.cluster = cluster self.schema = schema self.java_opts = ['-Djava.net.preferIPv4Stack=true', '-Djava.awt.headless=true', '-Xmx512m', '-Xms128m'] self.exaplus_opts = [ '-recoverConnection', 'OFF', '-retry', '0', '-lang', 'EN', '-q', '-x', '-Q', '10', '-pipe', ] self.jar_file = '/server/exasol/exaplus/current/exaplus.jar' def query(self, query): fd, name = tempfile.mkstemp(suffix='.sql', text=True) try: fh = os.fdopen(fd, 'w') fh.write('%s\n' % query) fh.flush() cmd = ['/usr/bin/java'] cmd.extend(self.java_opts) cmd.extend(['-jar', self.jar_file]) cmd.extend(['-c', self.cluster]) cmd.extend(['-u', self.user]) cmd.extend(['-p', self.__password]) cmd.extend(self.exaplus_opts) if self.schema: cmd.extend(['-s', self.schema]) cmd.extend(['-f', name]) # print "EXAPLUS="+" ".join(cmd) sub = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) d_out, d_err = sub.communicate() self._out = d_out self._err = d_err finally: os.unlink(name) return self def result(self): return self._out.split('\n'), self._err.split('\n') if __name__ == '__main__': import sys exaplus = ExaplusWrapper(sys.argv[1], sys.argv[2], sys.argv[3]) print exaplus.query('''select table_name, case when hours_between(systimestamp,last_export_success_time) < 24 then 'EXPORTED YESTERDAY (within 24 HOURS) - OK' else 'NOT EXPORTED within LAST 24 HOURS' end status, last_export_success_time LAST_EXPORT_TIME from STG.TARGET_LOADING_STATUS where table_name not in ('D_SHOP','F_CUSTOMER_SALES','F_PRODUCT_AVAILABILITY','F_UMS_CAMPAIGN_RESPONSE') order by 1 ;''' ).result() PK,uG$ڴ2zmon_worker_monitor/builtins/plugins/cloudwatch.py#!/usr/bin/env python # -*- coding: utf-8 -*- import boto3 import collections import datetime import fnmatch import json import sys import logging import requests import sys logging.getLogger('botocore').setLevel(logging.WARN) from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial class CloudwatchWrapperFactory(IFunctionFactoryPlugin): def __init__(self): super(CloudwatchWrapperFactory, self).__init__() def configure(self, conf): return def create(self, factory_ctx): """ Automatically called to create the check 
function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(CloudwatchWrapper, region=factory_ctx.get('entity').get('region', None)) def get_region(): r = requests.get('http://169.254.169.254/latest/dynamic/instance-identity/document', timeout=3) return r.json()['region'] def matches(dimensions, filters): for key, pattern in filters.items(): if not fnmatch.fnmatch(''.join(dimensions.get(key, '')), pattern): return False return True class CloudwatchWrapper(object): def __init__(self, region=None): if not region: region = get_region() self.client = boto3.client('cloudwatch', region_name=region) def query(self, dimensions, metric_name, statistics='Sum', namespace=None, unit=None, period=60, minutes=5): filter_dimension_keys = set() filter_dimension_pattern = {} for key, val in list(dimensions.items()): if val == 'NOT_SET': filter_dimension_keys.add(key) del dimensions[key] if val and '*' in val: filter_dimension_pattern[key] = val del dimensions[key] dimension_kvpairs = [{'Name': k, 'Value': v} for k, v in dimensions.items()] args = {'Dimensions': dimension_kvpairs, 'MetricName': metric_name} if namespace: args['Namespace'] = namespace metrics = self.client.list_metrics(**args) metrics = metrics['Metrics'] end = datetime.datetime.utcnow() start = end - datetime.timedelta(minutes=minutes) data = collections.defaultdict(int) data['dimensions'] = collections.defaultdict(int) for metric in metrics: metric_dimensions = {d['Name']: d['Value'] for d in metric['Dimensions']} if set(metric_dimensions.keys()) & filter_dimension_keys: continue if filter_dimension_pattern and not matches(metric_dimensions, filter_dimension_pattern): continue response = self.client.get_metric_statistics(Namespace=metric['Namespace'], MetricName=metric['MetricName'], Dimensions=metric['Dimensions'], StartTime=start, EndTime=end, Period=period, Statistics=[statistics]) data_points = response['Datapoints'] if data_points: for [dim_name, dim_val] in metric_dimensions.items(): if not dim_name in data['dimensions']: data['dimensions'][dim_name] = collections.defaultdict(int) data['dimensions'][dim_name][dim_val] += data_points[-1][statistics] data[metric['MetricName']] += data_points[-1][statistics] return data if __name__ == '__main__': logging.basicConfig(level=logging.INFO) cloudwatch = CloudwatchWrapper(sys.argv[1]) print "ELB result (eu-west-1 only):" elb_data = cloudwatch.query({'AvailabilityZone': 'NOT_SET', 'LoadBalancerName': 'pierone-*'}, 'Latency', 'Average') print(json.dumps(elb_data)) print "Billing result (us-east-1 only):" billing_data = cloudwatch.query({'Currency': 'USD'}, 'EstimatedCharges', 'Maximum', 'AWS/Billing', None, 3600, 60*4) print(json.dumps(billing_data)) PKhGaa2zmon_worker_monitor/builtins/plugins/sql_oracle.py#!/usr/bin/env python # -*- coding: utf-8 -*- import sys import os from zmon_worker_monitor.zmon_worker.errors import DbError from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial # default port Oracle Net Listener port DEFAULT_PORT = 1521 MAX_RESULTS = 100 class SqlOracleFactory(IFunctionFactoryPlugin): def __init__(self): super(SqlOracleFactory, self).__init__() # fields from config self._user = None self._pass = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._user = conf['user'] self._pass = conf['pass'] def 
create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(SqlOracleWrapper, factory_ctx['host'], factory_ctx['port'], factory_ctx['entity'].get('sid'), user=self._user, password=self._pass) def _check_oracle_env(): if 'ORACLE_HOME' not in os.environ or 'LD_LIBRARY_PATH' not in os.environ or os.environ['ORACLE_HOME'] \ not in os.environ['LD_LIBRARY_PATH'] or not os.path.isdir(os.environ['ORACLE_HOME']): raise Exception('Environmet variables ORACLE_HOME and LD_LIBRARY_PATH are not correctly set') def _import_db_driver(): try: _check_oracle_env() _cx_Oracle = __import__('cx_Oracle', globals(), locals(), [], -1) except Exception: # do we want to take some action here? raise return _cx_Oracle class SqlOracleWrapper(object): '''Oracle SQL adapter sql().execute('SELECT 1').result() ''' def __init__(self, host, port, sid, user='nagios', password='', enabled=True): self._stmt = None self._dsn_tns = None self._enabled = enabled self.__cx_Oracle = None self.__conn = None self.__cursor = None if self._enabled: self.__cx_Oracle = _import_db_driver() port = (int(port) if port and str(port).isdigit() else DEFAULT_PORT) try: self._dsn_tns = self.__cx_Oracle.makedsn(host, port, sid) self.__conn = self.__cx_Oracle.connect(user, password, self._dsn_tns) self.__cursor = self.__conn.cursor() except Exception, e: raise DbError(str(e), operation='Connect to dsn={}'.format(self._dsn_tns)), None, sys.exc_info()[2] def execute(self, stmt): self._stmt = stmt return self def result(self, agg=sum): '''return single row result, will result primitive value if only one column is selected''' result = {} try: if self._enabled and self.__cx_Oracle: cur = self.__cursor try: cur.execute(self._stmt) desc = [d[0] for d in cur.description] # Careful: col names come out all uppercase row = cur.fetchone() if row: result = dict(zip(desc, row)) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] if len(result) == 1: return result.values()[0] else: return result def results(self): '''return many rows''' results = [] try: if self._enabled and self.__cx_Oracle: cur = self.__cursor try: cur.execute(self._stmt) desc = [d[0] for d in cur.description] # Careful: col names come out all uppercase rows = cur.fetchmany(MAX_RESULTS) for row in rows: row = dict(zip(desc, row)) results.append(row) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] return results if __name__ == '__main__': default_sql_stmt = "SELECT 'OK' from dual" if len(sys.argv) in (6, 7): check = SqlOracleWrapper(sys.argv[1], sys.argv[2], sys.argv[3], user=sys.argv[4], password=sys.argv[5]) if len(sys.argv) == 7: sql_stmt = sys.argv[6] else: print 'executing default statement:', default_sql_stmt sql_stmt = default_sql_stmt # print '>>> one result:\n', check.execute(sql_stmt).result() print '>>> many results:\n', check.execute(sql_stmt).results() else: print '{} [sql_stmt]'.format(sys.argv[0]) PKGs"9zmon_worker_monitor/builtins/plugins/nagios.worker_plugin[Core] Name = nagios Module = nagios [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. 
;; Note that these values may are overridden by zmon's global config. ;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY exarpc_user = admin exarpc_pass = --secret-- loungemysql_user = robot_zmon loungemysql_pass = --secret-- hetcrawler_proxy_user = robot_zmon hetcrawler_proxy_pass = --secret-- PKhG0zmon_worker_monitor/builtins/plugins/eventlog.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ Zalando-specific function to query EventLog """ from zmon_worker_monitor.zmon_worker.errors import CheckError #from http import HttpWrapper # FIXME: watch out for this!!! from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial from zmon_worker_monitor import plugin_manager class EventlogFactory(IFunctionFactoryPlugin): def __init__(self): super(EventlogFactory, self).__init__() # fields from configuration self.eventlog_url = None # fields from dependencies: plugin depends 1 other plugin self.http_factory = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self.eventlog_url = conf['eventlog_url'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ # load plugins dependencies and store them locally for efficiency if not self.http_factory: self.http_factory = plugin_manager.get_plugin_obj_by_name('http', 'Function') return propartial(EventLogWrapper, http_wrapper=self.http_factory.create(factory_ctx), url=self.eventlog_url) class EventLogWrapper(object): '''Convenience wrapper to access EventLog counts''' def __init__(self, http_wrapper, url): self.__http = http_wrapper self.url = url.rstrip('/') + '/' def __request(self, path, params): return self.__http(self.url + path, params=params).json() def count(self, event_type_ids, time_from, time_to=None, group_by=None, **kwargs): '''Return number of events for given type IDs in given time frame returns a single number (integer) if only one type ID is given returns a dict (typeId as hex=>count) if more than one type ID is given returns a dict (fieldValue => count) if one type ID is given and a field name with "group_by" >>> EventLogWrapper(object, 'https://eventlog.example.com/').count('a', time_from='-1h') Traceback (most recent call last): ... CheckError: EventLog type ID must be a integer >>> EventLogWrapper(object, 'https://eventlog.example.com/').count(123, time_from='-1h') Traceback (most recent call last): ... 
CheckError: EventLog type ID is out of range ''' if isinstance(event_type_ids, (int, long)): event_type_ids = [event_type_ids] for type_id in event_type_ids: if not isinstance(type_id, (int, long)): raise CheckError('EventLog type ID must be a integer') if type_id < 0x1001 or type_id > 0xfffff: raise CheckError('EventLog type ID is out of range') params = kwargs params['event_type_ids'] = ','.join(['{:x}'.format(v) for v in event_type_ids]) params['time_from'] = time_from params['time_to'] = time_to params['group_by'] = group_by result = self.__request('count', params) if len(event_type_ids) == 1 and not group_by: return result.get(params['event_type_ids'], 0) else: return result if __name__ == '__main__': import sys import logging logging.basicConfig(level=logging.DEBUG) # init plugin manager and collect plugins, as done by Zmon when worker is starting plugin_manager.init_plugin_manager() plugin_manager.collect_plugins(load_builtins=True, load_env=True) eventlog_url = sys.argv[1] factory_ctx = {} http = plugin_manager.get_plugin_obj_by_name('http', 'Function').create(factory_ctx) #eventlog = EventLogWrapper() eventlog = EventLogWrapper(http_wrapper=http, url=eventlog_url) print eventlog.count(0x96001, time_from='-1m') print eventlog.count([0x96001, 0x63005], time_from='-1m') print eventlog.count(0x96001, time_from='-1m', group_by='appDomainId') PKGvRR9zmon_worker_monitor/builtins/plugins/exasol.worker_plugin[Core] Name = exacrm Module = exasol [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY exacrm_cluster = --secret-- exacrm_user = user exacrm_pass = --secret-- PKhGJ 3zmon_worker_monitor/builtins/plugins/exceptions_.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ Zalando-specific function to query the Exception Monitor """ from collections import Iterable from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial from zmon_worker_monitor import plugin_manager class ExceptionsFactory(IFunctionFactoryPlugin): def __init__(self): super(ExceptionsFactory, self).__init__() # fields from dependencies: plugin depends 1 other plugin self.http_factory = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ entity = factory_ctx['entity'] project = entity['name'] if entity['type'] == 'project' else None # load plugins dependencies and store them locally for efficiency if not self.http_factory: self.http_factory = plugin_manager.get_plugin_obj_by_name('http', 'Function') return propartial(ExceptionsWrapper, http_wrapper=self.http_factory.create(factory_ctx), host=factory_ctx['host'], instance=factory_ctx['instance'], project=project) class ExceptionsWrapper(object): def __init__(self, http_wrapper, host=None, instance=None, project=None): self.__http = http_wrapper self.url = 'https://exceptions.example.com/' self.host = host self.instance = instance self.project = project def __request(self, path, **params): return self.__http(path, base_url=self.url, params=params).json() def count( self, host=None, instance=None, project=None, source_class=None, method_name=None, exception_class=None, time_from='-24h', time_to=None, level='ERROR', q=None, ): return self.__request( 'count', host=maybe_comma_join(host or self.host), instance=maybe_comma_join(instance or self.instance), project=maybe_comma_join(project or self.project), source_class=maybe_comma_join(source_class), method_name=maybe_comma_join(method_name), exception_class=maybe_comma_join(exception_class), time_from=time_from, time_to=time_to, level=maybe_comma_join(level), q=q, )['count'] def maybe_comma_join(s): """ If `s` is iterable (but not a string), returns a comma-separated Unicode string of the elements of `s`. Otherwise, returns `s` >>> maybe_comma_join(['a', 'b', 'c']) u'a,b,c' >>> maybe_comma_join([1, 2, 3]) u'1,2,3' >>> maybe_comma_join([u'\u03B1', u'\u03B2', u'\u03B3']) u'\u03b1,\u03b2,\u03b3' >>> maybe_comma_join([]) '' >>> maybe_comma_join('abc') 'abc' >>> maybe_comma_join(u'\u03B1\u03B2\u03B3') u'\u03b1\u03b2\u03b3' >>> maybe_comma_join('') '' >>> maybe_comma_join(123) 123 """ if isinstance(s, Iterable) and not isinstance(s, basestring): return ','.join(unicode(e) for e in s) else: return s PKG?9zmon_worker_monitor/builtins/plugins/whois_.worker_plugin[Core] Name = whois Module = whois_ [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. 
;; Note that these values may are overridden by zmon's global config. ;; key = value PKGZ;zmon_worker_monitor/builtins/plugins/eventlog.worker_plugin[Core] Name = eventlog Module = eventlog [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY eventlog_url = https://eventlog.example.com/ PKG0~ ,,<zmon_worker_monitor/builtins/plugins/sql_mssql.worker_plugin[Core] Name = mssql Module = sql_mssql [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY user = nagios pass = --secret-- PKGaX;zmon_worker_monitor/builtins/plugins/kairosdb.worker_plugin[Core] Name = kairosdb Module = kairosdb [Documentation] Author = Jan Mussler Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides KairosDB query capabilities [Configuration] PKhG2 /zmon_worker_monitor/builtins/plugins/counter.py#!/usr/bin/env python # -*- coding: utf-8 -*- import json import redis import time from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial # round to microseconds ROUND_SECONDS_DIGITS = 6 class CounterFactory(IFunctionFactoryPlugin): def __init__(self): super(CounterFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(CounterWrapper, key_prefix='{}:{}:'.format(factory_ctx['check_id'], factory_ctx['entity_id']), redis_host=factory_ctx['redis_host'], redis_port=factory_ctx['redis_port']) class CounterWrapper(object): '''Measure increasing counts (per second) by saving the last value in Redis''' def __init__(self, key, redis_host, redis_port=6379, key_prefix=''): self.__con = redis.StrictRedis(redis_host, redis_port) self.key_prefix = key_prefix self.key(key) def key(self, key): '''expose key setter to allow reusing redis connection (CounterWrapper instance)''' self.__key = 'zmon:counters:{}{}'.format(self.key_prefix, key) # return self to allow method chaining return self def per_second(self, value): '''return increment rate of counter value (per second)''' olddata = self.__con.get(self.__key) result = 0 now = time.time() if olddata: olddata = json.loads(olddata) time_diff = now - olddata['ts'] value_diff = value - olddata['value'] # do not allow negative values (important for JMX counters which will reset after restart/deploy) result = max(value_diff / time_diff, 0) newdata = {'value': value, 'ts': round(now, ROUND_SECONDS_DIGITS)} self.__con.set(self.__key, json.dumps(newdata)) return result 
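# Illustrative walk-through of per_second() above (numbers assumed, not from the original code):
# if Redis holds {'value': 100, 'ts': 1000.0} for this key and per_second(150) runs at now = 1010.0,
# then value_diff = 50 and time_diff = 10.0, so the returned rate is max(50 / 10.0, 0) = 5.0;
# a counter reset (negative value_diff, e.g. after a JMX restart) is clamped to 0 before the new
# {'value': 150, 'ts': 1010.0} snapshot is written back to Redis.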
def per_minute(self, value): '''convenience method: returns per_second(..) / 60''' return self.per_second(value) / 60.0 if __name__ == '__main__': counter = CounterWrapper('test', 'localhost', 6379) print counter.per_second(1) time.sleep(2) print counter.per_second(101) time.sleep(1) print counter.per_second(111) time.sleep(1) print counter.per_minute(211) PKGԮbN7zmon_worker_monitor/builtins/plugins/jobs.worker_plugin[Core] Name = jobs Module = jobs [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = value PK,uGlll0zmon_worker_monitor/builtins/plugins/joblocks.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ Zalando-specific function to monitor job locking (Job Framework uses Redis to lock) """ try: from cmdb.client import Client as cmdb_client except: cmdb_client = None from dogpile.cache import make_region from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial import json import redis import time HOSTS_CACHE_EXPIRATION_TIME = 600 # 10 minutes memory_cache = make_region().configure('dogpile.cache.memory', expiration_time=HOSTS_CACHE_EXPIRATION_TIME) class JoblocksFactory(IFunctionFactoryPlugin): def __init__(self): super(JoblocksFactory, self).__init__() # fields from configuration self.cmdb_url = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self.cmdb_url = conf['cmdb_url'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(JoblocksWrapper, cmdb_url=self.cmdb_url, project=factory_ctx['entity'].get('name')) class JoblocksWrapper(object): LOCKING_NODE_ROLE_ID = 118 ALLOCATED_STATUS_ID = 6000 DEFAULT_EXPECTED_DURATION = 60000 # [ms] def __init__(self, cmdb_url, project=None, environment='LIVE'): if cmdb_client: self._cmdb = cmdb_client(cmdb_url) self.pattern = 'job:lock:{}:{}:*'.format(project or '*', environment) @memory_cache.cache_on_arguments(namespace='zmon-worker') def _get_hosts(self, host_role_id, lifecycle_status_id): return self._cmdb.get_hosts(host_role_id=host_role_id, lifecycle_status_id=lifecycle_status_id) @staticmethod def _get_expected_duration(redis_value, check_param): ''' >>> JoblocksWrapper._get_expected_duration({}, None) 60000.0 >>> JoblocksWrapper._get_expected_duration({}, 10000) 10000.0 >>> JoblocksWrapper._get_expected_duration({'expectedMaximumDuration': 120000}, None) 120000.0 >>> JoblocksWrapper._get_expected_duration({'expectedMaximumDuration': 60000}, 90000) 90000.0 ''' return float((check_param if check_param else redis_value.get('expectedMaximumDuration', JoblocksWrapper.DEFAULT_EXPECTED_DURATION))) def results(self, expected_duration=None): hosts = self._get_hosts(JoblocksWrapper.LOCKING_NODE_ROLE_ID, JoblocksWrapper.ALLOCATED_STATUS_ID) host_connections = dict((host.hostname, redis.StrictRedis(host=host.hostname)) for host in hosts) host_keys = dict((host, con.keys(self.pattern)) for (host, con) in host_connections.iteritems()) str_results = [] for host, keys in host_keys.iteritems(): p = 
host_connections[host].pipeline() for key in keys: p.get(key) str_results.extend(p.execute()) results = [] for r in str_results: try: results.append(json.loads(r)) except ValueError: pass # In case flowId is None, we want to return empty string instead. return dict((r['lockingComponent'], { 'host': r['host'], 'instance': r['instanceCode'], 'created': r['created'], 'expected_duration': JoblocksWrapper._get_expected_duration(r, expected_duration), 'flow_id': r.get('flowId') or '', 'expired': time.time() - time.mktime(time.strptime(r['created'], '%Y-%m-%dT%H:%M:%S')) > JoblocksWrapper._get_expected_duration(r, expected_duration) / 1000, }) for r in results) if __name__ == '__main__': import sys if len(sys.argv) != 2: print "usage: {} ".format(sys.argv[0]) sys.exit(1) res = JoblocksWrapper(cmdb_url=sys.argv[1], environment='LIVE').results(expected_duration=60000) print res PK,uGx;zmon_worker_monitor/builtins/plugins/distance_to_history.py#!/usr/bin/env python # -*- coding: utf-8 -*- import numpy import datetime from zmon_worker_monitor.zmon_worker.common.time_ import parse_timedelta # from tasks.check import flatten # originally, I wanted to load this function defintion from the tasks module, but I did not # succeed in doing so. My python knowledge is limited, so maybe you can tell me how I can achieve this? def flatten(structure, key='', path='', flattened=None): path = str(path) key = str(key) if flattened is None: flattened = {} if type(structure) not in (dict, list): flattened[((path + '.' if path else '')) + key] = structure elif isinstance(structure, list): pass else: # for i, item in enumerate(structure): # flatten(item, '%d' % i, '.'.join(filter(None, [path, key])), flattened) for new_key, value in structure.items(): flatten(value, new_key, '.'.join(filter(None, [path, key])), flattened) return flattened class DistanceWrapper(object): def __init__(self, history_wrapper, weeks=4, snap_to_bin=True, bin_size='1h', dict_extractor_path=''): self.history_wrapper = history_wrapper self.weeks = weeks self.snap_to_bin = snap_to_bin self.bin_size = parse_timedelta(bin_size) self.dict_extractor_path = dict_extractor_path def calculate_bin_time_range(self): ''' Calculates the time ranges we need to look for using the settings configured for this class. Returns a list of dicts with the time ranges. ''' now = datetime.datetime.now().replace(microsecond=0) if self.snap_to_bin: day_begin = now.replace(hour=0, minute=0, second=0, microsecond=0) # the number of bins (of size bin_size) that passed since the beginning of the day bins_this_day_until_now = int((now - day_begin).total_seconds() / self.bin_size.total_seconds()) bin_begin = day_begin + bins_this_day_until_now * self.bin_size bin_end = day_begin + (bins_this_day_until_now + 1) * self.bin_size else: bin_begin = now - self.bin_size bin_end = now timestamps = [] for week in range(1, self.weeks + 1): time_from = abs((bin_begin - week * datetime.timedelta(days=7) - now).total_seconds()) time_to = abs((bin_end - week * datetime.timedelta(days=7) - now).total_seconds()) timestamps.append({'time_from': time_from, 'time_to': time_to}) return timestamps def extract_value(self, value): ''' Extracts the value that will be used for comparisons. For dicts, we need an extractor. The extractor may not be empty if the value is a dict. The extractor is of the form 'a.b.c' for a dict with the structure {'a':{'b':{'c':5}}} to extract the value 5. 
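Illustrative example (not part of the original docstring): flatten({'a': {'b': {'c': 5}}}) returns {'a.b.c': 5}, so a dict_extractor_path of 'a.b.c' selects the value 5.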
''' if isinstance(value, dict): if self.dict_extractor_path == '': raise Exception('Extractor may not be empty for dicts as value. You need to tell which part of the dict I should use.' ) # throws when the key is unavailable; this is what we want value = flatten(value)[self.dict_extractor_path] return value def bin_mean(self): ''' Calculates the mean of the history values. Applies the extractor and returns a scalar value. ''' time_ranges = self.calculate_bin_time_range() averages = [] for time_range in time_ranges: averages.extend(self.history_wrapper.get_avg(self.dict_extractor_path, time_range['time_from'], time_range['time_to'])) if not averages: raise Exception('No history data available in bin_mean call.') return numpy.average(averages) def bin_standard_deviation(self): ''' Calculates the standard deviation of the history values. Applies the extractor and returns a scalar value. ''' time_ranges = self.calculate_bin_time_range() deviations = [] for time_range in time_ranges: deviations.extend(self.history_wrapper.get_std_dev(self.dict_extractor_path, time_range['time_from'], time_range['time_to'])) if not deviations: raise Exception('No history data available in bin_standard_deviation call.') # you can't simply average standard deviations. # see https://en.wikipedia.org/wiki/Variance#Basic_properties for details, keep in mind that # the different times are uncorrelated. We assume that the sample sizes for the different weeks # are equal (since we do not get exact sample sizes for a specific key from the kairosdb) return numpy.sqrt(numpy.sum(map(lambda x: x * x, deviations))) def absolute(self, value): ''' Calculates the absolute distance of the actual value from the history value bins as selected through the constructor parameters weeks, bin_size and snap_to_bin. Applies the extractor and returns a scalar value. ''' return self.extract_value(value) - self.bin_mean() def sigma(self, value): ''' Calculates the relative distance of the actual value from the history value bins, normalized by the standard deviation. A sigma distance of 1.0 means that the actual value is as far away from the mean as the standard deviation is. The sigma distance can be negative, in this case you are below the mean with your value. If you need absolute values, you can use abs(sigma(value)). Applies the extractor and returns a scalar value. 
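In short (illustrative restatement of the code below, example numbers assumed): sigma(value) equals (extract_value(value) - bin_mean()) / bin_standard_deviation(), so a history mean of 10, a standard deviation of 2 and an actual value of 16 give a sigma distance of 3.0.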
''' abs_value = self.absolute(value) std_dev = self.bin_standard_deviation() if std_dev == 0: return numpy.Infinity * numpy.sign(abs_value) if abs_value != 0 else numpy.float64(0) return abs_value / std_dev if __name__ == '__main__': import logging logging.basicConfig(level=logging.DEBUG) logging.info('flattened dict: %s', flatten({'a': {'b': {'c': 5}}})) class HistoryWrapper(object): def __init__(self, check_id, entities=[]): self.check_id = check_id self.entities = entities @staticmethod def get_avg(key, time_from, time_to): return [7] @staticmethod def get_std_dev(key, time_from, time_to): return [2] now = datetime.datetime.now() before = now.replace(minute=0, second=0, microsecond=0) distance = DistanceWrapper(history_wrapper=HistoryWrapper(check_id=588), snap_to_bin=False, weeks=3, bin_size='5m') logging.info('Mean of history values: %f', distance.bin_mean()) logging.info('Absolute distance: %f', distance.absolute(15)) logging.info('Sigma distance: %f', distance.sigma(15)) PKGQԓ))<zmon_worker_monitor/builtins/plugins/checkldap.worker_plugin[Core] Name = ldap Module = checkldap [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY user = user pass = --secret-- PKhGOp.zmon_worker_monitor/builtins/plugins/whois_.py#!/usr/bin/env python # -*- coding: utf-8 -*- import pythonwhois from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial class WhoisFactory(IFunctionFactoryPlugin): def __init__(self): super(WhoisFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(WhoisWrapper, host=factory_ctx['host']) class WhoisWrapper(object): def __init__(self, host, timeout=10): self.host = host self.timeout = timeout def check(self): parsed = {} data, server_list = pythonwhois.net.get_whois_raw(self.host, with_server_list=True) if len(server_list) > 0: parsed = pythonwhois.parse.parse_raw_whois(data, normalized=True, never_query_handles=False, handle_server=server_list[-1]) else: parsed = pythonwhois.parse.parse_raw_whois(data, normalized=True) return parsed if __name__ == '__main__': import json import sys import datetime def json_fallback(obj): if isinstance(obj, datetime.datetime): return obj.isoformat() else: return obj data = WhoisWrapper(sys.argv[1]).check() print json.dumps(data, default=json_fallback, indent=4) PKGX6zmon_worker_monitor/builtins/plugins/tcp.worker_plugin[Core] Name = tcp Module = tcp [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
;; key = value PKGJZ..=zmon_worker_monitor/builtins/plugins/sql_oracle.worker_plugin[Core] Name = orasql Module = sql_oracle [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY user = nagios pass = --secret-- PK,uGKȊD/zmon_worker_monitor/builtins/plugins/history.py#!/usr/bin/env python # -*- coding: utf-8 -*- # wrapper for kairosdb to access history data about checks import logging import requests from zmon_worker_monitor.builtins.plugins.distance_to_history import DistanceWrapper from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial logger = logging.getLogger(__name__) class HistoryFactory(IFunctionFactoryPlugin): def __init__(self): super(HistoryFactory, self).__init__() # fields from configuration self.kairosdb_host = None self.kairosdb_port = None self.kairosdb_history_enabled = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self.kairosdb_host = conf.get('kairosdb_host') self.kairosdb_port = conf.get('kairosdb_port') self.kairosdb_history_enabled = True if conf.get('kairosdb_history_enabled') in ('true', 'True', '1') else False def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(HistoryWrapper, kairosdb_host=self.kairosdb_host, kairosdb_port=self.kairosdb_port, history_enabled=self.kairosdb_history_enabled, check_id=factory_ctx['check_id'], entities=factory_ctx['entity_id_for_kairos']) def get_request_json(check_id, entities, time_from, time_to, aggregator='avg', sampling_size_in_seconds=300): j = \ """ { "metrics": [ { "tags": { "entity": [ %s ] }, "name": "zmon.check.%s", "group_by": [ { "name": "tag", "tags": [ "key" ] } ], "aggregators": [ { "name": "%s", "align_sampling": true, "sampling": { "value": "%s", "unit": "seconds" } } ] } ], "cache_time": 0, "start_relative": { "value": "%s", "unit": "seconds" }, "end_relative": { "value": "%s", "unit": "seconds" } } """ r = j % ( ','.join(map(lambda x: '"' + x + '"', entities)), check_id, aggregator, sampling_size_in_seconds, time_from, time_to, ) return r ONE_WEEK = 7 * 24 * 60 * 60 ONE_WEEK_AND_5MIN = ONE_WEEK + 5 * 60 class HistoryWrapper(object): def __init__(self, kairosdb_host, kairosdb_port, history_enabled, check_id, entities): self.__kairosdb_host = kairosdb_host if kairosdb_host is not None else 'cassandra01' self.__kairosdb_port = kairosdb_port if kairosdb_port is not None else '37629' self.__enabled = bool(history_enabled) self.url = 'http://%s:%s/api/v1/datapoints/query' % (self.__kairosdb_host, self.__kairosdb_port) self.check_id = check_id if type(entities) == list: self.entities = entities else: self.entities = [entities] def result(self, time_from=ONE_WEEK_AND_5MIN, time_to=ONE_WEEK): if not self.__enabled: raise Exception("History() function disabled for now") #self.logger.info("history query %s %s %s", self.check_id, time_from, time_to) return 
requests.post(self.url, get_request_json(self.check_id, self.entities, int(time_from), int(time_to))).json() def get_one(self, time_from=ONE_WEEK_AND_5MIN, time_to=ONE_WEEK): if not self.__enabled: raise Exception("History() function disabled for now") #self.logger.info("history get one %s %s %s", self.check_id, time_from, time_to) return requests.post(self.url, get_request_json(self.check_id, self.entities, int(time_from), int(time_to))).json()['queries'][0]['results'][0]['values'] def get_aggregated(self, key, aggregator, time_from=ONE_WEEK_AND_5MIN, time_to=ONE_WEEK): if not self.__enabled: raise Exception("History() function disabled for now") # read the list of results query_result = requests.post(self.url, get_request_json(self.check_id, self.entities, int(time_from), int(time_to), aggregator, int(time_from - time_to))).json()['queries'][0]['results' ] # filter for the key we are interested in filtered_for_key = filter(lambda x: x['tags'].get('key', [''])[0] == key, query_result) if not filtered_for_key: return_value = [] else: return_value = [filtered_for_key[0]['values'][0][1]] # since we have a sample size of 'all in the time range', return only the value, not the timestamp. return return_value def get_avg(self, key, time_from=ONE_WEEK_AND_5MIN, time_to=ONE_WEEK): if not self.__enabled: raise Exception("History() function disabled for now") #self.logger.info("history get avg %s %s %s", self.check_id, time_from, time_to) return self.get_aggregated(key, 'avg', time_from, time_to) def get_std_dev(self, key, time_from=ONE_WEEK_AND_5MIN, time_to=ONE_WEEK): if not self.__enabled: raise Exception("History() function disabled for now") #self.logger.info("history get std %s %s %s", self.check_id, time_from, time_to) return self.get_aggregated(key, 'dev', time_from, time_to) def distance(self, weeks=4, snap_to_bin=True, bin_size='1h', dict_extractor_path=''): if not self.__enabled: raise Exception("History() function disabled for now") #self.logger.info("history distance %s %s ", self.check_id, weeks, bin_size) return DistanceWrapper(history_wrapper=self, weeks=weeks, bin_size=bin_size, snap_to_bin=snap_to_bin, dict_extractor_path=dict_extractor_path) if __name__ == '__main__': import logging logging.basicConfig(level=logging.DEBUG) zhistory = HistoryWrapper(None, None, None, 17, ['GLOBAL']) r = zhistory.result() logging.info(r) r = zhistory.get_one() logging.info(r) r = zhistory.get_aggregated('', 'avg') logging.info(r) PKhGgg.zmon_worker_monitor/builtins/plugins/zomcat.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ Query Tomcat metrics via JMX """ import json import sys from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial from zmon_worker_monitor import plugin_manager MILLI = 10 ** -3 NANO = 10 ** -9 THREAD_POOL_PORT_PREFIXES = {'http': 3, 'ajp': 2} class ZomcatFactory(IFunctionFactoryPlugin): def __init__(self): super(ZomcatFactory, self).__init__() # fields to store dependencies: plugin depends on 3 other plugins self.http_factory = None self.jmx_factory = None self.counter_factory = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ # load plugins dependencies and store them locally for 
efficiency if not self.http_factory: self.http_factory = plugin_manager.get_plugin_obj_by_name('http', 'Function') if not self.jmx_factory: self.jmx_factory = plugin_manager.get_plugin_obj_by_name('jmx', 'Function') if not self.counter_factory: self.counter_factory = plugin_manager.get_plugin_obj_by_name('counter', 'Function') return propartial(ZomcatWrapper, host=factory_ctx['host'], instance=factory_ctx['instance'], http=self.http_factory.create(factory_ctx), jmx=self.jmx_factory.create(factory_ctx), counter=self.counter_factory.create(factory_ctx)) def memory_usage_percentage(data): '''helper function to calculate percentage from "used" and "max"''' return round(100. * data['used'] / data['max'], 2) class ZomcatWrapper(object): def __init__(self, host, instance, http, jmx, counter): '''expects ready to use "partials" for http, jmx and counter''' self.host = host self.instance = instance self._http = http self._jmx = jmx # initialize counter with Redis connection self._counter = counter('') @staticmethod def get_jmx_port(instance): return int('4' + instance) def gc(self): '''return garbage collector statistics: gc_percentage and gcs_per_sec''' gc_count = 0 gc_time = 0 for _type in 'ConcurrentMarkSweep', 'ParNew': try: row = self._jmx().query('java.lang:type=GarbageCollector,name={}'.format(_type), 'CollectionCount', 'CollectionTime').results() gc_count += row['CollectionCount'] gc_time += row['CollectionTime'] except: pass gc_count = round(self._counter.key('gcCount').per_second(gc_count), 2) gc_time = self._counter.key('gcTime').per_second(gc_time * MILLI) return {'gc_percentage': round(gc_time * 100, 2), 'gcs_per_sec': gc_count} def requests(self): '''return Tomcat request statistics such as requests/s and errors/s''' request_count = 0 request_time = 0 http_errors = 0 for _type, _port_prefix in THREAD_POOL_PORT_PREFIXES.items(): try: row = self._jmx().query('Catalina:type=GlobalRequestProcessor,name="{}-apr-{}{}"'.format(_type, _port_prefix, self.instance), 'requestCount', 'processingTime', 'errorCount' ).results() request_count += row['requestCount'] request_time += row['processingTime'] http_errors += row['errorCount'] except: pass requests = round(self._counter.key('requestCount').per_second(request_count), 2) http_errors = round(self._counter.key('errorCount').per_second(http_errors), 2) time_per_request = round(self._counter.key('requestTime').per_second(request_time) / max(requests, 1), 2) return {'http_errors_per_sec': http_errors, 'requests_per_sec': requests, 'time_per_request': time_per_request} def basic_stats(self): '''return basic statistics such as memory, CPU and thread usage''' jmx = self._jmx() # {"NonHeapMemoryUsage":{"max":184549376,"init":24313856,"used":54467720,"committed":85266432}, # "HeapMemoryUsage":{"max":518979584,"init":134217728,"used":59485272,"committed":129761280}} jmx.query('java.lang:type=Memory', 'HeapMemoryUsage', 'NonHeapMemoryUsage') jmx.query('java.lang:type=Threading', 'ThreadCount') jmx.query('java.lang:type=OperatingSystem', 'ProcessCpuTime') data = jmx.results() memory = data['java.lang:type=Memory'] threading = data['java.lang:type=Threading'] os = data['java.lang:type=OperatingSystem'] cpu = self._counter.key('cpuTime').per_second(os['ProcessCpuTime'] * NANO) threads = threading['ThreadCount'] try: heartbeat = self._http('/heartbeat.jsp', timeout=3).text().strip() == 'OK: Zalando JVM is running' except: heartbeat = None try: jobs = self._http('/jobs.monitor?view=json', timeout=3).json()['operationMode'] == 'NORMAL' except: jobs = None 
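# Illustrative shape of the dict returned below (all values assumed): {'cpu_percentage': 12.5,
# 'heap_memory_percentage': 45.3, 'heartbeat_enabled': True, 'jobs_enabled': True,
# 'nonheap_memory_percentage': 29.1, 'threads': 120}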
return { 'cpu_percentage': round(cpu * 100, 2), 'heap_memory_percentage': memory_usage_percentage(memory['HeapMemoryUsage']), 'heartbeat_enabled': heartbeat, 'jobs_enabled': jobs, 'nonheap_memory_percentage': memory_usage_percentage(memory['NonHeapMemoryUsage']), 'threads': threads, } def health(self): '''return complete Zomcat health statistics including memory, threads, CPU, requests and GC''' data = {} data.update(self.basic_stats()) data.update(self.gc()) data.update(self.requests()) return data if __name__ == '__main__': # init plugin manager and collect plugins, as done by Zmon when worker is starting plugin_manager.init_plugin_manager() plugin_manager.collect_plugins(load_builtins=True, load_env=True) host = sys.argv[1] instance = sys.argv[2] factory_ctx = { 'base_url': 'http://{host}:3{instance}/'.format(host=host, instance=instance), 'host': host, 'port': ZomcatWrapper.get_jmx_port(instance), 'instance': instance, 'redis_host': 'localhost', } # http = partial(HttpWrapper, base_url='http://{host}:3{instance}/'.format(host=host, instance=instance)) http = plugin_manager.get_plugin_obj_by_name('http', 'Function').create(factory_ctx) # jmx = partial(JmxWrapper, host=host, port=ZomcatWrapper.get_jmx_port(instance)) jmx = plugin_manager.get_plugin_obj_by_name('jmx', 'Function').create(factory_ctx) # counter = partial(CounterWrapper, redis_host='localhost') counter = plugin_manager.get_plugin_obj_by_name('counter', 'Function').create(factory_ctx) zomcat = ZomcatWrapper(host, instance, http=http, jmx=jmx, counter=counter) print json.dumps(zomcat.health(), indent=4, sort_keys=True) PKhGfp8-zmon_worker_monitor/builtins/plugins/ping_.py#!/usr/bin/env python # -*- coding: utf-8 -*- import subprocess from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial class PingFactory(IFunctionFactoryPlugin): def __init__(self): super(PingFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(ping, host=factory_ctx['host']) def ping(host, count=1, timeout=1): cmd = [ 'ping', '-c', str(count), '-w', str(timeout), host, ] sub = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) sub.communicate() ret = sub.wait() == 0 return ret if __name__ == '__main__': import sys print ping(sys.argv[1]) PK㈑G͖c6zmon_worker_monitor/builtins/plugins/sql_postgresql.py#!/usr/bin/env python # -*- coding: utf-8 -*- import psycopg2 import re import sys from zmon_worker_monitor.zmon_worker.errors import CheckError, InsufficientPermissionsError, DbError from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial from psycopg2.extras import NamedTupleCursor DEFAULT_PORT = 5432 CONNECTION_RE = \ re.compile(r''' ^(?P[^:/]+) # host - either IP o hostname (:(?P\d+))? 
# port - integer, optional /(?P\w+) # database name $ ''' , re.X) ABSOLUTE_MAX_RESULTS = 1000000 REQUIRED_GROUP = 'zalandos' PERMISSIONS_STMT = \ ''' SELECT r.rolcanlogin AS can_login, ARRAY(SELECT b.rolname FROM pg_catalog.pg_auth_members m JOIN pg_catalog.pg_roles b ON (m.roleid = b.oid) WHERE m.member = r.oid) AS member_of FROM pg_catalog.pg_roles r WHERE r.rolname = %s; ''' NON_SAFE_CHARS = re.compile(r'[^a-zA-Z_0-9-]') class SqlFactory(IFunctionFactoryPlugin): def __init__(self): super(SqlFactory, self).__init__() # fields from config self._user = None self._pass = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._user = conf['user'] self._pass = conf['pass'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(SqlWrapper, shards=factory_ctx['shards'], user=self._user, password=self._pass, timeout=factory_ctx['soft_time_limit'] * 1000, check_id=factory_ctx['check_id'], created_by=factory_ctx['req_created_by'], __protected=['created_by', 'check_id']) def make_safe(s): ''' >>> make_safe('Bad bad \\' 123') 'Badbad123' ''' if not s: return '' return NON_SAFE_CHARS.sub('', s) class SqlWrapper(object): '''Shard-aware SQL adapter sql().execute('SELECT 1').result() ''' def __init__( self, shards, user='zmon', password='', timeout=60000, shard=None, created_by=None, check_id=None, ): ''' Parameters ---------- shards: dict A dict of shard definitions where key is the shard's name and value is the host/database string. user: str password: str timeout: int Statement timeout in milliseconds. shard: str Optional shard name. If provided, the check will be run on only one shard matching given name. created_by: str Optional user name. If provided, the check will first make sure that the user has permissions to access the requested database. It's optional because it's currently supported only in trial run. check_id: int The check definition ID in order to set PostgreSQL application name (easier tracking on server side). 
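Illustrative shards value (names assumed, not from the original docstring): {'customers1': 'pg-host1:5432/customers', 'customers2': 'pg-host2/customers'} -- each entry must match the host[:port]/dbname form parsed by CONNECTION_RE above; a missing port falls back to DEFAULT_PORT (5432).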
''' if not shards: raise CheckError('SqlWrapper: No shards defined') if shard and not shards.get(shard): raise CheckError('SqlWrapper: Shard {} not found in shards definition'.format(shard)) self._cursors = [] self._stmt = None permissions = {} for shard_def in ([shards[shard]] if shard else shards.values()): m = CONNECTION_RE.match(shard_def) if not m: raise CheckError('Invalid shard connection: {}'.format(shard_def)) connection_str = \ "host='{host}' port='{port}' dbname='{dbname}' user='{user}' password='{password}' connect_timeout=5 options='-c statement_timeout={timeout}' application_name='ZMON Check {check_id} (created by {created_by})' ".format( host=m.group('host'), port=int(m.group('port') or DEFAULT_PORT), dbname=m.group('dbname'), user=user, password=password, timeout=timeout, check_id=check_id, created_by=make_safe(created_by), ) try: conn = psycopg2.connect(connection_str) conn.set_session(readonly=True, autocommit=True) cursor = conn.cursor(cursor_factory=NamedTupleCursor) self._cursors.append(cursor) except Exception, e: raise DbError(str(e), operation='Connect to {}'.format(shard_def)), None, sys.exc_info()[2] try: if created_by: cursor.execute(PERMISSIONS_STMT, [created_by]) row = cursor.fetchone() permissions[shard_def] = (row.can_login and REQUIRED_GROUP in row.member_of if row else False) except Exception, e: raise DbError(str(e), operation='Permission query'), None, sys.exc_info()[2] for resource, permitted in permissions.iteritems(): if not permitted: raise InsufficientPermissionsError(created_by, resource) def execute(self, stmt): self._stmt = stmt return self def result(self, agg=sum): '''return single row result, will result primitive value if only one column is selected''' result = {} try: for cur in self._cursors: try: cur.execute(self._stmt) row = cur.fetchone() if row: for k, v in row._asdict().items(): result[k] = result.get(k, []) result[k].append(v) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] for k, v in result.items(): try: result[k] = agg(v) except: # just use list if aggregation function fails # (e.g. if we try to sum strings) result[k] = v if len(result) == 1: return result.values()[0] else: return result def results(self, max_results=100, raise_if_limit_exceeded=True): '''return many rows''' results = [] max_results = min(max_results, ABSOLUTE_MAX_RESULTS) try: for cur in self._cursors: try: cur.execute(self._stmt) if raise_if_limit_exceeded: rows = cur.fetchmany(max_results + 1) if len(rows) > max_results: raise DbError('Too many results, result set was limited to {}. 
Try setting max_results to a higher value.'.format(max_results), operation=self._stmt) else: rows = cur.fetchmany(max_results) for row in rows: results.append(row._asdict()) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] return results if __name__ == '__main__': if len(sys.argv) == 4: check = SqlWrapper([sys.argv[1] + '/' + sys.argv[2]]) print check.execute(sys.argv[3]).result() elif len(sys.argv) > 1: print 'sql.py ' PKhG͋W,zmon_worker_monitor/builtins/plugins/jobs.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ Zalando-specific function to query DeployCtl job information """ from itertools import groupby from operator import itemgetter from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial from zmon_worker_monitor import plugin_manager class JobsFactory(IFunctionFactoryPlugin): def __init__(self): super(JobsFactory, self).__init__() # fields from dependencies: plugin depends 1 other plugin self.http_factory = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ # load plugins dependencies and store them locally for efficiency if not self.http_factory: self.http_factory = plugin_manager.get_plugin_obj_by_name('http', 'Function') return propartial(JobsWrapper, http_wrapper=self.http_factory.create(factory_ctx), project=factory_ctx['entity'].get('name')) class JobsWrapper(object): def __init__(self, http_wrapper, environment, project, **kwargs): self.url = 'https://deployctl.example.com/jobs/history.json/{}/{}'.format(environment, project) self.__http = http_wrapper self.http_wrapper_params = kwargs self.name = itemgetter('name') def __request(self): return self.__http(self.url, **self.http_wrapper_params).json() def lastruns(self): start_time = itemgetter('start_seconds_ago') return dict((job, min(runs, key=start_time)) for (job, runs) in groupby(sorted(self.__request(), key=self.name), key=self.name)) def history(self): return dict((job, list(runs)) for (job, runs) in groupby(sorted(self.__request(), key=self.name), key=self.name)) PKMuGQ"",zmon_worker_monitor/builtins/plugins/http.py#!/usr/bin/env python # -*- coding: utf-8 -*- import logging import requests import sys import urllib import urlparse import json from prometheus_client.parser import text_string_to_metric_families from collections import defaultdict from zmon_worker_monitor.zmon_worker.errors import HttpError from requests.adapters import HTTPAdapter from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial import tokens # will use OAUTH2_ACCESS_TOKEN_URL environment variable by default # will try to read application credentials from CREDENTIALS_DIR tokens.configure() tokens.manage('uid', ['uid']) tokens.start() logger = logging.getLogger('zmon-worker.http-function') class HttpFactory(IFunctionFactoryPlugin): def __init__(self): super(HttpFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param 
factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(HttpWrapper, base_url=factory_ctx.get('entity_url')) def absolute_http_url(url): ''' >>> absolute_http_url('') False >>> absolute_http_url('bla:8080/blub') False >>> absolute_http_url('https://www.zalando.de') True ''' return url.startswith('http://') or url.startswith('https://') class HttpWrapper(object): def __init__( self, url, params=None, base_url=None, timeout=10, max_retries=0, verify=True, oauth2=False, headers=None, ): self.url = (base_url + url if not absolute_http_url(url) else url) self.clean_url = None self.params = params self.timeout = timeout self.max_retries = max_retries self.verify = verify self.headers = headers or {} self.oauth2 = oauth2 self.__r = None def __request(self, raise_error=True, post_data = None): if self.__r is not None: return self.__r if self.max_retries: s = requests.Session() s.mount('', HTTPAdapter(max_retries=self.max_retries)) else: s = requests base_url = self.url basic_auth = None url_parsed = urlparse.urlsplit(base_url) if url_parsed and url_parsed.username and url_parsed.password: base_url = base_url.replace("{0}:{1}@".format(urllib.quote(url_parsed.username), urllib.quote(url_parsed.password)), "") base_url = base_url.replace("{0}:{1}@".format(url_parsed.username, url_parsed.password), "") basic_auth = requests.auth.HTTPBasicAuth(url_parsed.username, url_parsed.password) self.clean_url = base_url if self.oauth2: self.headers.update({'Authorization':'Bearer {}'.format(tokens.get('uid'))}) try: if post_data is None: self.__r = s.get(base_url, params=self.params, timeout=self.timeout, verify=self.verify, headers=self.headers, auth = basic_auth) else: self.__r = s.post(base_url, params=self.params, timeout=self.timeout, verify=self.verify, headers=self.headers, auth = basic_auth, data=json.dumps(post_data)) except requests.Timeout, e: raise HttpError('timeout', self.clean_url), None, sys.exc_info()[2] except requests.ConnectionError, e: raise HttpError('connection failed', self.clean_url), None, sys.exc_info()[2] except Exception, e: raise HttpError(str(e), self.clean_url), None, sys.exc_info()[2] if raise_error: try: self.__r.raise_for_status() except requests.HTTPError, e: raise HttpError(str(e), self.clean_url), None, sys.exc_info()[2] return self.__r def json(self, raise_error=True): r = self.__request(raise_error=raise_error) try: return r.json() except Exception, e: raise HttpError(str(e), self.url), None, sys.exc_info()[2] def jolokia(self, read_requests, raise_error=True): def set_read_type(x): x['type'] = 'READ' # hack quick verify if (not self.url.endswith('jolokia/')) or ('?' in self.url) or ('&' in self.url): raise HttpError("URL needs to end in jolokia/ and not contain ? and &", self.url) map(set_read_type, read_requests) r = self.__request(post_data=read_requests, raise_error=raise_error) try: return r.json() except Exception, e: raise HttpError(str(e), self.url), None, sys.exc_info()[2] def actuator_metrics(self, prefix = 'zmon.response.', raise_error = True): """ /metric responds with keys like: zmon.response... 
Response map is ep->method->status->metric """ j = self.json(raise_error=raise_error) r={} # for clojure projects we use the dropwizard servlet, there the json looks slightly different if "timers" in j: metric_map = {'p99':'99th','p75':'75th','mean':'median','m1_rate':'mRate','99%':'99th','75%':'75th','1m.rate':'mRate'} j = j["timers"] j["zmon.response.200.GET.metrics"]={"mRate": 0.12345} start_index = len(prefix.split('.')) - 1 for (k,v) in j.iteritems(): if k.startswith(prefix): ks = k.split('.') ks = ks[start_index:] status = ks[0] method = ks[1] ep = '.'.join(ks[2:]) if not ep in r: r[ep]={} if not method in r[ep]: r[ep][method]={} if not status in r[ep][method]: r[ep][method][status]={} for (mn, mv) in v.iteritems(): if mn in ['count','p99','p75','m1_rate','min','max','mean','75%','99%','1m.rate','median']: if mn in metric_map: mn = metric_map[mn] r[ep][method][status][mn]=mv return r j["zmon.response.200.GET.metrics.oneMinuteRate"]=0.12345 for (k,v) in j.iteritems(): if k.startswith(prefix): ks = k.split('.') if ks[-2]=='snapshot': ep = '.'.join(ks[4:-2]) else: ep = '.'.join(ks[4:-1]) if not ep in r: r[ep] = {} # zmon.response. 200 . GET . EP . if ks[3] not in r[ep]: r[ep][ks[3]] = {} if ks[2] not in r[ep][ks[3]]: r[ep][ks[3]][ks[2]] = {} if not (ks[-2] == 'snapshot'): if ks[-1] == 'count': r[ep][ks[3]][ks[2]]['count']=v if ks[-1] == 'oneMinuteRate': r[ep][ks[3]][ks[2]]['mRate']=v else: if ks[-1] in ['75thPercentile','99thPercentile','min','max','median']: r[ep][ks[3]][ks[2]][ks[-1].replace("Percentile", "")] = v return r def text(self, raise_error=True): r = self.__request(raise_error=raise_error) return r.text def prometheus(self): t = self.__request().text samples_by_name = defaultdict(list) for l in text_string_to_metric_families(t): for s in l.samples: samples_by_name[s[0]].append((s[1],s[2])) return samples_by_name def headers(self, raise_error=True): return self.__request(raise_error=raise_error).headers def cookies(self, raise_error=True): return self.__request(raise_error=raise_error).cookies def content_size(self, raise_error=True): return len(self.__request(raise_error=raise_error).content) def time(self, raise_error=True): return self.__request(raise_error=raise_error).elapsed.total_seconds() def code(self): return self.__request(raise_error=False).status_code if __name__ == '__main__': http = HttpWrapper(sys.argv[1], max_retries=3) print http.text() PKG@zmon_worker_monitor/builtins/plugins/redis_wrapper.worker_plugin[Core] Name = redis Module = redis_wrapper [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = value PKG)--Azmon_worker_monitor/builtins/plugins/sql_postgresql.worker_plugin[Core] Name = sql Module = sql_postgresql [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. 
;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY user = zmon pass = --secret-- PKG8Cp;zmon_worker_monitor/builtins/plugins/joblocks.worker_plugin[Core] Name = job_lock Module = joblocks [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = value cmdb_url = https://cmdb.example.com/PKG!'g}:}:1zmon_worker_monitor/builtins/plugins/checkldap.py#!/usr/bin/env python # -*- coding: utf-8 -*- import sys import ldap try: import ldapapi except: ldapapi = None import logging import time from zmon_worker_monitor.zmon_worker.errors import CheckError from ldap.dn import explode_dn from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial from zmon_worker_monitor import plugin_manager STATISTICS_OPERATIONS_TO_MONITOR = frozenset([ 'bind', 'unbind', 'search', 'modify', 'add', 'delete', 'extended', ]) STATISTICS_GAUGE_KEYS = frozenset([ 'threads_active', 'threads_max', 'connections_current', 'statistics_entries', 'waiters_read', 'waiters_write', 'connections_max_file_descriptors', ]) # rename some keys to make the names more friendly STATISTICS_FRIENDLY_KEY_NAMES = {'statistics_entries': 'entries', 'connections_max_file_descriptors': 'max_file_descriptors'} class LdapFactory(IFunctionFactoryPlugin): def __init__(self): super(LdapFactory, self).__init__() # fields from config self._ldapuser = None self._ldappass = None # fields to store dependencies: plugin depends on 1 other plugins self.counter_factory = None def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ self._ldapuser = conf['user'] self._ldappass = conf['pass'] def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ if not self.counter_factory: self.counter_factory = plugin_manager.get_plugin_obj_by_name('counter', 'Function') return propartial(LdapWrapper, user=self._ldapuser, password=self._ldappass, host=factory_ctx['host'], counter=self.counter_factory.create(factory_ctx)) class UnsupportedMethodException(Exception): pass class DuplicateBindException(Exception): pass class LdapWrapper(object): def __init__( self, host=None, tls=True, krb5=False, user='uid=nagios,ou=robots,ou=users,dc=example,dc=com', password='', timeout=60, logger=None, counter=None, ): '''expects ready to use "partial" for counter (CounterWrapper)''' self.logger = logger or logging.getLogger() self._counter = counter('') self.use_tls = tls self.use_krb5 = krb5 self.user_base_dn = ','.join(explode_dn(user)[1:]) self.user_filter = '(' + explode_dn(user)[0] + ')' self.user_attrib = ['dn'] # password auth self.bind_dn = user self.__password = password self.session = None self.host = host if self.host and len(self.host.split('.')) <= 1: # FIXME self.host += '.zalando' def _connect(self, ldapserver): if self.session: raise CheckError('LDAP Error: duplicate bind exception.') self.logger.debug('connect to %s', ldapserver) uri = ldapserver if not uri.startswith("ldap://") and not 
uri.startswith("ldaps://"): uri = 'ldap://{0}'.format(ldapserver) try: if self.use_krb5 and self.use_tls: self.logger.debug('sasl bind') self.session = ldapapi.Session(uri, tls=True) elif self.use_tls: self.logger.debug('simple bind') self.session = ldapapi.Session(uri, self.bind_dn, self.__password, tls=True) elif uri.startswith("ldaps://"): self.logger.debug('LDAPS + simple bind') self.session = ldapapi.Session(uri, self.bind_dn, self.__password, tls=False) else: raise CheckError('LDAP Error: unsupported method exception.') except CheckError: raise except Exception, e: raise CheckError('Error connecting to LDAP: {}'.format(e)), None, sys.exc_info()[2] def _search(self, base, fltr, attrs, scope=ldap.SCOPE_SUBTREE): return self.session.search(base, fltr, attrs, scope) def _disconnect(self): self.logger.debug('disconnect') try: self.session.disconnect() except Exception, e: raise CheckError('Error disconnecting to LDAP: {}'.format(e)), None, sys.exc_info()[2] self.session = None def _sync_state(self, ldapserver): base = 'dc=example,dc=com' fltr = '(objectclass=*)' attr = ['contextCSN'] self._connect(ldapserver) result = self._search(base, fltr, attr, scope=ldap.SCOPE_BASE) self._disconnect() return result[0][1]['contextCSN'] def _get_rid_to_url(self, ldapserver): '''Returns a dict for this query: % ldapsearch -b cn=config '(cn=config)' -H ldap://myserver olcServerID ''' rid2url = {} url2rid = {} base = 'cn=config' fltr = '(cn=config)' attr = ['olcServerID'] self._connect(ldapserver) res = self.session.conn.search_ext_s(base, ldap.SCOPE_BASE, fltr, attr) self._disconnect() for rid_url in res[0][1]['olcServerID']: rid, url = rid_url.split() rid = '%03d' % int(rid) rid2url[rid] = url url2rid[url] = rid return rid2url, url2rid def _get_timestamp_rid(self, csn): '''csn: 20140227093429.363252Z#000000#004#000000''' ts, _, rid, _ = csn.split('#') ts = ts.split('.')[0] # ts = datetime.datetime.strptime(ts, "%Y%m%d%H%M%S") ts = int(ts) return rid, ts def sync(self): '''Example: checkldap().sync() => [{'newest': 20140516151002, 'elapsed': 0.14442706108093262, 'ok': True, 'diff': 0, 'server': 'myserv'}, {'newest': 20140516151002, 'elapsed': 0.19423580169677734, 'ok': True, 'diff': 0, 'server': 'myserver'}, {'newest': 20140516151002, 'elapsed': 0.2617530822753906, 'ok': True, 'diff': 0, 'server': 'z-auth123.example'}, {'newest': 20140516151002, 'elapsed': 0.15635299682617188, 'ok': True, 'diff': 0, 'server': 'myserver'}, {'newest': 20140516151002, 'elapsed': 0.20283913612365723, 'ok': True, 'diff': 0, 'server': 'myserver'}] ''' try: rid2url, url2rid = self._get_rid_to_url(self.host) ldapservers = map(lambda url: url[7:], url2rid.keys()) return self._sync(ldapservers) except CheckError: raise except Exception, e: raise CheckError('{}'.format(e)), None, sys.exc_info()[2] def _sync(self, ldapservers): '''Returns a list of dict, where 'diff' is the difference to the 'newest' of the full list, 'newest' is the newest timestamp for the given 'server', 'ok' means LDAP state for current 'server' and 'elapsed' the runtime of that ldap request. 
Example dict: {'diff': 0, 'elapsed': 0.2969970703125, 'newest': 20140228135148, 'ok': True, 'server': 'myserver'} ''' if not ldapservers: return results = [] for ldapserver in ldapservers: try: start = time.time() csn_list = self._sync_state(ldapserver) rid2ts = {} rid_ts = map(lambda csn: self._get_timestamp_rid(csn), csn_list) newest = rid_ts[0][1] for rid, ts in rid_ts: rid2ts[rid] = ts if ts > newest: newest = ts elapsed = time.time() - start results.append({ 'server': ldapserver, 'ok': True, 'newest': newest, 'elapsed': elapsed, }) except ldap.LOCAL_ERROR: bind_type = 'simple bind' if self.use_krb5: bind_type = 'sasl bind' msg = 'Could not connect to {} via {}'.format(ldapserver, bind_type) self.logger.exception(msg) raise CheckError(msg) except ldap.CONFIDENTIALITY_REQUIRED: results.append({'ok': False, 'server': ldapserver}) newest = 0 for result in results: if result['ok']: if result['newest'] > newest: newest = result['newest'] for result in results: if result['ok']: result['diff'] = newest - result['newest'] return results def auth(self): try: start = time.time() self._connect(self.host) connect_elapsed = time.time() - start self._search(self.user_base_dn, self.user_filter, self.user_attrib) all_elapsed = time.time() - start search_elapsed = all_elapsed - connect_elapsed self._disconnect() return { 'ok': True, 'connect_time': connect_elapsed, 'search_time': search_elapsed, 'elapsed': all_elapsed, } except ldap.LOCAL_ERROR: bind_type = 'simple bind' if self.use_krb5: bind_type = 'sasl bind' msg = 'Could not connect to {} via {}'.format(self.host, bind_type) self.logger.exception(msg) raise CheckError(msg) except ldap.CONFIDENTIALITY_REQUIRED: return {'ok': False} except Exception, e: raise CheckError('Error authenticating to LDAP: {}'.format(e)), None, sys.exc_info()[2] @staticmethod def _split_monitor_dn(dn): ''' >>> LdapWrapper._split_monitor_dn('cn=Max File Descriptors,cn=Connections,cn=Monitor') ('connections', 'max_file_descriptors') ''' parts = dn.replace(' ', '_').split(',') return (parts[1])[3:].lower(), (parts[0])[3:].lower() def statistics_raw(self): '''collect statistics from OpenLDAP "Monitor" DB as a dict ldapsearch -b cn=Connections,cn=Monitor -H ldap://myserver '(monitorCounter=*)' '+' Example result:: { "connections_current": "51", "connections_max_file_descriptors": "65536", "connections_total": "26291", "operations_add_completed": "0", "operations_add_initiated": "0", "operations_bind_completed": "25423", "operations_bind_initiated": "25423", "operations_delete_completed": "0", "operations_delete_initiated": "0", "operations_extended_completed": "293", "operations_extended_initiated": "293", "operations_modify_completed": "91", "operations_modify_initiated": "91", "operations_search_completed": "22865", "operations_search_initiated": "22866", "operations_unbind_completed": "25233", "operations_unbind_initiated": "25233", "statistics_bytes": "122581936", "statistics_entries": "64039", "statistics_pdu": "112707", "statistics_referrals": "0", "waiters_read": "51", "waiters_write": "0" } ''' try: self._connect(self.host) data = {} # we need to use the internal "conn" attribute as the default _search is using paging which does not work for the "cn=Monitor" tree! 
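# Descriptive note on the three raw queries issued below, in order:
#   1) monitorCounter attributes under cn=Monitor (connection, statistics and waiter counters),
#   2) monitoredInfo under cn=Threads,cn=Monitor (active/max thread gauges),
#   3) monitorOpInitiated / monitorOpCompleted under cn=Operations,cn=Monitor (per-operation counters).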
result = self.session.conn.search_s('cn=Monitor', ldap.SCOPE_SUBTREE, '(objectClass=monitorCounterObject)', ['monitorCounter']) for dn, attr in result: category, counter = self._split_monitor_dn(dn) data['{}_{}'.format(category, counter)] = int(attr['monitorCounter'][0]) result = self.session.conn.search_s('cn=Threads,cn=Monitor', ldap.SCOPE_SUBTREE, '(&(objectClass=monitoredObject)(monitoredInfo=*))', ['monitoredInfo']) for dn, attr in result: category, key = self._split_monitor_dn(dn) if key in ('active', 'max'): data['{}_{}'.format(category, key)] = int(attr['monitoredInfo'][0]) result = self.session.conn.search_s('cn=Operations,cn=Monitor', ldap.SCOPE_SUBTREE, '(objectClass=monitorOperation)', ['monitorOpInitiated', 'monitorOpCompleted']) for dn, attr in result: category, op = self._split_monitor_dn(dn) if op in STATISTICS_OPERATIONS_TO_MONITOR: data['{}_{}_initiated'.format(category, op)] = int(attr['monitorOpInitiated'][0]) data['{}_{}_completed'.format(category, op)] = int(attr['monitorOpCompleted'][0]) self._disconnect() except CheckError: raise except Exception, e: raise CheckError('{}'.format(e)), None, sys.exc_info()[2] return data def statistics(self): '''uses raw statistics and computes counter values (i.e. e.g. operations per second) Example result:: { "connections_current": 74, "connections_per_sec": 24.1, "entries": 353540, "max_file_descriptors": 65536, "operations_add_per_sec": 0.0, "operations_bind_per_sec": 26.4, "operations_delete_per_sec": 0.0, "operations_extended_per_sec": 1.15, "operations_modify_per_sec": 0.0, "operations_search_per_sec": 20.66, "operations_unbind_per_sec": 24.1, "threads_active": 2, "threads_max": 16, "waiters_read": 73, "waiters_write": 0 } ''' _data = self.statistics_raw() data = {} for key, val in _data.items(): if key in STATISTICS_GAUGE_KEYS: data[STATISTICS_FRIENDLY_KEY_NAMES.get(key, key)] = val elif key == 'connections_total': data['connections_per_sec'] = round(self._counter.key(key).per_second(val), 2) elif key.startswith('operations_') and key.endswith('_initiated'): data[key.replace('_initiated', '_per_sec')] = round(self._counter.key(key).per_second(val), 2) # gc_count = round(self._counter.key('gcCount').per_second(gc_count), 2) return data PK,uG4-`-zmon_worker_monitor/builtins/plugins/time_.py#!/usr/bin/env python # -*- coding: utf-8 -*- from datetime import datetime from zmon_worker_monitor.zmon_worker.common.time_ import parse_timedelta, parse_datetime from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin, propartial class TimeFactory(IFunctionFactoryPlugin): def __init__(self): super(TimeFactory, self).__init__() def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ return def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: (dict) names available for Function instantiation :return: an object that implements a check function """ return propartial(TimeWrapper) class TimeWrapper(object): def __init__(self, spec='now', utc=False): now = (datetime.utcnow() if utc else datetime.now()) delta = parse_timedelta(spec) if delta: self.time = now + delta elif spec == 'now': self.time = now else: self.time = parse_datetime(spec) def __sub__(self, other): ''' >>> TimeWrapper('2014-01-01 01:01:25') - TimeWrapper('2014-01-01 01:01:01') 24.0 ''' return (self.time - other.time).total_seconds() def isoformat(self, sep=' '): return 
self.time.isoformat(sep) def format(self, fmt): ''' >>> TimeWrapper('2014-01-01 01:01').format('%Y-%m-%d') '2014-01-01' >>> TimeWrapper('-1m').format('%Y-%m-%d')[:3] '201' ''' return self.time.strftime(fmt) PKG,,<zmon_worker_monitor/builtins/plugins/sql_mysql.worker_plugin[Core] Name = mysql Module = sql_mysql [Documentation] Author = John Doe Version = 0.1 Website = https://github.com/zalando/zmon-worker Description = Builtin plugin that provides an http request check function. [Configuration] ;; The configuration section is optional, you can put here any parameters needed to configure the plugin. ;; Note that these values may are overridden by zmon's global config. ;; key = valueX ;; Note valueX will be overridden if Zmon's global config has: plugin.{plugin_name}.{key} = valueY user = nagios pass = --secret-- PKGl= max_retries: traceback = sys.exc_info()[2] raise Exception('Max retries exceeded. Internal error: {}'.format(e)), None, traceback time.sleep(ts + random.uniform(-ts, ts) * random_delta) retry += 1 ts *= 2 # duplicate waiting time return wrapper return decorator PKhGZCN1zmon_worker_monitor/zmon_worker/common/mathfun.py#!/usr/bin/env python # -*- coding: utf-8 -*- '''General math/aggregate functions ''' from functools import partial import functional import math def _percentile(N, percent, key=functional.id): """ Find the percentile of a list of values. @parameter N - is a list of values. Note N MUST BE already sorted. @parameter percent - a float value from 0.0 to 1.0. @parameter key - optional key function to compute value from each element of N. @return - the percentile of the values >>> percentile([0,1], 0.5) 0.5 >>> percentile([0,1,2], 0.9) 1.8 >>> percentile([], 0.9) is None True """ if not N: return None k = (len(N) - 1) * percent f = math.floor(k) c = math.ceil(k) if f == c: return key(N[int(k)]) d0 = key(N[int(f)]) * (c - k) d1 = key(N[int(c)]) * (k - f) return d0 + d1 # median is 50th percentile. 
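# For example, with the helpers defined just below: median([3, 1, 2]) == 2 and
# percentile([0, 1, 2], 0.9) == 1.8 (consistent with the _percentile doctest above).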
_median = partial(_percentile, percent=0.5) def median(results): return _median(sorted(results)) def percentile(results, percent): return _percentile(sorted(results), percent) def apply_aggregate_function(results, func, key=functional.id, **args): ''' >>> apply_aggregate_function([1,2,3], sum) 6 >>> apply_aggregate_function([{'a': 0}, {'a': 2}], _percentile, key=lambda x:x['a'], percent=0.9) 1.8 ''' return func(map(key, results), **args) def delta(results): ''' >>> delta([]) 0 >>> delta([0, 10]) 10 >>> delta([10, 0]) -10 ''' if not results: # no results => zero delta return 0 return results[-1] - results[0] def avg(results): ''' >>> avg([]) is None True >>> avg([0, 1]) 0.5 ''' if not results: return None return sum(results) * 1.0 / len(results) def first(results): ''' >>> first([1, 2]) 1 >>> first([]) is None True ''' return (results[0] if results else None) def _min(results): ''' >>> _min([2, 1, 3, 2]) 1 >>> _min([]) is None True ''' return (min(results) if results else None) def _max(results): ''' >>> _max([2, 1, 3, 2]) 3 >>> _max([]) is None True ''' return (max(results) if results else None) PK,uGn99/zmon_worker_monitor/zmon_worker/common/time_.py#!/usr/bin/env python # -*- coding: utf-8 -*- from datetime import timedelta, datetime import re TIME_UNITS = { 's': 'seconds', 'm': 'minutes', 'h': 'hours', 'd': 'days', } TIME_FORMATS = ['%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', '%Y-%m-%d'] TIMEZONE_OFFSET = re.compile(r'([+-])([0-9][0-9])(?::?([0-9][0-9]))?$') def parse_timedelta(s): ''' >>> parse_timedelta('bla') >>> parse_timedelta('1s').total_seconds() 1.0 >>> parse_timedelta('-2s').total_seconds() -2.0 >>> parse_timedelta('2m').total_seconds() 120.0 >>> parse_timedelta('1h').total_seconds() 3600.0 ''' if s.startswith('-'): s = s[1:] factor = -1 else: factor = 1 try: v = int(s[:-1]) u = s[-1] except: return None arg = TIME_UNITS.get(u) if arg: return factor * timedelta(**{arg: v}) return None def parse_datetime(s): ''' >>> parse_datetime('foobar') >>> parse_datetime('1983-10-12T23:30').isoformat(' ') '1983-10-12 23:30:00' >>> parse_datetime('1983-10-12 23:30:12').isoformat(' ') '1983-10-12 23:30:12' >>> parse_datetime('2014-05-05 17:40:44.100313').isoformat(' ') '2014-05-05 17:40:44.100313' >>> parse_datetime('2014-05-05 17:40:44.100313+01:00').isoformat(' ') '2014-05-05 16:40:44.100313' ''' s = s.replace('T', ' ') # calculate timezone data from date string, we'll parse it ourselves ('%z' is not supported on all platforms for strptime) match = TIMEZONE_OFFSET.search(s) if match: signum = int(match.group(1) + '1') hours = signum * int(match.group(2)) minutes = signum * int(match.group(3)) timezone_timedelta = timedelta(hours=hours, minutes=minutes) else: timezone_timedelta = timedelta() # remove timezone data from input string, if any. s = TIMEZONE_OFFSET.sub('', s) for fmt in TIME_FORMATS: try: return datetime.strptime(s, fmt) - timezone_timedelta except: pass return None PK,uGق  1zmon_worker_monitor/zmon_worker/tasks/stashacc.py#!/usr/bin/env python # -*- coding: utf-8 -*- try: from scmmanager import StashApi except: StashApi = None import os import yaml import logging from yaml.scanner import ScannerError class StashAccessor(object): def __init__(self, logger=None): self._logger = (logger if logger else _get_null_logger()) if StashApi: self.stash = StashApi() else: self.stash = None def set_logger(self, logger): self._logger = logger def get_stash_check_definitions(self, *repo_urls): ''' Downloads the check definitions found under the given stash repositories. 
Returns a list of dict corresponding to the check definitions ''' check_definitions = [] if self.stash: for repo_url in repo_urls: self._logger.info("Scanning Stash url: %s", repo_url) try: repo, check_path = self.stash.get_repo_from_scm_url(repo_url) except Exception: self._logger.exception('Bad configured stash repo: %s. Exception follows: ', repo_url) else: try: files = [f for f in self.stash.get_files(repo, check_path) if f.endswith('.yaml')] if not files: self._logger.warn('No check definitions found in secure repo: %s', repo_url) for check_file in files: self._logger.info("Reading file: %s", check_file) file_path = os.path.join(check_path, check_file) fd = self.stash.get_content(repo, file_path) try: check_definitions.append(yaml.safe_load(fd)) except ScannerError as e: self._logger.exception("Could not parse file %s/%s", check_path, check_file, e ) except Exception: self._logger.exception('Unexpected error when fetching info from stash: ') if repo_urls and not check_definitions: # StashApi returns empty results on failure self._logger.error('Stash error: Check definition download finished with empty results') raise Exception('Check definition download finished with empty results') return check_definitions def get_stash_commands(self, *repo_urls): ''' Returns a list of str corresponding to commands found in check_definitions under the given stash repositories ''' return set(cf['command'] for cf in self.get_stash_check_definitions(*repo_urls) if 'command' in cf) class NullHandler(logging.Handler): def emit(self, record): pass _null_logger = None def _get_null_logger(): global _null_logger if not _null_logger: handler = NullHandler() _null_logger = logging.getLogger('NullLogger') _null_logger.addHandler(handler) return _null_logger PK,uGDa1zmon_worker_monitor/zmon_worker/tasks/__init__.pyimport logging logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING) logging.getLogger('urllib3.connectionpool').setLevel(logging.WARNING) PKuGU!8zmon_worker_monitor/zmon_worker/tasks/notacelery_task.py#!/usr/bin/env python # -*- coding: utf-8 -*- import ast from inspect import isclass import __future__ from collections import Callable, Counter import socket from zmon_worker_monitor.zmon_worker.encoder import JsonDataEncoder from stashacc import StashAccessor from zmon_worker_monitor.zmon_worker.common.utils import async_memory_cache, with_retries from zmon_worker_monitor.zmon_worker.errors import * import zmon_worker_monitor.eventloghttp as eventlog import functools import itertools import json import logging import random from zmon_worker_monitor.redis_context_manager import RedisConnHandler import time import re import requests import sys import setproctitle from datetime import timedelta, datetime import urllib import pytz import threading import Queue from collections import defaultdict from bisect import bisect_left from zmon_worker_monitor.zmon_worker.common.time_ import parse_timedelta from zmon_worker_monitor.zmon_worker.notifications.hipchat import NotifyHipchat from zmon_worker_monitor.zmon_worker.notifications.slack import NotifySlack from zmon_worker_monitor.zmon_worker.notifications.mail import Mail from zmon_worker_monitor.zmon_worker.notifications.sms import Sms from zmon_worker_monitor.zmon_worker.notifications.hubot import Hubot from zmon_worker_monitor.zmon_worker.notifications.notification import BaseNotification from operator import itemgetter from timeperiod import in_period, InvalidFormat import functional from 
zmon_worker_monitor.zmon_worker.common import mathfun from zmon_worker_monitor import plugin_manager logger = logging.getLogger(__name__) # interval in seconds for sending metrics to graphite METRICS_INTERVAL = 15 STASH_CACHE_EXPIRATION_TIME = 3600 DEFAULT_CHECK_RESULTS_HISTORY_LENGTH = 20 TRIAL_RUN_RESULT_EXPIRY_TIME = 300 # we allow specifying condition expressions without using the "value" variable # the following pattern is used to check if "value" has to be prepended to the condition SIMPLE_CONDITION_PATTERN = re.compile(r'^[<>!=\[]|i[ns] ') GRAPHITE_REPLACE_KEYCHARS = re.compile(r'[./\s]') # round to microseconds ROUND_SECONDS_DIGITS = 6 JMX_CONFIG_FILE = 'jmxremote.password' KAIROS_ID_FORBIDDEN_RE = re.compile(r'[^a-zA-Z0-9\-_\.]') HOST_GROUP_PREFIX = re.compile(r'^([a-z]+)') INSTANCE_PORT_SUFFIX = re.compile(r':([0-9]+)$') EVENTS = { 'ALERT_STARTED': eventlog.Event(0x34001, ['checkId', 'alertId', 'value']), 'ALERT_ENDED': eventlog.Event(0x34002, ['checkId', 'alertId', 'value']), 'ALERT_ENTITY_STARTED': eventlog.Event(0x34003, ['checkId', 'alertId', 'value', 'entity']), 'ALERT_ENTITY_ENDED': eventlog.Event(0x34004, ['checkId', 'alertId', 'value', 'entity']), 'DOWNTIME_STARTED': eventlog.Event(0x34005, [ 'alertId', 'entity', 'startTime', 'endTime', 'userName', 'comment', ]), 'DOWNTIME_ENDED': eventlog.Event(0x34006, [ 'alertId', 'entity', 'startTime', 'endTime', 'userName', 'comment', ]), 'SMS_SENT': eventlog.Event(0x34007, ['alertId', 'entity', 'phoneNumber', 'httpStatus']), 'ACCESS_DENIED': eventlog.Event(0x34008, ['userName', 'entity']), } eventlog.register_all(EVENTS) Sms.register_eventlog_events(EVENTS) get_value = itemgetter('value') class ProtectedPartial(object): ''' Provides functools.partial functionality with one additional feature: if keyword arguments contain '__protected' key with list of arguments as value, the appropriate values will not be overwritten when calling the partial. This way we can prevent user from overwriting internal zmon parameters in check command. The protected key uses double underscore to prevent overwriting it, we reject all commands containing double underscores. ''' def __init__(self, func, *args, **kwargs): self.__func = func self.__partial_args = args self.__partial_kwargs = kwargs self.__protected = frozenset(kwargs.get('__protected', [])) self.__partial_kwargs.pop('__protected', None) def __call__(self, *args, **kwargs): new_kwargs = self.__partial_kwargs.copy() new_kwargs.update((k, v) for (k, v) in kwargs.iteritems() if k not in self.__protected) return self.__func(*self.__partial_args + args, **new_kwargs) def propartial(func, *args, **kwargs): ''' >>> propartial(int, base=2)('100') 4 >>> propartial(int, base=2)('100', base=16) 256 >>> propartial(int, base=2, __protected=['base'])('100', base=16) 4 ''' return ProtectedPartial(func, *args, **kwargs) normalize_kairos_id = propartial(KAIROS_ID_FORBIDDEN_RE.sub, '_') orig_process_title = None def setp(check_id, entity, msg): global orig_process_title if not orig_process_title: try: orig_process_title = setproctitle.getproctitle().split(' ')[2].split(':')[0].split('.')[0] except: orig_process_title = 'p34XX' setproctitle.setproctitle('zmon-worker.{} check {} on {} {} {}'.format(orig_process_title, check_id, entity, msg, datetime.now().strftime('%H:%M:%S.%f'))) def flatten(structure, key='', path='', flattened=None): path = str(path) key = str(key) if flattened is None: flattened = {} if type(structure) not in (dict, list): flattened[((path + '.' 
if path else '')) + key] = structure elif isinstance(structure, list): pass else: for new_key, value in structure.items(): flatten(value, new_key, '.'.join(filter(None, [path, key])), flattened) return flattened def timed(f): '''Decorator to "time" a function execution. Wraps the function's result in a new dict. >>> timed(lambda: 1)()['value'] 1 ''' def wrapper(*args, **kwargs): start = time.time() res = f(*args, **kwargs) delta = time.time() - start # round and use short keys as we will serialize the whole stuff as JSON return {'value': res, 'ts': round(start, ROUND_SECONDS_DIGITS), 'td': round(delta, ROUND_SECONDS_DIGITS)} return wrapper def _get_entity_url(entity): ''' >>> _get_entity_url({}) >>> _get_entity_url({'url': 'fesn01:39820'}) 'http://fesn01:39820' >>> _get_entity_url({'host': 'fesn01'}) 'http://fesn01' ''' if 'url' in entity: if entity['url'].startswith('http://') or entity['url'].startswith('https://'): return entity['url'] return 'http://' + entity['url'] if 'host' in entity: return 'http://' + entity['host'] return None def _get_jmx_port(entity): ''' >>> _get_jmx_port({'instance': '9620'}) 49620 ''' if 'instance' in entity: return int('4' + entity['instance']) return None def _get_shards(entity): ''' >>> _get_shards({'shards': {'shard1': 'host1/db1'}}) {'shard1': 'host1/db1'} >>> _get_shards({'service_name': 'db'}) {'db': 'db/postgres'} >>> _get_shards({'service_name': 'db', 'port': 1234}) {'db': 'db:1234/postgres'} >>> _get_shards({'service_name': 'db:1234', 'port': 1234}) {'db:1234': 'db:1234/postgres'} >>> _get_shards({'service_name': 'db:1234'}) {'db:1234': 'db:1234/postgres'} >>> _get_shards({'service_name': 'db-1234', 'port': 1234}) {'db-1234': 'db-1234:1234/postgres'} >>> _get_shards({'project': 'shop'}) ''' if 'shards' in entity: return entity['shards'] if 'service_name' in entity: return {entity['service_name']: ('{service_name}:{port}/postgres'.format(**entity) if 'port' in entity and not entity['service_name'].endswith(':{}'.format(entity['port' ])) else '{}/postgres'.format(entity['service_name']))} return None def entity_values(con, check_id, alert_id,count=1): return map(get_value, entity_results(con, check_id, alert_id, count)) def entity_results(con, check_id, alert_id, count=1): all_entities = con.hkeys('zmon:alerts:{}:entities'.format(alert_id)) all_results = [] for entity_id in all_entities: results = get_results(con, check_id, entity_id, count) all_results.extend(results) return all_results def capture(value=None, captures=None, **kwargs): ''' >>> capture(1, {}) 1 >>> captures={}; capture(1, captures); captures 1 {'capture_1': 1} >>> captures={'capture_1': 1}; capture(2, captures); sorted(captures.items()) 2 [('capture_1', 1), ('capture_2', 2)] >>> captures={}; capture(captures=captures, mykey=1); captures 1 {'mykey': 1} >>> p = functools.partial(capture, captures={}); p(1); p(a=1) 1 1 ''' if kwargs: if len(kwargs) > 1: raise ValueError('Only one named capture supported') key, value = kwargs.items()[0] else: i = 1 while True: key = 'capture_{}'.format(i) if key not in captures: break i += 1 captures[key] = value return value def _parse_alert_parameter_value(data): ''' >>> _parse_alert_parameter_value({'value': 10}) 10 >>> _parse_alert_parameter_value({'value': '2014-07-03T22:00:00.000Z', 'comment': "desc", "type": "date"}) datetime.date(2014, 7, 3) ''' allowed_types = { 'int': int, 'float': float, 'str': str, 'bool': bool, 'datetime': lambda json_date: datetime.strptime(json_date, '%Y-%m-%dT%H:%M:%S.%fZ'), 'date': lambda json_date: 
datetime.strptime(json_date, '%Y-%m-%dT%H:%M:%S.%fZ').date(), } value = data['value'] type_name = data.get('type') if type_name: try: value = allowed_types[type_name](value) except Exception: raise Exception('Attempted wrong type cast <{}> in alert parameters'.format(type_name)) return value def _inject_alert_parameters(alert_parameters, ctx): ''' Inject alert parameters into the execution context dict (ctx) ''' params_name = 'params' params = {} if alert_parameters: for apname, apdata in alert_parameters.items(): if apname in ctx: raise Exception('Parameter name: %s clashes in context', apname) value = _parse_alert_parameter_value(apdata) params[apname] = value ctx[apname] = value # inject parameter into context # inject the whole parameters map so that user can iterate over them in the alert condition if params_name not in ctx: ctx[params_name] = params def alert_series(f, n, con, check_id, entity_id): """ evaluate given function on the last n check results and return true if the "alert" function f returns true for all values""" vs = get_results(con, check_id, entity_id, n) active_count = 0 exception_count = 0 for v in vs: # counting exceptions thrown during eval as alert being active for that interval try: v = v["value"] r = 1 if f(v) else 0 x =0 except: r = 1 x = 1 active_count += r exception_count += x if exception_count == n: raise Exception("All alert evaluations failed!") # activating alert if not enough value found (this should only affect starting period) return n == active_count or len(vs)>> plugin_manager.collect_plugins(); 'timeseries_median' in build_condition_context(None, 1, 1, {'id': '1'}, {}, {}) True >>> 'timeseries_percentile' in build_condition_context(None, 1, 1, {'id': '1'}, {}, {}) True ''' history_factory = plugin_manager.get_plugin_obj_by_name('history', 'Function') ctx = build_default_context() ctx['capture'] = functools.partial(capture, captures=captures) ctx['entity_results'] = functools.partial(entity_results, con=con, check_id=check_id, alert_id=alert_id) ctx['entity_values'] = functools.partial(entity_values, con=con, check_id=check_id, alert_id=alert_id) ctx['entity'] = dict(entity) ctx['history'] = history_factory.create({ 'check_id': check_id, 'entity_id_for_kairos': normalize_kairos_id(entity['id']) }) ctx['value_series'] = functools.partial(get_results_user, con=con, check_id=check_id, entity_id=entity['id']) ctx['alert_series'] = functools.partial(alert_series, con=con, check_id=check_id, entity_id=entity['id']) _inject_alert_parameters(alert_parameters, ctx) for f in ( mathfun.avg, mathfun.delta, mathfun.median, mathfun.percentile, mathfun.first, mathfun._min, mathfun._max, sum, ): name = f.__name__ if name.startswith('_'): name = name[1:] ctx['timeseries_' + name] = functools.partial(_apply_aggregate_function_for_time, con=con, func=f, check_id=check_id, entity_id=entity['id'], captures=captures) return ctx def _time_slice(time_spec, results): ''' >>> _time_slice('1s', []) [] >>> _time_slice('1s', [{'ts': 0, 'value': 0}, {'ts': 1, 'value': 10}]) [{'ts': 0, 'value': 0}, {'ts': 1, 'value': 10}] >>> _time_slice('2s', [{'ts': 123.6, 'value': 10}, {'ts': 123, 'value': 0}, {'ts': 121, 'value': -10}]) [{'ts': 123, 'value': 0}, {'ts': 123.6, 'value': 10}] ''' if len(results) < 2: # not enough values to calculate anything return results get_ts = itemgetter('ts') results.sort(key=get_ts) keys = map(get_ts, results) td = parse_timedelta(time_spec) last = results[-1] needle = last['ts'] - td.total_seconds() idx = bisect_left(keys, needle) if idx == len(results): 
# timerange exceeds range of results return results return results[idx:] def _get_results_for_time(con, check_id, entity_id, time_spec): results = get_results(con, check_id, entity_id, DEFAULT_CHECK_RESULTS_HISTORY_LENGTH) return _time_slice(time_spec, results) def _apply_aggregate_function_for_time( time_spec, con, func, check_id, entity_id, captures, key=functional.id, **args ): results = _get_results_for_time(con, check_id, entity_id, time_spec) ret = mathfun.apply_aggregate_function(results, func, key=functional.compose(key, get_value), **args) # put function result in our capture dict for debugging # e.g. captures["delta(5m)"] = 13.5 captures['{}({})'.format(func.__name__, time_spec)] = ret return ret def _build_notify_context(alert): return { 'send_mail': functools.partial(Mail.send, alert), 'send_email': functools.partial(Mail.send, alert), 'send_sms': functools.partial(Sms.send, alert), 'notify_hubot': functools.partial(Hubot.notify, alert), 'send_hipchat': functools.partial(NotifyHipchat.send, alert), 'send_slack': functools.partial(NotifySlack.send, alert) } def _prepare_condition(condition): '''function to prepend "value" to condition if necessary >>> _prepare_condition('>0') 'value >0' >>> _prepare_condition('["a"]>0') 'value ["a"]>0' >>> _prepare_condition('in (1, 2, 3)') 'value in (1, 2, 3)' >>> _prepare_condition('value>0') 'value>0' >>> _prepare_condition('a in (1, 2, 3)') 'a in (1, 2, 3)' ''' if SIMPLE_CONDITION_PATTERN.match(condition): # short condition format, e.g. ">=3" return 'value {}'.format(condition) else: # condition is more complex, e.g. "value > 3 and value < 10" return condition class PeriodicBufferedAction(object): def __init__(self, action, action_name=None, retries=5, t_wait=10, t_random_fraction=0.5): self._stop = True self.action = action self.action_name = action_name if action_name else (action.func_name if hasattr(action, 'func_name') else (action.__name__ if hasattr(action, '__name__') else None)) self.retries = retries self.t_wait = t_wait self.t_rand_fraction = t_random_fraction self._queue = Queue.Queue() self._thread = threading.Thread(target=self._loop) self._thread.daemon = True def start(self): self._stop = False self._thread.start() def stop(self): self._stop = True def is_active(self): return not self._stop def get_time_randomized(self): return self.t_wait * (1 + random.uniform(-self.t_rand_fraction, self.t_rand_fraction)) def enqueue(self, data, count=0): elem = { 'data': data, 'count': count, # 'time': time.time() } try: self._queue.put_nowait(elem) except Queue.Full: logger.exception('Fatal Error: is worker out of memory? Details: ') def _collect_from_queue(self): elem_list = [] empty = False while not empty and not self._stop: try: elem_list.append(self._queue.get_nowait()) except Queue.Empty: empty = True return elem_list def _loop(self): t_last = time.time() t_wait_last = self.get_time_randomized() while not self._stop: if time.time() - t_last >= t_wait_last: elem_list = self._collect_from_queue() try: if elem_list: self.action([e['data'] for e in elem_list]) except Exception as e: logger.error('Error executing action %s: %s', self.action_name, e) for elem in elem_list: if elem['count'] < self.retries: self.enqueue(elem['data'], count=elem['count']+1) else: logger.error('Error: Maximum retries reached for action %s. 
Dropping data: %s ', self.action_name, elem['data']) finally: t_last = time.time() t_wait_last = self.get_time_randomized() else: time.sleep(0.2) # so loop is responsive to stop commands def _log_event(event_name, alert, result, entity=None): params = {'checkId': alert['check_id'], 'alertId': alert['id'], 'value': result['value']} if entity: params['entity'] = entity eventlog.log(EVENTS[event_name].id, **params) def _convert_captures(worker_name, alert_id, entity_id, timestamp, captures): ''' >>> _convert_captures('p0.h', 1, 'e1', 1, {'c0': 'error'}) [] >>> _convert_captures('p0.h', 1, 'e1', 1, {'c1': '23.4'}) [('p0_h.alerts.1.e1.captures.c1', 23.4, 1)] >>> _convert_captures('p0.h', 1, 'e1', 1, {'c2': 12}) [('p0_h.alerts.1.e1.captures.c2', 12.0, 1)] >>> _convert_captures('p0.h', 1, 'e1', 1, {'c3': {'c31': '42'}}) [('p0_h.alerts.1.e1.captures.c3.c31', 42.0, 1)] >>> _convert_captures('p0.h', 1, 'e1', 1, {'c4': {'c41': 'error'}}) [] >>> _convert_captures('p0.h', 1, 'e .1/2', 1, {'c 1/2': '23.4'}) [('p0_h.alerts.1.e__1_2.captures.c_1_2', 23.4, 1)] >>> _convert_captures('p0.h', 1, 'e1', 1, {'c3': {'c 3.1/': '42'}}) [('p0_h.alerts.1.e1.captures.c3.c_3_1_', 42.0, 1)] ''' result = [] key = '{worker_name}.alerts.{alert_id}.{entity_id}.captures.{capture}' safe_worker_name = GRAPHITE_REPLACE_KEYCHARS.sub('_', worker_name) safe_entity_id = GRAPHITE_REPLACE_KEYCHARS.sub('_', entity_id) for capture, value in captures.iteritems(): safe_capture = GRAPHITE_REPLACE_KEYCHARS.sub('_', capture) if isinstance(value, dict): for inner_capture, inner_value in value.iteritems(): try: v = float(inner_value) except (ValueError, TypeError): continue safe_inner_capture = GRAPHITE_REPLACE_KEYCHARS.sub('_', inner_capture) result.append(('{}.{}'.format(key.format(worker_name=safe_worker_name, alert_id=alert_id, entity_id=safe_entity_id, capture=safe_capture), safe_inner_capture), v, timestamp)) else: try: v = float(value) except (ValueError, TypeError): continue result.append((key.format(worker_name=safe_worker_name, alert_id=alert_id, entity_id=safe_entity_id, capture=safe_capture), v, timestamp)) return result def evaluate_condition(val, condition, **ctx): ''' >>> evaluate_condition(0, '>0') False >>> evaluate_condition(1, '>0') True >>> evaluate_condition(1, 'delta("5m")<-10', delta=lambda x:1) False >>> evaluate_condition({'a': 1}, '["a"]>10') False ''' return safe_eval(_prepare_condition(condition), eval_source='', value=val, **ctx) class InvalidEvalExpression(Exception): pass class MalformedCheckResult(Exception): def __init__(self, msg): Exception.__init__(self, msg) class Try(Callable): def __init__(self, try_call, except_call, exc_cls=Exception): self.try_call = try_call self.except_call = except_call self.exc_cls = exc_cls def __call__(self, *args): try: return self.try_call() except self.exc_cls, e: return self.except_call(e) def get_results_user(count=1, con=None, check_id=None, entity_id=None): return map(lambda x: x["value"], get_results(con, check_id, entity_id, count)) def get_results(con, check_id, entity_id, count=1): r = map(json.loads, con.lrange('zmon:checks:{}:{}'.format(check_id, entity_id), 0, count - 1)) for x in r: x.update({"entity_id": entity_id}) return r def avg(sequence): ''' >>> avg([]) 0 >>> avg([1, 2, 3]) 2.0 >>> avg([2, 3]) 2.5 ''' l = len(sequence) * 1.0 return (sum(sequence) / l if l else 0) def empty(v): ''' >>> empty([]) True >>> empty([1]) False ''' return not bool(v) def build_default_context(): return { 'abs': abs, 'all': all, 'any': any, 'avg': avg, 'basestring': basestring, 
'bin': bin, 'bool': bool, 'chain': itertools.chain, 'chr': chr, 'Counter': Counter, 'dict': dict, 'divmod': divmod, 'Exception': Exception, 'empty': empty, 'enumerate': enumerate, 'False': False, 'filter': filter, 'float': float, 'groupby': itertools.groupby, 'hex': hex, 'int': int, 'isinstance': isinstance, 'json': json.loads, 'len': len, 'list': list, 'long': long, 'map': map, 'max': max, 'min': min, 'normalvariate': random.normalvariate, 'oct': oct, 'ord': ord, 'pow': pow, 'range': range, 'reduce': functools.reduce, 'reversed': reversed, 'round': round, 'set': set, 'sorted': sorted, 'str': str, 'sum': sum, 'timestamp': time.time, 'True': True, 'Try': Try, 'tuple': tuple, 'unichr': unichr, 'unicode': unicode, 'xrange': xrange, 'zip': zip, } def check_ast_node_is_safe(node): ''' Check that the ast node does not contain any system attribute calls as well as exec call (not to construct the system attribute names with strings). eval() function calls should not be a problem, as it is hopefuly not exposed in the globals and __builtins__ >>> node = ast.parse('def __call__(): return 1') >>> node == check_ast_node_is_safe(node) True >>> check_ast_node_is_safe(ast.parse('def m(): return ().__class__')) Traceback (most recent call last): ... InvalidEvalExpression: alert definition should not try to access hidden attributes (for example '__class__') >>> check_ast_node_is_safe(ast.parse('def horror(g): exec "exploit = ().__" + "class" + "__" in g')) Traceback (most recent call last): ... InvalidEvalExpression: alert definition should not try to execute arbitrary code ''' for n in ast.walk(node): if isinstance(n, ast.Attribute): if n.attr.startswith('__'): raise InvalidEvalExpression("alert definition should not try to access hidden attributes (for example '__class__')" ) elif isinstance(n, ast.Exec): raise InvalidEvalExpression('alert definition should not try to execute arbitrary code') return node def safe_eval(expr, eval_source='', **kwargs): ''' Safely execute expr. For now expr can be only one python expression, a function definition or a callable class definition. If the expression is returning a callable object (like lambda function or Try() object) it will be called and a result of the call will be returned. If a result of calling of the defined function or class are returning a callable object it will not be called. As access to the hidden attributes is protected by check_ast_node_is_safe() method we should not have any problem with valnarabilites defined here: Link: http://nedbatchelder.com/blog/201206/eval_really_is_dangerous.html TODO: implement compile object cache >>> safe_eval('value > 0', value=1) True >>> safe_eval('def m(): return value', value=10) 10 >>> safe_eval('def m(param): return value', value=10) Traceback (most recent call last): ... TypeError: m() takes exactly 1 argument (0 given) >>> safe_eval('lambda: value', value=10) 10 >>> result = safe_eval('def m(): print value', value=10) Traceback (most recent call last): ... SyntaxError: invalid syntax >>> result = safe_eval('print value', value=10) Traceback (most recent call last): ... SyntaxError: invalid syntax >>> safe_eval('def m(): return lambda: value', value=10) #doctest: +ELLIPSIS at ...> >>> safe_eval('error = value', value=10) Traceback (most recent call last): ... InvalidEvalExpression: alert definition can contain a python expression, a function call or a callable class definition >>> safe_eval('def m(): return value.__class__', value=10) Traceback (most recent call last): ... 
InvalidEvalExpression: alert definition should not try to access hidden attributes (for example '__class__') >>> safe_eval(""" ... class CallableClass(object): ... ... def get_value(self): ... return value ... ... def __call__(self): ... return self.get_value() ... """, value=10) 10 >>> safe_eval(""" ... class NotCallableClass(object): ... ... def get_value(self): ... return value ... ... def call(self): # this is not a callable class ... return self.get_value() ... """, value=10) Traceback (most recent call last): ... InvalidEvalExpression: alert definition should contain a callable class definition (missing __call__ method?) >>> safe_eval(""" ... def firstfunc(): ... return value ... ... value > 0 ... ... """, value=10) Traceback (most recent call last): ... InvalidEvalExpression: alert definition should contain only one python expression, a function call or a callable class definition ''' g = {'__builtins__': {}, 'object': object, '__name__': __name__} # __builtins__ should be masked away to disable builtin functions # object is needed if the NewStyle class is being created # __name__ is needed to be able to complie a class g.update(kwargs) node = compile(expr, eval_source, 'exec', ast.PyCF_ONLY_AST | __future__.CO_FUTURE_PRINT_FUNCTION) node = check_ast_node_is_safe(node) body = node.body if body and len(body) == 1: x = body[0] if isinstance(x, ast.FunctionDef) or isinstance(x, ast.ClassDef): cc = compile(node, eval_source, 'exec') # can be nicely cached v = {} exec (cc, g, v) if len(v) == 1: c = v.itervalues().next() if isclass(c): # we need a class instance and not the class itself c = c() if callable(c): return c() # if a function will return another callable, we will not call it else: raise InvalidEvalExpression('alert definition should contain a callable class definition (missing __call__ method?)' ) else: raise InvalidEvalExpression('alert definition should contain only one function or one callable class definition' ) elif isinstance(x, ast.Expr): cc = compile(expr, eval_source, 'eval', __future__.CO_FUTURE_PRINT_FUNCTION) # can be nicely cached r = eval(cc, g) if callable(r): # Try() returns callable that should be executed return r() else: return r else: raise InvalidEvalExpression('alert definition can contain a python expression, a function call or a callable class definition' ) else: raise InvalidEvalExpression('alert definition should contain only one python expression, a function call or a callable class definition' ) class NotaZmonTask(object): abstract = True _host = 'localhost' _port = 6379 _secure_queue = 'zmon:queue:secure' _db = 0 _con = None _graphite = None _counter = Counter() _captures_local = [] _last_metrics_sent = 0 _last_captures_sent = 0 _logger = None _logfile = None _loglevel = logging.DEBUG _zmon_url = None _worker_name = None _queues = None _stash = None _stash_cmds = None _safe_repositories = [] _is_secure_worker = True _timezone = None _account = None _team = None _dataservice_url = None _dataservice_poster = None _plugin_category = 'Function' _plugins = [] _function_factories = {} _zmon_actuator_checkid = None @classmethod def configure(cls, config): try: #configure RedisConnHandler RedisConnHandler.configure(**config) except KeyError: logger.exception('Error creating connection: ') raise #cls._loglevel = (logging.getLevelName(config['loglevel']) if 'loglevel' in config else logging.INFO) cls._logfile = config.get('logfile') cls._soap_config = {k: v for k, v in config.items() if k.startswith('soap.service')} cls._zmon_url = config.get('zmon.url') 
cls._queues = config.get('zmon.queues', {}).get('local') cls._safe_repositories = sorted(config.get('safe_repositories', [])) cls._zmon_actuator_checkid = config.get('zmon.actuator.checkid', None) cls._logger = cls.get_configured_logger() cls.perload_stash_commands() cls._is_secure_worker = config.get('worker.is_secure') cls._timezone = pytz.timezone('Europe/Berlin') cls._account = config.get('account') cls._team = config.get('team') cls._dataservice_url = config.get('dataservice.url') if cls._dataservice_url: # start action loop for sending reports to dataservice cls._logger.info("Enabling data service: {}".format(cls._dataservice_url)) cls._dataservice_poster = PeriodicBufferedAction(cls.send_to_dataservice, retries=10, t_wait=5) cls._dataservice_poster.start() cls._plugins = plugin_manager.get_plugins_of_category(cls._plugin_category) # store function factories from plugins in a dict by name cls._function_factories = {p.name: p.plugin_object for p in cls._plugins} def __init__(self): self.task_context = None self._cmds_first_accessed = False @classmethod def is_secure_worker(cls): return cls._is_secure_worker @classmethod def perload_stash_commands(cls): cls._stash = StashAccessor(cls.get_configured_logger()) if cls.is_secure_worker(): try: cls._stash_cmds = cls._stash.get_stash_commands(*cls._safe_repositories) cls._logger.info('Loaded %d commands from stash secure repos', len(cls._stash_cmds)) except Exception: cls._logger.exception('Error loading stash commands: ') @async_memory_cache.cache_on_arguments(namespace='zmon-worker', expiration_time=STASH_CACHE_EXPIRATION_TIME) @with_retries(max_retries=3, delay=10) def load_stash_commands(self, repositories): if not self._cmds_first_accessed: # ugly but needed to stop celery from refreshing the cache when task process is forked self._cmds_first_accessed = True return self._stash_cmds else: return self._stash.get_stash_commands(*repositories) @classmethod def get_configured_logger(cls): if not cls._logger: cls._logger = logger return cls._logger @property def con(self): self._con = RedisConnHandler.get_instance().get_conn() BaseNotification.set_redis_con(self._con) return self._con @property def logger(self): return self.get_configured_logger() @property def worker_name(self): if not self._worker_name: self._worker_name = 'p{}.{}'.format('local', socket.gethostname()) return self._worker_name def get_redis_host(self): return RedisConnHandler.get_instance().get_parsed_redis().hostname def get_redis_port(self): return RedisConnHandler.get_instance().get_parsed_redis().port def send_metrics(self): now = time.time() if now > self._last_metrics_sent + METRICS_INTERVAL: p = self.con.pipeline() p.sadd('zmon:metrics', self.worker_name) for key, val in self._counter.items(): p.incrby('zmon:metrics:{}:{}'.format(self.worker_name, key), val) p.set('zmon:metrics:{}:ts'.format(self.worker_name), now) p.execute() self._counter.clear() self._last_metrics_sent = now self.logger.info('Send metrics, end storing metrics in redis count: %s, duration: %.3fs', len(self._counter), time.time() - now) @classmethod def send_to_dataservice(cls, check_results, timeout=10, method='PUT'): http_req = {'PUT': requests.put, 'POST': requests.post, 'GET': requests.get} headers = {'content-type': 'application/json'} team = cls._team if cls._team is not None else '' account = cls._account if cls._account is not None else '' try: # group check_results by check_id results_by_id = defaultdict(list) for cr in check_results: results_by_id[cr['check_id']].append(cr) # make separate 
posts per check_id for check_id, results in results_by_id.items(): url = '{url}/{account}/{check_id}/'.format(url=cls._dataservice_url.rstrip('/'), account=urllib.quote(account), check_id=check_id) worker_result = { 'team': team, 'account': account, 'results': results, } r = http_req[method](url, data=json.dumps(worker_result), timeout=timeout, headers=headers) if r.status_code != requests.codes.ok: raise Exception('http request to {} got status_code={}'.format(url, r.status_code)) except Exception: logger.exception('Error in dataservice post: ') raise def check_and_notify(self, req, alerts, task_context=None): self.task_context = task_context start_time = time.time() # soft_time_limit = req['interval'] check_id = req['check_id'] entity_id = req['entity']['id'] try: val = self.check(req) #TODO: need to support soft and hard time limits soon # except SoftTimeLimitExceeded, e: # self.logger.info('Check request with id %s on entity %s exceeded soft time limit', check_id, # entity_id) # # PF-3685 It might happen that this exception was raised after sending a command to redis, but before receiving # # a response. In this case, the connection object is "dirty" and when the same connection gets taken out of the # # pool and reused, it'll throw an exception in redis client. # self.con.connection_pool.disconnect() # notify(check_and_notify, {'ts': start_time, 'td': soft_time_limit, 'value': str(e)}, req, alerts, # force_alert=True) except CheckError, e: # self.logger.warn('Check failed for request with id %s on entity %s. Output: %s', check_id, entity_id, str(e)) self.notify({'ts': start_time, 'td': time.time() - start_time, 'value': str(e), 'worker': self.worker_name, 'exc': 1}, req, alerts, force_alert=True) except SecurityError, e: self.logger.exception('Security exception in request with id %s on entity %s', check_id, entity_id) self.notify({'ts': start_time, 'td': time.time() - start_time, 'value': str(e), 'worker': self.worker_name, 'exc': 1}, req, alerts, force_alert=True) except Exception, e: # self.logger.exception('Check request with id %s on entity %s threw an exception', check_id, entity_id) # PF-3685 Disconnect on unknown exceptions: we don't know what actually happened, it might be that redis # connection is dirty. CheckError exception is "safe", it's thrown by the worker whenever the check returns a # different response than expected, the user doesn't have access to the checked entity or there's an error in # check's parameters. 
self.con.connection_pool.disconnect() self.notify({'ts': start_time, 'td': time.time() - start_time, 'value': str(e), 'worker': self.worker_name, 'exc': 1}, req, alerts, force_alert=True) else: self.notify(val, req, alerts) def trial_run(self, req, alerts, task_context=None): self.task_context = task_context start_time = time.time() # soft_time_limit = req['interval'] entity_id = req['entity']['id'] try: val = self.check_for_trial_run(req) #TODO: need to support soft and hard time limits soon # except SoftTimeLimitExceeded, e: # trial_run.logger.info('Trial run on entity %s exceeded soft time limit', entity_id) # trial_run.con.connection_pool.disconnect() # notify_for_trial_run(trial_run, {'ts': start_time, 'td': soft_time_limit, 'value': str(e)}, req, alerts, # force_alert=True) except InsufficientPermissionsError, e: self.logger.info('Access denied for user %s to run check on %s', req['created_by'], entity_id) eventlog.log(EVENTS['ACCESS_DENIED'].id, userName=req['created_by'], entity=entity_id) self.notify_for_trial_run({'ts': start_time, 'td': time.time() - start_time, 'value': str(e)}, req, alerts, force_alert=True) except CheckError, e: self.logger.warn('Trial run on entity %s failed. Output: %s', entity_id, str(e)) self.notify_for_trial_run({'ts': start_time, 'td': time.time() - start_time, 'value': str(e)}, req, alerts, force_alert=True) except Exception, e: self.logger.exception('Trial run on entity %s threw an exception', entity_id) self.con.connection_pool.disconnect() self.notify_for_trial_run({'ts': start_time, 'td': time.time() - start_time, 'value': str(e)}, req, alerts, force_alert=True) else: self.notify_for_trial_run(val, req, alerts) def cleanup(self, *args, **kwargs): self.task_context = kwargs.get('task_context') p = self.con.pipeline() p.smembers('zmon:checks') p.smembers('zmon:alerts') check_ids, alert_ids = p.execute() for check_id in kwargs.get('disabled_checks', {}): self._cleanup_check(p, check_id) for alert_id in kwargs.get('disabled_alerts', {}): self._cleanup_alert(p, alert_id) for check_id in check_ids: if check_id in kwargs.get('check_entities', {}): redis_entities = self.con.smembers('zmon:checks:{}'.format(check_id)) check_entities = set(kwargs['check_entities'][check_id]) # If it happens that we remove all entities for given check, we should remove all the things. if not check_entities: p.srem('zmon:checks', check_id) p.delete('zmon:checks:{}'.format(check_id)) for entity in redis_entities: p.delete('zmon:checks:{}:{}'.format(check_id, entity)) else: self._cleanup_common(p, 'checks', check_id, redis_entities - check_entities) else: self._cleanup_check(p, check_id) for alert_id in alert_ids: if alert_id in kwargs.get('alert_entities', {}): # Entities that are in the alert state. redis_entities = self.con.smembers('zmon:alerts:{}'.format(alert_id)) alert_entities = set(kwargs['alert_entities'][alert_id]) # If it happens that we remove all entities for given alert, we should remove all the things. if not alert_entities: p.srem('zmon:alerts', alert_id) p.delete('zmon:alerts:{}'.format(alert_id)) p.delete('zmon:alerts:{}:entities'.format(alert_id)) for entity in redis_entities: p.delete('zmon:alerts:{}:{}'.format(alert_id, entity)) p.delete('zmon:notifications:{}:{}'.format(alert_id, entity)) else: self._cleanup_common(p, 'alerts', alert_id, redis_entities - alert_entities) # All entities matching given alert definition. 
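# Alert-state key layout manipulated by cleanup() and notify() (a hedged sketch of the keys
# used in this class; placeholders in angle brackets):
#   zmon:alerts                                 set of alert definition ids with at least one entity in alert
#   zmon:alerts:<alert-id>                      set of entity ids currently in alert
#   zmon:alerts:<alert-id>:<entity-id>          JSON of the last evaluation result for that pair
#   zmon:alerts:<alert-id>:entities             hash of entity id -> captures (JSON)
#   zmon:notifications:<alert-id>:<entity-id>   hash of notification command -> next allowed send time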
all_entities = set(self.con.hkeys('zmon:alerts:{}:entities'.format(alert_id))) for entity in all_entities - alert_entities: self.logger.info('Removing entity %s from hash %s', entity, 'zmon:alerts:{}:entities'.format(alert_id)) p.hdel('zmon:alerts:{}:entities'.format(alert_id), entity) p.delete('zmon:notifications:{}:{}'.format(alert_id, entity)) else: self._cleanup_alert(p, alert_id) p.execute() def _cleanup_check(self, pipeline, check_id): self.logger.info('Removing check with id %s from zmon:checks set', check_id) pipeline.srem('zmon:checks', check_id) for entity_id in self.con.smembers('zmon:checks:{}'.format(check_id)): self.logger.info('Removing key %s', 'zmon:checks:{}:{}'.format(check_id, entity_id)) pipeline.delete('zmon:checks:{}:{}'.format(check_id, entity_id)) self.logger.info('Removing key %s', 'zmon:checks:{}'.format(check_id)) pipeline.delete('zmon:checks:{}'.format(check_id)) def _cleanup_alert(self, pipeline, alert_id): self.logger.info('Removing alert with id %s from zmon:alerts set', alert_id) pipeline.srem('zmon:alerts', alert_id) for entity_id in self.con.smembers('zmon:alerts:{}'.format(alert_id)): self.logger.info('Removing key %s', 'zmon:alerts:{}:{}'.format(alert_id, entity_id)) pipeline.delete('zmon:alerts:{}:{}'.format(alert_id, entity_id)) pipeline.delete('zmon:notifications:{}:{}'.format(alert_id, entity_id)) self.logger.info('Removing key %s', 'zmon:alerts:{}'.format(alert_id)) pipeline.delete('zmon:alerts:{}'.format(alert_id)) self.logger.info('Removing key %s', 'zmon:alert:{}:entities'.format(alert_id)) pipeline.delete('zmon:alerts:{}:entities'.format(alert_id)) def _cleanup_common(self, pipeline, entry_type, entry_id, entities): ''' Removes entities from redis matching given type and id. Parameters ---------- entry_type: str Type of entry to remove: 'checks' or 'alerts'. entry_id: int Id of entry to remove. entities: set A set of entities to remove (difference between entities from scheduler and ones present in redis). ''' for entity in entities: self.logger.info('Removing entity %s from set %s', entity, 'zmon:{}:{}'.format(entry_type, entry_id)) pipeline.srem('zmon:{}:{}'.format(entry_type, entry_id), entity) self.logger.info('Removing key %s', 'zmon:{}:{}:{}'.format(entry_type, entry_id, entity)) pipeline.delete('zmon:{}:{}:{}'.format(entry_type, entry_id, entity)) def _store_check_result(self, req, result): self.con.sadd('zmon:checks', req['check_id']) self.con.sadd('zmon:checks:{}'.format(req['check_id']), req['entity']['id']) key = 'zmon:checks:{}:{}'.format(req['check_id'], req['entity']['id']) value = json.dumps(result, cls=JsonDataEncoder) self.con.lpush(key, value) self.con.ltrim(key, 0, DEFAULT_CHECK_RESULTS_HISTORY_LENGTH - 1) def check(self, req): self.logger.debug(req) # schedule_time = req['schedule_time'] start = time.time() try: setp(req['check_id'], req['entity']['id'], 'start') res = self._get_check_result(req) setp(req['check_id'], req['entity']['id'], 'done') except Exception, e: # PF-3778 Always store check results and re-raise exception which will be handled in 'check_and_notify'. self._store_check_result(req, {'td': round(time.time() - start, ROUND_SECONDS_DIGITS), 'ts': round(start, ROUND_SECONDS_DIGITS), 'value': str(e), 'worker': self.worker_name, 'exc': 1}) raise finally: # Store duration in milliseconds as redis only supports integers for counters. 
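# For example, a check that ran for 1.234 seconds would be stored as
# int(round(1000.0 * 1.234)) == 1234. (Illustrative note: the per-check counters below are
# kept commented out, so only the global 'check.count' counter is updated here;
# send_metrics() later flushes it to the zmon:metrics:<worker-name>:* keys.)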
# 'check.{}.count'.format(req['check_id']): 1, # 'check.{}.duration'.format(req['check_id']): int(round(1000.0 * (time.time() - start))), # 'check.{}.latency'.format(req['check_id']): int(round(1000.0 * (start - schedule_time))), self._counter.update({ 'check.count': 1 }) self.send_metrics() setp(req['check_id'], req['entity']['id'], 'store') self._store_check_result(req, res) setp(req['check_id'], req['entity']['id'], 'stored') return res def check_for_trial_run(self, req): # fake check ID as it is used by check context req['check_id'] = 'trial_run' return self._get_check_result(req) @timed def _get_check_result_internal(self, req): self._enforce_security(req) cmd = req['command'] ctx = self._build_check_context(req) try: result = safe_eval(cmd, eval_source='', **ctx) return result() if isinstance(result, Callable) else result except (SyntaxError, InvalidEvalExpression), e: raise CheckError(str(e)) def _get_check_result(self, req): r = self._get_check_result_internal(req) r['worker'] = self.worker_name return r def _enforce_security(self, req): ''' Check tasks from the secure queue to asert the command to run is specified in stash check definition Side effect: modifies req to address unique security concerns Raises SecurityError on check failure ''' if self.is_secure_worker() or self.task_context['delivery_info'].get('routing_key') == 'secure': try: stash_commands = self.load_stash_commands(self._safe_repositories) except Exception, e: traceback = sys.exc_info()[2] raise SecurityError('Unexpected Internal error: {}'.format(e)), None, traceback if req['command'] not in stash_commands: raise SecurityError('Security violation: Non-authorized command received in secure environment') # transformations of entities: hostname "pp-whatever" needs to become "whatever.pp" prefix = 'pp-' if 'host' in req['entity'] and str(req['entity']['host']).startswith(prefix): self.logger.warn('secure req[entity] before pp- transformations: %s', req['entity']) real_host = req['entity']['host'] #secure_host = '{}.pp'.format(req['entity']['host'][3:]) secure_host = '{}.{}'.format(req['entity']['host'][len(prefix):], prefix[:-1]) # relplace all real host values occurrences with secure_host req['entity'].update({k: v.replace(real_host, secure_host) for k, v in req['entity'].items() if isinstance(v, basestring) and real_host in v and k != 'id'}) self.logger.warn('secure req[entity] after pp- transformations: %s', req['entity']) def _build_check_context(self, req): '''Build context for check command with all necessary functions''' entity = req['entity'] # function creation context: passed to function factories create() method factory_ctx = { 'entity': entity, 'entity_url': _get_entity_url(entity), 'check_id': req['check_id'], 'entity_id': entity['id'], 'host': entity.get('host'), 'port': entity.get('port'), 'instance': entity.get('instance'), 'external_ip': entity.get('external_ip'), 'load_balancer_status': entity.get('load_balancer_status'), 'data_center_code': entity.get('data_center_code'), 'database': entity.get('database'), 'jmx_port': _get_jmx_port(entity), 'shards': _get_shards(entity), 'soft_time_limit': req['interval'], 'redis_host': self.get_redis_host(), 'redis_port': self.get_redis_port(), 'zmon_url': NotaZmonTask._zmon_url, 'entity_id_for_kairos': normalize_kairos_id(entity['id']), 'req_created_by': req.get('created_by'), } # check execution context ctx = build_default_context() ctx['entity'] = entity # populate check context with functions from plugins' function factories for func_name, func_factory in 
self._function_factories.items(): if func_name not in ctx: ctx[func_name] = func_factory.create(factory_ctx) return ctx def evaluate_alert(self, alert_def, req, result): '''Check if the result triggers an alert The function will save the global alert state to the following redis keys: * zmon:alerts::entities hash of entity IDs -> captures * zmon:alerts set of active alert definition IDs * zmon:alerts: set of entity IDs in alert * zmon:alerts:: JSON with alert evaluation result for given alert definition and entity ''' # captures is our map of "debug" information, e.g. to see values calculated in our condition captures = {} alert_id = alert_def['id'] check_id = alert_def['check_id'] alert_parameters = alert_def.get('parameters') try: result = evaluate_condition(result['value'], alert_def['condition'], **build_condition_context(self.con, check_id, alert_id, req['entity'], captures, alert_parameters)) except Exception, e: captures['exception'] = str(e) result = True try: is_alert = bool((result() if isinstance(result, Callable) else result)) except Exception, e: captures['exception'] = str(e) is_alert = True # add parameters to captures so they can be substituted in alert title if alert_parameters: pure_captures = captures.copy() try: captures = {k: p['value'] for k, p in alert_parameters.items()} except Exception, e: self.logger.exception('Error when capturing parameters: ') captures.update(pure_captures) return is_alert, captures def send_notification(self, notification, context): ctx = _build_notify_context(context) try: repeat = safe_eval(notification, eval_source='' , **ctx) except Exception, e: # TODO Define what should happen if sending emails or sms fails. self.logger.exception(e) else: if repeat: self.con.hset('zmon:notifications:{}:{}'.format(context['alert_def']['id'], context['entity']['id']), notification, time.time() + repeat) def notify(self, val, req, alerts, force_alert=False): ''' Process check result and evaluate all alerts. Returns list of active alert IDs Parameters ---------- val: dict Check result, see check function req: dict Check request dict alerts: list A list of alert definitions matching the checked entity force_alert: bool An optional flag whether to skip alert evalution and force "in alert" state. Used when check request exceeds time limit or throws other exception, this way unexpected conditions are always treated as alerts. Returns ------- list A list of alert definitions matching given entity. ''' ts_serialize = lambda ts: datetime.fromtimestamp(ts, tz=self._timezone).isoformat(' ') if ts else None result = [] entity_id = req['entity']['id'] start = time.time() check_result = { 'time': ts_serialize(val.get('ts')) if isinstance(val, dict) else None, 'run_time': val.get('td') if isinstance(val, dict) else None, # TODO: should be float or is it milliseconds? 
'check_id': req['check_id'], 'entity_id': req['entity']['id'], 'check_result': val, 'exception': True if isinstance(val, dict) and val.get('exc') else False, 'alerts': {}, } try: setp(req['check_id'], entity_id, 'notify loop') for alert in alerts: alert_id = alert['id'] alert_entities_key = 'zmon:alerts:{}'.format(alert_id) alerts_key = 'zmon:alerts:{}:{}'.format(alert_id, entity_id) notifications_key = 'zmon:notifications:{}:{}'.format(alert_id, entity_id) is_alert, captures = ((True, {}) if force_alert else self.evaluate_alert(alert, req, val)) func = getattr(self.con, ('sadd' if is_alert else 'srem')) changed = bool(func(alert_entities_key, entity_id)) if is_alert: # bubble up: also update global set of alerts alert_changed = func('zmon:alerts', alert_id) if alert_changed: _log_event('ALERT_STARTED', alert, val) else: entities_in_alert = self.con.smembers(alert_entities_key) if not entities_in_alert: # no entity has alert => remove from global set alert_changed = func('zmon:alerts', alert_id) if alert_changed: _log_event('ALERT_ENDED', alert, val) # PF-3318 If an alert has malformed time period, we should evaluate it anyway and continue with # the remaining alert definitions. try: is_in_period = in_period(alert.get('period', '')) except InvalidFormat, e: self.logger.warn('Alert with id %s has malformed time period.', alert_id) captures['exception'] = '; \n'.join(filter(None, [captures.get('exception'), str(e)])) is_in_period = True if changed and is_in_period and is_alert: # notify on entity-level _log_event(('ALERT_ENTITY_STARTED'), alert, val, entity_id) elif changed and not is_alert: _log_event(('ALERT_ENTITY_ENDED'), alert, val, entity_id) # Always store captures for given alert-entity pair, this is also used a list of all entities matching # given alert id. Captures are stored here because this way we can easily link them with check results # (see PF-3146). self.con.hset('zmon:alerts:{}:entities'.format(alert_id), entity_id, json.dumps(captures, cls=JsonDataEncoder)) # prepare report - alert part check_result['alerts'][alert_id] = { 'alert_id': alert_id, 'captures': captures, 'downtimes': [], 'exception': True if isinstance(captures, dict) and 'exception' in captures else False, 'active': is_alert, 'changed': changed, 'in_period': is_in_period, 'start_time': None, # '_alert_stored': None, # '_notifications_stored': None, } # get last alert data stored in redis if any alert_stored = None try: stored_raw = self.con.get(alerts_key) alert_stored = json.loads(stored_raw) if stored_raw else None except (ValueError, TypeError): self.logger.warn('My messy Error parsing JSON alert result for key: %s', alerts_key) if False: # get notification data stored in redis if any notifications_stored = None try: stored_raw = self.con.get(notifications_key) notifications_stored = json.loads(stored_raw) if stored_raw else None except (ValueError, TypeError): self.logger.warn('My requete-messy Error parsing JSON alert result for key: %s', notifications_key) downtimes = None if is_in_period: self._counter.update({'alerts.{}.count'.format(alert_id): 1, 'alerts.{}.evaluation_duration'.format(alert_id): int(round(1000.0 * (time.time() - start)))}) # Always evaluate downtimes, so that we don't miss downtime_ended event in case the downtime ends when # the alert is no longer active. 
downtimes = self._evaluate_downtimes(alert_id, entity_id) start_time = time.time() # Store or remove the check value that triggered the alert if is_alert: result.append(alert_id) start_time = alert_stored['start_time'] if alert_stored and not changed else time.time() # create or refresh stored alert alert_stored = dict(captures=captures, downtimes=downtimes, start_time=start_time, **val) self.con.set(alerts_key, json.dumps(alert_stored, cls=JsonDataEncoder)) else: self.con.delete(alerts_key) self.con.delete(notifications_key) start = time.time() notification_context = { 'alert_def': alert, 'entity': req['entity'], 'value': val, 'captures': captures, 'worker': self.worker_name, 'is_alert': is_alert, 'changed': changed, 'duration': timedelta(seconds=(time.time() - start_time if is_alert and not changed else 0)), } #do not send notifications for downtimed alerts if not downtimes: if changed: for notification in alert['notifications']: self.send_notification(notification, notification_context) else: previous_times = self.con.hgetall(notifications_key) for notification in alert['notifications']: if notification in previous_times and time.time() > float(previous_times[notification]): self.send_notification(notification, notification_context) self._counter.update({'alerts.{}.notification_duration'.format(alert_id): int(round(1000.0 * (time.time() - start)))}) setp(req['check_id'], entity_id, 'notify loop - send metrics') self.send_metrics() setp(req['check_id'], entity_id, 'notify loop end') else: self.logger.debug('Alert %s is not in time period: %s', alert_id, alert['period']) if is_alert: entities_in_alert = self.con.smembers('zmon:alerts:{}'.format(alert_id)) p = self.con.pipeline() p.srem('zmon:alerts:{}'.format(alert_id), entity_id) p.delete('zmon:alerts:{}:{}'.format(alert_id, entity_id)) p.delete(notifications_key) if len(entities_in_alert) == 1: p.srem('zmon:alerts', alert_id) p.execute() self.logger.info('Removed alert with id %s on entity %s from active alerts due to time period: %s', alert_id, entity_id, alert.get('period', '')) # add to alert report regardless alert up/down/out of period # report['results']['alerts'][alert_id]['_alert_stored'] = alert_stored # report['results']['alerts'][alert_id]['_notifications_stored'] = notifications_stored check_result['alerts'][alert_id]['start_time'] = ts_serialize(alert_stored['start_time']) if alert_stored else None check_result['alerts'][alert_id]['start_time_ts'] = alert_stored['start_time'] if alert_stored else None check_result['alerts'][alert_id]['downtimes'] = downtimes setp(req['check_id'], entity_id, 'return notified') # enqueue report to be sent via http request if self._dataservice_poster: #'entity_id': req['entity']['id'], check_result["entity"] = {"id": req['entity']['id']} for k in ["application_id","application_version","stack_name","stack_version","team","account_alias"]: if k in req["entity"]: check_result["entity"][k] = req["entity"][k] self._dataservice_poster.enqueue(check_result) return result #TODO: except SoftTimeLimitExceeded: except Exception: # Notifications should not exceed the time limit. 
self.logger.exception('Notification for check %s reached soft time limit', req['check_name']) self.con.connection_pool.disconnect() return None def post_trial_run(self, id, entity, result): if self._dataservice_url is not None: val = { 'id': id, 'entity-id': entity, 'result': result } try: requests.put(self._dataservice_url+"trial-run/", data=json.dumps(val, cls=JsonDataEncoder), headers={"Content-Type":"application/json"}) except Exception as ex: self.logger.exception(ex) def notify_for_trial_run(self, val, req, alerts, force_alert=False): """Like notify(), but for trial runs!""" try: # There must be exactly one alert in alerts. alert, = alerts redis_key = 'zmon:trial_run:{uuid}:results'.format(uuid=(alert['id'])[3:]) is_alert, captures = ((True, {}) if force_alert else self.evaluate_alert(alert, req, val)) try: is_in_period = in_period(alert.get('period', '')) except InvalidFormat, e: self.logger.warn('Alert with id %s has malformed time period.', alert['id']) captures['exception'] = '; \n'.join(filter(None, [captures.get('exception'), str(e)])) is_in_period = True try: result = { 'entity': req['entity'], 'value': val, 'captures': captures, 'is_alert': is_alert, 'in_period': is_in_period, } result_json = json.dumps(result, cls=JsonDataEncoder) except TypeError, e: result = { 'entity': req['entity'], 'value': str(e), 'captures': {}, 'is_alert': is_alert, 'in_period': is_in_period, } result_json = json.dumps(result, cls=JsonDataEncoder) self.con.hset(redis_key, req['entity']['id'], result_json) self.con.expire(redis_key, TRIAL_RUN_RESULT_EXPIRY_TIME) self.post_trial_run(alert['id'][3:], req['entity'], result) return ([alert['id']] if is_alert and is_in_period else []) #TODO: except SoftTimeLimitExceeded: except Exception: self.con.connection_pool.disconnect() return None def _store_captures_locally(self, alert_id, entity_id, timestamp, captures): metrics = _convert_captures(self.worker_name, alert_id, entity_id, timestamp, captures) if metrics: self._captures_local.extend(metrics) def _evaluate_downtimes(self, alert_id, entity_id): result = [] p = self.con.pipeline() p.smembers('zmon:downtimes:{}'.format(alert_id)) p.hgetall('zmon:downtimes:{}:{}'.format(alert_id, entity_id)) redis_entities, redis_downtimes = p.execute() try: downtimes = dict((k, json.loads(v)) for (k, v) in redis_downtimes.iteritems()) except ValueError, e: self.logger.exception(e) else: now = time.time() for uuid, d in downtimes.iteritems(): # PF-3604 First check if downtime is active, otherwise check if it's expired, else: it's a future downtime. if now > d['start_time'] and now < d['end_time']: d['id'] = uuid result.append(d) func = 'sadd' elif now >= d['end_time']: func = 'srem' else: continue # Check whether the downtime changed state: active -> inactive or inactive -> active. changed = getattr(self.con, func)('zmon:active_downtimes', '{}:{}:{}'.format(alert_id, entity_id, uuid)) if changed: eventlog.log(EVENTS[('DOWNTIME_ENDED' if func == 'srem' else 'DOWNTIME_STARTED')].id, **{ 'alertId': alert_id, 'entity': entity_id, 'startTime': d['start_time'], 'endTime': d['end_time'], 'userName': d['created_by'], 'comment': d['comment'], }) # If downtime is over, we can remove its definition from redis. 
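# Downtime key layout used in this method (a hedged sketch; placeholders in angle brackets):
#   zmon:downtimes                              set of alert ids that have downtime definitions
#   zmon:downtimes:<alert-id>                   set of entity ids with downtimes for that alert
#   zmon:downtimes:<alert-id>:<entity-id>       hash of downtime uuid -> JSON downtime definition
#   zmon:active_downtimes                       set of '<alert-id>:<entity-id>:<uuid>' entries currently active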
if func == 'srem': if len(downtimes) == 1: p.delete('zmon:downtimes:{}:{}'.format(alert_id, entity_id)) if len(redis_entities) == 1: p.delete('zmon:downtimes:{}'.format(alert_id)) p.srem('zmon:downtimes', alert_id) else: p.srem('zmon:downtimes:{}'.format(alert_id), entity_id) else: p.hdel('zmon:downtimes:{}:{}'.format(alert_id, entity_id), uuid) p.execute() return result PK,uGz5zmon_worker_monitor/zmon_worker/functions/__init__.py#!/usr/bin/env python # -*- coding: utf-8 -*- from sql import SqlWrapper from zmon import ZmonWrapper __all__ = [ 'SqlWrapper', 'ZmonWrapper' ] PK,uG\⅌1zmon_worker_monitor/zmon_worker/functions/zmon.py#!/usr/bin/env python # -*- coding: utf-8 -*- import logging from zmon_worker_monitor.zmon_worker.errors import CheckError from functools import partial from suds.client import Client from zmon_worker_monitor.zmon_worker.common.time_ import parse_timedelta from timeperiod import in_period, InvalidFormat from zmon_worker_monitor.zmon_worker.common.utils import async_memory_cache import sys import json import redis import time logger = logging.getLogger(__name__) CHECK_REFRESH_TIME = 240 ALERT_REFRESH_TIME = 120 class ZmonWrapper(object): ZMON_ALERTS_ENTITIES_PATTERN = 'zmon:alerts:*:entities' def __init__(self, wsdl, host, port): try: self.__ws_client = Client(url=wsdl) self.__ws_client.set_options(cache=None) except Exception: raise CheckError('ZmonWrapper Error: failed to connect to zmon-controller') self.__redis = redis.StrictRedis(host, port) self.__checks = {} self.__alerts = [] self.logger = logger self.__checks = self.__load_check_definitions() self.__alerts = self.__load_alert_definitions() @async_memory_cache.cache_on_arguments(namespace='zmon-worker', expiration_time=ALERT_REFRESH_TIME) def __load_alert_definitions(self): try: response = self.__ws_client.service.getAllActiveAlertDefinitions() except Exception: self.logger.exception('ZmonWrapper Error: failed to load alert definitions') raise CheckError('ZmonWrapper Error: failed to load alert definitions'), None, sys.exc_info()[2] else: return [{ 'id': a.id, 'team': a.team, 'responsible_team': a.responsibleTeam, 'check_id': a.checkDefinitionId, 'period': (a.period or '' if hasattr(a, 'period') else ''), } for a in response[1]] @async_memory_cache.cache_on_arguments(namespace='zmon-worker', expiration_time=CHECK_REFRESH_TIME) def __load_check_definitions(self): try: response = self.__ws_client.service.getAllActiveCheckDefinitions() except Exception: self.logger.exception('ZmonWrapper Error: failed to load check definitions') raise CheckError('ZmonWrapper Error: failed to load check definitions'), None, sys.exc_info()[2] else: return dict((c.id, {'interval': c.interval}) for c in response[1]) @staticmethod def _is_entity_alert_stale(last_run, period): ''' Checks whether check's last run is within given period. 
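Returns True only when a last run timestamp exists and is older than ``period`` seconds;
a check that has never run (``last_run is None``) is not considered stale.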
>>> ZmonWrapper._is_entity_alert_stale(None, 60) False >>> ZmonWrapper._is_entity_alert_stale(time.time(), 10) False >>> ZmonWrapper._is_entity_alert_stale(time.time() - 20, 10) True ''' return (False if last_run is None else time.time() - last_run > period) def __is_alert_stale(self, alert, evaluated_alerts, check_results, multiplier, offset): a_id = alert['id'] # alert id c_id = alert['check_id'] # check id r_id = partial('{}:{}'.format, c_id) # helper function used in iterator to generate result id try: is_in_period = in_period(alert.get('period', '')) except InvalidFormat: self.logger.warn('Alert with id %s has malformed time period.', a_id) is_in_period = True if is_in_period: return a_id not in evaluated_alerts or any(self._is_entity_alert_stale(check_results.get(r_id(entity)), multiplier * self.__checks[c_id]['interval'] + offset) for entity in evaluated_alerts[a_id]) else: return False def stale_active_alerts(self, multiplier=2, offset='5m'): ''' Returns a list of alerts that weren't executed in a given period of time. The period is calculated using multiplier and offset: check's interval * multiplier + offset. Parameters ---------- multiplier: int Multiplier for check's interval. offset: str Time offset, for details see parse_timedelta function in zmon-worker/src/function/time_.py. Returns ------- list A list of stale active alerts. ''' alert_entities = self.__redis.keys(self.ZMON_ALERTS_ENTITIES_PATTERN) # Load evaluated alerts and their entities from redis. p = self.__redis.pipeline() for key in alert_entities: p.hkeys(key) entities = p.execute() evaluated_alerts = dict((int(key.split(':')[2]), entities[i]) for (i, key) in enumerate(alert_entities)) # Load check results for previously loaded alerts and entities. check_ids = [] for alert in self.__alerts: if alert['id'] in evaluated_alerts: for entity in evaluated_alerts[alert['id']]: p.lindex('zmon:checks:{}:{}'.format(alert['check_id'], entity), 0) check_ids.append('{}:{}'.format(alert['check_id'], entity)) results = p.execute() check_results = dict((check_id, json.loads(results[i])['ts']) for (i, check_id) in enumerate(check_ids) if results[i]) return [{'id': alert['id'], 'team': alert['team'], 'responsible_team': alert['responsible_team']} for alert in self.__alerts if self.__is_alert_stale(alert, evaluated_alerts, check_results, multiplier, parse_timedelta(offset).total_seconds())] def check_entities_total(self): ''' Returns total number of checked entities. ''' alert_entities = self.__redis.keys(self.ZMON_ALERTS_ENTITIES_PATTERN) p = self.__redis.pipeline() for key in alert_entities: p.hkeys(key) entities = p.execute() return sum(len(e) for e in entities) PK,uG =i0zmon_worker_monitor/zmon_worker/functions/sql.py#!/usr/bin/env python # -*- coding: utf-8 -*- import psycopg2 import re import sys from zmon_worker_monitor.zmon_worker.errors import CheckError, InsufficientPermissionsError, DbError from psycopg2.extras import NamedTupleCursor DEFAULT_PORT = 5432 CONNECTION_RE = \ re.compile(r''' ^(?P[^:/]+) # host - either IP o hostname (:(?P\d+))? 
# port - integer, optional /(?P\w+) # database name $ ''' , re.X) ABSOLUTE_MAX_RESULTS = 1000000 REQUIRED_GROUP = 'zalandos' PERMISSIONS_STMT = \ ''' SELECT r.rolcanlogin AS can_login, ARRAY(SELECT b.rolname FROM pg_catalog.pg_auth_members m JOIN pg_catalog.pg_roles b ON (m.roleid = b.oid) WHERE m.member = r.oid) AS member_of FROM pg_catalog.pg_roles r WHERE r.rolname = %s; ''' NON_SAFE_CHARS = re.compile(r'[^a-zA-Z_0-9-]') def make_safe(s): ''' >>> make_safe('Bad bad \\' 123') 'Badbad123' ''' if not s: return '' return NON_SAFE_CHARS.sub('', s) class SqlWrapper(object): '''Shard-aware SQL adapter sql().execute('SELECT 1').result() ''' def __init__( self, shards, user='nagios', password='', timeout=60000, shard=None, created_by=None, check_id=None, ): ''' Parameters ---------- shards: dict A dict of shard definitions where key is the shard's name and value is the host/database string. user: str password: str timeout: int Statement timeout in milliseconds. shard: str Optional shard name. If provided, the check will be run on only one shard matching given name. created_by: str Optional user name. If provided, the check will first make sure that the user has permissions to access the requested database. It's optional because it's currently supported only in trial run. check_id: int The check definition ID in order to set PostgreSQL application name (easier tracking on server side). ''' if not shards: raise CheckError('SqlWrapper: No shards defined') if shard and not shards.get(shard): raise CheckError('SqlWrapper: Shard {} not found in shards definition'.format(shard)) self._cursors = [] self._stmt = None permissions = {} for shard_def in ([shards[shard]] if shard else shards.values()): m = CONNECTION_RE.match(shard_def) if not m: raise CheckError('Invalid shard connection: {}'.format(shard_def)) connection_str = \ "host='{host}' port='{port}' dbname='{dbname}' user='{user}' password='{password}' connect_timeout=5 options='-c statement_timeout={timeout}' application_name='ZMON Check {check_id} (created by {created_by})' ".format( host=m.group('host'), port=int(m.group('port') or DEFAULT_PORT), dbname=m.group('dbname'), user=user, password=password, timeout=timeout, check_id=check_id, created_by=make_safe(created_by), ) try: conn = psycopg2.connect(connection_str) conn.set_session(readonly=True, autocommit=True) cursor = conn.cursor(cursor_factory=NamedTupleCursor) self._cursors.append(cursor) except Exception, e: raise DbError(str(e), operation='Connect to {}'.format(shard_def)), None, sys.exc_info()[2] try: if created_by: cursor.execute(PERMISSIONS_STMT, [created_by]) row = cursor.fetchone() permissions[shard_def] = (row.can_login and REQUIRED_GROUP in row.member_of if row else False) except Exception, e: raise DbError(str(e), operation='Permission query'), None, sys.exc_info()[2] for resource, permitted in permissions.iteritems(): if not permitted: raise InsufficientPermissionsError(created_by, resource) def execute(self, stmt): self._stmt = stmt return self def result(self, agg=sum): '''return single row result, will result primitive value if only one column is selected''' result = {} try: for cur in self._cursors: try: cur.execute(self._stmt) row = cur.fetchone() if row: for k, v in row._asdict().items(): result[k] = result.get(k, []) result[k].append(v) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] for k, v in result.items(): try: result[k] = agg(v) except: # just use list if aggregation function fails # (e.g. 
if we try to sum strings) result[k] = v if len(result) == 1: return result.values()[0] else: return result def results(self, max_results=100, raise_if_limit_exceeded=True): '''return many rows''' results = [] max_results = min(max_results, ABSOLUTE_MAX_RESULTS) try: for cur in self._cursors: try: cur.execute(self._stmt) if raise_if_limit_exceeded: rows = cur.fetchmany(max_results + 1) if len(rows) > max_results: raise DbError('Too many results, result set was limited to {}. Try setting max_results to a higher value.'.format(max_results), operation=self._stmt) else: rows = cur.fetchmany(max_results) for row in rows: results.append(row._asdict()) finally: cur.close() except Exception, e: raise DbError(str(e), operation=self._stmt), None, sys.exc_info()[2] return results if __name__ == '__main__': if len(sys.argv) == 4: check = SqlWrapper([sys.argv[1] + '/' + sys.argv[2]]) print check.execute(sys.argv[3]).result() elif len(sys.argv) > 1: print 'sql.py ' PKhGj j 4zmon_worker_monitor/zmon_worker/notifications/sms.py#!/usr/bin/env python # -*- coding: utf-8 -*- import requests from notification import BaseNotification import zmon_worker_monitor.eventloghttp as eventlog import logging logger = logging.getLogger(__name__) SMS_PROVIDER_URL = 'https://gateway.smstrade.de' SMS_SENDER = 'zmon2' SMS_API_KEY = '' SMS_ROUTE = 'gold' SMS_MAXLENGTH = 2048 #logger = get_task_logger('zmon-worker') class SmsException(Exception): pass class Sms(BaseNotification): @classmethod def send(cls, alert, *args, **kwargs): provider_url = cls._config.get('notifications.sms.provider_url', SMS_PROVIDER_URL) phone_numbers = BaseNotification.resolve_group(args, phone=True) repeat = kwargs.get('repeat', 0) maxlen = cls._config.get('notifications.sms.maxlength', SMS_MAXLENGTH) message = cls._get_subject(alert, custom_message=kwargs.get('message'))[:maxlen] request_params = { 'to': '', 'key': cls._config['notifications.sms.apikey'], 'from': cls._config.get('notifications.sms.sender', SMS_SENDER), 'route': cls._config.get('notifications.sms.route', SMS_ROUTE), 'message': message, 'cost': 1, 'message_id': 1, } alert_id = alert.get('alert_def', {}).get('id', 0) entity = alert.get('entity', {}).get('id', 0) try: if cls._config.get('notifications.sms.on', True): for phone in phone_numbers: request_params['to'] = phone r = requests.get(provider_url, params=request_params, verify=False) url_secured = r.url.replace(request_params['key'], '*' * len(request_params['key'])) logger.info('SMS sent: request to %s --> status: %s, response headers: %s, response body: %s', url_secured, r.status_code, r.headers, r.text) r.raise_for_status() eventlog.log(cls._EVENTS['SMS_SENT'].id, alertId=alert_id, entity=entity, phoneNumber=phone, httpStatus=r.status_code) except Exception: logger.exception('Failed to send sms for alert %s with id %s to: %s', alert['name'], alert['id'], list(phone_numbers)) finally: return repeat if __name__ == '__main__': Sms.update_config({ 'notifications.sms.on': True, 'notifications.sms.apikey': '--secret--', 'notifications.sms.sender': 'zmon2', 'notifications.sms.route': 'gold', }) test_recipients = ['1stlevel'] fake_alert = { 'is_alert': True, 'alert_def': {'name': 'Test'}, 'entity': {'id': 'hostxy'}, 'captures': {}, } Sms.send(fake_alert, *test_recipients, **{'message': 'My customized zmon2 alert message'}) Sms.send(fake_alert, *test_recipients) PKhG9zmon_worker_monitor/zmon_worker/notifications/__init__.pyPKhGCp5zmon_worker_monitor/zmon_worker/notifications/mail.py#!/usr/bin/env python # -*- coding: utf-8 -*- import 
jinja2 import os import smtplib from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from notification import BaseNotification import logging logger = logging.getLogger(__name__) thisdir = os.path.join(os.path.dirname(__file__)) template_dir = os.path.join(thisdir, '../templates/mail') jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir)) class Mail(BaseNotification): @classmethod def send(cls, alert, *args, **kwargs): sender = cls._config.get('notifications.mail.sender') subject = cls._get_subject(alert, custom_message=kwargs.get('subject')) html = kwargs.get('html', False) cc = kwargs.get('cc', []) hide_recipients = kwargs.get('hide_recipients', True) repeat = kwargs.get('repeat', 0) expanded_alert_name = cls._get_expanded_alert_name(alert) try: tmpl = jinja_env.get_template('alert.txt') body_plain = tmpl.render(expanded_alert_name=expanded_alert_name, **alert) except Exception: logger.exception('Error parsing email template for alert %s with id %s', alert['name'], alert['id']) else: if html: msg = MIMEMultipart('alternative') tmpl = jinja_env.get_template('alert.html') body_html = tmpl.render(expanded_alert_name=expanded_alert_name, **alert) part1 = MIMEText(body_plain.encode('utf-8'), 'plain', 'utf-8') part2 = MIMEText(body_html.encode('utf-8'), 'html', 'utf-8') msg.attach(part1) msg.attach(part2) else: msg = MIMEText(body_plain.encode('utf-8'), 'plain', 'utf-8') msg['Subject'] = subject msg['From'] = 'ZMON 2 <{}>'.format(sender) args = BaseNotification.resolve_group(args) if hide_recipients: msg['To'] = 'Undisclosed Recipients <{}>'.format(sender) msg['Bcc'] = ', '.join(args) else: msg['To'] = ', '.join(args) msg['Cc'] = ', '.join(cc) mail_host = cls._config.get('notifications.mail.host', 'localhost') mail_port = cls._config.get('notifications.mail.port', '25') logger.info("Relaying via %s %s", mail_host, mail_port) if cls._config.get('notifications.mail.on', True): try: if mail_host != 'localhost': s = smtplib.SMTP_SSL(mail_host, mail_port) else: s = smtplib.SMTP(mail_host, mail_port) except Exception: logger.exception('Error connecting to SMTP server %s for alert %s with id %s', mail_host, alert['name'], alert['id']) else: try: mail_user = cls._config.get('notifications.mail.user', None) if mail_user is not None: s.login(mail_user, cls._config.get('notifications.mail.password')) s.sendmail(sender, list(args) + cc, msg.as_string()) except SMTPAuthenticationError: logger.exception('Error sending email for alert %s with id %s: authentication failed for %s', alert['name'], alert['id'], mail_user) except Exception: logger.exception('Error sending email for alert %s with id %s', alert['name'], alert['id']) finally: s.quit() finally: return repeat if __name__ == '__main__': import sys Mail.send({'entity': {'id': 'test'}, 'value': 5}, *sys.argv[1:]) PK,uG{3xx6zmon_worker_monitor/zmon_worker/notifications/hubot.py#!/usr/bin/env python # -*- coding: utf-8 -*- import requests from notification import BaseNotification import logging logger = logging.getLogger(__name__) class HubotException(Exception): pass class Hubot(BaseNotification): @classmethod def notify(cls, alert, queue, hubot_url, message=None, repeat=0): message = cls._get_subject(alert, custom_message=message) if '?' 
in hubot_url: raise ValueError post_params = { 'event': queue, 'data': message, } alert_id = alert.get('alert_def', {}).get('id', 0) try: r = requests.post(hubot_url, data=post_params) r.raise_for_status() logger.info('Notification sent: request to %s --> status: %s, response headers: %s, response body: %s', hubot_url, r.status_code, r.headers, r.text) except Exception: logger.exception('Failed to send notification for alert %s with id %s to: %s', alert['name'], alert['id'], hubot_url) finally: return repeat if __name__ == '__main__': fake_alert = { 'is_alert': True, 'alert_def': {'name': 'Test'}, 'entity': {'id': 'hostxy'}, 'captures': {}, } Hubot.notify(fake_alert, queue='syslog.info', hubot_url='http://z-hyp18.zalando:8081/publish') PKhGzz8zmon_worker_monitor/zmon_worker/notifications/hipchat.py from notification import BaseNotification import logging import requests import urllib import json logger = logging.getLogger(__name__) class NotifyHipchat(BaseNotification): @classmethod def send(cls, alert, *args, **kwargs): url = cls._config.get('notifications.hipchat.url') token = kwargs.get('token', cls._config.get('notifications.hipchat.token')) repeat = kwargs.get('repeat', 0) color = 'green' if alert and not alert.get('is_alert') else kwargs.get("color", "red") message = {"message": kwargs.get("message", cls._get_subject(alert)), "color": color} try: logger.info("Sending to: " + '{}/v2/room/{}/notification?auth_token={}'.format(url, urllib.quote(kwargs['room']), token) + " " + json.dumps(message)) r = requests.post('{}/v2/room/{}/notification?auth_token={}'.format(url, urllib.quote(kwargs['room']), token), data=json.dumps(message), verify=False, headers={'Content-type':'application/json'}) r.raise_for_status() except Exception as ex: logger.exception("Hipchat write failed %s", ex) return repeatPKhG*j;;6zmon_worker_monitor/zmon_worker/notifications/slack.pyfrom notification import BaseNotification import requests import logging logger = logging.getLogger(__name__) class NotifySlack(BaseNotification): @classmethod def send(cls, alert, *args, **kwargs): url = "https://slack.com/api/chat.postMessage" token = kwargs.get('token', cls._config.get('notifications.slack.token')) repeat = kwargs.get('repeat', 0) message = {"as_user":"true", "token": token, "channel": kwargs.get('channel', '#general'), "text": kwargs.get("message", cls._get_subject(alert))} try: logger.info("Sending to %s %s", url, message) r = requests.post(url, params=message, verify=False) r.raise_for_status() except Exception as ex: logger.exception("Slack write failed %s", ex) return repeat PKhGT2 =zmon_worker_monitor/zmon_worker/notifications/notification.py#!/usr/bin/env python # -*- coding: utf-8 -*- import datetime import logging class BaseNotification(object): _config = {} _EVENTS = None @classmethod def update_config(cls, new_config): cls._config.update(new_config) @classmethod def register_eventlog_events(cls, events): cls._EVENTS = events @classmethod def set_redis_con(cls, r): cls.__redis_conn = r @classmethod def _get_subject(cls, alert, custom_message=None): """ >>> BaseNotification._get_subject({'is_alert': True, 'changed': True, 'alert_def':{'name': 'Test'}, 'entity':{'id':'hostxy'}, 'captures': {}}) 'NEW ALERT: Test on hostxy' >>> BaseNotification._get_subject({'is_alert': True, 'changed': False, 'alert_def':{'name': 'Test'}, 'entity':{'id':'hostxy'}, 'captures': {}, 'duration': datetime.timedelta(seconds=30)}) 'ALERT ONGOING: Test on hostxy for 0:00:30' """ if alert['changed']: event = ('NEW ALERT' if alert and 
alert.get('is_alert') else 'ALERT ENDED') else: event = 'ALERT ONGOING' name = cls._get_expanded_alert_name(alert, custom_message) if not custom_message: return ('{}: {} on {} for {}'.format(event, name, alert['entity']['id'], str(alert['duration' ])[:7]) if alert.get('duration') else '{}: {} on {}'.format(event, name, alert['entity']['id'])) else: return '{}: {}'.format(event, name) @classmethod def _get_expanded_alert_name(cls, alert, custom_message=None): name = (alert['alert_def']['name'] if not custom_message else custom_message) try: replacements = {'entities': alert['entity']['id']} replacements.update(alert['captures']) return name.format(**replacements) except KeyError, e: return name # This is fairly normal. Just use the unformatted name. except Exception, e: return "<<< Unformattable name '{name}': {message} >>>".format(name=name, message=e) @classmethod def send(cls, alert, *args, **kwargs): raise NotImplementedError('Method meant to be overriden by subclass') @classmethod def resolve_group(cls, targets, phone=False): new_targets = [] for target in targets: prefix = target[0:target.find(':')+1] if not prefix in ['group:', 'active:']: new_targets.append(target) continue group_id = target[target.find(':')+1:] key = 'zmon:group:'+group_id + (':members' if prefix == 'group:' else ':active') team = cls.__redis_conn.smembers(key) if not team: logging.warn("no members found for group: %s", target) continue if not phone: new_targets.extend(team) else: for m in team: phone_numbers = cls.__redis_conn.smembers('zmon:member:'+m+':phone') new_targets.extend(phone_numbers) logging.info("Redirect notifications: from %s to %s", targets, new_targets) return new_targets PKhGx7zmon_worker_monitor/adapters/ifunctionfactory_plugin.py#!/usr/bin/env python # -*- coding: utf-8 -*- from ibase_plugin import IBasePlugin from abc import ABCMeta, abstractmethod class IFunctionFactoryPlugin(IBasePlugin): """ Base class for all plugins of type Function. We call it Function adapter. """ __metaclass__ = ABCMeta def __init__(self): super(IFunctionFactoryPlugin, self).__init__() @abstractmethod def create(self, factory_ctx): """ Automatically called to create the check function's object :param factory_ctx: creation context :return: an object that implements a check function """ raise NotImplementedError class ProtectedPartial(object): ''' Provides functools.partial functionality with one additional feature: if keyword arguments contain '__protected' key with list of arguments as value, the appropriate values will not be overwritten when calling the partial. This way we can prevent user from overwriting internal zmon parameters in check command. The protected key uses double underscore to prevent overwriting it, we reject all commands containing double underscores. 
''' def __init__(self, func, *args, **kwargs): self.__func = func self.__partial_args = args self.__partial_kwargs = kwargs self.__protected = frozenset(kwargs.get('__protected', [])) self.__partial_kwargs.pop('__protected', None) def __call__(self, *args, **kwargs): new_kwargs = self.__partial_kwargs.copy() new_kwargs.update((k, v) for (k, v) in kwargs.iteritems() if k not in self.__protected) return self.__func(*self.__partial_args + args, **new_kwargs) def propartial(func, *args, **kwargs): ''' >>> propartial(int, base=2)('100') 4 >>> propartial(int, base=2)('100', base=16) 256 >>> propartial(int, base=2, __protected=['base'])('100', base=16) 4 ''' return ProtectedPartial(func, *args, **kwargs)PKhGzkk,zmon_worker_monitor/adapters/ibase_plugin.py#!/usr/bin/env python # -*- coding: utf-8 -*- from abc import ABCMeta, abstractmethod class IBasePlugin(object): """ Base class for all adapters (plugin types). Users should not extend this class directly. """ __metaclass__ = ABCMeta def __init__(self): """ Set the basic variables. """ self.is_activated = False def activate(self): """ Called at plugin activation. """ self.is_activated = True def deactivate(self): """ Called when the plugin is disabled. """ self.is_activated = False @abstractmethod def configure(self, conf): """ Called after plugin is loaded to pass the [configuration] section in their plugin info file :param conf: configuration dictionary """ raise NotImplementedError PKhGGgϿ(zmon_worker_monitor/adapters/__init__.py#!/usr/bin/env python # -*- coding: utf-8 -*- """ We provide a plugin system to load external functionality into Zmon. This is needed as a way to decouple sensitive details -possibly proprietary- from the Zmon core. The plugin system is separated in *adapters* and *implementations*. Adapters are base classes that specify the behaviour of plugin types, they live in the subpackage ``zmon_worker.adapters``. All apaters inherit from ``zmon_worker.adapters.IBasePlugin``. One adapter may have many implementations, A plugin implementation needs 2 files: 1. a python source file containing a class that extends an adaptor. 2. a plugin info file. Text file with metadata to uniquely identify the plugin. To see how this works let's look at the *resolution of the entities* to monitor: What Zmon scheduler does is basically to periodically run some checks against some entities. An entity may be a host, a database, a network port, or really anything you may want to run a check against. This means that the definition of **entity** must be kept open and flexible, that's why the user must implement an entity plugin to define its entities. How do you write an entity plugin to load your entities (hosts, databases, network ports, etc.)? 1. Write a python file containing a class that extends ``zmon_worker.adapters.ientity_plugin.IEntityPlugin``, this class should contain your own logic for resolving your entities. 2. Write a text file with the same name of your python source but extension ``.scheduler_plugin``. 3. Place both files in a *plugin folder* of your choice and add that folder's absolute path to the environment variable ``ZMON_PLUGINS``. This description is quite dry, but we provide an example that will make things clearer: The scheduler comes with one functional plugin, called ``entity_city``, this plugin provides *city entities*, these are used for internal Zmon checks. This plugin is a simple template to start external plugin implementation. 
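The same two-file pattern applies to this worker's function plugins (adapter
``IFunctionFactoryPlugin``). A minimal, purely illustrative sketch, with hypothetical
file, class, and option names::

    # myfunc.worker_plugin  (plugin info file; indicative Yapsy-style format)
    [Core]
    Name = myfunc
    Module = myfunc

    # myfunc.py
    from zmon_worker_monitor.adapters.ifunctionfactory_plugin import IFunctionFactoryPlugin

    class MyFunctionFactory(IFunctionFactoryPlugin):

        def configure(self, conf):
            # receives the [Configuration] section of the plugin info file
            self.base_url = conf.get('base_url', 'http://localhost')

        def create(self, factory_ctx):
            # the returned object is exposed as myfunc(...) inside check commands
            return lambda path='/': '{}{}'.format(self.base_url, path)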
The 2 files that implement the city plugin are ``zmon_scheduler.builtins.plugins.city.scheduler_plugin`` and ``zmon_scheduler.builtins.plugins.city.py``. """ from zmon_worker_monitor.adapters.ibase_plugin import IBasePlugin __all__ = [ 'IBasePlugin', ] PK=GbB/)zmon_worker-0.1.dist-info/DESCRIPTION.rst=========== ZMON Worker =========== .. image:: https://travis-ci.org/zalando/zmon-worker.svg?branch=master :target: https://travis-ci.org/zalando/zmon-worker :alt: Build Status .. image:: https://coveralls.io/repos/zalando/zmon-worker/badge.svg :target: https://coveralls.io/r/zalando/zmon-worker :alt: Coverage Status ZMON's Python worker is doing the heavy lifting of executing tasks against entities, and evaluating all alerts assigned to check. Tasks are picked up from Redis and the resulting check values plus alert state changes are written back to Redis. Local Development ================= Start Redis on localhost:6379: .. code-block:: bash $ docker run -p 6379:6379 -it redis Install the required development libraries: .. code-block:: bash $ sudo apt-get install build-essential python2.7-dev libpq-dev libldap2-dev libsasl2-dev libsnappy-dev $ sudo pip2 install -r requirements.txt Start the ZMON worker process: .. code-block:: bash $ python2 -m zmon_worker_monitor You can query the worker monitor via RPC: .. code-block:: bash $ python2 -m zmon_worker_monitor.rpc_client http://localhost:23500/zmon_rpc list_stats Running Unit Tests ================== .. code-block:: bash $ sudo pip2 install -r test_requirements.txt $ python2 setup.py test Building the Docker Image ========================= .. code-block:: bash $ docker build -t zmon-worker . $ docker run -it zmon-worker Running the Docker image ======================== The Docker image supports many configuration options via environment variables. Configuration options are explained in the `ZMON Documentation `_. 
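Inspecting Worker Output in Redis
=================================

The worker keeps check results as JSON lists under ``zmon:checks:<check-id>:<entity-id>``
(newest result first) and its own throughput counters under ``zmon:metrics:*``. A quick way
to peek at them on a local Redis; the check id and entity id below are only examples:

.. code-block:: bash

    $ redis-cli keys 'zmon:metrics:*'
    $ redis-cli lrange 'zmon:checks:1:localhost' 0 0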
PK=G.W?T>>*zmon_worker-0.1.dist-info/entry_points.txt[console_scripts] zmon-worker = zmon_worker_monitor.web:main PK=GtsiKK'zmon_worker-0.1.dist-info/metadata.json{"classifiers": ["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 2", "Topic :: Software Development :: Libraries :: Python Modules"], "extensions": {"python.commands": {"wrap_console": {"zmon-worker": "zmon_worker_monitor.web:main"}}, "python.details": {"contacts": [{"email": "henning.jacobs@zalando.de", "name": "Henning Jacobs", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://github.com/zalando/zmon-worker"}}, "python.exports": {"console_scripts": {"zmon-worker": "zmon_worker_monitor.web:main"}}}, "extras": [], "generator": "bdist_wheel (0.26.0)", "keywords": ["zalando", "zmon", "zmon2", "worker", "component", "monitoring", "infrastructure"], "license": "Apache License 2.0", "metadata_version": "2.0", "name": "zmon-worker", "platform": "All", "run_requires": [{"requires": ["CherryPy", "Jinja2", "MarkupSafe", "PyMySQL (==0.6.1)", "PyYAML", "Yapsy (>=1.10.423)", "boto3", "cassandra-driver (==2.7.2)", "dnspython", "dogpile.cache", "eventlog-writer (==0.4.5)", "functional", "numpy", "prometheus-client", "psycopg2", "pymongo (==3.0.3)", "pysnmp", "python-ldap (==2.4.19)", "python-snappy", "pythonwhois (==2.4.3)", "pytz", "redis", "requests", "setproctitle", "stups-tokens", "subprocess32", "suds", "timeperiod2 (==0.1)"]}], "summary": "ZMON Worker Monitor", "test_requires": [{"requires": ["mock", "pytest", "pytest-cov"]}], "version": "0.1"}PK=Gb 'zmon_worker-0.1.dist-info/top_level.txtzmon_worker_monitor PK=G''\\zmon_worker-0.1.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.26.0) Root-Is-Purelib: true Tag: py2-none-any PK=GL "zmon_worker-0.1.dist-info/METADATAMetadata-Version: 2.0 Name: zmon-worker Version: 0.1 Summary: ZMON Worker Monitor Home-page: https://github.com/zalando/zmon-worker Author: Henning Jacobs Author-email: henning.jacobs@zalando.de License: Apache License 2.0 Keywords: zalando zmon zmon2 worker component monitoring infrastructure Platform: All Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: Apache Software License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2 Classifier: Topic :: Software Development :: Libraries :: Python Modules Requires-Dist: CherryPy Requires-Dist: Jinja2 Requires-Dist: MarkupSafe Requires-Dist: PyMySQL (==0.6.1) Requires-Dist: PyYAML Requires-Dist: Yapsy (>=1.10.423) Requires-Dist: boto3 Requires-Dist: cassandra-driver (==2.7.2) Requires-Dist: dnspython Requires-Dist: dogpile.cache Requires-Dist: eventlog-writer (==0.4.5) Requires-Dist: functional Requires-Dist: numpy Requires-Dist: prometheus-client Requires-Dist: psycopg2 Requires-Dist: pymongo (==3.0.3) Requires-Dist: pysnmp Requires-Dist: python-ldap (==2.4.19) Requires-Dist: python-snappy Requires-Dist: pythonwhois (==2.4.3) Requires-Dist: pytz Requires-Dist: redis Requires-Dist: requests Requires-Dist: setproctitle Requires-Dist: stups-tokens Requires-Dist: subprocess32 Requires-Dist: suds Requires-Dist: timeperiod2 (==0.1) =========== ZMON Worker =========== .. 
PK=G ++ zmon_worker-0.1.dist-info/RECORD
zmon_worker-0.1.dist-info/DESCRIPTION.rst,sha256=oyd3qaG42m0xqDuk655tSGj9D_MYQDPWqrQ0isYS5zI,1742 zmon_worker-0.1.dist-info/METADATA,sha256=H3iDw2zaSwWmLfcGKGT7byn7nep4r9O-0tb-kDTq-5k,3212 zmon_worker-0.1.dist-info/RECORD,, zmon_worker-0.1.dist-info/WHEEL,sha256=JTb7YztR8fkPg6aSjc571Q4eiVHCwmUDlX8PhuuqIIE,92 zmon_worker-0.1.dist-info/entry_points.txt,sha256=pZHNtySV45UgjavmrtoZaRRERhXEAjvP7p-sSg7MBjM,62 zmon_worker-0.1.dist-info/metadata.json,sha256=GDpegVyO9j5L7qACSvCRNi7W-Fl0Vd49-mpuWkuccYQ,1611 zmon_worker-0.1.dist-info/top_level.txt,sha256=4euauxSqAbokZvJ7_H3ccc_1er4_bArVEDhxj3Sl0s4,20 zmon_worker_monitor/__init__.py,sha256=Lgf9keBX7aG_JEFt6UYSyiFKcm-BYGtmk9ohpQhYpyk,20 zmon_worker_monitor/__main__.py,sha256=RQX54WL1fvrM0xaGKwd_iQzl_9Lx0DDxgLulHCmJyeI,94 zmon_worker_monitor/emu_kombu.py,sha256=ZmZu7hCGWHZ80UFVYAjpZr87xA0XekO8ha6QJk9UN84,1573 zmon_worker_monitor/eventloghttp.py,sha256=3-SlDUDHQ0w4-aYV1IFAFAiF2q33POA57Kh2-ba4TY4,965 zmon_worker_monitor/plugin_manager.py,sha256=Vir6cHeodhLHYzaUtWMSmgSsNyKkHqaxS7LSJA_jzmM,11111 zmon_worker_monitor/process_controller.py,sha256=O3rawFUz5BJWeRuvuUNipXtOZmhQ8BclSCybwhb6C6o,12807 zmon_worker_monitor/redis_context_manager.py,sha256=Rf7olh8_c5OHL63Kl4uFvah_IrMbEwdjd-gq7KQGrho,9069 zmon_worker_monitor/rpc_client.py,sha256=5CLXhV2y5ZZnSFp8-MhmKAl_l2DLRl4-El1vWj2RkyU,2329 zmon_worker_monitor/rpc_server.py,sha256=fWkOIFXMq664kndPqk6N7jKaYYyR1UcAkKwffcoyiYE,3460 zmon_worker_monitor/rpc_utils.py,sha256=Ik9s20VrStKMRmI6_QDSVCsOtmt9cXVDn070bUcbUUI,4075 zmon_worker_monitor/settings.py,sha256=OtukfO7PoLuRVQjmz4Y9z-1tcj_zz18l_V6vkUcKRoE,1161 zmon_worker_monitor/settings_pro.py,sha256=yoGjJoopZgEibysNAuq_Dz5EmwxgA1Y67H1DGrqIuCo,2135 zmon_worker_monitor/tasks.py,sha256=w6QN_24CuOIdfQ6opNeu74UZLZpFwAY_JdYtjU9cs1Y,1552 zmon_worker_monitor/web.py,sha256=Cx6Cb_vcmlo1FQUYerckeGiBbKPHfzCks3KYiCxPsqo,2264 zmon_worker_monitor/worker.py,sha256=PcPYjTKw4cGnUr9EHPcmChc3i9SRUmSfkKBa6Klt_Oc,1216
zmon_worker_monitor/workflow.py,sha256=zhLuXEoivgV7iRp42r3UYJ7sZhE_U4Oe6EZR-4ww2RA,12068 zmon_worker_monitor/adapters/__init__.py,sha256=mIA7IpB5xRbflaJNjd6sy9tdqPU-T9EiNTLSFuRZOmE,2239 zmon_worker_monitor/adapters/ibase_plugin.py,sha256=-x_0NmCbjxf9QYdVoAOa_6edB9BvmmFTNkeUUUDybT8,875 zmon_worker_monitor/adapters/ifunctionfactory_plugin.py,sha256=NuH44ss4cwILHQXDvGRbAHJhxlpjkFfCKR2HnSvb-Mc,1974 zmon_worker_monitor/builtins/__init__.py,sha256=Q3iZcmhgfDSlcLglO9X979_Q-a6NX3hJNXz78PfGXIc,23 zmon_worker_monitor/builtins/plugins/__init__.py,sha256=Q3iZcmhgfDSlcLglO9X979_Q-a6NX3hJNXz78PfGXIc,23 zmon_worker_monitor/builtins/plugins/cassandra_wrapper.py,sha256=4uXPnbgP-uLGqZYxA7xfnVcwhSC_MoYf65JMvQs6S7A,1809 zmon_worker_monitor/builtins/plugins/cassandra_wrapper.worker_plugin,sha256=PH92LVTZAljqQ5tS3NqobKivCFZzIwWOJi1z5EztRXQ,241 zmon_worker_monitor/builtins/plugins/checkldap.py,sha256=MhSr7nNw8dP3DFnl6JXyzzPpbXfLJZduGq1VbWqPtRk,14973 zmon_worker_monitor/builtins/plugins/checkldap.worker_plugin,sha256=WqV25Fp84IDHVgbxuwfQsESqlL9OmfIWMRxiZvCo9rA,553 zmon_worker_monitor/builtins/plugins/cloudwatch.py,sha256=GJ5iQpSjRSYgHkQRmS7bmCIRNEOjbQjoiIcIeufsEDw,4020 zmon_worker_monitor/builtins/plugins/cloudwatch.worker_plugin,sha256=4nfQSm_S-hNicHuHSfGnW8SD6hPokdxG_F-UqUS06jI,238 zmon_worker_monitor/builtins/plugins/counter.py,sha256=k7kNMJeDGlwGbuXMxnpe0N6O69KaHOaU2QRKaQipUeM,2706 zmon_worker_monitor/builtins/plugins/counter.worker_plugin,sha256=goEu6V16NbupgN0zu2ZfhEZR2SUxb5z04GS1XdJeWhQ,423 zmon_worker_monitor/builtins/plugins/distance_to_history.py,sha256=r-VS-jbd41WTXy5Lgsy_IEGjazsM8p6ihwnb9QRzBjY,6935 zmon_worker_monitor/builtins/plugins/eventlog.py,sha256=gl76U9JQG0FwTreuJn-P_RYVYdBzK0f0UAc87l_PpaM,4269 zmon_worker_monitor/builtins/plugins/eventlog.worker_plugin,sha256=5NCtjmdlJkoVjA28LxQwPKAW8nOb1RaLL3_Lq4_6gOY,501 zmon_worker_monitor/builtins/plugins/exasol.py,sha256=4nTQQOVsedPfjpyuwCTsLGSQS3nOUlV5_hZ8OcRfQz4,3549 zmon_worker_monitor/builtins/plugins/exasol.worker_plugin,sha256=DkoKYf99jy9c2j6BlDdydgcby2kEtGdBfdANQ_ojd70,594 zmon_worker_monitor/builtins/plugins/exceptions_.py,sha256=zYGlev71J96AJWKTphjh5n28hzAmMBemXYt4NK2L0QY,3568 zmon_worker_monitor/builtins/plugins/exceptions_.worker_plugin,sha256=Cwf6hyBlxPfnuof9OzLPlFQsQMVh-x-Id1FDji5vix8,430 zmon_worker_monitor/builtins/plugins/history.py,sha256=eI3cwKH-WUDF-6A3UrlVKJZN9_sAmAIR-v5qrtwXeLQ,6534 zmon_worker_monitor/builtins/plugins/history.worker_plugin,sha256=jLX6yMH9LBmuHN6kc56oZrtyNDSgtlgHJ8LyiiKETu4,541 zmon_worker_monitor/builtins/plugins/http.py,sha256=JRVQsLikv5qg39Tj3CJB9sRqQw2BM9-ZcB3_QkMnMYY,8710 zmon_worker_monitor/builtins/plugins/http.worker_plugin,sha256=ROVnjMVEVmI1v8swrRFj6JmhvDAtuYWpUmnwsqRfMoU,417 zmon_worker_monitor/builtins/plugins/jmx.py,sha256=a9KEt-g5VIu2HlvF0rijzQlr1-NsFZ-yL-YoG6LBft8,4536 zmon_worker_monitor/builtins/plugins/jmx.worker_plugin,sha256=qEMgv84jLS2zJ3DPJuJmQO1TvP0W0X9VDAz4Z2q2Ytg,463 zmon_worker_monitor/builtins/plugins/joblocks.py,sha256=ajXgT_Xnv7UUozD2jmPk7fuQItxXbQaoQlW_F6SSPEI,4204 zmon_worker_monitor/builtins/plugins/joblocks.worker_plugin,sha256=cTdnothpdXkG-g3iOG0jCy98pameiu1f0aAktUsAJT8,462 zmon_worker_monitor/builtins/plugins/jobs.py,sha256=fD1XNkLpRxjVUJKyrRfxORi5S17KgMXALHd8tS1RiQg,2229 zmon_worker_monitor/builtins/plugins/jobs.worker_plugin,sha256=YkIY_w_Ssw5w3CXJf4DoUMuDLHzMoIGE49jCRpFd0oA,417 zmon_worker_monitor/builtins/plugins/kairosdb.py,sha256=RBMbS03uQud-ORhN_iOPC6F61JzvHkH3nhT_ooUVRow,2211 
zmon_worker_monitor/builtins/plugins/kairosdb.worker_plugin,sha256=RDRv9qAsam4sR1pQM1P4gR8qjatoKJTqE-MwOYHjzGY,230 zmon_worker_monitor/builtins/plugins/mongodb.py,sha256=EfygRZk8NDg3Pq3tRtbg4Qd9L8PnM2vbyC2RjCnCSj8,1422 zmon_worker_monitor/builtins/plugins/mongodb.worker_plugin,sha256=ACslhr0EsDZnKbPykAFFYwLzAS3PydtupOpddZXLJsk,227 zmon_worker_monitor/builtins/plugins/nagios.py,sha256=_gPgucqOX90HTYYb2aPnVsxhdL1EEqroulYydGH5CtE,31533 zmon_worker_monitor/builtins/plugins/nagios.worker_plugin,sha256=z6LeVJDdplOmGsQAD45yH6ISSmJmmLk_Zwb5PCBKMCI,697 zmon_worker_monitor/builtins/plugins/ping_.py,sha256=YUb1tr1Yt1Vd59vz-YeExvspQB33zHBCcz_tOuVmuhc,1221 zmon_worker_monitor/builtins/plugins/ping_.worker_plugin,sha256=p-9f-sZsBYtKcaYrAnhqoLxfK_cNwvbDjZyfnPFc_Cs,418 zmon_worker_monitor/builtins/plugins/redis_wrapper.py,sha256=Bb9jK9IEtmv9d5Ya7pzmd0oesdVKtteMIyCmx-mSLfU,3986 zmon_worker_monitor/builtins/plugins/redis_wrapper.worker_plugin,sha256=o6vbKrBItRgM1EgFHdJ0-uGpdYkLpI1wowbgmaAmxWM,427 zmon_worker_monitor/builtins/plugins/scalyr.py,sha256=dn7qjNjtuurfqzFdmxXtVYmX_wX8nyw9_0MZ5A-hj1w,3648 zmon_worker_monitor/builtins/plugins/scalyr.worker_plugin,sha256=HnM5H-rp-43xfvSukiV1oTCcabBrWf1bPJXYCPOa8pg,218 zmon_worker_monitor/builtins/plugins/snmp.py,sha256=y9yLxxCbb-mIKvE13LDhMfKbYCX0xGjfes0wbEnhRUo,17583 zmon_worker_monitor/builtins/plugins/snmp.worker_plugin,sha256=z1nf-8KTIN72Rt2GglPrCMudOXUSVaqOSstnXa7gvgg,417 zmon_worker_monitor/builtins/plugins/sql_mssql.py,sha256=OCcHehoyXiCNYx6a0_mtjZvLQgCrZ0UcsjV5QA1-P08,4380 zmon_worker_monitor/builtins/plugins/sql_mssql.worker_plugin,sha256=P0ShdLtStQFre2N5TgSFqCo9sAQxiJqaZeDWHFwxs9U,556 zmon_worker_monitor/builtins/plugins/sql_mysql.py,sha256=TIqO9aZcZHgCB95naXTEbnYi2Xg9pVavnlhSyY-zqv8,6216 zmon_worker_monitor/builtins/plugins/sql_mysql.worker_plugin,sha256=GRe8AdKTGI8i4MHHsi_TkDecfz8so0mgyeAt-Bzjgt0,556 zmon_worker_monitor/builtins/plugins/sql_oracle.py,sha256=3fqcKCry6kAJKA3U2zjzs7HuV6xo9EAZEdqGMPxlN_M,4961 zmon_worker_monitor/builtins/plugins/sql_oracle.worker_plugin,sha256=_wvIX2ycgCLbAaPmfhnfdh5pBOf2RfnZPh82NhE1WuY,558 zmon_worker_monitor/builtins/plugins/sql_postgresql.py,sha256=xcdqtBV39VjaqOjevJUGWdCNmeqhJf6-1m-h5ycPGuM,7910 zmon_worker_monitor/builtins/plugins/sql_postgresql.worker_plugin,sha256=bajK21mL96NWrU2ljPFj47OuA1WYNy5owCVlNuyijTg,557 zmon_worker_monitor/builtins/plugins/tcp.py,sha256=X3D45qu3hNxc0M9SXSDz6jQ3uSvAG9EqMnrJZ-i_l1I,1780 zmon_worker_monitor/builtins/plugins/tcp.worker_plugin,sha256=93u9QL4v7Qr6JjBd4xVjYbr0GuNy-saQtLO-FtWz6S8,415 zmon_worker_monitor/builtins/plugins/time_.py,sha256=NtEqN2733IfqxX-rv1d7HD91PsXDuICB_MZlNHTqCc8,1775 zmon_worker_monitor/builtins/plugins/time_.worker_plugin,sha256=ie1V0SfBVTug0Sjo0wYRjgCrMbWmznZNZmDgSAcc4O8,504 zmon_worker_monitor/builtins/plugins/whois_.py,sha256=dsochPQsbczwsBhs_5O5rQdfWr9wseTGkKL0-NkKiGw,1769 zmon_worker_monitor/builtins/plugins/whois_.worker_plugin,sha256=MqV28Wiq3xALJFx1gkPAsmPwWdub-o8H7QWcCiw51TY,420 zmon_worker_monitor/builtins/plugins/zmon_.py,sha256=haHFMAHtPUg_m0lV8YSkzybpwcUgqAzWxjG88DJF7DM,6911 zmon_worker_monitor/builtins/plugins/zmon_.worker_plugin,sha256=7pkUV0xaATkdX46eLf3ys8o65yCoGkhYVLM1-d8j8t0,418 zmon_worker_monitor/builtins/plugins/zomcat.py,sha256=z3Qwpblp0k9JsXeM-DIsDaOPEROsYT3LG547ZwwfxWQ,7527 zmon_worker_monitor/builtins/plugins/zomcat.worker_plugin,sha256=iSfw-IIXixq8ztJXhMUZOBxf89zc6atKAd43eOENaJ0,421 zmon_worker_monitor/zmon_worker/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1 
zmon_worker_monitor/zmon_worker/encoder.py,sha256=n4bRWxhI24oP9YiDpbA8KVnozpmxfOjLCZ9JR-5Zh1k,1239 zmon_worker_monitor/zmon_worker/errors.py,sha256=ws_fZtHsR0nzizApDUU-zqPI--k07I5PjMZgopq8OG8,1646 zmon_worker_monitor/zmon_worker/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 zmon_worker_monitor/zmon_worker/common/mathfun.py,sha256=xzfhnx5sGbhQw5uDQIXY4c0i0cT_S_LK2BuHohI676M,2274 zmon_worker_monitor/zmon_worker/common/time_.py,sha256=F07Lyexv5GdU-znu33MdqyVWv4cJr1gsXAyC8_cS2Tk,2105 zmon_worker_monitor/zmon_worker/common/utils.py,sha256=Q6nzYqsKo4ULPnT7giHARgBBsie1fa3JYCpGjlHSybw,2044 zmon_worker_monitor/zmon_worker/functions/__init__.py,sha256=NSEHAFM9-l_lCswCTBm6WNtZomaWTAErk4tMw3Vr25U,154 zmon_worker_monitor/zmon_worker/functions/sql.py,sha256=0LELhHjaOOt-sntuki1mpTfcjtyMbFIwwVT-HXIh2Fk,6633 zmon_worker_monitor/zmon_worker/functions/zmon.py,sha256=wWujeXQcmeqeQ-VaqKVE9KqDAkvy3CC5VVekjOr-4Eg,6028 zmon_worker_monitor/zmon_worker/notifications/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 zmon_worker_monitor/zmon_worker/notifications/hipchat.py,sha256=k3O13x6_mnBJFQFpiSn0NwCrf7Hvu7VnDD0tLuXCax8,1146 zmon_worker_monitor/zmon_worker/notifications/hubot.py,sha256=RpW3fM4Vln2bVS3SWaudjxNu04phVniBABv1aWyT66E,1400 zmon_worker_monitor/zmon_worker/notifications/mail.py,sha256=zmWfOPSIB3tr99NqcqLAyb9YLYwspprwvRHpLbiSNp0,3840 zmon_worker_monitor/zmon_worker/notifications/notification.py,sha256=KRwpKBLnJN-GoNCEUcowjP-hhUXE2zG8teBwt5y_IsE,3208 zmon_worker_monitor/zmon_worker/notifications/slack.py,sha256=fgr1K9Ark-GPXF8XryVNodUJJZbDcZSZ9dx4Y3PQRjQ,827 zmon_worker_monitor/zmon_worker/notifications/sms.py,sha256=2_xEVr3f4RqYQ9q5WzL6E1064ywtlqj3R7lhnJzjVW8,2922 zmon_worker_monitor/zmon_worker/tasks/__init__.py,sha256=mvs9DM0JQdG1ViJuqmJFbUdcxXLZRMnoGfBLWOEmTnY,174 zmon_worker_monitor/zmon_worker/tasks/notacelery_task.py,sha256=EiCqfHbL8ZVs7rRD9_9HCx1pP5A43NvAjpWhUPZoYEY,70342 zmon_worker_monitor/zmon_worker/tasks/stashacc.py,sha256=uQFDlqcltjWsefxSa30IuGr91rkJrkAleQ3pMxyhKog,3079