diff --git a/.gitignore b/.gitignore index f811fce280..7c730b7305 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ cgroup-network !cgroup-network/ # installation artifacts -installer/.environment.sh packaging/installer/.environment.sh *.tar.* *.run @@ -140,6 +139,7 @@ tests/profile/benchmark-line-parsing tests/profile/benchmark-procfile-parser tests/profile/benchmark-value-pairs tests/profile/statsd-stress +tests/health_mgmtapi/health-cmdapi-test.sh oprofile_data/ vgcore.* callgrind.out.* diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c3d207596..cb1e1ef482 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ find_package(PkgConfig REQUIRED) #set(CMAKE_BUILD_TYPE "Release") # set this to see the compilation commands -#set(CMAKE_VERBOSE_MAKEFILE 1) +# set(CMAKE_VERBOSE_MAKEFILE 1) # ----------------------------------------------------------------------------- @@ -30,8 +30,8 @@ IF("${CMAKE_BUILD_TYPE}" MATCHES "Debug") set(CXX_FORMAT_SIGNEDNESS "-Wformat-signedness") set(CXX_FORMAT_SECURITY "-Werror=format-security") set(CXX_STACK_PROTECTOR "-fstack-protector-all") - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O1 -ggdb -Wall -Wextra -DNETDATA_INTERNAL_CHECKS=1 -DNETDATA_VERIFY_LOCKS=1 ${CXX_FORMAT_SIGNEDNESS} ${CXX_FORMAT_SECURITY} ${CXX_STACK_PROTECTOR}") + set(CXX_FLAGS_DEBUG "-O0") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O1 -ggdb -Wall -Wextra -DNETDATA_INTERNAL_CHECKS=1 -DNETDATA_VERIFY_LOCKS=1 ${CXX_FORMAT_SIGNEDNESS} ${CXX_FORMAT_SECURITY} ${CXX_STACK_PROTECTOR} ${CXX_FLAGS_DEBUG}") ELSE() message(STATUS "building for: release") cmake_policy(SET CMP0069 "NEW") @@ -221,8 +221,7 @@ set(HEALTH_PLUGIN_FILES health/health.h health/health_config.c health/health_json.c - health/health_log.c - ) + health/health_log.c) set(IDLEJITTER_PLUGIN_FILES collectors/idlejitter.plugin/plugin_idlejitter.c @@ -354,10 +353,6 @@ set(WEB_PLUGIN_FILES web/server/web_client.h web/server/web_server.c web/server/web_server.h - web/server/single/single-threaded.c - web/server/single/single-threaded.h - web/server/multi/multi-threaded.c - web/server/multi/multi-threaded.h web/server/static/static-threaded.c web/server/static/static-threaded.h web/server/web_client_cache.c @@ -411,6 +406,7 @@ set(API_PLUGIN_FILES web/api/formatters/charts2json.h web/api/formatters/rrdset2json.c web/api/formatters/rrdset2json.h + web/api/health/health_cmdapi.c ) set(STREAMING_PLUGIN_FILES @@ -479,7 +475,7 @@ add_definitions( -DLIBCONFIG_DIR="/usr/lib/netdata/conf.d" -DLOG_DIR="/var/log/netdata" -DPLUGINS_DIR="/usr/libexec/netdata" - -DWEB_DIR="/usr/share/netdata" + -DWEB_DIR="/usr/share/netdata/web" -DVARLIB_DIR="/var/lib/netdata" ) diff --git a/Makefile.am b/Makefile.am index 994b4260bd..a6998839ae 100644 --- a/Makefile.am +++ b/Makefile.am @@ -337,6 +337,8 @@ API_PLUGIN_FILES = \ web/api/formatters/charts2json.h \ web/api/formatters/rrdset2json.c \ web/api/formatters/rrdset2json.h \ + web/api/health/health_cmdapi.c \ + web/api/health/health_cmdapi.h \ web/api/web_api_v1.c \ web/api/web_api_v1.h \ $(NULL) @@ -374,10 +376,6 @@ WEB_PLUGIN_FILES = \ web/server/web_server.h \ web/server/web_client_cache.c \ web/server/web_client_cache.h \ - web/server/single/single-threaded.c \ - web/server/single/single-threaded.h \ - web/server/multi/multi-threaded.c \ - web/server/multi/multi-threaded.h \ web/server/static/static-threaded.c \ web/server/static/static-threaded.h \ $(NULL) diff --git a/build/subst.inc b/build/subst.inc index 508c0e142b..558d33adf9 100644 --- a/build/subst.inc +++ b/build/subst.inc @@ -5,6 +5,8 @@ -e 's#[@]configdir_POST@#$(configdir)#g' \ -e 's#[@]libconfigdir_POST@#$(libconfigdir)#g' \ -e 's#[@]cachedir_POST@#$(cachedir)#g' \ + -e 's#[@]registrydir_POST@#$(registrydir)#g' \ + -e 's#[@]varlibdir_POST@#$(varlibdir)#g' \ $< > $@.tmp; then \ mv "$@.tmp" "$@"; \ else \ diff --git a/configure.ac b/configure.ac index ff591f963c..86b9782ba9 100644 --- a/configure.ac +++ b/configure.ac @@ -609,10 +609,9 @@ AC_CONFIG_FILES([ web/api/queries/ses/Makefile web/api/queries/stddev/Makefile web/api/queries/sum/Makefile + web/api/health/Makefile web/gui/Makefile web/server/Makefile - web/server/single/Makefile - web/server/multi/Makefile web/server/static/Makefile ]) AC_OUTPUT diff --git a/daemon/main.c b/daemon/main.c index b2c4c80bf5..1ec5f6ed5a 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -67,8 +67,6 @@ struct netdata_static_thread static_threads[] = { // common plugins for all systems {"BACKENDS", NULL, NULL, 1, NULL, NULL, backends_main}, - {"WEB_SERVER[multi]", NULL, NULL, 1, NULL, NULL, socket_listen_main_multi_threaded}, - {"WEB_SERVER[single]", NULL, NULL, 0, NULL, NULL, socket_listen_main_single_threaded}, {"WEB_SERVER[static1]", NULL, NULL, 0, NULL, NULL, socket_listen_main_static_threaded}, {"STREAM", NULL, NULL, 0, NULL, NULL, rrdpush_sender_thread}, @@ -81,18 +79,10 @@ struct netdata_static_thread static_threads[] = { void web_server_threading_selection(void) { web_server_mode = web_server_mode_id(config_get(CONFIG_SECTION_WEB, "mode", web_server_mode_name(web_server_mode))); - int multi_threaded = (web_server_mode == WEB_SERVER_MODE_MULTI_THREADED); - int single_threaded = (web_server_mode == WEB_SERVER_MODE_SINGLE_THREADED); int static_threaded = (web_server_mode == WEB_SERVER_MODE_STATIC_THREADED); int i; for (i = 0; static_threads[i].name; i++) { - if (static_threads[i].start_routine == socket_listen_main_multi_threaded) - static_threads[i].enabled = multi_threaded; - - if (static_threads[i].start_routine == socket_listen_main_single_threaded) - static_threads[i].enabled = single_threaded; - if (static_threads[i].start_routine == socket_listen_main_static_threaded) static_threads[i].enabled = static_threaded; } @@ -113,6 +103,8 @@ void web_server_config_options(void) { web_allow_registry_from = simple_pattern_create(config_get(CONFIG_SECTION_REGISTRY, "allow from", "*"), NULL, SIMPLE_PATTERN_EXACT); web_allow_streaming_from = simple_pattern_create(config_get(CONFIG_SECTION_WEB, "allow streaming from", "*"), NULL, SIMPLE_PATTERN_EXACT); web_allow_netdataconf_from = simple_pattern_create(config_get(CONFIG_SECTION_WEB, "allow netdata.conf from", "localhost fd* 10.* 192.168.* 172.16.* 172.17.* 172.18.* 172.19.* 172.20.* 172.21.* 172.22.* 172.23.* 172.24.* 172.25.* 172.26.* 172.27.* 172.28.* 172.29.* 172.30.* 172.31.*"), NULL, SIMPLE_PATTERN_EXACT); + web_allow_mgmt_from = simple_pattern_create(config_get(CONFIG_SECTION_WEB, "allow management from", "localhost"), NULL, SIMPLE_PATTERN_EXACT); + #ifdef NETDATA_WITH_ZLIB web_enable_gzip = config_get_boolean(CONFIG_SECTION_WEB, "enable gzip compression", web_enable_gzip); @@ -367,13 +359,6 @@ void log_init(void) { } static void backwards_compatible_config() { - // allow existing configurations to work with the current version of netdata - - if(config_exists(CONFIG_SECTION_GLOBAL, "multi threaded web server")) { - int mode = config_get_boolean(CONFIG_SECTION_GLOBAL, "multi threaded web server", 1); - web_server_mode = (mode)?WEB_SERVER_MODE_MULTI_THREADED:WEB_SERVER_MODE_SINGLE_THREADED; - } - // move [global] options to the [web] section config_move(CONFIG_SECTION_GLOBAL, "http port listen backlog", CONFIG_SECTION_WEB, "listen backlog"); @@ -876,7 +861,6 @@ int main(int argc, char **argv) { load_netdata_conf(NULL, 0); } - backwards_compatible_config(); get_netdata_configured_variables(); const char *section = argv[optind]; @@ -1056,7 +1040,6 @@ int main(int argc, char **argv) { rrd_init(netdata_configured_hostname); - // ------------------------------------------------------------------------ // enable log flood protection diff --git a/database/rrdcalc.h b/database/rrdcalc.h index 0c7cd0aa1e..4df4381ae2 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -25,6 +25,8 @@ #define RRDCALC_FLAG_WARN_ERROR 0x00000010 #define RRDCALC_FLAG_CRIT_ERROR 0x00000020 #define RRDCALC_FLAG_RUNNABLE 0x00000040 +#define RRDCALC_FLAG_DISABLED 0x00000080 +#define RRDCALC_FLAG_SILENCED 0x00000100 #define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 struct rrdcalc { diff --git a/database/rrdhost.c b/database/rrdhost.c index 43aa2daa29..7234db9a05 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -103,7 +103,6 @@ static inline void rrdhost_init_machine_guid(RRDHOST *host, const char *machine_ host->hash_machine_guid = simple_hash(host->machine_guid); } - // ---------------------------------------------------------------------------- // RRDHOST - add a host @@ -149,6 +148,7 @@ RRDHOST *rrdhost_create(const char *hostname, rrdhost_init_hostname(host, hostname); rrdhost_init_machine_guid(host, guid); + rrdhost_init_os(host, os); rrdhost_init_timezone(host, timezone); rrdhost_init_tags(host, tags); @@ -442,7 +442,7 @@ restart_after_removal: void rrd_init(char *hostname) { rrdset_free_obsolete_time = config_get_number(CONFIG_SECTION_GLOBAL, "cleanup obsolete charts after seconds", rrdset_free_obsolete_time); gap_when_lost_iterations_above = (int)config_get_number(CONFIG_SECTION_GLOBAL, "gap when lost iterations above", gap_when_lost_iterations_above); - if(gap_when_lost_iterations_above < 1) + if (gap_when_lost_iterations_above < 1) gap_when_lost_iterations_above = 1; health_init(); @@ -471,6 +471,7 @@ void rrd_init(char *hostname) { , 1 ); rrd_unlock(); + web_client_api_v1_management_init(); } // ---------------------------------------------------------------------------- diff --git a/database/rrdsetvar.c b/database/rrdsetvar.c index 1bb883f0b4..9da4193049 100644 --- a/database/rrdsetvar.c +++ b/database/rrdsetvar.c @@ -150,12 +150,12 @@ RRDSETVAR *rrdsetvar_custom_chart_variable_create(RRDSET *st, const char *name) if(hash == rs->hash && strcmp(n, rs->variable) == 0) { rrdset_unlock(st); if(rs->options & RRDVAR_OPTION_CUSTOM_CHART_VAR) { - free(n); + freez(n); return rs; } else { error("RRDSETVAR: custom variable '%s' on chart '%s' of host '%s', conflicts with an internal chart variable", n, st->id, host->hostname); - free(n); + freez(n); return NULL; } } diff --git a/database/rrdvar.c b/database/rrdvar.c index 951a38caca..600bd34c4a 100644 --- a/database/rrdvar.c +++ b/database/rrdvar.c @@ -137,7 +137,7 @@ static RRDVAR *rrdvar_custom_variable_create(const char *scope, avl_tree_lock *t RRDVAR *rv = rrdvar_create_and_index(scope, tree_lock, name, RRDVAR_TYPE_CALCULATED, RRDVAR_OPTION_CUSTOM_HOST_VAR|RRDVAR_OPTION_ALLOCATED, v); if(unlikely(!rv)) { - free(v); + freez(v); debug(D_VARIABLES, "Requested variable '%s' already exists - possibly 2 plugins are updating it at the same time.", name); char *variable = strdupz(name); diff --git a/docs/generator/buildyaml.sh b/docs/generator/buildyaml.sh index 10811b17e4..3a96643ac8 100755 --- a/docs/generator/buildyaml.sh +++ b/docs/generator/buildyaml.sh @@ -94,8 +94,6 @@ markdown_extensions: - pymdownx.caret - pymdownx.critic - pymdownx.details - - pymdownx.emoji: - emoji_generator: !!python/name:pymdownx.emoji.to_svg - pymdownx.inlinehilite - pymdownx.magiclink - pymdownx.mark @@ -234,5 +232,5 @@ echo -ne "- Hacking netdata: navpart 2 makeself "" "" 4 navpart 2 libnetdata "" "libnetdata" 4 navpart 2 contrib -navpart 2 tests +navpart 2 tests "" "" 2 navpart 2 diagrams/data_structures diff --git a/docs/generator/requirements.txt b/docs/generator/requirements.txt index 5021831087..ac01be7aef 100644 --- a/docs/generator/requirements.txt +++ b/docs/generator/requirements.txt @@ -1,3 +1,2 @@ mkdocs>=1.0.1 mkdocs-material - diff --git a/health/README.md b/health/README.md index fae34fc71a..54f6a3e1f7 100644 --- a/health/README.md +++ b/health/README.md @@ -159,7 +159,7 @@ The simple pattern syntax and operation is explained in [simple patterns](../lib #### Alarm line `lookup` -This lines makes a database lookup to find a value. This result of this lookup is available as `$this`. +This line makes a database lookup to find a value. This result of this lookup is available as `$this`. The format is: @@ -465,7 +465,7 @@ Although the `alarm_variables` link shows you variables for a particular chart, - `$status`, which is resolved to the current status of the alarm (the current = the last status, i.e. before the current database lookup and the evaluation of the `calc` line). This values can be compared with `$REMOVED`, `$UNINITIALIZED`, `$UNDEFINED`, `$CLEAR`, - `$WARNING`, `$CRITICAL`. These values are incremental, ie. `$status > $CLEAL` works as + `$WARNING`, `$CRITICAL`. These values are incremental, ie. `$status > $CLEAR` works as expected. - `$now`, which is resolved to current unix timestamp. @@ -653,5 +653,11 @@ You can find the context of charts by looking up the chart in either You can find how netdata interpreted the expressions by examining the alarm at `http://your.netdata:19999/api/v1/alarms?all`. For each expression, netdata will return the expression as given in its config file, and the same expression with additional parentheses added to indicate the evaluation flow of the expression. +## Disabling health checks or silencing notifications at runtime + +The health checks can be controlled at runtime via the [health management api](../web/api/health/#health-management-api). [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() + + + diff --git a/health/health.c b/health/health.c index 5acbfdd2e8..f92a1ba6b0 100644 --- a/health/health.c +++ b/health/health.c @@ -2,6 +2,12 @@ #include "health.h" +struct health_cmdapi_thread_status { + int status; + ; + struct rusage rusage; +}; + unsigned int default_health_enabled = 1; // ---------------------------------------------------------------------------- @@ -147,6 +153,12 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } } + // Check if alarm notifications are silenced + if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { + info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + goto done; + } + static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; pid_t command_pid; @@ -381,6 +393,67 @@ static void health_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } +SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { + SILENCER *s; + debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", + rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:""); + + for (s = silencers->silencers; s!=NULL; s=s->next){ + if ( + (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) && + (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) && + (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) && + (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) && + (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family))) + ) { + debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); + if (unlikely(silencers->stype == STYPE_NONE)) { + debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name); + } else { + debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" + , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" + , rc->name + , (rc->rrdset)?rc->rrdset->context:"" + , rc->chart + , host + , (rc->rrdset)?rc->rrdset->family:"" + ); + } + return silencers->stype; + } + } + return STYPE_NONE; +} + +int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { + uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; + // Clear the flags + rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); + if (unlikely(silencers->all_alarms)) { + if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; + else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + } else { + SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers); + if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; + else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + } + + if (rrdcalc_flags_old != rc->rrdcalc_flags) { + info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", + host->hostname, + rc->name, + (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false", + (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false", + (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false", + (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false" + ); + } + if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) + return 1; + else + return 0; +} + void *health_main(void *ptr) { netdata_thread_cleanup_push(health_main_cleanup, ptr); @@ -391,371 +464,338 @@ void *health_main(void *ptr) { time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); unsigned int loop = 0; + + silencers = mallocz(sizeof(SILENCERS)); + silencers->all_alarms=0; + silencers->stype=STYPE_NONE; + silencers->silencers=NULL; + while(!netdata_exit) { - loop++; - debug(D_HEALTH, "Health monitoring iteration no %u started", loop); - - int runnable = 0, apply_hibernation_delay = 0; - time_t next_run = now + min_run_every; - RRDCALC *rc; - - if(unlikely(check_if_resumed_from_suspention())) { - apply_hibernation_delay = 1; - - info("Postponing alarm checks for %ld seconds, because it seems that the system was just resumed from suspension." - , hibernation_delay - ); - } - - rrd_rdlock(); - - RRDHOST *host; - rrdhost_foreach_read(host) { - if(unlikely(!host->health_enabled)) - continue; - - if(unlikely(apply_hibernation_delay)) { - - info("Postponing health checks for %ld seconds, on host '%s'." - , hibernation_delay - , host->hostname - ); - - host->health_delay_up_to = now + hibernation_delay; - } - - if(unlikely(host->health_delay_up_to)) { - if(unlikely(now < host->health_delay_up_to)) - continue; - - info("Resuming health checks on host '%s'.", host->hostname); - host->health_delay_up_to = 0; - } - - rrdhost_rdlock(host); - - // the first loop is to lookup values from the db - for(rc = host->alarms; rc; rc = rc->next) { - if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; - continue; - } - - runnable++; - rc->old_value = rc->value; - rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; - - // ------------------------------------------------------------ - // if there is database lookup, do it - - if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { - /* time_t old_db_timestamp = rc->db_before; */ - int value_is_null = 0; - - int ret = rrdset2value_api_v1(rc->rrdset - , NULL - , &rc->value - , rc->dimensions - , 1 - , rc->after - , rc->before - , rc->group - , 0 - , rc->options - , &rc->db_after - , &rc->db_before - , &value_is_null - ); - - if(unlikely(ret != 200)) { - // database lookup failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': database lookup returned error %d" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , ret - ); - } - else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; - - /* - RRDCALC_FLAG_DB_STALE not currently used - if (unlikely(old_db_timestamp == rc->db_before)) { - // database is stale - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); - - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; - error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); - } - } - else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; - */ - - if(unlikely(value_is_null)) { - // collected value is null - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - ); - } - else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->value - ); - } - - // ------------------------------------------------------------ - // if there is calculation expression, run it - - if(unlikely(rc->calculation)) { - if(unlikely(!expression_evaluate(rc->calculation))) { - // calculation failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->calculation->parsed_as - , buffer_tostring(rc->calculation->error_msg) - ); - } - else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->calculation->parsed_as - , rc->calculation->result - , buffer_tostring(rc->calculation->error_msg) - , rc->source - ); - - rc->value = rc->calculation->result; - - if(rc->local) rc->local->last_updated = now; - if(rc->family) rc->family->last_updated = now; - if(rc->hostid) rc->hostid->last_updated = now; - if(rc->hostname) rc->hostname->last_updated = now; - } - } - } - rrdhost_unlock(host); - - if(unlikely(runnable && !netdata_exit)) { - rrdhost_rdlock(host); - - for(rc = host->alarms; rc; rc = rc->next) { - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) - continue; - - RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; - RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; - - // -------------------------------------------------------- - // check the warning expression - - if(likely(rc->warning)) { - if(unlikely(!expression_evaluate(rc->warning))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , buffer_tostring(rc->warning->error_msg) - ); - } - else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->warning->result - , buffer_tostring(rc->warning->error_msg) - , rc->source - ); - warning_status = rrdcalc_value2status(rc->warning->result); - } - } - - // -------------------------------------------------------- - // check the critical expression - - if(likely(rc->critical)) { - if(unlikely(!expression_evaluate(rc->critical))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , buffer_tostring(rc->critical->error_msg) - ); - } - else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->critical->result - , buffer_tostring(rc->critical->error_msg) - , rc->source - ); - critical_status = rrdcalc_value2status(rc->critical->result); - } - } - - // -------------------------------------------------------- - // decide the final alarm status - - RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; - - switch(warning_status) { - case RRDCALC_STATUS_CLEAR: - status = RRDCALC_STATUS_CLEAR; - break; - - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_WARNING; - break; - - default: - break; - } - - switch(critical_status) { - case RRDCALC_STATUS_CLEAR: - if(status == RRDCALC_STATUS_UNDEFINED) - status = RRDCALC_STATUS_CLEAR; - break; - - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_CRITICAL; - break; - - default: - break; - } - - // -------------------------------------------------------- - // check if the new status and the old differ - - if(status != rc->status) { - int delay = 0; - - // apply trigger hysteresis - - if(now > rc->delay_up_to_timestamp) { - rc->delay_up_current = rc->delay_up_duration; - rc->delay_down_current = rc->delay_down_duration; - rc->delay_last = 0; - rc->delay_up_to_timestamp = 0; - } - else { - rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); - if(rc->delay_up_current > rc->delay_max_duration) - rc->delay_up_current = rc->delay_max_duration; - - rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); - if(rc->delay_down_current > rc->delay_max_duration) - rc->delay_down_current = rc->delay_max_duration; - } - - if(status > rc->status) - delay = rc->delay_up_current; - else - delay = rc->delay_down_current; - - // COMMENTED: because we do need to send raising alarms - // if(now + delay < rc->delay_up_to_timestamp) - // delay = (int)(rc->delay_up_to_timestamp - now); - - rc->delay_last = delay; - rc->delay_up_to_timestamp = now + delay; - - // add the alarm into the log - - health_alarm_log( - host - , rc->id - , rc->next_event_id++ - , now - , rc->name - , rc->rrdset->id - , rc->rrdset->family - , rc->exec - , rc->recipient - , now - rc->last_status_change - , rc->old_value - , rc->value - , rc->status - , status - , rc->source - , rc->units - , rc->info - , rc->delay_last - , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0 - ); - - rc->last_status_change = now; - rc->status = status; - } - - rc->last_updated = now; - rc->next_update = now + rc->update_every; - - if(next_run > rc->next_update) - next_run = rc->next_update; - } - - rrdhost_unlock(host); - } - - if(unlikely(netdata_exit)) - break; - - // execute notifications - // and cleanup - health_alarm_log_process(host); - - if(unlikely(netdata_exit)) - break; - - } /* rrdhost_foreach */ - - rrd_unlock(); + loop++; + debug(D_HEALTH, "Health monitoring iteration no %u started", loop); + + int runnable = 0, apply_hibernation_delay = 0; + time_t next_run = now + min_run_every; + RRDCALC *rc; + + if (unlikely(check_if_resumed_from_suspention())) { + apply_hibernation_delay = 1; + + info("Postponing alarm checks for %ld seconds, because it seems that the system was just resumed from suspension.", + hibernation_delay + ); + } + + if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { + static int logged=0; + if (!logged) { + info("Skipping health checks, because all alarms are disabled via a %s command.", + HEALTH_CMDAPI_CMD_DISABLEALL); + logged = 1; + } + } + + rrd_rdlock(); + + RRDHOST *host; + rrdhost_foreach_read(host) { + if (unlikely(!host->health_enabled)) + continue; + + if (unlikely(apply_hibernation_delay)) { + + info("Postponing health checks for %ld seconds, on host '%s'.", hibernation_delay, host->hostname + ); + + host->health_delay_up_to = now + hibernation_delay; + } + + if (unlikely(host->health_delay_up_to)) { + if (unlikely(now < host->health_delay_up_to)) + continue; + + info("Resuming health checks on host '%s'.", host->hostname); + host->health_delay_up_to = 0; + } + + rrdhost_rdlock(host); + + // the first loop is to lookup values from the db + for (rc = host->alarms; rc; rc = rc->next) { + + if (update_disabled_silenced(host, rc)) + continue; + + if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { + if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; + continue; + } + + runnable++; + rc->old_value = rc->value; + rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; + + // ------------------------------------------------------------ + // if there is database lookup, do it + + if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + /* time_t old_db_timestamp = rc->db_before; */ + int value_is_null = 0; + + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after, + rc->before, rc->group, 0, rc->options, &rc->db_after, + &rc->db_before, &value_is_null + ); + + if (unlikely(ret != 200)) { + // database lookup failed + rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret + ); + } else + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; + + /* - RRDCALC_FLAG_DB_STALE not currently used + if (unlikely(old_db_timestamp == rc->db_before)) { + // database is stale + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + + if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; + error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + } + } + else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; + */ + + if (unlikely(value_is_null)) { + // collected value is null + rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name + ); + } else + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " + CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rc->value + ); + } + + // ------------------------------------------------------------ + // if there is calculation expression, run it + + if (unlikely(rc->calculation)) { + if (unlikely(!expression_evaluate(rc->calculation))) { + // calculation failed + rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) + ); + } else { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + CALCULATED_NUMBER_FORMAT + ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rc->calculation->parsed_as, rc->calculation->result, + buffer_tostring(rc->calculation->error_msg), rc->source + ); + + rc->value = rc->calculation->result; + + if (rc->local) rc->local->last_updated = now; + if (rc->family) rc->family->last_updated = now; + if (rc->hostid) rc->hostid->last_updated = now; + if (rc->hostname) rc->hostname->last_updated = now; + } + } + } + + rrdhost_unlock(host); + + if (unlikely(runnable && !netdata_exit)) { + rrdhost_rdlock(host); + + for (rc = host->alarms; rc; rc = rc->next) { + if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) + continue; + + if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) { + continue; + } + RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; + RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; + + // -------------------------------------------------------- + // check the warning expression + + if (likely(rc->warning)) { + if (unlikely(!expression_evaluate(rc->warning))) { + // calculation failed + rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + buffer_tostring(rc->warning->error_msg) + ); + } else { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " + CALCULATED_NUMBER_FORMAT + ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", + rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source + ); + warning_status = rrdcalc_value2status(rc->warning->result); + } + } + + // -------------------------------------------------------- + // check the critical expression + + if (likely(rc->critical)) { + if (unlikely(!expression_evaluate(rc->critical))) { + // calculation failed + rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + buffer_tostring(rc->critical->error_msg) + ); + } else { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " + CALCULATED_NUMBER_FORMAT + ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", + rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg), + rc->source + ); + critical_status = rrdcalc_value2status(rc->critical->result); + } + } + + // -------------------------------------------------------- + // decide the final alarm status + + RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; + + switch (warning_status) { + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; + + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; + + default: + break; + } + + switch (critical_status) { + case RRDCALC_STATUS_CLEAR: + if (status == RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; + + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; + + default: + break; + } + + // -------------------------------------------------------- + // check if the new status and the old differ + + if (status != rc->status) { + int delay = 0; + + // apply trigger hysteresis + + if (now > rc->delay_up_to_timestamp) { + rc->delay_up_current = rc->delay_up_duration; + rc->delay_down_current = rc->delay_down_duration; + rc->delay_last = 0; + rc->delay_up_to_timestamp = 0; + } else { + rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); + if (rc->delay_up_current > rc->delay_max_duration) + rc->delay_up_current = rc->delay_max_duration; + + rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); + if (rc->delay_down_current > rc->delay_max_duration) + rc->delay_down_current = rc->delay_max_duration; + } + + if (status > rc->status) + delay = rc->delay_up_current; + else + delay = rc->delay_down_current; + + // COMMENTED: because we do need to send raising alarms + // if(now + delay < rc->delay_up_to_timestamp) + // delay = (int)(rc->delay_up_to_timestamp - now); + + rc->delay_last = delay; + rc->delay_up_to_timestamp = now + delay; + + health_alarm_log( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + + ); + + rc->last_status_change = now; + rc->status = status; + } + + rc->last_updated = now; + rc->next_update = now + rc->update_every; + + if (next_run > rc->next_update) + next_run = rc->next_update; + } + + rrdhost_unlock(host); + } + + if (unlikely(netdata_exit)) + break; + + // execute notifications + // and cleanup + health_alarm_log_process(host); + + if (unlikely(netdata_exit)) + break; + + } /* rrdhost_foreach */ + + rrd_unlock(); + if(unlikely(netdata_exit)) break; diff --git a/health/health.h b/health/health.h index ff7a4d9bf1..ff10fd6d7a 100644 --- a/health/health.h +++ b/health/health.h @@ -22,9 +22,74 @@ extern unsigned int default_health_enabled; #define HEALTH_ENTRY_FLAG_UPDATED 0x00000002 #define HEALTH_ENTRY_FLAG_EXEC_RUN 0x00000004 #define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008 +#define HEALTH_ENTRY_FLAG_SILENCED 0x00000008 + #define HEALTH_ENTRY_FLAG_SAVED 0x10000000 #define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 +#ifndef HEALTH_LISTEN_PORT +#define HEALTH_LISTEN_PORT 19998 +#endif + +#ifndef HEALTH_LISTEN_BACKLOG +#define HEALTH_LISTEN_BACKLOG 4096 +#endif + +#define HEALTH_ALARM_KEY "alarm" +#define HEALTH_TEMPLATE_KEY "template" +#define HEALTH_ON_KEY "on" +#define HEALTH_CONTEXT_KEY "context" +#define HEALTH_CHART_KEY "chart" +#define HEALTH_HOST_KEY "hosts" +#define HEALTH_OS_KEY "os" +#define HEALTH_FAMILIES_KEY "families" +#define HEALTH_LOOKUP_KEY "lookup" +#define HEALTH_CALC_KEY "calc" +#define HEALTH_EVERY_KEY "every" +#define HEALTH_GREEN_KEY "green" +#define HEALTH_RED_KEY "red" +#define HEALTH_WARN_KEY "warn" +#define HEALTH_CRIT_KEY "crit" +#define HEALTH_EXEC_KEY "exec" +#define HEALTH_RECIPIENT_KEY "to" +#define HEALTH_UNITS_KEY "units" +#define HEALTH_INFO_KEY "info" +#define HEALTH_DELAY_KEY "delay" +#define HEALTH_OPTIONS_KEY "options" + +typedef struct silencer { + char *alarms; + SIMPLE_PATTERN *alarms_pattern; + + char *hosts; + SIMPLE_PATTERN *hosts_pattern; + + char *contexts; + SIMPLE_PATTERN *contexts_pattern; + + char *charts; + SIMPLE_PATTERN *charts_pattern; + + char *families; + SIMPLE_PATTERN *families_pattern; + + struct silencer *next; +} SILENCER; + +typedef enum silence_type { + STYPE_NONE, + STYPE_DISABLE_ALARMS, + STYPE_SILENCE_NOTIFICATIONS +} SILENCE_TYPE; + +typedef struct silencers { + int all_alarms; + SILENCE_TYPE stype; + SILENCER *silencers; +} SILENCERS; + +SILENCERS *silencers; + extern void health_init(void); extern void *health_main(void *ptr); @@ -62,8 +127,7 @@ extern void health_alarm_log( const char *units, const char *info, int delay, - uint32_t flags -); + uint32_t flags); extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath); extern char *health_user_config_dir(void); @@ -73,4 +137,6 @@ extern void health_alarm_log_free(RRDHOST *host); extern void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); +extern void *health_cmdapi_thread(void *ptr); + #endif //NETDATA_HEALTH_H diff --git a/health/health_json.c b/health/health_json.c index a049dc1b2b..7811324472 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -43,6 +43,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R "\t\t\"updates_id\": %u,\n" "\t\t\"value_string\": \"%s\",\n" "\t\t\"old_value_string\": \"%s\",\n" + "\t\t\"silenced\": \"%s\",\n" , host->hostname , ae->unique_id , ae->alarm_id @@ -70,6 +71,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R , ae->updates_id , ae->new_value_string , ae->old_value_string + , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false" ); health_string2json(wb, "\t\t", "info", ae->info?ae->info:"", ",\n"); @@ -120,6 +122,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC "\t\t\t\"chart\": \"%s\",\n" "\t\t\t\"family\": \"%s\",\n" "\t\t\t\"active\": %s,\n" + "\t\t\t\"disabled\": %s,\n" + "\t\t\t\"silenced\": %s,\n" "\t\t\t\"exec\": \"%s\",\n" "\t\t\t\"recipient\": \"%s\",\n" "\t\t\t\"source\": \"%s\",\n" @@ -143,6 +147,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC , rc->chart , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:"" , (rc->rrdset)?"true":"false" + , (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false" + , (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false" , rc->exec?rc->exec:host->health_default_exec , rc->recipient?rc->recipient:host->health_default_recipient , rc->source diff --git a/health/health_log.c b/health/health_log.c index dd51be2afa..009e426737 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -396,7 +396,6 @@ inline void health_alarm_log( ae->duration = duration; ae->delay = delay; ae->delay_up_to_timestamp = when + delay; - ae->flags |= flags; if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) diff --git a/libnetdata/socket/socket.c b/libnetdata/socket/socket.c index e3d1088f59..a4574f4dd1 100644 --- a/libnetdata/socket/socket.c +++ b/libnetdata/socket/socket.c @@ -248,7 +248,7 @@ int create_listen_socket6(int socktype, uint32_t scope_id, const char *ip, int p return sock; } -static inline int listen_sockets_add(LISTEN_SOCKETS *sockets, int fd, int family, int socktype, const char *protocol, const char *ip, uint16_t port) { +static inline int listen_sockets_add(LISTEN_SOCKETS *sockets, int fd, int family, int socktype, const char *protocol, const char *ip, uint16_t port, int acl_flags) { if(sockets->opened >= MAX_LISTEN_FDS) { error("LISTENER: Too many listening sockets. Failed to add listening %s socket at ip '%s' port %d, protocol %s, socktype %d", protocol, ip, port, protocol, socktype); close(fd); @@ -259,6 +259,7 @@ static inline int listen_sockets_add(LISTEN_SOCKETS *sockets, int fd, int family sockets->fds_types[sockets->opened] = socktype; sockets->fds_families[sockets->opened] = family; sockets->fds_names[sockets->opened] = strdup_client_description(family, protocol, ip, port); + sockets->fds_acl_flags[sockets->opened] = acl_flags; sockets->opened++; return 0; @@ -300,8 +301,20 @@ void listen_sockets_close(LISTEN_SOCKETS *sockets) { sockets->failed = 0; } +WEB_CLIENT_ACL read_acl(char *st) { + if (!strcmp(st,"dashboard")) return WEB_CLIENT_ACL_DASHBOARD; + if (!strcmp(st,"registry")) return WEB_CLIENT_ACL_REGISTRY; + if (!strcmp(st,"badges")) return WEB_CLIENT_ACL_BADGE; + if (!strcmp(st,"management")) return WEB_CLIENT_ACL_MGMT; + if (!strcmp(st,"streaming")) return WEB_CLIENT_ACL_STREAMING; + if (!strcmp(st,"netdata.conf")) return WEB_CLIENT_ACL_NETDATACONF; + return WEB_CLIENT_ACL_NONE; +} + static inline int bind_to_this(LISTEN_SOCKETS *sockets, const char *definition, uint16_t default_port, int listen_backlog) { int added = 0; + WEB_CLIENT_ACL acl_flags = WEB_CLIENT_ACL_NONE; + struct addrinfo hints; struct addrinfo *result = NULL, *rp = NULL; @@ -311,10 +324,11 @@ static inline int bind_to_this(LISTEN_SOCKETS *sockets, const char *definition, char buffer2[10 + 1]; snprintfz(buffer2, 10, "%d", default_port); - char *ip = buffer, *port = buffer2, *interface = "";; + char *ip = buffer, *port = buffer2, *interface = "", *portconfig;; int protocol = IPPROTO_TCP, socktype = SOCK_STREAM; const char *protocol_str = "tcp"; + int unix_socket=0; if(strncmp(ip, "tcp:", 4) == 0) { ip += 4; @@ -329,20 +343,10 @@ static inline int bind_to_this(LISTEN_SOCKETS *sockets, const char *definition, protocol_str = "udp"; } else if(strncmp(ip, "unix:", 5) == 0) { - char *path = ip + 5; + ip += 5; socktype = SOCK_STREAM; protocol_str = "unix"; - - int fd = create_listen_socket_unix(path, listen_backlog); - if (fd == -1) { - error("LISTENER: Cannot create unix socket '%s'", path); - sockets->failed++; - } - else { - listen_sockets_add(sockets, fd, AF_UNIX, socktype, protocol_str, path, 0); - added++; - } - return added; + unix_socket=1; } char *e = ip; @@ -355,21 +359,53 @@ static inline int bind_to_this(LISTEN_SOCKETS *sockets, const char *definition, } } else { - while(*e && *e != ':' && *e != '%') e++; + while(*e && *e != ':' && *e != '%' && *e != '=') e++; } if(*e == '%') { *e = '\0'; e++; interface = e; - while(*e && *e != ':') e++; + while(*e && *e != ':' && *e != '=') e++; } if(*e == ':') { port = e + 1; *e = '\0'; + while(*e && *e != '=') e++; } + if(*e == '=') { + *e='\0'; + e++; + portconfig = e; + while (*e != '\0') { + if (*e == '|') { + *e = '\0'; + acl_flags |= read_acl(portconfig); + e++; + portconfig = e; + continue; + } + e++; + } + acl_flags |= read_acl(portconfig); + } else { + acl_flags = WEB_CLIENT_ACL_DASHBOARD | WEB_CLIENT_ACL_REGISTRY | WEB_CLIENT_ACL_BADGE | WEB_CLIENT_ACL_MGMT | WEB_CLIENT_ACL_NETDATACONF | WEB_CLIENT_ACL_STREAMING; + } + + if (unix_socket) { + int fd = create_listen_socket_unix(port, listen_backlog); + if (fd == -1) { + error("LISTENER: Cannot create unix socket '%s'", port); + sockets->failed++; + } else { + listen_sockets_add(sockets, fd, AF_UNIX, socktype, protocol_str, port, 0, acl_flags); + added++; + } + return added; + } + uint32_t scope_id = 0; if(*interface) { scope_id = if_nametoindex(interface); @@ -435,7 +471,7 @@ static inline int bind_to_this(LISTEN_SOCKETS *sockets, const char *definition, sockets->failed++; } else { - listen_sockets_add(sockets, fd, family, socktype, protocol_str, rip, rport); + listen_sockets_add(sockets, fd, family, socktype, protocol_str, rip, rport, acl_flags); added++; } } @@ -975,6 +1011,7 @@ int accept_socket(int fd, int flags, char *client_ip, size_t ipsize, char *clien inline POLLINFO *poll_add_fd(POLLJOB *p , int fd , int socktype + , WEB_CLIENT_ACL port_acl , uint32_t flags , const char *client_ip , const char *client_port @@ -1013,6 +1050,8 @@ inline POLLINFO *poll_add_fd(POLLJOB *p p->inf[i].slot = (size_t)i; p->inf[i].flags = 0; p->inf[i].socktype = -1; + p->inf[i].port_acl = -1; + p->inf[i].client_ip = NULL; p->inf[i].client_port = NULL; p->inf[i].del_callback = p->del_callback; @@ -1042,6 +1081,7 @@ inline POLLINFO *poll_add_fd(POLLJOB *p pi->fd = fd; pi->p = p; pi->socktype = socktype; + pi->port_acl = port_acl; pi->flags = flags; pi->next = NULL; pi->client_ip = strdupz(client_ip); @@ -1272,6 +1312,7 @@ static void poll_events_process(POLLJOB *p, POLLINFO *pi, struct pollfd *pf, sho poll_add_fd(p , nfd , SOCK_STREAM + , pi->port_acl , POLLINFO_FLAG_CLIENT_SOCKET , client_ip , client_port @@ -1414,6 +1455,7 @@ void poll_events(LISTEN_SOCKETS *sockets POLLINFO *pi = poll_add_fd(&p , sockets->fds[i] , sockets->fds_types[i] + , sockets->fds_acl_flags[i] , POLLINFO_FLAG_SERVER_SOCKET , (sockets->fds_names[i])?sockets->fds_names[i]:"UNKNOWN" , "" @@ -1457,7 +1499,7 @@ void poll_events(LISTEN_SOCKETS *sockets } usec_t dt_usec = next_timer_usec - now_usec; - if(dt_usec > 1000 * USEC_PER_MS) + if(dt_usec < 1000 * USEC_PER_MS) timeout_ms = 1000; else timeout_ms = (int)(dt_usec / USEC_PER_MS); diff --git a/libnetdata/socket/socket.h b/libnetdata/socket/socket.h index f5412b63df..c69d4897f3 100644 --- a/libnetdata/socket/socket.h +++ b/libnetdata/socket/socket.h @@ -9,6 +9,24 @@ #define MAX_LISTEN_FDS 50 #endif +typedef enum web_client_acl { + WEB_CLIENT_ACL_NONE = 0, + WEB_CLIENT_ACL_NOCHECK = 0, + WEB_CLIENT_ACL_DASHBOARD = 1 << 0, + WEB_CLIENT_ACL_REGISTRY = 1 << 1, + WEB_CLIENT_ACL_BADGE = 1 << 2, + WEB_CLIENT_ACL_MGMT = 1 << 3, + WEB_CLIENT_ACL_STREAMING = 1 << 4, + WEB_CLIENT_ACL_NETDATACONF = 1 << 5 +} WEB_CLIENT_ACL; + +#define web_client_can_access_dashboard(w) ((w)->acl & WEB_CLIENT_ACL_DASHBOARD) +#define web_client_can_access_registry(w) ((w)->acl & WEB_CLIENT_ACL_REGISTRY) +#define web_client_can_access_badges(w) ((w)->acl & WEB_CLIENT_ACL_BADGE) +#define web_client_can_access_mgmt(w) ((w)->acl & WEB_CLIENT_ACL_MGMT) +#define web_client_can_access_stream(w) ((w)->acl & WEB_CLIENT_ACL_STREAMING) +#define web_client_can_access_netdataconf(w) ((w)->acl & WEB_CLIENT_ACL_NETDATACONF) + typedef struct listen_sockets { struct config *config; // the config file to use const char *config_section; // the netdata configuration section to read settings from @@ -22,6 +40,7 @@ typedef struct listen_sockets { char *fds_names[MAX_LISTEN_FDS]; // descriptions for the open sockets int fds_types[MAX_LISTEN_FDS]; // the socktype for the open sockets (SOCK_STREAM, SOCK_DGRAM) int fds_families[MAX_LISTEN_FDS]; // the family of the open sockets (AF_UNIX, AF_INET, AF_INET6) + WEB_CLIENT_ACL fds_acl_flags[MAX_LISTEN_FDS]; // the acl to apply to the open sockets (dashboard, badges, streaming, netdata.conf, management) } LISTEN_SOCKETS; extern char *strdup_client_description(int family, const char *protocol, const char *ip, uint16_t port); @@ -73,6 +92,7 @@ typedef struct pollinfo { int fd; // the file descriptor int socktype; // the client socket type + WEB_CLIENT_ACL port_acl; // the access lists permitted on this web server port (it's -1 for client sockets) char *client_ip; // the connected client IP char *client_port; // the connected client port @@ -138,6 +158,7 @@ extern void *poll_default_add_callback(POLLINFO *pi, short int *events, void *da extern POLLINFO *poll_add_fd(POLLJOB *p , int fd , int socktype + , WEB_CLIENT_ACL port_acl , uint32_t flags , const char *client_ip , const char *client_port diff --git a/registry/README.md b/registry/README.md index 589a77fd05..5a9a2b3bbc 100644 --- a/registry/README.md +++ b/registry/README.md @@ -36,11 +36,11 @@ The registry keeps track of 3 entities: For each netdata installation (each `machine_guid`) the registry keeps track of the different URLs it is accessed. -1. **persons**: i.e. the web browsers accessing the netdata installations (a random GUID generated by the registry the first time it sees a new web browser; we call this **person_guid**) +2. **persons**: i.e. the web browsers accessing the netdata installations (a random GUID generated by the registry the first time it sees a new web browser; we call this **person_guid**) For each person, the registry keeps track of the netdata installations it has accessed and their URLs. -1. **URLs** of netdata installations (as seen by the web browsers) +3. **URLs** of netdata installations (as seen by the web browsers) For each URL, the registry keeps the URL and nothing more. Each URL is linked to *persons* and *machines*. The only way to find a URL is to know its **machine_guid** or have a **person_guid** it is linked to it. diff --git a/registry/registry.h b/registry/registry.h index ab36de014f..ca74300e06 100644 --- a/registry/registry.h +++ b/registry/registry.h @@ -72,6 +72,7 @@ extern int registry_request_hello_json(RRDHOST *host, struct web_client *w); extern void registry_statistics(void); extern char *registry_get_this_machine_guid(void); +extern char *registry_get_mgmt_api_key(void); extern char *registry_get_this_machine_hostname(void); extern int regenerate_guid(const char *guid, char *result); diff --git a/registry/registry_person.c b/registry/registry_person.c index 53e3f47f42..268b0bd136 100644 --- a/registry/registry_person.c +++ b/registry/registry_person.c @@ -79,7 +79,7 @@ REGISTRY_PERSON_URL *registry_person_url_allocate(REGISTRY_PERSON *p, REGISTRY_M REGISTRY_PERSON_URL *tpu = registry_person_url_index_add(p, pu); if(tpu != pu) { error("Registry: Attempted to add duplicate person url '%s' with name '%s' to person '%s'", u->url, name, p->guid); - free(pu); + freez(pu); pu = tpu; } else diff --git a/registry/registry_url.c b/registry/registry_url.c index 6a71064588..9ac3ce10c5 100644 --- a/registry/registry_url.c +++ b/registry/registry_url.c @@ -51,7 +51,7 @@ REGISTRY_URL *registry_url_get(const char *url, size_t urllen) { n = registry_url_index_add(u); if(n != u) { error("INTERNAL ERROR: registry_url_get(): url '%s' already exists in the registry as '%s'", u->url, n->url); - free(u); + freez(u); u = n; } else diff --git a/streaming/README.md b/streaming/README.md index d92913bba3..e1ebad5369 100644 --- a/streaming/README.md +++ b/streaming/README.md @@ -81,7 +81,7 @@ monitoring (there cannot be health monitoring without a database). ``` [web] - mode = none | static-threaded | single-threaded | multi-threaded + mode = none | static-threaded accept a streaming request every seconds = 0 ``` diff --git a/tests/Makefile.am b/tests/Makefile.am index 722266d771..b0f65456e2 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,7 +1,15 @@ # SPDX-License-Identifier: GPL-3.0-or-later +AUTOMAKE_OPTIONS = subdir-objects MAINTAINERCLEANFILES = $(srcdir)/Makefile.in +CLEANFILES = \ + health_mgmtapi/health-cmdapi-test.sh \ + $(NULL) + +include $(top_srcdir)/build/subst.inc +SUFFIXES = .in + dist_noinst_DATA = \ README.md \ web/lib/jasmine-jquery.js \ @@ -13,8 +21,14 @@ dist_noinst_DATA = \ node.d/fronius.parse.spec.js \ node.d/fronius.process.spec.js \ node.d/fronius.validation.spec.js \ + health_mgmtapi/health-cmdapi-test.sh.in \ + $(NULL) + +dist_plugins_SCRIPTS = \ + health_mgmtapi/health-cmdapi-test.sh \ $(NULL) dist_noinst_SCRIPTS = \ stress.sh \ $(NULL) + diff --git a/tests/health_mgmtapi/README.md b/tests/health_mgmtapi/README.md new file mode 100644 index 0000000000..278c72dc16 --- /dev/null +++ b/tests/health_mgmtapi/README.md @@ -0,0 +1,13 @@ +# Health command API tester + +The directory `tests/health_cmdapi` contains the test script `health-cmdapi-test.sh` for the [health command API](../../web/api/health). + +The script can be executed with options to prepare the system for the tests, run them and restore the system to its previous state. + +It depends on the management API being accessible and on the responses to the api/v1/alarms?all requests being functional. + +Run it with `tests/health_mgmtapi/health-cmdapi-test.sh -h` to see the options. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Ftests%2Fhealth_mgmtapi%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() + + diff --git a/tests/health_mgmtapi/health-cmdapi-test.sh.in b/tests/health_mgmtapi/health-cmdapi-test.sh.in new file mode 100755 index 0000000000..0847be0079 --- /dev/null +++ b/tests/health_mgmtapi/health-cmdapi-test.sh.in @@ -0,0 +1,263 @@ +#!/usr/bin/env bash + +NETDATA_USER_CONFIG_DIR="@configdir_POST@" +NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@" +NETDATA_VARLIB_DIR="@varlibdir_POST@" + +printhelp () { + echo "Usage: health-cmdapi-test.sh [OPTIONS] + -s SETUP config files for python example tests + -c CLEANUP config files from python example tests + -r RESTART netdata after SETUP and CLEANUP, using systemctl restart netdata. + -t TEST scenarios execution + -u changes the host:port from localhost:19999 to + " +} + +check () { + echo -e "${GRAY}Check: '${1}' in 2 sec" + sleep 2 + resp=$(curl -s "http://$URL/api/v1/alarms?all") + r=$(echo "${resp}" | \ + python3 -c "import sys, json; d=json.load(sys.stdin); \ + print(\ + d['alarms']['example.random.example_alarm1']['disabled'], \ + d['alarms']['example.random.example_alarm1']['silenced'] , \ + d['alarms']['example.random.example_alarm2']['disabled'], \ + d['alarms']['example.random.example_alarm2']['silenced'], \ + d['alarms']['system.load.load_trigger']['disabled'], \ + d['alarms']['system.load.load_trigger']['silenced'], \ + );" 2>&1) + if [ $? -ne 0 ] ; then + echo -e "${RED}ERROR: Unexpected response '$resp'" + err=$((err+1)) + elif [ "${r}" != "${2}" ] ; then + echo -e "${RED}ERROR: 'Got ${r}'. Expected '${2}'" + err=$((err+1)) + else + echo -e "${GREEN}Success" + fi +} + +cmd () { + echo -e "${WHITE}Cmd '${1}', expecting '${2}'" + RESPONSE=$(curl -s "http://$URL/api/v1/manage/health?${1}" -H "Authorization: Bearer $TOKEN" 2>&1) + if [ "${RESPONSE}" != "${2}" ] ; then + echo -e "${RED}ERROR: Response '${RESPONSE}' != '${2}'" + err=$((err+1)) + else + echo -e "${GREEN}Success" + fi +} + +WHITE='\033[0;37m' +RED='\033[0;31m' +GREEN='\033[0;32m' +GRAY='\033[0;37m' + +SETUP=0 +RESTART=0 +CLEANUP=0 +TEST=0 +URL="localhost:19999" + +while getopts :srctu: option +do + case "$option" in + s) + SETUP=1 + ;; + r) + RESTART=1 + ;; + c) + CLEANUP=1 + ;; + t) + TEST=1 + ;; + u) + URL=$OPTARG + ;; + *) + printhelp + exit 1 + ;; + esac +done + +if [ $SETUP -eq 1 ] ; then + echo "Preparing netdata configuration for testing" + # Prep netdata for tests + if [ -f "${NETDATA_USER_CONFIG_DIR}/python.d.conf" ] ; then + cp -f "${NETDATA_USER_CONFIG_DIR}/python.d.conf" /tmp/python.d.conf + else + cp "${NETDATA_STOCK_CONFIG_DIR}/python.d.conf" "${NETDATA_USER_CONFIG_DIR}/" + fi + sed -i -e "s/example: no/example: yes/g" "${NETDATA_USER_CONFIG_DIR}/python.d.conf" + + mypath=$(cd ${0%/*} && echo $PWD) + + cp -f "${mypath}/python-example.conf" "${NETDATA_USER_CONFIG_DIR}/health.d/" + + # netdata.conf + if [ -f "${NETDATA_USER_CONFIG_DIR}/netdata.conf" ] ; then + cp -f "${NETDATA_USER_CONFIG_DIR}/netdata.conf" /tmp/netdata.conf + fi + printf "[health]\nrun at least every seconds = 1\n" > "${NETDATA_USER_CONFIG_DIR}/netdata.conf" + + chmod +r "${NETDATA_USER_CONFIG_DIR}/python.d.conf" "${NETDATA_USER_CONFIG_DIR}/netdata.conf" "${NETDATA_USER_CONFIG_DIR}/health.d/python-example.conf" "${NETDATA_STOCK_CONFIG_DIR}/health.d/load.conf" + # Restart netdata + if [ $RESTART -eq 1 ] ; then + echo "Restarting netdata" + systemctl restart netdata + fi +fi + +err=0 + +# Execute tests +if [ $TEST -eq 1 ] ; then + + HEALTH_CMDAPI_MSG_AUTHERROR="Auth Error" + HEALTH_CMDAPI_MSG_SILENCEALL="All alarm notifications are silenced" + HEALTH_CMDAPI_MSG_DISABLEALL="All health checks are disabled" + HEALTH_CMDAPI_MSG_RESET="All health checks and notifications are enabled" + HEALTH_CMDAPI_MSG_DISABLE="Health checks disabled for alarms matching the selectors" + HEALTH_CMDAPI_MSG_SILENCE="Alarm notifications silenced for alarms matching the selectors" + HEALTH_CMDAPI_MSG_ADDED="Alarm selector added" + HEALTH_CMDAPI_MSG_INVALID_KEY="Invalid key. Ignoring it." + HEALTH_CMDAPI_MSG_STYPEWARNING="WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command." + HEALTH_CMDAPI_MSG_NOSELECTORWARNING="WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors." + + if [ -f "${NETDATA_VARLIB_DIR}/netdata.api.key" ] ;then + read -r CORRECT_TOKEN < "${NETDATA_VARLIB_DIR}/netdata.api.key" + else + echo "${NETDATA_VARLIB_DIR}/netdata.api.key not found" + exit 1 + fi + # Set correct token + TOKEN="${CORRECT_TOKEN}" + + # Test default state + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + check "Default State" "False False False False False False" + + # Test auth failure + TOKEN="Wrong token" + cmd "cmd=DISABLE ALL" "$HEALTH_CMDAPI_MSG_AUTHERROR" + check "Default State" "False False False False False False" + + # Set correct token + TOKEN="${CORRECT_TOKEN}" + + # Test disable + cmd "cmd=DISABLE ALL" "$HEALTH_CMDAPI_MSG_DISABLEALL" + check "All disabled" "True False True False True False" + + # Reset + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + check "Default State" "False False False False False False" + + # Test silence + cmd "cmd=SILENCE ALL" "$HEALTH_CMDAPI_MSG_SILENCEALL" + check "All silenced" "False True False True False True" + + # Reset + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + check "Default State" "False False False False False False" + + # Add silencer by name + printf -v resp "$HEALTH_CMDAPI_MSG_SILENCE\n$HEALTH_CMDAPI_MSG_ADDED" + cmd "cmd=SILENCE&alarm=*example_alarm1 *load_trigger" "${resp}" + check "Silence notifications for alarm1 and load_trigger" "False True False False False True" + + # Convert to disable health checks + cmd "cmd=DISABLE" "$HEALTH_CMDAPI_MSG_DISABLE" + check "Disable notifications for alarm1 and load_trigger" "True False False False True False" + + # Convert back to silence notifications + cmd "cmd=SILENCE" "$HEALTH_CMDAPI_MSG_SILENCE" + check "Silence notifications for alarm1 and load_trigger" "False True False False False True" + + # Add second silencer by name + cmd "alarm=*example_alarm2" "$HEALTH_CMDAPI_MSG_ADDED" + check "Silence notifications for alarm1,alarm2 and load_trigger" "False True False True False True" + + # Reset + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + + # Add silencer by chart + printf -v resp "$HEALTH_CMDAPI_MSG_DISABLE\n$HEALTH_CMDAPI_MSG_ADDED" + cmd "cmd=DISABLE&chart=system.load" "${resp}" + check "Default State" "False False False False True False" + + # Add silencer by context + cmd "context=random" "$HEALTH_CMDAPI_MSG_ADDED" + check "Default State" "True False True False True False" + + # Reset + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + + # Add second condition to a selector (AND) + printf -v resp "$HEALTH_CMDAPI_MSG_SILENCE\n$HEALTH_CMDAPI_MSG_ADDED" + cmd "cmd=SILENCE&alarm=*example_alarm1 *load_trigger&chart=system.load" "${resp}" + check "Silence notifications load_trigger" "False False False False False True" + + # Add second selector with two conditions + cmd "alarm=*example_alarm1 *load_trigger&context=random" "$HEALTH_CMDAPI_MSG_ADDED" + check "Silence notifications load_trigger" "False True False False False True" + + # Reset + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + + # Add silencer without a command to disable or silence alarms + printf -v resp "$HEALTH_CMDAPI_MSG_ADDED\n$HEALTH_CMDAPI_MSG_STYPEWARNING" + cmd "families=load" "${resp}" + check "Family selector with no command" "False False False False False False" + + # Add silence command + cmd "cmd=SILENCE" "$HEALTH_CMDAPI_MSG_SILENCE" + check "Silence family load" "False False False False False True" + + # Reset + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + + # Add command without silencers + printf -v resp "$HEALTH_CMDAPI_MSG_SILENCE\n$HEALTH_CMDAPI_MSG_NOSELECTORWARNING" + cmd "cmd=SILENCE" "${resp}" + check "Command with no selector" "False False False False False False" + + # Add hosts silencer + cmd "hosts=*" "$HEALTH_CMDAPI_MSG_ADDED" + check "Silence all hosts" "False True False True False True" + + # Reset + cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + +fi + +# Cleanup +if [ $CLEANUP -eq 1 ] ; then + echo -e "${WHITE}Restoring netdata configuration" + for f in "python.d.conf" "netdata.conf" ; do + if [ -f "/tmp/$f" ] ; then + mv -f "/tmp/$f" "${NETDATA_USER_CONFIG_DIR}/" + else + rm -f "${NETDATA_USER_CONFIG_DIR}/$f" + fi + done + + rm -f "${NETDATA_USER_CONFIG_DIR}/health.d/python-example.conf" + + # Restart netdata + if [ $RESTART -eq 1 ] ; then + echo "Restarting netdata" + systemctl restart netdata + fi +fi + +if [ $err -gt 0 ] ; then + echo "$err error(s) found" + exit 1 +fi \ No newline at end of file diff --git a/tests/health_mgmtapi/python-example.conf b/tests/health_mgmtapi/python-example.conf new file mode 100644 index 0000000000..66713208c1 --- /dev/null +++ b/tests/health_mgmtapi/python-example.conf @@ -0,0 +1,16 @@ +alarm: example_alarm1 + on: example.random + every: 2s + warn: $random1 > (($status >= $WARNING) ? (55) : (75)) + crit: $random1 > (($status == $CRITICAL) ? (75) : (95)) + info: random + to: sysadmin + +alarm: example_alarm2 + on: example.random + every: 2s + warn: $random2 > (($status >= $WARNING) ? (55) : (75)) + crit: $random2 > (($status == $CRITICAL) ? (75) : (95)) + info: random + to: sysadmin + diff --git a/web/api/Makefile.am b/web/api/Makefile.am index 5755612c8b..0f54817590 100644 --- a/web/api/Makefile.am +++ b/web/api/Makefile.am @@ -8,6 +8,7 @@ SUBDIRS = \ queries \ exporters \ formatters \ + health \ $(NULL) dist_noinst_DATA = \ diff --git a/web/server/multi/Makefile.am b/web/api/health/Makefile.am similarity index 87% rename from web/server/multi/Makefile.am rename to web/api/health/Makefile.am index 90cc9ca1eb..19554bed8e 100644 --- a/web/server/multi/Makefile.am +++ b/web/api/health/Makefile.am @@ -3,9 +3,6 @@ AUTOMAKE_OPTIONS = subdir-objects MAINTAINERCLEANFILES = $(srcdir)/Makefile.in -SUBDIRS = \ - $(NULL) - dist_noinst_DATA = \ README.md \ $(NULL) diff --git a/web/api/health/README.md b/web/api/health/README.md index cbe06c7f16..cbc8aaac4b 100644 --- a/web/api/health/README.md +++ b/web/api/health/README.md @@ -1,6 +1,8 @@ # Health API Calls -## Enabled Alarms +## Health Read API + +### Enabled Alarms NetData enables alarms on demand, i.e. when the chart they should be linked to starts collecting data. So, although many more alarms are configured, only the useful ones are enabled. @@ -8,13 +10,13 @@ To get the list of all enabled alarms: `http://your.netdata.ip:19999/api/v1/alarms?all` -## Raised Alarms +### Raised Alarms This API call will return the alarms currently in WARNING or CRITICAL state. `http://your.netdata.ip:19999/api/v1/alarms` -## Event Log +### Event Log The size of the alarm log is configured in `netdata.conf`. There are 2 settings: the rotation of the alarm log file and the in memory size of the alarm log. @@ -28,17 +30,134 @@ The API call retrieves all entries of the alarm log: `http://your.netdata.ip:19999/api/v1/alarm_log` -## Alarm Log Incremental Updates +### Alarm Log Incremental Updates `http://your.netdata.ip:19999/api/v1/alarm_log?after=UNIQUEID` The above returns all the events in the alarm log that occurred after UNIQUEID (you poll it once without `after=`, remember the last UNIQUEID of the returned set, which you give back to get incrementally the next events). -## Alarm badges +### Alarm badges The following will return an SVG badge of the alarm named `NAME`, attached to the chart named `CHART`. `http://your.netdata.ip:19999/api/v1/badge.svg?alarm=NAME&chart=CHART` +## Health Management API + +Netdata v1.12 and beyond provides a command API to control health checks and notifications at runtime. The feature is especially useful for maintenance periods, during which you receive meaningless alarms. + +Specifically, the API allows you to: + - Disable health checks completely. Alarm conditions will not be evaluated at all and no entries will be added to the alarm log. + - Silence alarm notifications. Alarm conditions will be evaluated, the alarms will appear in the log and the netdata UI will show the alarms as active, but no notifications will be sent. + - Disable or Silence specific alarms that match selectors on alarm/template name, chart, context, host and family. + +The API is available by default, but it is protected by an `api authorization token` that is stored in the file you will see in the following entry of `http://localhost:19999/netdata.conf`: + +```bash +[registry] + # netdata management api key file = /var/lib/netdata/netdata.api.key +``` + +You can access the API via GET requests, by adding the bearer token to an `Authorization` http header, like this: + +``` +curl "http://myserver/api/v1/manage/health?cmd=RESET" -H "Authorization: Bearer Mytoken" +``` + +The command `RESET` just returns netdata to the default operation, with all health checks and notifications enabled. +If you've configured and entered your token correclty, you should see the plain text response `All health checks and notifications are enabled`. + +### Disable or silence all alarms + +If all you need is temporarily disable all health checks, then you issue the following before your maintenance period starts: +``` +curl "http://myserver/api/v1/manage/health?cmd=DISABLE ALL" -H "Authorization: Bearer Mytoken" +``` +The effect of disabling health checks is that the alarm criteria are not evaluated at all and nothing is written in the alarm log. +If you want the health checks to be running but to not receive any notifications during your maintenance period, you can instead use this: + +``` +curl "http://myserver/api/v1/manage/health?cmd=SILENCE ALL" -H "Authorization: Bearer Mytoken" +``` + +Alarms may then still be raised and logged in netdata, so you'll be able to see them via the UI. + +Regardless of the option you choose, at the end of your maintenance period you revert to the normal state via the RESET command. + +``` + curl "http://myserver/api/v1/manage/health?cmd=RESET" -H "Authorization: Bearer Mytoken" +``` + +### Disable or silence specific alarms + +If you do not wish to disable/silence all alarms, then the `DISABLE ALL` and `SILENCE ALL` commands can't be used. +Instead, the following commands expect that one or more alarm selectors will be added, so that only alarms that match the selectors are disabled or silenced. +- `DISABLE` : Set the mode to disable health checks. +- `SILENCE` : Set the mode to silence notifications. + +You will normally put one of these commands in the same request with your first alarm selector, but it's possible to issue them separately as well. +You will get a warning in the response, if a selector was added without a SILENCE/DISABLE command, or vice versa. + +Each request can specify a single alarm `selector`, with one or more `selection criteria`. +A single alarm will match a `selector` if all selection criteria match the alarm. +You can add as many selectors as you like. +In essence, the rule is: IF (alarm matches all the criteria in selector1 OR all the criteria in selector2 OR ...) THEN apply the DISABLE or SILENCE command. + +To clear all selectors and reset the mode to default, use the `RESET` command. + +The following example silences notifications for all the alarms with context=load: + +``` +curl "http://myserver/api/v1/manage/health?cmd=SILENCE&context=load" -H "Authorization: Bearer Mytoken" +``` + +#### Selection criteria + +The `selection criteria` are key/value pairs, in the format `key : value`, where value is a netdata [simple pattern](../../../libnetdata/simple_pattern/). This means that you can create very powerful selectors (you will rarely need more than one or two). + +The accepted keys for the `selection criteria` are the following: +- `alarm` : The expression provided will match both `alarm` and `template` names. +- `chart` : Chart ids/names, as shown on the dashboard. These will match the `on` entry of a configured `alarm`. +- `context` : Chart context, as shown on the dashboard. These will match the `on` entry of a configured `template`. +- `hosts` : The hostnames that will need to match. +- `families` : The alarm families. + +You can add any of the selection criteria you need on the request, to ensure that only the alarms you are interested in are matched and disabled/silenced. e.g. there is no reason to add `hosts: *`, if you want the criteria to be applied to alarms for all hosts. + +Example 1: Disable all health checks for context = `random` + +``` +http://localhost/api/v1/manage/health?cmd=DISABLE&context=random +``` + +Example 2: Silence all alarms and templates with name starting with `out_of` on host `myhost` + +``` +http://localhost/api/v1/manage/health?cmd=SILENCE&alarm=out_of*&hosts=myhost +``` + +Example 2.2: Add one more selector, to also silence alarms for cpu1 and cpu2 + +``` +http://localhost/api/v1/manage/health?families=cpu1 cpu2 +``` + +### Responses + +- "Auth Error" : Token authentication failed +- "All alarm notifications are silenced" : Successful response to cmd=SILENCE ALL +- "All health checks are disabled" : Successful response to cmd=DISABLE ALL +- "All health checks and notifications are enabled" : Successful response to cmd=RESET +- "Health checks disabled for alarms matching the selectors" : Added to the response for a cmd=DISABLE +- "Alarm notifications silenced for alarms matching the selectors" : Added to the response for a cmd=SILENCE +- "Alarm selector added" : Added to the response when a new selector is added +- "Invalid key. Ignoring it." : Wrong name of a parameter. Added to the response and ignored. +- "WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command." : Added to the response if a selector is added without a selector-specific command. +- "WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors." : Added to the response if a selector-specific command is issued without a selector. + +### Further reading + +The test script under [tests/health_mgmtapi](../../../tests/health_mgmtapi) contains a series of tests that you can either run or read through to understand the various calls and responses better. + [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fweb%2Fapi%2Fhealth%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/web/api/health/health_cmdapi.c b/web/api/health/health_cmdapi.c new file mode 100644 index 0000000000..ec177751b9 --- /dev/null +++ b/web/api/health/health_cmdapi.c @@ -0,0 +1,166 @@ +// +// Created by christopher on 11/12/18. +// + +#include "health_cmdapi.h" + + +static SILENCER *create_silencer(void) { + SILENCER *t = callocz(1, sizeof(SILENCER)); + debug(D_HEALTH, "HEALTH command API: Created empty silencer"); + + return t; +} + +void free_silencers(SILENCER *t) { + if (!t) return; + if (t->next) free_silencers(t->next); + debug(D_HEALTH, "HEALTH command API: Freeing silencer %s:%s:%s:%s:%s", t->alarms, + t->charts, t->contexts, t->hosts, t->families); + simple_pattern_free(t->alarms_pattern); + simple_pattern_free(t->charts_pattern); + simple_pattern_free(t->contexts_pattern); + simple_pattern_free(t->hosts_pattern); + simple_pattern_free(t->families_pattern); + freez(t->alarms); + freez(t->charts); + freez(t->contexts); + freez(t->hosts); + freez(t->families); + freez(t); + return; +} + + + +int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, char *url) { + int ret = 400; + (void) host; + + + + BUFFER *wb = w->response.data; + buffer_flush(wb); + wb->contenttype = CT_TEXT_PLAIN; + + buffer_flush(w->response.data); + + static uint32_t + hash_alarm = 0, + hash_template = 0, + hash_chart = 0, + hash_context = 0, + hash_host = 0, + hash_families = 0; + + if (unlikely(!hash_alarm)) { + hash_alarm = simple_uhash(HEALTH_ALARM_KEY); + hash_template = simple_uhash(HEALTH_TEMPLATE_KEY); + hash_chart = simple_uhash(HEALTH_CHART_KEY); + hash_context = simple_uhash(HEALTH_CONTEXT_KEY); + hash_host = simple_uhash(HEALTH_HOST_KEY); + hash_families = simple_uhash(HEALTH_FAMILIES_KEY); + } + + SILENCER *silencer = NULL; + + if (!w->auth_bearer_token) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_AUTHERROR); + ret = 403; + } else { + debug(D_HEALTH, "HEALTH command API: Comparing secret '%s' to '%s'", w->auth_bearer_token, api_secret); + if (strcmp(w->auth_bearer_token, api_secret)) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_AUTHERROR); + ret = 403; + } else { + while (url) { + char *value = mystrsep(&url, "&"); + if (!value || !*value) continue; + + char *key = mystrsep(&value, "="); + if (!key || !*key) continue; + if (!value || !*value) continue; + + debug(D_WEB_CLIENT, "%llu: API v1 health query param '%s' with value '%s'", w->id, key, value); + + // name and value are now the parameters + if (!strcmp(key, "cmd")) { + if (!strcmp(value, HEALTH_CMDAPI_CMD_SILENCEALL)) { + silencers->all_alarms = 1; + silencers->stype = STYPE_SILENCE_NOTIFICATIONS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_SILENCEALL); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_DISABLEALL)) { + silencers->all_alarms = 1; + silencers->stype = STYPE_DISABLE_ALARMS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_DISABLEALL); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_SILENCE)) { + silencers->stype = STYPE_SILENCE_NOTIFICATIONS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_SILENCE); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_DISABLE)) { + silencers->stype = STYPE_DISABLE_ALARMS; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_DISABLE); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_RESET)) { + silencers->all_alarms = 0; + silencers->stype = STYPE_NONE; + free_silencers(silencers->silencers); + silencers->silencers = NULL; + buffer_strcat(wb, HEALTH_CMDAPI_MSG_RESET); + } + } else { + uint32_t hash = simple_uhash(key); + if (unlikely(silencer == NULL)) { + if ( + (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) || + (hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) || + (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) || + (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) || + (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) || + (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) + ) { + silencer = create_silencer(); + } + } + + if (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { + silencer->alarms = strdupz(value); + silencer->alarms_pattern = simple_pattern_create(silencer->alarms, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) { + silencer->charts = strdupz(value); + silencer->charts_pattern = simple_pattern_create(silencer->charts, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) { + silencer->contexts = strdupz(value); + silencer->contexts_pattern = simple_pattern_create(silencer->contexts, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { + silencer->hosts = strdupz(value); + silencer->hosts_pattern = simple_pattern_create(silencer->hosts, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) { + silencer->families = strdupz(value); + silencer->families_pattern = simple_pattern_create(silencer->families, NULL, SIMPLE_PATTERN_EXACT); + } else { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_INVALID_KEY); + } + } + + } + if (likely(silencer)) { + // Add the created instance to the linked list in silencers + silencer->next = silencers->silencers; + silencers->silencers = silencer; + debug(D_HEALTH, "HEALTH command API: Added silencer %s:%s:%s:%s:%s", silencer->alarms, + silencer->charts, silencer->contexts, silencer->hosts, silencer->families + ); + buffer_strcat(wb, HEALTH_CMDAPI_MSG_ADDED); + if (silencers->stype == STYPE_NONE) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_STYPEWARNING); + } + } + if (unlikely(silencers->stype != STYPE_NONE && !silencers->all_alarms && !silencers->silencers)) { + buffer_strcat(wb, HEALTH_CMDAPI_MSG_NOSELECTORWARNING); + } + ret = 200; + } + } + w->response.data = wb; + buffer_no_cacheable(w->response.data); + return ret; +} diff --git a/web/api/health/health_cmdapi.h b/web/api/health/health_cmdapi.h new file mode 100644 index 0000000000..d0f30401c9 --- /dev/null +++ b/web/api/health/health_cmdapi.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_WEB_HEALTH_SVG_H +#define NETDATA_WEB_HEALTH_SVG_H 1 + +#include "libnetdata/libnetdata.h" +#include "web/server/web_client.h" +#include "health/health.h" + +#define HEALTH_CMDAPI_CMD_SILENCEALL "SILENCE ALL" +#define HEALTH_CMDAPI_CMD_DISABLEALL "DISABLE ALL" +#define HEALTH_CMDAPI_CMD_SILENCE "SILENCE" +#define HEALTH_CMDAPI_CMD_DISABLE "DISABLE" +#define HEALTH_CMDAPI_CMD_RESET "RESET" + +#define HEALTH_CMDAPI_MSG_AUTHERROR "Auth Error\n" +#define HEALTH_CMDAPI_MSG_SILENCEALL "All alarm notifications are silenced\n" +#define HEALTH_CMDAPI_MSG_DISABLEALL "All health checks are disabled\n" +#define HEALTH_CMDAPI_MSG_RESET "All health checks and notifications are enabled\n" +#define HEALTH_CMDAPI_MSG_DISABLE "Health checks disabled for alarms matching the selectors\n" +#define HEALTH_CMDAPI_MSG_SILENCE "Alarm notifications silenced for alarms matching the selectors\n" +#define HEALTH_CMDAPI_MSG_ADDED "Alarm selector added\n" +#define HEALTH_CMDAPI_MSG_INVALID_KEY "Invalid key. Ignoring it.\n" +#define HEALTH_CMDAPI_MSG_STYPEWARNING "WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command.\n" +#define HEALTH_CMDAPI_MSG_NOSELECTORWARNING "WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors.\n" + +extern int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, char *url); + +#include "web/api/web_api_v1.h" + +#endif /* NETDATA_WEB_HEALTH_SVG_H */ diff --git a/web/api/netdata-swagger.json b/web/api/netdata-swagger.json index d709912c6f..ac84b754d6 100644 --- a/web/api/netdata-swagger.json +++ b/web/api/netdata-swagger.json @@ -545,6 +545,116 @@ } } } + }, + "/alarms": { + "get": { + "summary": "Get a list of active or raised alarms on the server", + "description": "The alarms endpoint returns the list of all raised or enabled alarms on the netdata server. Called without any parameters, the raised alarms in state WARNING or CRITICAL are returned. By passing \"?all\", all the enabled alarms are returned.", + "parameters": [ + { + "name": "all", + "in": "query", + "description": "If passed, all enabled alarms are returned", + "required": false, + "type": "boolean", + "allowEmptyValue": true + } + ], + "responses": { + "200": { + "description": "An object containing general info and a linked list of alarms", + "schema": { + "$ref": "#/definitions/alarms" + } + } + } + } + }, + "/alarm_log": { + "get": { + "summary": "Retrieves the entries of the alarm log", + "description": "Returns an array of alarm_log entries, with historical information on raised and cleared alarms.", + "parameters": [ + { + "name": "after", + "in": "query", + "description": "Passing the parameter after=UNIQUEID returns all the events in the alarm log that occurred after UNIQUEID. An automated series of calls would call the interface once without after=, store the last UNIQUEID of the returned set, and give it back to get incrementally the next events", + "required": false, + "type": "integer" + } + ], + "responses": { + "200": { + "description": "An array of alarm log entries", + "schema": { + "type": "array", + "items": { + "$ref": "#/definitions/alarm_log_entry" + } + } + } + } + } + }, + "/manage/health": { + "get": { + "summary": "Accesses the health management API to control health checks and notifications at runtime.", + "description": "Available from Netdata v1.12 and above, protected via bearer authorization. Especially useful for maintenance periods, the API allows you to disable health checks completely, silence alarm notifications, or Disable/Silence specific alarms that match selectors on alarm/template name, chart, context, host and family. For the simple disable/silence all scenaria, only the cmd parameter is required. The other parameters are used to define alarm selectors. For more information and examples, refer to the netdata documentation.", + "parameters": [ + { + "name": "cmd", + "in": "query", + "description": "DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors.", + "required": false, + "type": "string", + "enum": [ + "DISABLE ALL", + "SILENCE ALL", + "DISABLE", + "SILENCE", + "RESET" + ] + }, + { + "name": "alarm", + "in": "query", + "description": "The expression provided will match both `alarm` and `template` names.", + "type": "string" + }, + { + "name": "chart", + "in": "query", + "description": "Chart ids/names, as shown on the dashboard. These will match the `on` entry of a configured `alarm`", + "type": "string" + }, + { + "name": "context", + "in": "query", + "description": "Chart context, as shown on the dashboard. These will match the `on` entry of a configured `template`.", + "type": "string" + }, + { + "name": "hosts", + "in": "query", + "description": "The hostnames that will need to match.", + "type": "string" + }, + { + "name": "families", + "in": "query", + "description": "The alarm families.", + "type": "string" + } + ], + "responses": { + "200": { + "description": "A plain text response based on the result of the command" + }, + "403": { + "description": "Bearer authentication error." + } + } + } } }, "definitions": { @@ -830,6 +940,291 @@ "description": "The result requested, in the format requested." } } + }, + "alarms": { + "type": "object", + "properties": { + "hostname": { + "type": "string" + }, + "latest_alarm_log_unique_id": { + "type": "integer", + "format": "int32" + }, + "status": { + "type": "boolean" + }, + "now": { + "type": "integer", + "format": "int32" + }, + "alarms": { + "type": "object", + "properties": { + "chart-name.alarm-name": { + "type": "object", + "properties": { + "id": { + "type": "integer", + "format": "int32" + }, + "name": { + "type": "string", + "description": "Full alarm name" + }, + "chart": { + "type": "string" + }, + "family": { + "type": "string" + }, + "active": { + "type": "boolean", + "description": "Will be false only if the alarm is disabled in the configuration" + }, + "disabled": { + "type": "boolean", + "description": "Whether the health check for this alarm has been disabled via a health command API DISABLE command." + }, + "silenced": { + "type": "boolean", + "description": "Whether notifications for this alarm have been silenced via a health command API SILENCE command." + }, + "exec": { + "type": "string" + }, + "recipient": { + "type": "string" + }, + "source": { + "type": "string" + }, + "units": { + "type": "string" + }, + "info": { + "type": "string" + }, + "status": { + "type": "string" + }, + "last_status_change": { + "type": "integer", + "format": "int32" + }, + "last_updated": { + "type": "integer", + "format": "int32" + }, + "next_update": { + "type": "integer", + "format": "int32" + }, + "update_every": { + "type": "integer", + "format": "int32" + }, + "delay_up_duration": { + "type": "integer", + "format": "int32" + }, + "delay_down_duration": { + "type": "integer", + "format": "int32" + }, + "delay_max_duration": { + "type": "integer", + "format": "int32" + }, + "delay_multiplier": { + "type": "integer", + "format": "int32" + }, + "delay": { + "type": "integer", + "format": "int32" + }, + "delay_up_to_timestamp": { + "type": "integer", + "format": "int32" + }, + "value_string": { + "type": "string" + }, + "no_clear_notification": { + "type": "boolean" + }, + "lookup_dimensions": { + "type": "string" + }, + "db_after": { + "type": "integer", + "format": "int32" + }, + "db_before": { + "type": "integer", + "format": "int32" + }, + "lookup_method": { + "type": "string" + }, + "lookup_after": { + "type": "integer", + "format": "int32" + }, + "lookup_before": { + "type": "integer", + "format": "int32" + }, + "lookup_options": { + "type": "string" + }, + "calc": { + "type": "string" + }, + "calc_parsed": { + "type": "string" + }, + "warn": { + "type": "string" + }, + "warn_parsed": { + "type": "string" + }, + "crit": { + "type": "string" + }, + "crit_parsed": { + "type": "string" + }, + "green": { + "type": "string", + "format": "nullable" + }, + "red": { + "type": "string", + "format": "nullable" + }, + "value": { + "type": "number" + } + } + } + } + } + } + }, + "alarm_log_entry": { + "type": "object", + "properties": { + "hostname": { + "type": "string" + }, + "unique_id": { + "type": "integer", + "format": "int32" + }, + "alarm_id": { + "type": "integer", + "format": "int32" + }, + "alarm_event_id": { + "type": "integer", + "format": "int32" + }, + "name": { + "type": "string" + }, + "chart": { + "type": "string" + }, + "family": { + "type": "string" + }, + "processed": { + "type": "boolean" + }, + "updated": { + "type": "boolean" + }, + "exec_run": { + "type": "integer", + "format": "int32" + }, + "exec_failed": { + "type": "boolean" + }, + "exec": { + "type": "string" + }, + "recipient": { + "type": "string" + }, + "exec_code": { + "type": "integer", + "format": "int32" + }, + "source": { + "type": "string" + }, + "units": { + "type": "string" + }, + "when": { + "type": "integer", + "format": "int32" + }, + "duration": { + "type": "integer", + "format": "int32" + }, + "non_clear_duration": { + "type": "integer", + "format": "int32" + }, + "status": { + "type": "string" + }, + "old_status": { + "type": "string" + }, + "delay": { + "type": "integer", + "format": "int32" + }, + "delay_up_to_timestamp": { + "type": "integer", + "format": "int32" + }, + "updated_by_id": { + "type": "integer", + "format": "int32" + }, + "updates_id": { + "type": "integer", + "format": "int32" + }, + "value_string": { + "type": "string" + }, + "old_value_string": { + "type": "string" + }, + "silenced": { + "type": "string" + }, + "info": { + "type": "string" + }, + "value": { + "type": "string", + "format": "nullable" + }, + "old_value": { + "type": "string", + "format": "nullable" + } + } } } } \ No newline at end of file diff --git a/web/api/netdata-swagger.yaml b/web/api/netdata-swagger.yaml index fd84933b29..bf62fb95af 100644 --- a/web/api/netdata-swagger.yaml +++ b/web/api/netdata-swagger.yaml @@ -357,6 +357,75 @@ paths: description: 'All the metrics returned in the format requested' '400': description: 'The format requested is not supported' + /alarms: + get: + summary: 'Get a list of active or raised alarms on the server' + description: 'The alarms endpoint returns the list of all raised or enabled alarms on the netdata server. Called without any parameters, the raised alarms in state WARNING or CRITICAL are returned. By passing "?all", all the enabled alarms are returned.' + parameters: + - name: all + in: query + description: 'If passed, all enabled alarms are returned' + required: false + type: boolean + allowEmptyValue: true + responses: + '200': + description: 'An object containing general info and a linked list of alarms' + schema: + $ref: '#/definitions/alarms' + /alarm_log: + get: + summary: 'Retrieves the entries of the alarm log' + description: 'Returns an array of alarm_log entries, with historical information on raised and cleared alarms.' + parameters: + - name: after + in: query + description: 'Passing the parameter after=UNIQUEID returns all the events in the alarm log that occurred after UNIQUEID. An automated series of calls would call the interface once without after=, store the last UNIQUEID of the returned set, and give it back to get incrementally the next events' + required: false + type: integer + responses: + '200': + description: 'An array of alarm log entries' + schema: + type: array + items: + $ref: '#/definitions/alarm_log_entry' + /manage/health: + get: + summary: 'Accesses the health management API to control health checks and notifications at runtime.' + description: 'Available from Netdata v1.12 and above, protected via bearer authorization. Especially useful for maintenance periods, the API allows you to disable health checks completely, silence alarm notifications, or Disable/Silence specific alarms that match selectors on alarm/template name, chart, context, host and family. For the simple disable/silence all scenaria, only the cmd parameter is required. The other parameters are used to define alarm selectors. For more information and examples, refer to the netdata documentation.' + parameters: + - name: cmd + in: query + description: 'DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors.' + required: false + type: string + enum: ['DISABLE ALL', 'SILENCE ALL', 'DISABLE', 'SILENCE', 'RESET'] + - name: alarm + in: query + description: 'The expression provided will match both `alarm` and `template` names.' + type: string + - name: chart + in: query + description: 'Chart ids/names, as shown on the dashboard. These will match the `on` entry of a configured `alarm`' + type: string + - name: context + in: query + description: 'Chart context, as shown on the dashboard. These will match the `on` entry of a configured `template`.' + type: string + - name: hosts + in: query + description: 'The hostnames that will need to match.' + type: string + - name: families + in: query + description: 'The alarm families.' + type: string + responses: + '200': + description: 'A plain text response based on the result of the command' + '403': + description: 'Bearer authentication error.' definitions: info: type: object @@ -491,7 +560,6 @@ definitions: name: type: string description: 'The name of the dimension' - json_wrap: type: object properties: @@ -559,3 +627,204 @@ definitions: description: 'The format of the result returned.' result: description: 'The result requested, in the format requested.' + alarms: + type: object + properties: + hostname: + type: string + latest_alarm_log_unique_id: + type: integer + format: int32 + status: + type: boolean + now: + type: integer + format: int32 + alarms: + type: object + properties: + chart-name.alarm-name: + type: object + properties: + id: + type: integer + format: int32 + name: + type: string + description: Full alarm name + chart: + type: string + family: + type: string + active: + type: boolean + description: Will be false only if the alarm is disabled in the configuration + disabled: + type: boolean + description: Whether the health check for this alarm has been disabled via a health command API DISABLE command. + silenced: + type: boolean + description: Whether notifications for this alarm have been silenced via a health command API SILENCE command. + exec: + type: string + recipient: + type: string + source: + type: string + units: + type: string + info: + type: string + status: + type: string + last_status_change: + type: integer + format: int32 + last_updated: + type: integer + format: int32 + next_update: + type: integer + format: int32 + update_every: + type: integer + format: int32 + delay_up_duration: + type: integer + format: int32 + delay_down_duration: + type: integer + format: int32 + delay_max_duration: + type: integer + format: int32 + delay_multiplier: + type: integer + format: int32 + delay: + type: integer + format: int32 + delay_up_to_timestamp: + type: integer + format: int32 + value_string: + type: string + no_clear_notification: + type: boolean + lookup_dimensions: + type: string + db_after: + type: integer + format: int32 + db_before: + type: integer + format: int32 + lookup_method: + type: string + lookup_after: + type: integer + format: int32 + lookup_before: + type: integer + format: int32 + lookup_options: + type: string + calc: + type: string + calc_parsed: + type: string + warn: + type: string + warn_parsed: + type: string + crit: + type: string + crit_parsed: + type: string + green: + type: string + format: nullable + red: + type: string + format: nullable + value: + type: number + alarm_log_entry: + type: object + properties: + hostname: + type: string + unique_id: + type: integer + format: int32 + alarm_id: + type: integer + format: int32 + alarm_event_id: + type: integer + format: int32 + name: + type: string + chart: + type: string + family: + type: string + processed: + type: boolean + updated: + type: boolean + exec_run: + type: integer + format: int32 + exec_failed: + type: boolean + exec: + type: string + recipient: + type: string + exec_code: + type: integer + format: int32 + source: + type: string + units: + type: string + when: + type: integer + format: int32 + duration: + type: integer + format: int32 + non_clear_duration: + type: integer + format: int32 + status: + type: string + old_status: + type: string + delay: + type: integer + format: int32 + delay_up_to_timestamp: + type: integer + format: int32 + updated_by_id: + type: integer + format: int32 + updates_id: + type: integer + format: int32 + value_string: + type: string + old_value_string: + type: string + silenced: + type: string + info: + type: string + value: + type: string + format: nullable + old_value: + type: string + format: nullable diff --git a/web/api/web_api_v1.c b/web/api/web_api_v1.c index 338572bf17..991a9ec8b3 100644 --- a/web/api/web_api_v1.c +++ b/web/api/web_api_v1.c @@ -83,6 +83,68 @@ void web_client_api_v1_init(void) { api_v1_data_google_formats[i].hash = simple_hash(api_v1_data_google_formats[i].name); web_client_api_v1_init_grouping(); + + uuid_t uuid; + + // generate + uuid_generate(uuid); + + // unparse (to string) + char uuid_str[37]; + uuid_unparse_lower(uuid, uuid_str); +} + +char *get_mgmt_api_key(void) { + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/netdata.api.key", netdata_configured_varlib_dir); + char *api_key_filename=config_get(CONFIG_SECTION_REGISTRY, "netdata management api key file", filename); + static char guid[GUID_LEN + 1] = ""; + + if(likely(guid[0])) + return guid; + + // read it from disk + int fd = open(api_key_filename, O_RDONLY); + if(fd != -1) { + char buf[GUID_LEN + 1]; + if(read(fd, buf, GUID_LEN) != GUID_LEN) + error("Failed to read management API key from '%s'", api_key_filename); + else { + buf[GUID_LEN] = '\0'; + if(regenerate_guid(buf, guid) == -1) { + error("Failed to validate management API key '%s' from '%s'.", + buf, api_key_filename); + + guid[0] = '\0'; + } + } + close(fd); + } + + // generate a new one? + if(!guid[0]) { + uuid_t uuid; + + uuid_generate_time(uuid); + uuid_unparse_lower(uuid, guid); + guid[GUID_LEN] = '\0'; + + // save it + fd = open(api_key_filename, O_WRONLY|O_CREAT|O_TRUNC, 444); + if(fd == -1) + fatal("Cannot create unique management API key file '%s'. Please fix this.", api_key_filename); + + if(write(fd, guid, GUID_LEN) != GUID_LEN) + fatal("Cannot write the unique management API key file '%s'. Please fix this.", api_key_filename); + + close(fd); + } + + return guid; +} + +void web_client_api_v1_management_init(void) { + api_secret = get_mgmt_api_key(); } inline uint32_t web_client_api_request_v1_data_options(char *o) { @@ -697,7 +759,7 @@ static struct api_command { { "alarm_log", 0, WEB_CLIENT_ACL_DASHBOARD, web_client_api_request_v1_alarm_log }, { "alarm_variables", 0, WEB_CLIENT_ACL_DASHBOARD, web_client_api_request_v1_alarm_variables }, { "allmetrics", 0, WEB_CLIENT_ACL_DASHBOARD, web_client_api_request_v1_allmetrics }, - + { "manage/health", 0, WEB_CLIENT_ACL_MGMT, web_client_api_request_v1_mgmt_health }, // terminator { NULL, 0, WEB_CLIENT_ACL_NONE, NULL }, }; @@ -721,7 +783,7 @@ inline int web_client_api_request_v1(RRDHOST *host, struct web_client *w, char * for(i = 0; api_commands[i].command ;i++) { if(unlikely(hash == api_commands[i].hash && !strcmp(tok, api_commands[i].command))) { - if(unlikely(api_commands[i].acl != WEB_CLIENT_ACL_NOCHECK) && !(w->acl & api_commands[i].acl)) + if(unlikely(api_commands[i].acl != WEB_CLIENT_ACL_NOCHECK) && !(w->acl & api_commands[i].acl)) return web_client_permission_denied(w); return api_commands[i].callback(host, w, url); diff --git a/web/api/web_api_v1.h b/web/api/web_api_v1.h index 179fab7cdc..70b7817804 100644 --- a/web/api/web_api_v1.h +++ b/web/api/web_api_v1.h @@ -6,6 +6,7 @@ #include "daemon/common.h" #include "web/api/badges/web_buffer_svg.h" #include "web/api/formatters/rrd2json.h" +#include "web/api/health/health_cmdapi.h" extern uint32_t web_client_api_request_v1_data_options(char *o); extern uint32_t web_client_api_request_v1_data_format(char *name); @@ -23,5 +24,8 @@ extern int web_client_api_request_v1_info(RRDHOST *host, struct web_client *w, c extern int web_client_api_request_v1(RRDHOST *host, struct web_client *w, char *url); extern void web_client_api_v1_init(void); +extern void web_client_api_v1_management_init(void); + +char *api_secret; #endif //NETDATA_WEB_API_V1_H diff --git a/web/server/Makefile.am b/web/server/Makefile.am index 843c4cc9bf..5860a5cbae 100644 --- a/web/server/Makefile.am +++ b/web/server/Makefile.am @@ -4,8 +4,6 @@ AUTOMAKE_OPTIONS = subdir-objects MAINTAINERCLEANFILES = $(srcdir)/Makefile.in SUBDIRS = \ - single \ - multi \ static \ $(NULL) diff --git a/web/server/README.md b/web/server/README.md index c4c250d3ff..7d74c181e9 100644 --- a/web/server/README.md +++ b/web/server/README.md @@ -1,34 +1,21 @@ # Web server -Netdata supports 3 implementations of its internal web server: - -- `static-threaded` is a web server with a fix (configured number of threads) -- `single-threaded` is a simple web server running with a single thread -- `multi-threaded` is a web server that spawns a thread for each client connection -- `none` to disable the web server - -We suggest to use the `static-threaded` one. It is the most efficient. - -All versions of the web servers use non-blocking I/O. - -All web servers respect the `keep-alive` HTTP header to serve multiple HTTP requests via the same connection. +The Netdata web server runs as `static-threaded`, i.e. with a fixed, configurable number of threads. +It uses non-blocking I/O and respects the `keep-alive` HTTP header to serve multiple HTTP requests via the same connection. ## Configuration -### Selecting the web server - -You can select the web server implementation by editing `netdata.conf` and setting: +You can disable the web server by editing `netdata.conf` and setting: ``` [web] - mode = none | single-threaded | multi-threaded | static-threaded + mode = none ``` -The `static` web server supports also these settings: +With the web server enabled, you can control the number of threads and sockets with the following settings: ``` [web] - mode = static-threaded web server threads = 4 web server max sockets = 512 ``` @@ -39,28 +26,37 @@ The `web server max sockets` setting is automatically adjusted to 50% of the max ### Binding netdata to multiple ports -Netdata can bind to multiple IPs and ports. Up to 100 sockets can be used (you can increase it at compile time with `CFLAGS="-DMAX_LISTEN_FDS=200" ./netdata-installer.sh ...`). +Netdata can bind to multiple IPs and ports, offering access to different services on each. Up to 100 sockets can be used (you can increase it at compile time with `CFLAGS="-DMAX_LISTEN_FDS=200" ./netdata-installer.sh ...`). The ports to bind are controlled via `[web].bind to`, like this: ``` [web] default port = 19999 - bind to = 127.0.0.1 10.1.1.1:19998 hostname:19997 [::]:19996 localhost:19995 *:http unix:/tmp/netdata.sock + bind to = 127.0.0.1=dashboard 10.1.1.1:19998=management|netdata.conf hostname:19997=badges [::]:19996=streaming localhost:19995=registry *:http=dashboard unix:/tmp/netdata.sock ``` Using the above, netdata will bind to: -- IPv4 127.0.0.1 at port 19999 (port was used from `default port`) -- IPv4 10.1.1.1 at port 19998 -- All the IPs `hostname` resolves to (both IPv4 and IPv6 depending on the resolved IPs) at port 19997 -- All IPv6 IPs at port 19996 -- All the IPs `localhost` resolves to (both IPv4 and IPv6 depending the resolved IPs) at port 19996 -- All IPv4 and IPv6 IPs at port `http` as set in `/etc/services` -- Unix domain socket `/tmp/netdata.sock` +- IPv4 127.0.0.1 at port 19999 (port was used from `default port`). Only the UI (dashboard) and the read API will be accessible on this port. +- IPv4 10.1.1.1 at port 19998. The management API and netdata.conf will be accessible on this port. +- All the IPs `hostname` resolves to (both IPv4 and IPv6 depending on the resolved IPs) at port 19997. Only badges will be accessible on this port. +- All IPv6 IPs at port 19996. Only metric streaming requests from other netdata agents will be accepted on this port. +- All the IPs `localhost` resolves to (both IPv4 and IPv6 depending the resolved IPs) at port 19996. This port will only accept registry API requests. +- All IPv4 and IPv6 IPs at port `http` as set in `/etc/services`. Only the UI (dashboard) and the read API will be accessible on this port. +- Unix domain socket `/tmp/netdata.sock`. All requests are serviceable on this socket. The option `[web].default port` is used when an entries in `[web].bind to` do not specify a port. +Note that the access permissions specified with the `=request type|request type|...` format are available from version 1.12 onwards. +As shown in the example above, these permissions are optional, with the default being to permit all request types on the specified port. +The request types are strings identical to the `allow X from` directives of the access lists, i.e. `dashboard`, `streaming`, `registry`, `netdata.conf`, `badges` and `management`. +The access lists themselves and the general setting `allow connections from` in the next section are applied regardless of the ports that are configured to provide these services. +The API requests are serviced as follows: +- `dashboard` gives access to the UI, the read API and badges API calls. +- `badges` gives access only to the badges API calls. +- `management` gives access only to the management API calls. + ### Access lists Netdata supports access lists in `netdata.conf`: @@ -72,6 +68,7 @@ Netdata supports access lists in `netdata.conf`: allow badges from = * allow streaming from = * allow netdata.conf from = localhost fd* 10.* 192.168.* 172.16.* 172.17.* 172.18.* 172.19.* 172.20.* 172.21.* 172.22.* 172.23.* 172.24.* 172.25.* 172.26.* 172.27.* 172.28.* 172.29.* 172.30.* 172.31.* + allow management from = localhost ``` `*` does string matches on the IPs of the clients. @@ -92,6 +89,8 @@ Netdata supports access lists in `netdata.conf`: - `allow netdata.conf from` checks the IP to allow `http://netdata.host:19999/netdata.conf`. The IPs listed are all the private IPv4 addresses, including link local IPv6 addresses. Keep in mind that connections to netdata API ports are filtered by `allow connections from`. So, IPs allowed by `allow netdata.conf from` should also be allowed by `allow connections from`. +- `allow management from` checks the IPs to allow API management calls. Management via the API is currently supported for [health](../api/health/#health-management-api) + ### Other netdata.conf [web] section options setting | default | info :------:|:-------:|:---- diff --git a/web/server/multi/README.md b/web/server/multi/README.md deleted file mode 100644 index d37f583242..0000000000 --- a/web/server/multi/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# `multi-threaded` web server - -The `multi-threaded` web server spawns a thread for each connection it receives. - -Each thread uses non-blocking I/O so it can serve any number of web requests in parallel, -though this is not supported by HTTP, so in practice each thread serves all the requests sequentially. - -Each thread respects the `keep-alive` HTTP header to serve multiple HTTP requests via the same connection. -[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fweb%2Fserver%2Fmulti%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/web/server/multi/multi-threaded.c b/web/server/multi/multi-threaded.c deleted file mode 100644 index 37bdd38ad2..0000000000 --- a/web/server/multi/multi-threaded.c +++ /dev/null @@ -1,314 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#define WEB_SERVER_INTERNALS 1 -#include "multi-threaded.h" - -// -------------------------------------------------------------------------------------- -// the thread of a single client - for the MULTI-THREADED web server - -// 1. waits for input and output, using async I/O -// 2. it processes HTTP requests -// 3. it generates HTTP responses -// 4. it copies data from input to output if mode is FILECOPY - -int web_client_timeout = DEFAULT_DISCONNECT_IDLE_WEB_CLIENTS_AFTER_SECONDS; -int web_client_first_request_timeout = DEFAULT_TIMEOUT_TO_RECEIVE_FIRST_WEB_REQUEST; -long web_client_streaming_rate_t = 0L; - -static void multi_threaded_web_client_worker_main_cleanup(void *ptr) { - struct web_client *w = ptr; - WEB_CLIENT_IS_DEAD(w); - w->running = 0; -} - -static void *multi_threaded_web_client_worker_main(void *ptr) { - netdata_thread_cleanup_push(multi_threaded_web_client_worker_main_cleanup, ptr); - - struct web_client *w = ptr; - w->running = 1; - - struct pollfd fds[2], *ifd, *ofd; - int retval, timeout_ms; - nfds_t fdmax = 0; - - while(!netdata_exit) { - if(unlikely(web_client_check_dead(w))) { - debug(D_WEB_CLIENT, "%llu: client is dead.", w->id); - break; - } - else if(unlikely(!web_client_has_wait_receive(w) && !web_client_has_wait_send(w))) { - debug(D_WEB_CLIENT, "%llu: client is not set for neither receiving nor sending data.", w->id); - break; - } - - if(unlikely(w->ifd < 0 || w->ofd < 0)) { - error("%llu: invalid file descriptor, ifd = %d, ofd = %d (required 0 <= fd", w->id, w->ifd, w->ofd); - break; - } - - if(w->ifd == w->ofd) { - fds[0].fd = w->ifd; - fds[0].events = 0; - fds[0].revents = 0; - - if(web_client_has_wait_receive(w)) fds[0].events |= POLLIN; - if(web_client_has_wait_send(w)) fds[0].events |= POLLOUT; - - fds[1].fd = -1; - fds[1].events = 0; - fds[1].revents = 0; - - ifd = ofd = &fds[0]; - - fdmax = 1; - } - else { - fds[0].fd = w->ifd; - fds[0].events = 0; - fds[0].revents = 0; - if(web_client_has_wait_receive(w)) fds[0].events |= POLLIN; - ifd = &fds[0]; - - fds[1].fd = w->ofd; - fds[1].events = 0; - fds[1].revents = 0; - if(web_client_has_wait_send(w)) fds[1].events |= POLLOUT; - ofd = &fds[1]; - - fdmax = 2; - } - - debug(D_WEB_CLIENT, "%llu: Waiting socket async I/O for %s %s", w->id, web_client_has_wait_receive(w)?"INPUT":"", web_client_has_wait_send(w)?"OUTPUT":""); - errno = 0; - timeout_ms = web_client_timeout * 1000; - retval = poll(fds, fdmax, timeout_ms); - - if(unlikely(netdata_exit)) break; - - if(unlikely(retval == -1)) { - if(errno == EAGAIN || errno == EINTR) { - debug(D_WEB_CLIENT, "%llu: EAGAIN received.", w->id); - continue; - } - - debug(D_WEB_CLIENT, "%llu: LISTENER: poll() failed (input fd = %d, output fd = %d). Closing client.", w->id, w->ifd, w->ofd); - break; - } - else if(unlikely(!retval)) { - debug(D_WEB_CLIENT, "%llu: Timeout while waiting socket async I/O for %s %s", w->id, web_client_has_wait_receive(w)?"INPUT":"", web_client_has_wait_send(w)?"OUTPUT":""); - break; - } - - if(unlikely(netdata_exit)) break; - - int used = 0; - if(web_client_has_wait_send(w) && ofd->revents & POLLOUT) { - used++; - if(web_client_send(w) < 0) { - debug(D_WEB_CLIENT, "%llu: Cannot send data to client. Closing client.", w->id); - break; - } - } - - if(unlikely(netdata_exit)) break; - - if(web_client_has_wait_receive(w) && (ifd->revents & POLLIN || ifd->revents & POLLPRI)) { - used++; - if(web_client_receive(w) < 0) { - debug(D_WEB_CLIENT, "%llu: Cannot receive data from client. Closing client.", w->id); - break; - } - - if(w->mode == WEB_CLIENT_MODE_NORMAL) { - debug(D_WEB_CLIENT, "%llu: Attempting to process received data.", w->id); - web_client_process_request(w); - - // if the sockets are closed, may have transferred this client - // to plugins.d - if(unlikely(w->mode == WEB_CLIENT_MODE_STREAM)) - break; - } - } - - if(unlikely(!used)) { - debug(D_WEB_CLIENT_ACCESS, "%llu: Received error on socket.", w->id); - break; - } - } - - if(w->mode != WEB_CLIENT_MODE_STREAM) - web_server_log_connection(w, "DISCONNECTED"); - - web_client_request_done(w); - - debug(D_WEB_CLIENT, "%llu: done...", w->id); - - // close the sockets/files now - // to free file descriptors - if(w->ifd == w->ofd) { - if(w->ifd != -1) close(w->ifd); - } - else { - if(w->ifd != -1) close(w->ifd); - if(w->ofd != -1) close(w->ofd); - } - w->ifd = -1; - w->ofd = -1; - - netdata_thread_cleanup_pop(1); - return NULL; -} - -// -------------------------------------------------------------------------------------- -// the main socket listener - MULTI-THREADED - -// 1. it accepts new incoming requests on our port -// 2. creates a new web_client for each connection received -// 3. spawns a new netdata_thread to serve the client (this is optimal for keep-alive clients) -// 4. cleans up old web_clients that their netdata_threads have been exited - -static void web_client_multi_threaded_web_server_release_clients(void) { - struct web_client *w; - for(w = web_clients_cache.used; w ; ) { - if(unlikely(!w->running && web_client_check_dead(w))) { - struct web_client *t = w->next; - web_client_release(w); - w = t; - } - else - w = w->next; - } -} - -static void web_client_multi_threaded_web_server_stop_all_threads(void) { - struct web_client *w; - - int found = 1; - usec_t max = 2 * USEC_PER_SEC, step = 50000; - for(w = web_clients_cache.used; w ; w = w->next) { - if(w->running) { - found++; - info("stopping web client %s, id %llu", w->client_ip, w->id); - netdata_thread_cancel(w->thread); - } - } - - while(found && max > 0) { - max -= step; - info("Waiting %d web threads to finish...", found); - sleep_usec(step); - found = 0; - for(w = web_clients_cache.used; w ; w = w->next) - if(w->running) found++; - } - - if(found) - error("%d web threads are taking too long to finish. Giving up.", found); -} - -static struct pollfd *socket_listen_main_multi_threaded_fds = NULL; - -static void socket_listen_main_multi_threaded_cleanup(void *data) { - struct netdata_static_thread *static_thread = (struct netdata_static_thread *)data; - static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; - - info("cleaning up..."); - - info("releasing allocated memory..."); - freez(socket_listen_main_multi_threaded_fds); - - info("closing all sockets..."); - listen_sockets_close(&api_sockets); - - info("stopping all running web server threads..."); - web_client_multi_threaded_web_server_stop_all_threads(); - - info("freeing web clients cache..."); - web_client_cache_destroy(); - - info("cleanup completed."); - static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; -} - -#define CLEANUP_EVERY_EVENTS 60 -void *socket_listen_main_multi_threaded(void *ptr) { - netdata_thread_cleanup_push(socket_listen_main_multi_threaded_cleanup, ptr); - - web_server_mode = WEB_SERVER_MODE_MULTI_THREADED; - web_server_is_multithreaded = 1; - - struct web_client *w; - int retval, counter = 0; - - if(!api_sockets.opened) - fatal("LISTENER: No sockets to listen to."); - - socket_listen_main_multi_threaded_fds = callocz(sizeof(struct pollfd), api_sockets.opened); - - size_t i; - for(i = 0; i < api_sockets.opened ;i++) { - socket_listen_main_multi_threaded_fds[i].fd = api_sockets.fds[i]; - socket_listen_main_multi_threaded_fds[i].events = POLLIN; - socket_listen_main_multi_threaded_fds[i].revents = 0; - - info("Listening on '%s'", (api_sockets.fds_names[i])?api_sockets.fds_names[i]:"UNKNOWN"); - } - - int timeout_ms = 1 * 1000; - - while(!netdata_exit) { - - // debug(D_WEB_CLIENT, "LISTENER: Waiting..."); - retval = poll(socket_listen_main_multi_threaded_fds, api_sockets.opened, timeout_ms); - - if(unlikely(retval == -1)) { - error("LISTENER: poll() failed."); - continue; - } - else if(unlikely(!retval)) { - debug(D_WEB_CLIENT, "LISTENER: poll() timeout."); - counter++; - continue; - } - - for(i = 0 ; i < api_sockets.opened ; i++) { - short int revents = socket_listen_main_multi_threaded_fds[i].revents; - - // check for new incoming connections - if(revents & POLLIN || revents & POLLPRI) { - socket_listen_main_multi_threaded_fds[i].revents = 0; - - w = web_client_create_on_listenfd(socket_listen_main_multi_threaded_fds[i].fd); - if(unlikely(!w)) { - // no need for error log - web_client_create_on_listenfd already logged the error - continue; - } - - if(api_sockets.fds_families[i] == AF_UNIX) - web_client_set_unix(w); - else - web_client_set_tcp(w); - - char tag[NETDATA_THREAD_TAG_MAX + 1]; - snprintfz(tag, NETDATA_THREAD_TAG_MAX, "WEB_CLIENT[%llu,[%s]:%s]", w->id, w->client_ip, w->client_port); - - w->running = 1; - if(netdata_thread_create(&w->thread, tag, NETDATA_THREAD_OPTION_DONT_LOG, multi_threaded_web_client_worker_main, w) != 0) { - w->running = 0; - web_client_release(w); - } - } - } - - counter++; - if(counter > CLEANUP_EVERY_EVENTS) { - counter = 0; - web_client_multi_threaded_web_server_release_clients(); - } - } - - netdata_thread_cleanup_pop(1); - return NULL; -} - - diff --git a/web/server/multi/multi-threaded.h b/web/server/multi/multi-threaded.h deleted file mode 100644 index d7ebf3c54d..0000000000 --- a/web/server/multi/multi-threaded.h +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_WEB_SERVER_MULTI_THREADED_H -#define NETDATA_WEB_SERVER_MULTI_THREADED_H - -#include "web/server/web_server.h" - -extern void *socket_listen_main_multi_threaded(void *ptr); - -#endif //NETDATA_WEB_SERVER_MULTI_THREADED_H diff --git a/web/server/single/Makefile.am b/web/server/single/Makefile.am deleted file mode 100644 index 90cc9ca1eb..0000000000 --- a/web/server/single/Makefile.am +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later - -AUTOMAKE_OPTIONS = subdir-objects -MAINTAINERCLEANFILES = $(srcdir)/Makefile.in - -SUBDIRS = \ - $(NULL) - -dist_noinst_DATA = \ - README.md \ - $(NULL) diff --git a/web/server/single/README.md b/web/server/single/README.md deleted file mode 100644 index 5e125b2386..0000000000 --- a/web/server/single/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# `single-threaded` web server - -The `single-threaded` web server runs as a single thread inside netdata. -It uses non-blocking I/O so it can serve any number of web requests in parallel. - -This web server respects the `keep-alive` HTTP header to serve multiple HTTP requests via the same connection. -[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fweb%2Fserver%2Fsingle%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/web/server/single/single-threaded.c b/web/server/single/single-threaded.c deleted file mode 100644 index 7e89ee683b..0000000000 --- a/web/server/single/single-threaded.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#define WEB_SERVER_INTERNALS 1 -#include "single-threaded.h" - -// -------------------------------------------------------------------------------------- -// the main socket listener - SINGLE-THREADED - -struct web_client *single_threaded_clients[FD_SETSIZE]; - -static inline int single_threaded_link_client(struct web_client *w, fd_set *ifds, fd_set *ofds, fd_set *efds, int *max) { - if(unlikely(web_client_check_dead(w) || (!web_client_has_wait_receive(w) && !web_client_has_wait_send(w)))) { - return 1; - } - - if(unlikely(w->ifd < 0 || w->ifd >= (int)FD_SETSIZE || w->ofd < 0 || w->ofd >= (int)FD_SETSIZE)) { - error("%llu: invalid file descriptor, ifd = %d, ofd = %d (required 0 <= fd < FD_SETSIZE (%d)", w->id, w->ifd, w->ofd, (int)FD_SETSIZE); - return 1; - } - - FD_SET(w->ifd, efds); - if(unlikely(*max < w->ifd)) *max = w->ifd; - - if(unlikely(w->ifd != w->ofd)) { - if(*max < w->ofd) *max = w->ofd; - FD_SET(w->ofd, efds); - } - - if(web_client_has_wait_receive(w)) FD_SET(w->ifd, ifds); - if(web_client_has_wait_send(w)) FD_SET(w->ofd, ofds); - - single_threaded_clients[w->ifd] = w; - single_threaded_clients[w->ofd] = w; - - return 0; -} - -static inline int single_threaded_unlink_client(struct web_client *w, fd_set *ifds, fd_set *ofds, fd_set *efds) { - FD_CLR(w->ifd, efds); - if(unlikely(w->ifd != w->ofd)) FD_CLR(w->ofd, efds); - - if(web_client_has_wait_receive(w)) FD_CLR(w->ifd, ifds); - if(web_client_has_wait_send(w)) FD_CLR(w->ofd, ofds); - - single_threaded_clients[w->ifd] = NULL; - single_threaded_clients[w->ofd] = NULL; - - if(unlikely(web_client_check_dead(w) || (!web_client_has_wait_receive(w) && !web_client_has_wait_send(w)))) { - return 1; - } - - return 0; -} - -static void socket_listen_main_single_threaded_cleanup(void *data) { - struct netdata_static_thread *static_thread = (struct netdata_static_thread *)data; - static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; - - info("closing all sockets..."); - listen_sockets_close(&api_sockets); - - info("freeing web clients cache..."); - web_client_cache_destroy(); - - info("cleanup completed."); - static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; -} - -void *socket_listen_main_single_threaded(void *ptr) { - netdata_thread_cleanup_push(socket_listen_main_single_threaded_cleanup, ptr); - web_server_mode = WEB_SERVER_MODE_SINGLE_THREADED; - web_server_is_multithreaded = 0; - - struct web_client *w; - - if(!api_sockets.opened) - fatal("LISTENER: no listen sockets available."); - - size_t i; - for(i = 0; i < (size_t)FD_SETSIZE ; i++) - single_threaded_clients[i] = NULL; - - fd_set ifds, ofds, efds, rifds, rofds, refds; - FD_ZERO (&ifds); - FD_ZERO (&ofds); - FD_ZERO (&efds); - int fdmax = 0; - - for(i = 0; i < api_sockets.opened ; i++) { - if (api_sockets.fds[i] < 0 || api_sockets.fds[i] >= (int)FD_SETSIZE) - fatal("LISTENER: Listen socket %d is not ready, or invalid.", api_sockets.fds[i]); - - info("Listening on '%s'", (api_sockets.fds_names[i])?api_sockets.fds_names[i]:"UNKNOWN"); - - FD_SET(api_sockets.fds[i], &ifds); - FD_SET(api_sockets.fds[i], &efds); - if(fdmax < api_sockets.fds[i]) - fdmax = api_sockets.fds[i]; - } - - while(!netdata_exit) { - debug(D_WEB_CLIENT_ACCESS, "LISTENER: single threaded web server waiting (fdmax = %d)...", fdmax); - - struct timeval tv = { .tv_sec = 1, .tv_usec = 0 }; - rifds = ifds; - rofds = ofds; - refds = efds; - int retval = select(fdmax+1, &rifds, &rofds, &refds, &tv); - - if(unlikely(retval == -1)) { - error("LISTENER: select() failed."); - continue; - } - else if(likely(retval)) { - debug(D_WEB_CLIENT_ACCESS, "LISTENER: got something."); - - for(i = 0; i < api_sockets.opened ; i++) { - if (FD_ISSET(api_sockets.fds[i], &rifds)) { - debug(D_WEB_CLIENT_ACCESS, "LISTENER: new connection."); - w = web_client_create_on_listenfd(api_sockets.fds[i]); - if(unlikely(!w)) - continue; - - if(api_sockets.fds_families[i] == AF_UNIX) - web_client_set_unix(w); - else - web_client_set_tcp(w); - - if (single_threaded_link_client(w, &ifds, &ofds, &ifds, &fdmax) != 0) { - web_client_release(w); - } - } - } - - for(i = 0 ; i <= (size_t)fdmax ; i++) { - if(likely(!FD_ISSET(i, &rifds) && !FD_ISSET(i, &rofds) && !FD_ISSET(i, &refds))) - continue; - - w = single_threaded_clients[i]; - if(unlikely(!w)) { - // error("no client on slot %zu", i); - continue; - } - - if(unlikely(single_threaded_unlink_client(w, &ifds, &ofds, &efds) != 0)) { - // error("failed to unlink client %zu", i); - web_client_release(w); - continue; - } - - if (unlikely(FD_ISSET(w->ifd, &refds) || FD_ISSET(w->ofd, &refds))) { - // error("no input on client %zu", i); - web_client_release(w); - continue; - } - - if (unlikely(web_client_has_wait_receive(w) && FD_ISSET(w->ifd, &rifds))) { - if (unlikely(web_client_receive(w) < 0)) { - // error("cannot read from client %zu", i); - web_client_release(w); - continue; - } - - if (w->mode != WEB_CLIENT_MODE_FILECOPY) { - debug(D_WEB_CLIENT, "%llu: Processing received data.", w->id); - web_client_process_request(w); - } - } - - if (unlikely(web_client_has_wait_send(w) && FD_ISSET(w->ofd, &rofds))) { - if (unlikely(web_client_send(w) < 0)) { - // error("cannot send data to client %zu", i); - debug(D_WEB_CLIENT, "%llu: Cannot send data to client. Closing client.", w->id); - web_client_release(w); - continue; - } - } - - if(unlikely(single_threaded_link_client(w, &ifds, &ofds, &efds, &fdmax) != 0)) { - // error("failed to link client %zu", i); - web_client_release(w); - } - } - } - else { - debug(D_WEB_CLIENT_ACCESS, "LISTENER: single threaded web server timeout."); - } - } - - netdata_thread_cleanup_pop(1); - return NULL; -} - - diff --git a/web/server/single/single-threaded.h b/web/server/single/single-threaded.h deleted file mode 100644 index fab4ceba1d..0000000000 --- a/web/server/single/single-threaded.h +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_WEB_SERVER_SINGLE_THREADED_H -#define NETDATA_WEB_SERVER_SINGLE_THREADED_H - -#include "web/server/web_server.h" - -extern void *socket_listen_main_single_threaded(void *ptr); - -#endif //NETDATA_WEB_SERVER_SINGLE_THREADED_H diff --git a/web/server/static/static-threaded.c b/web/server/static/static-threaded.c index 56b8dbf8dd..56e726ba17 100644 --- a/web/server/static/static-threaded.c +++ b/web/server/static/static-threaded.c @@ -3,10 +3,14 @@ #define WEB_SERVER_INTERNALS 1 #include "static-threaded.h" +int web_client_timeout = DEFAULT_DISCONNECT_IDLE_WEB_CLIENTS_AFTER_SECONDS; +int web_client_first_request_timeout = DEFAULT_TIMEOUT_TO_RECEIVE_FIRST_WEB_REQUEST; +long web_client_streaming_rate_t = 0L; + // ---------------------------------------------------------------------------- // high level web clients connection management -static struct web_client *web_client_create_on_fd(int fd, const char *client_ip, const char *client_port) { +static struct web_client *web_client_create_on_fd(int fd, const char *client_ip, const char *client_port, int port_acl) { struct web_client *w; w = web_client_get_from_cache_or_allocate(); @@ -17,6 +21,7 @@ static struct web_client *web_client_create_on_fd(int fd, const char *client_ip, if(unlikely(!*w->client_ip)) strcpy(w->client_ip, "-"); if(unlikely(!*w->client_port)) strcpy(w->client_port, "-"); + w->port_acl = port_acl; web_client_initialize_connection(w); return(w); @@ -44,6 +49,7 @@ struct web_server_static_threaded_worker { }; static long long static_threaded_workers_count = 1; + static struct web_server_static_threaded_worker *static_workers_private_data = NULL; static __thread struct web_server_static_threaded_worker *worker_private = NULL; @@ -143,7 +149,7 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data *events = POLLIN; debug(D_WEB_CLIENT_ACCESS, "LISTENER on %d: new connection.", pi->fd); - struct web_client *w = web_client_create_on_fd(pi->fd, pi->client_ip, pi->client_port); + struct web_client *w = web_client_create_on_fd(pi->fd, pi->client_ip, pi->client_port, pi->port_acl); w->pollinfo_slot = pi->slot; if(unlikely(pi->socktype == AF_UNIX)) @@ -200,6 +206,7 @@ static int web_server_rcv_callback(POLLINFO *pi, short int *events) { POLLINFO *fpi = poll_add_fd( pi->p , w->ifd + , pi->port_acl , 0 , POLLINFO_FLAG_CLIENT_SOCKET , "FILENAME" @@ -394,7 +401,13 @@ void *socket_listen_main_static_threaded(void *ptr) { // so, if the machine has more CPUs, avoid using resources unnecessarily int def_thread_count = (processors > 6)?6:processors; + if (!strcmp(config_get(CONFIG_SECTION_WEB, "mode", ""),"single-threaded")) { + info("Running web server with one thread, because mode is single-threaded"); + config_set(CONFIG_SECTION_WEB, "mode", "static-threaded"); + def_thread_count = 1; + } static_threaded_workers_count = config_get_number(CONFIG_SECTION_WEB, "web server threads", def_thread_count); + if(static_threaded_workers_count < 1) static_threaded_workers_count = 1; size_t max_sockets = (size_t)config_get_number(CONFIG_SECTION_WEB, "web server max sockets", (long long int)(rlimit_nofile.rlim_cur / 2)); diff --git a/web/server/web_client.c b/web/server/web_client.c index dcc2609e71..5ae764b1d5 100644 --- a/web/server/web_client.c +++ b/web/server/web_client.c @@ -157,6 +157,10 @@ void web_client_request_done(struct web_client *w) { w->origin[1] = '\0'; freez(w->user_agent); w->user_agent = NULL; + if (w->auth_bearer_token) { + freez(w->auth_bearer_token); + w->auth_bearer_token = NULL; + } w->mode = WEB_CLIENT_MODE_NORMAL; @@ -577,6 +581,13 @@ static inline int check_host_and_dashboard_acl_and_call(RRDHOST *host, struct we return check_host_and_call(host, w, url, func); } +static inline int check_host_and_mgmt_acl_and_call(RRDHOST *host, struct web_client *w, char *url, int (*func)(RRDHOST *, struct web_client *, char *)) { + if(!web_client_can_access_mgmt(w)) + return web_client_permission_denied(w); + + return check_host_and_call(host, w, url, func); +} + int web_client_api_request(RRDHOST *host, struct web_client *w, char *url) { // get the api version @@ -713,7 +724,7 @@ const char *web_response_code_to_string(int code) { } static inline char *http_header_parse(struct web_client *w, char *s, int parse_useragent) { - static uint32_t hash_origin = 0, hash_connection = 0, hash_accept_encoding = 0, hash_donottrack = 0, hash_useragent = 0; + static uint32_t hash_origin = 0, hash_connection = 0, hash_accept_encoding = 0, hash_donottrack = 0, hash_useragent = 0, hash_authorization = 0; if(unlikely(!hash_origin)) { hash_origin = simple_uhash("Origin"); @@ -721,6 +732,7 @@ static inline char *http_header_parse(struct web_client *w, char *s, int parse_u hash_accept_encoding = simple_uhash("Accept-Encoding"); hash_donottrack = simple_uhash("DNT"); hash_useragent = simple_uhash("User-Agent"); + hash_authorization = simple_uhash("Authorization"); } char *e = s; @@ -765,6 +777,15 @@ static inline char *http_header_parse(struct web_client *w, char *s, int parse_u } else if(parse_useragent && hash == hash_useragent && !strcasecmp(s, "User-Agent")) { w->user_agent = strdupz(v); + } else if(hash == hash_authorization&& !strcasecmp(s, "Authorization")) { + if (strlen(v) > 8) { // Must contain at least "Bearer " + char *auth_key=v+6; + *auth_key='\0'; + if (!strcasecmp(v,"Bearer")) { + auth_key++; + w->auth_bearer_token=strdupz(auth_key); + } + } } #ifdef NETDATA_WITH_ZLIB else if(hash == hash_accept_encoding && !strcasecmp(s, "Accept-Encoding")) { @@ -1239,9 +1260,15 @@ void web_client_process_request(struct web_client *w) { return; case WEB_CLIENT_MODE_OPTIONS: - if(unlikely(!web_client_can_access_dashboard(w) && !web_client_can_access_registry(w) && !web_client_can_access_badges(w))) { + if(unlikely( + !web_client_can_access_dashboard(w) && + !web_client_can_access_registry(w) && + !web_client_can_access_badges(w) && + !web_client_can_access_mgmt(w) && + !web_client_can_access_netdataconf(w) + )) { web_client_permission_denied(w); - return; + break; } w->response.data->contenttype = CT_TEXT_PLAIN; @@ -1252,9 +1279,15 @@ void web_client_process_request(struct web_client *w) { case WEB_CLIENT_MODE_FILECOPY: case WEB_CLIENT_MODE_NORMAL: - if(unlikely(!web_client_can_access_dashboard(w) && !web_client_can_access_registry(w) && !web_client_can_access_badges(w))) { + if(unlikely( + !web_client_can_access_dashboard(w) && + !web_client_can_access_registry(w) && + !web_client_can_access_badges(w) && + !web_client_can_access_mgmt(w) && + !web_client_can_access_netdataconf(w) + )) { web_client_permission_denied(w); - return; + break; } w->response.code = web_client_process_url(localhost, w, w->decoded_url); diff --git a/web/server/web_client.h b/web/server/web_client.h index b9e528fca8..4263e252a3 100644 --- a/web/server/web_client.h +++ b/web/server/web_client.h @@ -108,31 +108,14 @@ struct response { }; -typedef enum web_client_acl { - WEB_CLIENT_ACL_NONE = 0, - WEB_CLIENT_ACL_NOCHECK = 0, - WEB_CLIENT_ACL_DASHBOARD = 1 << 0, - WEB_CLIENT_ACL_REGISTRY = 1 << 1, - WEB_CLIENT_ACL_BADGE = 1 << 2 -} WEB_CLIENT_ACL; - -#define web_client_can_access_dashboard(w) ((w)->acl & WEB_CLIENT_ACL_DASHBOARD) -#define web_client_can_access_registry(w) ((w)->acl & WEB_CLIENT_ACL_REGISTRY) -#define web_client_can_access_badges(w) ((w)->acl & WEB_CLIENT_ACL_BADGE) - -#define web_client_can_access_stream(w) \ - (!web_allow_streaming_from || simple_pattern_matches(web_allow_streaming_from, (w)->client_ip)) - -#define web_client_can_access_netdataconf(w) \ - (!web_allow_netdataconf_from || simple_pattern_matches(web_allow_netdataconf_from, (w)->client_ip)) - struct web_client { unsigned long long id; WEB_CLIENT_FLAGS flags; // status flags for the client WEB_CLIENT_MODE mode; // the operational mode of the client WEB_CLIENT_ACL acl; // the access list of the client - + int port_acl; // the operations permitted on the port the client connected to + char *auth_bearer_token; // the Bearer auth token (if sent) size_t header_parse_tries; size_t header_parse_last_size; diff --git a/web/server/web_server.c b/web/server/web_server.c index 5a68b125ea..11f7edf8a4 100644 --- a/web/server/web_server.c +++ b/web/server/web_server.c @@ -3,12 +3,6 @@ #define WEB_SERVER_INTERNALS 1 #include "web_server.h" -// this file includes 3 web servers: -// -// 1. single-threaded, based on select() -// 2. multi-threaded, based on poll() that spawns threads to handle the requests, based on select() -// 3. static-threaded, based on poll() using a fixed number of threads (configured at netdata.conf) - WEB_SERVER_MODE web_server_mode = WEB_SERVER_MODE_STATIC_THREADED; // -------------------------------------------------------------------------------------- @@ -16,28 +10,18 @@ WEB_SERVER_MODE web_server_mode = WEB_SERVER_MODE_STATIC_THREADED; WEB_SERVER_MODE web_server_mode_id(const char *mode) { if(!strcmp(mode, "none")) return WEB_SERVER_MODE_NONE; - else if(!strcmp(mode, "single") || !strcmp(mode, "single-threaded")) - return WEB_SERVER_MODE_SINGLE_THREADED; - else if(!strcmp(mode, "static") || !strcmp(mode, "static-threaded")) + else return WEB_SERVER_MODE_STATIC_THREADED; - else // if(!strcmp(mode, "multi") || !strcmp(mode, "multi-threaded")) - return WEB_SERVER_MODE_MULTI_THREADED; + } const char *web_server_mode_name(WEB_SERVER_MODE id) { switch(id) { case WEB_SERVER_MODE_NONE: return "none"; - - case WEB_SERVER_MODE_SINGLE_THREADED: - return "single-threaded"; - + default: case WEB_SERVER_MODE_STATIC_THREADED: return "static-threaded"; - - default: - case WEB_SERVER_MODE_MULTI_THREADED: - return "multi-threaded"; } } @@ -45,20 +29,44 @@ const char *web_server_mode_name(WEB_SERVER_MODE id) { // API sockets LISTEN_SOCKETS api_sockets = { - .config = &netdata_config, - .config_section = CONFIG_SECTION_WEB, - .default_bind_to = "*", - .default_port = API_LISTEN_PORT, - .backlog = API_LISTEN_BACKLOG + .config = &netdata_config, + .config_section = CONFIG_SECTION_WEB, + .default_bind_to = "*", + .default_port = API_LISTEN_PORT, + .backlog = API_LISTEN_BACKLOG }; -int api_listen_sockets_setup(void) { - int socks = listen_sockets_setup(&api_sockets); +void debug_sockets() { + BUFFER *wb = buffer_create(256 * sizeof(char)); + int i; - if(!socks) - fatal("LISTENER: Cannot listen on any API socket. Exiting..."); + for(i = 0 ; i < (int)api_sockets.opened ; i++) { + buffer_strcat(wb, (api_sockets.fds_acl_flags[i] & WEB_CLIENT_ACL_NOCHECK)?"NONE ":""); + buffer_strcat(wb, (api_sockets.fds_acl_flags[i] & WEB_CLIENT_ACL_DASHBOARD)?"dashboard ":""); + buffer_strcat(wb, (api_sockets.fds_acl_flags[i] & WEB_CLIENT_ACL_REGISTRY)?"registry ":""); + buffer_strcat(wb, (api_sockets.fds_acl_flags[i] & WEB_CLIENT_ACL_BADGE)?"badges ":""); + buffer_strcat(wb, (api_sockets.fds_acl_flags[i] & WEB_CLIENT_ACL_MGMT)?"management ":""); + buffer_strcat(wb, (api_sockets.fds_acl_flags[i] & WEB_CLIENT_ACL_STREAMING)?"streaming ":""); + buffer_strcat(wb, (api_sockets.fds_acl_flags[i] & WEB_CLIENT_ACL_NETDATACONF)?"netdata.conf ":""); + debug(D_WEB_CLIENT, "Socket fd %d name '%s' acl_flags: %s", + i, + api_sockets.fds_names[i], + buffer_tostring(wb)); + buffer_reset(wb); + } + buffer_free(wb); +} - return socks; +void api_listen_sockets_setup(void) { + int socks = listen_sockets_setup(&api_sockets); + + if(!socks) + fatal("LISTENER: Cannot listen on any API socket. Exiting..."); + + if(unlikely(debug_flags & D_WEB_CLIENT)) + debug_sockets(); + + return; } @@ -66,13 +74,14 @@ int api_listen_sockets_setup(void) { // access lists SIMPLE_PATTERN *web_allow_connections_from = NULL; -SIMPLE_PATTERN *web_allow_streaming_from = NULL; -SIMPLE_PATTERN *web_allow_netdataconf_from = NULL; // WEB_CLIENT_ACL SIMPLE_PATTERN *web_allow_dashboard_from = NULL; SIMPLE_PATTERN *web_allow_registry_from = NULL; SIMPLE_PATTERN *web_allow_badges_from = NULL; +SIMPLE_PATTERN *web_allow_mgmt_from = NULL; +SIMPLE_PATTERN *web_allow_streaming_from = NULL; +SIMPLE_PATTERN *web_allow_netdataconf_from = NULL; void web_client_update_acl_matches(struct web_client *w) { w->acl = WEB_CLIENT_ACL_NONE; @@ -85,6 +94,17 @@ void web_client_update_acl_matches(struct web_client *w) { if(!web_allow_badges_from || simple_pattern_matches(web_allow_badges_from, w->client_ip)) w->acl |= WEB_CLIENT_ACL_BADGE; + + if(!web_allow_mgmt_from || simple_pattern_matches(web_allow_mgmt_from, w->client_ip)) + w->acl |= WEB_CLIENT_ACL_MGMT; + + if(!web_allow_streaming_from || simple_pattern_matches(web_allow_streaming_from, w->client_ip)) + w->acl |= WEB_CLIENT_ACL_STREAMING; + + if(!web_allow_netdataconf_from || simple_pattern_matches(web_allow_netdataconf_from, w->client_ip)) + w->acl |= WEB_CLIENT_ACL_NETDATACONF; + + w->acl &= w->port_acl; } @@ -119,28 +139,4 @@ void web_client_initialize_connection(struct web_client *w) { web_client_cache_verify(0); } -struct web_client *web_client_create_on_listenfd(int listener) { - struct web_client *w; - - w = web_client_get_from_cache_or_allocate(); - w->ifd = w->ofd = accept_socket(listener, SOCK_NONBLOCK, w->client_ip, sizeof(w->client_ip), w->client_port, sizeof(w->client_port), web_allow_connections_from); - - if(unlikely(!*w->client_ip)) strcpy(w->client_ip, "-"); - if(unlikely(!*w->client_port)) strcpy(w->client_port, "-"); - - if (w->ifd == -1) { - if(errno == EPERM) - web_server_log_connection(w, "ACCESS DENIED"); - else { - web_server_log_connection(w, "CONNECTION FAILED"); - error("%llu: Failed to accept new incoming connection.", w->id); - } - - web_client_release(w); - return NULL; - } - - web_client_initialize_connection(w); - return(w); -} diff --git a/web/server/web_server.h b/web/server/web_server.h index 7777c8978f..e7c2dd4482 100644 --- a/web/server/web_server.h +++ b/web/server/web_server.h @@ -15,9 +15,7 @@ #endif typedef enum web_server_mode { - WEB_SERVER_MODE_SINGLE_THREADED, WEB_SERVER_MODE_STATIC_THREADED, - WEB_SERVER_MODE_MULTI_THREADED, WEB_SERVER_MODE_NONE } WEB_SERVER_MODE; @@ -27,13 +25,14 @@ extern SIMPLE_PATTERN *web_allow_registry_from; extern SIMPLE_PATTERN *web_allow_badges_from; extern SIMPLE_PATTERN *web_allow_streaming_from; extern SIMPLE_PATTERN *web_allow_netdataconf_from; +extern SIMPLE_PATTERN *web_allow_mgmt_from; extern WEB_SERVER_MODE web_server_mode; extern WEB_SERVER_MODE web_server_mode_id(const char *mode); extern const char *web_server_mode_name(WEB_SERVER_MODE id); -extern int api_listen_sockets_setup(void); +extern void api_listen_sockets_setup(void); #define DEFAULT_TIMEOUT_TO_RECEIVE_FIRST_WEB_REQUEST 60 #define DEFAULT_DISCONNECT_IDLE_WEB_CLIENTS_AFTER_SECONDS 60 @@ -51,8 +50,6 @@ extern struct web_client *web_client_create_on_listenfd(int listener); #include "web_client_cache.h" #endif // WEB_SERVER_INTERNALS -#include "single/single-threaded.h" -#include "multi/multi-threaded.h" #include "static/static-threaded.h" #include "daemon/common.h"