Workers utilization charts (#12807)

* initial version of worker utilization

* working example

* without mutexes

* monitoring DBENGINE, ACLKSYNC, WEB workers

* added charts to monitor worker usage

* fixed charts units

* updated contexts

* updated priorities

* added documentation

* converted threads to stacked chart

* One query per query thread

* Revert "One query per query thread"

This reverts commit 6aeb391f5987c3c6ba2864b559fd7f0cd64b14d3.

* fixed priority for web charts

* read worker cpu utilization from proc

* read workers cpu utilization via /proc/self/task/PID/stat, so that we have cpu utilization even when the jobs are too long to finish within our update_every frequency

* disabled web server cpu utilization monitoring - it is now monitored by worker utilization

* tight integration of worker utilization to web server

* monitoring statsd worker threads

* code cleanup and renaming of variables

* constrained worker and statistics conflict to just one variable

* support for rendering jobs per type

* better priorities and removed the total jobs chart

* added busy time in ms per job type

* added proc.plugin monitoring, switch clock to MONOTONIC_RAW if available, global statistics now cleans up old worker threads

* isolated worker thread families

* added cgroups.plugin workers

* remove unneeded dimensions when the expected worker is just one

* plugins.d and streaming monitoring

* rebased; support worker_is_busy() to be called one after another

* added diskspace plugin monitoring

* added tc.plugin monitoring

* added ML threads monitoring

* don't create dimensions and charts that are not needed

* fix crash when job types are added on the fly

* added timex and idlejitter plugins; collected heartbeat statistics; reworked heartbeat according to the POSIX

* the right name is heartbeat for this chart

* monitor streaming senders

* added streaming senders to global stats

* prevent division by zero

* added clock_init() to external C plugins

* added freebsd and macos plugins

* added freebsd and macos to global statistics

* don't use new as a variable; address compiler warnings on FreeBSD and MacOS

* refactored contexts to be unique; added health threads monitoring

Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
This commit is contained in:
Costa Tsaousis 2022-05-09 16:34:31 +03:00 committed by GitHub
parent 0b3ee50c76
commit eb216a1f4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
42 changed files with 2071 additions and 1097 deletions

View File

@ -410,6 +410,8 @@ set(LIBNETDATA_FILES
libnetdata/string/utf8.h
libnetdata/socket/security.c
libnetdata/socket/security.h
libnetdata/worker_utilization/worker_utilization.c
libnetdata/worker_utilization/worker_utilization.h
libnetdata/circular_buffer/circular_buffer.c
libnetdata/circular_buffer/circular_buffer.h)

View File

@ -187,6 +187,8 @@ LIBNETDATA_FILES = \
libnetdata/health/health.c \
libnetdata/health/health.h \
libnetdata/string/utf8.h \
libnetdata/worker_utilization/worker_utilization.c \
libnetdata/worker_utilization/worker_utilization.h \
$(NULL)
if ENABLE_PLUGIN_EBPF

View File

@ -351,6 +351,8 @@ static void aclk_query_process_msg(struct aclk_query_thread *query_thr, aclk_que
{
for (int i = 0; aclk_query_handlers[i].type != UNKNOWN; i++) {
if (aclk_query_handlers[i].type == query->type) {
worker_is_busy(i);
debug(D_ACLK, "Processing Queued Message of type: \"%s\"", aclk_query_handlers[i].name);
aclk_query_handlers[i].fnc(query_thr, query);
if (aclk_stats_enabled) {
@ -361,6 +363,8 @@ static void aclk_query_process_msg(struct aclk_query_thread *query_thr, aclk_que
ACLK_STATS_UNLOCK;
}
aclk_query_free(query);
worker_is_idle();
return;
}
}
@ -378,21 +382,33 @@ int aclk_query_process_msgs(struct aclk_query_thread *query_thr)
return 0;
}
// Register this thread with the worker-utilization monitoring as an
// "ACLKQUERY" worker, naming one job slot per known query handler so
// busy time can be charted per handler type.
static void worker_aclk_register(void) {
    worker_register("ACLKQUERY");

    int i = 0;
    while (aclk_query_handlers[i].type != UNKNOWN) {
        worker_register_job_name(i, aclk_query_handlers[i].name);
        i++;
    }
}
/**
 * Main ACLK query processing thread.
 *
 * Registers this thread as an ACLK query worker, then loops until
 * netdata_exit is raised: each iteration processes queued messages,
 * marks the worker idle, and blocks on the query condition variable
 * until signalled (presumably by the message producer — confirm
 * against the enqueue path).
 *
 * @param ptr  a struct aclk_query_thread * for this query thread
 * @return always NULL
 */
void *aclk_query_main_thread(void *ptr)
{
    worker_aclk_register();

    struct aclk_query_thread *query_thr = ptr;

    while (!netdata_exit) {
        aclk_query_process_msgs(query_thr);

        // report idle before blocking, so waiting time is not
        // accounted as busy time in the utilization charts
        worker_is_idle();

        QUERY_THREAD_LOCK;
        // pthread_cond_wait() returns non-zero only on error; back off
        // for a second instead of spinning on a broken condition variable
        if (unlikely(pthread_cond_wait(&query_cond_wait, &query_lock_wait)))
            sleep_usec(USEC_PER_SEC * 1);
        QUERY_THREAD_UNLOCK;
    }

    worker_unregister();
    return NULL;
}

View File

@ -360,10 +360,8 @@
#define NETDATA_CHART_PRIO_CHECKS 99999
#define NETDATA_CHART_PRIO_NETDATA_DISKSPACE 132020
#define NETDATA_CHART_PRIO_NETDATA_TIMEX 132030
#define NETDATA_CHART_PRIO_NETDATA_TC_CPU 135000
#define NETDATA_CHART_PRIO_NETDATA_TC_TIME 135001
#define NETDATA_CHART_PRIO_NETDATA_TC_TIME 1000100
#endif //NETDATA_ALL_H

View File

@ -4124,6 +4124,8 @@ static int check_capabilities() {
int main(int argc, char **argv) {
// debug_flags = D_PROCFILE;
clocks_init();
pagesize = (size_t)sysconf(_SC_PAGESIZE);
// set the name for logging

View File

@ -2646,11 +2646,26 @@ static inline void discovery_process_cgroup(struct cgroup *cg) {
read_cgroup_network_interfaces(cg);
}
// Job type indices for the cgroup discovery worker thread, one per
// discovery phase; each gets a human-readable name via
// worker_register_job_name() so utilization charts can break down
// busy time per phase.
#define WORKER_DISCOVERY_INIT 0
#define WORKER_DISCOVERY_FIND 1
#define WORKER_DISCOVERY_PROCESS 2
#define WORKER_DISCOVERY_UPDATE 3
#define WORKER_DISCOVERY_CLEANUP 4
#define WORKER_DISCOVERY_COPY 5
#define WORKER_DISCOVERY_SHARE 6
#define WORKER_DISCOVERY_LOCK 7
// compile-time guard: the workers library must track at least 8 job types
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
#endif
static inline void discovery_find_all_cgroups() {
debug(D_CGROUP, "searching for cgroups");
worker_is_busy(WORKER_DISCOVERY_INIT);
discovery_mark_all_cgroups_as_unavailable();
worker_is_busy(WORKER_DISCOVERY_FIND);
if (!cgroup_use_unified_cgroups) {
discovery_find_all_cgroups_v1();
} else {
@ -2659,16 +2674,25 @@ static inline void discovery_find_all_cgroups() {
struct cgroup *cg;
for (cg = discovered_cgroup_root; cg; cg = cg->discovered_next) {
worker_is_busy(WORKER_DISCOVERY_PROCESS);
discovery_process_cgroup(cg);
}
worker_is_busy(WORKER_DISCOVERY_UPDATE);
discovery_update_filenames();
worker_is_busy(WORKER_DISCOVERY_LOCK);
uv_mutex_lock(&cgroup_root_mutex);
worker_is_busy(WORKER_DISCOVERY_CLEANUP);
discovery_cleanup_all_cgroups();
worker_is_busy(WORKER_DISCOVERY_COPY);
discovery_copy_discovered_cgroups_to_reader();
uv_mutex_unlock(&cgroup_root_mutex);
worker_is_busy(WORKER_DISCOVERY_SHARE);
discovery_share_cgroups_with_ebpf();
debug(D_CGROUP, "done searching for cgroups");
@ -2678,7 +2702,19 @@ void cgroup_discovery_worker(void *ptr)
{
UNUSED(ptr);
worker_register("CGROUPSDISC");
worker_register_job_name(WORKER_DISCOVERY_INIT, "init");
worker_register_job_name(WORKER_DISCOVERY_FIND, "find");
worker_register_job_name(WORKER_DISCOVERY_PROCESS, "process");
worker_register_job_name(WORKER_DISCOVERY_UPDATE, "update");
worker_register_job_name(WORKER_DISCOVERY_CLEANUP, "cleanup");
worker_register_job_name(WORKER_DISCOVERY_COPY, "copy");
worker_register_job_name(WORKER_DISCOVERY_SHARE, "share");
worker_register_job_name(WORKER_DISCOVERY_LOCK, "lock");
while (!netdata_exit) {
worker_is_idle();
uv_mutex_lock(&discovery_thread.mutex);
while (!discovery_thread.start_discovery)
uv_cond_wait(&discovery_thread.cond_var, &discovery_thread.mutex);
@ -2692,6 +2728,7 @@ void cgroup_discovery_worker(void *ptr)
}
discovery_thread.exited = 1;
worker_unregister();
}
// ----------------------------------------------------------------------------
@ -4650,6 +4687,8 @@ void update_cgroup_charts(int update_every) {
// cgroups main
static void cgroup_main_cleanup(void *ptr) {
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -4687,24 +4726,30 @@ static void cgroup_main_cleanup(void *ptr) {
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
}
void *cgroups_main(void *ptr) {
netdata_thread_cleanup_push(cgroup_main_cleanup, ptr);
#define WORKER_CGROUPS_LOCK 0
#define WORKER_CGROUPS_READ 1
#define WORKER_CGROUPS_CHART 2
struct rusage thread;
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 3
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 3
#endif
void *cgroups_main(void *ptr) {
worker_register("CGROUPS");
worker_register_job_name(WORKER_CGROUPS_LOCK, "lock");
worker_register_job_name(WORKER_CGROUPS_READ, "read");
worker_register_job_name(WORKER_CGROUPS_READ, "chart");
netdata_thread_cleanup_push(cgroup_main_cleanup, ptr);
if (getenv("KUBERNETES_SERVICE_HOST") != NULL && getenv("KUBERNETES_SERVICE_PORT") != NULL) {
is_inside_k8s = 1;
cgroup_enable_cpuacct_cpu_shares = CONFIG_BOOLEAN_YES;
}
// when ZERO, attempt to do it
int vdo_cpu_netdata = config_get_boolean("plugin:cgroups", "cgroups plugin resource charts", 1);
read_cgroup_plugin_configuration();
netdata_cgroup_ebpf_initialize_shm();
RRDSET *stcpu_thread = NULL;
if (uv_mutex_init(&cgroup_root_mutex)) {
error("CGROUP: cannot initialize mutex for the main cgroup list");
goto exit;
@ -4736,6 +4781,8 @@ void *cgroups_main(void *ptr) {
usec_t find_every = cgroup_check_for_new_every * USEC_PER_SEC, find_dt = 0;
while(!netdata_exit) {
worker_is_idle();
usec_t hb_dt = heartbeat_next(&hb, step);
if(unlikely(netdata_exit)) break;
@ -4747,46 +4794,21 @@ void *cgroups_main(void *ptr) {
cgroups_check = 0;
}
worker_is_busy(WORKER_CGROUPS_LOCK);
uv_mutex_lock(&cgroup_root_mutex);
worker_is_busy(WORKER_CGROUPS_READ);
read_all_discovered_cgroups(cgroup_root);
worker_is_busy(WORKER_CGROUPS_CHART);
update_cgroup_charts(cgroup_update_every);
worker_is_idle();
uv_mutex_unlock(&cgroup_root_mutex);
// --------------------------------------------------------------------
if(vdo_cpu_netdata) {
getrusage(RUSAGE_THREAD, &thread);
if(unlikely(!stcpu_thread)) {
stcpu_thread = rrdset_create_localhost(
"netdata"
, "plugin_cgroups_cpu"
, NULL
, "cgroups"
, NULL
, "Netdata CGroups Plugin CPU usage"
, "milliseconds/s"
, PLUGIN_CGROUPS_NAME
, "stats"
, 132000
, cgroup_update_every
, RRDSET_TYPE_STACKED
);
rrddim_add(stcpu_thread, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
rrddim_add(stcpu_thread, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
}
else
rrdset_next(stcpu_thread);
rrddim_set(stcpu_thread, "user" , thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec);
rrddim_set(stcpu_thread, "system", thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec);
rrdset_done(stcpu_thread);
}
}
exit:
worker_unregister();
netdata_thread_cleanup_pop(1);
return NULL;
}

View File

@ -224,6 +224,7 @@ void reset_metrics() {
}
int main(int argc, char **argv) {
clocks_init();
// ------------------------------------------------------------------------
// initialization of netdata plugin

View File

@ -365,6 +365,8 @@ static inline void do_disk_space_stats(struct mountinfo *mi, int update_every) {
}
static void diskspace_main_cleanup(void *ptr) {
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -373,10 +375,21 @@ static void diskspace_main_cleanup(void *ptr) {
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
}
void *diskspace_main(void *ptr) {
netdata_thread_cleanup_push(diskspace_main_cleanup, ptr);
#define WORKER_JOB_MOUNTINFO 0
#define WORKER_JOB_MOUNTPOINT 1
#define WORKER_JOB_CLEANUP 2
int vdo_cpu_netdata = config_get_boolean("plugin:proc", "netdata server resources", 1);
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 3
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 3
#endif
void *diskspace_main(void *ptr) {
worker_register("DISKSPACE");
worker_register_job_name(WORKER_JOB_MOUNTINFO, "mountinfo");
worker_register_job_name(WORKER_JOB_MOUNTPOINT, "mountpoint");
worker_register_job_name(WORKER_JOB_CLEANUP, "cleanup");
netdata_thread_cleanup_push(diskspace_main_cleanup, ptr);
cleanup_mount_points = config_get_boolean(CONFIG_SECTION_DISKSPACE, "remove charts of unmounted disks" , cleanup_mount_points);
@ -388,14 +401,11 @@ void *diskspace_main(void *ptr) {
if(check_for_new_mountpoints_every < update_every)
check_for_new_mountpoints_every = update_every;
struct rusage thread;
usec_t duration = 0;
usec_t step = update_every * USEC_PER_SEC;
heartbeat_t hb;
heartbeat_init(&hb);
while(!netdata_exit) {
duration = heartbeat_monotonic_dt_to_now_usec(&hb);
worker_is_idle();
/* usec_t hb_dt = */ heartbeat_next(&hb, step);
if(unlikely(netdata_exit)) break;
@ -404,9 +414,9 @@ void *diskspace_main(void *ptr) {
// --------------------------------------------------------------------------
// this is smart enough not to reload it every time
worker_is_busy(WORKER_JOB_MOUNTINFO);
mountinfo_reload(0);
// --------------------------------------------------------------------------
// disk space metrics
@ -420,80 +430,20 @@ void *diskspace_main(void *ptr) {
if(mi->flags & MOUNTINFO_READONLY && !strcmp(mi->root, mi->mount_point))
continue;
worker_is_busy(WORKER_JOB_MOUNTPOINT);
do_disk_space_stats(mi, update_every);
if(unlikely(netdata_exit)) break;
}
if(unlikely(netdata_exit)) break;
if(dict_mountpoints)
if(dict_mountpoints) {
worker_is_busy(WORKER_JOB_CLEANUP);
dictionary_get_all(dict_mountpoints, mount_point_cleanup, NULL);
if(vdo_cpu_netdata) {
static RRDSET *stcpu_thread = NULL, *st_duration = NULL;
static RRDDIM *rd_user = NULL, *rd_system = NULL, *rd_duration = NULL;
// ----------------------------------------------------------------
getrusage(RUSAGE_THREAD, &thread);
if(unlikely(!stcpu_thread)) {
stcpu_thread = rrdset_create_localhost(
"netdata"
, "plugin_diskspace"
, NULL
, "diskspace"
, NULL
, "Netdata Disk Space Plugin CPU usage"
, "milliseconds/s"
, PLUGIN_DISKSPACE_NAME
, NULL
, NETDATA_CHART_PRIO_NETDATA_DISKSPACE
, update_every
, RRDSET_TYPE_STACKED
);
rd_user = rrddim_add(stcpu_thread, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
rd_system = rrddim_add(stcpu_thread, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
}
else
rrdset_next(stcpu_thread);
rrddim_set_by_pointer(stcpu_thread, rd_user, thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec);
rrddim_set_by_pointer(stcpu_thread, rd_system, thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec);
rrdset_done(stcpu_thread);
// ----------------------------------------------------------------
if(unlikely(!st_duration)) {
st_duration = rrdset_create_localhost(
"netdata"
, "plugin_diskspace_dt"
, NULL
, "diskspace"
, NULL
, "Netdata Disk Space Plugin Duration"
, "milliseconds/run"
, PLUGIN_DISKSPACE_NAME
, NULL
, 132021
, update_every
, RRDSET_TYPE_AREA
);
rd_duration = rrddim_add(st_duration, "duration", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
}
else
rrdset_next(st_duration);
rrddim_set_by_pointer(st_duration, rd_duration, duration);
rrdset_done(st_duration);
// ----------------------------------------------------------------
if(unlikely(netdata_exit)) break;
}
}
worker_unregister();
netdata_thread_cleanup_pop(1);
return NULL;

View File

@ -1864,6 +1864,8 @@ static void ebpf_manage_pid(pid_t pid)
*/
int main(int argc, char **argv)
{
clocks_init();
set_global_variables();
ebpf_parse_args(argc, argv);
ebpf_manage_pid(getpid());

View File

@ -9,7 +9,6 @@ static struct freebsd_module {
int enabled;
int (*func)(int update_every, usec_t dt);
usec_t duration;
RRDDIM *rd;
@ -68,8 +67,14 @@ static struct freebsd_module {
{.name = NULL, .dim = NULL, .enabled = 0, .func = NULL}
};
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 33
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 33
#endif
static void freebsd_main_cleanup(void *ptr)
{
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -80,9 +85,9 @@ static void freebsd_main_cleanup(void *ptr)
void *freebsd_main(void *ptr)
{
netdata_thread_cleanup_push(freebsd_main_cleanup, ptr);
worker_register("FREEBSD");
int vdo_cpu_netdata = config_get_boolean("plugin:freebsd", "netdata server resources", 1);
netdata_thread_cleanup_push(freebsd_main_cleanup, ptr);
// initialize FreeBSD plugin
if (freebsd_plugin_init())
@ -94,8 +99,9 @@ void *freebsd_main(void *ptr)
struct freebsd_module *pm = &freebsd_modules[i];
pm->enabled = config_get_boolean("plugin:freebsd", pm->name, pm->enabled);
pm->duration = 0ULL;
pm->rd = NULL;
worker_register_job_name(i, freebsd_modules[i].dim);
}
usec_t step = localhost->rrd_update_every * USEC_PER_SEC;
@ -103,14 +109,13 @@ void *freebsd_main(void *ptr)
heartbeat_init(&hb);
while (!netdata_exit) {
worker_is_idle();
usec_t hb_dt = heartbeat_next(&hb, step);
usec_t duration = 0ULL;
if (unlikely(netdata_exit))
break;
// BEGIN -- the job to be done
for (i = 0; freebsd_modules[i].name; i++) {
struct freebsd_module *pm = &freebsd_modules[i];
if (unlikely(!pm->enabled))
@ -118,92 +123,12 @@ void *freebsd_main(void *ptr)
debug(D_PROCNETDEV_LOOP, "FREEBSD calling %s.", pm->name);
worker_is_busy(i);
pm->enabled = !pm->func(localhost->rrd_update_every, hb_dt);
pm->duration = heartbeat_monotonic_dt_to_now_usec(&hb) - duration;
duration += pm->duration;
if (unlikely(netdata_exit))
break;
}
// END -- the job is done
if (vdo_cpu_netdata) {
static RRDSET *st_cpu_thread = NULL, *st_duration = NULL;
static RRDDIM *rd_user = NULL, *rd_system = NULL;
// ----------------------------------------------------------------
struct rusage thread;
getrusage(RUSAGE_THREAD, &thread);
if (unlikely(!st_cpu_thread)) {
st_cpu_thread = rrdset_create_localhost(
"netdata",
"plugin_freebsd_cpu",
NULL,
"freebsd",
NULL,
"Netdata FreeBSD plugin CPU usage",
"milliseconds/s",
"freebsd.plugin",
"stats",
132000,
localhost->rrd_update_every,
RRDSET_TYPE_STACKED);
rd_user = rrddim_add(st_cpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
rd_system = rrddim_add(st_cpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
} else {
rrdset_next(st_cpu_thread);
}
rrddim_set_by_pointer(
st_cpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec);
rrddim_set_by_pointer(
st_cpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec);
rrdset_done(st_cpu_thread);
// ----------------------------------------------------------------
if (unlikely(!st_duration)) {
st_duration = rrdset_find_active_bytype_localhost("netdata", "plugin_freebsd_modules");
if (!st_duration) {
st_duration = rrdset_create_localhost(
"netdata",
"plugin_freebsd_modules",
NULL,
"freebsd",
NULL,
"Netdata FreeBSD plugin modules durations",
"milliseconds/run",
"freebsd.plugin",
"stats",
132001,
localhost->rrd_update_every,
RRDSET_TYPE_STACKED);
for (i = 0; freebsd_modules[i].name; i++) {
struct freebsd_module *pm = &freebsd_modules[i];
if (unlikely(!pm->enabled))
continue;
pm->rd = rrddim_add(st_duration, pm->dim, NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
}
}
} else
rrdset_next(st_duration);
for (i = 0; freebsd_modules[i].name; i++) {
struct freebsd_module *pm = &freebsd_modules[i];
if (unlikely(!pm->enabled))
continue;
rrddim_set_by_pointer(st_duration, pm->rd, pm->duration);
}
rrdset_done(st_duration);
}
}
netdata_thread_cleanup_pop(1);

View File

@ -1596,6 +1596,7 @@ int host_is_local(const char *host)
}
int main (int argc, char **argv) {
clocks_init();
// ------------------------------------------------------------------------
// initialization of netdata plugin

View File

@ -5,6 +5,8 @@
#define CPU_IDLEJITTER_SLEEP_TIME_MS 20
static void cpuidlejitter_main_cleanup(void *ptr) {
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -14,6 +16,9 @@ static void cpuidlejitter_main_cleanup(void *ptr) {
}
void *cpuidlejitter_main(void *ptr) {
worker_register("IDLEJITTER");
worker_register_job_name(0, "measurements");
netdata_thread_cleanup_push(cpuidlejitter_main_cleanup, ptr);
usec_t sleep_ut = config_get_number("plugin:idlejitter", "loop time in ms", CPU_IDLEJITTER_SLEEP_TIME_MS) * USEC_PER_MS;
@ -55,7 +60,9 @@ void *cpuidlejitter_main(void *ptr) {
while(elapsed < update_every_ut) {
now_monotonic_high_precision_timeval(&before);
worker_is_idle();
sleep_usec(sleep_ut);
worker_is_busy(0);
now_monotonic_high_precision_timeval(&after);
usec_t dt = dt_usec(&after, &before);

View File

@ -9,7 +9,6 @@ static struct macos_module {
int enabled;
int (*func)(int update_every, usec_t dt);
usec_t duration;
RRDDIM *rd;
@ -22,8 +21,14 @@ static struct macos_module {
{.name = NULL, .dim = NULL, .enabled = 0, .func = NULL}
};
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 3
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 3
#endif
static void macos_main_cleanup(void *ptr)
{
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -34,17 +39,18 @@ static void macos_main_cleanup(void *ptr)
void *macos_main(void *ptr)
{
netdata_thread_cleanup_push(macos_main_cleanup, ptr);
worker_register("MACOS");
int vdo_cpu_netdata = config_get_boolean("plugin:macos", "netdata server resources", CONFIG_BOOLEAN_YES);
netdata_thread_cleanup_push(macos_main_cleanup, ptr);
// check the enabled status for each module
for (int i = 0; macos_modules[i].name; i++) {
struct macos_module *pm = &macos_modules[i];
pm->enabled = config_get_boolean("plugin:macos", pm->name, pm->enabled);
pm->duration = 0ULL;
pm->rd = NULL;
worker_register_job_name(i, macos_modules[i].dim);
}
usec_t step = localhost->rrd_update_every * USEC_PER_SEC;
@ -52,10 +58,8 @@ void *macos_main(void *ptr)
heartbeat_init(&hb);
while (!netdata_exit) {
worker_is_idle();
usec_t hb_dt = heartbeat_next(&hb, step);
usec_t duration = 0ULL;
// BEGIN -- the job to be done
for (int i = 0; macos_modules[i].name; i++) {
struct macos_module *pm = &macos_modules[i];
@ -64,92 +68,12 @@ void *macos_main(void *ptr)
debug(D_PROCNETDEV_LOOP, "macos calling %s.", pm->name);
worker_is_busy(i);
pm->enabled = !pm->func(localhost->rrd_update_every, hb_dt);
pm->duration = heartbeat_monotonic_dt_to_now_usec(&hb) - duration;
duration += pm->duration;
if (unlikely(netdata_exit))
break;
}
// END -- the job is done
if (vdo_cpu_netdata) {
static RRDSET *st_cpu_thread = NULL, *st_duration = NULL;
static RRDDIM *rd_user = NULL, *rd_system = NULL;
// ----------------------------------------------------------------
struct rusage thread;
getrusage(RUSAGE_THREAD, &thread);
if (unlikely(!st_cpu_thread)) {
st_cpu_thread = rrdset_create_localhost(
"netdata",
"plugin_macos_cpu",
NULL,
"macos",
NULL,
"Netdata macOS plugin CPU usage",
"milliseconds/s",
"macos.plugin",
"stats",
132000,
localhost->rrd_update_every,
RRDSET_TYPE_STACKED);
rd_user = rrddim_add(st_cpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
rd_system = rrddim_add(st_cpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
} else {
rrdset_next(st_cpu_thread);
}
rrddim_set_by_pointer(
st_cpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec);
rrddim_set_by_pointer(
st_cpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec);
rrdset_done(st_cpu_thread);
// ----------------------------------------------------------------
if (unlikely(!st_duration)) {
st_duration = rrdset_find_active_bytype_localhost("netdata", "plugin_macos_modules");
if (!st_duration) {
st_duration = rrdset_create_localhost(
"netdata",
"plugin_macos_modules",
NULL,
"macos",
NULL,
"Netdata macOS plugin modules durations",
"milliseconds/run",
"macos.plugin",
"stats",
132001,
localhost->rrd_update_every,
RRDSET_TYPE_STACKED);
for (int i = 0; macos_modules[i].name; i++) {
struct macos_module *pm = &macos_modules[i];
if (unlikely(!pm->enabled))
continue;
pm->rd = rrddim_add(st_duration, pm->dim, NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
}
}
} else
rrdset_next(st_duration);
for (int i = 0; macos_modules[i].name; i++) {
struct macos_module *pm = &macos_modules[i];
if (unlikely(!pm->enabled))
continue;
rrddim_set_by_pointer(st_duration, pm->rd, pm->duration);
}
rrdset_done(st_duration);
}
}
netdata_thread_cleanup_pop(1);

View File

@ -745,6 +745,7 @@ void nfacct_signals()
}
int main(int argc, char **argv) {
clocks_init();
// ------------------------------------------------------------------------
// initialization of netdata plugin

View File

@ -1283,6 +1283,7 @@ void parse_command_line(int argc, char **argv) {
}
int main(int argc, char **argv) {
clocks_init();
// ------------------------------------------------------------------------
// initialization of netdata plugin

View File

@ -230,6 +230,8 @@ static void pluginsd_worker_thread_handle_error(struct plugind *cd, int worker_r
void *pluginsd_worker_thread(void *arg)
{
worker_register("PLUGINSD");
netdata_thread_cleanup_push(pluginsd_worker_thread_cleanup, arg);
struct plugind *cd = (struct plugind *)arg;
@ -260,6 +262,7 @@ void *pluginsd_worker_thread(void *arg)
if (unlikely(!cd->enabled))
break;
}
worker_unregister();
netdata_thread_cleanup_pop(1);
return NULL;
@ -281,6 +284,8 @@ static void pluginsd_main_cleanup(void *data)
info("cleanup completed.");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
worker_unregister();
}
void *pluginsd_main(void *ptr)

View File

@ -9,7 +9,6 @@ static struct proc_module {
int enabled;
int (*func)(int update_every, usec_t dt);
usec_t duration;
RRDDIM *rd;
@ -66,9 +65,7 @@ static struct proc_module {
// ZFS metrics
{.name = "/proc/spl/kstat/zfs/arcstats", .dim = "zfs_arcstats", .func = do_proc_spl_kstat_zfs_arcstats},
{.name = "/proc/spl/kstat/zfs/pool/state",
.dim = "zfs_pool_state",
.func = do_proc_spl_kstat_zfs_pool_state},
{.name = "/proc/spl/kstat/zfs/pool/state",.dim = "zfs_pool_state",.func = do_proc_spl_kstat_zfs_pool_state},
// BTRFS metrics
{.name = "/sys/fs/btrfs", .dim = "btrfs", .func = do_sys_fs_btrfs},
@ -83,6 +80,10 @@ static struct proc_module {
{.name = NULL, .dim = NULL, .func = NULL}
};
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 36
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 36
#endif
static void proc_main_cleanup(void *ptr)
{
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
@ -91,13 +92,15 @@ static void proc_main_cleanup(void *ptr)
info("cleaning up...");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
worker_unregister();
}
void *proc_main(void *ptr)
{
netdata_thread_cleanup_push(proc_main_cleanup, ptr);
worker_register("PROC");
int vdo_cpu_netdata = config_get_boolean("plugin:proc", "netdata server resources", CONFIG_BOOLEAN_YES);
netdata_thread_cleanup_push(proc_main_cleanup, ptr);
config_get_boolean("plugin:proc", "/proc/pagetypeinfo", CONFIG_BOOLEAN_NO);
@ -107,128 +110,34 @@ void *proc_main(void *ptr)
struct proc_module *pm = &proc_modules[i];
pm->enabled = config_get_boolean("plugin:proc", pm->name, CONFIG_BOOLEAN_YES);
pm->duration = 0ULL;
pm->rd = NULL;
worker_register_job_name(i, proc_modules[i].dim);
}
usec_t step = localhost->rrd_update_every * USEC_PER_SEC;
heartbeat_t hb;
heartbeat_init(&hb);
size_t iterations = 0;
while (!netdata_exit) {
iterations++;
(void)iterations;
worker_is_idle();
usec_t hb_dt = heartbeat_next(&hb, step);
usec_t duration = 0ULL;
if (unlikely(netdata_exit))
break;
// BEGIN -- the job to be done
for (i = 0; proc_modules[i].name; i++) {
if (unlikely(netdata_exit))
break;
struct proc_module *pm = &proc_modules[i];
if (unlikely(!pm->enabled))
continue;
debug(D_PROCNETDEV_LOOP, "PROC calling %s.", pm->name);
//#ifdef NETDATA_LOG_ALLOCATIONS
// if(pm->func == do_proc_interrupts)
// log_thread_memory_allocations = iterations;
//#endif
worker_is_busy(i);
pm->enabled = !pm->func(localhost->rrd_update_every, hb_dt);
pm->duration = heartbeat_monotonic_dt_to_now_usec(&hb) - duration;
duration += pm->duration;
//#ifdef NETDATA_LOG_ALLOCATIONS
// if(pm->func == do_proc_interrupts)
// log_thread_memory_allocations = 0;
//#endif
if (unlikely(netdata_exit))
break;
}
// END -- the job is done
if (vdo_cpu_netdata) {
static RRDSET *st_cpu_thread = NULL, *st_duration = NULL;
static RRDDIM *rd_user = NULL, *rd_system = NULL;
// ----------------------------------------------------------------
struct rusage thread;
getrusage(RUSAGE_THREAD, &thread);
if (unlikely(!st_cpu_thread)) {
st_cpu_thread = rrdset_create_localhost(
"netdata",
"plugin_proc_cpu",
NULL,
"proc",
NULL,
"Netdata proc plugin CPU usage",
"milliseconds/s",
"proc",
"stats",
132000,
localhost->rrd_update_every,
RRDSET_TYPE_STACKED);
rd_user = rrddim_add(st_cpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
rd_system = rrddim_add(st_cpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
} else {
rrdset_next(st_cpu_thread);
}
rrddim_set_by_pointer(
st_cpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec);
rrddim_set_by_pointer(
st_cpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec);
rrdset_done(st_cpu_thread);
// ----------------------------------------------------------------
if (unlikely(!st_duration)) {
st_duration = rrdset_find_active_bytype_localhost("netdata", "plugin_proc_modules");
if (!st_duration) {
st_duration = rrdset_create_localhost(
"netdata",
"plugin_proc_modules",
NULL,
"proc",
NULL,
"Netdata proc plugin modules durations",
"milliseconds/run",
"proc",
"stats",
132001,
localhost->rrd_update_every,
RRDSET_TYPE_STACKED);
for (i = 0; proc_modules[i].name; i++) {
struct proc_module *pm = &proc_modules[i];
if (unlikely(!pm->enabled))
continue;
pm->rd = rrddim_add(st_duration, pm->dim, NULL, 1, USEC_PER_MS, RRD_ALGORITHM_ABSOLUTE);
}
}
} else
rrdset_next(st_duration);
for (i = 0; proc_modules[i].name; i++) {
struct proc_module *pm = &proc_modules[i];
if (unlikely(!pm->enabled))
continue;
rrddim_set_by_pointer(st_duration, pm->rd, pm->duration);
}
rrdset_done(st_duration);
}
}

View File

@ -336,6 +336,7 @@ void usage(void) {
}
int main(int argc, char **argv) {
clocks_init();
program_name = argv[0];
program_version = "0.1";

View File

@ -9,6 +9,15 @@
#define STATSD_LISTEN_PORT 8125
#define STATSD_LISTEN_BACKLOG 4096
// Job type indices for the statsd collection threads; passed to
// worker_is_busy() in the socket callbacks so utilization charts can
// split busy time by socket event type.
#define WORKER_JOB_TYPE_TCP_CONNECTED 0
#define WORKER_JOB_TYPE_TCP_DISCONNECTED 1
#define WORKER_JOB_TYPE_RCV_DATA 2
#define WORKER_JOB_TYPE_SND_DATA 3
// compile-time guard: the workers library must track at least 4 job types
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 4
#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least 4
#endif
// --------------------------------------------------------------------------------------
// #define STATSD_MULTITHREADED 1
@ -237,10 +246,6 @@ struct collection_thread_status {
size_t max_sockets;
netdata_thread_t thread;
struct rusage rusage;
RRDSET *st_cpu;
RRDDIM *rd_user;
RRDDIM *rd_system;
};
static struct statsd {
@ -788,6 +793,7 @@ static void *statsd_add_callback(POLLINFO *pi, short int *events, void *data) {
(void)pi;
(void)data;
worker_is_busy(WORKER_JOB_TYPE_TCP_CONNECTED);
*events = POLLIN;
struct statsd_tcp *t = (struct statsd_tcp *)callocz(sizeof(struct statsd_tcp) + STATSD_TCP_BUFFER_SIZE, 1);
@ -796,11 +802,14 @@ static void *statsd_add_callback(POLLINFO *pi, short int *events, void *data) {
statsd.tcp_socket_connects++;
statsd.tcp_socket_connected++;
worker_is_idle();
return t;
}
// TCP client disconnected
static void statsd_del_callback(POLLINFO *pi) {
worker_is_busy(WORKER_JOB_TYPE_TCP_DISCONNECTED);
struct statsd_tcp *t = pi->data;
if(likely(t)) {
@ -818,10 +827,15 @@ static void statsd_del_callback(POLLINFO *pi) {
freez(t);
}
worker_is_idle();
}
// Receive data
static int statsd_rcv_callback(POLLINFO *pi, short int *events) {
int retval = -1;
worker_is_busy(WORKER_JOB_TYPE_RCV_DATA);
*events = POLLIN;
int fd = pi->fd;
@ -832,14 +846,16 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) {
if(unlikely(!d)) {
error("STATSD: internal error: expected TCP data pointer is NULL");
statsd.socket_errors++;
return -1;
retval = -1;
goto cleanup;
}
#ifdef NETDATA_INTERNAL_CHECKS
if(unlikely(d->type != STATSD_SOCKET_DATA_TYPE_TCP)) {
error("STATSD: internal error: socket data type should be %d, but it is %d", (int)STATSD_SOCKET_DATA_TYPE_TCP, (int)d->type);
statsd.socket_errors++;
return -1;
retval = -1;
goto cleanup;
}
#endif
@ -872,8 +888,10 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) {
d->len = statsd_process(d->buffer, d->len, 1);
}
if(unlikely(ret == -1))
return -1;
if(unlikely(ret == -1)) {
retval = -1;
goto cleanup;
}
} while (rc != -1);
break;
@ -884,14 +902,16 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) {
if(unlikely(!d)) {
error("STATSD: internal error: expected UDP data pointer is NULL");
statsd.socket_errors++;
return -1;
retval = -1;
goto cleanup;
}
#ifdef NETDATA_INTERNAL_CHECKS
if(unlikely(d->type != STATSD_SOCKET_DATA_TYPE_UDP)) {
error("STATSD: internal error: socket data should be %d, but it is %d", (int)d->type, (int)STATSD_SOCKET_DATA_TYPE_UDP);
statsd.socket_errors++;
return -1;
retval = -1;
goto cleanup;
}
#endif
@ -904,7 +924,8 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) {
if (errno != EWOULDBLOCK && errno != EAGAIN && errno != EINTR) {
error("STATSD: recvmmsg() on UDP socket %d failed.", fd);
statsd.socket_errors++;
return -1;
retval = -1;
goto cleanup;
}
} else if (rc) {
// data received
@ -929,7 +950,8 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) {
if (errno != EWOULDBLOCK && errno != EAGAIN && errno != EINTR) {
error("STATSD: recv() on UDP socket %d failed.", fd);
statsd.socket_errors++;
return -1;
retval = -1;
goto cleanup;
}
} else if (rc) {
// data received
@ -947,24 +969,26 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) {
default: {
error("STATSD: internal error: unknown socktype %d on socket %d", pi->socktype, fd);
statsd.socket_errors++;
return -1;
retval = -1;
goto cleanup;
}
}
return 0;
retval = 0;
cleanup:
worker_is_idle();
return retval;
}
static int statsd_snd_callback(POLLINFO *pi, short int *events) {
(void)pi;
(void)events;
worker_is_busy(WORKER_JOB_TYPE_SND_DATA);
error("STATSD: snd_callback() called, but we never requested to send data to statsd clients.");
return -1;
}
worker_is_idle();
static void statsd_timer_callback(void *timer_data) {
struct collection_thread_status *status = timer_data;
getrusage(RUSAGE_THREAD, &status->rusage);
return -1;
}
// --------------------------------------------------------------------------------------------------------------------
@ -986,12 +1010,19 @@ void statsd_collector_thread_cleanup(void *data) {
#endif
freez(d);
worker_unregister();
}
void *statsd_collector_thread(void *ptr) {
struct collection_thread_status *status = ptr;
status->status = 1;
worker_register("STATSD");
worker_register_job_name(WORKER_JOB_TYPE_TCP_CONNECTED, "tcp connect");
worker_register_job_name(WORKER_JOB_TYPE_TCP_DISCONNECTED, "tcp disconnect");
worker_register_job_name(WORKER_JOB_TYPE_RCV_DATA, "receive");
worker_register_job_name(WORKER_JOB_TYPE_SND_DATA, "send");
info("STATSD collector thread started with taskid %d", gettid());
struct statsd_udp *d = callocz(sizeof(struct statsd_udp), 1);
@ -1019,7 +1050,7 @@ void *statsd_collector_thread(void *ptr) {
, statsd_del_callback
, statsd_rcv_callback
, statsd_snd_callback
, statsd_timer_callback
, NULL
, NULL // No access control pattern
, 0 // No dns lookups for access control pattern
, (void *)d
@ -2147,9 +2178,32 @@ static void statsd_main_cleanup(void *data) {
info("STATSD: cleanup completed.");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
worker_unregister();
}
// job types for the statsd flushing ("STATSDFLUSH") worker thread - each value
// indexes a slot in the per-worker utilization statistics, so they must stay
// below WORKER_UTILIZATION_MAX_JOB_TYPES (enforced by the check below)
#define WORKER_STATSD_FLUSH_GAUGES 0
#define WORKER_STATSD_FLUSH_COUNTERS 1
#define WORKER_STATSD_FLUSH_METERS 2
#define WORKER_STATSD_FLUSH_TIMERS 3
#define WORKER_STATSD_FLUSH_HISTOGRAMS 4
#define WORKER_STATSD_FLUSH_SETS 5
#define WORKER_STATSD_FLUSH_STATS 6

// 7 job types are defined above (0..6), so the limit must be at least 7
// (the original message said "at least 6", contradicting the < 7 condition)
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 7
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 7
#endif
void *statsd_main(void *ptr) {
worker_register("STATSDFLUSH");
worker_register_job_name(WORKER_STATSD_FLUSH_GAUGES, "gauges");
worker_register_job_name(WORKER_STATSD_FLUSH_COUNTERS, "counters");
worker_register_job_name(WORKER_STATSD_FLUSH_METERS, "meters");
worker_register_job_name(WORKER_STATSD_FLUSH_TIMERS, "timers");
worker_register_job_name(WORKER_STATSD_FLUSH_HISTOGRAMS, "histograms");
worker_register_job_name(WORKER_STATSD_FLUSH_SETS, "sets");
worker_register_job_name(WORKER_STATSD_FLUSH_STATS, "statistics");
netdata_thread_cleanup_push(statsd_main_cleanup, ptr);
// ----------------------------------------------------------------------------------------------------------------
@ -2420,71 +2474,37 @@ void *statsd_main(void *ptr) {
);
RRDDIM *rd_pcharts = rrddim_add(st_pcharts, "charts", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
RRDSET *stcpu_thread = rrdset_create_localhost(
"netdata"
, "plugin_statsd_charting_cpu"
, NULL
, "statsd"
, "netdata.statsd_cpu"
, "Netdata statsd charting thread CPU usage"
, "milliseconds/s"
, PLUGIN_STATSD_NAME
, "stats"
, 132001
, statsd.update_every
, RRDSET_TYPE_STACKED
);
RRDDIM *rd_user = rrddim_add(stcpu_thread, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
RRDDIM *rd_system = rrddim_add(stcpu_thread, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
struct rusage thread;
for(i = 0; i < statsd.threads ;i++) {
char id[100 + 1];
char title[100 + 1];
snprintfz(id, 100, "plugin_statsd_collector%d_cpu", i + 1);
snprintfz(title, 100, "Netdata statsd collector thread No %d CPU usage", i + 1);
statsd.collection_threads_status[i].st_cpu = rrdset_create_localhost(
"netdata"
, id
, NULL
, "statsd"
, "netdata.statsd_cpu"
, title
, "milliseconds/s"
, PLUGIN_STATSD_NAME
, "stats"
, 132002 + i
, statsd.update_every
, RRDSET_TYPE_STACKED
);
statsd.collection_threads_status[i].rd_user = rrddim_add(statsd.collection_threads_status[i].st_cpu, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
statsd.collection_threads_status[i].rd_system = rrddim_add(statsd.collection_threads_status[i].st_cpu, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
}
// ----------------------------------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------------------------------
// statsd thread to turn metrics into charts
usec_t step = statsd.update_every * USEC_PER_SEC;
heartbeat_t hb;
heartbeat_init(&hb);
while(!netdata_exit) {
worker_is_idle();
usec_t hb_dt = heartbeat_next(&hb, step);
worker_is_busy(WORKER_STATSD_FLUSH_GAUGES);
statsd_flush_index_metrics(&statsd.gauges, statsd_flush_gauge);
worker_is_busy(WORKER_STATSD_FLUSH_COUNTERS);
statsd_flush_index_metrics(&statsd.counters, statsd_flush_counter);
worker_is_busy(WORKER_STATSD_FLUSH_METERS);
statsd_flush_index_metrics(&statsd.meters, statsd_flush_meter);
worker_is_busy(WORKER_STATSD_FLUSH_TIMERS);
statsd_flush_index_metrics(&statsd.timers, statsd_flush_timer);
worker_is_busy(WORKER_STATSD_FLUSH_HISTOGRAMS);
statsd_flush_index_metrics(&statsd.histograms, statsd_flush_histogram);
worker_is_busy(WORKER_STATSD_FLUSH_SETS);
statsd_flush_index_metrics(&statsd.sets, statsd_flush_set);
worker_is_busy(WORKER_STATSD_FLUSH_STATS);
statsd_update_all_app_charts();
getrusage(RUSAGE_THREAD, &thread);
if(unlikely(netdata_exit))
break;
@ -2498,9 +2518,6 @@ void *statsd_main(void *ptr) {
rrdset_next(st_tcp_connects);
rrdset_next(st_tcp_connected);
rrdset_next(st_pcharts);
rrdset_next(stcpu_thread);
for(i = 0; i < statsd.threads ;i++)
rrdset_next(statsd.collection_threads_status[i].st_cpu);
}
rrddim_set_by_pointer(st_metrics, rd_metrics_gauge, (collected_number)statsd.gauges.metrics);
@ -2550,16 +2567,6 @@ void *statsd_main(void *ptr) {
rrddim_set_by_pointer(st_pcharts, rd_pcharts, (collected_number)statsd.private_charts);
rrdset_done(st_pcharts);
rrddim_set_by_pointer(stcpu_thread, rd_user, thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec);
rrddim_set_by_pointer(stcpu_thread, rd_system, thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec);
rrdset_done(stcpu_thread);
for(i = 0; i < statsd.threads ;i++) {
rrddim_set_by_pointer(statsd.collection_threads_status[i].st_cpu, statsd.collection_threads_status[i].rd_user, statsd.collection_threads_status[i].rusage.ru_utime.tv_sec * 1000000ULL + statsd.collection_threads_status[i].rusage.ru_utime.tv_usec);
rrddim_set_by_pointer(statsd.collection_threads_status[i].st_cpu, statsd.collection_threads_status[i].rd_system, statsd.collection_threads_status[i].rusage.ru_stime.tv_sec * 1000000ULL + statsd.collection_threads_status[i].rusage.ru_stime.tv_usec);
rrdset_done(statsd.collection_threads_status[i].st_cpu);
}
}
cleanup: ; // added semi-colon to prevent older gcc error: label at end of compound statement

View File

@ -844,6 +844,8 @@ static inline void tc_split_words(char *str, char **words, int max_words) {
static pid_t tc_child_pid = 0;
static void tc_main_cleanup(void *ptr) {
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -864,10 +866,35 @@ static void tc_main_cleanup(void *ptr) {
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
}
void *tc_main(void *ptr) {
netdata_thread_cleanup_push(tc_main_cleanup, ptr);
// job types for the tc.plugin ("TC") worker thread - one per tc-helper
// protocol keyword parsed in tc_main(); each value indexes a slot in the
// per-worker utilization statistics
#define WORKER_TC_CLASS 0
#define WORKER_TC_BEGIN 1
#define WORKER_TC_END 2
#define WORKER_TC_SENT 3
#define WORKER_TC_LENDED 4
#define WORKER_TC_TOKENS 5
#define WORKER_TC_SETDEVICENAME 6
#define WORKER_TC_SETDEVICEGROUP 7
#define WORKER_TC_SETCLASSNAME 8
#define WORKER_TC_WORKTIME 9
struct rusage thread;
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
#endif
void *tc_main(void *ptr) {
worker_register("TC");
worker_register_job_name(WORKER_TC_CLASS, "class");
worker_register_job_name(WORKER_TC_BEGIN, "begin");
worker_register_job_name(WORKER_TC_END, "end");
worker_register_job_name(WORKER_TC_SENT, "sent");
worker_register_job_name(WORKER_TC_LENDED, "lended");
worker_register_job_name(WORKER_TC_TOKENS, "tokens");
worker_register_job_name(WORKER_TC_SETDEVICENAME, "devicename");
worker_register_job_name(WORKER_TC_SETDEVICEGROUP, "devicegroup");
worker_register_job_name(WORKER_TC_SETCLASSNAME, "classname");
worker_register_job_name(WORKER_TC_WORKTIME, "worktime");
netdata_thread_cleanup_push(tc_main_cleanup, ptr);
char command[FILENAME_MAX + 1];
char *words[PLUGINSD_MAX_WORDS] = { NULL };
@ -913,6 +940,7 @@ void *tc_main(void *ptr) {
if(unlikely(!words[0] || !*words[0])) {
// debug(D_TC_LOOP, "empty line");
worker_is_idle();
continue;
}
// else debug(D_TC_LOOP, "First word is '%s'", words[0]);
@ -920,6 +948,8 @@ void *tc_main(void *ptr) {
first_hash = simple_hash(words[0]);
if(unlikely(device && ((first_hash == CLASS_HASH && strcmp(words[0], "class") == 0) || (first_hash == QDISC_HASH && strcmp(words[0], "qdisc") == 0)))) {
worker_is_busy(WORKER_TC_CLASS);
// debug(D_TC_LOOP, "CLASS line on class id='%s', parent='%s', parentid='%s', leaf='%s', leafid='%s'", words[2], words[3], words[4], words[5], words[6]);
char *type = words[1]; // the class/qdisc type: htb, fq_codel, etc
@ -949,6 +979,7 @@ void *tc_main(void *ptr) {
// there should be an IFB interface for this
class = NULL;
worker_is_idle();
continue;
}
@ -985,6 +1016,8 @@ void *tc_main(void *ptr) {
}
}
else if(unlikely(first_hash == END_HASH && strcmp(words[0], "END") == 0)) {
worker_is_busy(WORKER_TC_END);
// debug(D_TC_LOOP, "END line");
if(likely(device)) {
@ -998,6 +1031,8 @@ void *tc_main(void *ptr) {
class = NULL;
}
else if(unlikely(first_hash == BEGIN_HASH && strcmp(words[0], "BEGIN") == 0)) {
worker_is_busy(WORKER_TC_BEGIN);
// debug(D_TC_LOOP, "BEGIN line on device '%s'", words[1]);
if(likely(words[1] && *words[1])) {
@ -1011,6 +1046,8 @@ void *tc_main(void *ptr) {
class = NULL;
}
else if(unlikely(device && class && first_hash == SENT_HASH && strcmp(words[0], "Sent") == 0)) {
worker_is_busy(WORKER_TC_SENT);
// debug(D_TC_LOOP, "SENT line '%s'", words[1]);
if(likely(words[1] && *words[1])) {
class->bytes = str2ull(words[1]);
@ -1033,6 +1070,8 @@ void *tc_main(void *ptr) {
class->requeues = str2ull(words[8]);
}
else if(unlikely(device && class && class->updated && first_hash == LENDED_HASH && strcmp(words[0], "lended:") == 0)) {
worker_is_busy(WORKER_TC_LENDED);
// debug(D_TC_LOOP, "LENDED line '%s'", words[1]);
if(likely(words[1] && *words[1]))
class->lended = str2ull(words[1]);
@ -1044,6 +1083,8 @@ void *tc_main(void *ptr) {
class->giants = str2ull(words[5]);
}
else if(unlikely(device && class && class->updated && first_hash == TOKENS_HASH && strcmp(words[0], "tokens:") == 0)) {
worker_is_busy(WORKER_TC_TOKENS);
// debug(D_TC_LOOP, "TOKENS line '%s'", words[1]);
if(likely(words[1] && *words[1]))
class->tokens = str2ull(words[1]);
@ -1052,16 +1093,22 @@ void *tc_main(void *ptr) {
class->ctokens = str2ull(words[3]);
}
else if(unlikely(device && first_hash == SETDEVICENAME_HASH && strcmp(words[0], "SETDEVICENAME") == 0)) {
worker_is_busy(WORKER_TC_SETDEVICENAME);
// debug(D_TC_LOOP, "SETDEVICENAME line '%s'", words[1]);
if(likely(words[1] && *words[1]))
tc_device_set_device_name(device, words[1]);
}
else if(unlikely(device && first_hash == SETDEVICEGROUP_HASH && strcmp(words[0], "SETDEVICEGROUP") == 0)) {
worker_is_busy(WORKER_TC_SETDEVICEGROUP);
// debug(D_TC_LOOP, "SETDEVICEGROUP line '%s'", words[1]);
if(likely(words[1] && *words[1]))
tc_device_set_device_family(device, words[1]);
}
else if(unlikely(device && first_hash == SETCLASSNAME_HASH && strcmp(words[0], "SETCLASSNAME") == 0)) {
worker_is_busy(WORKER_TC_SETCLASSNAME);
// debug(D_TC_LOOP, "SETCLASSNAME line '%s' '%s'", words[1], words[2]);
char *id = words[1];
char *path = words[2];
@ -1069,36 +1116,9 @@ void *tc_main(void *ptr) {
tc_device_set_class_name(device, id, path);
}
else if(unlikely(first_hash == WORKTIME_HASH && strcmp(words[0], "WORKTIME") == 0)) {
worker_is_busy(WORKER_TC_WORKTIME);
// debug(D_TC_LOOP, "WORKTIME line '%s' '%s'", words[1], words[2]);
getrusage(RUSAGE_THREAD, &thread);
static RRDSET *stcpu = NULL;
static RRDDIM *rd_user = NULL, *rd_system = NULL;
if(unlikely(!stcpu)) {
stcpu = rrdset_create_localhost(
"netdata"
, "plugin_tc_cpu"
, NULL
, "tc.helper"
, NULL
, "Netdata TC CPU usage"
, "milliseconds/s"
, PLUGIN_TC_NAME
, NULL
, NETDATA_CHART_PRIO_NETDATA_TC_CPU
, localhost->rrd_update_every
, RRDSET_TYPE_STACKED
);
rd_user = rrddim_add(stcpu, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
rd_system = rrddim_add(stcpu, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
}
else rrdset_next(stcpu);
rrddim_set_by_pointer(stcpu, rd_user , thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec);
rrddim_set_by_pointer(stcpu, rd_system, thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec);
rrdset_done(stcpu);
static RRDSET *sttime = NULL;
static RRDDIM *rd_run_time = NULL;
@ -1107,8 +1127,8 @@ void *tc_main(void *ptr) {
"netdata"
, "plugin_tc_time"
, NULL
, "tc.helper"
, NULL
, "workers plugin tc"
, "netdata.workers.tc.script_time"
, "Netdata TC script execution"
, "milliseconds/run"
, PLUGIN_TC_NAME
@ -1128,6 +1148,8 @@ void *tc_main(void *ptr) {
//else {
// debug(D_TC_LOOP, "IGNORED line");
//}
worker_is_idle();
}
// fgets() failed or loop broke
@ -1158,6 +1180,7 @@ void *tc_main(void *ptr) {
}
cleanup: ; // added semi-colon to prevent older gcc error: label at end of compound statement
worker_unregister();
netdata_thread_cleanup_pop(1);
return NULL;
}

View File

@ -32,6 +32,8 @@ struct status_codes {
static void timex_main_cleanup(void *ptr)
{
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -42,9 +44,10 @@ static void timex_main_cleanup(void *ptr)
void *timex_main(void *ptr)
{
netdata_thread_cleanup_push(timex_main_cleanup, ptr);
worker_register("TIMEX");
worker_register_job_name(0, "clock check");
int vdo_cpu_netdata = config_get_boolean(CONFIG_SECTION_TIMEX, "timex plugin resource charts", CONFIG_BOOLEAN_YES);
netdata_thread_cleanup_push(timex_main_cleanup, ptr);
int update_every = (int)config_get_number(CONFIG_SECTION_TIMEX, "update every", 10);
if (update_every < localhost->rrd_update_every)
@ -62,8 +65,9 @@ void *timex_main(void *ptr)
heartbeat_t hb;
heartbeat_init(&hb);
while (!netdata_exit) {
usec_t duration = heartbeat_monotonic_dt_to_now_usec(&hb);
worker_is_idle();
heartbeat_next(&hb, step);
worker_is_busy(0);
struct timex timex_buf = {};
int sync_state = 0;
@ -170,68 +174,6 @@ void *timex_main(void *ptr)
rrddim_set_by_pointer(st_offset, rd_offset, timex_buf.offset);
rrdset_done(st_offset);
}
if (vdo_cpu_netdata) {
static RRDSET *stcpu_thread = NULL, *st_duration = NULL;
static RRDDIM *rd_user = NULL, *rd_system = NULL, *rd_duration = NULL;
// ----------------------------------------------------------------
struct rusage thread;
getrusage(RUSAGE_THREAD, &thread);
if (unlikely(!stcpu_thread)) {
stcpu_thread = rrdset_create_localhost(
"netdata",
"plugin_timex",
NULL,
"timex",
NULL,
"Netdata Timex Plugin CPU usage",
"milliseconds/s",
PLUGIN_TIMEX_NAME,
NULL,
NETDATA_CHART_PRIO_NETDATA_TIMEX,
update_every,
RRDSET_TYPE_STACKED);
rd_user = rrddim_add(stcpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
rd_system = rrddim_add(stcpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL);
} else {
rrdset_next(stcpu_thread);
}
rrddim_set_by_pointer(
stcpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec);
rrddim_set_by_pointer(
stcpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec);
rrdset_done(stcpu_thread);
// ----------------------------------------------------------------
if (unlikely(!st_duration)) {
st_duration = rrdset_create_localhost(
"netdata",
"plugin_timex_dt",
NULL,
"timex",
NULL,
"Netdata Timex Plugin Duration",
"milliseconds/run",
PLUGIN_TIMEX_NAME,
NULL,
NETDATA_CHART_PRIO_NETDATA_TIMEX + 1,
update_every,
RRDSET_TYPE_AREA);
rd_duration = rrddim_add(st_duration, "duration", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_ABSOLUTE);
} else {
rrdset_next(st_duration);
}
rrddim_set_by_pointer(st_duration, rd_duration, duration);
rrdset_done(st_duration);
}
}
exit:

View File

@ -920,6 +920,7 @@ static void xenstat_send_domain_metrics() {
}
int main(int argc, char **argv) {
clocks_init();
// ------------------------------------------------------------------------
// initialization of netdata plugin

View File

@ -1767,6 +1767,7 @@ AC_CONFIG_FILES([
libnetdata/url/Makefile
libnetdata/json/Makefile
libnetdata/health/Makefile
libnetdata/worker_utilization/Makefile
registry/Makefile
streaming/Makefile
system/Makefile

File diff suppressed because it is too large Load Diff

View File

@ -11,6 +11,10 @@ rrdeng_stats_t global_flushing_pressure_page_deletions = 0;
static unsigned pages_per_extent = MAX_PAGES_PER_EXTENT;
#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2)
#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2)
#endif
void *dbengine_page_alloc() {
void *page = netdata_mmap(NULL, RRDENG_BLOCK_SIZE, MAP_PRIVATE, enable_ksm);
if(!page) fatal("Cannot allocate dbengine page cache page, with mmap()");
@ -23,6 +27,8 @@ void dbengine_page_free(void *page) {
static void sanity_check(void)
{
BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2));
/* Magic numbers must fit in the super-blocks */
BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) > RRDENG_MAGIC_SZ);
BUILD_BUG_ON(strlen(RRDENG_JF_MAGIC) > RRDENG_MAGIC_SZ);
@ -1085,13 +1091,17 @@ void async_cb(uv_async_t *handle)
void timer_cb(uv_timer_t* handle)
{
worker_is_busy(RRDENG_MAX_OPCODE + 1);
struct rrdengine_worker_config* wc = handle->data;
struct rrdengine_instance *ctx = wc->ctx;
uv_stop(handle->loop);
uv_update_time(handle->loop);
if (unlikely(!ctx->metalog_ctx->initialized))
if (unlikely(!ctx->metalog_ctx->initialized)) {
worker_is_idle();
return; /* Wait for the metadata log to initialize */
}
rrdeng_test_quota(wc);
debug(D_RRDENGINE, "%s: timeout reached.", __func__);
if (likely(!wc->now_deleting_files && !wc->now_invalidating_dirty_pages)) {
@ -1133,12 +1143,26 @@ void timer_cb(uv_timer_t* handle)
debug(D_RRDENGINE, "%s", get_rrdeng_statistics(wc->ctx, buf, sizeof(buf)));
}
#endif
worker_is_idle();
}
#define MAX_CMD_BATCH_SIZE (256)
void rrdeng_worker(void* arg)
{
worker_register("DBENGINE");
worker_register_job_name(RRDENG_NOOP, "noop");
worker_register_job_name(RRDENG_READ_PAGE, "page read");
worker_register_job_name(RRDENG_READ_EXTENT, "extent read");
worker_register_job_name(RRDENG_COMMIT_PAGE, "commit");
worker_register_job_name(RRDENG_FLUSH_PAGES, "flush");
worker_register_job_name(RRDENG_SHUTDOWN, "shutdown");
worker_register_job_name(RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, "page lru");
worker_register_job_name(RRDENG_QUIESCE, "quiesce");
worker_register_job_name(RRDENG_MAX_OPCODE, "cleanup");
worker_register_job_name(RRDENG_MAX_OPCODE + 1, "timer");
struct rrdengine_worker_config* wc = arg;
struct rrdengine_instance *ctx = wc->ctx;
uv_loop_t* loop;
@ -1188,7 +1212,9 @@ void rrdeng_worker(void* arg)
shutdown = 0;
int set_name = 0;
while (likely(shutdown == 0 || rrdeng_threads_alive(wc))) {
worker_is_idle();
uv_run(loop, UV_RUN_DEFAULT);
worker_is_busy(RRDENG_MAX_OPCODE);
rrdeng_cleanup_finished_threads(wc);
/* wait for commands */
@ -1205,6 +1231,9 @@ void rrdeng_worker(void* arg)
opcode = cmd.opcode;
++cmd_batch_size;
if(likely(opcode != RRDENG_NOOP))
worker_is_busy(opcode);
switch (opcode) {
case RRDENG_NOOP:
/* the command queue was empty, do nothing */
@ -1281,6 +1310,7 @@ void rrdeng_worker(void* arg)
fatal_assert(0 == uv_loop_close(loop));
freez(loop);
worker_unregister();
return;
error_after_timer_init:
@ -1293,6 +1323,7 @@ error_after_loop_init:
wc->error = UV_EAGAIN;
/* wake up initialization thread */
completion_mark_complete(&ctx->rrdengine_completion);
worker_unregister();
}
/* C entry point for development purposes

View File

@ -10,6 +10,11 @@
#include "../../aclk/aclk.h"
#endif
// Compile-time sanity check for the ACLK sync worker: every value of
// enum aclk_database_opcode (up to ACLK_MAX_ENUMERATIONS_DEFINED) is used
// as a worker job type, so the worker utilization limit must cover them all.
// BUILD_BUG_ON() makes the build fail if the limit is too small; the function
// body generates no runtime work.
void sanity_check(void) {
    // make sure the compiler will stop on misconfigurations
    BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < ACLK_MAX_ENUMERATIONS_DEFINED);
}
const char *aclk_sync_config[] = {
"CREATE TABLE IF NOT EXISTS dimension_delete (dimension_id blob, dimension_name text, chart_type_id text, "
"dim_id blob, chart_id blob, host_id blob, date_created);",
@ -352,6 +357,29 @@ static void timer_cb(uv_timer_t* handle)
void aclk_database_worker(void *arg)
{
worker_register("ACLKSYNC");
worker_register_job_name(ACLK_DATABASE_NOOP, "noop");
#ifdef ENABLE_NEW_CLOUD_PROTOCOL
worker_register_job_name(ACLK_DATABASE_ADD_CHART, "chart add");
worker_register_job_name(ACLK_DATABASE_ADD_DIMENSION, "dimension add");
worker_register_job_name(ACLK_DATABASE_PUSH_CHART, "chart push");
worker_register_job_name(ACLK_DATABASE_PUSH_CHART_CONFIG, "chart conf push");
worker_register_job_name(ACLK_DATABASE_RESET_CHART, "chart reset");
worker_register_job_name(ACLK_DATABASE_CHART_ACK, "chart ack");
worker_register_job_name(ACLK_DATABASE_UPD_RETENTION, "retention check");
worker_register_job_name(ACLK_DATABASE_DIM_DELETION, "dimension delete");
worker_register_job_name(ACLK_DATABASE_ORPHAN_HOST, "node orphan");
#endif
worker_register_job_name(ACLK_DATABASE_ALARM_HEALTH_LOG, "alert log");
worker_register_job_name(ACLK_DATABASE_CLEANUP, "cleanup");
worker_register_job_name(ACLK_DATABASE_DELETE_HOST, "node delete");
worker_register_job_name(ACLK_DATABASE_NODE_INFO, "node info");
worker_register_job_name(ACLK_DATABASE_PUSH_ALERT, "alert push");
worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_CONFIG, "alert conf push");
worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, "alert snapshot");
worker_register_job_name(ACLK_DATABASE_QUEUE_REMOVED_ALERTS, "alerts check");
worker_register_job_name(ACLK_DATABASE_TIMER, "timer");
struct aclk_database_worker_config *wc = arg;
uv_loop_t *loop;
int ret;
@ -413,6 +441,7 @@ void aclk_database_worker(void *arg)
debug(D_ACLK_SYNC,"Node %s reports pending message count = %u", wc->node_id, wc->chart_payload_count);
while (likely(!netdata_exit)) {
worker_is_idle();
uv_run(loop, UV_RUN_DEFAULT);
/* wait for commands */
@ -427,6 +456,10 @@ void aclk_database_worker(void *arg)
opcode = cmd.opcode;
++cmd_batch_size;
if(likely(opcode != ACLK_DATABASE_NOOP))
worker_is_busy(opcode);
switch (opcode) {
case ACLK_DATABASE_NOOP:
/* the command queue was empty, do nothing */
@ -439,6 +472,7 @@ void aclk_database_worker(void *arg)
if (wc->host == localhost)
sql_check_aclk_table_list(wc);
break;
case ACLK_DATABASE_DELETE_HOST:
debug(D_ACLK_SYNC,"Cleaning ACLK tables for %s", (char *) cmd.data);
sql_delete_aclk_table_list(wc, cmd);
@ -577,6 +611,8 @@ void aclk_database_worker(void *arg)
wc->host->dbsync_worker = NULL;
freez(wc);
rrd_unlock();
worker_unregister();
return;
error_after_timer_init:
@ -585,6 +621,7 @@ error_after_async_init:
fatal_assert(0 == uv_loop_close(loop));
error_after_loop_init:
freez(loop);
worker_unregister();
}
// -------------------------------------------------------------

View File

@ -133,7 +133,11 @@ enum aclk_database_opcode {
ACLK_DATABASE_PUSH_ALERT_CONFIG,
ACLK_DATABASE_PUSH_ALERT_SNAPSHOT,
ACLK_DATABASE_QUEUE_REMOVED_ALERTS,
ACLK_DATABASE_TIMER
ACLK_DATABASE_TIMER,
// leave this last
// we need it to check for worker utilization
ACLK_MAX_ENUMERATIONS_DEFINED
};
struct aclk_chart_payload_t {

View File

@ -573,6 +573,8 @@ static inline int check_if_resumed_from_suspension(void) {
}
static void health_main_cleanup(void *ptr) {
worker_unregister();
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@ -695,7 +697,31 @@ static void init_pending_foreach_alarms(RRDHOST *host) {
*
* @return It always returns NULL
*/
// job types for the health monitoring ("HEALTH") worker thread - each value
// indexes a slot in the per-worker utilization statistics, so they must stay
// below WORKER_UTILIZATION_MAX_JOB_TYPES (enforced by the check below)
#define WORKER_HEALTH_JOB_RRD_LOCK 0
#define WORKER_HEALTH_JOB_HOST_LOCK 1
#define WORKER_HEALTH_JOB_DB_QUERY 2
#define WORKER_HEALTH_JOB_CALC_EVAL 3
#define WORKER_HEALTH_JOB_WARNING_EVAL 4
#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
#endif
void *health_main(void *ptr) {
worker_register("HEALTH");
worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
netdata_thread_cleanup_push(health_main_cleanup, ptr);
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
@ -743,6 +769,7 @@ void *health_main(void *ptr) {
marked_aclk_reload_loop = loop;
#endif
worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
rrd_rdlock();
RRDHOST *host;
@ -772,6 +799,7 @@ void *health_main(void *ptr) {
init_pending_foreach_alarms(host);
worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
rrdhost_rdlock(host);
// the first loop is to lookup values from the db
@ -786,6 +814,7 @@ void *health_main(void *ptr) {
rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
if (!rrdcalc_isrepeating(rc)) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now = now_realtime_sec();
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
@ -820,6 +849,8 @@ void *health_main(void *ptr) {
// if there is database lookup, do it
if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
@ -876,6 +907,8 @@ void *health_main(void *ptr) {
// if there is calculation expression, run it
if (unlikely(rc->calculation)) {
worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
if (unlikely(!expression_evaluate(rc->calculation))) {
// calculation failed
rc->value = NAN;
@ -924,6 +957,8 @@ void *health_main(void *ptr) {
// check the warning expression
if (likely(rc->warning)) {
worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
if (unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
@ -948,6 +983,8 @@ void *health_main(void *ptr) {
// check the critical expression
if (likely(rc->critical)) {
worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
if (unlikely(!expression_evaluate(rc->critical))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
@ -1005,6 +1042,7 @@ void *health_main(void *ptr) {
// check if the new status and the old differ
if (status != rc->status) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
int delay = 0;
// apply trigger hysteresis
@ -1086,6 +1124,7 @@ void *health_main(void *ptr) {
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
rc->last_repeat = now;
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
ALARM_ENTRY *ae = health_create_alarm_entry(
@ -1118,6 +1157,7 @@ void *health_main(void *ptr) {
// execute notifications
// and cleanup
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
health_alarm_log_process(host);
if (unlikely(netdata_exit)) {
@ -1156,6 +1196,7 @@ void *health_main(void *ptr) {
now = now_realtime_sec();
if(now < next_run) {
worker_is_idle();
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
now = now_realtime_sec();

View File

@ -26,6 +26,7 @@ SUBDIRS = \
storage_number \
threads \
url \
worker_utilization \
tests \
$(NULL)

View File

@ -7,6 +7,9 @@
static clockid_t clock_boottime_to_use = CLOCK_MONOTONIC;
static clockid_t clock_monotonic_to_use = CLOCK_MONOTONIC;
usec_t clock_monotonic_resolution = 1000;
usec_t clock_realtime_resolution = 1000;
#ifndef HAVE_CLOCK_GETTIME
inline int clock_gettime(clockid_t clk_id, struct timespec *ts) {
struct timeval tv;
@ -20,15 +23,19 @@ inline int clock_gettime(clockid_t clk_id, struct timespec *ts) {
}
#endif
// When running a binary with CLOCK_MONOTONIC_COARSE defined on a system with a linux kernel older than Linux 2.6.32 the
// clock_gettime(2) system call fails with EINVAL. In that case it must fall-back to CLOCK_MONOTONIC.
// Similar to CLOCK_MONOTONIC, but provides access to a raw hardware-based time that is not subject to NTP adjustments
// or the incremental adjustments performed by adjtime(3). This clock does not count time that the system is suspended
static void test_clock_monotonic_coarse(void) {
static void test_clock_monotonic_raw(void) {
#ifdef CLOCK_MONOTONIC_RAW
struct timespec ts;
if(clock_gettime(CLOCK_MONOTONIC_COARSE, &ts) == -1 && errno == EINVAL)
if(clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == -1 && errno == EINVAL)
clock_monotonic_to_use = CLOCK_MONOTONIC;
else
clock_monotonic_to_use = CLOCK_MONOTONIC_COARSE;
clock_monotonic_to_use = CLOCK_MONOTONIC_RAW;
#else
clock_monotonic_to_use = CLOCK_MONOTONIC;
#endif
}
// When running a binary with CLOCK_BOOTTIME defined on a system with a linux kernel older than Linux 2.6.39 the
@ -42,14 +49,31 @@ static void test_clock_boottime(void) {
clock_boottime_to_use = CLOCK_BOOTTIME;
}
// Query the kernel for the resolution of the given clock and return it in
// microseconds. Callers (clocks_init) use the result as a divisor/modulo,
// so a failure falls back to 1ms instead of returning garbage.
static usec_t get_clock_resolution(clockid_t clock) {
    struct timespec ts;

    // don't use an uninitialized ts if the kernel rejects the clock id
    if(clock_getres(clock, &ts) != 0)
        return 1000;

    // tv_nsec is in nanoseconds: divide by NSEC_PER_USEC to convert to
    // microseconds (the original multiplied, inflating the result 1,000,000x)
    return (usec_t)ts.tv_sec * USEC_PER_SEC + (usec_t)ts.tv_nsec / NSEC_PER_USEC;
}
// perform any initializations required for clocks
void clocks_init(void) {
    // monotonic raw has to be tested before boottime,
    // because test_clock_boottime() uses clock_monotonic_to_use as its fallback
    test_clock_monotonic_raw();

    // boottime has to be tested after monotonic raw
    test_clock_boottime();

    clock_monotonic_resolution = get_clock_resolution(clock_monotonic_to_use);
    clock_realtime_resolution = get_clock_resolution(CLOCK_REALTIME);

    // if for any reason these are zero, netdata will crash
    // since we use them as modulo to calculations
    if(!clock_realtime_resolution)
        clock_realtime_resolution = 1000;

    if(!clock_monotonic_resolution)
        clock_monotonic_resolution = 1000;
}
inline time_t now_sec(clockid_t clk_id) {
@ -155,8 +179,110 @@ inline usec_t dt_usec(struct timeval *now, struct timeval *old) {
return (ts1 > ts2) ? (ts1 - ts2) : (ts2 - ts1);
}
// Sleep until the given absolute CLOCK_REALTIME timestamp (in microseconds since epoch).
// EINTR restarts the sleep towards the same absolute deadline.
// Any other clock_nanosleep() error is logged once per error class, and the function
// falls back to a single relative sleep for the time remaining until the deadline.
void sleep_to_absolute_time(usec_t usec) {
    static int einval_printed = 0, enotsup_printed = 0, eunknown_printed = 0;
    clockid_t clock = CLOCK_REALTIME;

    struct timespec req = {
        .tv_sec = (time_t)(usec / USEC_PER_SEC),
        .tv_nsec = (suseconds_t)((usec % USEC_PER_SEC) * NSEC_PER_USEC)
    };

    int ret = 0;
    while( (ret = clock_nanosleep(clock, TIMER_ABSTIME, &req, NULL)) != 0 ) {
        if(ret == EINTR)
            continue; // interrupted - resume sleeping to the same absolute time

        if (ret == EINVAL) {
            if (!einval_printed) {
                einval_printed++;
                error(
                    "Invalid time given to clock_nanosleep(): clockid = %d, tv_sec = %ld, tv_nsec = %ld",
                    clock,
                    req.tv_sec,
                    req.tv_nsec);
            }
        } else if (ret == ENOTSUP) {
            if (!enotsup_printed) {
                enotsup_printed++;
                error(
                    "Invalid clock id given to clock_nanosleep(): clockid = %d, tv_sec = %ld, tv_nsec = %ld",
                    clock,
                    req.tv_sec,
                    req.tv_nsec);
            }
        } else {
            if (!eunknown_printed) {
                eunknown_printed++;
                error(
                    "Unknown return value %d from clock_nanosleep(): clockid = %d, tv_sec = %ld, tv_nsec = %ld",
                    ret,
                    clock,
                    req.tv_sec,
                    req.tv_nsec);
            }
        }

        // clock_nanosleep() cannot serve this request on this system.
        // Fall back to one relative sleep for the remaining duration and return.
        // The previous code called sleep_usec(usec) here - but usec is an absolute
        // timestamp, so that slept for a practically infinite duration, and then
        // retried clock_nanosleep() in a loop, repeating the same failure forever.
        usec_t now = now_realtime_usec();
        if(usec > now)
            sleep_usec(usec - now);
        return;
    }
}
// number of heartbeat threads for which alignment drift statistics are kept
// (only the first HEARTBEAT_ALIGNMENT_STATISTICS_SIZE heartbeats to initialize get a slot)
#define HEARTBEAT_ALIGNMENT_STATISTICS_SIZE 10

// serializes the assignment of statistics slots in heartbeat_init()
netdata_mutex_t heartbeat_alignment_mutex = NETDATA_MUTEX_INITIALIZER;

// next statistics slot to hand out; only ever incremented
static size_t heartbeat_alignment_id = 0;

// per heartbeat-thread drift accounting
struct heartbeat_thread_statistics {
    size_t sequence;    // incremented on every heartbeat wakeup, so readers can detect fresh samples
    usec_t dt;          // accumulated wakeup drift (actual wakeup time minus requested time), in usec
};

// one slot per tracked heartbeat thread, indexed by heartbeat_t.statistics_id
static struct heartbeat_thread_statistics heartbeat_alignment_values[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 };
// Report the heartbeat wakeup drift of the tracked heartbeat threads,
// as deltas since the previous call to this function.
// min/max/average are the per-thread accumulated drift (usec) since last call;
// count is the number of threads that produced new samples.
// Any of the output pointers may be NULL.
void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr, size_t *count_ptr) {
    struct heartbeat_thread_statistics current[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE];
    static struct heartbeat_thread_statistics old[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 };

    // snapshot the values the heartbeat threads update
    // NOTE(review): read without holding heartbeat_alignment_mutex - assumes
    // occasionally torn reads are acceptable for statistics; confirm.
    memcpy(current, heartbeat_alignment_values, sizeof(struct heartbeat_thread_statistics) * HEARTBEAT_ALIGNMENT_STATISTICS_SIZE);

    usec_t min = 0, max = 0, total = 0, average = 0;
    size_t i, count = 0;
    for(i = 0; i < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE ;i++) {
        // a thread with an unchanged sequence produced no new samples since last call
        if(current[i].sequence == old[i].sequence) continue;
        usec_t value = current[i].dt - old[i].dt;

        if(!count) {
            min = max = total = value;
            count = 1;
        }
        else {
            total += value;
            if(value < min) min = value;
            if(value > max) max = value;
            count++;
        }
    }

    // guard the division: when no thread produced new samples, count is zero
    // (the previous code divided unconditionally, crashing with SIGFPE)
    if(count)
        average = total / count;

    if(min_ptr) *min_ptr = min;
    if(max_ptr) *max_ptr = max;
    if(average_ptr) *average_ptr = average;
    if(count_ptr) *count_ptr = count;

    // remember this snapshot, so the next call reports deltas against it
    memcpy(old, current, sizeof(struct heartbeat_thread_statistics) * HEARTBEAT_ALIGNMENT_STATISTICS_SIZE);
}
// Initialize a heartbeat.
// Each heartbeat gets a randomness offset of up to 250ms (aligned to the realtime
// clock resolution) so the wakeups of the various threads are spread out, plus a
// slot in the alignment statistics array (first HEARTBEAT_ALIGNMENT_STATISTICS_SIZE
// heartbeats only).
inline void heartbeat_init(heartbeat_t *hb) {
    // zero means "no heartbeat yet" - heartbeat_next() returns dt = 0 the first time
    // (the stale assignment to the removed hb->monotonic field is gone)
    hb->realtime = 0ULL;
    hb->randomness = 250 * USEC_PER_MS + ((now_realtime_usec() * clock_realtime_resolution) % (250 * USEC_PER_MS));
    hb->randomness -= (hb->randomness % clock_realtime_resolution);

    netdata_mutex_lock(&heartbeat_alignment_mutex);
    hb->statistics_id = heartbeat_alignment_id;
    heartbeat_alignment_id++;
    netdata_mutex_unlock(&heartbeat_alignment_mutex);

    // claim and reset our statistics slot, if one is available
    if(hb->statistics_id < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE) {
        heartbeat_alignment_values[hb->statistics_id].dt = 0;
        heartbeat_alignment_values[hb->statistics_id].sequence = 0;
    }
}
// waits for the next heartbeat
@ -164,96 +290,73 @@ inline void heartbeat_init(heartbeat_t *hb) {
// it returns the dt using the realtime clock
// Wait for the next heartbeat tick.
// Sleeps until the next multiple of tick (plus this heartbeat's randomness offset)
// on the realtime clock, records the wakeup drift for the statistics, and returns
// the realtime microseconds elapsed since the previous heartbeat (0 the first time).
// NOTE: the SOURCE interleaved the removed monotonic-clock implementation with this
// realtime one; this is the reconstructed realtime version.
usec_t heartbeat_next(heartbeat_t *hb, usec_t tick) {
    // cap randomness to half a tick, otherwise whole ticks could be skipped
    if(unlikely(hb->randomness > tick / 2)) {
        // TODO: The heartbeat tick should be specified at the heartbeat_init() function
        usec_t tmp = (now_realtime_usec() * clock_realtime_resolution) % (tick / 2);
        info("heartbeat randomness of %llu is too big for a tick of %llu - setting it to %llu", hb->randomness, tick, tmp);
        hb->randomness = tmp;
    }

    usec_t dt;
    usec_t now = now_realtime_usec();
    usec_t next = now - (now % tick) + tick + hb->randomness;

    // align the next time we want to the clock resolution
    if(next % clock_realtime_resolution)
        next = next - (next % clock_realtime_resolution) + clock_realtime_resolution;

    // sleep_usec() has a loop to guarantee we will sleep for at least the requested time.
    // According to the specs, when we sleep for a relative time, clock adjustments should
    // not affect the duration we sleep.
    sleep_usec(next - now);
    now = now_realtime_usec();
    dt = now - hb->realtime;

    // accumulate the wakeup drift for the alignment statistics, if we have a slot
    if(hb->statistics_id < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE) {
        heartbeat_alignment_values[hb->statistics_id].dt += now - next;
        heartbeat_alignment_values[hb->statistics_id].sequence++;
    }

    if(unlikely(now < next)) {
        errno = 0;
        error("heartbeat clock: woke up %llu microseconds earlier than expected (can be due to the CLOCK_REALTIME set to the past).", next - now);
    }
    else if(unlikely(now - next > tick / 2)) {
        errno = 0;
        error("heartbeat clock: woke up %llu microseconds later than expected (can be due to system load or the CLOCK_REALTIME set to the future).", now - next);
    }

    if(unlikely(!hb->realtime)) {
        // the first time return zero
        dt = 0;
    }

    hb->realtime = now;
    return dt;
}
// returned the elapsed time, since the last heartbeat
// using the monotonic clock
// NOTE(review): this reads hb->monotonic, which the realtime-based heartbeat_next()
// in this file no longer appears to update - confirm whether this helper (and the
// heartbeat_t.monotonic field) are still used by any caller.
inline usec_t heartbeat_monotonic_dt_to_now_usec(heartbeat_t *hb) {
    // a NULL heartbeat, or one that never ticked, reports zero elapsed time
    if(!hb || !hb->monotonic) return 0ULL;
    return now_monotonic_usec() - hb->monotonic;
}
// Sleep for the given number of microseconds (relative duration).
// EINTR is handled by resuming the sleep for the remaining time, so the total
// sleep is at least the requested duration. Any other error is logged and the
// sleep is abandoned.
// NOTE: the SOURCE interleaved the removed int/nanosleep/usleep implementation
// with this void/clock_nanosleep one; this is the reconstructed new version.
void sleep_usec(usec_t usec) {
    // we expect microseconds (1.000.000 per second)
    // but timespec is nanoseconds (1.000.000.000 per second)
    struct timespec rem, req = {
        .tv_sec = (time_t) (usec / USEC_PER_SEC),
        .tv_nsec = (suseconds_t) ((usec % USEC_PER_SEC) * NSEC_PER_USEC)
    };

    // clock_nanosleep() returns the error number directly (it does not set errno);
    // storing it in errno lets the EINTR check and the logger use it uniformly
    while ((errno = clock_nanosleep(CLOCK_REALTIME, 0, &req, &rem)) != 0) {
        if (likely(errno == EINTR)) {
            debug(D_SYSTEM, "nanosleep() interrupted (while sleeping for %llu microseconds).", usec);
            // resume sleeping for the remaining time
            req.tv_sec = rem.tv_sec;
            req.tv_nsec = rem.tv_nsec;
        } else {
            error("Cannot clock_nanosleep(CLOCK_REALTIME) for %llu microseconds.", usec);
            break;
        }
    }
}
static inline collected_number uptime_from_boottime(void) {
#ifdef CLOCK_BOOTTIME_IS_AVAILABLE
return now_boottime_usec() / 1000;
return (collected_number)(now_boottime_usec() / USEC_PER_MS);
#else
error("uptime cannot be read from CLOCK_BOOTTIME on this system.");
return 0;

View File

@ -22,8 +22,9 @@ typedef unsigned long long usec_t;
typedef long long susec_t;
typedef struct heartbeat {
usec_t monotonic;
usec_t realtime;
usec_t randomness;
size_t statistics_id;
} heartbeat_t;
/* Linux value is as good as any other */
@ -36,20 +37,14 @@ typedef struct heartbeat {
#define CLOCK_MONOTONIC CLOCK_REALTIME
#endif
/* Prefer CLOCK_MONOTONIC_COARSE where available to reduce overhead. It has the same semantics as CLOCK_MONOTONIC */
#ifndef CLOCK_MONOTONIC_COARSE
/* fallback to CLOCK_MONOTONIC if not available */
#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
#endif
#ifndef CLOCK_BOOTTIME
#ifdef CLOCK_UPTIME
/* CLOCK_BOOTTIME falls back to CLOCK_UPTIME on FreeBSD */
#define CLOCK_BOOTTIME CLOCK_UPTIME
#else // CLOCK_UPTIME
/* CLOCK_BOOTTIME falls back to CLOCK_MONOTONIC */
#define CLOCK_BOOTTIME CLOCK_MONOTONIC_COARSE
/* CLOCK_BOOTTIME falls back to CLOCK_REALTIME */
#define CLOCK_BOOTTIME CLOCK_REALTIME
#endif // CLOCK_UPTIME
#else // CLOCK_BOOTTIME
@ -115,8 +110,6 @@ extern int clock_gettime(clockid_t clk_id, struct timespec *ts);
* All now_*_sec() functions return the time in seconds from the appropriate clock, or 0 on error.
* All now_*_usec() functions return the time in microseconds from the appropriate clock, or 0 on error.
*
* Most functions will attempt to use CLOCK_MONOTONIC_COARSE if available to reduce contention overhead and improve
* performance scaling. If high precision is required please use one of the available now_*_high_precision_* functions.
*/
extern int now_realtime_timeval(struct timeval *tv);
extern time_t now_realtime_sec(void);
@ -146,10 +139,9 @@ extern void heartbeat_init(heartbeat_t *hb);
*/
extern usec_t heartbeat_next(heartbeat_t *hb, usec_t tick);
/* Returns elapsed time in microseconds since last heartbeat */
extern usec_t heartbeat_monotonic_dt_to_now_usec(heartbeat_t *hb);
extern void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr, size_t *count_ptr);
extern int sleep_usec(usec_t usec);
extern void sleep_usec(usec_t usec);
extern void clocks_init(void);
@ -160,4 +152,9 @@ extern int now_timeval(clockid_t clk_id, struct timeval *tv);
extern collected_number uptime_msec(char *filename);
extern usec_t clock_monotonic_resolution;
extern usec_t clock_realtime_resolution;
extern void sleep_to_absolute_time(usec_t usec);
#endif /* NETDATA_CLOCKS_H */

View File

@ -346,6 +346,7 @@ extern char *netdata_configured_host_prefix;
#include "health/health.h"
#include "string/utf8.h"
#include "onewayalloc/onewayalloc.h"
#include "worker_utilization/worker_utilization.h"
// BEWARE: Outside of the C code this also exists in alarm-notify.sh
#define DEFAULT_CLOUD_BASE_URL "https://app.netdata.cloud"

View File

@ -0,0 +1,8 @@
# SPDX-License-Identifier: GPL-3.0-or-later
AUTOMAKE_OPTIONS = subdir-objects
MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
dist_noinst_DATA = \
README.md \
$(NULL)

View File

@ -0,0 +1,58 @@
<!--
title: "Worker Utilization"
custom_edit_url: https://github.com/netdata/netdata/edit/master/libnetdata/worker_utilization/README.md
-->
# Worker Utilization
This library is to be used when there are 1 or more worker threads accepting requests of some kind and servicing them.
The goal is to provide a very simple way to monitor worker threads utilization, as a percentage of the time they are busy and the amount of requests served.
## How to use
When a working thread starts, call:
```c
void worker_register(const char *name);
```
This will create the necessary structures for the library to work.
No need to keep a pointer to them. They are allocated as `__thread` variables.
When the thread stops, call:
```c
void worker_unregister(void)
```
Again, no parameters, or return values.
When you are about to do some work in the working thread, call:
```c
void worker_is_busy(size_t job_id)
```
When you finish doing the job, call:
```c
void worker_is_idle(void)
```
Calls to `worker_is_busy()` can be made one after another (without calling
`worker_is_idle()` between them) to switch jobs without losing any time between
them and eliminating one of the 2 clock calls involved.
## Implementation details
Totally lockless, extremely fast, it should not introduce any kind of problems to the workers.
Every time `worker_is_busy()` or `worker_is_idle()` are called, a call to `now_realtime_usec()`
is done and a couple of variables are updated. That's it!
The worker does not need to update the variables regularly. Based on the last status of the worker,
the statistics collector of netdata will calculate if the thread is busy or idle all the time or
part of the time. Works well for both thousands of jobs per second and unlimited working time
(being totally busy with a single request for ages).
The statistics collector is called by the global statistics thread of netdata. So, even if the workers
are extremely busy with their jobs, netdata will be able to know how busy they are.

View File

@ -0,0 +1,201 @@
#include "worker_utilization.h"
// the two states a worker can be in
#define WORKER_IDLE 'I'
#define WORKER_BUSY 'B'

// per job type counters, kept inside each worker
struct worker_job_type {
    char name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH + 1];

    // updated by the worker thread
    size_t worker_jobs_started;
    usec_t worker_busy_time;

    // last values collected by the statistics thread (used to compute deltas)
    size_t statistics_jobs_started;
    usec_t statistics_busy_time;
};

struct worker {
    pid_t pid;                  // OS thread id, for matching /proc/self/task/PID
    const char *tag;            // the netdata thread tag (owned copy)
    const char *workname;       // the worker family name (owned copy)
    uint32_t workname_hash;     // cached hash of workname, for fast matching in workers_foreach()

    // only one variable is set by our statistics callers
    usec_t statistics_last_checkpoint;
    size_t statistics_last_jobs_started;
    usec_t statistics_last_busy_time;

    // the worker controlled variables
    size_t job_id;
    volatile size_t jobs_started;
    volatile usec_t busy_time;
    volatile usec_t last_action_timestamp;
    volatile char last_action;

    struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES];

    struct worker *next;        // linked list of all registered workers
};

// global list of all registered workers, protected by base_lock
static netdata_mutex_t base_lock = NETDATA_MUTEX_INITIALIZER;
static struct worker *base = NULL;

// each thread has its own worker structure - no locking needed for updates
static __thread struct worker *worker = NULL;
void worker_register(const char *workname) {
if(unlikely(worker)) return;
worker = callocz(1, sizeof(struct worker));
worker->pid = gettid();
worker->tag = strdupz(netdata_thread_tag());
worker->workname = strdupz(workname);
worker->workname_hash = simple_hash(worker->workname);
usec_t now = now_realtime_usec();
worker->statistics_last_checkpoint = now;
worker->last_action_timestamp = now;
worker->last_action = WORKER_IDLE;
netdata_mutex_lock(&base_lock);
worker->next = base;
base = worker;
netdata_mutex_unlock(&base_lock);
}
// Give a human readable name to a job type id of the calling thread's worker.
// Silently does nothing if the thread has no registered worker; logs and
// ignores out-of-range job ids. Names longer than
// WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH are truncated.
void worker_register_job_name(size_t job_id, const char *name) {
    if(unlikely(!worker)) return;

    if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES)) {
        error("WORKER_UTILIZATION: job_id %zu is too big. Max is %zu", job_id, (size_t)(WORKER_UTILIZATION_MAX_JOB_TYPES - 1));
        return;
    }

    // strncpy() does not NUL-terminate when the source reaches the limit;
    // terminate explicitly instead of relying on the buffer having been
    // zeroed at allocation time (the buffer is MAX_JOB_NAME_LENGTH + 1 bytes,
    // so the terminator index is always in range)
    strncpy(worker->per_job_type[job_id].name, name, WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH);
    worker->per_job_type[job_id].name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH] = '\0';
}
// Unregister the calling thread's worker: unlink it from the global list and
// free everything it owns. Safe to call from a thread that never registered.
void worker_unregister(void) {
    if(unlikely(!worker)) return;

    // unlink from the global list, using a pointer-to-pointer walk
    // (handles the head of the list and interior nodes uniformly)
    netdata_mutex_lock(&base_lock);
    struct worker **pp = &base;
    while(*pp && *pp != worker)
        pp = &(*pp)->next;
    if(*pp == worker)
        *pp = worker->next;
    netdata_mutex_unlock(&base_lock);

    freez((void *)worker->tag);
    freez((void *)worker->workname);
    freez(worker);

    worker = NULL;
}
// Account the time since the last transition as busy time of the current job
// and switch the worker to IDLE. Callers ensure the worker is currently BUSY.
static inline void worker_is_idle_with_time(usec_t now) {
    usec_t delta = now - worker->last_action_timestamp;
    worker->busy_time += delta;
    worker->per_job_type[worker->job_id].worker_busy_time += delta;

    // the worker was busy
    // set it to idle before we set the timestamp
    worker->last_action = WORKER_IDLE;

    // never move the timestamp backwards
    // NOTE(review): presumably this protects a concurrent statistics reader from
    // seeing the timestamp jump back - confirm against workers_foreach()
    if(likely(worker->last_action_timestamp < now))
        worker->last_action_timestamp = now;
}
// Mark the calling thread's worker as idle, accounting the busy time of the
// job it was running. A no-op when the thread has no worker or is already idle.
void worker_is_idle(void) {
    if(unlikely(!worker || worker->last_action != WORKER_BUSY))
        return;

    worker_is_idle_with_time(now_realtime_usec());
}
// Mark the calling thread's worker as busy with the given job type.
// Can be called back-to-back (job after job) without an intervening
// worker_is_idle(); in that case the previous job's busy time is accounted
// first, using a single clock reading for both transitions.
void worker_is_busy(size_t job_id) {
    if(unlikely(!worker)) return;

    // out-of-range job ids are folded into job type 0
    if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES))
        job_id = 0;

    usec_t now = now_realtime_usec();

    // busy -> busy transition: close the previous job before starting the new one
    if(worker->last_action == WORKER_BUSY)
        worker_is_idle_with_time(now);

    // the worker was idle
    // set the timestamp and then set it to busy
    worker->job_id = job_id;
    worker->per_job_type[job_id].worker_jobs_started++;
    worker->jobs_started++;
    worker->last_action_timestamp = now;
    worker->last_action = WORKER_BUSY;
}
// statistics interface
// Walk all registered workers of the given family (workname) and report, via
// the callback, the busy time, jobs started and per-job-type deltas since the
// previous call to this function. Called by the statistics thread; the base
// list is protected by base_lock, but the per-worker counters are read without
// a lock (the worker thread keeps updating them concurrently - see notes below).
void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pid, const char *thread_tag, size_t utilization_usec, size_t duration_usec, size_t jobs_started, size_t is_running, const char **job_types_names, size_t *job_types_jobs_started, usec_t *job_types_busy_time), void *data) {
    netdata_mutex_lock(&base_lock);
    uint32_t hash = simple_hash(workname);
    usec_t busy_time, delta;
    size_t i, jobs_started, jobs_running;

    struct worker *p;
    for(p = base; p ; p = p->next) {
        // fast hash comparison first, string comparison only on hash match
        if(hash != p->workname_hash || strcmp(workname, p->workname)) continue;

        usec_t now = now_realtime_usec();

        // find per job type statistics (deltas since the last collection)
        const char *per_job_type_name[WORKER_UTILIZATION_MAX_JOB_TYPES];
        size_t per_job_type_jobs_started[WORKER_UTILIZATION_MAX_JOB_TYPES];
        usec_t per_job_type_busy_time[WORKER_UTILIZATION_MAX_JOB_TYPES];
        for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
            per_job_type_name[i] = p->per_job_type[i].name;

            size_t tmp_jobs_started = p->per_job_type[i].worker_jobs_started;
            per_job_type_jobs_started[i] = tmp_jobs_started - p->per_job_type[i].statistics_jobs_started;
            p->per_job_type[i].statistics_jobs_started = tmp_jobs_started;

            usec_t tmp_busy_time = p->per_job_type[i].worker_busy_time;
            per_job_type_busy_time[i] = tmp_busy_time - p->per_job_type[i].statistics_busy_time;
            p->per_job_type[i].statistics_busy_time = tmp_busy_time;
        }

        // get a copy of the worker variables
        // (the worker thread may update the originals while we work)
        usec_t worker_busy_time = p->busy_time;
        size_t worker_jobs_started = p->jobs_started;
        char worker_last_action = p->last_action;
        usec_t worker_last_action_timestamp = p->last_action_timestamp;

        // this is the only variable both the worker thread and the statistics thread are writing
        // we set this only when the worker is busy, so that worker will not
        // accumulate all the busy time, but only the time after the point we collected statistics
        // (the re-checks of p->last_action/p->last_action_timestamp detect a state
        // change by the worker between the copy above and this write)
        if(worker_last_action == WORKER_BUSY && p->last_action_timestamp == worker_last_action_timestamp && p->last_action == WORKER_BUSY)
            p->last_action_timestamp = now;

        // calculate delta busy time
        busy_time = worker_busy_time - p->statistics_last_busy_time;
        p->statistics_last_busy_time = worker_busy_time;

        // calculate delta jobs done
        jobs_started = worker_jobs_started - p->statistics_last_jobs_started;
        p->statistics_last_jobs_started = worker_jobs_started;

        jobs_running = 0;
        if(worker_last_action == WORKER_BUSY) {
            // the worker is still busy with something
            // let's add that busy time to the reported one
            busy_time += now - worker_last_action_timestamp;
            jobs_running = 1;
        }

        delta = now - p->statistics_last_checkpoint;
        p->statistics_last_checkpoint = now;

        callback(data, p->pid, p->tag, busy_time, delta, jobs_started, jobs_running, per_job_type_name, per_job_type_jobs_started, per_job_type_busy_time);
    }

    netdata_mutex_unlock(&base_lock);
}

View File

@ -0,0 +1,22 @@
#ifndef WORKER_UTILIZATION_H
#define WORKER_UTILIZATION_H 1

#include "../libnetdata.h"

// workers interfaces

// maximum number of distinct job types a single worker can report
#define WORKER_UTILIZATION_MAX_JOB_TYPES 50
// maximum length of a job type name, excluding the terminating NUL
#define WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH 25

// register the calling thread as a worker of the given family (workname);
// call once when the worker thread starts
extern void worker_register(const char *workname);
// give a human readable name to a job type id of the calling thread's worker
extern void worker_register_job_name(size_t job_id, const char *name);
// unregister the calling thread's worker; call when the worker thread stops
extern void worker_unregister(void);

// mark the calling thread's worker as idle
extern void worker_is_idle(void);
// mark the calling thread's worker as busy with the given job type;
// may be called back-to-back to switch jobs without going idle in between
extern void worker_is_busy(size_t job_id);

// statistics interface

// walk all workers of the given family (workname), invoking the callback with
// the deltas accumulated since the previous invocation
extern void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pid, const char *thread_tag, size_t utilization_usec, size_t duration_usec, size_t jobs_started, size_t is_running, const char **job_types_names, size_t *job_types_jobs_started, usec_t *job_types_busy_time), void *data);

#endif // WORKER_UTILIZATION_H

View File

@ -358,6 +358,10 @@ void TrainableHost::trainDimension(Dimension *D, const TimePoint &NowTP) {
void TrainableHost::train() {
Duration<double> MaxSleepFor = Seconds{10 * updateEvery()};
worker_register("MLTRAIN");
worker_register_job_name(0, "dimensions");
worker_is_busy(0);
while (!netdata_exit) {
netdata_thread_testcancel();
netdata_thread_disable_cancelability();
@ -378,11 +382,23 @@ void TrainableHost::train() {
if (RealDuration >= AllottedDuration)
continue;
worker_is_idle();
SleepFor = std::min(AllottedDuration - RealDuration, MaxSleepFor);
std::this_thread::sleep_for(SleepFor);
worker_is_busy(0);
}
}
#define WORKER_JOB_DETECT_DIMENSION 0
#define WORKER_JOB_UPDATE_DETECTION_CHART 1
#define WORKER_JOB_UPDATE_ANOMALY_RATES 2
#define WORKER_JOB_UPDATE_CHARTS 3
#define WORKER_JOB_SAVE_ANOMALY_EVENT 4
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5
#endif
void DetectableHost::detectOnce() {
auto P = BRW.insert(WindowAnomalyRate >= Cfg.HostAnomalyRateThreshold);
BitRateWindow::Edge Edge = P.first;
@ -408,6 +424,8 @@ void DetectableHost::detectOnce() {
DimsOverThreshold.reserve(DimensionsMap.size());
for (auto &DP : DimensionsMap) {
worker_is_busy(WORKER_JOB_DETECT_DIMENSION);
Dimension *D = DP.second;
auto P = D->detect(WindowLength, ResetBitCounter);
@ -434,6 +452,7 @@ void DetectableHost::detectOnce() {
}
if (CollectAnomalyRates) {
worker_is_busy(WORKER_JOB_UPDATE_ANOMALY_RATES);
AnomalyRateTimer = 0;
rrdset_done(AnomalyRateRS);
}
@ -442,6 +461,7 @@ void DetectableHost::detectOnce() {
this->NumNormalDimensions = NumNormalDimensions;
this->NumTrainedDimensions = NumTrainedDimensions;
worker_is_busy(WORKER_JOB_UPDATE_CHARTS);
updateDimensionsChart(getRH(), NumTrainedDimensions, NumNormalDimensions, NumAnomalousDimensions);
updateRateChart(getRH(), WindowAnomalyRate * 10000.0);
updateWindowLengthChart(getRH(), WindowLength);
@ -454,6 +474,8 @@ void DetectableHost::detectOnce() {
if (!NewAnomalyEvent || (DimsOverThreshold.size() == 0))
return;
worker_is_busy(WORKER_JOB_SAVE_ANOMALY_EVENT);
std::sort(DimsOverThreshold.begin(), DimsOverThreshold.end());
std::reverse(DimsOverThreshold.begin(), DimsOverThreshold.end());
@ -476,6 +498,13 @@ void DetectableHost::detectOnce() {
}
void DetectableHost::detect() {
worker_register("MLDETECT");
worker_register_job_name(WORKER_JOB_DETECT_DIMENSION, "dimensions");
worker_register_job_name(WORKER_JOB_UPDATE_DETECTION_CHART, "detection chart");
worker_register_job_name(WORKER_JOB_UPDATE_ANOMALY_RATES, "anomaly rates");
worker_register_job_name(WORKER_JOB_UPDATE_CHARTS, "charts");
worker_register_job_name(WORKER_JOB_SAVE_ANOMALY_EVENT, "anomaly event");
std::this_thread::sleep_for(Seconds{10});
heartbeat_t HB;
@ -483,10 +512,13 @@ void DetectableHost::detect() {
while (!netdata_exit) {
netdata_thread_testcancel();
worker_is_idle();
heartbeat_next(&HB, updateEvery() * USEC_PER_SEC);
netdata_thread_disable_cancelability();
detectOnce();
worker_is_busy(WORKER_JOB_UPDATE_DETECTION_CHART);
updateDetectionChart(getRH());
netdata_thread_enable_cancelability();
}

View File

@ -133,10 +133,13 @@ int parser_add_keyword(PARSER *parser, char *keyword, keyword_function func)
tmp_keyword = callocz(1, sizeof(*tmp_keyword));
tmp_keyword->worker_job_id = parser->worker_job_ids++;
tmp_keyword->keyword = strdupz(keyword);
tmp_keyword->keyword_hash = keyword_hash;
tmp_keyword->func[tmp_keyword->func_no++] = (void *) func;
worker_register_job_name(tmp_keyword->worker_job_id, tmp_keyword->keyword);
tmp_keyword->next = parser->keyword;
parser->keyword = tmp_keyword;
return tmp_keyword->func_no;
@ -265,10 +268,12 @@ inline int parser_action(PARSER *parser, char *input)
uint32_t command_hash = simple_hash(command);
size_t worker_job_id;
while(tmp_keyword) {
if (command_hash == tmp_keyword->keyword_hash &&
(!strcmp(command, tmp_keyword->keyword))) {
action_function_list = &tmp_keyword->func[0];
worker_job_id = tmp_keyword->worker_job_id;
break;
}
tmp_keyword = tmp_keyword->next;
@ -284,12 +289,14 @@ inline int parser_action(PARSER *parser, char *input)
#endif
}
else {
worker_is_busy(worker_job_id);
while ((action_function = *action_function_list) != NULL) {
rc = action_function(words, parser->user, parser->plugins_action);
if (unlikely(rc == PARSER_RC_ERROR || rc == PARSER_RC_STOP))
break;
action_function_list++;
}
worker_is_idle();
}
if (likely(input == parser->buffer))

View File

@ -54,6 +54,7 @@ typedef enum parser_input_type {
typedef PARSER_RC (*keyword_function)(char **, void *, PLUGINSD_ACTION *plugins_action);
typedef struct parser_keyword {
size_t worker_job_id;
char *keyword;
uint32_t keyword_hash;
int func_no;
@ -67,6 +68,7 @@ typedef struct parser_data {
} PARSER_DATA;
typedef struct parser {
size_t worker_job_ids;
uint8_t version; // Parser version
RRDHOST *host;
void *input; // Input source e.g. stream

View File

@ -30,6 +30,8 @@ void destroy_receiver_state(struct receiver_state *rpt) {
}
static void rrdpush_receiver_thread_cleanup(void *ptr) {
worker_unregister();
static __thread int executed = 0;
if(!executed) {
executed = 1;
@ -716,7 +718,9 @@ void *rrdpush_receiver_thread(void *ptr) {
struct receiver_state *rpt = (struct receiver_state *)ptr;
info("STREAM %s [%s]:%s: receive thread created (task id %d)", rpt->hostname, rpt->client_ip, rpt->client_port, gettid());
worker_register("STREAMRCV");
rrdpush_receive(rpt);
worker_unregister();
netdata_thread_cleanup_pop(1);
return NULL;

View File

@ -2,6 +2,26 @@
#include "rrdpush.h"
#define WORKER_SENDER_JOB_CONNECT 0
#define WORKER_SENDER_JOB_PIPE_READ 1
#define WORKER_SENDER_JOB_SOCKET_RECEIVE 2
#define WORKER_SENDER_JOB_EXECUTE 3
#define WORKER_SENDER_JOB_SOCKET_SEND 4
#define WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE 5
#define WORKER_SENDER_JOB_DISCONNECT_OVERFLOW 6
#define WORKER_SENDER_JOB_DISCONNECT_TIMEOUT 7
#define WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR 8
#define WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR 9
#define WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR 10
#define WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED 11
#define WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR 12
#define WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR 13
#define WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION 14
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 15
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 15
#endif
extern struct config stream_config;
extern int netdata_use_ssl_on_stream;
extern char *netdata_ssl_ca_path;
@ -21,8 +41,8 @@ static inline void rrdpush_sender_thread_close_socket(RRDHOST *host);
* Inform the user through the error log file and
* deactivate compression by downgrading the stream protocol.
*/
static inline void deactivate_compression(struct sender_state *s)
{
static inline void deactivate_compression(struct sender_state *s) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION);
error("STREAM_COMPRESSION: Deactivating compression to avoid stream corruption");
default_compression_enabled = 0;
s->rrdpush_compression = 0;
@ -389,6 +409,7 @@ if(!s->rrdpush_compression)
err = SSL_get_error(host->ssl.conn, err);
error("SSL cannot connect with the server: %s ",ERR_error_string((long)SSL_get_error(host->ssl.conn,err),NULL));
if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR);
rrdpush_sender_thread_close_socket(host);
return 0;
}else {
@ -399,6 +420,7 @@ if(!s->rrdpush_compression)
if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) {
if (netdata_validate_server == NETDATA_SSL_VALID_CERTIFICATE) {
if ( security_test_certificate(host->ssl.conn)) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR);
error("Closing the stream connection, because the server SSL certificate is not valid.");
rrdpush_sender_thread_close_socket(host);
return 0;
@ -411,6 +433,7 @@ if(!s->rrdpush_compression)
#else
if(send_timeout(host->rrdpush_sender_socket, http, strlen(http), 0, timeout) == -1) {
#endif
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT);
error("STREAM %s [send to %s]: failed to send HTTP header to remote netdata.", host->hostname, s->connected_to);
rrdpush_sender_thread_close_socket(host);
return 0;
@ -426,6 +449,7 @@ if(!s->rrdpush_compression)
received = recv_timeout(host->rrdpush_sender_socket, http, HTTP_HEADER_SIZE, 0, timeout);
if(received == -1) {
#endif
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT);
error("STREAM %s [send to %s]: remote netdata does not respond.", host->hostname, s->connected_to);
rrdpush_sender_thread_close_socket(host);
return 0;
@ -435,6 +459,7 @@ if(!s->rrdpush_compression)
debug(D_STREAM, "Response to sender from far end: %s", http);
int32_t version = (int32_t)parse_stream_version(host, http);
if(version == -1) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE);
error("STREAM %s [send to %s]: server is not replying properly (is it a netdata?).", host->hostname, s->connected_to);
rrdpush_sender_thread_close_socket(host);
return 0;
@ -541,9 +566,9 @@ void attempt_to_send(struct sender_state *s) {
s->last_sent_t = now_monotonic_sec();
}
else if (ret == -1 && (errno == EAGAIN || errno == EINTR || errno == EWOULDBLOCK))
debug(D_STREAM, "STREAM %s [send to %s]: unavailable after polling POLLOUT", s->host->hostname,
s->connected_to);
debug(D_STREAM, "STREAM %s [send to %s]: unavailable after polling POLLOUT", s->host->hostname, s->connected_to);
else if (ret == -1) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR);
debug(D_STREAM, "STREAM: Send failed - closing socket...");
error("STREAM %s [send to %s]: failed to send metrics - closing connection - we have sent %zu bytes on this connection.", s->host->hostname, s->connected_to, s->sent_bytes_on_this_connection);
rrdpush_sender_thread_close_socket(s->host);
@ -570,6 +595,8 @@ int ret;
int sslerrno = SSL_get_error(s->host->ssl.conn, desired);
if (sslerrno == SSL_ERROR_WANT_READ || sslerrno == SSL_ERROR_WANT_WRITE)
return;
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR);
u_long err;
char buf[256];
while ((err = ERR_get_error()) != 0) {
@ -581,20 +608,25 @@ int ret;
return;
}
#endif
ret = recv(s->host->rrdpush_sender_socket, s->read_buffer + s->read_len, sizeof(s->read_buffer) - s->read_len - 1,
MSG_DONTWAIT);
ret = recv(s->host->rrdpush_sender_socket, s->read_buffer + s->read_len, sizeof(s->read_buffer) - s->read_len - 1,MSG_DONTWAIT);
if (ret>0) {
s->read_len += ret;
return;
}
debug(D_STREAM, "Socket was POLLIN, but req %zu bytes gave %d", sizeof(s->read_buffer) - s->read_len - 1, ret);
if (ret<0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR))
return;
if (ret==0)
if (ret==0) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED);
error("STREAM %s [send to %s]: connection closed by far end. Restarting connection", s->host->hostname, s->connected_to);
else
error("STREAM %s [send to %s]: error during read (%d). Restarting connection", s->host->hostname, s->connected_to,
ret);
}
else {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR);
error("STREAM %s [send to %s]: error during receive (%d). Restarting connection", s->host->hostname, s->connected_to, ret);
}
rrdpush_sender_thread_close_socket(s->host);
}
@ -615,6 +647,8 @@ void execute_commands(struct sender_state *s) {
static void rrdpush_sender_thread_cleanup_callback(void *ptr) {
worker_unregister();
RRDHOST *host = (RRDHOST *)ptr;
netdata_mutex_lock(&host->sender->mutex);
@ -707,6 +741,25 @@ void *rrdpush_sender_thread(void *ptr) {
fds[Collector].fd = s->host->rrdpush_sender_pipe[PIPE_READ];
fds[Collector].events = POLLIN;
worker_register("STREAMSND");
worker_register_job_name(WORKER_SENDER_JOB_CONNECT, "connect");
worker_register_job_name(WORKER_SENDER_JOB_PIPE_READ, "pipe read");
worker_register_job_name(WORKER_SENDER_JOB_SOCKET_RECEIVE, "receive");
worker_register_job_name(WORKER_SENDER_JOB_EXECUTE, "execute");
worker_register_job_name(WORKER_SENDER_JOB_SOCKET_SEND, "send");
// disconnection reasons
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT, "disconnect timeout");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR, "disconnect poll error");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR, "disconnect socket error");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW, "disconnect overflow");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR, "disconnect ssl error");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED, "disconnect parent closed");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR, "disconnect receive error");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR, "disconnect send error");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION, "disconnect no compression");
worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, "disconnect bad handshake");
netdata_thread_cleanup_push(rrdpush_sender_thread_cleanup_callback, s->host);
for(; s->host->rrdpush_send_enabled && !netdata_exit ;) {
// check for outstanding cancellation requests
@ -714,6 +767,7 @@ void *rrdpush_sender_thread(void *ptr) {
// The connection attempt blocks (after which we use the socket in nonblocking)
if(unlikely(s->host->rrdpush_sender_socket == -1)) {
worker_is_busy(WORKER_SENDER_JOB_CONNECT);
s->overflow = 0;
s->read_len = 0;
s->buffer->read = 0;
@ -731,11 +785,14 @@ void *rrdpush_sender_thread(void *ptr) {
// If the TCP window never opened then something is wrong, restart connection
if(unlikely(now_monotonic_sec() - s->last_sent_t > s->timeout)) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT);
error("STREAM %s [send to %s]: could not send metrics for %d seconds - closing connection - we have sent %zu bytes on this connection via %zu send attempts.", s->host->hostname, s->connected_to, s->timeout, s->sent_bytes_on_this_connection, s->send_attempts);
rrdpush_sender_thread_close_socket(s->host);
continue;
}
worker_is_idle();
// Wait until buffer opens in the socket or a rrdset_done_push wakes us
fds[Collector].revents = 0;
fds[Socket].revents = 0;
@ -757,16 +814,18 @@ void *rrdpush_sender_thread(void *ptr) {
int retval = poll(fds, 2, 1000);
debug(D_STREAM, "STREAM: poll() finished collector=%d socket=%d (current chunk %zu bytes)...",
fds[Collector].revents, fds[Socket].revents, outstanding);
if(unlikely(netdata_exit)) break;
// Spurious wake-ups without error - loop again
if (retval == 0 || ((retval == -1) && (errno == EAGAIN || errno == EINTR)))
{
if (retval == 0 || ((retval == -1) && (errno == EAGAIN || errno == EINTR))) {
debug(D_STREAM, "Spurious wakeup");
continue;
}
// Only errors from poll() are internal, but try restarting the connection
if(unlikely(retval == -1)) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR);
error("STREAM %s [send to %s]: failed to poll(). Closing socket.", s->host->hostname, s->connected_to);
rrdpush_sender_thread_close_socket(s->host);
continue;
@ -774,6 +833,7 @@ void *rrdpush_sender_thread(void *ptr) {
// If the collector woke us up then empty the pipe to remove the signal
if (fds[Collector].revents & POLLIN || fds[Collector].revents & POLLPRI) {
worker_is_busy(WORKER_SENDER_JOB_PIPE_READ);
debug(D_STREAM, "STREAM: Data added to send buffer (current buffer chunk %zu bytes)...", outstanding);
char buffer[1000 + 1];
@ -782,13 +842,19 @@ void *rrdpush_sender_thread(void *ptr) {
}
// Read as much as possible to fill the buffer, split into full lines for execution.
if (fds[Socket].revents & POLLIN)
if (fds[Socket].revents & POLLIN) {
worker_is_busy(WORKER_SENDER_JOB_SOCKET_RECEIVE);
attempt_read(s);
}
worker_is_busy(WORKER_SENDER_JOB_EXECUTE);
execute_commands(s);
// If we have data and have seen the TCP window open then try to close it by a transmission.
if (outstanding && fds[Socket].revents & POLLOUT)
if (outstanding && fds[Socket].revents & POLLOUT) {
worker_is_busy(WORKER_SENDER_JOB_SOCKET_SEND);
attempt_to_send(s);
}
// TODO-GAPS - why do we only check this on the socket, not the pipe?
if (outstanding) {
@ -800,6 +866,7 @@ void *rrdpush_sender_thread(void *ptr) {
else if (unlikely(fds[Socket].revents & POLLNVAL))
error = "connection is invalid (POLLNVAL)";
if(unlikely(error)) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR);
error("STREAM %s [send to %s]: restart stream because %s - %zu bytes transmitted.", s->host->hostname,
s->connected_to, error, s->sent_bytes_on_this_connection);
rrdpush_sender_thread_close_socket(s->host);
@ -808,6 +875,7 @@ void *rrdpush_sender_thread(void *ptr) {
// protection from overflow
if (s->overflow) {
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW);
errno = 0;
error("STREAM %s [send to %s]: buffer full (%zu-bytes) after %zu bytes. Restarting connection",
s->host->hostname, s->connected_to, s->buffer->size, s->sent_bytes_on_this_connection);

View File

@ -7,6 +7,20 @@ int web_client_timeout = DEFAULT_DISCONNECT_IDLE_WEB_CLIENTS_AFTER_SECONDS;
int web_client_first_request_timeout = DEFAULT_TIMEOUT_TO_RECEIVE_FIRST_WEB_REQUEST;
long web_client_streaming_rate_t = 0L;
// Per-job-type ids reported to the worker-utilization monitoring engine.
// Nine distinct job types are defined (ids 0..8), so the global worker
// job table must be able to hold at least 9 entries.
#define WORKER_JOB_ADD_CONNECTION 0
#define WORKER_JOB_DEL_COLLECTION 1
#define WORKER_JOB_ADD_FILE 2
#define WORKER_JOB_DEL_FILE 3
#define WORKER_JOB_READ_FILE 4
#define WORKER_JOB_WRITE_FILE 5
#define WORKER_JOB_RCV_DATA 6
#define WORKER_JOB_SND_DATA 7
#define WORKER_JOB_PROCESS 8

// The guard requires 9 slots, so the diagnostic must ask for 9 as well
// (the original message said "at least 8", one short of the check).
#if (WORKER_UTILIZATION_MAX_JOB_TYPES < 9)
#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least 9
#endif
/*
* --------------------------------------------------------------------------------------------------------------------
* Build web_client state from the pollinfo that describes an accepted connection.
@ -71,11 +85,15 @@ static inline int web_server_check_client_status(struct web_client *w) {
// A file fd (used to stream a static file to a client) was added to the
// poller: account for it, arm it for reading, and attach the owning
// web client to the poll slot.
static void *web_server_file_add_callback(POLLINFO *pi, short int *events, void *data) {
    struct web_client *client = (struct web_client *)data;

    worker_is_busy(WORKER_JOB_ADD_FILE);

    worker_private->files_read++;
    debug(D_WEB_CLIENT, "%llu: ADDED FILE READ ON FD %d", client->id, pi->fd);

    // wake up when the file has data, and let the poll slot find its
    // way back to the web client that owns it
    *events = POLLIN;
    pi->data = client;

    worker_is_idle();
    return client;
}
@ -83,27 +101,36 @@ static void web_server_file_del_callback(POLLINFO *pi) {
struct web_client *w = (struct web_client *)pi->data;
debug(D_WEB_CLIENT, "%llu: RELEASE FILE READ ON FD %d", w->id, pi->fd);
worker_is_busy(WORKER_JOB_DEL_FILE);
w->pollinfo_filecopy_slot = 0;
if(unlikely(!w->pollinfo_slot)) {
debug(D_WEB_CLIENT, "%llu: CROSS WEB CLIENT CLEANUP (iFD %d, oFD %d)", w->id, pi->fd, w->ofd);
web_client_release(w);
}
worker_is_idle();
}
static int web_server_file_read_callback(POLLINFO *pi, short int *events) {
int retval = -1;
struct web_client *w = (struct web_client *)pi->data;
worker_is_busy(WORKER_JOB_READ_FILE);
// if there is no POLLINFO linked to this, it means the client disconnected
// stop the file reading too
if(unlikely(!w->pollinfo_slot)) {
debug(D_WEB_CLIENT, "%llu: PREVENTED ATTEMPT TO READ FILE ON FD %d, ON CLOSED WEB CLIENT", w->id, pi->fd);
return -1;
retval = -1;
goto cleanup;
}
if(unlikely(w->mode != WEB_CLIENT_MODE_FILECOPY || w->ifd == w->ofd)) {
debug(D_WEB_CLIENT, "%llu: PREVENTED ATTEMPT TO READ FILE ON FD %d, ON NON-FILECOPY WEB CLIENT", w->id, pi->fd);
return -1;
retval = -1;
goto cleanup;
}
debug(D_WEB_CLIENT, "%llu: READING FILE ON FD %d", w->id, pi->fd);
@ -121,18 +148,25 @@ static int web_server_file_read_callback(POLLINFO *pi, short int *events) {
if(unlikely(ret <= 0 || w->ifd == w->ofd)) {
debug(D_WEB_CLIENT, "%llu: DONE READING FILE ON FD %d", w->id, pi->fd);
return -1;
retval = -1;
goto cleanup;
}
*events = POLLIN;
return 0;
retval = 0;
cleanup:
worker_is_idle();
return retval;
}
// Writing into a served file is not implemented; log the attempt and
// return -1 so the poller stops watching this fd for output.
static int web_server_file_write_callback(POLLINFO *pi, short int *events) {
    (void)events;
    (void)pi;

    worker_is_busy(WORKER_JOB_WRITE_FILE);
    error("Writing to web files is not supported!");
    worker_is_idle();

    return -1;
}
@ -143,6 +177,7 @@ static int web_server_file_write_callback(POLLINFO *pi, short int *events) {
static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data) {
(void)data; // Suppress warning on unused argument
worker_is_busy(WORKER_JOB_ADD_CONNECTION);
worker_private->connected++;
size_t concurrent = worker_private->connected - worker_private->disconnected;
@ -177,7 +212,7 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data
//this means that the mensage was not completely read, so
//I cannot identify it yet.
sock_setnonblock(w->ifd);
return w;
goto cleanup;
}
//The next two ifs are not together because I am reusing SSL structure
@ -191,7 +226,7 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data
if (test[0] < 0x18){
WEB_CLIENT_IS_DEAD(w);
sock_setnonblock(w->ifd);
return w;
goto cleanup;
}
}
}
@ -217,11 +252,16 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data
#endif
debug(D_WEB_CLIENT, "%llu: ADDED CLIENT FD %d", w->id, pi->fd);
cleanup:
worker_is_idle();
return w;
}
// TCP client disconnected
static void web_server_del_callback(POLLINFO *pi) {
worker_is_busy(WORKER_JOB_DEL_COLLECTION);
worker_private->disconnected++;
struct web_client *w = (struct web_client *)pi->data;
@ -240,18 +280,27 @@ static void web_server_del_callback(POLLINFO *pi) {
debug(D_WEB_CLIENT, "%llu: CLOSING CLIENT FD %d", w->id, pi->fd);
web_client_release(w);
}
worker_is_idle();
}
static int web_server_rcv_callback(POLLINFO *pi, short int *events) {
int ret = -1;
worker_is_busy(WORKER_JOB_RCV_DATA);
worker_private->receptions++;
struct web_client *w = (struct web_client *)pi->data;
int fd = pi->fd;
if(unlikely(web_client_receive(w) < 0))
return -1;
if(unlikely(web_client_receive(w) < 0)) {
ret = -1;
goto cleanup;
}
debug(D_WEB_CLIENT, "%llu: processing received data on fd %d.", w->id, fd);
worker_is_idle();
worker_is_busy(WORKER_JOB_PROCESS);
web_client_process_request(w);
if(unlikely(w->mode == WEB_CLIENT_MODE_FILECOPY)) {
@ -282,7 +331,8 @@ static int web_server_rcv_callback(POLLINFO *pi, short int *events) {
w->pollinfo_filecopy_slot = fpi->slot;
else {
error("Failed to add filecopy fd. Closing client.");
return -1;
ret = -1;
goto cleanup;
}
}
}
@ -295,10 +345,17 @@ static int web_server_rcv_callback(POLLINFO *pi, short int *events) {
if(unlikely(w->ofd == fd && web_client_has_wait_send(w)))
*events |= POLLOUT;
return web_server_check_client_status(w);
ret = web_server_check_client_status(w);
cleanup:
worker_is_idle();
return ret;
}
static int web_server_snd_callback(POLLINFO *pi, short int *events) {
int retval = -1;
worker_is_busy(WORKER_JOB_SND_DATA);
worker_private->sends++;
struct web_client *w = (struct web_client *)pi->data;
@ -306,8 +363,12 @@ static int web_server_snd_callback(POLLINFO *pi, short int *events) {
debug(D_WEB_CLIENT, "%llu: sending data on fd %d.", w->id, fd);
if(unlikely(web_client_send(w) < 0))
return -1;
int ret = web_client_send(w);
if(unlikely(ret < 0)) {
retval = -1;
goto cleanup;
}
if(unlikely(w->ifd == fd && web_client_has_wait_receive(w)))
*events |= POLLIN;
@ -315,50 +376,11 @@ static int web_server_snd_callback(POLLINFO *pi, short int *events) {
if(unlikely(w->ofd == fd && web_client_has_wait_send(w)))
*events |= POLLOUT;
return web_server_check_client_status(w);
}
retval = web_server_check_client_status(w);
static void web_server_tmr_callback(void *timer_data) {
worker_private = (struct web_server_static_threaded_worker *)timer_data;
static __thread RRDSET *st = NULL;
static __thread RRDDIM *rd_user = NULL, *rd_system = NULL;
if(unlikely(netdata_exit)) return;
if(unlikely(!st)) {
char id[100 + 1];
char title[100 + 1];
snprintfz(id, 100, "web_thread%d_cpu", worker_private->id + 1);
snprintfz(title, 100, "Netdata web server thread CPU usage");
st = rrdset_create_localhost(
"netdata"
, id
, NULL
, "web"
, "netdata.web_cpu"
, title
, "milliseconds/s"
, "web"
, "stats"
, 132000 + worker_private->id
, default_rrd_update_every
, RRDSET_TYPE_STACKED
);
rd_user = rrddim_add(st, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
rd_system = rrddim_add(st, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
}
else
rrdset_next(st);
struct rusage rusage;
getrusage(RUSAGE_THREAD, &rusage);
rrddim_set_by_pointer(st, rd_user, rusage.ru_utime.tv_sec * 1000000ULL + rusage.ru_utime.tv_usec);
rrddim_set_by_pointer(st, rd_system, rusage.ru_stime.tv_sec * 1000000ULL + rusage.ru_stime.tv_usec);
rrdset_done(st);
cleanup:
worker_is_idle();
return retval;
}
// ----------------------------------------------------------------------------
@ -379,11 +401,22 @@ static void socket_listen_main_static_threaded_worker_cleanup(void *ptr) {
);
worker_private->running = 0;
worker_unregister();
}
void *socket_listen_main_static_threaded_worker(void *ptr) {
worker_private = (struct web_server_static_threaded_worker *)ptr;
worker_private->running = 1;
worker_register("WEB");
worker_register_job_name(WORKER_JOB_ADD_CONNECTION, "connect");
worker_register_job_name(WORKER_JOB_DEL_COLLECTION, "disconnect");
worker_register_job_name(WORKER_JOB_ADD_FILE, "file start");
worker_register_job_name(WORKER_JOB_DEL_FILE, "file end");
worker_register_job_name(WORKER_JOB_READ_FILE, "file read");
worker_register_job_name(WORKER_JOB_WRITE_FILE, "file write");
worker_register_job_name(WORKER_JOB_RCV_DATA, "receive");
worker_register_job_name(WORKER_JOB_SND_DATA, "send");
worker_register_job_name(WORKER_JOB_PROCESS, "process");
netdata_thread_cleanup_push(socket_listen_main_static_threaded_worker_cleanup, ptr);
@ -392,7 +425,7 @@ void *socket_listen_main_static_threaded_worker(void *ptr) {
, web_server_del_callback
, web_server_rcv_callback
, web_server_snd_callback
, web_server_tmr_callback
, NULL
, web_allow_connections_from
, web_allow_connections_dns
, NULL