DBENGINE v2 - improvements part 11 (#14337)

* acquiring / releasing interface for metrics (see the sketch after this list)

* metrics registry statistics

* clean up metrics registry by deleting metrics when they don't have retention anymore; do not double-copy the data of pages to be flushed

* print the tier in retention summary

* Open files with buffered instead of direct I/O (test)

* added more metrics stats and fixed unittest

* rename writer functions to avoid confusion with refcounting

* do not release a metric that is not acquired

* Revert to use direct I/O on write -- use direct I/O on read as well

* keep track of ARAL overhead and add it to the memory chart

* aral full check via api

* Cleanup

* give names to ARALs and PGCs

* aral improvements

* restore query expansion to the future

* prefer higher resolution tier when switching plans

* added extent read statistics

* smoother joining of tiers at query engine

* fine tune aral max allocation size

* aral restructuring to hide its internals from the rest of netdata

* aral restructuring; addition of defrag option to aral to keep the linked list sorted - enabled by default to test it

* fully async aral

* some statistics and cleanup

* fix infinite loop while calculating retention

* aral docs and defragmenting disabled by default

* fix bug and add optimization when defragmenter is not enabled

* aral stress test

* aral speed report and documentation

* added internal checks that all pages are full

* improve internal log about metrics deletion

* metrics registry uses one aral per partition

* metrics registry aral max size to 512 elements per page

* remove data_structures/README.md dependency
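
For reference, a minimal sketch of the new acquiring / releasing interface for metrics, assuming the dbengine internal headers (mrg.h). It follows the calls exercised by mrg_unittest() further down in this diff; the flow is illustrative, not the exact collector code path:

    MRG *mrg = mrg_create();

    MRG_ENTRY entry = {
        .section = 0,
        .first_time_s = 2,
        .last_time_s = 3,
        .latest_update_every_s = 4,
    };
    uuid_generate(entry.uuid);

    bool added;
    METRIC *metric = mrg_metric_add_and_acquire(mrg, entry, &added);              // returned already referenced

    METRIC *same = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section);   // +1 reference
    mrg_metric_release(mrg, same);                                                // -1 reference

    // a metric can only be deleted once it has no retention and no references left
    mrg_metric_set_first_time_s(mrg, metric, 0);
    mrg_metric_set_clean_latest_time_s(mrg, metric, 0);
    mrg_metric_set_hot_latest_time_s(mrg, metric, 0);
    mrg_metric_release_and_delete(mrg, metric);

    mrg_destroy(mrg);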

---------

Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Costa Tsaousis 2023-01-30 20:36:16 +02:00 committed by GitHub
parent fd7f39a744
commit 7f8f11eb37
40 changed files with 1915 additions and 818 deletions

View File

@ -453,8 +453,8 @@ set(LIBNETDATA_FILES
libnetdata/adaptive_resortable_list/adaptive_resortable_list.h
libnetdata/config/appconfig.c
libnetdata/config/appconfig.h
libnetdata/arrayalloc/arrayalloc.c
libnetdata/arrayalloc/arrayalloc.h
libnetdata/aral/aral.c
libnetdata/aral/aral.h
libnetdata/avl/avl.c
libnetdata/avl/avl.h
libnetdata/buffer/buffer.c

View File

@ -131,8 +131,8 @@ LIBNETDATA_FILES = \
libnetdata/adaptive_resortable_list/adaptive_resortable_list.h \
libnetdata/config/appconfig.c \
libnetdata/config/appconfig.h \
libnetdata/arrayalloc/arrayalloc.c \
libnetdata/arrayalloc/arrayalloc.h \
libnetdata/aral/aral.c \
libnetdata/aral/aral.h \
libnetdata/avl/avl.c \
libnetdata/avl/avl.h \
libnetdata/buffer/buffer.c \

View File

@ -975,7 +975,7 @@ static inline struct pid_stat *get_pid_entry(pid_t pid) {
init_pid_fds(p, 0, p->fds_size);
p->pid = pid;
DOUBLE_LINKED_LIST_APPEND_UNSAFE(root_of_pids, p, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(root_of_pids, p, prev, next);
all_pids[pid] = p;
all_pids_count++;
@ -993,7 +993,7 @@ static inline void del_pid_entry(pid_t pid) {
debug_log("process %d %s exited, deleting it.", pid, p->comm);
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(root_of_pids, p, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(root_of_pids, p, prev, next);
// free the filename
#ifndef __FreeBSD__

View File

@ -1730,7 +1730,7 @@ AC_CONFIG_FILES([
libnetdata/Makefile
libnetdata/tests/Makefile
libnetdata/adaptive_resortable_list/Makefile
libnetdata/arrayalloc/Makefile
libnetdata/aral/Makefile
libnetdata/avl/Makefile
libnetdata/buffer/Makefile
libnetdata/clocks/Makefile

View File

@ -243,6 +243,9 @@ static void global_statistics_charts(void) {
global_statistics_copy(&gs, GLOBAL_STATS_RESET_WEB_USEC_MAX);
getrusage(RUSAGE_SELF, &me);
size_t aral_structures, aral_malloc_allocated, aral_malloc_used, aral_mmap_allocated, aral_mmap_used;
aral_get_size_statistics(&aral_structures, &aral_malloc_allocated, &aral_malloc_used, &aral_mmap_allocated, &aral_mmap_used);
// ----------------------------------------------------------------
{
@ -292,6 +295,7 @@ static void global_statistics_charts(void) {
static RRDDIM *rd_replication = NULL;
static RRDDIM *rd_buffers = NULL;
static RRDDIM *rd_workers = NULL;
static RRDDIM *rd_aral = NULL;
static RRDDIM *rd_other = NULL;
if (unlikely(!st_memory)) {
@ -322,6 +326,7 @@ static void global_statistics_charts(void) {
rd_replication = rrddim_add(st_memory, "replication", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_buffers = rrddim_add(st_memory, "buffers", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_workers = rrddim_add(st_memory, "workers", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_aral = rrddim_add(st_memory, "aral", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_other = rrddim_add(st_memory, "other", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
@ -355,6 +360,7 @@ static void global_statistics_charts(void) {
rrddim_set_by_pointer(st_memory, rd_replication, (collected_number)dictionary_stats_memory_total(dictionary_stats_category_replication) + (collected_number)replication_allocated_memory());
rrddim_set_by_pointer(st_memory, rd_buffers, (collected_number)buffers);
rrddim_set_by_pointer(st_memory, rd_workers, (collected_number) workers_allocated_memory());
rrddim_set_by_pointer(st_memory, rd_aral, (collected_number) aral_structures);
rrddim_set_by_pointer(st_memory, rd_other, (collected_number)dictionary_stats_memory_total(dictionary_stats_category_other));
rrdset_done(st_memory);
@ -374,6 +380,7 @@ static void global_statistics_charts(void) {
static RRDDIM *rd_cbuffers_streaming = NULL;
static RRDDIM *rd_buffers_replication = NULL;
static RRDDIM *rd_buffers_web = NULL;
static RRDDIM *rd_buffers_aral = NULL;
if (unlikely(!st_memory_buffers)) {
st_memory_buffers = rrdset_create_localhost(
@ -402,6 +409,7 @@ static void global_statistics_charts(void) {
rd_cbuffers_streaming = rrddim_add(st_memory_buffers, "streaming cbuf", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_buffers_replication = rrddim_add(st_memory_buffers, "replication", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_buffers_web = rrddim_add(st_memory_buffers, "web", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_buffers_aral = rrddim_add(st_memory_buffers, "aral", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
rrddim_set_by_pointer(st_memory_buffers, rd_queries, (collected_number)netdata_buffers_statistics.query_targets_size + (collected_number) onewayalloc_allocated_memory());
@ -416,6 +424,7 @@ static void global_statistics_charts(void) {
rrddim_set_by_pointer(st_memory_buffers, rd_cbuffers_streaming, (collected_number)netdata_buffers_statistics.cbuffers_streaming);
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_replication, (collected_number)replication_allocated_buffers());
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_web, (collected_number)netdata_buffers_statistics.buffers_web);
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_aral, (collected_number)(aral_malloc_allocated + aral_mmap_allocated) - (collected_number)(aral_malloc_used + aral_mmap_used));
rrdset_done(st_memory_buffers);
}
@ -1885,6 +1894,111 @@ static void dbengine2_statistics_charts(void) {
}
#endif
{
static RRDSET *st_mrg_metrics = NULL;
static RRDDIM *rd_mrg_metrics = NULL;
static RRDDIM *rd_mrg_acquired = NULL;
static RRDDIM *rd_mrg_collected = NULL;
static RRDDIM *rd_mrg_with_retention = NULL;
static RRDDIM *rd_mrg_without_retention = NULL;
static RRDDIM *rd_mrg_multiple_writers = NULL;
if (unlikely(!st_mrg_metrics)) {
st_mrg_metrics = rrdset_create_localhost(
"netdata",
"dbengine_metrics",
NULL,
"dbengine metrics",
NULL,
"Netdata Metrics in Metrics Registry",
"metrics",
"netdata",
"stats",
priority,
localhost->rrd_update_every,
RRDSET_TYPE_LINE);
rd_mrg_metrics = rrddim_add(st_mrg_metrics, "all", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_mrg_acquired = rrddim_add(st_mrg_metrics, "acquired", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_mrg_collected = rrddim_add(st_mrg_metrics, "collected", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_mrg_with_retention = rrddim_add(st_mrg_metrics, "with retention", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_mrg_without_retention = rrddim_add(st_mrg_metrics, "without retention", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_mrg_multiple_writers = rrddim_add(st_mrg_metrics, "multi-collected", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
priority++;
rrddim_set_by_pointer(st_mrg_metrics, rd_mrg_metrics, (collected_number)mrg_stats.entries);
rrddim_set_by_pointer(st_mrg_metrics, rd_mrg_acquired, (collected_number)mrg_stats.entries_referenced);
rrddim_set_by_pointer(st_mrg_metrics, rd_mrg_collected, (collected_number)mrg_stats.writers);
rrddim_set_by_pointer(st_mrg_metrics, rd_mrg_with_retention, (collected_number)mrg_stats.entries_with_retention);
rrddim_set_by_pointer(st_mrg_metrics, rd_mrg_without_retention, (collected_number)mrg_stats.entries - (collected_number)mrg_stats.entries_with_retention);
rrddim_set_by_pointer(st_mrg_metrics, rd_mrg_multiple_writers, (collected_number)mrg_stats.writers_conflicts);
rrdset_done(st_mrg_metrics);
}
{
static RRDSET *st_mrg_ops = NULL;
static RRDDIM *rd_mrg_add = NULL;
static RRDDIM *rd_mrg_del = NULL;
static RRDDIM *rd_mrg_search = NULL;
if (unlikely(!st_mrg_ops)) {
st_mrg_ops = rrdset_create_localhost(
"netdata",
"dbengine_metrics_registry_operations",
NULL,
"dbengine metrics",
NULL,
"Netdata Metrics Registry Operations",
"metrics",
"netdata",
"stats",
priority,
localhost->rrd_update_every,
RRDSET_TYPE_LINE);
rd_mrg_add = rrddim_add(st_mrg_ops, "add", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
rd_mrg_del = rrddim_add(st_mrg_ops, "delete", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
rd_mrg_search = rrddim_add(st_mrg_ops, "search", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
}
priority++;
rrddim_set_by_pointer(st_mrg_ops, rd_mrg_add, (collected_number)mrg_stats.additions);
rrddim_set_by_pointer(st_mrg_ops, rd_mrg_del, (collected_number)mrg_stats.deletions);
rrddim_set_by_pointer(st_mrg_ops, rd_mrg_search, (collected_number)mrg_stats.search_hits + (collected_number)mrg_stats.search_misses);
rrdset_done(st_mrg_ops);
}
{
static RRDSET *st_mrg_references = NULL;
static RRDDIM *rd_mrg_references = NULL;
if (unlikely(!st_mrg_references)) {
st_mrg_references = rrdset_create_localhost(
"netdata",
"dbengine_metrics_registry_references",
NULL,
"dbengine metrics",
NULL,
"Netdata Metrics Registry References",
"references",
"netdata",
"stats",
priority,
localhost->rrd_update_every,
RRDSET_TYPE_LINE);
rd_mrg_references = rrddim_add(st_mrg_references, "references", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
priority++;
rrddim_set_by_pointer(st_mrg_references, rd_mrg_references, (collected_number)mrg_stats.current_references);
rrdset_done(st_mrg_references);
}
{
static RRDSET *st_cache_hit_ratio = NULL;
static RRDDIM *rd_hit_ratio = NULL;
@ -3817,7 +3931,7 @@ static void workers_threads_cleanup(struct worker_utilization *wu) {
if(!t->enabled) {
JudyLDel(&workers_by_pid_JudyL_array, t->pid, PJE0);
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(wu->threads, t, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wu->threads, t, prev, next);
freez(t);
}
t = next;
@ -3844,7 +3958,7 @@ static struct worker_thread *worker_thread_create(struct worker_utilization *wu,
*PValue = wt;
// link it
DOUBLE_LINKED_LIST_APPEND_UNSAFE(wu->threads, wt, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wu->threads, wt, prev, next);
return wt;
}

View File

@ -16,7 +16,7 @@
typedef int32_t REFCOUNT;
#define REFCOUNT_DELETING (-100)
// to use arrayalloc uncomment the following line:
// to use ARAL uncomment the following line:
#define PGC_WITH_ARAL 1
typedef enum __attribute__ ((__packed__)) {
@ -82,6 +82,8 @@ struct pgc_linked_list {
struct pgc {
struct {
char name[PGC_NAME_MAX + 1];
size_t partitions;
size_t clean_size;
size_t max_dirty_pages_per_call;
@ -415,13 +417,25 @@ struct section_pages {
PGC_PAGE *base;
};
static ARAL section_pages_aral = {
.filename = NULL,
.cache_dir = NULL,
.use_mmap = false,
.initial_elements = 16384 / sizeof(struct section_pages),
.requested_element_size = sizeof(struct section_pages),
};
static ARAL *pgc_section_pages_aral = NULL;
static void pgc_section_pages_static_aral_init(void) {
static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER;
if(unlikely(!pgc_section_pages_aral)) {
netdata_spinlock_lock(&spinlock);
// we have to check again
if(!pgc_section_pages_aral)
pgc_section_pages_aral = aral_create(
"pgc_section",
sizeof(struct section_pages),
0,
4096,
NULL, NULL, false, false);
netdata_spinlock_unlock(&spinlock);
}
}
static inline void pgc_stats_ll_judy_change(PGC *cache, struct pgc_linked_list *ll, size_t mem_before_judyl, size_t mem_after_judyl) {
if(mem_after_judyl > mem_before_judyl) {
@ -462,7 +476,7 @@ static void pgc_ll_add(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PG
struct section_pages *sp = *section_pages_pptr;
if(!sp) {
// sp = callocz(1, sizeof(struct section_pages));
sp = arrayalloc_mallocz(&section_pages_aral);
sp = aral_mallocz(pgc_section_pages_aral);
memset(sp, 0, sizeof(struct section_pages));
*section_pages_pptr = sp;
@ -473,7 +487,7 @@ static void pgc_ll_add(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PG
sp->entries++;
sp->size += page->assumed_size;
DOUBLE_LINKED_LIST_APPEND_UNSAFE(sp->base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(sp->base, page, link.prev, link.next);
if((sp->entries % cache->config.max_dirty_pages_per_call) == 0)
ll->version++;
@ -484,11 +498,11 @@ static void pgc_ll_add(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PG
// - DIRTY pages made CLEAN, depending on their accesses may be appended (accesses > 0) or prepended (accesses = 0).
if(page->accesses || page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED) {
DOUBLE_LINKED_LIST_APPEND_UNSAFE(ll->base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next);
page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED);
}
else
DOUBLE_LINKED_LIST_PREPEND_UNSAFE(ll->base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next);
ll->version++;
}
@ -530,7 +544,7 @@ static void pgc_ll_del(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PG
struct section_pages *sp = *section_pages_pptr;
sp->entries--;
sp->size -= page->assumed_size;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(sp->base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(sp->base, page, link.prev, link.next);
if(!sp->base) {
size_t mem_before_judyl, mem_after_judyl;
@ -543,13 +557,13 @@ static void pgc_ll_del(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PG
fatal("DBENGINE CACHE: cannot delete section from Judy LL");
// freez(sp);
arrayalloc_freez(&section_pages_aral, sp);
aral_freez(pgc_section_pages_aral, sp);
mem_after_judyl -= sizeof(struct section_pages);
pgc_stats_ll_judy_change(cache, ll, mem_before_judyl, mem_after_judyl);
}
}
else {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(ll->base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ll->base, page, link.prev, link.next);
ll->version++;
}
@ -565,8 +579,8 @@ static inline void page_has_been_accessed(PGC *cache, PGC_PAGE *page) {
if (flags & PGC_PAGE_CLEAN) {
if(pgc_ll_trylock(cache, &cache->clean)) {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
pgc_ll_unlock(cache, &cache->clean);
page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED);
}
@ -860,7 +874,7 @@ static inline void free_this_page(PGC *cache, PGC_PAGE *page) {
// free our memory
#ifdef PGC_WITH_ARAL
arrayalloc_freez(cache->aral, page);
aral_freez(cache->aral, page);
#else
freez(page);
#endif
@ -1038,8 +1052,8 @@ static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evic
break;
if(unlikely(page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED)) {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED);
continue;
}
@ -1056,7 +1070,7 @@ static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evic
__atomic_add_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED);
__atomic_add_fetch(&cache->stats.evicting_size, page->assumed_size, __ATOMIC_RELAXED);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(pages_to_evict, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next);
pages_to_evict_size += page->assumed_size;
@ -1073,8 +1087,8 @@ static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evic
if(!first_page_we_relocated)
first_page_we_relocated = page;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
// check if we have to stop
if(unlikely(++total_pages_skipped >= max_skip && !all_of_them)) {
@ -1099,8 +1113,8 @@ static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evic
next = page->link.next;
size_t partition = pgc_indexing_partition(cache, page->metric_id);
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(pages_to_evict, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(pages_per_partition[partition], page, link.prev, link.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_per_partition[partition], page, link.prev, link.next);
}
// remove them from the index
@ -1178,7 +1192,7 @@ static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) {
__atomic_add_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED);
#ifdef PGC_WITH_ARAL
PGC_PAGE *allocation = arrayalloc_mallocz(cache->aral);
PGC_PAGE *allocation = aral_mallocz(cache->aral);
#endif
PGC_PAGE *page;
size_t spins = 0;
@ -1285,7 +1299,7 @@ static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) {
#ifdef PGC_WITH_ARAL
if(allocation)
arrayalloc_freez(cache->aral, allocation);
aral_freez(cache->aral, allocation);
#endif
__atomic_sub_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED);
@ -1713,7 +1727,8 @@ void free_all_unreferenced_clean_pages(PGC *cache) {
// ----------------------------------------------------------------------------
// public API
PGC *pgc_create(size_t clean_size_bytes, free_clean_page_callback pgc_free_cb,
PGC *pgc_create(const char *name,
size_t clean_size_bytes, free_clean_page_callback pgc_free_cb,
size_t max_dirty_pages_per_flush,
save_dirty_init_callback pgc_save_init_cb,
save_dirty_page_callback pgc_save_dirty_cb,
@ -1732,6 +1747,7 @@ PGC *pgc_create(size_t clean_size_bytes, free_clean_page_callback pgc_free_cb,
max_flushes_inline = 2;
PGC *cache = callocz(1, sizeof(PGC));
strncpyz(cache->config.name, name, PGC_NAME_MAX);
cache->config.options = options;
cache->config.clean_size = (clean_size_bytes < 1 * 1024 * 1024) ? 1 * 1024 * 1024 : clean_size_bytes;
cache->config.pgc_free_clean_cb = pgc_free_cb;
@ -1772,10 +1788,14 @@ PGC *pgc_create(size_t clean_size_bytes, free_clean_page_callback pgc_free_cb,
cache->clean.stats = &cache->stats.queues.clean;
#ifdef PGC_WITH_ARAL
cache->aral = arrayalloc_create(sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page, 65536 / sizeof(PGC_PAGE),
NULL, NULL, false, false);
cache->aral = aral_create(name,
sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page,
0,
4096,
NULL, NULL, false, false);
#endif
pgc_section_pages_static_aral_init();
pointer_index_init(cache);
return cache;
@ -1803,7 +1823,7 @@ void pgc_destroy(PGC *cache) {
else {
pointer_destroy_index(cache);
#ifdef PGC_WITH_ARAL
arrayalloc_destroy(cache->aral);
aral_destroy(cache->aral);
#endif
freez(cache);
}
@ -2602,7 +2622,8 @@ void unittest_stress_test(void) {
#endif
int pgc_unittest(void) {
PGC *cache = pgc_create(32 * 1024 * 1024, unittest_free_clean_page_callback,
PGC *cache = pgc_create("test",
32 * 1024 * 1024, unittest_free_clean_page_callback,
64, NULL, unittest_save_dirty_page_callback,
10, 10, 1000, 10,
PGC_OPTIONS_DEFAULT, 1, 11);
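
The arrayalloc-to-ARAL migration above repeats the same pattern throughout this file: create a named pool once, then allocate and free fixed-size elements from it. A condensed sketch, assuming the aral.h introduced by this commit; the meaning of the arguments beyond (name, element size) is inferred from the call sites in this diff, not from documentation:

    // illustrative only -- mirrors the pgc_section / PGC_PAGE pools above
    ARAL *pool = aral_create("example",                     // pool name (hypothetical)
                             sizeof(struct section_pages),  // fixed element size
                             0, 4096,                       // apparently initial / max elements per page
                             NULL, NULL,                    // filename / cache dir (not mmap-backed here)
                             false, false);                 // flags as passed for the pools above

    struct section_pages *sp = aral_mallocz(pool);          // take one element from the pool
    memset(sp, 0, sizeof(*sp));
    aral_freez(pool, sp);                                   // return it
    aral_destroy(pool);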

View File

@ -8,6 +8,7 @@
typedef struct pgc PGC;
typedef struct pgc_page PGC_PAGE;
#define PGC_NAME_MAX 23
typedef enum __attribute__ ((__packed__)) {
PGC_OPTIONS_NONE = 0,
@ -165,7 +166,8 @@ typedef void (*free_clean_page_callback)(PGC *cache, PGC_ENTRY entry);
typedef void (*save_dirty_page_callback)(PGC *cache, PGC_ENTRY *entries_array, PGC_PAGE **pages_array, size_t entries);
typedef void (*save_dirty_init_callback)(PGC *cache, Word_t section);
// create a cache
PGC *pgc_create(size_t clean_size_bytes, free_clean_page_callback pgc_free_clean_cb,
PGC *pgc_create(const char *name,
size_t clean_size_bytes, free_clean_page_callback pgc_free_clean_cb,
size_t max_dirty_pages_per_flush, save_dirty_init_callback pgc_save_init_cb, save_dirty_page_callback pgc_save_dirty_cb,
size_t max_pages_per_inline_eviction, size_t max_inline_evictors,
size_t max_skip_pages_per_inline_eviction,

View File

@ -4,13 +4,13 @@
void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile)
{
uv_rwlock_wrlock(&ctx->datafiles.rwlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(ctx->datafiles.first, datafile, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next);
uv_rwlock_wrunlock(&ctx->datafiles.rwlock);
}
void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile)
{
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(ctx->datafiles.first, datafile, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next);
}

View File

@ -605,7 +605,9 @@ static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx,
false);
if(!vd.is_valid) {
mrg_metric_release(main_mrg, metric);
if(metric)
mrg_metric_release(main_mrg, metric);
continue;
}

View File

@ -3,6 +3,10 @@
typedef int32_t REFCOUNT;
#define REFCOUNT_DELETING (-100)
typedef enum __attribute__ ((__packed__)) {
METRIC_FLAG_HAS_RETENTION = (1 << 0),
} METRIC_FLAGS;
struct metric {
uuid_t uuid; // never changes
Word_t section; // never changes
@ -12,6 +16,8 @@ struct metric {
time_t latest_time_s_hot; // latest time of the currently collected page
uint32_t latest_update_every_s; //
pid_t writer;
METRIC_FLAGS flags;
REFCOUNT refcount;
SPINLOCK spinlock; // protects all variable members
// THIS IS allocated with malloc()
@ -19,8 +25,9 @@ struct metric {
};
struct mrg {
ARAL *aral[MRG_PARTITIONS];
struct pgc_index {
ARAL *aral;
netdata_rwlock_t rwlock;
Pvoid_t uuid_judy; // each UUID has a JudyL of sections (tiers)
} index[MRG_PARTITIONS];
@ -95,9 +102,75 @@ static inline size_t uuid_partition(MRG *mrg __maybe_unused, uuid_t *uuid) {
return u[UUID_SZ - 1] % MRG_PARTITIONS;
}
static METRIC *metric_add(MRG *mrg, MRG_ENTRY *entry, bool *ret) {
static inline bool metric_has_retention_unsafe(MRG *mrg __maybe_unused, METRIC *metric) {
bool has_retention = (metric->first_time_s || metric->latest_time_s_clean || metric->latest_time_s_hot);
if(has_retention && !(metric->flags & METRIC_FLAG_HAS_RETENTION)) {
metric->flags |= METRIC_FLAG_HAS_RETENTION;
__atomic_add_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED);
}
else if(!has_retention && (metric->flags & METRIC_FLAG_HAS_RETENTION)) {
metric->flags &= ~METRIC_FLAG_HAS_RETENTION;
__atomic_sub_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED);
}
return has_retention;
}
static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric, bool having_spinlock) {
REFCOUNT refcount;
if(!having_spinlock)
netdata_spinlock_lock(&metric->spinlock);
if(unlikely(metric->refcount < 0))
fatal("METRIC: refcount is %d (negative) during acquire", metric->refcount);
refcount = ++metric->refcount;
// update its retention flags
metric_has_retention_unsafe(mrg, metric);
if(!having_spinlock)
netdata_spinlock_unlock(&metric->spinlock);
if(refcount == 1)
__atomic_add_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED);
__atomic_add_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED);
return refcount;
}
static inline bool metric_release_and_can_be_deleted(MRG *mrg __maybe_unused, METRIC *metric) {
bool ret = true;
REFCOUNT refcount;
netdata_spinlock_lock(&metric->spinlock);
if(unlikely(metric->refcount <= 0))
fatal("METRIC: refcount is %d (zero or negative) during release", metric->refcount);
refcount = --metric->refcount;
if(likely(metric_has_retention_unsafe(mrg, metric) || refcount != 0))
ret = false;
netdata_spinlock_unlock(&metric->spinlock);
if(unlikely(!refcount))
__atomic_sub_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED);
__atomic_sub_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED);
return ret;
}
static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) {
size_t partition = uuid_partition(mrg, &entry->uuid);
METRIC *allocation = aral_mallocz(mrg->aral[partition]);
mrg_index_write_lock(mrg, partition);
size_t mem_before_judyl, mem_after_judyl;
@ -117,18 +190,22 @@ static METRIC *metric_add(MRG *mrg, MRG_ENTRY *entry, bool *ret) {
if(unlikely(!PValue || PValue == PJERR))
fatal("DBENGINE METRIC: corrupted section JudyL array");
if(*PValue != NULL) {
if(unlikely(*PValue != NULL)) {
METRIC *metric = *PValue;
metric_acquire(mrg, metric, false);
mrg_index_write_unlock(mrg, partition);
if(ret)
*ret = false;
aral_freez(mrg->aral[partition], allocation);
MRG_STATS_DUPLICATE_ADD(mrg);
return metric;
}
METRIC *metric = arrayalloc_mallocz(mrg->index[partition].aral);
METRIC *metric = allocation;
uuid_copy(metric->uuid, entry->uuid);
metric->section = entry->section;
metric->first_time_s = entry->first_time_s;
@ -136,7 +213,10 @@ static METRIC *metric_add(MRG *mrg, MRG_ENTRY *entry, bool *ret) {
metric->latest_time_s_hot = 0;
metric->latest_update_every_s = entry->latest_update_every_s;
metric->writer = 0;
metric->refcount = 0;
metric->flags = 0;
netdata_spinlock_init(&metric->spinlock);
metric_acquire(mrg, metric, true); // no spinlock use required here
*PValue = metric;
mrg_index_write_unlock(mrg, partition);
@ -149,7 +229,7 @@ static METRIC *metric_add(MRG *mrg, MRG_ENTRY *entry, bool *ret) {
return metric;
}
static METRIC *metric_get(MRG *mrg, uuid_t *uuid, Word_t section) {
static METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) {
size_t partition = uuid_partition(mrg, uuid);
mrg_index_read_lock(mrg, partition);
@ -170,19 +250,27 @@ static METRIC *metric_get(MRG *mrg, uuid_t *uuid, Word_t section) {
METRIC *metric = *PValue;
metric_acquire(mrg, metric, false);
mrg_index_read_unlock(mrg, partition);
MRG_STATS_SEARCH_HIT(mrg);
return metric;
}
static bool metric_del(MRG *mrg, METRIC *metric) {
static bool acquired_metric_del(MRG *mrg, METRIC *metric) {
size_t partition = uuid_partition(mrg, &metric->uuid);
size_t mem_before_judyl, mem_after_judyl;
mrg_index_write_lock(mrg, partition);
if(!metric_release_and_can_be_deleted(mrg, metric)) {
mrg_index_write_unlock(mrg, partition);
__atomic_add_fetch(&mrg->stats.delete_having_retention_or_referenced, 1, __ATOMIC_RELAXED);
return false;
}
Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t));
if(unlikely(!sections_judy_pptr || !*sections_judy_pptr)) {
mrg_index_write_unlock(mrg, partition);
@ -208,11 +296,10 @@ static bool metric_del(MRG *mrg, METRIC *metric) {
mrg_stats_size_judyhs_removed_uuid(mrg);
}
// arrayalloc is running lockless here
arrayalloc_freez(mrg->index[partition].aral, metric);
mrg_index_write_unlock(mrg, partition);
aral_freez(mrg->aral[partition], metric);
MRG_STATS_DELETED_METRIC(mrg, partition);
return true;
@ -223,11 +310,22 @@ static bool metric_del(MRG *mrg, METRIC *metric) {
MRG *mrg_create(void) {
MRG *mrg = callocz(1, sizeof(MRG));
for(size_t i = 0; i < MRG_PARTITIONS ; i++) {
char buf[ARAL_MAX_NAME + 1];
snprintfz(buf, ARAL_MAX_NAME, "mrg[%zu]", i);
netdata_rwlock_init(&mrg->index[i].rwlock);
mrg->index[i].aral = arrayalloc_create(sizeof(METRIC), 32768 / sizeof(METRIC), NULL, NULL, false, true);
mrg->aral[i] = aral_create("mrg",
sizeof(METRIC),
0,
512,
NULL, NULL, false,
false);
}
mrg->stats.size = sizeof(MRG);
return mrg;
}
@ -242,32 +340,27 @@ void mrg_destroy(MRG *mrg __maybe_unused) {
}
METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret) {
// FIXME - support refcount
// internal_fatal(entry.latest_time_s > max_acceptable_collected_time(),
// "DBENGINE METRIC: metric latest time is in the future");
return metric_add(mrg, &entry, ret);
return metric_add_and_acquire(mrg, &entry, ret);
}
METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) {
// FIXME - support refcount
return metric_get(mrg, uuid, section);
return metric_get_and_acquire(mrg, uuid, section);
}
bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric) {
// FIXME - support refcount
return metric_del(mrg, metric);
return acquired_metric_del(mrg, metric);
}
METRIC *mrg_metric_dup(MRG *mrg __maybe_unused, METRIC *metric) {
// FIXME - duplicate refcount
METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric) {
metric_acquire(mrg, metric, false);
return metric;
}
void mrg_metric_release(MRG *mrg __maybe_unused, METRIC *metric __maybe_unused) {
// FIXME - release refcount
bool mrg_metric_release(MRG *mrg, METRIC *metric) {
return metric_release_and_can_be_deleted(mrg, metric);
}
Word_t mrg_metric_id(MRG *mrg __maybe_unused, METRIC *metric) {
@ -285,6 +378,7 @@ Word_t mrg_metric_section(MRG *mrg __maybe_unused, METRIC *metric) {
bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) {
netdata_spinlock_lock(&metric->spinlock);
metric->first_time_s = first_time_s;
metric_has_retention_unsafe(mrg, metric);
netdata_spinlock_unlock(&metric->spinlock);
return true;
@ -311,6 +405,7 @@ void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t
else if(unlikely(!metric->latest_update_every_s && update_every_s))
metric->latest_update_every_s = update_every_s;
metric_has_retention_unsafe(mrg, metric);
netdata_spinlock_unlock(&metric->spinlock);
}
@ -322,6 +417,7 @@ bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metr
metric->first_time_s = first_time_s;
ret = true;
}
metric_has_retention_unsafe(mrg, metric);
netdata_spinlock_unlock(&metric->spinlock);
return ret;
@ -382,10 +478,63 @@ bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric,
// if(unlikely(metric->first_time_s > latest_time_s))
// metric->first_time_s = latest_time_s;
metric_has_retention_unsafe(mrg, metric);
netdata_spinlock_unlock(&metric->spinlock);
return true;
}
// returns true when metric still has retention
bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) {
Word_t section = mrg_metric_section(mrg, metric);
bool do_again = false;
size_t countdown = 5;
bool ret = true;
do {
time_t min_first_time_s = LONG_MAX;
time_t max_end_time_s = 0;
PGC_PAGE *page;
PGC_SEARCH method = PGC_SEARCH_FIRST;
time_t page_first_time_s = 0;
time_t page_end_time_s = 0;
while ((page = pgc_page_get_and_acquire(main_cache, section, (Word_t)metric, page_first_time_s, method))) {
method = PGC_SEARCH_NEXT;
bool is_hot = pgc_is_page_hot(page);
bool is_dirty = pgc_is_page_dirty(page);
page_first_time_s = pgc_page_start_time_s(page);
page_end_time_s = pgc_page_end_time_s(page);
if ((is_hot || is_dirty) && page_first_time_s < min_first_time_s)
min_first_time_s = page_first_time_s;
if (is_dirty && page_end_time_s > max_end_time_s)
max_end_time_s = page_end_time_s;
pgc_page_release(main_cache, page);
}
if (min_first_time_s == LONG_MAX)
min_first_time_s = 0;
netdata_spinlock_lock(&metric->spinlock);
if (--countdown && !min_first_time_s && metric->latest_time_s_hot)
do_again = true;
else {
internal_error(!countdown, "METRIC: giving up on updating the retention of metric without disk retention");
do_again = false;
metric->first_time_s = min_first_time_s;
metric->latest_time_s_clean = max_end_time_s;
ret = metric_has_retention_unsafe(mrg, metric);
}
netdata_spinlock_unlock(&metric->spinlock);
} while(do_again);
return ret;
}
bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) {
// internal_fatal(latest_time_s > max_acceptable_collected_time(),
// "DBENGINE METRIC: metric latest time is in the future");
@ -399,6 +548,7 @@ bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, t
// if(unlikely(metric->first_time_s > latest_time_s))
// metric->first_time_s = latest_time_s;
metric_has_retention_unsafe(mrg, metric);
netdata_spinlock_unlock(&metric->spinlock);
return true;
}
@ -444,7 +594,7 @@ time_t mrg_metric_get_update_every_s(MRG *mrg __maybe_unused, METRIC *metric) {
return update_every_s;
}
bool mrg_metric_writer_acquire(MRG *mrg, METRIC *metric) {
bool mrg_metric_set_writer(MRG *mrg, METRIC *metric) {
bool done = false;
netdata_spinlock_lock(&metric->spinlock);
if(!metric->writer) {
@ -452,11 +602,13 @@ bool mrg_metric_writer_acquire(MRG *mrg, METRIC *metric) {
__atomic_add_fetch(&mrg->stats.writers, 1, __ATOMIC_RELAXED);
done = true;
}
else
__atomic_add_fetch(&mrg->stats.writers_conflicts, 1, __ATOMIC_RELAXED);
netdata_spinlock_unlock(&metric->spinlock);
return done;
}
bool mrg_metric_writer_release(MRG *mrg, METRIC *metric) {
bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric) {
bool done = false;
netdata_spinlock_lock(&metric->spinlock);
if(metric->writer) {
@ -584,73 +736,80 @@ static void *mrg_stress_test_thread3(void *ptr) {
int mrg_unittest(void) {
MRG *mrg = mrg_create();
METRIC *metric1, *metric2;
METRIC *m1_t0, *m2_t0, *m3_t0, *m4_t0;
METRIC *m1_t1, *m2_t1, *m3_t1, *m4_t1;
bool ret;
MRG_ENTRY entry = {
.section = 1,
.section = 0,
.first_time_s = 2,
.last_time_s = 3,
.latest_update_every_s = 4,
};
uuid_generate(entry.uuid);
metric1 = mrg_metric_add_and_acquire(mrg, entry, &ret);
m1_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret);
if(!ret)
fatal("DBENGINE METRIC: failed to add metric");
// add the same metric again
if(mrg_metric_add_and_acquire(mrg, entry, &ret) != metric1)
m2_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret);
if(m2_t0 != m1_t0)
fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer");
if(ret)
fatal("DBENGINE METRIC: managed to add the same metric twice");
if(mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section) != metric1)
m3_t0 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section);
if(m3_t0 != m1_t0)
fatal("DBENGINE METRIC: cannot find the metric added");
// add the same metric again
if(mrg_metric_add_and_acquire(mrg, entry, &ret) != metric1)
m4_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret);
if(m4_t0 != m1_t0)
fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer");
if(ret)
fatal("DBENGINE METRIC: managed to add the same metric twice");
// add the same metric in another section
entry.section = 0;
metric2 = mrg_metric_add_and_acquire(mrg, entry, &ret);
entry.section = 1;
m1_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret);
if(!ret)
fatal("DBENGINE METRIC: failed to add metric in different section");
fatal("DBENGINE METRIC: failed to add metric in section %zu", (size_t)entry.section);
// add the same metric again
if(mrg_metric_add_and_acquire(mrg, entry, &ret) != metric2)
fatal("DBENGINE METRIC: adding the same metric twice (section 0), does not return the same pointer");
m2_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret);
if(m2_t1 != m1_t1)
fatal("DBENGINE METRIC: adding the same metric twice (section %zu), does not return the same pointer", (size_t)entry.section);
if(ret)
fatal("DBENGINE METRIC: managed to add the same metric twice in (section 0)");
if(mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section) != metric2)
fatal("DBENGINE METRIC: cannot find the metric added (section 0)");
m3_t1 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section);
if(m3_t1 != m1_t1)
fatal("DBENGINE METRIC: cannot find the metric added (section %zu)", (size_t)entry.section);
// delete the first metric
if(!mrg_metric_release_and_delete(mrg, metric1))
mrg_metric_release(mrg, m2_t0);
mrg_metric_release(mrg, m3_t0);
mrg_metric_release(mrg, m4_t0);
mrg_metric_set_first_time_s(mrg, m1_t0, 0);
mrg_metric_set_clean_latest_time_s(mrg, m1_t0, 0);
mrg_metric_set_hot_latest_time_s(mrg, m1_t0, 0);
if(!mrg_metric_release_and_delete(mrg, m1_t0))
fatal("DBENGINE METRIC: cannot delete the first metric");
if(mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section) != metric2)
fatal("DBENGINE METRIC: cannot find the metric added (section 0), after deleting the first one");
// delete the first metric again - metric1 pointer is invalid now
if(mrg_metric_release_and_delete(mrg, metric1))
fatal("DBENGINE METRIC: deleted again an already deleted metric");
// find the section 0 metric again
if(mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section) != metric2)
fatal("DBENGINE METRIC: cannot find the metric added (section 0), after deleting the first one twice");
m4_t1 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section);
if(m4_t1 != m1_t1)
fatal("DBENGINE METRIC: cannot find the metric added (section %zu), after deleting the first one", (size_t)entry.section);
// delete the second metric
if(!mrg_metric_release_and_delete(mrg, metric2))
mrg_metric_release(mrg, m2_t1);
mrg_metric_release(mrg, m3_t1);
mrg_metric_release(mrg, m4_t1);
mrg_metric_set_first_time_s(mrg, m1_t1, 0);
mrg_metric_set_clean_latest_time_s(mrg, m1_t1, 0);
mrg_metric_set_hot_latest_time_s(mrg, m1_t1, 0);
if(!mrg_metric_release_and_delete(mrg, m1_t1))
fatal("DBENGINE METRIC: cannot delete the second metric");
// delete the second metric again
if(mrg_metric_release_and_delete(mrg, metric2))
fatal("DBENGINE METRIC: managed to delete an already deleted metric");
if(mrg->stats.entries != 0)
fatal("DBENGINE METRIC: invalid entries counter");

View File

@ -18,23 +18,32 @@ typedef struct mrg_entry {
struct mrg_statistics {
size_t entries;
size_t size; // memory without indexing
size_t entries_referenced;
size_t entries_with_retention;
size_t size; // total memory used, with indexing
size_t current_references;
size_t additions;
size_t additions_duplicate;
size_t deletions;
size_t delete_having_retention_or_referenced;
size_t delete_misses;
size_t search_hits;
size_t search_misses;
size_t pointer_validation_hits;
size_t pointer_validation_misses;
size_t writers;
size_t writers_conflicts;
};
MRG *mrg_create(void);
void mrg_destroy(MRG *mrg);
METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric);
void mrg_metric_release(MRG *mrg, METRIC *metric);
bool mrg_metric_release(MRG *mrg, METRIC *metric);
METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret);
METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section);
@ -58,9 +67,10 @@ time_t mrg_metric_get_update_every_s(MRG *mrg, METRIC *metric);
void mrg_metric_expand_retention(MRG *mrg, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s);
void mrg_metric_get_retention(MRG *mrg, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s);
bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric);
bool mrg_metric_writer_acquire(MRG *mrg, METRIC *metric);
bool mrg_metric_writer_release(MRG *mrg, METRIC *metric);
bool mrg_metric_set_writer(MRG *mrg, METRIC *metric);
bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric);
struct mrg_statistics mrg_get_statistics(MRG *mrg);
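
The renamed writer helpers above act as a per-metric flag rather than a reference count: only one collector can hold the writer slot at a time. A hedged sketch of the intended usage, based solely on the signatures in this hunk (the caller shown is hypothetical):

    // hypothetical collector-side usage of the writer flag
    if(mrg_metric_set_writer(mrg, metric)) {
        // we are the only writer of this metric -- store collected points here
        mrg_metric_clear_writer(mrg, metric);   // done collecting
    }
    else {
        // another collector already owns this metric;
        // the conflict is counted in mrg_statistics.writers_conflicts
    }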

View File

@ -23,6 +23,9 @@ static void main_cache_flush_dirty_page_init_callback(PGC *cache __maybe_unused,
static void main_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused)
{
if(!entries)
return;
struct rrdengine_instance *ctx = (struct rrdengine_instance *) entries_array[0].section;
size_t bytes_per_point = CTX_POINT_SIZE_BYTES(ctx);
@ -50,8 +53,8 @@ static void main_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_
error_limit(&erl, "DBENGINE: page exceeds the maximum size, adjusting it to max.");
}
memcpy(descr->page, pgc_page_data(pages_array[Index]), descr->page_length);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(base, descr, link.prev, link.next);
descr->page = pgc_page_data(pages_array[Index]);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, descr, link.prev, link.next);
internal_fatal(descr->page_length > RRDENG_BLOCK_SIZE, "DBENGINE: faulty page length calculation");
}
@ -1074,6 +1077,7 @@ void init_page_cache(void)
}
main_cache = pgc_create(
"main_cache",
main_cache_size,
main_cache_free_clean_page_callback,
(size_t) rrdeng_pages_per_extent,
@ -1089,6 +1093,7 @@ void init_page_cache(void)
);
open_cache = pgc_create(
"open_cache",
open_cache_size, // the default is 1MB
open_cache_free_clean_page_callback,
1,
@ -1105,6 +1110,7 @@ void init_page_cache(void)
pgc_set_dynamic_target_cache_size_callback(open_cache, dynamic_open_cache_size);
extent_cache = pgc_create(
"extent_cache",
extent_cache_size,
extent_cache_free_clean_page_callback,
1,

View File

@ -27,7 +27,7 @@ struct page_descr_with_data {
uint8_t type;
uint32_t update_every_s;
uint32_t page_length;
uint8_t page[RRDENG_BLOCK_SIZE];
uint8_t *page;
struct {
struct page_descr_with_data *prev;

View File

@ -68,7 +68,7 @@ void pdc_cleanup1(void) {
if(pdc_globals.protected.available_items && pdc_globals.protected.available > (size_t)libuv_worker_threads) {
item = pdc_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(pdc_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(pdc_globals.protected.available_items, item, cache.prev, cache.next);
pdc_globals.protected.available--;
}
@ -87,7 +87,7 @@ PDC *pdc_get(void) {
if(likely(pdc_globals.protected.available_items)) {
pdc = pdc_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(pdc_globals.protected.available_items, pdc, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(pdc_globals.protected.available_items, pdc, cache.prev, cache.next);
pdc_globals.protected.available--;
}
@ -106,7 +106,7 @@ static void pdc_release(PDC *pdc) {
if(unlikely(!pdc)) return;
netdata_spinlock_lock(&pdc_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(pdc_globals.protected.available_items, pdc, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pdc_globals.protected.available_items, pdc, cache.prev, cache.next);
pdc_globals.protected.available++;
netdata_spinlock_unlock(&pdc_globals.protected.spinlock);
}
@ -147,7 +147,7 @@ void page_details_cleanup1(void) {
if(page_details_globals.protected.available_items && page_details_globals.protected.available > (size_t)libuv_worker_threads * 2) {
item = page_details_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(page_details_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(page_details_globals.protected.available_items, item, cache.prev, cache.next);
page_details_globals.protected.available--;
}
@ -166,7 +166,7 @@ struct page_details *page_details_get(void) {
if(likely(page_details_globals.protected.available_items)) {
pd = page_details_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(page_details_globals.protected.available_items, pd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(page_details_globals.protected.available_items, pd, cache.prev, cache.next);
page_details_globals.protected.available--;
}
@ -185,7 +185,7 @@ static void page_details_release(struct page_details *pd) {
if(unlikely(!pd)) return;
netdata_spinlock_lock(&page_details_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(page_details_globals.protected.available_items, pd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(page_details_globals.protected.available_items, pd, cache.prev, cache.next);
page_details_globals.protected.available++;
netdata_spinlock_unlock(&page_details_globals.protected.spinlock);
}
@ -226,7 +226,7 @@ void epdl_cleanup1(void) {
if(epdl_globals.protected.available_items && epdl_globals.protected.available > 100) {
item = epdl_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(epdl_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(epdl_globals.protected.available_items, item, cache.prev, cache.next);
epdl_globals.protected.available--;
}
@ -245,7 +245,7 @@ static EPDL *epdl_get(void) {
if(likely(epdl_globals.protected.available_items)) {
epdl = epdl_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(epdl_globals.protected.available_items, epdl, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(epdl_globals.protected.available_items, epdl, cache.prev, cache.next);
epdl_globals.protected.available--;
}
@ -264,7 +264,7 @@ static void epdl_release(EPDL *epdl) {
if(unlikely(!epdl)) return;
netdata_spinlock_lock(&epdl_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(epdl_globals.protected.available_items, epdl, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(epdl_globals.protected.available_items, epdl, cache.prev, cache.next);
epdl_globals.protected.available++;
netdata_spinlock_unlock(&epdl_globals.protected.spinlock);
}
@ -305,7 +305,7 @@ void deol_cleanup1(void) {
if(deol_globals.protected.available_items && deol_globals.protected.available > 100) {
item = deol_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(deol_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(deol_globals.protected.available_items, item, cache.prev, cache.next);
deol_globals.protected.available--;
}
@ -324,7 +324,7 @@ static DEOL *deol_get(void) {
if(likely(deol_globals.protected.available_items)) {
deol = deol_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(deol_globals.protected.available_items, deol, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(deol_globals.protected.available_items, deol, cache.prev, cache.next);
deol_globals.protected.available--;
}
@ -343,7 +343,7 @@ static void deol_release(DEOL *deol) {
if(unlikely(!deol)) return;
netdata_spinlock_lock(&deol_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(deol_globals.protected.available_items, deol, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(deol_globals.protected.available_items, deol, cache.prev, cache.next);
deol_globals.protected.available++;
netdata_spinlock_unlock(&deol_globals.protected.spinlock);
}
@ -399,7 +399,7 @@ void extent_buffer_cleanup1(void) {
if(extent_buffer_globals.protected.available_items && extent_buffer_globals.protected.available > 1) {
item = extent_buffer_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(extent_buffer_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, item, cache.prev, cache.next);
extent_buffer_globals.protected.available--;
}
@ -424,7 +424,7 @@ struct extent_buffer *extent_buffer_get(size_t size) {
netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock);
if(likely(extent_buffer_globals.protected.available_items)) {
eb = extent_buffer_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next);
extent_buffer_globals.protected.available--;
}
netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock);
@ -452,7 +452,7 @@ void extent_buffer_release(struct extent_buffer *eb) {
if(unlikely(!eb)) return;
netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next);
extent_buffer_globals.protected.available++;
netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock);
}
@ -671,7 +671,7 @@ static bool epdl_pending_add(EPDL *epdl) {
rrdeng_req_cmd(epdl_get_cmd, base, epdl->pdc->priority);
}
DOUBLE_LINKED_LIST_APPEND_UNSAFE(base, epdl, query.prev, query.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, epdl, query.prev, query.next);
*PValue = base;
netdata_spinlock_unlock(&epdl->datafile->extent_queries.spinlock);
@ -1009,7 +1009,7 @@ static inline struct page_details *epdl_get_pd_load_link_list_from_metric_start_
if (unlikely(__atomic_load_n(&ep->pdc->workers_should_stop, __ATOMIC_RELAXED)))
pdc_page_status_set(pd, PDC_PAGE_FAILED | PDC_PAGE_CANCELLED);
else
DOUBLE_LINKED_LIST_APPEND_UNSAFE(pd_list, pd, load.prev, load.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pd_list, pd, load.prev, load.next);
}
}
}
@ -1259,6 +1259,35 @@ static bool epdl_populate_pages_from_extent_data(
return true;
}
static inline void *datafile_extent_read(struct rrdengine_instance *ctx, uv_file file, unsigned pos, unsigned size_bytes)
{
void *buffer;
uv_fs_t request;
unsigned real_io_size = ALIGN_BYTES_CEILING(size_bytes);
int ret = posix_memalign(&buffer, RRDFILE_ALIGNMENT, real_io_size);
if (unlikely(ret))
fatal("DBENGINE: posix_memalign(): %s", strerror(ret));
uv_buf_t iov = uv_buf_init(buffer, real_io_size);
ret = uv_fs_read(NULL, &request, file, &iov, 1, pos, NULL);
if (unlikely(-1 == ret)) {
ctx_io_error(ctx);
posix_memfree(buffer);
buffer = NULL;
}
else
ctx_io_read_op_bytes(ctx, real_io_size);
uv_fs_req_cleanup(&request);
return buffer;
}
static inline void datafile_extent_read_free(void *buffer) {
posix_memfree(buffer);
}
void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker) {
size_t *statistics_counter = NULL;
PDC_PAGE_STATUS not_loaded_pages_tag = 0, loaded_pages_tag = 0;
@ -1306,18 +1335,12 @@ void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *e
if(worker)
worker_is_busy(UV_EVENT_DBENGINE_EXTENT_MMAP);
off_t map_start = ALIGN_BYTES_FLOOR(epdl->extent_offset);
size_t length = ALIGN_BYTES_CEILING(epdl->extent_offset + epdl->extent_size) - map_start;
void *mmap_data = mmap(NULL, length, PROT_READ, MAP_SHARED, epdl->file, map_start);
if(mmap_data != MAP_FAILED) {
extent_compressed_data = mmap_data + (epdl->extent_offset - map_start);
void *extent_data = datafile_extent_read(ctx, epdl->file, epdl->extent_offset, epdl->extent_size);
if(extent_data != NULL) {
void *copied_extent_compressed_data = dbengine_extent_alloc(epdl->extent_size);
memcpy(copied_extent_compressed_data, extent_compressed_data, epdl->extent_size);
int ret = munmap(mmap_data, length);
fatal_assert(0 == ret);
memcpy(copied_extent_compressed_data, extent_data, epdl->extent_size);
datafile_extent_read_free(extent_data);
if(worker)
worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP);

View File

@ -124,7 +124,7 @@ static void work_request_cleanup1(void) {
if(work_request_globals.protected.available_items && work_request_globals.protected.available > (size_t)libuv_worker_threads) {
item = work_request_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(work_request_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(work_request_globals.protected.available_items, item, cache.prev, cache.next);
work_request_globals.protected.available--;
}
netdata_spinlock_unlock(&work_request_globals.protected.spinlock);
@ -137,7 +137,7 @@ static void work_request_cleanup1(void) {
static inline void work_done(struct rrdeng_work *work_request) {
netdata_spinlock_lock(&work_request_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(work_request_globals.protected.available_items, work_request, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(work_request_globals.protected.available_items, work_request, cache.prev, cache.next);
work_request_globals.protected.available++;
netdata_spinlock_unlock(&work_request_globals.protected.spinlock);
}
@ -183,7 +183,7 @@ static bool work_dispatch(struct rrdengine_instance *ctx, void *data, struct com
if(likely(work_request_globals.protected.available_items)) {
work_request = work_request_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(work_request_globals.protected.available_items, work_request, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(work_request_globals.protected.available_items, work_request, cache.prev, cache.next);
work_request_globals.protected.available--;
}
@ -246,7 +246,7 @@ static void page_descriptor_cleanup1(void) {
if(page_descriptor_globals.protected.available_items && page_descriptor_globals.protected.available > MAX_PAGES_PER_EXTENT) {
item = page_descriptor_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(page_descriptor_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(page_descriptor_globals.protected.available_items, item, cache.prev, cache.next);
page_descriptor_globals.protected.available--;
}
@ -265,7 +265,7 @@ struct page_descr_with_data *page_descriptor_get(void) {
if(likely(page_descriptor_globals.protected.available_items)) {
descr = page_descriptor_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(page_descriptor_globals.protected.available_items, descr, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(page_descriptor_globals.protected.available_items, descr, cache.prev, cache.next);
page_descriptor_globals.protected.available--;
}
@ -284,7 +284,7 @@ static inline void page_descriptor_release(struct page_descr_with_data *descr) {
if(unlikely(!descr)) return;
netdata_spinlock_lock(&page_descriptor_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(page_descriptor_globals.protected.available_items, descr, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(page_descriptor_globals.protected.available_items, descr, cache.prev, cache.next);
page_descriptor_globals.protected.available++;
netdata_spinlock_unlock(&page_descriptor_globals.protected.spinlock);
}
@ -322,7 +322,7 @@ static void extent_io_descriptor_cleanup1(void) {
if(extent_io_descriptor_globals.protected.available_items && extent_io_descriptor_globals.protected.available > (size_t)libuv_worker_threads) {
item = extent_io_descriptor_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(extent_io_descriptor_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_io_descriptor_globals.protected.available_items, item, cache.prev, cache.next);
extent_io_descriptor_globals.protected.available--;
}
netdata_spinlock_unlock(&extent_io_descriptor_globals.protected.spinlock);
@ -340,7 +340,7 @@ static struct extent_io_descriptor *extent_io_descriptor_get(void) {
if(likely(extent_io_descriptor_globals.protected.available_items)) {
xt_io_descr = extent_io_descriptor_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(extent_io_descriptor_globals.protected.available_items, xt_io_descr, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_io_descriptor_globals.protected.available_items, xt_io_descr, cache.prev, cache.next);
extent_io_descriptor_globals.protected.available--;
}
@ -359,7 +359,7 @@ static inline void extent_io_descriptor_release(struct extent_io_descriptor *xt_
if(unlikely(!xt_io_descr)) return;
netdata_spinlock_lock(&extent_io_descriptor_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(extent_io_descriptor_globals.protected.available_items, xt_io_descr, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(extent_io_descriptor_globals.protected.available_items, xt_io_descr, cache.prev, cache.next);
extent_io_descriptor_globals.protected.available++;
netdata_spinlock_unlock(&extent_io_descriptor_globals.protected.spinlock);
}
@ -396,7 +396,7 @@ static void rrdeng_query_handle_cleanup1(void) {
if(rrdeng_query_handle_globals.protected.available_items && rrdeng_query_handle_globals.protected.available > 10) {
item = rrdeng_query_handle_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(rrdeng_query_handle_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_query_handle_globals.protected.available_items, item, cache.prev, cache.next);
rrdeng_query_handle_globals.protected.available--;
}
@ -415,7 +415,7 @@ struct rrdeng_query_handle *rrdeng_query_handle_get(void) {
if(likely(rrdeng_query_handle_globals.protected.available_items)) {
handle = rrdeng_query_handle_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(rrdeng_query_handle_globals.protected.available_items, handle, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_query_handle_globals.protected.available_items, handle, cache.prev, cache.next);
rrdeng_query_handle_globals.protected.available--;
}
@ -434,7 +434,7 @@ void rrdeng_query_handle_release(struct rrdeng_query_handle *handle) {
if(unlikely(!handle)) return;
netdata_spinlock_lock(&rrdeng_query_handle_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(rrdeng_query_handle_globals.protected.available_items, handle, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_query_handle_globals.protected.available_items, handle, cache.prev, cache.next);
rrdeng_query_handle_globals.protected.available++;
netdata_spinlock_unlock(&rrdeng_query_handle_globals.protected.spinlock);
}
@ -471,7 +471,7 @@ static void wal_cleanup1(void) {
if(wal_globals.protected.available_items && wal_globals.protected.available > storage_tiers) {
wal = wal_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
wal_globals.protected.available--;
}
@ -494,7 +494,7 @@ WAL *wal_get(struct rrdengine_instance *ctx, unsigned size) {
if(likely(wal_globals.protected.available_items)) {
wal = wal_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
wal_globals.protected.available--;
}
@ -532,7 +532,7 @@ void wal_release(WAL *wal) {
if(unlikely(!wal)) return;
netdata_spinlock_lock(&wal_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
wal_globals.protected.available++;
netdata_spinlock_unlock(&wal_globals.protected.spinlock);
}
@ -596,7 +596,7 @@ static void rrdeng_cmd_cleanup1(void) {
if(rrdeng_cmd_globals.cache.available_items && rrdeng_cmd_globals.cache.available > 100) {
item = rrdeng_cmd_globals.cache.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(rrdeng_cmd_globals.cache.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_cmd_globals.cache.available_items, item, cache.prev, cache.next);
rrdeng_cmd_globals.cache.available--;
}
netdata_spinlock_unlock(&rrdeng_cmd_globals.cache.spinlock);
@ -639,8 +639,8 @@ void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY
priority = rrdeng_enq_cmd_map_opcode_to_priority(cmd->opcode, priority);
if (cmd->priority > priority) {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[cmd->priority], cmd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[priority], cmd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[cmd->priority], cmd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[priority], cmd, cache.prev, cache.next);
cmd->priority = priority;
}
}
@ -657,7 +657,7 @@ void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, v
netdata_spinlock_lock(&rrdeng_cmd_globals.cache.spinlock);
if(likely(rrdeng_cmd_globals.cache.available_items)) {
cmd = rrdeng_cmd_globals.cache.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(rrdeng_cmd_globals.cache.available_items, cmd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_cmd_globals.cache.available_items, cmd, cache.prev, cache.next);
rrdeng_cmd_globals.cache.available--;
}
netdata_spinlock_unlock(&rrdeng_cmd_globals.cache.spinlock);
@ -676,7 +676,7 @@ void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, v
cmd->dequeue_cb = dequeue_cb;
netdata_spinlock_lock(&rrdeng_cmd_globals.queue.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[priority], cmd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[priority], cmd, cache.prev, cache.next);
rrdeng_cmd_globals.queue.waiting++;
if(enqueue_cb)
enqueue_cb(cmd);
@ -715,7 +715,7 @@ static inline struct rrdeng_cmd rrdeng_deq_cmd(void) {
}
// remove it from the queue
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[priority], cmd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_cmd_globals.queue.waiting_items_by_priority[priority], cmd, cache.prev, cache.next);
rrdeng_cmd_globals.queue.waiting--;
break;
}
@ -735,7 +735,7 @@ static inline struct rrdeng_cmd rrdeng_deq_cmd(void) {
// put it in the cache
netdata_spinlock_lock(&rrdeng_cmd_globals.cache.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(rrdeng_cmd_globals.cache.available_items, cmd, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_cmd_globals.cache.available_items, cmd, cache.prev, cache.next);
rrdeng_cmd_globals.cache.available++;
netdata_spinlock_unlock(&rrdeng_cmd_globals.cache.spinlock);
}
@ -968,10 +968,6 @@ static void *extent_flushed_to_open_tp_worker(struct rrdengine_instance *ctx __m
struct rrdengine_datafile *datafile;
unsigned i;
if (uv_fs_request->result < 0) {
ctx_io_error(ctx);
error("DBENGINE: %s: uv_fs_write: %s", __func__, uv_strerror((int)uv_fs_request->result));
}
datafile = xt_io_descr->datafile;
bool still_running = ctx_is_available_for_queries(ctx);
@ -1014,6 +1010,11 @@ static void after_extent_write_datafile_io(uv_fs_t *uv_fs_request) {
struct rrdengine_datafile *datafile = xt_io_descr->datafile;
struct rrdengine_instance *ctx = datafile->ctx;
if (uv_fs_request->result < 0) {
ctx_io_error(ctx);
error("DBENGINE: %s: uv_fs_write(): %s", __func__, uv_strerror((int)uv_fs_request->result));
}
journalfile_v1_extent_write(ctx, xt_io_descr->datafile, xt_io_descr->wal, &rrdeng_main.loop);
netdata_spinlock_lock(&datafile->writers.spinlock);
@ -1398,7 +1399,7 @@ void find_uuid_first_time(
}
}
internal_error(true,
"DBENGINE: analyzed the retention of %zu rotated metrics, "
"DBENGINE: analyzed the retention of %zu rotated metrics of tier %d, "
"did %zu jv2 matching binary searches (%zu not matching, %zu overflown) in %u journal files, "
"%zu metrics with entries in open cache, "
"metrics first time found per datafile index ([not in jv2]:%zu, [1]:%zu, [2]:%zu, [3]:%zu, [4]:%zu, [5]:%zu, [6]:%zu, [7]:%zu, [8]:%zu, [bigger]: %zu), "
@ -1406,6 +1407,7 @@ void find_uuid_first_time(
"metrics without any remaining retention %zu, "
"metrics not in MRG %zu",
metric_count,
ctx->config.tier,
binary_match,
not_matching_bsearches,
not_needed_bsearches,
@ -1446,7 +1448,8 @@ static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct r
added++;
}
info("DBENGINE: recalculating retention for %zu metrics starting with datafile %u", count, first_datafile_remaining->fileno);
info("DBENGINE: recalculating tier %d retention for %zu metrics starting with datafile %u",
ctx->config.tier, count, first_datafile_remaining->fileno);
journalfile_v2_data_release(journalfile);
@ -1460,18 +1463,40 @@ static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct r
if(worker)
worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG);
info("DBENGINE: updating metric registry retention for %zu metrics", added);
info("DBENGINE: updating tier %d metrics registry retention for %zu metrics",
ctx->config.tier, added);
size_t deleted_metrics = 0, zero_retention_referenced = 0, zero_disk_retention = 0, zero_disk_but_live = 0;
for (size_t index = 0; index < added; ++index) {
uuid_first_t_entry = &uuid_first_entry_list[index];
if (likely(uuid_first_t_entry->first_time_s != LONG_MAX))
if (likely(uuid_first_t_entry->first_time_s != LONG_MAX)) {
mrg_metric_set_first_time_s_if_bigger(main_mrg, uuid_first_t_entry->metric, uuid_first_t_entry->first_time_s);
else
mrg_metric_set_first_time_s(main_mrg, uuid_first_t_entry->metric, 0);
mrg_metric_release(main_mrg, uuid_first_t_entry->metric);
mrg_metric_release(main_mrg, uuid_first_t_entry->metric);
}
else {
zero_disk_retention++;
// there is no retention for this metric
bool has_retention = mrg_metric_zero_disk_retention(main_mrg, uuid_first_t_entry->metric);
if (!has_retention) {
bool deleted = mrg_metric_release_and_delete(main_mrg, uuid_first_t_entry->metric);
if(deleted)
deleted_metrics++;
else
zero_retention_referenced++;
}
else {
zero_disk_but_live++;
mrg_metric_release(main_mrg, uuid_first_t_entry->metric);
}
}
}
freez(uuid_first_entry_list);
internal_error(zero_disk_retention,
"DBENGINE: deleted %zu metrics, zero retention but referenced %zu (out of %zu total, of which %zu have main cache retention) zero on-disk retention tier %d metrics from metrics registry",
deleted_metrics, zero_retention_referenced, zero_disk_retention, zero_disk_but_live, ctx->config.tier);
if(worker)
worker_is_idle();
}

View File

@ -243,7 +243,7 @@ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metri
struct rrdengine_instance *ctx = mrg_metric_ctx(metric);
bool is_1st_metric_writer = true;
if(!mrg_metric_writer_acquire(main_mrg, metric)) {
if(!mrg_metric_set_writer(main_mrg, metric)) {
is_1st_metric_writer = false;
char uuid[UUID_STR_LEN + 1];
uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid);
@ -696,7 +696,7 @@ int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle) {
if(!(handle->options & RRDENG_1ST_METRIC_WRITER))
__atomic_sub_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED);
if((handle->options & RRDENG_1ST_METRIC_WRITER) && !mrg_metric_writer_release(main_mrg, handle->metric))
if((handle->options & RRDENG_1ST_METRIC_WRITER) && !mrg_metric_clear_writer(main_mrg, handle->metric))
internal_fatal(true, "DBENGINE: metric is already released");
time_t first_time_s, last_time_s, update_every_s;
@ -738,12 +738,12 @@ static void register_query_handle(struct rrdeng_query_handle *handle) {
handle->started_time_s = now_realtime_sec();
netdata_spinlock_lock(&global_query_handle_spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(global_query_handle_ll, handle, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next);
netdata_spinlock_unlock(&global_query_handle_spinlock);
}
static void unregister_query_handle(struct rrdeng_query_handle *handle) {
netdata_spinlock_lock(&global_query_handle_spinlock);
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(global_query_handle_ll, handle, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next);
netdata_spinlock_unlock(&global_query_handle_spinlock);
}
#else

View File

@ -436,8 +436,10 @@ void rrddim_memory_file_save(RRDDIM *rd);
(x).end_time_s = end_s; \
} while(0)
#define STORAGE_POINT_UNSET { .min = NAN, .max = NAN, .sum = NAN, .count = 0, .anomaly_count = 0, .flags = SN_FLAG_NONE, .start_time_s = 0, .end_time_s = 0 }
#define storage_point_is_unset(x) (!(x).count)
#define storage_point_is_empty(x) (!netdata_double_isnumber((x).sum))
#define storage_point_is_gap(x) (!netdata_double_isnumber((x).sum))
// ------------------------------------------------------------------------
// function pointers that handle data collection

View File

@ -181,7 +181,7 @@ static void rrdcalc_link_to_rrdset(RRDSET *st, RRDCALC *rc) {
rc->rrdset = st;
netdata_rwlock_wrlock(&st->alerts.rwlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(st->alerts.base, rc, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(st->alerts.base, rc, prev, next);
netdata_rwlock_unlock(&st->alerts.rwlock);
if(rc->update_every < rc->rrdset->update_every) {
@ -328,7 +328,7 @@ static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) {
if(!having_ll_wrlock)
netdata_rwlock_wrlock(&st->alerts.rwlock);
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(st->alerts.base, rc, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(st->alerts.base, rc, prev, next);
if(!having_ll_wrlock)
netdata_rwlock_unlock(&st->alerts.rwlock);

View File

@ -122,6 +122,8 @@ typedef struct query_plan_entry {
size_t tier;
time_t after;
time_t before;
time_t expanded_after;
time_t expanded_before;
struct storage_engine_query_handle handle;
STORAGE_POINT (*next_metric)(struct storage_engine_query_handle *handle);
int (*is_finished)(struct storage_engine_query_handle *handle);

View File

@ -477,9 +477,9 @@ int is_legacy = 1;
rrdhost_index_add_hostname(host);
if(is_localhost)
DOUBLE_LINKED_LIST_PREPEND_UNSAFE(localhost, host, prev, next);
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(localhost, host, prev, next);
else
DOUBLE_LINKED_LIST_APPEND_UNSAFE(localhost, host, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(localhost, host, prev, next);
rrd_unlock();
@ -1092,7 +1092,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) {
rrdhost_index_del_by_guid(host);
if (host->prev)
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(localhost, host, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(localhost, host, prev, next);
}
// ------------------------------------------------------------------------

View File

@ -1149,7 +1149,7 @@ void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAG
t->virtual_point.end_time_s = sp.end_time_s;
// merge the values into our virtual point
if (likely(!storage_point_is_empty(sp))) {
if (likely(!storage_point_is_gap(sp))) {
// we aggregate only non NULLs into higher tiers
if (likely(!storage_point_is_unset(t->virtual_point))) {

View File

@ -130,7 +130,7 @@ NETDATA_DOUBLE exporting_calculate_value_from_stored_data(
STORAGE_POINT sp = rd->tiers[0].query_ops->next_metric(&handle);
points_read++;
if (unlikely(storage_point_is_empty(sp))) {
if (unlikely(storage_point_is_gap(sp))) {
// not collected
continue;
}

View File

@ -5,7 +5,7 @@ MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
SUBDIRS = \
adaptive_resortable_list \
arrayalloc \
aral \
avl \
buffer \
clocks \

169
libnetdata/aral/README.md Normal file
View File

@ -0,0 +1,169 @@
<!--
title: "Array Allocator"
custom_edit_url: https://github.com/netdata/netdata/edit/master/libnetdata/aral/README.md
-->
# Array Allocator
Come on! Array allocators are embedded in libc! Why do we need such a thing in Netdata?
Well, we have a couple of problems to solve:
1. **Fragmentation** - It is important for Netdata to keep its overall memory footprint as low as possible. libc does an amazing job when the same thread allocates and frees some memory. But without knowing the specifics of the application, it simply cannot do better when memory is allocated and freed randomly between threads.
2. **Speed** - Especially when allocations and de-allocations happen across threads, the speed penalty is tremendous.
In Netdata we have a few moments that are very tough. Imagine collecting 1 million metrics per second. You have a buffer for each metric and append new points to it. This works beautifully, of course! But then, when the buffers get full, imagine the situation. You suddenly need 1 million buffers, at once!
To solve this problem we first spread out the buffers. So, the first time each metric asks for a buffer, it gets a smaller one. We added logic there to spread them as evenly as possible across time. Solved? Not exactly!
We have 3 tiers for each metric. For the metrics of tier 0 (per second resolution) we have a max buffer of 1024 points and every new metric gets a random size between 3 and 1024 points. So they are distributed across time. For 1 million metrics, we have about 1000 buffers being created every second.
But at some point, the end of the minute will come, and suddenly all the metrics will need a new buffer for tier 1 (per minute). Oops! We will spread tier 1 buffers across time too, but the first minute is a tough one. We really need 1 million buffers instantly.
And if that minute happens to also be the beginning of an hour... tier 2 (per hour) kicks in. For that instant we are going to need 2 million buffers instantly.
The problem becomes even bigger when we collect 2, or even 10 million metrics...
To solve it, Netdata uses a special implementation of an array allocator that is tightly integrated with the structures we need.
## Features
1. Malloc or MMAP modes. File-based MMAP is also supported, to put the data in file-backed shared memory.
2. Fully asynchronous operations. There are just a couple of points where spin-locks protect a few counters and pointers.
3. Optional defragmenter, which, once enabled, makes free operations slower while trying to maintain a sorted list of fragments to offer first during allocations. The defragmenter can be enabled / disabled at run time. The defragmenter can hurt performance on applications with intense allocation turnaround, like the Netdata dbengine caches. So, it is disabled by default.
4. Without the defragmenter enabled, ARAL still tries to keep pages full, but the depth of the search is limited to 3 pages (so, a page with a free slot will either become 1st, 2nd, or 3rd). At the same time, during allocations, ARAL will evaluate the first 2 pages to find the fuller one and use it for the new allocation.
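For reference, here is how the ARAL public API (declared in `libnetdata/aral/aral.h`, added by this PR) is used. This is a minimal sketch only: `struct my_item` and the page sizes are made up for illustration, and error handling is omitted.
```c
#include "libnetdata/libnetdata.h"

// a hypothetical structure we want to allocate in bulk
struct my_item {
    int value;
    char name[32];
};

void my_items_example(void) {
    // malloc mode (mmap = false), with internal locking (lockless = false)
    ARAL *ar = aral_create("my-items", sizeof(struct my_item),
                           64,           // initial elements per page
                           4096,         // max elements per page
                           NULL, NULL,   // filename / cache dir (only used with mmap)
                           false, false);

    struct my_item *item = aral_mallocz(ar);   // acquire one element
    item->value = 1;

    aral_freez(ar, item);                      // return it to its page
    aral_destroy(ar);                          // release all pages
}
```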
## How it works
Allocations are organized in pages. Pages have a minimum size (a system page, usually 4KB) and a maximum defined for each different kind of object.
Initially every page is free. When an allocation request is made, the free space is split: the first element is reserved and the rest is still considered free.
This continues until the page gets full, at which point a new page is allocated and the process is repeated.
Each allocation returned has a pointer appended to it. The pointer points to the page the allocation belongs to.
When a pointer is freed, the page it belongs to is identified, its space is marked free, and it is prepended to a singly linked list that resides in the page itself. So, each page has its own list of free slots to use.
Pages are kept on another linked list. This is a doubly linked list, with the pages that have free space at its beginning and the full pages at its end.
When the defragmenter is enabled the pages doubly linked list is also kept sorted, like this: the fewer the free slots on a page, the earlier in the linked list the page will be, unless it has no free slots at all, in which case it will be at the end. So, the defragmenter tries to keep pages full.
When a page is entirely free, it is given back to the system immediately. There is no caching of free pages.
Parallelism is achieved like this:
When some threads are waiting for a page to be allocated, free operations are allowed. If a free operation happens before a new page is allocated, any waiting thread will get the slot that is freed on another page.
Free operations happen in parallel, even for the same page. There is a spin-lock on each page to protect the base pointer of the page's free slots singly linked list. But, this is instant. All preparative work happens locklessly; then, to add the free slot to the page, the page spinlock is acquired, the free slot is prepended to the linked list on the page, and the spinlock is released. Such free operations on different pages are totally parallel.
Once the free operation on a page has finished, the pages doubly linked list spinlock is acquired to put the page first on that linked list. If the defragmenter is enabled, the spinlock is retained for a little longer, to find the exact position of the page in the linked list.
During allocations, the reverse order is used. First get the pages doubly linked list spinlock, get the first page and decrement its free slots counter, then release the spinlock. If the first page does not have any free slots, a page allocation is spawned, without any locks acquired. All threads spin waiting for a page with free slots, either from the newly allocated one or from a free operation that may happen in parallel.
Once a page is acquired, each thread locks its own page to get the first free slot and releases the lock immediately. This is guaranteed to succeed, because when the page was given to that thread its free slots counter was decremented. So, there is a free slot for every thread that got that page. All preparative work to return a pointer to the caller is done lock-free. Allocations on different pages are done in parallel, without any interference between them.
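The free path is O(1) because each returned pointer carries its owner page right after the user data (the `page_ptr_offset` logic in `aral.c` below). The following standalone sketch, which is not the actual ARAL code and uses made-up sizes, illustrates that trick:
```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct page { int id; } PAGE;

// pad the requested size to pointer alignment, then add room for the owner pointer
#define REQUESTED_SIZE  20
#define ALIGNED_SIZE    (((REQUESTED_SIZE) + sizeof(uintptr_t) - 1) & ~(sizeof(uintptr_t) - 1))
#define ELEMENT_SIZE    (ALIGNED_SIZE + sizeof(uintptr_t))
#define PAGE_PTR_OFFSET (ELEMENT_SIZE - sizeof(uintptr_t))

int main(void) {
    PAGE page = { .id = 7 };

    // one slot; in ARAL this would come from the page's free list, not malloc()
    uint8_t *element = malloc(ELEMENT_SIZE);
    if (!element) return 1;

    // hide the owner page right after the user data (what aral_mallocz() does)
    *(PAGE **)&element[PAGE_PTR_OFFSET] = &page;

    // recover the owner in O(1) from the element pointer (what aral_freez() does)
    PAGE *owner = *(PAGE **)&element[PAGE_PTR_OFFSET];
    printf("element %p belongs to page %d\n", (void *)element, owner->id);

    free(element);
    return 0;
}
```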
## What to expect
Systems not designed for parallelism achieve their top performance single threaded. The single threaded speed is the baseline. Adding more threads makes them slower.
The baseline for ARAL is the included stress test running single-threaded:
```
Running stress test of 1 threads, with 10000 elements each, for 5 seconds...
2023-01-29 17:04:50: netdata INFO : TH[0] : set name of thread 1314983 to TH[0]
ARAL executes 12.27 M malloc and 12.26 M free operations/s
ARAL executes 12.29 M malloc and 12.29 M free operations/s
ARAL executes 12.30 M malloc and 12.30 M free operations/s
ARAL executes 12.30 M malloc and 12.29 M free operations/s
ARAL executes 12.29 M malloc and 12.29 M free operations/s
Waiting the threads to finish...
2023-01-29 17:04:55: netdata INFO : MAIN : ARAL: did 61487356 malloc, 61487356 free, using 1 threads, in 5003808 usecs
```
The same test with 2 threads, both threads on the same ARAL of course. As you can see, performance improved:
```
Running stress test of 2 threads, with 10000 elements each, for 5 seconds...
2023-01-29 17:05:25: netdata INFO : TH[0] : set name of thread 1315537 to TH[0]
2023-01-29 17:05:25: netdata INFO : TH[1] : set name of thread 1315538 to TH[1]
ARAL executes 17.75 M malloc and 17.73 M free operations/s
ARAL executes 17.93 M malloc and 17.93 M free operations/s
ARAL executes 18.17 M malloc and 18.18 M free operations/s
ARAL executes 18.33 M malloc and 18.32 M free operations/s
ARAL executes 18.36 M malloc and 18.36 M free operations/s
Waiting the threads to finish...
2023-01-29 17:05:30: netdata INFO : MAIN : ARAL: did 90976190 malloc, 90976190 free, using 2 threads, in 5029462 usecs
```
The same test with 4 threads:
```
Running stress test of 4 threads, with 10000 elements each, for 5 seconds...
2023-01-29 17:10:12: netdata INFO : TH[0] : set name of thread 1319552 to TH[0]
2023-01-29 17:10:12: netdata INFO : TH[1] : set name of thread 1319553 to TH[1]
2023-01-29 17:10:12: netdata INFO : TH[2] : set name of thread 1319554 to TH[2]
2023-01-29 17:10:12: netdata INFO : TH[3] : set name of thread 1319555 to TH[3]
ARAL executes 19.95 M malloc and 19.91 M free operations/s
ARAL executes 20.08 M malloc and 20.08 M free operations/s
ARAL executes 20.85 M malloc and 20.85 M free operations/s
ARAL executes 20.84 M malloc and 20.84 M free operations/s
ARAL executes 21.37 M malloc and 21.37 M free operations/s
Waiting the threads to finish...
2023-01-29 17:10:17: netdata INFO : MAIN : ARAL: did 103549747 malloc, 103549747 free, using 4 threads, in 5023325 usecs
```
The same with 8 threads:
```
Running stress test of 8 threads, with 10000 elements each, for 5 seconds...
2023-01-29 17:07:06: netdata INFO : TH[0] : set name of thread 1317608 to TH[0]
2023-01-29 17:07:06: netdata INFO : TH[1] : set name of thread 1317609 to TH[1]
2023-01-29 17:07:06: netdata INFO : TH[2] : set name of thread 1317610 to TH[2]
2023-01-29 17:07:06: netdata INFO : TH[3] : set name of thread 1317611 to TH[3]
2023-01-29 17:07:06: netdata INFO : TH[4] : set name of thread 1317612 to TH[4]
2023-01-29 17:07:06: netdata INFO : TH[5] : set name of thread 1317613 to TH[5]
2023-01-29 17:07:06: netdata INFO : TH[6] : set name of thread 1317614 to TH[6]
2023-01-29 17:07:06: netdata INFO : TH[7] : set name of thread 1317615 to TH[7]
ARAL executes 15.73 M malloc and 15.66 M free operations/s
ARAL executes 13.95 M malloc and 13.94 M free operations/s
ARAL executes 15.59 M malloc and 15.58 M free operations/s
ARAL executes 15.49 M malloc and 15.49 M free operations/s
ARAL executes 16.16 M malloc and 16.16 M free operations/s
Waiting the threads to finish...
2023-01-29 17:07:11: netdata INFO : MAIN : ARAL: did 78427750 malloc, 78427750 free, using 8 threads, in 5088591 usecs
```
The same with 16 threads:
```
Running stress test of 16 threads, with 10000 elements each, for 5 seconds...
2023-01-29 17:08:04: netdata INFO : TH[0] : set name of thread 1318663 to TH[0]
2023-01-29 17:08:04: netdata INFO : TH[1] : set name of thread 1318664 to TH[1]
2023-01-29 17:08:04: netdata INFO : TH[2] : set name of thread 1318665 to TH[2]
2023-01-29 17:08:04: netdata INFO : TH[3] : set name of thread 1318666 to TH[3]
2023-01-29 17:08:04: netdata INFO : TH[4] : set name of thread 1318667 to TH[4]
2023-01-29 17:08:04: netdata INFO : TH[5] : set name of thread 1318668 to TH[5]
2023-01-29 17:08:04: netdata INFO : TH[6] : set name of thread 1318669 to TH[6]
2023-01-29 17:08:04: netdata INFO : TH[7] : set name of thread 1318670 to TH[7]
2023-01-29 17:08:04: netdata INFO : TH[8] : set name of thread 1318671 to TH[8]
2023-01-29 17:08:04: netdata INFO : TH[9] : set name of thread 1318672 to TH[9]
2023-01-29 17:08:04: netdata INFO : TH[10] : set name of thread 1318673 to TH[10]
2023-01-29 17:08:04: netdata INFO : TH[11] : set name of thread 1318674 to TH[11]
2023-01-29 17:08:04: netdata INFO : TH[12] : set name of thread 1318675 to TH[12]
2023-01-29 17:08:04: netdata INFO : TH[13] : set name of thread 1318676 to TH[13]
2023-01-29 17:08:04: netdata INFO : TH[14] : set name of thread 1318677 to TH[14]
2023-01-29 17:08:04: netdata INFO : TH[15] : set name of thread 1318678 to TH[15]
ARAL executes 11.77 M malloc and 11.62 M free operations/s
ARAL executes 12.80 M malloc and 12.81 M free operations/s
ARAL executes 13.26 M malloc and 13.25 M free operations/s
ARAL executes 13.30 M malloc and 13.29 M free operations/s
ARAL executes 13.23 M malloc and 13.25 M free operations/s
Waiting the threads to finish...
2023-01-29 17:08:09: netdata INFO : MAIN : ARAL: did 65302122 malloc, 65302122 free, using 16 threads, in 5066009 usecs
```
As you can see, the top performance is with 4 threads, almost double the single-threaded speed.
Even with 16 threads, performance is still better than single-threaded, despite the intense concurrency.
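The numbers above come from the stress test added by this PR (`aral_stress_test()` in `libnetdata/aral/aral.c` below). A minimal way to reproduce a similar run is through `aral_unittest()`, which is declared in `aral.h` and internally does a single-threaded pass followed by `aral_stress_test(2, elements, 5)`; the standalone `main()` wrapper here is an assumption, not part of the PR:
```c
#include "libnetdata/libnetdata.h"

int main(void) {
    // 10000 elements per thread, matching the runs shown above
    int errors = aral_unittest(10000);
    return errors ? 1 : 0;
}
```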

918
libnetdata/aral/aral.c Normal file
View File

@ -0,0 +1,918 @@
#include "../libnetdata.h"
#include "aral.h"
#ifdef NETDATA_TRACE_ALLOCATIONS
#define TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS , const char *file, const char *function, size_t line
#define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS , file, function, line
#else
#define TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS
#define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS
#endif
#define ARAL_FREE_PAGES_DELTA_TO_REARRANGE_LIST 5
// max file size
#define ARAL_MAX_PAGE_SIZE_MMAP (1*1024*1024*1024)
// max malloc size
// optimal at current versions of libc is up to 256k
// ideal to have the same overhead as libc is 4k
#define ARAL_MAX_PAGE_SIZE_MALLOC (65*1024)
typedef struct aral_free {
size_t size;
struct aral_free *next;
} ARAL_FREE;
typedef struct aral_page {
size_t size; // the allocation size of the page
const char *filename;
uint8_t *data;
uint32_t free_elements_to_move_first;
uint32_t max_elements; // the number of elements that can fit on this page
struct {
uint32_t used_elements; // the number of used elements on this page
uint32_t free_elements; // the number of free elements on this page
} aral_lock;
struct {
SPINLOCK spinlock;
ARAL_FREE *list;
} free;
struct aral_page *prev; // the prev page on the list
struct aral_page *next; // the next page on the list
} ARAL_PAGE;
struct aral {
struct {
char name[ARAL_MAX_NAME + 1];
bool lockless;
bool defragment;
size_t element_size; // calculated to take into account ARAL overheads
size_t max_allocation_size; // calculated in bytes
size_t page_ptr_offset; // calculated
size_t natural_page_size; // calculated
size_t requested_element_size;
size_t initial_page_elements;
size_t max_page_elements;
struct {
bool enabled;
const char *filename;
char **cache_dir;
} mmap;
} config;
struct {
SPINLOCK spinlock;
size_t file_number; // for mmap
struct aral_page *pages; // linked list of pages
size_t user_malloc_operations;
size_t user_free_operations;
size_t defragment_operations;
size_t defragment_linked_list_traversals;
} aral_lock;
struct {
SPINLOCK spinlock;
size_t allocation_size; // current allocation size
} adders;
struct {
} atomic;
};
struct {
struct {
struct {
size_t allocations;
size_t allocated;
} structures;
struct {
size_t allocations;
size_t allocated;
size_t used;
} malloc;
struct {
size_t allocations;
size_t allocated;
size_t used;
} mmap;
} atomic;
} aral_globals = {};
void aral_get_size_statistics(size_t *structures, size_t *malloc_allocated, size_t *malloc_used, size_t *mmap_allocated, size_t *mmap_used) {
*structures = __atomic_load_n(&aral_globals.atomic.structures.allocated, __ATOMIC_RELAXED);
*malloc_allocated = __atomic_load_n(&aral_globals.atomic.malloc.allocated, __ATOMIC_RELAXED);
*malloc_used = __atomic_load_n(&aral_globals.atomic.malloc.used, __ATOMIC_RELAXED);
*mmap_allocated = __atomic_load_n(&aral_globals.atomic.mmap.allocated, __ATOMIC_RELAXED);
*mmap_used = __atomic_load_n(&aral_globals.atomic.mmap.used, __ATOMIC_RELAXED);
}
#define ARAL_NATURAL_ALIGNMENT (sizeof(uintptr_t) * 2)
static inline size_t natural_alignment(size_t size, size_t alignment) {
if(unlikely(size % alignment))
size = size + alignment - (size % alignment);
return size;
}
static size_t aral_align_alloc_size(ARAL *ar, uint64_t size) {
if(size % ar->config.natural_page_size)
size += ar->config.natural_page_size - (size % ar->config.natural_page_size) ;
if(size % ar->config.element_size)
size -= size % ar->config.element_size;
return size;
}
static inline void aral_lock(ARAL *ar) {
if(likely(!ar->config.lockless))
netdata_spinlock_lock(&ar->aral_lock.spinlock);
}
static inline void aral_unlock(ARAL *ar) {
if(likely(!ar->config.lockless))
netdata_spinlock_unlock(&ar->aral_lock.spinlock);
}
static void aral_delete_leftover_files(const char *name, const char *path, const char *required_prefix) {
DIR *dir = opendir(path);
if(!dir) return;
char full_path[FILENAME_MAX + 1];
size_t len = strlen(required_prefix);
struct dirent *de = NULL;
while((de = readdir(dir))) {
if(de->d_type == DT_DIR)
continue;
if(strncmp(de->d_name, required_prefix, len) != 0)
continue;
snprintfz(full_path, FILENAME_MAX, "%s/%s", path, de->d_name);
info("ARAL: '%s' removing left-over file '%s'", name, full_path);
if(unlikely(unlink(full_path) == -1))
error("ARAL: '%s' cannot delete file '%s'", name, full_path);
}
closedir(dir);
}
// ----------------------------------------------------------------------------
// check a free slot
#ifdef NETDATA_INTERNAL_CHECKS
static inline void aral_free_validate_internal_check(ARAL *ar, ARAL_FREE *fr) {
if(unlikely(fr->size < ar->config.element_size))
fatal("ARAL: '%s' free item of size %zu, less than the expected element size %zu",
ar->config.name, fr->size, ar->config.element_size);
if(unlikely(fr->size % ar->config.element_size))
fatal("ARAL: '%s' free item of size %zu is not multiple to element size %zu",
ar->config.name, fr->size, ar->config.element_size);
}
#else
#define aral_free_validate_internal_check(ar, fr) debug_dummy()
#endif
// ----------------------------------------------------------------------------
// find the page a pointer belongs to
#ifdef NETDATA_INTERNAL_CHECKS
static inline ARAL_PAGE *find_page_with_allocation_internal_check(ARAL *ar, void *ptr) {
aral_lock(ar);
uintptr_t seeking = (uintptr_t)ptr;
ARAL_PAGE *page;
for(page = ar->aral_lock.pages; page ; page = page->next) {
if(unlikely(seeking >= (uintptr_t)page->data && seeking < (uintptr_t)page->data + page->size))
break;
}
aral_unlock(ar);
return page;
}
#endif
// ----------------------------------------------------------------------------
// find a page with a free slot (there shouldn't be any)
#ifdef NETDATA_ARAL_INTERNAL_CHECKS
static inline ARAL_PAGE *find_page_with_free_slots_internal_check___with_aral_lock(ARAL *ar) {
ARAL_PAGE *page;
for(page = ar->aral_lock.pages; page ; page = page->next) {
if(page->aral_lock.free_elements)
break;
internal_fatal(page->size - page->aral_lock.used_elements * ar->config.element_size >= ar->config.element_size,
"ARAL: '%s' a page is marked full, but it is not!", ar->config.name);
internal_fatal(page->size < page->aral_lock.used_elements * ar->config.element_size,
"ARAL: '%s' a page has been overflown!", ar->config.name);
}
return page;
}
#endif
static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
ARAL_PAGE *page = callocz(1, sizeof(ARAL_PAGE));
netdata_spinlock_init(&page->free.spinlock);
page->size = ar->adders.allocation_size;
if(page->size > ar->config.max_allocation_size)
page->size = ar->config.max_allocation_size;
else
ar->adders.allocation_size = aral_align_alloc_size(ar, (uint64_t)ar->adders.allocation_size * 4 / 3);
page->max_elements = page->aral_lock.free_elements = page->size / ar->config.element_size;
page->free_elements_to_move_first = page->max_elements / 4;
if(unlikely(page->free_elements_to_move_first < 1))
page->free_elements_to_move_first = 1;
__atomic_add_fetch(&aral_globals.atomic.structures.allocations, 1, __ATOMIC_RELAXED);
__atomic_add_fetch(&aral_globals.atomic.structures.allocated, sizeof(ARAL_PAGE), __ATOMIC_RELAXED);
if(unlikely(ar->config.mmap.enabled)) {
ar->aral_lock.file_number++;
char filename[FILENAME_MAX + 1];
snprintfz(filename, FILENAME_MAX, "%s/array_alloc.mmap/%s.%zu", *ar->config.mmap.cache_dir, ar->config.mmap.filename, ar->aral_lock.file_number);
page->filename = strdupz(filename);
page->data = netdata_mmap(page->filename, page->size, MAP_SHARED, 0, false, NULL);
if (unlikely(!page->data))
fatal("ARAL: '%s' cannot allocate aral buffer of size %zu on filename '%s'",
ar->config.name, page->size, page->filename);
__atomic_add_fetch(&aral_globals.atomic.mmap.allocations, 1, __ATOMIC_RELAXED);
__atomic_add_fetch(&aral_globals.atomic.mmap.allocated, page->size, __ATOMIC_RELAXED);
}
else {
#ifdef NETDATA_TRACE_ALLOCATIONS
page->data = mallocz_int(page->size TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
#else
page->data = mallocz(page->size);
#endif
__atomic_add_fetch(&aral_globals.atomic.malloc.allocations, 1, __ATOMIC_RELAXED);
__atomic_add_fetch(&aral_globals.atomic.malloc.allocated, page->size, __ATOMIC_RELAXED);
}
// link the free space to its page
ARAL_FREE *fr = (ARAL_FREE *)page->data;
fr->size = page->size;
fr->next = NULL;
page->free.list = fr;
aral_free_validate_internal_check(ar, fr);
return page;
}
void aral_del_page___no_lock_needed(ARAL *ar, ARAL_PAGE *page TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
// free it
if (ar->config.mmap.enabled) {
netdata_munmap(page->data, page->size);
if (unlikely(unlink(page->filename) == -1))
error("Cannot delete file '%s'", page->filename);
freez((void *)page->filename);
__atomic_sub_fetch(&aral_globals.atomic.mmap.allocations, 1, __ATOMIC_RELAXED);
__atomic_sub_fetch(&aral_globals.atomic.mmap.allocated, page->size, __ATOMIC_RELAXED);
}
else {
#ifdef NETDATA_TRACE_ALLOCATIONS
freez_int(page->data TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
#else
freez(page->data);
#endif
__atomic_sub_fetch(&aral_globals.atomic.malloc.allocations, 1, __ATOMIC_RELAXED);
__atomic_sub_fetch(&aral_globals.atomic.malloc.allocated, page->size, __ATOMIC_RELAXED);
}
freez(page);
__atomic_sub_fetch(&aral_globals.atomic.structures.allocations, 1, __ATOMIC_RELAXED);
__atomic_sub_fetch(&aral_globals.atomic.structures.allocated, sizeof(ARAL_PAGE), __ATOMIC_RELAXED);
}
static inline void aral_insert_not_linked_page_with_free_items_to_proper_position___aral_lock_needed(ARAL *ar, ARAL_PAGE *page) {
ARAL_PAGE *first = ar->aral_lock.pages;
if (page->aral_lock.free_elements <= page->free_elements_to_move_first ||
!first ||
!first->aral_lock.free_elements ||
page->aral_lock.free_elements <= first->aral_lock.free_elements + ARAL_FREE_PAGES_DELTA_TO_REARRANGE_LIST) {
// first position
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
}
else {
ARAL_PAGE *second = first->next;
if (!second ||
!second->aral_lock.free_elements ||
page->aral_lock.free_elements <= second->aral_lock.free_elements)
// second position
DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(ar->aral_lock.pages, first, page, prev, next);
else
// third position
DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(ar->aral_lock.pages, second, page, prev, next);
}
}
static inline ARAL_PAGE *aral_acquire_a_free_slot(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
aral_lock(ar);
ARAL_PAGE *page = ar->aral_lock.pages;
while(!page || !page->aral_lock.free_elements) {
#ifdef NETDATA_ARAL_INTERNAL_CHECKS
internal_fatal(find_page_with_free_slots_internal_check___with_aral_lock(ar), "ARAL: '%s' found page with free slot!", ar->config.name);
#endif
aral_unlock(ar);
if(netdata_spinlock_trylock(&ar->adders.spinlock)) {
page = aral_create_page___no_lock_needed(ar TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
aral_lock(ar);
aral_insert_not_linked_page_with_free_items_to_proper_position___aral_lock_needed(ar, page);
netdata_spinlock_unlock(&ar->adders.spinlock);
break;
}
else {
aral_lock(ar);
page = ar->aral_lock.pages;
}
}
// we have a page
// and aral locked
{
ARAL_PAGE *first = ar->aral_lock.pages;
ARAL_PAGE *second = first->next;
if (!second ||
!second->aral_lock.free_elements ||
first->aral_lock.free_elements <= second->aral_lock.free_elements + ARAL_FREE_PAGES_DELTA_TO_REARRANGE_LIST)
page = first;
else {
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, second, prev, next);
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ar->aral_lock.pages, second, prev, next);
page = second;
}
}
internal_fatal(!page || !page->aral_lock.free_elements,
"ARAL: '%s' selected page does not have a free slot in it",
ar->config.name);
internal_fatal(page->max_elements != page->aral_lock.used_elements + page->aral_lock.free_elements,
"ARAL: '%s' page element counters do not match, "
"page says it can handle %zu elements, "
"but there are %zu used and %zu free items, "
"total %zu items",
ar->config.name,
(size_t)page->max_elements,
(size_t)page->aral_lock.used_elements, (size_t)page->aral_lock.free_elements,
(size_t)page->aral_lock.used_elements + (size_t)page->aral_lock.free_elements
);
ar->aral_lock.user_malloc_operations++;
// acquire a slot for the caller
page->aral_lock.used_elements++;
if(--page->aral_lock.free_elements == 0) {
// we are done with this page
// move the full page last
// so that pages with free items remain first in the list
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
}
aral_unlock(ar);
return page;
}
void *aral_mallocz_internal(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
ARAL_PAGE *page = aral_acquire_a_free_slot(ar TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
netdata_spinlock_lock(&page->free.spinlock);
internal_fatal(!page->free.list,
"ARAL: '%s' free item to use, cannot be NULL.", ar->config.name);
internal_fatal(page->free.list->size < ar->config.element_size,
"ARAL: '%s' free item size %zu, cannot be smaller than %zu",
ar->config.name, page->free.list->size, ar->config.element_size);
ARAL_FREE *found_fr = page->free.list;
// check if the remaining size (after we use this slot) is not enough for another element
if(unlikely(found_fr->size - ar->config.element_size < ar->config.element_size)) {
// we can use the entire free space entry
page->free.list = found_fr->next;
}
else {
// we can split the free space entry
uint8_t *data = (uint8_t *)found_fr;
ARAL_FREE *fr = (ARAL_FREE *)&data[ar->config.element_size];
fr->size = found_fr->size - ar->config.element_size;
// link the free slot first in the page
fr->next = found_fr->next;
page->free.list = fr;
aral_free_validate_internal_check(ar, fr);
}
netdata_spinlock_unlock(&page->free.spinlock);
// put the page pointer after the element
uint8_t *data = (uint8_t *)found_fr;
ARAL_PAGE **page_ptr = (ARAL_PAGE **)&data[ar->config.page_ptr_offset];
*page_ptr = page;
if(unlikely(ar->config.mmap.enabled))
__atomic_add_fetch(&aral_globals.atomic.mmap.used, ar->config.element_size, __ATOMIC_RELAXED);
else
__atomic_add_fetch(&aral_globals.atomic.malloc.used, ar->config.element_size, __ATOMIC_RELAXED);
return (void *)found_fr;
}
static inline ARAL_PAGE *aral_ptr_to_page___must_NOT_have_aral_lock(ARAL *ar, void *ptr) {
// given a data pointer we returned before,
// find the ARAL_PAGE it belongs to
uint8_t *data = (uint8_t *)ptr;
ARAL_PAGE **page_ptr = (ARAL_PAGE **)&data[ar->config.page_ptr_offset];
ARAL_PAGE *page = *page_ptr;
#ifdef NETDATA_INTERNAL_CHECKS
// make it NULL so that we will fail on double free
// do not enable this on production, because the MMAP file
// will need to be saved again!
*page_ptr = NULL;
#endif
#ifdef NETDATA_ARAL_INTERNAL_CHECKS
{
// find the page ptr belongs
ARAL_PAGE *page2 = find_page_with_allocation_internal_check(ar, ptr);
internal_fatal(page != page2,
"ARAL: '%s' page pointers do not match!",
ar->name);
internal_fatal(!page2,
"ARAL: '%s' free of pointer %p is not in ARAL address space.",
ar->name, ptr);
}
#endif
internal_fatal(!page,
"ARAL: '%s' possible corruption or double free of pointer %p",
ar->config.name, ptr);
return page;
}
static void aral_defrag_sorted_page_position___aral_lock_needed(ARAL *ar, ARAL_PAGE *page) {
ARAL_PAGE *tmp;
int action = 0; (void)action;
size_t move_later = 0, move_earlier = 0;
for(tmp = page->next ;
tmp && tmp->aral_lock.free_elements && tmp->aral_lock.free_elements < page->aral_lock.free_elements ;
tmp = tmp->next)
move_later++;
if(!tmp && page->next) {
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
action = 1;
}
else if(tmp != page->next) {
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
DOUBLE_LINKED_LIST_INSERT_ITEM_BEFORE_UNSAFE(ar->aral_lock.pages, tmp, page, prev, next);
action = 2;
}
else {
for(tmp = (page == ar->aral_lock.pages) ? NULL : page->prev ;
tmp && (!tmp->aral_lock.free_elements || tmp->aral_lock.free_elements > page->aral_lock.free_elements);
tmp = (tmp == ar->aral_lock.pages) ? NULL : tmp->prev)
move_earlier++;
if(!tmp) {
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
action = 3;
}
else if(tmp != page->prev){
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(ar->aral_lock.pages, tmp, page, prev, next);
action = 4;
}
}
ar->aral_lock.defragment_operations++;
ar->aral_lock.defragment_linked_list_traversals += move_earlier + move_later;
internal_fatal(page->next && page->next->aral_lock.free_elements && page->next->aral_lock.free_elements < page->aral_lock.free_elements,
"ARAL: '%s' item should be later in the list", ar->config.name);
internal_fatal(page != ar->aral_lock.pages && (!page->prev->aral_lock.free_elements || page->prev->aral_lock.free_elements > page->aral_lock.free_elements),
"ARAL: '%s' item should be earlier in the list", ar->config.name);
}
static inline void aral_move_page_with_free_list___aral_lock_needed(ARAL *ar, ARAL_PAGE *page) {
if(unlikely(page == ar->aral_lock.pages))
// we are the first already
return;
if(likely(!ar->config.defragment)) {
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
aral_insert_not_linked_page_with_free_items_to_proper_position___aral_lock_needed(ar, page);
}
else
aral_defrag_sorted_page_position___aral_lock_needed(ar, page);
}
void aral_freez_internal(ARAL *ar, void *ptr TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
if(unlikely(!ptr)) return;
// get the page pointer
ARAL_PAGE *page = aral_ptr_to_page___must_NOT_have_aral_lock(ar, ptr);
if(unlikely(ar->config.mmap.enabled))
__atomic_sub_fetch(&aral_globals.atomic.mmap.used, ar->config.element_size, __ATOMIC_RELAXED);
else
__atomic_sub_fetch(&aral_globals.atomic.malloc.used, ar->config.element_size, __ATOMIC_RELAXED);
// make this element available
ARAL_FREE *fr = (ARAL_FREE *)ptr;
fr->size = ar->config.element_size;
netdata_spinlock_lock(&page->free.spinlock);
fr->next = page->free.list;
page->free.list = fr;
netdata_spinlock_unlock(&page->free.spinlock);
aral_lock(ar);
internal_fatal(!page->aral_lock.used_elements,
"ARAL: '%s' pointer %p is inside a page without any active allocations.",
ar->config.name, ptr);
internal_fatal(page->max_elements != page->aral_lock.used_elements + page->aral_lock.free_elements,
"ARAL: '%s' page element counters do not match, "
"page says it can handle %zu elements, "
"but there are %zu used and %zu free items, "
"total %zu items",
ar->config.name,
(size_t)page->max_elements,
(size_t)page->aral_lock.used_elements, (size_t)page->aral_lock.free_elements,
(size_t)page->aral_lock.used_elements + (size_t)page->aral_lock.free_elements
);
page->aral_lock.used_elements--;
page->aral_lock.free_elements++;
ar->aral_lock.user_free_operations++;
// if the page is empty, release it
if(unlikely(!page->aral_lock.used_elements)) {
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
aral_unlock(ar);
aral_del_page___no_lock_needed(ar, page TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
}
else {
aral_move_page_with_free_list___aral_lock_needed(ar, page);
aral_unlock(ar);
}
}
void aral_destroy_internal(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
aral_lock(ar);
ARAL_PAGE *page;
while((page = ar->aral_lock.pages)) {
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, prev, next);
aral_del_page___no_lock_needed(ar, page TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
}
aral_unlock(ar);
freez(ar);
}
ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_elements, size_t max_page_elements, const char *filename, char **cache_dir, bool mmap, bool lockless) {
ARAL *ar = callocz(1, sizeof(ARAL));
ar->config.requested_element_size = element_size;
ar->config.initial_page_elements = initial_page_elements;
ar->config.max_page_elements = max_page_elements;
ar->config.mmap.filename = filename;
ar->config.mmap.cache_dir = cache_dir;
ar->config.mmap.enabled = mmap;
ar->config.lockless = lockless;
ar->config.defragment = false;
strncpyz(ar->config.name, name, ARAL_MAX_NAME);
netdata_spinlock_init(&ar->aral_lock.spinlock);
long int page_size = sysconf(_SC_PAGE_SIZE);
if (unlikely(page_size == -1))
ar->config.natural_page_size = 4096;
else
ar->config.natural_page_size = page_size;
// we need to add a page pointer after the element
// so, first align the element size to the pointer size
ar->config.element_size = natural_alignment(ar->config.requested_element_size, sizeof(uintptr_t));
// then add the size of a pointer to it
ar->config.element_size += sizeof(uintptr_t);
// make sure it is at least what we need for an ARAL_FREE slot
if (ar->config.element_size < sizeof(ARAL_FREE))
ar->config.element_size = sizeof(ARAL_FREE);
// and finally align it to the natural alignment
ar->config.element_size = natural_alignment(ar->config.element_size, ARAL_NATURAL_ALIGNMENT);
// we write the page pointer just after each element
ar->config.page_ptr_offset = ar->config.element_size - sizeof(uintptr_t);
if(ar->config.requested_element_size + sizeof(uintptr_t) > ar->config.element_size)
fatal("ARAL: '%s' failed to calculate properly page_ptr_offset: "
"element size %zu, sizeof(uintptr_t) %zu, natural alignment %zu, "
"final element size %zu, page_ptr_offset %zu",
ar->config.name, ar->config.requested_element_size, sizeof(uintptr_t), ARAL_NATURAL_ALIGNMENT,
ar->config.element_size, ar->config.page_ptr_offset);
//info("ARAL: element size %zu, sizeof(uintptr_t) %zu, natural alignment %zu, final element size %zu, page_ptr_offset %zu",
// ar->element_size, sizeof(uintptr_t), ARAL_NATURAL_ALIGNMENT, ar->internal.element_size, ar->internal.page_ptr_offset);
if (ar->config.initial_page_elements < 2)
ar->config.initial_page_elements = 2;
if(ar->config.mmap.enabled && (!ar->config.mmap.cache_dir || !*ar->config.mmap.cache_dir)) {
error("ARAL: '%s' mmap cache directory is not configured properly, disabling mmap.", ar->config.name);
ar->config.mmap.enabled = false;
internal_fatal(true, "ARAL: '%s' mmap cache directory is not configured properly", ar->config.name);
}
uint64_t max_alloc_size;
if(!ar->config.max_page_elements)
max_alloc_size = ar->config.mmap.enabled ? ARAL_MAX_PAGE_SIZE_MMAP : ARAL_MAX_PAGE_SIZE_MALLOC;
else
max_alloc_size = ar->config.max_page_elements * ar->config.element_size;
ar->config.max_allocation_size = aral_align_alloc_size(ar, max_alloc_size);
ar->adders.allocation_size = aral_align_alloc_size(ar, (uint64_t)ar->config.element_size * ar->config.initial_page_elements);
ar->aral_lock.pages = NULL;
ar->aral_lock.file_number = 0;
if(ar->config.mmap.enabled) {
char directory_name[FILENAME_MAX + 1];
snprintfz(directory_name, FILENAME_MAX, "%s/array_alloc.mmap", *ar->config.mmap.cache_dir);
int r = mkdir(directory_name, 0775);
if (r != 0 && errno != EEXIST)
fatal("Cannot create directory '%s'", directory_name);
char file[FILENAME_MAX + 1];
snprintfz(file, FILENAME_MAX, "%s.", ar->config.mmap.filename);
aral_delete_leftover_files(ar->config.name, directory_name, file);
}
internal_error(true,
"ARAL: '%s' "
"element size %zu (requested %zu bytes), "
"min elements per page %zu (requested %zu), "
"max elements per page %zu (requested %zu), "
"max page size %zu bytes, "
, ar->config.name
, ar->config.element_size, ar->config.requested_element_size
, ar->adders.allocation_size / ar->config.element_size, ar->config.initial_page_elements
, ar->config.max_allocation_size / ar->config.element_size, ar->config.max_page_elements
, ar->config.max_allocation_size
);
__atomic_add_fetch(&aral_globals.atomic.structures.allocations, 1, __ATOMIC_RELAXED);
__atomic_add_fetch(&aral_globals.atomic.structures.allocated, sizeof(ARAL), __ATOMIC_RELAXED);
return ar;
}
// ----------------------------------------------------------------------------
// unittest
struct aral_unittest_config {
bool single_threaded;
bool stop;
ARAL *ar;
size_t elements;
size_t threads;
int errors;
};
static void *aral_test_thread(void *ptr) {
struct aral_unittest_config *auc = ptr;
ARAL *ar = auc->ar;
size_t elements = auc->elements;
void **pointers = callocz(elements, sizeof(void *));
do {
for (size_t i = 0; i < elements; i++) {
pointers[i] = aral_mallocz(ar);
}
for (size_t div = 5; div >= 2; div--) {
for (size_t i = 0; i < elements / div; i++) {
aral_freez(ar, pointers[i]);
pointers[i] = NULL;
}
for (size_t i = 0; i < elements / div; i++) {
pointers[i] = aral_mallocz(ar);
}
}
for (size_t step = 50; step >= 10; step -= 10) {
for (size_t i = 0; i < elements; i += step) {
aral_freez(ar, pointers[i]);
pointers[i] = NULL;
}
for (size_t i = 0; i < elements; i += step) {
pointers[i] = aral_mallocz(ar);
}
}
for (size_t i = 0; i < elements; i++) {
aral_freez(ar, pointers[i]);
pointers[i] = NULL;
}
if (auc->single_threaded && ar->aral_lock.pages) {
fprintf(stderr, "\n\nARAL leftovers detected (1)\n\n");
__atomic_add_fetch(&auc->errors, 1, __ATOMIC_RELAXED);
}
if(!auc->single_threaded && __atomic_load_n(&auc->stop, __ATOMIC_RELAXED))
break;
for (size_t i = 0; i < elements; i++) {
pointers[i] = aral_mallocz(ar);
}
size_t increment = elements / ar->config.max_page_elements;
for (size_t all = increment; all <= elements / 2; all += increment) {
size_t to_free = all % ar->config.max_page_elements;
size_t step = elements / to_free;
if(!step) step = 1;
// fprintf(stderr, "all %zu, to free %zu, step %zu\n", all, to_free, step);
size_t free_list[to_free];
for (size_t i = 0; i < to_free; i++) {
size_t pos = step * i;
aral_freez(ar, pointers[pos]);
pointers[pos] = NULL;
free_list[i] = pos;
}
for (size_t i = 0; i < to_free; i++) {
size_t pos = free_list[i];
pointers[pos] = aral_mallocz(ar);
}
}
for (size_t i = 0; i < elements; i++) {
aral_freez(ar, pointers[i]);
pointers[i] = NULL;
}
if (auc->single_threaded && ar->aral_lock.pages) {
fprintf(stderr, "\n\nARAL leftovers detected (2)\n\n");
__atomic_add_fetch(&auc->errors, 1, __ATOMIC_RELAXED);
}
} while(!auc->single_threaded && !__atomic_load_n(&auc->stop, __ATOMIC_RELAXED));
freez(pointers);
return ptr;
}
int aral_stress_test(size_t threads, size_t elements, size_t seconds) {
fprintf(stderr, "Running stress test of %zu threads, with %zu elements each, for %zu seconds...\n",
threads, elements, seconds);
memset(&aral_globals, 0, sizeof(aral_globals));
struct aral_unittest_config auc = {
.single_threaded = false,
.threads = threads,
.ar = aral_create("aral-test", 20, 10, 1024, "test-aral", NULL, false, false),
.elements = elements,
.errors = 0,
};
usec_t started_ut = now_monotonic_usec();
netdata_thread_t thread_ptrs[threads];
for(size_t i = 0; i < threads ; i++) {
char tag[NETDATA_THREAD_NAME_MAX + 1];
snprintfz(tag, NETDATA_THREAD_NAME_MAX, "TH[%zu]", i);
netdata_thread_create(&thread_ptrs[i], tag,
NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG,
aral_test_thread, &auc);
}
size_t malloc_done = 0;
size_t free_done = 0;
size_t countdown = seconds;
while(countdown-- > 0) {
sleep_usec(1 * USEC_PER_SEC);
aral_lock(auc.ar);
size_t m = auc.ar->aral_lock.user_malloc_operations;
size_t f = auc.ar->aral_lock.user_free_operations;
aral_unlock(auc.ar);
fprintf(stderr, "ARAL executes %0.2f M malloc and %0.2f M free operations/s\n",
(double)(m - malloc_done) / 1000000.0, (double)(f - free_done) / 1000000.0);
malloc_done = m;
free_done = f;
}
__atomic_store_n(&auc.stop, true, __ATOMIC_RELAXED);
// fprintf(stderr, "Cancelling the threads...\n");
// for(size_t i = 0; i < threads ; i++) {
// netdata_thread_cancel(thread_ptrs[i]);
// }
fprintf(stderr, "Waiting the threads to finish...\n");
for(size_t i = 0; i < threads ; i++) {
netdata_thread_join(thread_ptrs[i], NULL);
}
usec_t ended_ut = now_monotonic_usec();
if (auc.ar->aral_lock.pages) {
fprintf(stderr, "\n\nARAL leftovers detected (3)\n\n");
__atomic_add_fetch(&auc.errors, 1, __ATOMIC_RELAXED);
}
info("ARAL: did %zu malloc, %zu free, "
"using %zu threads, in %llu usecs",
auc.ar->aral_lock.user_malloc_operations,
auc.ar->aral_lock.user_free_operations,
threads,
ended_ut - started_ut);
aral_destroy(auc.ar);
return auc.errors;
}
int aral_unittest(size_t elements) {
char *cache_dir = "/tmp/";
struct aral_unittest_config auc = {
.single_threaded = true,
.threads = 1,
.ar = aral_create("aral-test", 20, 10, 1024, "test-aral", &cache_dir, false, false),
.elements = elements,
.errors = 0,
};
aral_test_thread(&auc);
aral_destroy(auc.ar);
int errors = aral_stress_test(2, elements, 5);
return auc.errors + errors;
}

37
libnetdata/aral/aral.h Normal file
View File

@ -0,0 +1,37 @@
#ifndef ARAL_H
#define ARAL_H 1
#include "../libnetdata.h"
#define ARAL_MAX_NAME 23
typedef struct aral ARAL;
ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_elements, size_t max_page_elements, const char *filename, char **cache_dir, bool mmap, bool lockless);
int aral_unittest(size_t elements);
void aral_get_size_statistics(size_t *structures, size_t *malloc_allocated, size_t *malloc_used, size_t *mmap_allocated, size_t *mmap_used);
#ifdef NETDATA_TRACE_ALLOCATIONS
#define aral_mallocz(ar) aral_mallocz_internal(ar, __FILE__, __FUNCTION__, __LINE__)
#define aral_freez(ar, ptr) aral_freez_internal(ar, ptr, __FILE__, __FUNCTION__, __LINE__)
#define aral_destroy(ar) aral_destroy_internal(ar, __FILE__, __FUNCTION__, __LINE__)
void *aral_mallocz_internal(ARAL *ar, const char *file, const char *function, size_t line);
void aral_freez_internal(ARAL *ar, void *ptr, const char *file, const char *function, size_t line);
void aral_destroy_internal(ARAL *ar, const char *file, const char *function, size_t line);
#else // NETDATA_TRACE_ALLOCATIONS
#define aral_mallocz(ar) aral_mallocz_internal(ar)
#define aral_freez(ar, ptr) aral_freez_internal(ar, ptr)
#define aral_destroy(ar) aral_destroy_internal(ar)
void *aral_mallocz_internal(ARAL *ar);
void aral_freez_internal(ARAL *ar, void *ptr);
void aral_destroy_internal(ARAL *ar);
#endif // NETDATA_TRACE_ALLOCATIONS
#endif // ARAL_H

View File

@ -1,7 +0,0 @@
<!--
title: "Array Allocator"
custom_edit_url: https://github.com/netdata/netdata/edit/master/libnetdata/arrayalloc/README.md
-->
# Array Allocator

View File

@ -1,501 +0,0 @@
#include "../libnetdata.h"
#include "arrayalloc.h"
#ifdef NETDATA_TRACE_ALLOCATIONS
#define TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS , const char *file, const char *function, size_t line
#define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS , file, function, line
#else
#define TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS
#define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS
#endif
// max file size
#define ARAL_MAX_PAGE_SIZE_MMAP (1*1024*1024*1024)
// max malloc size
// optimal at current versions of libc is up to 256k
// ideal to have the same overhead as libc is 4k
#define ARAL_MAX_PAGE_SIZE_MALLOC (64*1024)
typedef struct arrayalloc_free {
size_t size;
struct arrayalloc_page *page;
struct arrayalloc_free *next;
} ARAL_FREE;
typedef struct arrayalloc_page {
const char *filename;
size_t size; // the total size of the page
size_t used_elements; // the total number of used elements on this page
uint8_t *data;
ARAL_FREE *free_list;
struct arrayalloc_page *prev; // the prev page on the list
struct arrayalloc_page *next; // the next page on the list
} ARAL_PAGE;
#define ARAL_NATURAL_ALIGNMENT (sizeof(uintptr_t) * 2)
static inline size_t natural_alignment(size_t size, size_t alignment) {
if(unlikely(size % alignment))
size = size + alignment - (size % alignment);
return size;
}
static void arrayalloc_delete_leftover_files(const char *path, const char *required_prefix) {
DIR *dir = opendir(path);
if(!dir) return;
char full_path[FILENAME_MAX + 1];
size_t len = strlen(required_prefix);
struct dirent *de = NULL;
while((de = readdir(dir))) {
if(de->d_type == DT_DIR)
continue;
if(strncmp(de->d_name, required_prefix, len) != 0)
continue;
snprintfz(full_path, FILENAME_MAX, "%s/%s", path, de->d_name);
info("ARRAYALLOC: removing left-over file '%s'", full_path);
if(unlikely(unlink(full_path) == -1))
error("Cannot delete file '%s'", full_path);
}
closedir(dir);
}
// ----------------------------------------------------------------------------
// arrayalloc_init()
static void arrayalloc_init(ARAL *ar) {
static netdata_mutex_t mutex = NETDATA_MUTEX_INITIALIZER;
netdata_mutex_lock(&mutex);
if(!ar->internal.initialized) {
netdata_spinlock_init(&ar->internal.spinlock);
long int page_size = sysconf(_SC_PAGE_SIZE);
if (unlikely(page_size == -1))
ar->internal.natural_page_size = 4096;
else
ar->internal.natural_page_size = page_size;
// we need to add a page pointer after the element
// so, first align the element size to the pointer size
ar->internal.element_size = natural_alignment(ar->requested_element_size, sizeof(uintptr_t));
// then add the size of a pointer to it
ar->internal.element_size += sizeof(uintptr_t);
// make sure it is at least what we need for an ARAL_FREE slot
if (ar->internal.element_size < sizeof(ARAL_FREE))
ar->internal.element_size = sizeof(ARAL_FREE);
// and finally align it to the natural alignment
ar->internal.element_size = natural_alignment(ar->internal.element_size, ARAL_NATURAL_ALIGNMENT);
// we write the page pointer just after each element
ar->internal.page_ptr_offset = ar->internal.element_size - sizeof(uintptr_t);
if(ar->requested_element_size + sizeof(uintptr_t) > ar->internal.element_size)
fatal("ARRAYALLOC: failed to calculate properly page_ptr_offset: element size %zu, sizeof(uintptr_t) %zu, natural alignment %zu, final element size %zu, page_ptr_offset %zu",
ar->requested_element_size, sizeof(uintptr_t), ARAL_NATURAL_ALIGNMENT, ar->internal.element_size, ar->internal.page_ptr_offset);
//info("ARRAYALLOC: element size %zu, sizeof(uintptr_t) %zu, natural alignment %zu, final element size %zu, page_ptr_offset %zu",
// ar->element_size, sizeof(uintptr_t), ARAL_NATURAL_ALIGNMENT, ar->internal.element_size, ar->internal.page_ptr_offset);
if (ar->initial_elements < 10)
ar->initial_elements = 10;
ar->internal.mmap = (ar->use_mmap && ar->cache_dir && *ar->cache_dir) ? true : false;
ar->internal.max_alloc_size = ar->internal.mmap ? ARAL_MAX_PAGE_SIZE_MMAP : ARAL_MAX_PAGE_SIZE_MALLOC;
if(ar->internal.max_alloc_size % ar->internal.natural_page_size)
ar->internal.max_alloc_size += ar->internal.natural_page_size - (ar->internal.max_alloc_size % ar->internal.natural_page_size) ;
if(ar->internal.max_alloc_size % ar->internal.element_size)
ar->internal.max_alloc_size -= ar->internal.max_alloc_size % ar->internal.element_size;
ar->internal.pages = NULL;
ar->internal.allocation_multiplier = 1;
ar->internal.file_number = 0;
if(ar->internal.mmap) {
char directory_name[FILENAME_MAX + 1];
snprintfz(directory_name, FILENAME_MAX, "%s/array_alloc.mmap", *ar->cache_dir);
int r = mkdir(directory_name, 0775);
if (r != 0 && errno != EEXIST)
fatal("Cannot create directory '%s'", directory_name);
char filename[FILENAME_MAX + 1];
snprintfz(filename, FILENAME_MAX, "%s.", ar->filename);
arrayalloc_delete_leftover_files(directory_name, filename);
}
ar->internal.initialized = true;
}
netdata_mutex_unlock(&mutex);
}
// ----------------------------------------------------------------------------
// check a free slot
#ifdef NETDATA_INTERNAL_CHECKS
static inline void arrayalloc_free_validate_internal_check(ARAL *ar, ARAL_FREE *fr) {
if(unlikely(fr->size < ar->internal.element_size))
fatal("ARRAYALLOC: free item of size %zu, less than the expected element size %zu", fr->size, ar->internal.element_size);
if(unlikely(fr->size % ar->internal.element_size))
fatal("ARRAYALLOC: free item of size %zu is not multiple to element size %zu", fr->size, ar->internal.element_size);
}
#else
#define arrayalloc_free_validate_internal_check(ar, fr) debug_dummy()
#endif
// ----------------------------------------------------------------------------
// find the page a pointer belongs to
#ifdef NETDATA_INTERNAL_CHECKS
static inline ARAL_PAGE *find_page_with_allocation_internal_check(ARAL *ar, void *ptr) {
uintptr_t seeking = (uintptr_t)ptr;
ARAL_PAGE *page;
for(page = ar->internal.pages; page ; page = page->next) {
if(unlikely(seeking >= (uintptr_t)page->data && seeking < (uintptr_t)page->data + page->size))
break;
}
return page;
}
#endif
// ----------------------------------------------------------------------------
// find a page with a free slot (there shouldn't be any)
#ifdef NETDATA_ARRAYALLOC_INTERNAL_CHECKS
static inline ARAL_PAGE *find_page_with_free_slots_internal_check(ARAL *ar) {
ARAL_PAGE *page;
for(page = ar->internal.pages; page ; page = page->next) {
if(page->free_list)
break;
internal_fatal(page->size - page->used_elements * ar->internal.element_size >= ar->internal.element_size,
"ARRAYALLOC: a page is marked full, but it is not!");
internal_fatal(page->size < page->used_elements * ar->internal.element_size,
"ARRAYALLOC: a page has been overflown!");
}
return page;
}
#endif
static void arrayalloc_add_page(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
if(unlikely(!ar->internal.initialized))
arrayalloc_init(ar);
ARAL_PAGE *page = callocz(1, sizeof(ARAL_PAGE));
page->size = ar->initial_elements * ar->internal.element_size * ar->internal.allocation_multiplier;
if(page->size > ar->internal.max_alloc_size)
page->size = ar->internal.max_alloc_size;
else
ar->internal.allocation_multiplier *= 2;
if(ar->internal.mmap) {
ar->internal.file_number++;
char filename[FILENAME_MAX + 1];
snprintfz(filename, FILENAME_MAX, "%s/array_alloc.mmap/%s.%zu", *ar->cache_dir, ar->filename, ar->internal.file_number);
page->filename = strdupz(filename);
page->data = netdata_mmap(page->filename, page->size, MAP_SHARED, 0, false, NULL);
if (unlikely(!page->data))
fatal("Cannot allocate arrayalloc buffer of size %zu on filename '%s'", page->size, page->filename);
}
else {
#ifdef NETDATA_TRACE_ALLOCATIONS
page->data = mallocz_int(page->size TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
#else
page->data = mallocz(page->size);
#endif
}
// link the free space to its page
ARAL_FREE *fr = (ARAL_FREE *)page->data;
fr->size = page->size;
fr->page = page;
fr->next = NULL;
page->free_list = fr;
// link the new page at the front of the list of pages
DOUBLE_LINKED_LIST_PREPEND_UNSAFE(ar->internal.pages, page, prev, next);
arrayalloc_free_validate_internal_check(ar, fr);
}
static inline void arrayalloc_lock(ARAL *ar) {
if(likely(!ar->internal.lockless))
netdata_spinlock_lock(&ar->internal.spinlock);
}
static inline void arrayalloc_unlock(ARAL *ar) {
if(likely(!ar->internal.lockless))
netdata_spinlock_unlock(&ar->internal.spinlock);
}
ARAL *arrayalloc_create(size_t element_size, size_t elements, const char *filename, char **cache_dir, bool mmap, bool lockless) {
ARAL *ar = callocz(1, sizeof(ARAL));
ar->requested_element_size = element_size;
ar->initial_elements = elements;
ar->filename = filename;
ar->cache_dir = cache_dir;
ar->use_mmap = mmap;
ar->internal.lockless = lockless;
return ar;
}
void arrayalloc_del_page(ARAL *ar, ARAL_PAGE *page TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(ar->internal.pages, page, prev, next);
// free it
if (ar->internal.mmap) {
netdata_munmap(page->data, page->size);
if (unlikely(unlink(page->filename) == 1))
error("Cannot delete file '%s'", page->filename);
freez((void *)page->filename);
}
else {
#ifdef NETDATA_TRACE_ALLOCATIONS
freez_int(page->data TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
#else
freez(page->data);
#endif
}
freez(page);
}
void arrayalloc_destroy_internal(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
arrayalloc_lock(ar);
while(ar->internal.pages)
arrayalloc_del_page(ar, ar->internal.pages TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
arrayalloc_unlock(ar);
freez(ar);
}
void *arrayalloc_mallocz_internal(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
if(unlikely(!ar->internal.initialized))
arrayalloc_init(ar);
arrayalloc_lock(ar);
if(unlikely(!ar->internal.pages || !ar->internal.pages->free_list)) {
#ifdef NETDATA_ARRAYALLOC_INTERNAL_CHECKS
internal_fatal(find_page_with_free_slots_internal_check(ar) != NULL,
"ARRAYALLOC: first page does not have any free slots, but there is another that has!");
#endif
arrayalloc_add_page(ar TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
}
ARAL_PAGE *page = ar->internal.pages;
ARAL_FREE *found_fr = page->free_list;
internal_fatal(!found_fr,
"ARRAYALLOC: free item to use, cannot be NULL.");
internal_fatal(found_fr->size < ar->internal.element_size,
"ARRAYALLOC: free item size %zu, cannot be smaller than %zu",
found_fr->size, ar->internal.element_size);
if(unlikely(found_fr->size - ar->internal.element_size < ar->internal.element_size)) {
// we can use the entire free space entry
page->free_list = found_fr->next;
if(unlikely(!page->free_list)) {
// we are done with this page
// move the full page last
// so that pages with free items remain first in the list
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(ar->internal.pages, page, prev, next);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(ar->internal.pages, page, prev, next);
}
}
else {
// we can split the free space entry
uint8_t *data = (uint8_t *)found_fr;
ARAL_FREE *fr = (ARAL_FREE *)&data[ar->internal.element_size];
fr->page = page;
fr->size = found_fr->size - ar->internal.element_size;
// link the free slot first in the page
fr->next = found_fr->next;
page->free_list = fr;
arrayalloc_free_validate_internal_check(ar, fr);
}
page->used_elements++;
// put the page pointer after the element
uint8_t *data = (uint8_t *)found_fr;
ARAL_PAGE **page_ptr = (ARAL_PAGE **)&data[ar->internal.page_ptr_offset];
*page_ptr = page;
arrayalloc_unlock(ar);
return (void *)found_fr;
}
void arrayalloc_freez_internal(ARAL *ar, void *ptr TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
if(unlikely(!ptr)) return;
arrayalloc_lock(ar);
// get the page pointer
ARAL_PAGE *page;
{
uint8_t *data = (uint8_t *)ptr;
ARAL_PAGE **page_ptr = (ARAL_PAGE **)&data[ar->internal.page_ptr_offset];
page = *page_ptr;
#ifdef NETDATA_INTERNAL_CHECKS
// make it NULL so that we will fail on double free
// do not enable this on production, because the MMAP file
// will need to be saved again!
*page_ptr = NULL;
#endif
}
#ifdef NETDATA_ARRAYALLOC_INTERNAL_CHECKS
{
// find the page ptr belongs
ARAL_PAGE *page2 = find_page_with_allocation_internal_check(ar, ptr);
if(unlikely(page != page2))
fatal("ARRAYALLOC: page pointers do not match!");
if (unlikely(!page2))
fatal("ARRAYALLOC: free of pointer %p is not in arrayalloc address space.", ptr);
}
#endif
if(unlikely(!page))
fatal("ARRAYALLOC: possible corruption or double free of pointer %p", ptr);
if (unlikely(!page->used_elements))
fatal("ARRAYALLOC: free of pointer %p is inside a page without any active allocations.", ptr);
page->used_elements--;
// make this element available
ARAL_FREE *fr = (ARAL_FREE *)ptr;
fr->page = page;
fr->size = ar->internal.element_size;
fr->next = page->free_list;
page->free_list = fr;
// if the page is empty, release it
if(!page->used_elements)
arrayalloc_del_page(ar, page TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
else if(page != ar->internal.pages) {
// move the page with free item first
// so that the next allocation will use this page
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(ar->internal.pages, page, prev, next);
DOUBLE_LINKED_LIST_PREPEND_UNSAFE(ar->internal.pages, page, prev, next);
}
arrayalloc_unlock(ar);
}
int aral_unittest(size_t elements) {
char *cache_dir = "/tmp/";
ARAL *ar = arrayalloc_create(20, 10, "test-aral", &cache_dir, false, false);
void *pointers[elements];
for(size_t i = 0; i < elements ;i++) {
pointers[i] = arrayalloc_mallocz(ar);
}
for(size_t div = 5; div >= 2 ;div--) {
for (size_t i = 0; i < elements / div; i++) {
arrayalloc_freez(ar, pointers[i]);
}
for (size_t i = 0; i < elements / div; i++) {
pointers[i] = arrayalloc_mallocz(ar);
}
}
for(size_t step = 50; step >= 10 ;step -= 10) {
for (size_t i = 0; i < elements; i += step) {
arrayalloc_freez(ar, pointers[i]);
}
for (size_t i = 0; i < elements; i += step) {
pointers[i] = arrayalloc_mallocz(ar);
}
}
for(size_t i = 0; i < elements ;i++) {
arrayalloc_freez(ar, pointers[i]);
}
if(ar->internal.pages) {
fprintf(stderr, "ARAL leftovers detected (1)");
return 1;
}
size_t ops = 0; (void)ops;
size_t increment = elements / 10;
size_t allocated = 0;
for(size_t all = increment; all <= elements ; all += increment) {
for(; allocated < all ; allocated++) {
pointers[allocated] = arrayalloc_mallocz(ar);
ops++;
}
size_t to_free = now_realtime_usec() % all;
size_t free_list[to_free];
for(size_t i = 0; i < to_free ;i++) {
size_t pos;
do {
pos = now_realtime_usec() % all;
} while(!pointers[pos]);
arrayalloc_freez(ar, pointers[pos]);
pointers[pos] = NULL;
free_list[i] = pos;
ops++;
}
for(size_t i = 0; i < to_free ;i++) {
size_t pos = free_list[i];
pointers[pos] = arrayalloc_mallocz(ar);
ops++;
}
}
for(size_t i = 0; i < allocated - 1 ;i++) {
arrayalloc_freez(ar, pointers[i]);
ops++;
}
arrayalloc_freez(ar, pointers[allocated - 1]);
if(ar->internal.pages) {
fprintf(stderr, "ARAL leftovers detected (2)");
return 1;
}
return 0;
}

View File

@ -1,55 +0,0 @@
#ifndef ARRAYALLOC_H
#define ARRAYALLOC_H 1
#include "../libnetdata.h"
typedef struct arrayalloc {
size_t requested_element_size;
size_t initial_elements;
const char *filename;
char **cache_dir;
bool use_mmap;
// private members - do not touch
struct {
bool mmap;
bool lockless;
bool initialized;
size_t element_size;
size_t page_ptr_offset;
size_t file_number;
size_t natural_page_size;
size_t allocation_multiplier;
size_t max_alloc_size;
SPINLOCK spinlock;
struct arrayalloc_page *pages;
} internal;
} ARAL;
ARAL *arrayalloc_create(size_t element_size, size_t elements, const char *filename, char **cache_dir, bool mmap, bool lockless);
int aral_unittest(size_t elements);
#ifdef NETDATA_TRACE_ALLOCATIONS
#define arrayalloc_mallocz(ar) arrayalloc_mallocz_internal(ar, __FILE__, __FUNCTION__, __LINE__)
#define arrayalloc_freez(ar, ptr) arrayalloc_freez_internal(ar, ptr, __FILE__, __FUNCTION__, __LINE__)
#define arrayalloc_destroy(ar) arrayalloc_destroy_internal(ar, __FILE__, __FUNCTION__, __LINE__)
void *arrayalloc_mallocz_internal(ARAL *ar, const char *file, const char *function, size_t line);
void arrayalloc_freez_internal(ARAL *ar, void *ptr, const char *file, const char *function, size_t line);
void arrayalloc_destroy_internal(ARAL *ar, const char *file, const char *function, size_t line);
#else // NETDATA_TRACE_ALLOCATIONS
#define arrayalloc_mallocz(ar) arrayalloc_mallocz_internal(ar)
#define arrayalloc_freez(ar, ptr) arrayalloc_freez_internal(ar, ptr)
#define arrayalloc_destroy(ar) arrayalloc_destroy_internal(ar)
void *arrayalloc_mallocz_internal(ARAL *ar);
void arrayalloc_freez_internal(ARAL *ar, void *ptr);
void arrayalloc_destroy_internal(ARAL *ar);
#endif // NETDATA_TRACE_ALLOCATIONS
#endif // ARRAYALLOC_H

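The removed header above exposed the allocator internals and required callers to embed a statically initialized ARAL struct; the dictionary changes that follow migrate to the new opaque handles created at runtime. A condensed before/after sketch (the my_item struct is hypothetical; the call forms are the ones visible in this diff):

// Hypothetical element type, used only to contrast the two APIs.
struct my_item { int value; };

// Before: the caller embedded and statically initialized the allocator state.
//   static ARAL my_items_aral = {
//       .requested_element_size = sizeof(struct my_item),
//       .initial_elements = 65536 / sizeof(struct my_item),
//   };
//   struct my_item *it = arrayalloc_mallocz(&my_items_aral);
//   arrayalloc_freez(&my_items_aral, it);

// After: the allocator is opaque and created at runtime
// (the dictionary code that follows guards its equivalent init with a spinlock for thread safety).
static ARAL *my_items_aral = NULL;

static void my_items_aral_init(void) {
    if(!my_items_aral)
        my_items_aral = aral_create("my-items", sizeof(struct my_item),
                                    0, 4096, NULL, NULL, false, false);
}

static struct my_item *my_item_new(void) {
    struct my_item *it = aral_mallocz(my_items_aral);
    it->value = 0;
    return it;
}

static void my_item_free(struct my_item *it) {
    aral_freez(my_items_aral, it);
}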
View File

@ -789,7 +789,7 @@ static void garbage_collect_pending_deletes(DICTIONARY *dict) {
// we didn't get a reference
if(item_is_not_referenced_and_can_be_removed(dict, item)) {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(dict->items.list, item, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(dict->items.list, item, prev, next);
dict_item_free_with_hooks(dict, item);
deleted++;
@ -1167,9 +1167,9 @@ static inline void item_linked_list_add(DICTIONARY *dict, DICTIONARY_ITEM *item)
ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE);
if(dict->options & DICT_OPTION_ADD_IN_FRONT)
DOUBLE_LINKED_LIST_PREPEND_UNSAFE(dict->items.list, item, prev, next);
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(dict->items.list, item, prev, next);
else
DOUBLE_LINKED_LIST_APPEND_UNSAFE(dict->items.list, item, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(dict->items.list, item, prev, next);
#ifdef NETDATA_INTERNAL_CHECKS
item->ll_adder_pid = gettid();
@ -1186,7 +1186,7 @@ static inline void item_linked_list_add(DICTIONARY *dict, DICTIONARY_ITEM *item)
static inline void item_linked_list_remove(DICTIONARY *dict, DICTIONARY_ITEM *item) {
ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE);
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(dict->items.list, item, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(dict->items.list, item, prev, next);
#ifdef NETDATA_INTERNAL_CHECKS
item->ll_remover_pid = gettid();
@ -1234,28 +1234,43 @@ static inline size_t item_get_name_len(const DICTIONARY_ITEM *item) {
return strlen(item->caller_name);
}
static ARAL dict_items_aral = {
.filename = NULL,
.cache_dir = NULL,
.use_mmap = false,
.initial_elements = 65536 / sizeof(DICTIONARY_ITEM),
.requested_element_size = sizeof(DICTIONARY_ITEM),
};
static ARAL *dict_items_aral = NULL;
static ARAL *dict_shared_items_aral = NULL;
static ARAL dict_shared_items_aral = {
.filename = NULL,
.cache_dir = NULL,
.use_mmap = false,
.initial_elements = 65536 / sizeof(DICTIONARY_ITEM_SHARED),
.requested_element_size = sizeof(DICTIONARY_ITEM_SHARED),
};
void dictionary_static_items_aral_init(void) {
static SPINLOCK spinlock;
if(unlikely(!dict_items_aral || !dict_shared_items_aral)) {
netdata_spinlock_lock(&spinlock);
// we have to check again
if(!dict_items_aral)
dict_items_aral = aral_create(
"dict-items",
sizeof(DICTIONARY_ITEM),
0,
4096,
NULL, NULL, false, false);
// we have to check again
if(!dict_shared_items_aral)
dict_shared_items_aral = aral_create(
"dict-shared-items",
sizeof(DICTIONARY_ITEM_SHARED),
0,
4096,
NULL, NULL, false, false);
netdata_spinlock_unlock(&spinlock);
}
}
static DICTIONARY_ITEM *dict_item_create(DICTIONARY *dict __maybe_unused, size_t *allocated_bytes, DICTIONARY_ITEM *master_item) {
DICTIONARY_ITEM *item;
size_t size = sizeof(DICTIONARY_ITEM);
// item = callocz(1, size);
item = arrayalloc_mallocz(&dict_items_aral);
item = aral_mallocz(dict_items_aral);
memset(item, 0, sizeof(DICTIONARY_ITEM));
#ifdef NETDATA_INTERNAL_CHECKS
@ -1276,7 +1291,7 @@ static DICTIONARY_ITEM *dict_item_create(DICTIONARY *dict __maybe_unused, size_t
else {
size = sizeof(DICTIONARY_ITEM_SHARED);
// item->shared = callocz(1, size);
item->shared = arrayalloc_mallocz(&dict_shared_items_aral);
item->shared = aral_mallocz(dict_shared_items_aral);
memset(item->shared, 0, sizeof(DICTIONARY_ITEM_SHARED));
item->shared->links = 1;
@ -1418,13 +1433,13 @@ static size_t dict_item_free_with_hooks(DICTIONARY *dict, DICTIONARY_ITEM *item)
value_size += item->shared->value_len;
// freez(item->shared);
arrayalloc_freez(&dict_shared_items_aral, item->shared);
aral_freez(dict_shared_items_aral, item->shared);
item->shared = NULL;
item_size += sizeof(DICTIONARY_ITEM_SHARED);
}
// freez(item);
arrayalloc_freez(&dict_items_aral, item);
aral_freez(dict_items_aral, item);
item_size += sizeof(DICTIONARY_ITEM);
@ -1971,6 +1986,7 @@ static DICTIONARY *dictionary_create_internal(DICT_OPTIONS options, struct dicti
dict_size += reference_counter_init(dict);
dict_size += hashtable_init_unsafe(dict);
dictionary_static_items_aral_init();
pointer_index_init(dict);
DICTIONARY_STATS_PLUS_MEMORY(dict, 0, dict_size, 0);

View File

@ -64,7 +64,7 @@ void julyl_cleanup1(void) {
if(julyl_globals.protected.available_items && julyl_globals.protected.available > 10) {
item = julyl_globals.protected.available_items;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(julyl_globals.protected.available_items, item, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(julyl_globals.protected.available_items, item, cache.prev, cache.next);
julyl_globals.protected.available--;
}
@ -85,7 +85,7 @@ struct JulyL *julyl_get(void) {
j = julyl_globals.protected.available_items;
if(likely(j)) {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(julyl_globals.protected.available_items, j, cache.prev, cache.next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(julyl_globals.protected.available_items, j, cache.prev, cache.next);
julyl_globals.protected.available--;
}
@ -114,7 +114,7 @@ static void julyl_release(struct JulyL *j) {
__atomic_add_fetch(&julyl_globals.atomics.reallocs, j->reallocs, __ATOMIC_RELAXED);
netdata_spinlock_lock(&julyl_globals.protected.spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(julyl_globals.protected.available_items, j, cache.prev, cache.next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(julyl_globals.protected.available_items, j, cache.prev, cache.next);
julyl_globals.protected.available++;
netdata_spinlock_unlock(&julyl_globals.protected.spinlock);
}

View File

@ -233,8 +233,9 @@ extern "C" {
// ---------------------------------------------------------------------------------------------
// double linked list management
// inspired by https://github.com/troydhanson/uthash/blob/master/src/utlist.h
#define DOUBLE_LINKED_LIST_PREPEND_UNSAFE(head, item, prev, next) \
#define DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(head, item, prev, next) \
do { \
(item)->next = (head); \
\
@ -248,7 +249,7 @@ extern "C" {
(head) = (item); \
} while (0)
#define DOUBLE_LINKED_LIST_APPEND_UNSAFE(head, item, prev, next) \
#define DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(head, item, prev, next) \
do { \
if(likely(head)) { \
(item)->prev = (head)->prev; \
@ -264,39 +265,97 @@ extern "C" {
\
} while (0)
#define DOUBLE_LINKED_LIST_REMOVE_UNSAFE(head, item, prev, next) \
#define DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(head, item, prev, next) \
do { \
fatal_assert((head) != NULL); \
fatal_assert((item)->prev != NULL); \
\
if((item)->prev == (item)) { \
if((item)->prev == (item)) \
/* it is the only item in the list */ \
(head) = NULL; \
} \
\
else if((item) == (head)) { \
/* it is the first item */ \
fatal_assert((item)->next != NULL); \
(item)->next->prev = (item)->prev; \
(head) = (item)->next; \
} \
else { \
/* it is any other item */ \
(item)->prev->next = (item)->next; \
if ((item)->next) { \
\
if ((item)->next) \
(item)->next->prev = (item)->prev; \
} \
else { \
else \
(head)->prev = (item)->prev; \
} \
} \
\
(item)->next = NULL; \
(item)->prev = NULL; \
} while (0)
#define DOUBLE_LINKED_LIST_INSERT_ITEM_BEFORE_UNSAFE(head, existing, item, prev, next) \
do { \
if (existing) { \
fatal_assert((head) != NULL); \
fatal_assert((item) != NULL); \
\
(item)->next = (existing); \
(item)->prev = (existing)->prev; \
(existing)->prev = (item); \
\
if ((head) == (existing)) \
(head) = (item); \
else \
(item)->prev->next = (item); \
\
} \
else \
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(head, item, prev, next); \
\
} while (0)
#define DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(head, existing, item, prev, next) \
do { \
if (existing) { \
fatal_assert((head) != NULL); \
fatal_assert((item) != NULL); \
\
(item)->next = (existing)->next; \
(item)->prev = (existing); \
(existing)->next = (item); \
\
if ((item)->next) \
(item)->next->prev = (item); \
else \
(head)->prev = (item); \
} \
else \
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(head, item, prev, next); \
\
} while (0)
#define DOUBLE_LINKED_LIST_APPEND_LIST_UNSAFE(head, head2, prev, next) \
do { \
if (head2) { \
if (head) { \
__typeof(head2) _head2_last_item = (head2)->prev; \
\
(head2)->prev = (head)->prev; \
(head)->prev->next = (head2); \
\
(head)->prev = _head2_last_item; \
} \
else \
(head) = (head2); \
} \
} while (0)
#define DOUBLE_LINKED_LIST_FOREACH_FORWARD(head, var, prev, next) \
for ((var) = (head); (var) ; (var) = (var)->next)
#define DOUBLE_LINKED_LIST_FOREACH_BACKWARD(head, var, prev, next) \
for ((var) = (head)?(head)->prev:NULL; (var) && (var) != (head)->prev ; (var) = (var)->prev)
for ((var) = (head) ? (head)->prev : NULL ; (var) ; (var) = ((var) == (head)) ? NULL : (var)->prev)
// ---------------------------------------------------------------------------------------------
@ -481,7 +540,7 @@ extern char *netdata_configured_host_prefix;
#include "json/json.h"
#include "health/health.h"
#include "string/utf8.h"
#include "arrayalloc/arrayalloc.h"
#include "libnetdata/aral/aral.h"
#include "onewayalloc/onewayalloc.h"
#include "worker_utilization/worker_utilization.h"

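The list macros earlier in this file operate on any struct that carries prev/next pointers with caller-chosen member names; as the _UNSAFE suffix implies, locking is the caller's responsibility. A minimal sketch of how they compose, assuming only the macros shown above (the node struct and values are hypothetical):

// Hypothetical node type; 'prev' and 'next' are the member names passed to the macros.
struct node {
    int value;
    struct node *prev;
    struct node *next;
};

void list_macros_example(void) {
    struct node *head = NULL;
    struct node a = { .value = 1 }, b = { .value = 2 }, c = { .value = 3 };

    DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(head, &a, prev, next);            // list: a
    DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(head, &c, prev, next);            // list: a, c
    DOUBLE_LINKED_LIST_INSERT_ITEM_BEFORE_UNSAFE(head, &c, &b, prev, next); // list: a, b, c

    struct node *n;
    DOUBLE_LINKED_LIST_FOREACH_FORWARD(head, n, prev, next) {
        // visits a, b, c in order
    }

    DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(head, &b, prev, next);            // list: a, c
}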
View File

@ -12,8 +12,8 @@ typedef pthread_mutex_t netdata_mutex_t;
typedef struct netdata_spinlock {
bool locked;
#ifdef NETDATA_INTERNAL_CHECKS
size_t spins;
pid_t locker_pid;
size_t spins;
#endif
} SPINLOCK;

View File

@ -43,7 +43,7 @@ static void netdata_popen_tracking_add_pid_unsafe(pid_t pid) {
mp = mallocz(sizeof(struct netdata_popen));
mp->pid = pid;
DOUBLE_LINKED_LIST_PREPEND_UNSAFE(netdata_popen_root, mp, prev, next);
DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(netdata_popen_root, mp, prev, next);
}
// myp_del deletes pid if we're tracking.
@ -61,7 +61,7 @@ static void netdata_popen_tracking_del_pid(pid_t pid) {
}
if(mp) {
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(netdata_popen_root, mp, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(netdata_popen_root, mp, prev, next);
freez(mp);
}
else
@ -96,7 +96,7 @@ void netdata_popen_tracking_cleanup(void) {
while(netdata_popen_root) {
struct netdata_popen *mp = netdata_popen_root;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(netdata_popen_root, mp, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(netdata_popen_root, mp, prev, next);
freez(mp);
}

View File

@ -100,7 +100,7 @@ void worker_register(const char *name) {
}
netdata_spinlock_lock(&workname->spinlock);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(workname->base, worker, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(workname->base, worker, prev, next);
netdata_spinlock_unlock(&workname->spinlock);
netdata_spinlock_unlock(&workers_globals.spinlock);
@ -141,7 +141,7 @@ void worker_unregister(void) {
if(PValue) {
struct workers_workname *workname = *PValue;
netdata_spinlock_lock(&workname->spinlock);
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(workname->base, worker, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(workname->base, worker, prev, next);
netdata_spinlock_unlock(&workname->spinlock);
if(!workname->base) {

View File

@ -392,7 +392,7 @@ static void replication_query_execute(BUFFER *wb, struct replication_query *q, s
if (likely( d->sp.start_time_s <= min_end_time &&
d->sp.end_time_s >= min_end_time &&
!storage_point_is_unset(d->sp) &&
!storage_point_is_empty(d->sp))) {
!storage_point_is_gap(d->sp))) {
buffer_sprintf(wb, PLUGINSD_KEYWORD_REPLAY_SET " \"%s\" " NETDATA_DOUBLE_FORMAT " \"%s\"\n",
rrddim_id(d->rd), d->sp.sum, d->sp.flags & SN_FLAG_RESET ? "R" : "");

View File

@ -498,8 +498,8 @@ int connect_to_one_of_destinations(
// move the current item to the end of the list
// without this, this destination will break the loop again and again
// not advancing the destinations to find one that may work
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(host->destinations, d, prev, next);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(host->destinations, d, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(host->destinations, d, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(host->destinations, d, prev, next);
break;
}
@ -522,7 +522,7 @@ bool destinations_init_add_one(char *entry, void *data) {
__atomic_add_fetch(&netdata_buffers_statistics.rrdhost_senders, sizeof(struct rrdpush_destinations), __ATOMIC_RELAXED);
DOUBLE_LINKED_LIST_APPEND_UNSAFE(t->list, d, prev, next);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(t->list, d, prev, next);
t->count++;
info("STREAM: added streaming destination No %d: '%s' to host '%s'", t->count, string2str(d->destination), rrdhost_hostname(t->host));
@ -549,7 +549,7 @@ void rrdpush_destinations_init(RRDHOST *host) {
void rrdpush_destinations_free(RRDHOST *host) {
while (host->destinations) {
struct rrdpush_destinations *tmp = host->destinations;
DOUBLE_LINKED_LIST_REMOVE_UNSAFE(host->destinations, tmp, prev, next);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(host->destinations, tmp, prev, next);
string_freez(tmp->destination);
freez(tmp);
__atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_senders, sizeof(struct rrdpush_destinations), __ATOMIC_RELAXED);

View File

@ -17,7 +17,7 @@
#include "percentile/percentile.h"
#include "trimmed_mean/trimmed_mean.h"
#define POINTS_TO_EXPAND_QUERY 0
#define POINTS_TO_EXPAND_QUERY 5
// ----------------------------------------------------------------------------
@ -927,6 +927,8 @@ typedef struct query_engine_ops {
// query planer
size_t current_plan;
time_t current_plan_expire_time;
time_t plan_expanded_after;
time_t plan_expanded_before;
// storage queries
size_t tier;
@ -955,6 +957,20 @@ typedef struct query_engine_ops {
#define query_plan_should_switch_plan(ops, now) ((now) >= (ops)->current_plan_expire_time)
static size_t query_planer_expand_duration_in_points(time_t this_update_every, time_t next_update_every) {
time_t delta = this_update_every - next_update_every;
if(delta < 0) delta = -delta;
size_t points;
if(delta < this_update_every * POINTS_TO_EXPAND_QUERY)
points = POINTS_TO_EXPAND_QUERY;
else
points = (delta + this_update_every - 1) / this_update_every;
return points;
}
static void query_planer_initialize_plans(QUERY_ENGINE_OPS *ops) {
QUERY_METRIC *qm = ops->qm;
@ -962,8 +978,35 @@ static void query_planer_initialize_plans(QUERY_ENGINE_OPS *ops) {
size_t tier = qm->plan.array[p].tier;
time_t update_every = qm->tiers[tier].db_update_every_s;
time_t after = qm->plan.array[p].after - (update_every * POINTS_TO_EXPAND_QUERY);
time_t before = qm->plan.array[p].before + (update_every * POINTS_TO_EXPAND_QUERY);
size_t points_to_add_to_after;
if(p > 0) {
// there is another plan before this one
size_t tier0 = qm->plan.array[p - 1].tier;
time_t update_every0 = qm->tiers[tier0].db_update_every_s;
points_to_add_to_after = query_planer_expand_duration_in_points(update_every, update_every0);
}
else
points_to_add_to_after = (tier == 0) ? 0 : POINTS_TO_EXPAND_QUERY;
size_t points_to_add_to_before;
if(p + 1 < qm->plan.used) {
// there is another plan after this one
size_t tier1 = qm->plan.array[p+1].tier;
time_t update_every1 = qm->tiers[tier1].db_update_every_s;
points_to_add_to_before = query_planer_expand_duration_in_points(update_every, update_every1);
}
else
points_to_add_to_before = POINTS_TO_EXPAND_QUERY;
time_t after = qm->plan.array[p].after - (time_t)(update_every * points_to_add_to_after);
time_t before = qm->plan.array[p].before + (time_t)(update_every * points_to_add_to_before);
qm->plan.array[p].expanded_after = after;
qm->plan.array[p].expanded_before = before;
struct query_metric_tier *tier_ptr = &qm->tiers[tier];
tier_ptr->eng->api.query_ops.init(
@ -1027,9 +1070,12 @@ static void query_planer_activate_plan(QUERY_ENGINE_OPS *ops, size_t plan_id, ti
ops->current_plan_expire_time = qm->plan.array[plan_id + 1].after;
else
ops->current_plan_expire_time = qm->plan.array[plan_id].before;
ops->plan_expanded_after = qm->plan.array[plan_id].expanded_after;
ops->plan_expanded_before = qm->plan.array[plan_id].expanded_before;
}
static void query_planer_next_plan(QUERY_ENGINE_OPS *ops, time_t now, time_t last_point_end_time) {
static bool query_planer_next_plan(QUERY_ENGINE_OPS *ops, time_t now, time_t last_point_end_time) {
QUERY_METRIC *qm = ops->qm;
size_t old_plan = ops->current_plan;
@ -1043,7 +1089,7 @@ static void query_planer_next_plan(QUERY_ENGINE_OPS *ops, time_t now, time_t las
ops->current_plan_expire_time = ops->r->internal.qt->window.before;
// let the query run with current plan
// we will not switch it
return;
return false;
}
next_plan_before_time = qm->plan.array[ops->current_plan].before;
@ -1052,11 +1098,12 @@ static void query_planer_next_plan(QUERY_ENGINE_OPS *ops, time_t now, time_t las
if(!query_metric_is_valid_tier(qm, qm->plan.array[ops->current_plan].tier)) {
ops->current_plan = old_plan;
ops->current_plan_expire_time = ops->r->internal.qt->window.before;
return;
return false;
}
query_planer_finalize_plan(ops, old_plan);
query_planer_activate_plan(ops, ops->current_plan, MIN(now, last_point_end_time));
return true;
}
static int compare_query_plan_entries_on_start_time(const void *a, const void *b) {
@ -1273,6 +1320,11 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
QUERY_POINT last1_point = QUERY_POINT_EMPTY;
QUERY_POINT new_point = QUERY_POINT_EMPTY;
// ONE POINT READ-AHEAD
// when we switch plans, we read one point ahead from the next plan
// so that the two plans join smoothly at the exact time the next plan begins
STORAGE_POINT next1_point = STORAGE_POINT_UNSET;
time_t now_start_time = after_wanted - ops->query_granularity;
time_t now_end_time = after_wanted + ops->view_update_every - ops->query_granularity;
@ -1311,8 +1363,41 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
// fetch the new point
{
db_points_read_since_plan_switch++;
STORAGE_POINT sp = ops->next_metric(ops->handle);
STORAGE_POINT sp;
if(likely(storage_point_is_unset(next1_point))) {
db_points_read_since_plan_switch++;
sp = ops->next_metric(ops->handle);
}
else {
// ONE POINT READ-AHEAD
sp = next1_point;
storage_point_unset(next1_point);
db_points_read_since_plan_switch = 1;
}
// ONE POINT READ-AHEAD
if(unlikely(query_plan_should_switch_plan(ops, sp.end_time_s) &&
query_planer_next_plan(ops, now_end_time, new_point.end_time))) {
// The end time of the current point crosses our plan (tier) boundary,
// so we have switched to the next plan (tier)
//
// There are 2 cases now:
//
// A. the point of the previous plan lies entirely in the future of the point from the next plan
// B. part of the point of the previous plan overlaps with the point from the next plan
STORAGE_POINT sp2 = ops->next_metric(ops->handle);
if(sp.start_time_s > sp2.start_time_s)
// the point from the previous plan is useless
sp = sp2;
else
// let the query run from the previous plan
// but setting this will also cut off the interpolation
// of the point from the previous plan
next1_point = sp2;
}
ops->db_points_read_per_tier[ops->tier]++;
ops->db_total_points_read++;
@ -1326,8 +1411,8 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
// info("QUERY: got point %zu, from time %ld to %ld // now from %ld to %ld // query from %ld to %ld",
// new_point.id, new_point.start_time, new_point.end_time, now_start_time, now_end_time, after_wanted, before_wanted);
//
// set the right value to the point we got
if(likely(!storage_point_is_unset(sp) && !storage_point_is_empty(sp))) {
// get the right value from the point we got
if(likely(!storage_point_is_unset(sp) && !storage_point_is_gap(sp))) {
if(unlikely(use_anomaly_bit_as_value))
new_point.value = new_point.anomaly;
@ -1391,7 +1476,7 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
count_same_end_time = 0;
// decide how to use this point
if(likely(new_point.end_time < now_end_time)) { // likely to favor tier0
if(likely(new_point.end_time <= now_end_time)) { // likely to favor tier0
// this db point ends before our now_end_time
if(likely(new_point.end_time >= now_start_time)) { // likely to favor tier0
@ -1408,8 +1493,8 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
// at exactly the time we will want
// we only log if this is not point 1
internal_error(new_point.end_time < after_wanted &&
new_point.id > POINTS_TO_EXPAND_QUERY + 1,
internal_error(new_point.end_time < ops->plan_expanded_after &&
db_points_read_since_plan_switch > 1,
"QUERY: '%s', dimension '%s' next_metric() "
"returned point %zu from %ld time %ld, "
"which is entirely before our current timeframe %ld to %ld "
@ -1417,7 +1502,7 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
qt->id, string2str(qm->dimension.id),
new_point.id, new_point.start_time, new_point.end_time,
now_start_time, now_end_time,
after_wanted, before_wanted);
ops->plan_expanded_after, ops->plan_expanded_before);
}
}
@ -1430,19 +1515,30 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
if(unlikely(count_same_end_time)) {
internal_error(true,
"QUERY: '%s', dimension '%s', the database does not advance the query, it returned an end time less or equal to the end time of the last point we got %ld, %zu times",
qt->id, string2str(qm->dimension.id), last1_point.end_time, count_same_end_time);
"QUERY: '%s', dimension '%s', the database does not advance the query,"
" it returned an end time less or equal to the end time of the last "
"point we got %ld, %zu times",
qt->id, string2str(qm->dimension.id),
last1_point.end_time, count_same_end_time);
if(unlikely(new_point.end_time <= last1_point.end_time))
new_point.end_time = now_end_time;
}
time_t stop_time = new_point.end_time;
if(unlikely(!storage_point_is_unset(next1_point))) {
// ONE POINT READ-AHEAD
// the point crosses the start time of the
// read-ahead storage point we have already fetched
stop_time = next1_point.start_time_s;
}
// the inner loop
// we have 3 points in memory: last2, last1, new
// we select the one to use based on their timestamps
size_t iterations = 0;
for ( ; now_end_time <= new_point.end_time && points_added < points_wanted ;
for ( ; now_end_time <= stop_time && points_added < points_wanted ;
now_end_time += ops->view_update_every, iterations++) {
// now_start_time is wrong in this loop

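The helper added above decides how far each plan is expanded at a tier boundary so that adjacent tiers overlap by a few points. The standalone sketch below replays that arithmetic with illustrative update-every values (the 1s and 60s granularities are assumptions for the example, not taken from this diff; the function body mirrors query_planer_expand_duration_in_points above):

#include <stdio.h>
#include <time.h>

#define POINTS_TO_EXPAND_QUERY 5

// same arithmetic as query_planer_expand_duration_in_points() in the diff above
static size_t expand_duration_in_points(time_t this_update_every, time_t next_update_every) {
    time_t delta = this_update_every - next_update_every;
    if(delta < 0) delta = -delta;

    size_t points;
    if(delta < this_update_every * POINTS_TO_EXPAND_QUERY)
        points = POINTS_TO_EXPAND_QUERY;
    else
        points = (delta + this_update_every - 1) / this_update_every;

    return points;
}

int main(void) {
    // joining a 60s/point plan to a 1s/point plan:
    // delta = 59 < 60 * 5, so the plan is expanded by 5 points (5 * 60s = 300s)
    printf("60s -> 1s : %zu points\n", expand_duration_in_points(60, 1));

    // joining a 1s/point plan to a 60s/point plan:
    // delta = 59 >= 1 * 5, so the plan is expanded by ceil(59/1) = 59 points (59s)
    printf("1s -> 60s : %zu points\n", expand_duration_in_points(1, 60));
    return 0;
}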
View File

@ -129,10 +129,10 @@ void web_client_request_done(struct web_client *w) {
, mode
, sent
, size
, -((size > 0) ? ((size - sent) / (double) size * 100.0) : 0.0)
, dt_usec(&w->tv_ready, &w->tv_in) / 1000.0
, dt_usec(&tv, &w->tv_ready) / 1000.0
, dt_usec(&tv, &w->tv_in) / 1000.0
, -((size > 0) ? ((double)(size - sent) / (double) size * 100.0) : 0.0)
, (double)dt_usec(&w->tv_ready, &w->tv_in) / 1000.0
, (double)dt_usec(&tv, &w->tv_ready) / 1000.0
, (double)dt_usec(&tv, &w->tv_in) / 1000.0
, w->response.code
, strip_control_characters(w->last_url)
);
@ -302,7 +302,7 @@ int mysendfile(struct web_client *w, char *filename) {
}
}
// if the filename contains a .. refuse to serve it
// if the filename contains a double dot refuse to serve it
if(strstr(filename, "..") != 0) {
debug(D_WEB_CLIENT_ACCESS, "%llu: File '%s' is not acceptable.", w->id, filename);
w->response.data->contenttype = CT_TEXT_HTML;
@ -831,9 +831,8 @@ static inline char *web_client_valid_method(struct web_client *w, char *s) {
* @param s is the first address of the string.
* @param ptr is the address of the separator.
*/
static void web_client_set_path_query(struct web_client *w, char *s, char *ptr) {
static void web_client_set_path_query(struct web_client *w, const char *s, char *ptr) {
w->url_path_length = (size_t)(ptr -s);
w->url_search_path = ptr;
}
@ -1429,7 +1428,7 @@ static inline int web_client_process_url(RRDHOST *host, struct web_client *w, ch
// replace the zero bytes with spaces
buffer_char_replace(w->response.data, '\0', ' ');
// just leave the buffer as is
// just leave the buffer as-is
// it will be copied back to the client
return HTTP_RESP_OK;
@ -1546,7 +1545,7 @@ void web_client_process_request(struct web_client *w) {
break;
}
// keep track of the time we done processing
// keep track of the processing time
now_realtime_timeval(&w->tv_ready);
w->response.sent = 0;
@ -1847,7 +1846,7 @@ ssize_t web_client_read_file(struct web_client *w)
if(unlikely(w->response.rlen <= w->response.data->len))
return 0;
ssize_t left = w->response.rlen - w->response.data->len;
ssize_t left = (ssize_t)(w->response.rlen - w->response.data->len);
ssize_t bytes = read(w->ifd, &w->response.data->buffer[w->response.data->len], (size_t)left);
if(likely(bytes > 0)) {
size_t old = w->response.data->len;
@ -1897,7 +1896,7 @@ ssize_t web_client_receive(struct web_client *w)
return web_client_read_file(w);
ssize_t bytes;
ssize_t left = w->response.data->size - w->response.data->len;
ssize_t left = (ssize_t)(w->response.data->size - w->response.data->len);
// do we have any space for more data?
buffer_need_bytes(w->response.data, NETDATA_WEB_REQUEST_RECEIVE_SIZE);