diff --git a/src/Makefile b/src/Makefile index 5d2b2b395..fec6573c2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -139,7 +139,7 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o adlist.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o REDIS_BENCHMARK_NAME=redis-benchmark diff --git a/src/cluster.c b/src/cluster.c index 4d7b0502d..1a38a8e47 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -476,8 +476,10 @@ void clusterInit(void) { } } - /* The slots -> keys map is a sorted set. Init it. */ - server.cluster->slots_to_keys = zslCreate(); + /* The slots -> keys map is a radix tree. Initialize it here. */ + server.cluster->slots_to_keys = raxNew(); + memset(server.cluster->slots_keys_count,0, + sizeof(server.cluster->slots_keys_count)); /* Set myself->port / cport to my listening ports, we'll just need to * discover the IP address via MEET messages. */ diff --git a/src/cluster.h b/src/cluster.h index 6dd69a01b..be6fe0bdc 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -116,7 +116,8 @@ typedef struct clusterState { clusterNode *migrating_slots_to[CLUSTER_SLOTS]; clusterNode *importing_slots_from[CLUSTER_SLOTS]; clusterNode *slots[CLUSTER_SLOTS]; - zskiplist *slots_to_keys; + uint64_t slots_keys_count[CLUSTER_SLOTS]; + rax *slots_to_keys; /* The following fields are used to take the slave state on elections. */ mstime_t failover_auth_time; /* Time of previous or next election. */ int failover_auth_count; /* Number of votes received so far. */ diff --git a/src/db.c b/src/db.c index a21437c76..ee7398abb 100644 --- a/src/db.c +++ b/src/db.c @@ -1301,90 +1301,85 @@ int *migrateGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkey /* Slot to Key API. This is used by Redis Cluster in order to obtain in * a fast way a key that belongs to a specified hash slot. This is useful - * while rehashing the cluster. */ -void slotToKeyAdd(robj *key) { + * while rehashing the cluster and in other conditions when we need to + * understand if we have keys for a given hash slot. */ +void slotToKeyUpdateKey(robj *key, int add) { unsigned int hashslot = keyHashSlot(key->ptr,sdslen(key->ptr)); + unsigned char buf[64]; + unsigned char *indexed = buf; + size_t keylen = sdslen(key->ptr); - sds sdskey = sdsdup(key->ptr); - zslInsert(server.cluster->slots_to_keys,hashslot,sdskey); + server.cluster->slots_keys_count[hashslot] += add ? 1 : -1; + if (keylen+2 > 64) indexed = zmalloc(keylen+2); + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + memcpy(indexed+2,key->ptr,keylen); + if (add) { + raxInsert(server.cluster->slots_to_keys,indexed,keylen+2,NULL); + } else { + raxRemove(server.cluster->slots_to_keys,indexed,keylen+2); + } + if (indexed != buf) zfree(indexed); +} + +void slotToKeyAdd(robj *key) { + slotToKeyUpdateKey(key,1); } void slotToKeyDel(robj *key) { - unsigned int hashslot = keyHashSlot(key->ptr,sdslen(key->ptr)); - zslDelete(server.cluster->slots_to_keys,hashslot,key->ptr,NULL); + slotToKeyUpdateKey(key,0); } void slotToKeyFlush(void) { - zslFree(server.cluster->slots_to_keys); - server.cluster->slots_to_keys = zslCreate(); + raxFree(server.cluster->slots_to_keys); + server.cluster->slots_to_keys = raxNew(); + memset(server.cluster->slots_keys_count,0, + sizeof(server.cluster->slots_keys_count)); } /* Pupulate the specified array of objects with keys in the specified slot. * New objects are returned to represent keys, it's up to the caller to * decrement the reference count to release the keys names. */ unsigned int getKeysInSlot(unsigned int hashslot, robj **keys, unsigned int count) { - zskiplistNode *n; - zrangespec range; + raxIterator iter; int j = 0; + unsigned char indexed[2]; - range.min = range.max = hashslot; - range.minex = range.maxex = 0; - - n = zslFirstInRange(server.cluster->slots_to_keys, &range); - while(n && n->score == hashslot && count--) { - keys[j++] = createStringObject(n->ele,sdslen(n->ele)); - n = n->level[0].forward; + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + raxStart(&iter,server.cluster->slots_to_keys); + raxSeek(&iter,indexed,2,">="); + while(count-- && raxNext(&iter,NULL,0,NULL)) { + if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; + keys[j++] = createStringObject((char*)iter.key+2,iter.key_len-2); } + raxStop(&iter); return j; } /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ unsigned int delKeysInSlot(unsigned int hashslot) { - zskiplistNode *n; - zrangespec range; + raxIterator iter; int j = 0; + unsigned char indexed[2]; - range.min = range.max = hashslot; - range.minex = range.maxex = 0; + indexed[0] = (hashslot >> 8) & 0xff; + indexed[1] = hashslot & 0xff; + raxStart(&iter,server.cluster->slots_to_keys); + while(server.cluster->slots_keys_count[hashslot]) { + raxSeek(&iter,indexed,2,">="); + raxNext(&iter,NULL,0,NULL); - n = zslFirstInRange(server.cluster->slots_to_keys, &range); - while(n && n->score == hashslot) { - sds sdskey = n->ele; - robj *key = createStringObject(sdskey,sdslen(sdskey)); - n = n->level[0].forward; /* Go to the next item before freeing it. */ + robj *key = createStringObject((char*)iter.key+2,iter.key_len-2); dbDelete(&server.db[0],key); decrRefCount(key); j++; } + raxStop(&iter); return j; } unsigned int countKeysInSlot(unsigned int hashslot) { - zskiplist *zsl = server.cluster->slots_to_keys; - zskiplistNode *zn; - zrangespec range; - int rank, count = 0; - - range.min = range.max = hashslot; - range.minex = range.maxex = 0; - - /* Find first element in range */ - zn = zslFirstInRange(zsl, &range); - - /* Use rank of first element, if any, to determine preliminary count */ - if (zn != NULL) { - rank = zslGetRank(zsl, zn->score, zn->ele); - count = (zsl->length - (rank - 1)); - - /* Find last element in range */ - zn = zslLastInRange(zsl, &range); - - /* Use rank of last element, if any, to determine the actual count */ - if (zn != NULL) { - rank = zslGetRank(zsl, zn->score, zn->ele); - count -= (zsl->length - rank); - } - } - return count; + return server.cluster->slots_keys_count[hashslot]; } diff --git a/src/lazyfree.c b/src/lazyfree.c index c05252159..8d56e1031 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -97,11 +97,14 @@ void emptyDbAsync(redisDb *db) { /* Empty the slots-keys map of Redis CLuster by creating a new empty one * and scheduiling the old for lazy freeing. */ void slotToKeyFlushAsync(void) { - zskiplist *oldsl = server.cluster->slots_to_keys; - server.cluster->slots_to_keys = zslCreate(); - atomicIncr(lazyfree_objects,oldsl->length, + rax *old = server.cluster->slots_to_keys; + + server.cluster->slots_to_keys = raxNew(); + memset(server.cluster->slots_keys_count,0, + sizeof(server.cluster->slots_keys_count)); + atomicIncr(lazyfree_objects,old->numele, lazyfree_objects_mutex); - bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,NULL,oldsl); + bioCreateBackgroundJob(BIO_LAZY_FREE,NULL,NULL,old); } /* Release objects from the lazyfree thread. It's just decrRefCount() @@ -125,8 +128,8 @@ void lazyfreeFreeDatabaseFromBioThread(dict *ht1, dict *ht2) { /* Release the skiplist mapping Redis Cluster keys to slots in the * lazyfree thread. */ -void lazyfreeFreeSlotsMapFromBioThread(zskiplist *sl) { - size_t len = sl->length; - zslFree(sl); +void lazyfreeFreeSlotsMapFromBioThread(rax *rt) { + size_t len = rt->numele; + raxFree(rt); atomicDecr(lazyfree_objects,len,lazyfree_objects_mutex); } diff --git a/src/rax.c b/src/rax.c new file mode 100644 index 000000000..ca49787ed --- /dev/null +++ b/src/rax.c @@ -0,0 +1,1845 @@ +/* Rax -- A radix tree implementation. + * + * Copyright (c) 2017, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include "rax.h" +#include "rax_malloc.h" + +/* This is a special pointer that is guaranteed to never have the same value + * of a radix tree node. It's used in order to report "not found" error without + * requiring the function to have multiple return values. */ +void *raxNotFound = (void*)"rax-not-found-pointer"; + +/* -------------------------------- Debugging ------------------------------ */ + +void raxDebugShowNode(const char *msg, raxNode *n); + +/* Turn debugging messages on/off. */ +#if 0 +#define debugf(...) \ + do { \ + printf("%s:%s:%d:\t", __FILE__, __FUNCTION__, __LINE__); \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + } while (0); + +#define debugnode(msg,n) raxDebugShowNode(msg,n) +#else +#define debugf(...) +#define debugnode(msg,n) +#endif + +/* ------------------------- raxStack functions -------------------------- + * The raxStack is a simple stack of pointers that is capable of switching + * from using a stack-allocated array to dynamic heap once a given number of + * items are reached. It is used in order to retain the list of parent nodes + * while walking the radix tree in order to implement certain operations that + * need to navigate the tree upward. + * ------------------------------------------------------------------------- */ + +/* Initialize the stack. */ +static inline void raxStackInit(raxStack *ts) { + ts->stack = ts->static_items; + ts->items = 0; + ts->maxitems = RAX_STACK_STATIC_ITEMS; + ts->oom = 0; +} + +/* Push an item into the stack, returns 1 on success, 0 on out of memory. */ +static inline int raxStackPush(raxStack *ts, void *ptr) { + if (ts->items == ts->maxitems) { + if (ts->stack == ts->static_items) { + ts->stack = rax_malloc(sizeof(void*)*ts->maxitems*2); + if (ts->stack == NULL) { + ts->stack = ts->static_items; + ts->oom = 1; + return 0; + } + memcpy(ts->stack,ts->static_items,sizeof(void*)*ts->maxitems); + } else { + void **newalloc = rax_realloc(ts->stack,sizeof(void*)*ts->maxitems*2); + if (newalloc == NULL) { + ts->oom = 1; + return 0; + } + ts->stack = newalloc; + } + ts->maxitems *= 2; + } + ts->stack[ts->items] = ptr; + ts->items++; + return 1; +} + +/* Pop an item from the stack, the function returns NULL if there are no + * items to pop. */ +static inline void *raxStackPop(raxStack *ts) { + if (ts->items == 0) return NULL; + ts->items--; + return ts->stack[ts->items]; +} + +/* Return the stack item at the top of the stack without actually consuming + * it. */ +static inline void *raxStackPeek(raxStack *ts) { + if (ts->items == 0) return NULL; + return ts->stack[ts->items-1]; +} + +/* Free the stack in case we used heap allocation. */ +static inline void raxStackFree(raxStack *ts) { + if (ts->stack != ts->static_items) rax_free(ts->stack); +} + +/* ---------------------------------------------------------------------------- + * Radis tree implementation + * --------------------------------------------------------------------------*/ + +/* Allocate a new non compressed node with the specified number of children. + * If datafiled is true, the allocation is made large enough to hold the + * associated data pointer. + * Returns the new node pointer. On out of memory NULL is returned. */ +raxNode *raxNewNode(size_t children, int datafield) { + size_t nodesize = sizeof(raxNode)+children+ + sizeof(raxNode*)*children; + if (datafield) nodesize += sizeof(void*); + raxNode *node = rax_malloc(nodesize); + if (node == NULL) return NULL; + node->iskey = 0; + node->isnull = 0; + node->iscompr = 0; + node->size = children; + return node; +} + +/* Allocate a new rax and return its pointer. On out of memory the function + * returns NULL. */ +rax *raxNew(void) { + rax *rax = rax_malloc(sizeof(*rax)); + if (rax == NULL) return NULL; + rax->numele = 0; + rax->numnodes = 1; + rax->head = raxNewNode(0,0); + if (rax->head == NULL) { + rax_free(rax); + return NULL; + } else { + return rax; + } +} + +/* Return the current total size of the node. */ +#define raxNodeCurrentLength(n) ( \ + sizeof(raxNode)+(n)->size+ \ + ((n)->iscompr ? sizeof(raxNode*) : sizeof(raxNode*)*(n)->size)+ \ + (((n)->iskey && !(n)->isnull)*sizeof(void*)) \ +) + +/* realloc the node to make room for auxiliary data in order + * to store an item in that node. On out of memory NULL is returned. */ +raxNode *raxReallocForData(raxNode *n, void *data) { + if (data == NULL) return n; /* No reallocation needed, setting isnull=1 */ + size_t curlen = raxNodeCurrentLength(n); + return rax_realloc(n,curlen+sizeof(void*)); +} + +/* Set the node auxiliary data to the specified pointer. */ +void raxSetData(raxNode *n, void *data) { + n->iskey = 1; + if (data != NULL) { + void **ndata = (void**) + ((char*)n+raxNodeCurrentLength(n)-sizeof(void*)); + memcpy(ndata,&data,sizeof(data)); + n->isnull = 0; + } else { + n->isnull = 1; + } +} + +/* Get the node auxiliary data. */ +void *raxGetData(raxNode *n) { + if (n->isnull) return NULL; + void **ndata =(void**)((char*)n+raxNodeCurrentLength(n)-sizeof(void*)); + void *data; + memcpy(&data,ndata,sizeof(data)); + return data; +} + +/* Add a new child to the node 'n' representing the character 'c' and return + * its new pointer, as well as the child pointer by reference. Additionally + * '***parentlink' is populated with the raxNode pointer-to-pointer of where + * the new child was stored, which is useful for the caller to replace the + * child pointer if it gets reallocated. + * + * On success the new parent node pointer is returned (it may change because + * of the realloc, so the caller should discard 'n' and use the new value). + * On out of memory NULL is returned, and the old node is still valid. */ +raxNode *raxAddChild(raxNode *n, char c, raxNode **childptr, raxNode ***parentlink) { + assert(n->iscompr == 0); + + size_t curlen = sizeof(raxNode)+ + n->size+ + sizeof(raxNode*)*n->size; + size_t newlen; + + /* Alloc the new child we will link to 'n'. */ + raxNode *child = raxNewNode(0,0); + if (child == NULL) return NULL; + + /* Make space in the original node. */ + if (n->iskey) curlen += sizeof(void*); + newlen = curlen+sizeof(raxNode*)+1; /* Add 1 char and 1 pointer. */ + raxNode *newn = rax_realloc(n,newlen); + if (newn == NULL) { + rax_free(child); + return NULL; + } + n = newn; + + /* After the reallocation, we have 5/9 (depending on the system + * pointer size) bytes at the end, that is, the additional char + * in the 'data' section, plus one pointer to the new child: + * + * [numc][abx][ap][bp][xp]|auxp|..... + * + * Let's find where to insert the new child in order to make sure + * it is inserted in-place lexicographically. */ + int pos; + for (pos = 0; pos < n->size; pos++) { + if (n->data[pos] > c) break; + } + + /* Now, if present, move auxiliary data pointer at the end + * so that we can mess with the other data without overwriting it. + * We will obtain something like that: + * + * [numc][abx][ap][bp][xp].....|auxp| */ + unsigned char *src; + if (n->iskey && !n->isnull) { + src = n->data+n->size+sizeof(raxNode*)*n->size; + memmove(src+1+sizeof(raxNode*),src,sizeof(void*)); + } + + /* Now imagine we are adding a node with edge 'c'. The insertion + * point is between 'b' and 'x', so the 'pos' variable value is + * To start, move all the child pointers after the insertion point + * of 1+sizeof(pointer) bytes on the right, to obtain: + * + * [numc][abx][ap][bp].....[xp]|auxp| */ + src = n->data+n->size+sizeof(raxNode*)*pos; + memmove(src+1+sizeof(raxNode*),src,sizeof(raxNode*)*(n->size-pos)); + + /* Now make the space for the additional char in the data section, + * but also move the pointers before the insertion point in the right + * by 1 byte, in order to obtain the following: + * + * [numc][ab.x][ap][bp]....[xp]|auxp| */ + src = n->data+pos; + memmove(src+1,src,n->size-pos+sizeof(raxNode*)*pos); + + /* We can now set the character and its child node pointer to get: + * + * [numc][abcx][ap][bp][cp]....|auxp| + * [numc][abcx][ap][bp][cp][xp]|auxp| */ + n->data[pos] = c; + n->size++; + raxNode **childfield = (raxNode**)(n->data+n->size+sizeof(raxNode*)*pos); + memcpy(childfield,&child,sizeof(child)); + *childptr = child; + *parentlink = childfield; + return n; +} + +/* Return the pointer to the last child pointer in a node. For the compressed + * nodes this is the only child pointer. */ +#define raxNodeLastChildPtr(n) ((raxNode**) ( \ + ((char*)(n)) + \ + raxNodeCurrentLength(n) - \ + sizeof(raxNode*) - \ + (((n)->iskey && !(n)->isnull) ? sizeof(void*) : 0) \ +)) + +/* Return the pointer to the first child pointer. */ +#define raxNodeFirstChildPtr(n) ((raxNode**)((n)->data+(n)->size)) + +/* Turn the node 'n', that must be a node without any children, into a + * compressed node representing a set of nodes linked one after the other + * and having exactly one child each. The node can be a key or not: this + * property and the associated value if any will be preserved. + * + * The function also returns a child node, since the last node of the + * compressed chain cannot be part of the chain: it has zero children while + * we can only compress inner nodes with exactly one child each. */ +raxNode *raxCompressNode(raxNode *n, unsigned char *s, size_t len, raxNode **child) { + assert(n->size == 0 && n->iscompr == 0); + void *data = NULL; /* Initialized only to avoid warnings. */ + size_t newsize; + + debugf("Compress node: %.*s\n", (int)len,s); + + /* Allocate the child to link to this node. */ + *child = raxNewNode(0,0); + if (*child == NULL) return NULL; + + /* Make space in the parent node. */ + newsize = sizeof(raxNode)+len+sizeof(raxNode*); + if (n->iskey) { + data = raxGetData(n); /* To restore it later. */ + if (!n->isnull) newsize += sizeof(void*); + } + raxNode *newn = rax_realloc(n,newsize); + if (newn == NULL) { + rax_free(*child); + return NULL; + } + n = newn; + + n->iscompr = 1; + n->size = len; + memcpy(n->data,s,len); + if (n->iskey) raxSetData(n,data); + raxNode **childfield = raxNodeLastChildPtr(n); + memcpy(childfield,&child,sizeof(child)); + return n; +} + +/* Low level function that walks the tree looking for the string + * 's' of 'len' bytes. The function returns the number of characters + * of the key that was possible to process: if the returned integer + * is the same as 'len', then it means that the node corresponding to the + * string was found (however it may not be a key in case the node->iskey is + * zero or if simply we stopped in the middle of a compressed node, so that + * 'splitpos' is non zero). + * + * Otherwise if the returned integer is not the same as 'len', there was an + * early stop during the tree walk because of a character mismatch. + * + * The node where the search ended (because the full string was processed + * or because there was an early stop) is returned by reference as + * '*stopnode' if the passed pointer is not NULL. This node link in the + * parent's node is returned as '*plink' if not NULL. Finally, if the + * search stopped in a compressed node, '*splitpos' returns the index + * inside the compressed node where the search ended. This is useful to + * know where to split the node for insertion. */ +static inline size_t raxLowWalk(rax *rax, unsigned char *s, size_t len, raxNode **stopnode, raxNode ***plink, int *splitpos, raxStack *ts) { + raxNode *h = rax->head; + raxNode **parentlink = &rax->head; + + size_t i = 0; /* Position in the string. */ + size_t j = 0; /* Position in the node children (or bytes if compressed).*/ + while(h->size && i < len) { + debugnode("Lookup current node",h); + unsigned char *v = h->data; + + if (h->iscompr) { + for (j = 0; j < h->size && i < len; j++, i++) { + if (v[j] != s[i]) break; + } + if (j != h->size) break; + } else { + /* Even when h->size is large, linear scan provides good + * performances compared to other approaches that are in theory + * more sounding, like performing a binary search. */ + for (j = 0; j < h->size; j++) { + if (v[j] == s[i]) break; + } + if (j == h->size) break; + i++; + } + + if (ts) raxStackPush(ts,h); /* Save stack of parent nodes. */ + raxNode **children = raxNodeFirstChildPtr(h); + if (h->iscompr) j = 0; /* Compressed node only child is at index 0. */ + memcpy(&h,children+j,sizeof(h)); + parentlink = children+j; + j = 0; /* If the new node is compressed and we do not + iterate again (since i == l) set the split + position to 0 to signal this node represents + the searched key. */ + } + if (stopnode) *stopnode = h; + if (plink) *plink = parentlink; + if (splitpos && h->iscompr) *splitpos = j; + return i; +} + +/* Insert the element 's' of size 'len', setting as auxiliary data + * the pointer 'data'. If the element is already present, the associated + * data is updated, and 0 is returned, otherwise the element is inserted + * and 1 is returned. On out of memory the function returns 0 as well but + * sets errno to ENOMEM, otherwise errno will be set to 0. */ +int raxInsert(rax *rax, unsigned char *s, size_t len, void *data) { + size_t i; + int j = 0; /* Split position. If raxLowWalk() stops in a compressed + node, the index 'j' represents the char we stopped within the + compressed node, that is, the position where to split the + node for insertion. */ + raxNode *h, **parentlink; + + debugf("### Insert %.*s with value %p\n", (int)len, s, data); + i = raxLowWalk(rax,s,len,&h,&parentlink,&j,NULL); + + /* If i == len we walked following the whole string. If we are not + * in the middle of a compressed node, the string is either already + * inserted or this middle node is currently not a key, but can represent + * our key. We have just to reallocate the node and make space for the + * data pointer. */ + if (i == len && (!h->iscompr || j == 0 /* not in the middle if j is 0 */)) { + if (h->iskey) { + raxSetData(h,data); + errno = 0; + return 0; /* Element already exists. */ + } + h = raxReallocForData(h,data); + if (h == NULL) { + errno = ENOMEM; + return 0; + } + memcpy(parentlink,&h,sizeof(h)); + raxSetData(h,data); + rax->numele++; + return 1; /* Element inserted. */ + } + + /* If the node we stopped at is a compressed node, we need to + * split it before to continue. + * + * Splitting a compressed node have a few possibile cases. + * Imagine that the node 'h' we are currently at is a compressed + * node contaning the string "ANNIBALE" (it means that it represents + * nodes A -> N -> N -> I -> B -> A -> L -> E with the only child + * pointer of this node pointing at the 'E' node, because remember that + * we have characters at the edges of the graph, not inside the nodes + * themselves. + * + * In order to show a real case imagine our node to also point to + * another compressed node, that finally points at the node without + * children, representing 'O': + * + * "ANNIBALE" -> "SCO" -> [] + * + * When inserting we may face the following cases. Note that all the cases + * require the insertion of a non compressed node with exactly two + * children, except for the last case which just requires splitting a + * compressed node. + * + * 1) Inserting "ANNIENTARE" + * + * |B| -> "ALE" -> "SCO" -> [] + * "ANNI" -> |-| + * |E| -> (... continue algo ...) "NTARE" -> [] + * + * 2) Inserting "ANNIBALI" + * + * |E| -> "SCO" -> [] + * "ANNIBAL" -> |-| + * |I| -> (... continue algo ...) [] + * + * 3) Inserting "AGO" (Like case 1, but set iscompr = 0 into original node) + * + * |N| -> "NIBALE" -> "SCO" -> [] + * |A| -> |-| + * |G| -> (... continue algo ...) |O| -> [] + * + * 4) Inserting "CIAO" + * + * |A| -> "NNIBALE" -> "SCO" -> [] + * |-| + * |C| -> (... continue algo ...) "IAO" -> [] + * + * 5) Inserting "ANNI" + * + * "ANNI" -> "BALE" -> "SCO" -> [] + * + * The final algorithm for insertion covering all the above cases is as + * follows. + * + * ============================= ALGO 1 ============================= + * + * For the above cases 1 to 4, that is, all cases where we stopped in + * the middle of a compressed node for a character mismatch, do: + * + * Let $SPLITPOS be the zero-based index at which, in the + * compressed node array of characters, we found the mismatching + * character. For example if the node contains "ANNIBALE" and we add + * "ANNIENTARE" the $SPLITPOS is 4, that is, the index at which the + * mismatching character is found. + * + * 1. Save the current compressed node $NEXT pointer (the pointer to the + * child element, that is always present in compressed nodes). + * + * 2. Create "split node" having as child the non common letter + * at the compressed node. The other non common letter (at the key) + * will be added later as we continue the normal insertion algorithm + * at step "6". + * + * 3a. IF $SPLITPOS == 0: + * Replace the old node with the split node, by copying the auxiliary + * data if any. Fix parent's reference. Free old node eventually + * (we still need its data for the next steps of the algorithm). + * + * 3b. IF $SPLITPOS != 0: + * Trim the compressed node (reallocating it as well) in order to + * contain $splitpos characters. Change chilid pointer in order to link + * to the split node. If new compressed node len is just 1, set + * iscompr to 0 (layout is the same). Fix parent's reference. + * + * 4a. IF the postfix len (the length of the remaining string of the + * original compressed node after the split character) is non zero, + * create a "postfix node". If the postfix node has just one character + * set iscompr to 0, otherwise iscompr to 1. Set the postfix node + * child pointer to $NEXT. + * + * 4b. IF the postfix len is zero, just use $NEXT as postfix pointer. + * + * 5. Set child[0] of split node to postfix node. + * + * 6. Set the split node as the current node, set current index at child[1] + * and continue insertion algorithm as usually. + * + * ============================= ALGO 2 ============================= + * + * For case 5, that is, if we stopped in the middle of a compressed + * node but no mismatch was found, do: + * + * Let $SPLITPOS be the zero-based index at which, in the + * compressed node array of characters, we stopped iterating because + * there were no more keys character to match. So in the example of + * the node "ANNIBALE", addig the string "ANNI", the $SPLITPOS is 4. + * + * 1. Save the current compressed node $NEXT pointer (the pointer to the + * child element, that is always present in compressed nodes). + * + * 2. Create a "postfix node" containing all the characters from $SPLITPOS + * to the end. Use $NEXT as the postfix node child pointer. + * If the postfix node length is 1, set iscompr to 0. + * Set the node as a key with the associated value of the new + * inserted key. + * + * 3. Trim the current node to contain the first $SPLITPOS characters. + * As usually if the new node length is just 1, set iscompr to 0. + * Take the iskey / associated value as it was in the orignal node. + * Fix the parent's reference. + * + * 4. Set the postfix node as the only child pointer of the trimmed + * node created at step 1. + */ + + /* ------------------------- ALGORITHM 1 --------------------------- */ + if (h->iscompr && i != len) { + debugf("ALGO 1: Stopped at compressed node %.*s (%p)\n", + h->size, h->data, (void*)h); + debugf("Still to insert: %.*s\n", (int)(len-i), s+i); + debugf("Splitting at %d: '%c'\n", j, ((char*)h->data)[j]); + debugf("Other (key) letter is '%c'\n", s[i]); + + /* 1: Save next pointer. */ + raxNode **childfield = raxNodeLastChildPtr(h); + raxNode *next; + memcpy(&next,childfield,sizeof(next)); + debugf("Next is %p\n", (void*)next); + debugf("iskey %d\n", h->iskey); + if (h->iskey) { + debugf("key value is %p\n", raxGetData(h)); + } + + /* Set the length of the additional nodes we will need. */ + size_t trimmedlen = j; + size_t postfixlen = h->size - j - 1; + int split_node_is_key = !trimmedlen && h->iskey && !h->isnull; + size_t nodesize; + + /* 2: Create the split node. Also allocate the other nodes we'll need + * ASAP, so that it will be simpler to handle OOM. */ + raxNode *splitnode = raxNewNode(1, split_node_is_key); + raxNode *trimmed = NULL; + raxNode *postfix = NULL; + + if (trimmedlen) { + nodesize = sizeof(raxNode)+trimmedlen+sizeof(raxNode*); + if (h->iskey && !h->isnull) nodesize += sizeof(void*); + trimmed = rax_malloc(nodesize); + } + + if (postfixlen) { + nodesize = sizeof(raxNode)+postfixlen+ + sizeof(raxNode*); + postfix = rax_malloc(nodesize); + } + + /* OOM? Abort now that the tree is untouched. */ + if (splitnode == NULL || + (trimmedlen && trimmed == NULL) || + (postfixlen && postfix == NULL)) + { + rax_free(splitnode); + rax_free(trimmed); + rax_free(postfix); + errno = ENOMEM; + return 0; + } + splitnode->data[0] = h->data[j]; + + if (j == 0) { + /* 3a: Replace the old node with the split node. */ + if (h->iskey) { + void *ndata = raxGetData(h); + raxSetData(splitnode,ndata); + } + memcpy(parentlink,&splitnode,sizeof(splitnode)); + } else { + /* 3b: Trim the compressed node. */ + trimmed->size = j; + memcpy(trimmed->data,h->data,j); + trimmed->iscompr = j > 1 ? 1 : 0; + trimmed->iskey = h->iskey; + trimmed->isnull = h->isnull; + if (h->iskey && !h->isnull) { + void *ndata = raxGetData(h); + raxSetData(trimmed,ndata); + } + raxNode **cp = raxNodeLastChildPtr(trimmed); + memcpy(cp,&splitnode,sizeof(splitnode)); + memcpy(parentlink,&trimmed,sizeof(trimmed)); + parentlink = cp; /* Set parentlink to splitnode parent. */ + rax->numnodes++; + } + + /* 4: Create the postfix node: what remains of the original + * compressed node after the split. */ + if (postfixlen) { + /* 4a: create a postfix node. */ + postfix->iskey = 0; + postfix->isnull = 0; + postfix->size = postfixlen; + postfix->iscompr = postfixlen > 1; + memcpy(postfix->data,h->data+j+1,postfixlen); + raxNode **cp = raxNodeLastChildPtr(postfix); + memcpy(cp,&next,sizeof(next)); + rax->numnodes++; + } else { + /* 4b: just use next as postfix node. */ + postfix = next; + } + + /* 5: Set splitnode first child as the postfix node. */ + raxNode **splitchild = raxNodeLastChildPtr(splitnode); + memcpy(splitchild,&postfix,sizeof(postfix)); + + /* 6. Continue insertion: this will cause the splitnode to + * get a new child (the non common character at the currently + * inserted key). */ + rax_free(h); + h = splitnode; + } else if (h->iscompr && i == len) { + /* ------------------------- ALGORITHM 2 --------------------------- */ + debugf("ALGO 2: Stopped at compressed node %.*s (%p) j = %d\n", + h->size, h->data, (void*)h, j); + + /* Allocate postfix & trimmed nodes ASAP to fail for OOM gracefully. */ + size_t postfixlen = h->size - j; + size_t nodesize = sizeof(raxNode)+postfixlen+sizeof(raxNode*); + if (data != NULL) nodesize += sizeof(void*); + raxNode *postfix = rax_malloc(nodesize); + + nodesize = sizeof(raxNode)+j+sizeof(raxNode*); + if (h->iskey && !h->isnull) nodesize += sizeof(void*); + raxNode *trimmed = rax_malloc(nodesize); + + if (postfix == NULL || trimmed == NULL) { + rax_free(postfix); + rax_free(trimmed); + errno = ENOMEM; + return 0; + } + + /* 1: Save next pointer. */ + raxNode **childfield = raxNodeLastChildPtr(h); + raxNode *next; + memcpy(&next,childfield,sizeof(next)); + + /* 2: Create the postfix node. */ + postfix->size = postfixlen; + postfix->iscompr = postfixlen > 1; + postfix->iskey = 1; + postfix->isnull = 0; + memcpy(postfix->data,h->data+j,postfixlen); + raxSetData(postfix,data); + raxNode **cp = raxNodeLastChildPtr(postfix); + memcpy(cp,&next,sizeof(next)); + rax->numnodes++; + + /* 3: Trim the compressed node. */ + trimmed->size = j; + trimmed->iscompr = j > 1; + trimmed->iskey = 0; + trimmed->isnull = 0; + memcpy(trimmed->data,h->data,j); + memcpy(parentlink,&trimmed,sizeof(trimmed)); + if (h->iskey) { + void *aux = raxGetData(h); + raxSetData(trimmed,aux); + } + + /* Fix the trimmed node child pointer to point to + * the postfix node. */ + cp = raxNodeLastChildPtr(trimmed); + memcpy(cp,&postfix,sizeof(postfix)); + + /* Finish! We don't need to contine with the insertion + * algorithm for ALGO 2. The key is already inserted. */ + rax->numele++; + return 1; /* Key inserted. */ + } + + /* We walked the radix tree as far as we could, but still there are left + * chars in our string. We need to insert the missing nodes. + * Note: while loop never entered if the node was split by ALGO2, + * since i == len. */ + while(i < len) { + raxNode *child; + rax->numnodes++; + + /* If this node is going to have a single child, and there + * are other characters, so that that would result in a chain + * of single-childed nodes, turn it into a compressed node. */ + if (h->size == 0 && len-i > 1) { + debugf("Inserting compressed node\n"); + size_t comprsize = len-i; + if (comprsize > RAX_NODE_MAX_SIZE) + comprsize = RAX_NODE_MAX_SIZE; + raxNode *newh = raxCompressNode(h,s+i,comprsize,&child); + if (newh == NULL) goto oom; + h = newh; + memcpy(parentlink,&h,sizeof(h)); + parentlink = raxNodeLastChildPtr(h); + i += comprsize; + } else { + debugf("Inserting normal node\n"); + raxNode **new_parentlink; + raxNode *newh = raxAddChild(h,s[i],&child,&new_parentlink); + if (newh == NULL) goto oom; + h = newh; + memcpy(parentlink,&h,sizeof(h)); + parentlink = new_parentlink; + i++; + } + h = child; + } + raxNode *newh = raxReallocForData(h,data); + if (newh == NULL) goto oom; + h = newh; + if (!h->iskey) rax->numele++; + raxSetData(h,data); + memcpy(parentlink,&h,sizeof(h)); + return 1; /* Element inserted. */ + +oom: + /* This code path handles out of memory after part of the sub-tree was + * already added. Set the node as a key, and then remove it. */ + h->isnull = 1; + h->iskey = 1; + raxRemove(rax,s,i); + errno = ENOMEM; + return 0; +} + +/* Find a key in the rax, returns raxNotFound special void pointer value + * if the item was not found, otherwise the value associated with the + * item is returned. */ +void *raxFind(rax *rax, unsigned char *s, size_t len) { + raxNode *h; + + debugf("### Lookup: %.*s\n", (int)len, s); + int splitpos = 0; + size_t i = raxLowWalk(rax,s,len,&h,NULL,&splitpos,NULL); + if (i != len || (h->iscompr && splitpos != 0) || !h->iskey) + return raxNotFound; + return raxGetData(h); +} + +/* Return the memory address where the 'parent' node stores the specified + * 'child' pointer, so that the caller can update the pointer with another + * one if needed. The function assumes it will find a match, otherwise the + * operation is an undefined behavior (it will continue scanning the + * memory without any bound checking). */ +raxNode **raxFindParentLink(raxNode *parent, raxNode *child) { + raxNode **cp = raxNodeFirstChildPtr(parent); + raxNode *c; + while(1) { + memcpy(&c,cp,sizeof(c)); + if (c == child) break; + cp++; + } + return cp; +} + +/* Low level child removal from node. The new node pointer (after the child + * removal) is returned. Note that this function does not fix the pointer + * of the parent node in its parent, so this task is up to the caller. + * The function never fails for out of memory. */ +raxNode *raxRemoveChild(raxNode *parent, raxNode *child) { + debugnode("raxRemoveChild before", parent); + /* If parent is a compressed node (having a single child, as for definition + * of the data structure), the removal of the child consists into turning + * it into a normal node without children. */ + if (parent->iscompr) { + void *data = NULL; + if (parent->iskey) data = raxGetData(parent); + parent->isnull = 0; + parent->iscompr = 0; + parent->size = 0; + if (parent->iskey) raxSetData(parent,data); + debugnode("raxRemoveChild after", parent); + return parent; + } + + /* Otherwise we need to scan for the children pointer and memmove() + * accordingly. + * + * 1. To start we seek the first element in both the children + * pointers and edge bytes in the node. */ + raxNode **cp = raxNodeLastChildPtr(parent) - (parent->size-1); + raxNode **c = cp; + unsigned char *e = parent->data; + + /* 2. Search the child pointer to remove inside the array of children + * pointers. */ + while(1) { + raxNode *aux; + memcpy(&aux,c,sizeof(aux)); + if (aux == child) break; + c++; + e++; + } + + /* 3. Remove the edge and the pointer by memmoving the remaining children + * pointer and edge bytes one position before. */ + int taillen = parent->size - (e - parent->data) - 1; + debugf("raxRemoveChild tail len: %d\n", taillen); + memmove(e,e+1,taillen); + + /* Since we have one data byte less, also child pointers start one byte + * before now. */ + memmove(((char*)cp)-1,cp,(parent->size-taillen-1)*sizeof(raxNode**)); + + /* Move the remaining "tail" pointer at the right position as well. */ + memmove(((char*)c)-1,c+1,taillen*sizeof(raxNode**)); + + /* 4. Update size. */ + parent->size--; + + /* realloc the node according to the theoretical memory usage, to free + * data if we are over-allocating right now. */ + raxNode *newnode = rax_realloc(parent,raxNodeCurrentLength(parent)); + debugnode("raxRemoveChild after", newnode); + /* Note: if rax_realloc() fails we just return the old address, which + * is valid. */ + return newnode ? newnode : parent; +} + +/* Remove the specified item. Returns 1 if the item was found and + * deleted, 0 otherwise. */ +int raxRemove(rax *rax, unsigned char *s, size_t len) { + raxNode *h; + raxStack ts; + + debugf("### Delete: %.*s\n", (int)len, s); + raxStackInit(&ts); + int splitpos = 0; + size_t i = raxLowWalk(rax,s,len,&h,NULL,NULL,&ts); + if (i != len || (h->iscompr && splitpos != 0) || !h->iskey) { + raxStackFree(&ts); + return 0; + } + h->iskey = 0; + rax->numele--; + + /* If this node has no children, the deletion needs to reclaim the + * no longer used nodes. This is an iterative process that needs to + * walk the three upward, deleting all the nodes with just one child + * that are not keys, until the head of the rax is reached or the first + * node with more than one child is found. */ + + int trycompress = 0; /* Will be set to 1 if we should try to optimize the + tree resulting from the deletion. */ + + if (h->size == 0) { + debugf("Key deleted in node without children. Cleanup needed.\n"); + raxNode *child = NULL; + while(h != rax->head) { + child = h; + debugf("Freeing child %p [%.*s] key:%d\n", (void*)child, + (int)child->size, (char*)child->data, child->iskey); + rax_free(child); + rax->numnodes--; + h = raxStackPop(&ts); + /* If this node has more then one child, or actually holds + * a key, stop here. */ + if (h->iskey || (!h->iscompr && h->size != 1)) break; + } + if (child) { + debugf("Unlinking child %p from parent %p\n", + (void*)child, (void*)h); + raxNode *new = raxRemoveChild(h,child); + if (new != h) { + raxNode *parent = raxStackPeek(&ts); + raxNode **parentlink; + if (parent == NULL) { + parentlink = &rax->head; + } else { + parentlink = raxFindParentLink(parent,h); + } + memcpy(parentlink,&new,sizeof(new)); + } + + /* If after the removal the node has just a single child + * and is not a key, we need to try to compress it. */ + if (new->size == 1 && new->iskey == 0) { + trycompress = 1; + h = new; + } + } + } else if (h->size == 1) { + /* If the node had just one child, after the removal of the key + * further compression with adjacent nodes is pontentially possible. */ + trycompress = 1; + } + + /* Don't try node compression if our nodes pointers stack is not + * complete because of OOM while executing raxLowWalk() */ + if (trycompress && ts.oom) trycompress = 0; + + /* Recompression: if trycompress is true, 'h' points to a radix tree node + * that changed in a way that could allow to compress nodes in this + * sub-branch. Compressed nodes represent chains of nodes that are not + * keys and have a single child, so there are two deletion events that + * may alter the tree so that further compression is needed: + * + * 1) A node with a single child was a key and now no longer is a key. + * 2) A node with two children now has just one child. + * + * We try to navigate upward till there are other nodes that can be + * compressed, when we reach the upper node which is not a key and has + * a single child, we scan the chain of children to collect the + * compressable part of the tree, and replace the current node with the + * new one, fixing the child pointer to reference the first non + * compressable node. + * + * Example of case "1". A tree stores the keys "FOO" = 1 and + * "FOOBAR" = 2: + * + * + * "FOO" -> "BAR" -> [] (2) + * (1) + * + * After the removal of "FOO" the tree can be compressed as: + * + * "FOOBAR" -> [] (2) + * + * + * Example of case "2". A tree stores the keys "FOOBAR" = 1 and + * "FOOTER" = 2: + * + * |B| -> "AR" -> [] (1) + * "FOO" -> |-| + * |T| -> "ER" -> [] (2) + * + * After the removal of "FOOTER" the resulting tree is: + * + * "FOO" -> |B| -> "AR" -> [] (1) + * + * That can be compressed into: + * + * "FOOBAR" -> [] (1) + */ + if (trycompress) { + debugf("After removing %.*s:\n", (int)len, s); + debugnode("Compression may be needed",h); + debugf("Seek start node\n"); + + /* Try to reach the upper node that is compressible. + * At the end of the loop 'h' will point to the first node we + * can try to compress and 'parent' to its parent. */ + raxNode *parent; + while(1) { + parent = raxStackPop(&ts); + if (!parent || parent->iskey || + (!parent->iscompr && parent->size != 1)) break; + h = parent; + debugnode("Going up to",h); + } + raxNode *start = h; /* Compression starting node. */ + + /* Scan chain of nodes we can compress. */ + size_t comprsize = h->size; + int nodes = 1; + while(h->size != 0) { + raxNode **cp = raxNodeLastChildPtr(h); + memcpy(&h,cp,sizeof(h)); + if (h->iskey || (!h->iscompr && h->size != 1)) break; + nodes++; + comprsize += h->size; + } + if (nodes > 1) { + /* If we can compress, create the new node and populate it. */ + size_t nodesize = + sizeof(raxNode)+comprsize+sizeof(raxNode*); + raxNode *new = rax_malloc(nodesize); + /* An out of memory here just means we cannot optimize this + * node, but the tree is left in a consistent state. */ + if (new == NULL) { + raxStackFree(&ts); + return 1; + } + new->iskey = 0; + new->isnull = 0; + new->iscompr = 1; + new->size = comprsize; + rax->numnodes++; + + /* Scan again, this time to populate the new node content and + * to fix the new node child pointer. At the same time we free + * all the nodes that we'll no longer use. */ + comprsize = 0; + h = start; + while(h->size != 0) { + memcpy(new->data+comprsize,h->data,h->size); + comprsize += h->size; + raxNode **cp = raxNodeLastChildPtr(h); + raxNode *tofree = h; + memcpy(&h,cp,sizeof(h)); + rax_free(tofree); rax->numnodes--; + if (h->iskey || (!h->iscompr && h->size != 1)) break; + } + debugnode("New node",new); + + /* Now 'h' points to the first node that we still need to use, + * so our new node child pointer will point to it. */ + raxNode **cp = raxNodeLastChildPtr(new); + memcpy(cp,&h,sizeof(h)); + + /* Fix parent link. */ + if (parent) { + raxNode **parentlink = raxFindParentLink(parent,start); + memcpy(parentlink,&new,sizeof(new)); + } else { + rax->head = new; + } + + debugf("Compressed %d nodes, %d total bytes\n", + nodes, (int)comprsize); + } + } + raxStackFree(&ts); + return 1; +} + +/* This is the core of raxFree(): performs a depth-first scan of the + * tree and releases all the nodes found. */ +void raxRecursiveFree(rax *rax, raxNode *n) { + int numchildren = n->iscompr ? 1 : n->size; + raxNode **cp = raxNodeLastChildPtr(n); + while(numchildren--) { + raxNode *child; + memcpy(&child,cp,sizeof(child)); + raxRecursiveFree(rax,child); + cp--; + } + debugnode("free depth-first",n); + rax_free(n); + rax->numnodes--; +} + +/* Free a whole radix tree. */ +void raxFree(rax *rax) { + raxRecursiveFree(rax,rax->head); + assert(rax->numnodes == 0); + rax_free(rax); +} + +/* ------------------------------- Iterator --------------------------------- */ + +/* Initialize a Rax iterator. This call should be performed a single time + * to initialize the iterator, and must be followed by a raxSeek() call, + * otherwise the raxPrev()/raxNext() functions will just return EOF. */ +void raxStart(raxIterator *it, rax *rt) { + it->flags = RAX_ITER_EOF; /* No crash if the iterator is not seeked. */ + it->rt = rt; + it->key_len = 0; + it->key = it->key_static_string; + it->key_max = RAX_ITER_STATIC_LEN; + it->data = NULL; + raxStackInit(&it->stack); +} + +/* Append characters at the current key string of the iterator 'it'. This + * is a low level function used to implement the iterator, not callable by + * the user. Returns 0 on out of memory, otherwise 1 is returned. */ +int raxIteratorAddChars(raxIterator *it, unsigned char *s, size_t len) { + if (it->key_max < it->key_len+len) { + unsigned char *old = (it->key == it->key_static_string) ? NULL : + it->key; + size_t new_max = (it->key_len+len)*2; + it->key = rax_realloc(old,new_max); + if (it->key == NULL) { + it->key = (!old) ? it->key_static_string : old; + return 0; + } + if (old == NULL) memcpy(it->key,it->key_static_string,it->key_len); + it->key_max = new_max; + } + /* Use memmove since there could be an overlap between 's' and + * it->key when we use the current key in order to re-seek. */ + memmove(it->key+it->key_len,s,len); + it->key_len += len; + return 1; +} + +/* Remove the specified number of chars from the right of the current + * iterator key. */ +void raxIteratorDelChars(raxIterator *it, size_t count) { + it->key_len -= count; +} + +/* Do an iteration step towards the next element. At the end of the step the + * iterator key will represent the (new) current key. If it is not possible + * to step in the specified direction since there are no longer elements, the + * iterator is flagged with RAX_ITER_EOF. + * + * If 'noup' is true the function starts directly scanning for the next + * lexicographically smaller children, and the current node is already assumed + * to be the parent of the last key node, so the first operation to go back to + * the parent will be skipped. This option is used by raxSeek() when + * implementing seeking a non existing element with the ">" or "<" options: + * the starting node is not a key in that particular case, so we start the scan + * from a node that does not represent the key set. + * + * The function returns 1 on success or 0 on out of memory. */ +int raxIteratorNextStep(raxIterator *it, int noup) { + if (it->flags & RAX_ITER_EOF) { + return 0; + } else if (it->flags & RAX_ITER_JUST_SEEKED) { + it->flags &= ~RAX_ITER_JUST_SEEKED; + return 1; + } + + /* Save key len, stack items and the node where we are currently + * so that on iterator EOF we can restore the current key and state. */ + size_t orig_key_len = it->key_len; + size_t orig_stack_items = it->stack.items; + raxNode *orig_node = it->node; + + /* Clear the EOF flag: it will be set again if the EOF condition + * is still valid. */ + it->flags &= ~RAX_ITER_EOF; + + while(1) { + int children = it->node->iscompr ? 1 : it->node->size; + if (!noup && children) { + debugf("GO DEEPER\n"); + /* Seek the lexicographically smaller key in this subtree, which + * is the first one found always going torwards the first child + * of every successive node. */ + if (!raxStackPush(&it->stack,it->node)) return 0; + raxNode **cp = raxNodeFirstChildPtr(it->node); + if (!raxIteratorAddChars(it,it->node->data, + it->node->iscompr ? it->node->size : 1)) return 0; + memcpy(&it->node,cp,sizeof(it->node)); + /* For "next" step, stop every time we find a key along the + * way, since the key is lexicograhically smaller compared to + * what follows in the sub-children. */ + if (it->node->iskey) { + it->data = raxGetData(it->node); + return 1; + } + } else { + /* If we finished exporing the previous sub-tree, switch to the + * new one: go upper until a node is found where there are + * children representing keys lexicographically greater than the + * current key. */ + while(1) { + /* Already on head? Can't go up, iteration finished. */ + if (!noup && it->node == it->rt->head) { + it->flags |= RAX_ITER_EOF; + it->stack.items = orig_stack_items; + it->key_len = orig_key_len; + it->node = orig_node; + return 1; + } + /* If there are no children at the current node, try parent's + * next child. */ + unsigned char prevchild = it->key[it->key_len-1]; + if (!noup) { + it->node = raxStackPop(&it->stack); + } else { + noup = 0; + } + /* Adjust the current key to represent the node we are + * at. */ + int todel = it->node->iscompr ? it->node->size : 1; + raxIteratorDelChars(it,todel); + + /* Try visitng the next child if there was at least one + * additional child. */ + if (!it->node->iscompr && it->node->size > 1) { + raxNode **cp = raxNodeFirstChildPtr(it->node); + int i = 0; + while (i < it->node->size) { + debugf("SCAN NEXT %c\n", it->node->data[i]); + if (it->node->data[i] > prevchild) break; + i++; + cp++; + } + if (i != it->node->size) { + debugf("SCAN found a new node\n"); + raxIteratorAddChars(it,it->node->data+i,1); + if (!raxStackPush(&it->stack,it->node)) return 0; + memcpy(&it->node,cp,sizeof(it->node)); + if (it->node->iskey) { + it->data = raxGetData(it->node); + return 1; + } + break; + } + } + } + } + } +} + +/* Seek the grestest key in the subtree at the current node. Return 0 on + * out of memory, otherwise 1. This is an helper function for different + * iteration functions below. */ +int raxSeekGreatest(raxIterator *it) { + while(it->node->size) { + if (it->node->iscompr) { + if (!raxIteratorAddChars(it,it->node->data, + it->node->size)) return 0; + } else { + if (!raxIteratorAddChars(it,it->node->data+it->node->size-1,1)) + return 0; + } + raxNode **cp = raxNodeLastChildPtr(it->node); + if (!raxStackPush(&it->stack,it->node)) return 0; + memcpy(&it->node,cp,sizeof(it->node)); + } + return 1; +} + +/* Like raxIteratorNextStep() but implements an iteration step moving + * to the lexicographically previous element. The 'noup' option has a similar + * effect to the one of raxIteratorPrevSte(). */ +int raxIteratorPrevStep(raxIterator *it, int noup) { + if (it->flags & RAX_ITER_EOF) { + return 0; + } else if (it->flags & RAX_ITER_JUST_SEEKED) { + it->flags &= ~RAX_ITER_JUST_SEEKED; + return 1; + } + + /* Save key len, stack items and the node where we are currently + * so that on iterator EOF we can restore the current key and state. */ + size_t orig_key_len = it->key_len; + size_t orig_stack_items = it->stack.items; + raxNode *orig_node = it->node; + + while(1) { + /* Already on head? Can't go up, iteration finished. */ + if (!noup && it->node == it->rt->head) { + it->flags |= RAX_ITER_EOF; + it->stack.items = orig_stack_items; + it->key_len = orig_key_len; + it->node = orig_node; + return 1; + } + + unsigned char prevchild = it->key[it->key_len-1]; + if (!noup) { + it->node = raxStackPop(&it->stack); + } else { + noup = 0; + } + + /* Adjust the current key to represent the node we are + * at. */ + int todel = it->node->iscompr ? it->node->size : 1; + raxIteratorDelChars(it,todel); + + /* Try visiting the prev child if there was at least one + * additional child. */ + if (!it->node->iscompr && it->node->size > 1) { + raxNode **cp = raxNodeLastChildPtr(it->node); + int i = it->node->size-1; + while (i >= 0) { + debugf("SCAN PREV %c\n", it->node->data[i]); + if (it->node->data[i] < prevchild) break; + i--; + cp--; + } + /* If we found a new subtree to explore in this node, + * go deeper following all the last children in order to + * find the key lexicographically greater. */ + if (i != -1) { + debugf("SCAN found a new node\n"); + /* Enter the node we just found. */ + if (!raxIteratorAddChars(it,it->node->data+i,1)) return 0; + if (!raxStackPush(&it->stack,it->node)) return 0; + memcpy(&it->node,cp,sizeof(it->node)); + /* Seek sub-tree max. */ + if (!raxSeekGreatest(it)) return 0; + } + } + + /* Return the key: this could be the key we found scanning a new + * subtree, or if we did not find a new subtree to explore here, + * before giving up with this node, check if it's a key itself. */ + if (it->node->iskey) { + it->data = raxGetData(it->node); + return 1; + } + } +} + +/* Seek an iterator at the specified element. + * Return 0 if the seek failed for syntax error or out of memory. Otherwise + * 1 is returned. */ +int raxSeek(raxIterator *it, unsigned char *ele, size_t len, const char *op) { + int eq = 0, lt = 0, gt = 0, first = 0, last = 0; + + it->stack.items = 0; /* Just resetting. Intialized by raxStart(). */ + it->flags |= RAX_ITER_JUST_SEEKED; + it->flags &= ~RAX_ITER_EOF; + it->key_len = 0; + it->node = NULL; + + /* Set flags according to the operator used to perform the seek. */ + if (op[0] == '>') { + gt = 1; + if (op[1] == '=') eq = 1; + } else if (op[0] == '<') { + lt = 1; + if (op[1] == '=') eq = 1; + } else if (op[0] == '=') { + eq = 1; + } else if (op[0] == '^') { + first = 1; + } else if (op[0] == '$') { + last = 1; + } else { + return 0; /* Error. */ + } + + /* If there are no elements, set the EOF condition immediately and + * return. */ + if (it->rt->numele == 0) { + it->flags |= RAX_ITER_EOF; + return 1; + } + + if (first) { + /* Seeking the first key greater or equal to the empty string + * is equivalent to seeking the smaller key available. */ + return raxSeek(it,NULL,0,">="); + } + + if (last) { + /* Find the greatest key taking always the last child till a + * final node is found. */ + it->node = it->rt->head; + if (!raxSeekGreatest(it)) return 0; + assert(it->node->iskey); + return 1; + } + + /* We need to seek the specified key. What we do here is to actually + * perform a lookup, and later invoke the prev/next key code that + * we already use for iteration. */ + int splitpos = 0; + size_t i = raxLowWalk(it->rt,ele,len,&it->node,NULL,&splitpos,&it->stack); + + /* Return OOM on incomplete stack info. */ + if (it->stack.oom) return 0; + + if (eq && i == len && (!it->node->iscompr || splitpos == 0) && + it->node->iskey) + { + /* We found our node, since the key matches and we have an + * "equal" condition. */ + if (!raxIteratorAddChars(it,ele,len)) return 0; /* OOM. */ + } else { + /* Exact key not found or eq flag not set. We have to set as current + * key the one represented by the node we stopped at, and perform + * a next/prev operation to seek. To reconstruct the key at this node + * we start from the parent and go to the current node, accumulating + * the characters found along the way. */ + if (!raxStackPush(&it->stack,it->node)) return 0; + for (size_t j = 1; j < it->stack.items; j++) { + raxNode *parent = it->stack.stack[j-1]; + raxNode *child = it->stack.stack[j]; + if (parent->iscompr) { + if (!raxIteratorAddChars(it,parent->data,parent->size)) + return 0; + } else { + raxNode **cp = raxNodeFirstChildPtr(parent); + unsigned char *p = parent->data; + while(1) { + raxNode *aux; + memcpy(&aux,cp,sizeof(aux)); + if (aux == child) break; + cp++; + p++; + } + if (!raxIteratorAddChars(it,p,1)) return 0; + } + } + raxStackPop(&it->stack); + + /* We need to set the iterator in the correct state to call next/prev + * step in order to seek the desired element. */ + debugf("After initial seek: i=%d len=%d key=%.*s\n", + (int)i, (int)len, (int)it->key_len, it->key); + if (i != len && !it->node->iscompr) { + /* If we stopped in the middle of a normal node because of a + * mismatch, add the mismatching character to the current key + * and call the iterator with the 'noup' flag so that it will try + * to seek the next/prev child in the current node directly based + * on the mismatching character. */ + if (!raxIteratorAddChars(it,ele+i,1)) return 0; + debugf("Seek normal node on mismatch: %.*s\n", + (int)it->key_len, (char*)it->key); + + it->flags &= ~RAX_ITER_JUST_SEEKED; + if (lt && !raxIteratorPrevStep(it,1)) return 0; + if (gt && !raxIteratorNextStep(it,1)) return 0; + it->flags |= RAX_ITER_JUST_SEEKED; /* Ignore next call. */ + } else if (i != len && it->node->iscompr) { + debugf("Compressed mismatch: %.*s\n", + (int)it->key_len, (char*)it->key); + /* In case of a mismatch within a compressed node. */ + int nodechar = it->node->data[splitpos]; + int keychar = ele[i]; + it->flags &= ~RAX_ITER_JUST_SEEKED; + if (gt) { + /* If the key the compressed node represents is greater + * than our seek element, continue forward, otherwise set the + * state in order to go back to the next sub-tree. */ + if (nodechar > keychar) { + if (!raxIteratorNextStep(it,0)) return 0; + } else { + if (!raxIteratorAddChars(it,it->node->data,it->node->size)) + return 0; + if (!raxIteratorNextStep(it,1)) return 0; + } + } + if (lt) { + /* If the key the compressed node represents is smaller + * than our seek element, seek the greater key in this + * subtree, otherwise set the state in order to go back to + * the previous sub-tree. */ + if (nodechar < keychar) { + if (!raxSeekGreatest(it)) return 0; + } else { + if (!raxIteratorAddChars(it,it->node->data,it->node->size)) + return 0; + if (!raxIteratorPrevStep(it,1)) return 0; + } + } + it->flags |= RAX_ITER_JUST_SEEKED; /* Ignore next call. */ + } else { + debugf("No mismatch: %.*s\n", + (int)it->key_len, (char*)it->key); + /* If there was no mismatch we are into a node representing the + * key, (but which is not a key or the seek operator does not + * include 'eq'), or we stopped in the middle of a compressed node + * after processing all the key. Cotinue iterating as this was + * a legitimate key we stopped at. */ + it->flags &= ~RAX_ITER_JUST_SEEKED; + if (gt && !raxIteratorNextStep(it,0)) return 0; + if (lt && !raxIteratorPrevStep(it,0)) return 0; + it->flags |= RAX_ITER_JUST_SEEKED; /* Ignore next call. */ + } + } + return 1; +} + +/* Go to the next element in the scope of the iterator 'it'. + * If EOF (or out of memory) is reached, 0 is returned, otherwise 1 is + * returned. In case 0 is returned because of OOM, errno is set to ENOMEM. */ +int raxNext(raxIterator *it, unsigned char *stop, size_t stoplen, char *op) { + if (!raxIteratorNextStep(it,0)) { + errno = ENOMEM; + return 0; + } + if (it->flags & RAX_ITER_EOF) { + errno = 0; + return 0; + } + return 1; +} + +/* Go to the previous element in the scope of the iterator 'it'. + * If EOF (or out of memory) is reached, 0 is returned, otherwise 1 is + * returned. In case 0 is returned because of OOM, errno is set to ENOMEM. */ +int raxPrev(raxIterator *it, unsigned char *stop, size_t stoplen, char *op) { + if (!raxIteratorPrevStep(it,0)) { + errno = ENOMEM; + return 0; + } + if (it->flags & RAX_ITER_EOF) { + errno = 0; + return 0; + } + return 1; +} + +/* Free the iterator. */ +void raxStop(raxIterator *it) { + if (it->key != it->key_static_string) rax_free(it->key); + raxStackFree(&it->stack); +} + +/* ----------------------------- Introspection ------------------------------ */ + +/* This function is mostly used for debugging and learning purposes. + * It shows an ASCII representation of a tree on standard output, outling + * all the nodes and the contained keys. + * + * The representation is as follow: + * + * "foobar" (compressed node) + * [abc] (normal node with three children) + * [abc]=0x12345678 (node is a key, pointing to value 0x12345678) + * [] (a normal empty node) + * + * Children are represented in new idented lines, each children prefixed by + * the "`-(x)" string, where "x" is the edge byte. + * + * [abc] + * `-(a) "ladin" + * `-(b) [kj] + * `-(c) [] + * + * However when a node has a single child the following representation + * is used instead: + * + * [abc] -> "ladin" -> [] + */ + +/* The actual implementation of raxShow(). */ +void raxRecursiveShow(int level, int lpad, raxNode *n) { + char s = n->iscompr ? '"' : '['; + char e = n->iscompr ? '"' : ']'; + + int numchars = printf("%c%.*s%c", s, n->size, n->data, e); + if (n->iskey) { + numchars += printf("=%p",raxGetData(n)); + } + + int numchildren = n->iscompr ? 1 : n->size; + /* Note that 7 and 4 magic constants are the string length + * of " `-(x) " and " -> " respectively. */ + if (level) { + lpad += (numchildren > 1) ? 7 : 4; + if (numchildren == 1) lpad += numchars; + } + raxNode **cp = raxNodeFirstChildPtr(n); + for (int i = 0; i < numchildren; i++) { + char *branch = " `-(%c) "; + if (numchildren > 1) { + printf("\n"); + for (int j = 0; j < lpad; j++) putchar(' '); + printf(branch,n->data[i]); + } else { + printf(" -> "); + } + raxNode *child; + memcpy(&child,cp,sizeof(child)); + raxRecursiveShow(level+1,lpad,child); + cp++; + } +} + +/* Show a tree, as outlined in the comment above. */ +void raxShow(rax *rax) { + raxRecursiveShow(0,0,rax->head); + putchar('\n'); +} + +/* Used by debugnode() macro to show info about a given node. */ +void raxDebugShowNode(const char *msg, raxNode *n) { + printf("%s: %p [%.*s] key:%d size:%d children:", + msg, (void*)n, (int)n->size, (char*)n->data, n->iskey, n->size); + int numcld = n->iscompr ? 1 : n->size; + raxNode **cldptr = raxNodeLastChildPtr(n) - (numcld-1); + while(numcld--) { + raxNode *child; + memcpy(&child,cldptr,sizeof(child)); + cldptr++; + printf("%p ", (void*)child); + } + printf("\n"); + fflush(stdout); +} + +#ifdef BENCHMARK_MAIN +#include +#include + +/* This is a simple Feistel network in order to turn every possible + * uint32_t input into another "randomly" looking uint32_t. It is a + * one to one map so there are no repetitions. */ +static uint32_t int2int(uint32_t input) { + uint16_t l = input & 0xffff; + uint16_t r = input >> 16; + for (int i = 0; i < 8; i++) { + uint16_t nl = r; + uint16_t F = (((r * 31) + (r >> 5) + 7 * 371) ^ r) & 0xffff; + r = l ^ F; + l = nl; + } + return (r<<16)|l; +} + +/* Turn an uint32_t integer into an alphanumerical key and return its + * length. This function is used in order to generate keys that have + * a large charset, so that the radix tree can be testsed with many + * children per node. */ +static size_t int2alphakey(char *s, size_t maxlen, uint32_t i) { + const char *set = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789"; + const size_t setlen = 62; + + if (maxlen == 0) return 0; + maxlen--; /* Space for null term char. */ + size_t len = 0; + while(len < maxlen) { + s[len++] = set[i%setlen]; + i /= setlen; + if (i == 0) break; + } + s[len] = '\0'; + return len; +} + +/* Return the UNIX time in microseconds */ +static long long ustime(void) { + struct timeval tv; + long long ust; + + gettimeofday(&tv, NULL); + ust = ((long long)tv.tv_sec)*1000000; + ust += tv.tv_usec; + return ust; +} + +/* Turn the integer 'i' into a key according to 'mode'. + * mode = 0, just represents the integer as a string. + * mode = 1, turn it into a random-looking alphanumerical string + * according to the int2alphakey() function. */ +static size_t int2key(char *s, size_t maxlen, uint32_t i, int mode) { + if (mode == 0) { + return snprintf(s,maxlen,"%lu",(unsigned long)i); + } else { + i = int2int(i); + return int2alphakey(s,maxlen,i); + } +} + +int main(void) { + for (int mode = 0; mode < 2; mode++) { + printf("Benchmark with %s keys:\n", + (mode == 0) ? "integer" : "alphanumerical"); + rax *t = raxNew(); + long long start = ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf,sizeof(buf),i,mode); + raxInsert(t,(unsigned char*)buf,len,(void*)(long)i); + } + printf("Insert: %f\n", (double)(ustime()-start)/1000000); + printf("%llu total nodes\n", (unsigned long long)t->numnodes); + printf("%llu total elements\n", (unsigned long long)t->numele); + + start = ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf,sizeof(buf),i,mode); + void *data = raxFind(t,(unsigned char*)buf,len); + if (data != (void*)(long)i) { + printf("Issue with %s: %p instead of %p\n", buf, + data, (void*)(long)i); + } + } + printf("Lookup: %f\n", (double)(ustime()-start)/1000000); + + start = ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int r = rand() % 5000000; + int len = int2key(buf,sizeof(buf),r,mode); + void *data = raxFind(t,(unsigned char*)buf,len); + if (data != (void*)(long)r) { + printf("Issue with %s: %p instead of %p\n", buf, + data, (void*)(long)r); + } + } + printf("Random lookup: %f\n", (double)(ustime()-start)/1000000); + + start = ustime(); + int count = 0; + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf,sizeof(buf),i,mode); + buf[i%len] = '!'; /* "!" is never set into keys. */ + void *data = raxFind(t,(unsigned char*) buf,len); + if (data != (void*)(long)i) count++; + } + printf("Failed lookup: %f\n", (double)(ustime()-start)/1000000); + + start = ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf,sizeof(buf),i,mode); + int retval = raxRemove(t,(unsigned char*)buf,len); + assert(retval == 1); + } + printf("Deletion: %f\n", (double)(ustime()-start)/1000000); + + printf("%llu total nodes\n", (unsigned long long)t->numnodes); + printf("%llu total elements\n", (unsigned long long)t->numele); + raxFree(t); + } + return 0; +} +#endif + +#ifdef TEST_MAIN +#include +#include + +int main(void) { + printf("notfound = %p\n", raxNotFound); + rax *t = raxNew(); + char *toadd[] = {"alligator","alien","baloon","chromodynamic","romane","romanus","romulus","rubens","ruber","rubicon","rubicundus","all","rub","ba",NULL}; + + srand(time(NULL)); + for (int x = 0; x < 10000; x++) rand(); + + long items = 0; + while(toadd[items] != NULL) items++; + + for (long i = 0; i < items; i++) { + raxInsert(t,(unsigned char*)toadd[i],strlen(toadd[i]),(void*)i); + printf("Added %s\n", toadd[i]); + } + raxShow(t); + + raxIterator iter; + raxStart(&iter,t); + + // OK: all this tests will need to go in the Rax unit test. + // raxSeek(&iter,(unsigned char*)"rpxxx",5,"<="); + // raxSeek(&iter,(unsigned char*)"rom",3,">="); + // raxSeek(&iter,(unsigned char*)"rub",3,">="); + // raxSeek(&iter,(unsigned char*)"rub",3,">"); + // raxSeek(&iter,(unsigned char*)"rub",3,"<"); + // raxSeek(&iter,(unsigned char*)"rom",3,">"); + // raxSeek(&iter,(unsigned char*)"chro",4,">"); + // raxSeek(&iter,(unsigned char*)"chro",4,"<"); + // raxSeek(&iter,(unsigned char*)"chromz",6,"<"); + // raxSeek(&iter,NULL,0,"^"); + // raxSeek(&iter,"zorro",5,"<="); + // raxSeek(&iter,"zorro",5,"<"); + // raxSeek(&iter,NULL,0,"$"); + + // STILL TO TEST + raxSeek(&iter,(unsigned char*)"ro",2,">="); + printf("EOF: %d\n", (iter.flags & RAX_ITER_EOF) != 0); + + printf("SEEKED: %.*s, val %p\n", (int)iter.key_len, + (char*)iter.key, + iter.data); + + printf("NEXT\n"); + while(raxNext(&iter,NULL,0,NULL)) { + printf("--- key: %.*s, val %p\n", (int)iter.key_len, + (char*)iter.key, + iter.data); + } + + printf("After EOF element is: %.*s\n", (int)iter.key_len, + (char*)iter.key); + printf("~~~~~~~~~~~~~~\n"); + + printf("PREV\n"); + raxSeek(&iter,iter.key,iter.key_len,"=="); + while(raxPrev(&iter,NULL,0,NULL)) { + printf("--- key: %.*s, val %p\n", (int)iter.key_len, + (char*)iter.key, + iter.data); + } + + printf("After EOF element is: %.*s\n", (int)iter.key_len, + (char*)iter.key); + raxStop(&iter); + +#if 0 + raxStop(&iter); +#endif + exit(0); + + int rnum = rand(); + int survivor = rnum % items; + +#if 1 + printf("Removing everything but %s in random order\n", toadd[survivor]); + for (long i = 0; i < 1000; i++) { + int r = rand() % items; + if (r == survivor) continue; + raxRemove(t,(unsigned char*)toadd[r],strlen(toadd[r])); + } +#else + printf("Removing rubicon\n"); + raxRemove(t,(unsigned char*)"rubicon",7); +#endif + + printf("%llu total nodes\n", (unsigned long long)t->numnodes); + printf("%llu total elements\n", (unsigned long long)t->numele); + + raxShow(t); + raxFree(t); +} +#endif diff --git a/src/rax.h b/src/rax.h new file mode 100644 index 000000000..76330e0de --- /dev/null +++ b/src/rax.h @@ -0,0 +1,158 @@ +#ifndef RAX_H +#define RAX_H + +#include + +/* Representation of a radix tree as implemented in this file, that contains + * the strings "foo", "foobar" and "footer" after the insertion of each + * word. When the node represents a key inside the radix tree, we write it + * between [], otherwise it is written between (). + * + * This is the vanilla representation: + * + * (f) "" + * \ + * (o) "f" + * \ + * (o) "fo" + * \ + * [t b] "foo" + * / \ + * "foot" (e) (a) "foob" + * / \ + * "foote" (r) (r) "fooba" + * / \ + * "footer" [] [] "foobar" + * + * However, this implementation implements a very common optimization where + * successive nodes having a single child are "compressed" into the node + * itself as a string of characters, each representing a next-level child, + * and only the link to the node representing the last character node is + * provided inside the representation. So the above representation is turend + * into: + * + * ["foo"] "" + * | + * [t b] "foo" + * / \ + * "foot" ("er") ("ar") "foob" + * / \ + * "footer" [] [] "foobar" + * + * However this optimization makes the implementation a bit more complex. + * For instance if a key "first" is added in the above radix tree, a + * "node splitting" operation is needed, since the "foo" prefix is no longer + * composed of nodes having a single child one after the other. This is the + * above tree and the resulting node splitting after this event happens: + * + * + * (f) "" + * / + * (i o) "f" + * / \ + * "firs" ("rst") (o) "fo" + * / \ + * "first" [] [t b] "foo" + * / \ + * "foot" ("er") ("ar") "foob" + * / \ + * "footer" [] [] "foobar" + * + * Similarly after deletion, if a new chain of nodes having a single child + * is created (the chain must also not include nodes that represent keys), + * it must be compressed back into a single node. + * + */ + +#define RAX_NODE_MAX_SIZE ((1<<29)-1) +typedef struct raxNode { + uint32_t iskey:1; /* Does this node contain a key? */ + uint32_t isnull:1; /* Associated value is NULL (don't store it). */ + uint32_t iscompr:1; /* Node is compressed. */ + uint32_t size:29; /* Number of children, or compressed string len. */ + /* Data layout is as follows: + * + * If node is not compressed we have 'size' bytes, one for each children + * character, and 'size' raxNode pointers, point to each child node. + * Note how the character is not stored in the children but in the + * edge of the parents: + * + * [header strlen=0][abc][a-ptr][b-ptr][c-ptr](value-ptr?) + * + * if node is compressed (strlen != 0) the node has 1 children. + * In that case the 'size' bytes of the string stored immediately at + * the start of the data section, represent a sequence of successive + * nodes linked one after the other, for which only the last one in + * the sequence is actually represented as a node, and pointed to by + * the current compressed node. + * + * [header strlen=3][xyz][z-ptr](value-ptr?) + * + * Both compressed and not compressed nodes can represent a key + * with associated data in the radix tree at any level (not just terminal + * nodes). + * + * If the node has an associated key (iskey=1) and is not NULL + * (isnull=0), then after the raxNode pointers poiting to the + * childen, an additional value pointer is present (as you can see + * in the representation above as "value-ptr" field). + */ + unsigned char data[]; +} raxNode; + +typedef struct rax { + raxNode *head; + uint64_t numele; + uint64_t numnodes; +} rax; + +/* Stack data structure used by raxLowWalk() in order to, optionally, return + * a list of parent nodes to the caller. The nodes do not have a "parent" + * field for space concerns, so we use the auxiliary stack when needed. */ +#define RAX_STACK_STATIC_ITEMS 32 +typedef struct raxStack { + void **stack; /* Points to static_items or an heap allocated array. */ + size_t items, maxitems; /* Number of items contained and total space. */ + /* Up to RAXSTACK_STACK_ITEMS items we avoid to allocate on the heap + * and use this static array of pointers instead. */ + void *static_items[RAX_STACK_STATIC_ITEMS]; + int oom; /* True if pushing into this stack failed for OOM at some point. */ +} raxStack; + +/* Radix tree iterator state is encapsulated into this data structure. */ +#define RAX_ITER_STATIC_LEN 128 +#define RAX_ITER_JUST_SEEKED (1<<0) /* Iterator was just seeked. Return current + element for the first iteration and + clear the flag. */ +#define RAX_ITER_EOF (1<<1) /* End of iteration reached. */ +#define RAX_ITER_SAFE (1<<2) /* Safe iterator, allows operations while + iterating. But it is slower. */ +typedef struct raxIterator { + int flags; + rax *rt; /* Radix tree we are iterating. */ + unsigned char *key; /* The current string. */ + void *data; /* Data associated to this key. */ + size_t key_len; /* Current key length. */ + size_t key_max; /* Max key len the current key buffer can hold. */ + unsigned char key_static_string[RAX_ITER_STATIC_LEN]; + raxNode *node; /* Current node. Only for unsafe iteration. */ + raxStack stack; /* Stack used for unsafe iteration. */ +} raxIterator; + +/* A special pointer returned for not found items. */ +extern void *raxNotFound; + +/* Exported API. */ +rax *raxNew(void); +int raxInsert(rax *rax, unsigned char *s, size_t len, void *data); +int raxRemove(rax *rax, unsigned char *s, size_t len); +void *raxFind(rax *rax, unsigned char *s, size_t len); +void raxFree(rax *rax); +void raxStart(raxIterator *it, rax *rt); +int raxSeek(raxIterator *it, unsigned char *ele, size_t len, const char *op); +int raxNext(raxIterator *it, unsigned char *stop, size_t stoplen, char *op); +int raxPrev(raxIterator *it, unsigned char *stop, size_t stoplen, char *op); +void raxStop(raxIterator *it); +void raxShow(rax *rax); + +#endif diff --git a/src/rax_malloc.h b/src/rax_malloc.h new file mode 100644 index 000000000..9295985c6 --- /dev/null +++ b/src/rax_malloc.h @@ -0,0 +1,44 @@ +/* Rax -- A radix tree implementation. + * + * Copyright (c) 2017, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Allocator selection. + * + * This file is used in order to change the Rax allocator at compile time. + * Just define the following defines to what you want to use. Also add + * the include of your alternate allocator if needed (not needed in order + * to use the default libc allocator). */ + +#ifndef RAX_ALLOC_H +#define RAX_ALLOC_H +#include "zmalloc.h" +#define rax_malloc zmalloc +#define rax_realloc zrealloc +#define rax_free zfree +#endif diff --git a/src/server.h b/src/server.h index 75ff384cd..3b28fa44d 100644 --- a/src/server.h +++ b/src/server.h @@ -63,7 +63,9 @@ typedef long long mstime_t; /* millisecond time type. */ #include "util.h" /* Misc functions useful in many places */ #include "latency.h" /* Latency monitor API */ #include "sparkline.h" /* ASCII graphs API */ -#include "quicklist.h" +#include "quicklist.h" /* Lists are encoded as linked lists of + N-elements flat arrays */ +#include "rax.h" /* Radix tree */ /* Following includes allow test functions to be called from Redis main() */ #include "zipmap.h"