Make db->avg_ttl more precise (#12949)

Currently, we compute `db->avg_ttl` after each short `dbScan` sweep (a
few buckets, without checking the time limit). Each sweep yields only a
few samples, which makes `db->avg_ttl` imprecise: even if we scan the
whole db, we can't get the exact avg_ttl, because the samples are split
across sweeps. That is, because of the running average, if we issue 16
calls to scan, the first one gets a lower weight and the last one a
higher weight (see the sketch below).
We should instead postpone the `db->avg_ttl` calculation until more of
the db iteration is complete (at the time-limit check, or when we begin
iterating the next db), because by then we have more sample data for
this db and can get a more accurate result. In the best case, if we
scan the whole db, we can get the exact avg_ttl.
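
To make the bias concrete, here is a minimal standalone sketch
(illustration only, not part of this patch) assuming the floating-point
form of the 2%/98% running average: after 16 updates, the sample from
sweep i (1-based) carries a weight of 0.02 * pow(0.98, 16 - i), so the
last sweep counts roughly 1.35x as much as the first:

    #include <math.h>
    #include <stdio.h>

    /* Print the effective weight each of 16 per-sweep samples has in
     * the final estimate, for the update avg = 0.98*avg + 0.02*sample. */
    int main(void) {
        for (int i = 1; i <= 16; i++)
            printf("sweep %2d weight: %.4f\n", i, 0.02 * pow(0.98, 16 - i));
        return 0;
    }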

In this PR, we postpone the avg_ttl calculation until the time-limit
check or the iteration of the next db, so we can accumulate more data
and compute a more precise avg_ttl.
Note that we still need to decay the old TTLs at the same speed as
before, which is why we run the decay mechanism several times, or
equivalently use the pow() formula; see the comment in the code and the
sketch below.
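
As a sanity check of the decay (a standalone sketch, again assuming the
floating-point form of the update and ignoring the integer truncation of
the real (avg/50)*49 + sample/50 code), applying the 2%/98% update N
times with the same sample is equivalent to the single pow(0.98, N) step
that the avg_ttl_factor table in the diff caches for N = 1..16:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        double old_avg = 10000.0, sample = 4000.0;
        int n = 16;

        /* Iterated form: run the 2%/98% running average n times. */
        double iterated = old_avg;
        for (int i = 0; i < n; i++)
            iterated = iterated * 0.98 + sample * 0.02;

        /* Closed form: the geometric series collapsed into one pow(). */
        double closed = sample + (old_avg - sample) * pow(0.98, n);

        printf("iterated = %f, closed = %f\n", iterated, closed);
        return 0;
    }

Both print ~8342.79, confirming the two forms agree up to rounding.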

In my experiments, this PR improves the accuracy of avg_ttl by 89% or
52%, depending on the workload.

Co-authored-by: Oran Agra <oran@redislabs.com>
Yanqi Lv 2024-02-04 14:34:26 +08:00 committed by GitHub
parent 62153b3b2f
commit c1041c2c0d
1 changed file with 52 additions and 21 deletions


@@ -40,6 +40,10 @@
  * if no access is performed on them.
  *----------------------------------------------------------------------------*/
 
+/* Constants table from pow(0.98, 1) to pow(0.98, 16).
+ * Helps calculate the db->avg_ttl. */
+static double avg_ttl_factor[16] = {0.98, 0.9604, 0.941192, 0.922368, 0.903921, 0.885842, 0.868126, 0.850763, 0.833748, 0.817073, 0.800731, 0.784717, 0.769022, 0.753642, 0.738569, 0.723798};
+
 /* Helper function for the activeExpireCycle() function.
  * This function will try to expire the key that is stored in the hash table
  * entry 'de' of the 'expires' hash table of a Redis database.
@@ -235,10 +239,14 @@ void activeExpireCycle(int type) {
     for (j = 0; dbs_performed < dbs_per_call && timelimit_exit == 0 && j < server.dbnum; j++) {
         /* Scan callback data including expired and checked count per iteration. */
         expireScanData data;
+        data.ttl_sum = 0;
+        data.ttl_samples = 0;
         redisDb *db = server.db+(current_db % server.dbnum);
         data.db = db;
+        int update_avg_ttl_times = 0, repeat = 0;
+
         /* Increment the DB now so we are sure if we run out of time
          * in the current DB we'll restart from the next. This allows to
          * distribute the time evenly across DBs. */
@@ -265,8 +273,6 @@
              * with an expire set, checking for expired ones. */
             data.sampled = 0;
             data.expired = 0;
-            data.ttl_sum = 0;
-            data.ttl_samples = 0;
 
             if (num > config_keys_per_loop)
                 num = config_keys_per_loop;
@@ -284,6 +290,8 @@
             long max_buckets = num*20;
             long checked_buckets = 0;
 
+            int origin_ttl_samples = data.ttl_samples;
+
             while (data.sampled < num && checked_buckets < max_buckets) {
                 db->expires_cursor = dbScan(db, DB_EXPIRES, db->expires_cursor, -1, expireScanCallback, isExpiryDictValidForSamplingCb, &data);
                 if (db->expires_cursor == 0) {
@@ -294,33 +302,56 @@
             total_expired += data.expired;
             total_sampled += data.sampled;
 
-            /* Update the average TTL stats for this database. */
-            if (data.ttl_samples) {
-                long long avg_ttl = data.ttl_sum / data.ttl_samples;
-
-                /* Do a simple running average with a few samples.
-                 * We just use the current estimate with a weight of 2%
-                 * and the previous estimate with a weight of 98%. */
-                if (db->avg_ttl == 0) db->avg_ttl = avg_ttl;
-                db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50);
-            }
+            /* If we found keys with a TTL that has not yet expired, we need
+             * to update the average TTL stats once more. */
+            if (data.ttl_samples - origin_ttl_samples > 0) update_avg_ttl_times++;
+
+            repeat = data.sampled == 0 || (data.expired * 100 / data.sampled) > config_cycle_acceptable_stale;
 
             /* We can't block forever here even if there are many keys to
-             * expire. So after a given amount of milliseconds return to the
+             * expire. So after a given amount of microseconds return to the
              * caller waiting for the other active expire cycle. */
-            if ((iteration & 0xf) == 0) { /* check once every 16 iterations. */
-                elapsed = ustime()-start;
-                if (elapsed > timelimit) {
-                    timelimit_exit = 1;
-                    server.stat_expired_time_cap_reached_count++;
-                    break;
+            if ((iteration & 0xf) == 0 || !repeat) { /* Update the average TTL stats every 16 iterations, or when about to exit. */
+                /* Update the average TTL stats for this database here,
+                 * because we may reach the time limit below. */
+                if (data.ttl_samples) {
+                    long long avg_ttl = data.ttl_sum / data.ttl_samples;
+
+                    /* Do a simple running average with a few samples.
+                     * We just use the current estimate with a weight of 2%
+                     * and the previous estimate with a weight of 98%. */
+                    if (db->avg_ttl == 0) {
+                        db->avg_ttl = avg_ttl;
+                    } else {
+                        /* The original code was as follows:
+                         *   for (int i = 0; i < update_avg_ttl_times; i++) {
+                         *       db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50);
+                         *   }
+                         * We can convert the loop into the sum of a geometric progression:
+                         *   db->avg_ttl = db->avg_ttl * pow(0.98, update_avg_ttl_times) +
+                         *                 avg_ttl / 50 * (pow(0.98, update_avg_ttl_times - 1) + ... + 1)
+                         *               = db->avg_ttl * pow(0.98, update_avg_ttl_times) +
+                         *                 avg_ttl * (1 - pow(0.98, update_avg_ttl_times))
+                         *               = avg_ttl + (db->avg_ttl - avg_ttl) * pow(0.98, update_avg_ttl_times)
+                         * Since update_avg_ttl_times is between 1 and 16, we use a constant
+                         * table to accelerate the calculation of
+                         * pow(0.98, update_avg_ttl_times). */
+                        db->avg_ttl = avg_ttl + (db->avg_ttl - avg_ttl) * avg_ttl_factor[update_avg_ttl_times - 1];
+                    }
+                    update_avg_ttl_times = 0;
+                    data.ttl_sum = 0;
+                    data.ttl_samples = 0;
+                }
+
+                if ((iteration & 0xf) == 0) { /* check the time limit every 16 iterations. */
+                    elapsed = ustime()-start;
+                    if (elapsed > timelimit) {
+                        timelimit_exit = 1;
+                        server.stat_expired_time_cap_reached_count++;
+                        break;
+                    }
                 }
             }
 
             /* We don't repeat the cycle for the current database if there are
              * an acceptable amount of stale keys (logically expired but yet
              * not reclaimed). */
-        } while (data.sampled == 0 ||
-                 (data.expired * 100 / data.sampled) > config_cycle_acceptable_stale);
+        } while (repeat);
     }
 
     elapsed = ustime()-start;