diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 0ef7aae84a..b7ffda9a8c 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4346,10 +4346,16 @@ The following video options are currently all specific to ``--vo=gpu`` and
     Used in ``--dither=fruit`` mode only.
 
-``--dither=<fruit|ordered|no>``
+``--dither=<fruit|ordered|error-diffusion|no>``
     Select dithering algorithm (default: fruit). (Normally, the
     ``--dither-depth`` option controls whether dithering is enabled.)
 
+    The ``error-diffusion`` option requires compute shader support. It also
+    requires a large amount of shared memory to run, the size of which depends
+    on both the kernel (see ``--error-diffusion`` option below) and the height
+    of the video window. It will fall back to ``fruit`` dithering if there is
+    not enough shared memory to run the shader.
+
 ``--temporal-dither``
     Enable temporal dithering. (Only active if dithering is enabled in
     general.) This changes between 8 different dithering patterns on each frame
@@ -4362,6 +4368,29 @@ The following video options are currently all specific to ``--vo=gpu`` and
     ``--temporal-dither`` is in use. 1 (the default) will update on every video
     frame, 2 on every other frame, etc.
 
+``--error-diffusion=<kernel>``
+    The error diffusion kernel to use when ``--dither=error-diffusion`` is set.
+
+    ``simple``
+        Propagate error to only two adjacent pixels. Fastest but low quality.
+
+    ``sierra-lite``
+        Fast with reasonable quality. This is the default.
+
+    ``floyd-steinberg``
+        The most notable error diffusion kernel.
+
+    ``atkinson``
+        Looks different from other kernels because only a fraction of errors
+        will be propagated during dithering. A typical use case of this kernel
+        is saving a dithered screenshot (in window mode). This kernel produces
+        a slightly smaller file, with still reasonable dithering quality.
+
+    There are other kernels (use ``--error-diffusion=help`` to list) but most of
+    them are much slower and demand an even larger amount of shared memory.
diff --git a/video/out/gpu/error_diffusion.c b/video/out/gpu/error_diffusion.c
new file mode 100644
index 0000000000..2bcd2084dd
--- /dev/null
+++ b/video/out/gpu/error_diffusion.c
@@ -0,0 +1,288 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+
+#include "error_diffusion.h"
+
+#include "common/common.h"
+
+#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__)
+#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__)
+
+// After a (y, x) -> (y, x + y * shift) mapping, find the rightmost column that
+// will be affected by the current column.
+static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k)
+{
+    int ret = 0;
+    for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
+        for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
+            if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) {
+                int shifted_x = x + y * k->shift;
+
+                // The shift mapping guarantees current column (or left of it)
+                // won't be affected by error diffusion.
+ Among these kernels, ``burkes`` achieves a good balance between performance + and quality, and probably is the one you want to try first. + ``--gpu-debug`` Enables GPU debugging. What this means depends on the API type. For OpenGL, it calls ``glGetError()``, and requests a debug context. For Vulkan, it diff --git a/video/out/gpu/error_diffusion.c b/video/out/gpu/error_diffusion.c new file mode 100644 index 0000000000..2bcd2084dd --- /dev/null +++ b/video/out/gpu/error_diffusion.c @@ -0,0 +1,288 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include + +#include "error_diffusion.h" + +#include "common/common.h" + +#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__) +#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__) + +// After a (y, x) -> (y, x + y * shift) mapping, find the right most column that +// will be affected by the current column. +static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k) +{ + int ret = 0; + for (int y = 0; y <= EF_MAX_DELTA_Y; y++) { + for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) { + if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) { + int shifted_x = x + y * k->shift; + + // The shift mapping guarantees current column (or left of it) + // won't be affected by error diffusion. 
+ assert(shifted_x > 0); + + ret = MPMAX(ret, shifted_x); + } + } + } + return ret; +} + +const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name) +{ + if (!name) + return NULL; + for (const struct error_diffusion_kernel *k = mp_error_diffusion_kernels; + k->name; + k++) { + if (strcmp(k->name, name) == 0) + return k; + } + return NULL; +} + +int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, + int height) +{ + // We add EF_MAX_DELTA_Y empty lines on the bottom to handle errors + // propagated out from bottom side. + int rows = height + EF_MAX_DELTA_Y; + int shifted_columns = compute_rightmost_shifted_column(k) + 1; + + // The shared memory is an array of size rows*shifted_columns. Each element + // is three int, for each RGB component. + return rows * shifted_columns * 3 * 4; +} + +void pass_error_diffusion(struct gl_shader_cache *sc, + const struct error_diffusion_kernel *k, + int tex, int width, int height, int depth, int block_size) +{ + assert(block_size <= height); + + // The parallel error diffusion works by applying the shift mapping first. + // Taking the Floyd and Steinberg algorithm for example. After applying + // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are + // propagated into the next few columns, which makes parallel processing on + // the same column possible. + // + // X 7/16 X 7/16 + // 3/16 5/16 1/16 ==> 0 0 3/16 5/16 1/16 + + // Figuring out the size of rectangle containing all shifted pixels. + // The rectangle height is not changed. + int shifted_width = width + (height - 1) * k->shift; + + // We process all pixels from the shifted rectangles column by column, with + // a single global work group of size |block_size|. + // Figuring out how many block are required to process all pixels. We need + // this explicitly to make the number of barrier() calls match. 
+ int blocks = (height * shifted_width + block_size - 1) / block_size; + + // If we figure out how many of the next columns will be affected while the + // current columns is being processed. We can store errors of only a few + // columns in the shared memory. Using a ring buffer will further save the + // cost while iterating to next column. + int ring_buffer_rows = height + EF_MAX_DELTA_Y; + int ring_buffer_columns = compute_rightmost_shifted_column(k) + 1; + int ring_buffer_size = ring_buffer_rows * ring_buffer_columns; + + const char *rgb = "rgb"; + + // Defines the ring buffer in shared memory. + for (int comp = 0; comp < 3; comp++) + GLSLH("shared int err_%c[%d];\n", rgb[comp], ring_buffer_size); + + // Initialize the ring buffer. + GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) {\n", + ring_buffer_size, block_size); + for (int comp = 0; comp < 3; comp++) + GLSL("err_%c[i] = 0;\n", rgb[comp]); + GLSL("}\n"); + + GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks); + + // Add barrier here to have previous block all processed before starting + // the processing of the next. + GLSL("groupMemoryBarrier();\n"); + GLSL("barrier();\n"); + + // Compute the coordinate of the pixel we are currently processing, both + // before and after the shift mapping. + GLSL("int id = int(gl_LocalInvocationIndex) + block_id * %d;\n", block_size); + GLSL("int y = id %% %d, x_shifted = id / %d;\n", height, height); + GLSL("int x = x_shifted - y * %d;\n", k->shift); + + // Proceed only if we are processing a valid pixel. + GLSL("if (0 <= x && x < %d) {\n", width); + + // The index that the current pixel have on the ring buffer. + GLSL("int idx = (x_shifted * %d + y) %% %d;\n", ring_buffer_rows, ring_buffer_size); + + // Fetch the current pixel. + GLSL("vec3 pix = texelFetch(texture%d, ivec2(x, y), 0).rgb;\n", tex); + + // The dithering will quantize pixel value into multiples of 1/dither_quant. 
+ int dither_quant = (1 << depth) - 1; + // The absolute value of the errors to propagate is less than 1/dither_quant, + // multiply by dither_quant24 to have them processed with int in 24 bit + // precision. + double dither_quant24 = (double)(1 << 24) * dither_quant; + + // Adding the error previously propagated into current pixel, and clear it + // in the buffer. + GLSL("pix += vec3(err_r[idx], err_g[idx], err_b[idx]) / %f;\n", dither_quant24); + for (int comp = 0; comp < 3; comp++) + GLSL("err_%c[idx] = 0;\n", rgb[comp]); + + // Dithering to depth. + GLSL("vec3 dithered = floor(pix * %d.0 + 0.5) / %d.0;\n", dither_quant, dither_quant); + GLSL("ivec3 err = ivec3((pix - dithered) * %f + 0.5);\n", dither_quant24); + + // Write the dithered pixel. + GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered, 0.0));\n"); + + GLSL("int nidx;\n"); + for (int y = 0; y <= EF_MAX_DELTA_Y; y++) { + for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) { + if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) { + int shifted_x = x + y * k->shift; + + // Unlike the right border, errors propagated out from left + // border will remain in the ring buffer. This will produce + // visible artifacts near the left border, especially for + // shift=3 kernels. + bool left_border_check = x < 0; + + if (left_border_check) + GLSL("if (x >= %d) {\n", -x); + + // Calculate the new position in the ring buffer to propagate + // the error into. + int ring_buffer_delta = shifted_x * ring_buffer_rows + y; + GLSL("nidx = (idx + %d) %% %d;\n", ring_buffer_delta, ring_buffer_size); + + // Propagate the error with atomic operation. + for (int comp = 0; comp < 3; comp++) { + GLSL("atomicAdd(err_%c[nidx], err.%c * %d / %d);\n", + rgb[comp], rgb[comp], + k->pattern[y][x - EF_MIN_DELTA_X], + k->divisor); + } + + if (left_border_check) + GLSL("}\n"); + } + } + } + + GLSL("}\n"); // if (0 <= x && x < width) + + GLSL("}\n"); // block_id +} + +// Different kernels for error diffusion. 
+// Patterns are from http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT +const struct error_diffusion_kernel mp_error_diffusion_kernels[] = { + { + .name = "simple", + .shift = 1, + .pattern = {{0, 0, 0, 1, 0}, + {0, 0, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 2 + }, + { + // The "false" Floyd-Steinberg kernel + .name = "false-fs", + .shift = 1, + .pattern = {{0, 0, 0, 3, 0}, + {0, 0, 3, 2, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 8 + }, + { + .name = "sierra-lite", + .shift = 2, + .pattern = {{0, 0, 0, 2, 0}, + {0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 4 + }, + { + .name = "floyd-steinberg", + .shift = 2, + .pattern = {{0, 0, 0, 7, 0}, + {0, 3, 5, 1, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 16 + }, + { + .name = "atkinson", + .shift = 2, + .pattern = {{0, 0, 0, 1, 1}, + {0, 1, 1, 1, 0}, + {0, 0, 1, 0, 0}}, + .divisor = 8 + }, + // All kernels below have shift value of 3, and probably are too heavy for + // low end GPU. + { + .name = "jarvis-judice-ninke", + .shift = 3, + .pattern = {{0, 0, 0, 7, 5}, + {3, 5, 7, 5, 3}, + {1, 3, 5, 3, 1}}, + .divisor = 48 + }, + { + .name = "stucki", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {1, 2, 4, 2, 1}}, + .divisor = 42 + }, + { + .name = "burkes", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {0, 0, 0, 0, 0}}, + .divisor = 32 + }, + { + .name = "sierra-3", + .shift = 3, + .pattern = {{0, 0, 0, 5, 3}, + {2, 4, 5, 4, 2}, + {0, 2, 3, 2, 0}}, + .divisor = 32 + }, + { + .name = "sierra-2", + .shift = 3, + .pattern = {{0, 0, 0, 4, 3}, + {1, 2, 3, 2, 1}, + {0, 0, 0, 0, 0}}, + .divisor = 16 + }, + {0} +}; diff --git a/video/out/gpu/error_diffusion.h b/video/out/gpu/error_diffusion.h new file mode 100644 index 0000000000..6bdcea16f7 --- /dev/null +++ b/video/out/gpu/error_diffusion.h @@ -0,0 +1,48 @@ +/* + * This file is part of mpv. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_ERROR_DIFFUSION +#define MP_GL_ERROR_DIFFUSION + +#include "shader_cache.h" + +// defines the border of all error diffusion kernels +#define EF_MIN_DELTA_X (-2) +#define EF_MAX_DELTA_X (2) +#define EF_MAX_DELTA_Y (2) + +struct error_diffusion_kernel { + const char *name; + + // The minimum value such that a (y, x) -> (y, x + y * shift) mapping will + // make all error pushing operations affect next column (and after it) only. + int shift; + + // The diffusion factor for (y, x) is pattern[y][x - EF_MIN_DELTA_X] / divisor. 
+ int pattern[EF_MAX_DELTA_Y + 1][EF_MAX_DELTA_X - EF_MIN_DELTA_X + 1]; + int divisor; +}; + +extern const struct error_diffusion_kernel mp_error_diffusion_kernels[]; + +const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name); +int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, int height); +void pass_error_diffusion(struct gl_shader_cache *sc, + const struct error_diffusion_kernel *k, + int tex, int width, int height, int depth, int block_size); + +#endif /* MP_GL_ERROR_DIFFUSION */ diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c index b3e9c0ee1c..9244a9ad95 100644 --- a/video/out/gpu/video.c +++ b/video/out/gpu/video.c @@ -38,6 +38,7 @@ #include "stream/stream.h" #include "video_shaders.h" #include "user_shaders.h" +#include "error_diffusion.h" #include "video/out/filter_kernels.h" #include "video/out/aspect.h" #include "video/out/dither.h" @@ -211,6 +212,7 @@ struct gl_video { struct ra_tex *integer_tex[4]; struct ra_tex *indirect_tex; struct ra_tex *blend_subs_tex; + struct ra_tex *error_diffusion_tex; struct ra_tex *screen_tex; struct ra_tex *output_tex; struct ra_tex *vdpau_deinterleave_tex[2]; @@ -295,6 +297,7 @@ static const struct gl_video_opts gl_video_opts_def = { .dither_depth = -1, .dither_size = 6, .temporal_dither_period = 1, + .error_diffusion = "sierra-lite", .fbo_format = "auto", .sigmoid_center = 0.75, .sigmoid_slope = 6.5, @@ -334,6 +337,9 @@ static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, static int validate_window_opt(struct mp_log *log, const m_option_t *opt, struct bstr name, struct bstr param); +static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + #define OPT_BASE_STRUCT struct gl_video_opts #define SCALER_OPTS(n, i) \ @@ -402,10 +408,13 @@ const struct m_sub_options gl_video_conf = { OPT_CHOICE("dither", dither_algo, 0, ({"fruit", DITHER_FRUIT}, {"ordered", DITHER_ORDERED}, + 
{"error-diffusion", DITHER_ERROR_DIFFUSION}, {"no", DITHER_NONE})), OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8), OPT_FLAG("temporal-dither", temporal_dither, 0), OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128), + OPT_STRING_VALIDATE("error-diffusion", error_diffusion, 0, + validate_error_diffusion_opt), OPT_CHOICE("alpha", alpha_mode, 0, ({"no", ALPHA_NO}, {"yes", ALPHA_YES}, @@ -544,6 +553,7 @@ static void uninit_rendering(struct gl_video *p) ra_tex_free(p->ra, &p->indirect_tex); ra_tex_free(p->ra, &p->blend_subs_tex); + ra_tex_free(p->ra, &p->error_diffusion_tex); ra_tex_free(p->ra, &p->screen_tex); ra_tex_free(p->ra, &p->output_tex); @@ -2595,6 +2605,51 @@ static void pass_dither(struct gl_video *p) if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE) return; + if (p->opts.dither_algo == DITHER_ERROR_DIFFUSION) { + const struct error_diffusion_kernel *kernel = + mp_find_error_diffusion_kernel(p->opts.error_diffusion); + int o_w = p->dst_rect.x1 - p->dst_rect.x0, + o_h = p->dst_rect.y1 - p->dst_rect.y0; + + int shmem_req = mp_ef_compute_shared_memory_size(kernel, o_h); + if (shmem_req > p->ra->max_shmem) { + MP_WARN(p, "Fallback to dither=fruit because there is no enough " + "shared memory (%d/%d).\n", + shmem_req, (int)p->ra->max_shmem); + p->opts.dither_algo = DITHER_FRUIT; + } else { + finish_pass_tex(p, &p->screen_tex, o_w, o_h); + + struct image img = image_wrap(p->screen_tex, PLANE_RGB, p->components); + + // 1024 is minimal required number of invocation allowed in single + // work group in OpenGL. Use it for maximal performance. 
+ int block_size = MPMIN(1024, o_h); + + pass_describe(p, "dither=error-diffusion (kernel=%s, depth=%d)", + kernel->name, dst_depth); + + p->pass_compute = (struct compute_info) { + .active = true, + .threads_w = block_size, + .threads_h = 1, + .directly_writes = true + }; + + int tex_id = pass_bind(p, img); + + pass_error_diffusion(p->sc, kernel, tex_id, o_w, o_h, + dst_depth, block_size); + + finish_pass_tex(p, &p->error_diffusion_tex, o_w, o_h); + + img = image_wrap(p->error_diffusion_tex, PLANE_RGB, p->components); + copy_image(p, &(int){0}, img); + + return; + } + } + if (!p->dither_texture) { MP_VERBOSE(p, "Dither to %d.\n", dst_depth); @@ -3632,6 +3687,12 @@ static void check_gl_features(struct gl_video *p) "available! See your FBO format configuration!\n"); } + if (!have_compute && p->opts.dither_algo == DITHER_ERROR_DIFFUSION) { + MP_WARN(p, "Disabling error diffusion dithering because compute shader " + "was not supported. Fallback to dither=fruit instead.\n"); + p->opts.dither_algo = DITHER_FRUIT; + } + bool have_compute_peak = have_compute && have_ssbo; if (!have_compute_peak && p->opts.tone_map.compute_peak >= 0) { int msgl = p->opts.tone_map.compute_peak == 1 ? 
MSGL_WARN : MSGL_V; @@ -3663,6 +3724,7 @@ static void check_gl_features(struct gl_video *p) .dither_algo = p->opts.dither_algo, .dither_depth = p->opts.dither_depth, .dither_size = p->opts.dither_size, + .error_diffusion = p->opts.error_diffusion, .temporal_dither = p->opts.temporal_dither, .temporal_dither_period = p->opts.temporal_dither_period, .tex_pad_x = p->opts.tex_pad_x, @@ -4011,6 +4073,29 @@ static int validate_window_opt(struct mp_log *log, const m_option_t *opt, return r; } +static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + char s[20] = {0}; + int r = 1; + if (bstr_equals0(param, "help")) { + r = M_OPT_EXIT; + } else { + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + const struct error_diffusion_kernel *k = mp_find_error_diffusion_kernel(s); + if (!k) + r = M_OPT_INVALID; + } + if (r < 1) { + mp_info(log, "Available error diffusion kernels:\n"); + for (int n = 0; mp_error_diffusion_kernels[n].name; n++) + mp_info(log, " %s\n", mp_error_diffusion_kernels[n].name); + if (s[0]) + mp_fatal(log, "No error diffusion kernel named '%s' found!\n", s); + } + return r; +} + float gl_video_scale_ambient_lux(float lmin, float lmax, float rmin, float rmax, float lux) { diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h index 1b0994ac78..931944a777 100644 --- a/video/out/gpu/video.h +++ b/video/out/gpu/video.h @@ -71,6 +71,7 @@ enum dither_algo { DITHER_NONE = 0, DITHER_FRUIT, DITHER_ORDERED, + DITHER_ERROR_DIFFUSION, }; enum alpha_mode { @@ -131,6 +132,7 @@ struct gl_video_opts { int dither_size; int temporal_dither; int temporal_dither_period; + char *error_diffusion; char *fbo_format; int alpha_mode; int use_rectangle; diff --git a/wscript_build.py b/wscript_build.py index 6306c28d72..0814b9122e 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -433,6 +433,7 @@ def build(ctx): ( "video/out/filter_kernels.c" ), ( "video/out/gpu/context.c" ), ( 
"video/out/gpu/d3d11_helpers.c", "d3d11 || egl-angle-win32" ), + ( "video/out/gpu/error_diffusion.c" ), ( "video/out/gpu/hwdec.c" ), ( "video/out/gpu/lcms.c" ), ( "video/out/gpu/libmpv_gpu.c" ),