diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 0ef7aae84a..b7ffda9a8c 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4346,10 +4346,16 @@ The following video options are currently all specific to ``--vo=gpu`` and
     Used in ``--dither=fruit`` mode only.
 
-``--dither=<fruit|ordered|no>``
+``--dither=<fruit|ordered|error-diffusion|no>``
     Select dithering algorithm (default: fruit). (Normally, the
     ``--dither-depth`` option controls whether dithering is enabled.)
 
+    The ``error-diffusion`` option requires compute shader support. It also
+    requires a large amount of shared memory to run, the size of which depends
+    on both the kernel (see ``--error-diffusion`` option below) and the height
+    of the video window. It will fall back to ``fruit`` dithering if there is
+    not enough shared memory to run the shader.
+
 ``--temporal-dither``
     Enable temporal dithering. (Only active if dithering is enabled in
     general.) This changes between 8 different dithering patterns on each frame
@@ -4362,6 +4368,29 @@ The following video options are currently all specific to ``--vo=gpu`` and
     ``--temporal-dither`` is in use. 1 (the default) will update on every video
     frame, 2 on every other frame, etc.
 
+``--error-diffusion=<kernel>``
+    The error diffusion kernel to use when ``--dither=error-diffusion`` is set.
+
+    ``simple``
+        Propagate error to only two adjacent pixels. Fastest but low quality.
+
+    ``sierra-lite``
+        Fast with reasonable quality. This is the default.
+
+    ``floyd-steinberg``
+        The most notable error diffusion kernel.
+
+    ``atkinson``
+        Looks different from other kernels because only a fraction of errors
+        will be propagated during dithering. A typical use case of this kernel
+        is saving a dithered screenshot (in window mode). This kernel produces
+        a slightly smaller file, with still reasonable dithering quality.
+
+    There are other kernels (use ``--error-diffusion=help`` to list) but most of
+    them are much slower and demand an even larger amount of shared memory.
diff --git a/video/out/gpu/error_diffusion.c b/video/out/gpu/error_diffusion.c
new file mode 100644
index 0000000000..2bcd2084dd
--- /dev/null
+++ b/video/out/gpu/error_diffusion.c
@@ -0,0 +1,288 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+
+#include "error_diffusion.h"
+
+#include "common/common.h"
+
+#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__)
+#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__)
+
+// After a (y, x) -> (y, x + y * shift) mapping, find the rightmost column that
+// will be affected by the current column.
+static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k)
+{
+    int ret = 0;
+    for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
+        for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
+            if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) {
+                int shifted_x = x + y * k->shift;
+
+                // The shift mapping guarantees current column (or left of it)
+                // won't be affected by error diffusion.
+ Among these kernels, ``burkes`` achieves a good balance between performance + and quality, and probably is the one you want to try first. + ``--gpu-debug`` Enables GPU debugging. What this means depends on the API type. For OpenGL, it calls ``glGetError()``, and requests a debug context. For Vulkan, it diff --git a/video/out/gpu/error_diffusion.c b/video/out/gpu/error_diffusion.c new file mode 100644 index 0000000000..2bcd2084dd --- /dev/null +++ b/video/out/gpu/error_diffusion.c @@ -0,0 +1,288 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include + +#include "error_diffusion.h" + +#include "common/common.h" + +#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__) +#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__) + +// After a (y, x) -> (y, x + y * shift) mapping, find the right most column that +// will be affected by the current column. +static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k) +{ + int ret = 0; + for (int y = 0; y <= EF_MAX_DELTA_Y; y++) { + for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) { + if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) { + int shifted_x = x + y * k->shift; + + // The shift mapping guarantees current column (or left of it) + // won't be affected by error diffusion. 
+ assert(shifted_x > 0); + + ret = MPMAX(ret, shifted_x); + } + } + } + return ret; +} + +const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name) +{ + if (!name) + return NULL; + for (const struct error_diffusion_kernel *k = mp_error_diffusion_kernels; + k->name; + k++) { + if (strcmp(k->name, name) == 0) + return k; + } + return NULL; +} + +int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, + int height) +{ + // We add EF_MAX_DELTA_Y empty lines on the bottom to handle errors + // propagated out from bottom side. + int rows = height + EF_MAX_DELTA_Y; + int shifted_columns = compute_rightmost_shifted_column(k) + 1; + + // The shared memory is an array of size rows*shifted_columns. Each element + // is three int, for each RGB component. + return rows * shifted_columns * 3 * 4; +} + +void pass_error_diffusion(struct gl_shader_cache *sc, + const struct error_diffusion_kernel *k, + int tex, int width, int height, int depth, int block_size) +{ + assert(block_size <= height); + + // The parallel error diffusion works by applying the shift mapping first. + // Taking the Floyd and Steinberg algorithm for example. After applying + // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are + // propagated into the next few columns, which makes parallel processing on + // the same column possible. + // + // X 7/16 X 7/16 + // 3/16 5/16 1/16 ==> 0 0 3/16 5/16 1/16 + + // Figuring out the size of rectangle containing all shifted pixels. + // The rectangle height is not changed. + int shifted_width = width + (height - 1) * k->shift; + + // We process all pixels from the shifted rectangles column by column, with + // a single global work group of size |block_size|. + // Figuring out how many block are required to process all pixels. We need + // this explicitly to make the number of barrier() calls match. 
+ int blocks = (height * shifted_width + block_size - 1) / block_size; + + // If we figure out how many of the next columns will be affected while the + // current columns is being processed. We can store errors of only a few + // columns in the shared memory. Using a ring buffer will further save the + // cost while iterating to next column. + int ring_buffer_rows = height + EF_MAX_DELTA_Y; + int ring_buffer_columns = compute_rightmost_shifted_column(k) + 1; + int ring_buffer_size = ring_buffer_rows * ring_buffer_columns; + + const char *rgb = "rgb"; + + // Defines the ring buffer in shared memory. + for (int comp = 0; comp < 3; comp++) + GLSLH("shared int err_%c[%d];\n", rgb[comp], ring_buffer_size); + + // Initialize the ring buffer. + GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) {\n", + ring_buffer_size, block_size); + for (int comp = 0; comp < 3; comp++) + GLSL("err_%c[i] = 0;\n", rgb[comp]); + GLSL("}\n"); + + GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks); + + // Add barrier here to have previous block all processed before starting + // the processing of the next. + GLSL("groupMemoryBarrier();\n"); + GLSL("barrier();\n"); + + // Compute the coordinate of the pixel we are currently processing, both + // before and after the shift mapping. + GLSL("int id = int(gl_LocalInvocationIndex) + block_id * %d;\n", block_size); + GLSL("int y = id %% %d, x_shifted = id / %d;\n", height, height); + GLSL("int x = x_shifted - y * %d;\n", k->shift); + + // Proceed only if we are processing a valid pixel. + GLSL("if (0 <= x && x < %d) {\n", width); + + // The index that the current pixel have on the ring buffer. + GLSL("int idx = (x_shifted * %d + y) %% %d;\n", ring_buffer_rows, ring_buffer_size); + + // Fetch the current pixel. + GLSL("vec3 pix = texelFetch(texture%d, ivec2(x, y), 0).rgb;\n", tex); + + // The dithering will quantize pixel value into multiples of 1/dither_quant. 
+ int dither_quant = (1 << depth) - 1; + // The absolute value of the errors to propagate is less than 1/dither_quant, + // multiply by dither_quant24 to have them processed with int in 24 bit + // precision. + double dither_quant24 = (double)(1 << 24) * dither_quant; + + // Adding the error previously propagated into current pixel, and clear it + // in the buffer. + GLSL("pix += vec3(err_r[idx], err_g[idx], err_b[idx]) / %f;\n", dither_quant24); + for (int comp = 0; comp < 3; comp++) + GLSL("err_%c[idx] = 0;\n", rgb[comp]); + + // Dithering to depth. + GLSL("vec3 dithered = floor(pix * %d.0 + 0.5) / %d.0;\n", dither_quant, dither_quant); + GLSL("ivec3 err = ivec3((pix - dithered) * %f + 0.5);\n", dither_quant24); + + // Write the dithered pixel. + GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered, 0.0));\n"); + + GLSL("int nidx;\n"); + for (int y = 0; y <= EF_MAX_DELTA_Y; y++) { + for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) { + if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) { + int shifted_x = x + y * k->shift; + + // Unlike the right border, errors propagated out from left + // border will remain in the ring buffer. This will produce + // visible artifacts near the left border, especially for + // shift=3 kernels. + bool left_border_check = x < 0; + + if (left_border_check) + GLSL("if (x >= %d) {\n", -x); + + // Calculate the new position in the ring buffer to propagate + // the error into. + int ring_buffer_delta = shifted_x * ring_buffer_rows + y; + GLSL("nidx = (idx + %d) %% %d;\n", ring_buffer_delta, ring_buffer_size); + + // Propagate the error with atomic operation. + for (int comp = 0; comp < 3; comp++) { + GLSL("atomicAdd(err_%c[nidx], err.%c * %d / %d);\n", + rgb[comp], rgb[comp], + k->pattern[y][x - EF_MIN_DELTA_X], + k->divisor); + } + + if (left_border_check) + GLSL("}\n"); + } + } + } + + GLSL("}\n"); // if (0 <= x && x < width) + + GLSL("}\n"); // block_id +} + +// Different kernels for error diffusion. 
+// Patterns are from http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT +const struct error_diffusion_kernel mp_error_diffusion_kernels[] = { + { + .name = "simple", + .shift = 1, + .pattern = {{0, 0, 0, 1, 0}, + {0, 0, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 2 + }, + { + // The "false" Floyd-Steinberg kernel + .name = "false-fs", + .shift = 1, + .pattern = {{0, 0, 0, 3, 0}, + {0, 0, 3, 2, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 8 + }, + { + .name = "sierra-lite", + .shift = 2, + .pattern = {{0, 0, 0, 2, 0}, + {0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 4 + }, + { + .name = "floyd-steinberg", + .shift = 2, + .pattern = {{0, 0, 0, 7, 0}, + {0, 3, 5, 1, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 16 + }, + { + .name = "atkinson", + .shift = 2, + .pattern = {{0, 0, 0, 1, 1}, + {0, 1, 1, 1, 0}, + {0, 0, 1, 0, 0}}, + .divisor = 8 + }, + // All kernels below have shift value of 3, and probably are too heavy for + // low end GPU. + { + .name = "jarvis-judice-ninke", + .shift = 3, + .pattern = {{0, 0, 0, 7, 5}, + {3, 5, 7, 5, 3}, + {1, 3, 5, 3, 1}}, + .divisor = 48 + }, + { + .name = "stucki", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {1, 2, 4, 2, 1}}, + .divisor = 42 + }, + { + .name = "burkes", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {0, 0, 0, 0, 0}}, + .divisor = 32 + }, + { + .name = "sierra-3", + .shift = 3, + .pattern = {{0, 0, 0, 5, 3}, + {2, 4, 5, 4, 2}, + {0, 2, 3, 2, 0}}, + .divisor = 32 + }, + { + .name = "sierra-2", + .shift = 3, + .pattern = {{0, 0, 0, 4, 3}, + {1, 2, 3, 2, 1}, + {0, 0, 0, 0, 0}}, + .divisor = 16 + }, + {0} +}; diff --git a/video/out/gpu/error_diffusion.h b/video/out/gpu/error_diffusion.h new file mode 100644 index 0000000000..6bdcea16f7 --- /dev/null +++ b/video/out/gpu/error_diffusion.h @@ -0,0 +1,48 @@ +/* + * This file is part of mpv. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_ERROR_DIFFUSION +#define MP_GL_ERROR_DIFFUSION + +#include "shader_cache.h" + +// defines the border of all error diffusion kernels +#define EF_MIN_DELTA_X (-2) +#define EF_MAX_DELTA_X (2) +#define EF_MAX_DELTA_Y (2) + +struct error_diffusion_kernel { + const char *name; + + // The minimum value such that a (y, x) -> (y, x + y * shift) mapping will + // make all error pushing operations affect next column (and after it) only. + int shift; + + // The diffusion factor for (y, x) is pattern[y][x - EF_MIN_DELTA_X] / divisor. 
+ int pattern[EF_MAX_DELTA_Y + 1][EF_MAX_DELTA_X - EF_MIN_DELTA_X + 1]; + int divisor; +}; + +extern const struct error_diffusion_kernel mp_error_diffusion_kernels[]; + +const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name); +int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, int height); +void pass_error_diffusion(struct gl_shader_cache *sc, + const struct error_diffusion_kernel *k, + int tex, int width, int height, int depth, int block_size); + +#endif /* MP_GL_ERROR_DIFFUSION */ diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c index b3e9c0ee1c..9244a9ad95 100644 --- a/video/out/gpu/video.c +++ b/video/out/gpu/video.c @@ -38,6 +38,7 @@ #include "stream/stream.h" #include "video_shaders.h" #include "user_shaders.h" +#include "error_diffusion.h" #include "video/out/filter_kernels.h" #include "video/out/aspect.h" #include "video/out/dither.h" @@ -211,6 +212,7 @@ struct gl_video { struct ra_tex *integer_tex[4]; struct ra_tex *indirect_tex; struct ra_tex *blend_subs_tex; + struct ra_tex *error_diffusion_tex; struct ra_tex *screen_tex; struct ra_tex *output_tex; struct ra_tex *vdpau_deinterleave_tex[2]; @@ -295,6 +297,7 @@ static const struct gl_video_opts gl_video_opts_def = { .dither_depth = -1, .dither_size = 6, .temporal_dither_period = 1, + .error_diffusion = "sierra-lite", .fbo_format = "auto", .sigmoid_center = 0.75, .sigmoid_slope = 6.5, @@ -334,6 +337,9 @@ static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, static int validate_window_opt(struct mp_log *log, const m_option_t *opt, struct bstr name, struct bstr param); +static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + #define OPT_BASE_STRUCT struct gl_video_opts #define SCALER_OPTS(n, i) \ @@ -402,10 +408,13 @@ const struct m_sub_options gl_video_conf = { OPT_CHOICE("dither", dither_algo, 0, ({"fruit", DITHER_FRUIT}, {"ordered", DITHER_ORDERED}, + 
{"error-diffusion", DITHER_ERROR_DIFFUSION}, {"no", DITHER_NONE})), OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8), OPT_FLAG("temporal-dither", temporal_dither, 0), OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128), + OPT_STRING_VALIDATE("error-diffusion", error_diffusion, 0, + validate_error_diffusion_opt), OPT_CHOICE("alpha", alpha_mode, 0, ({"no", ALPHA_NO}, {"yes", ALPHA_YES}, @@ -544,6 +553,7 @@ static void uninit_rendering(struct gl_video *p) ra_tex_free(p->ra, &p->indirect_tex); ra_tex_free(p->ra, &p->blend_subs_tex); + ra_tex_free(p->ra, &p->error_diffusion_tex); ra_tex_free(p->ra, &p->screen_tex); ra_tex_free(p->ra, &p->output_tex); @@ -2595,6 +2605,51 @@ static void pass_dither(struct gl_video *p) if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE) return; + if (p->opts.dither_algo == DITHER_ERROR_DIFFUSION) { + const struct error_diffusion_kernel *kernel = + mp_find_error_diffusion_kernel(p->opts.error_diffusion); + int o_w = p->dst_rect.x1 - p->dst_rect.x0, + o_h = p->dst_rect.y1 - p->dst_rect.y0; + + int shmem_req = mp_ef_compute_shared_memory_size(kernel, o_h); + if (shmem_req > p->ra->max_shmem) { + MP_WARN(p, "Fallback to dither=fruit because there is no enough " + "shared memory (%d/%d).\n", + shmem_req, (int)p->ra->max_shmem); + p->opts.dither_algo = DITHER_FRUIT; + } else { + finish_pass_tex(p, &p->screen_tex, o_w, o_h); + + struct image img = image_wrap(p->screen_tex, PLANE_RGB, p->components); + + // 1024 is minimal required number of invocation allowed in single + // work group in OpenGL. Use it for maximal performance. 
+ int block_size = MPMIN(1024, o_h); + + pass_describe(p, "dither=error-diffusion (kernel=%s, depth=%d)", + kernel->name, dst_depth); + + p->pass_compute = (struct compute_info) { + .active = true, + .threads_w = block_size, + .threads_h = 1, + .directly_writes = true + }; + + int tex_id = pass_bind(p, img); + + pass_error_diffusion(p->sc, kernel, tex_id, o_w, o_h, + dst_depth, block_size); + + finish_pass_tex(p, &p->error_diffusion_tex, o_w, o_h); + + img = image_wrap(p->error_diffusion_tex, PLANE_RGB, p->components); + copy_image(p, &(int){0}, img); + + return; + } + } + if (!p->dither_texture) { MP_VERBOSE(p, "Dither to %d.\n", dst_depth); @@ -3632,6 +3687,12 @@ static void check_gl_features(struct gl_video *p) "available! See your FBO format configuration!\n"); } + if (!have_compute && p->opts.dither_algo == DITHER_ERROR_DIFFUSION) { + MP_WARN(p, "Disabling error diffusion dithering because compute shader " + "was not supported. Fallback to dither=fruit instead.\n"); + p->opts.dither_algo = DITHER_FRUIT; + } + bool have_compute_peak = have_compute && have_ssbo; if (!have_compute_peak && p->opts.tone_map.compute_peak >= 0) { int msgl = p->opts.tone_map.compute_peak == 1 ? 
MSGL_WARN : MSGL_V; @@ -3663,6 +3724,7 @@ static void check_gl_features(struct gl_video *p) .dither_algo = p->opts.dither_algo, .dither_depth = p->opts.dither_depth, .dither_size = p->opts.dither_size, + .error_diffusion = p->opts.error_diffusion, .temporal_dither = p->opts.temporal_dither, .temporal_dither_period = p->opts.temporal_dither_period, .tex_pad_x = p->opts.tex_pad_x, @@ -4011,6 +4073,29 @@ static int validate_window_opt(struct mp_log *log, const m_option_t *opt, return r; } +static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + char s[20] = {0}; + int r = 1; + if (bstr_equals0(param, "help")) { + r = M_OPT_EXIT; + } else { + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + const struct error_diffusion_kernel *k = mp_find_error_diffusion_kernel(s); + if (!k) + r = M_OPT_INVALID; + } + if (r < 1) { + mp_info(log, "Available error diffusion kernels:\n"); + for (int n = 0; mp_error_diffusion_kernels[n].name; n++) + mp_info(log, " %s\n", mp_error_diffusion_kernels[n].name); + if (s[0]) + mp_fatal(log, "No error diffusion kernel named '%s' found!\n", s); + } + return r; +} + float gl_video_scale_ambient_lux(float lmin, float lmax, float rmin, float rmax, float lux) { diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h index 1b0994ac78..931944a777 100644 --- a/video/out/gpu/video.h +++ b/video/out/gpu/video.h @@ -71,6 +71,7 @@ enum dither_algo { DITHER_NONE = 0, DITHER_FRUIT, DITHER_ORDERED, + DITHER_ERROR_DIFFUSION, }; enum alpha_mode { @@ -131,6 +132,7 @@ struct gl_video_opts { int dither_size; int temporal_dither; int temporal_dither_period; + char *error_diffusion; char *fbo_format; int alpha_mode; int use_rectangle; diff --git a/wscript_build.py b/wscript_build.py index 6306c28d72..0814b9122e 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -433,6 +433,7 @@ def build(ctx): ( "video/out/filter_kernels.c" ), ( "video/out/gpu/context.c" ), ( 
"video/out/gpu/d3d11_helpers.c", "d3d11 || egl-angle-win32" ), + ( "video/out/gpu/error_diffusion.c" ), ( "video/out/gpu/hwdec.c" ), ( "video/out/gpu/lcms.c" ), ( "video/out/gpu/libmpv_gpu.c" ),