vo_gpu: implement error diffusion for dithering

This is a straightforward parallel implementation of error diffusion
algorithms in compute shader. Basically we use single work group with
maximal possible size to process the whole image. After a shift
mapping we are able to process all pixels column by column.

A large ring buffer is allocated in shared memory to speed things up.
However, the size of the required shared memory depends linearly on the
height of the video window (or screen height in fullscreen mode). In case
there is not enough shared memory, it will fall back to `--dither=fruit`.

The maximal allowed work group size is hardcoded as 1024. Ideally we
could query `GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS`. But for whatever
reason, it seems most high-end cards from NVIDIA and AMD support only
the minimal required value, so I guess we can stick to it for now.
This commit is contained in:
Bin Jin 2019-03-16 11:19:51 +00:00 committed by sfan5
parent 6aecd10eba
commit ca2f193671
6 changed files with 454 additions and 1 deletions

View File

@ -4346,10 +4346,16 @@ The following video options are currently all specific to ``--vo=gpu`` and
Used in ``--dither=fruit`` mode only.
``--dither=<fruit|ordered|no>``
``--dither=<fruit|ordered|error-diffusion|no>``
Select dithering algorithm (default: fruit). (Normally, the
``--dither-depth`` option controls whether dithering is enabled.)
The ``error-diffusion`` option requires compute shader support. It also
requires a large amount of shared memory to run, the size of which depends on
both the kernel (see ``--error-diffusion`` option below) and the height of
the video window. It will fall back to ``fruit`` dithering if there is not
enough shared memory to run the shader.
``--temporal-dither``
Enable temporal dithering. (Only active if dithering is enabled in
general.) This changes between 8 different dithering patterns on each frame
@ -4362,6 +4368,29 @@ The following video options are currently all specific to ``--vo=gpu`` and
``--temporal-dither`` is in use. 1 (the default) will update on every video
frame, 2 on every other frame, etc.
``--error-diffusion=<kernel>``
The error diffusion kernel to use when ``--dither=error-diffusion`` is set.
``simple``
Propagate error to only two adjacent pixels. Fastest but low quality.
``sierra-lite``
Fast with reasonable quality. This is the default.
``floyd-steinberg``
Most notable error diffusion kernel.
``atkinson``
Looks different from other kernels because only a fraction of the error
is propagated during dithering. A typical use case of this kernel is
saving dithered screenshots (in window mode). This kernel produces
slightly smaller files, with still reasonable dithering quality.
There are other kernels (use ``--error-diffusion=help`` to list them), but
most of them are much slower and demand an even larger amount of shared
memory. Among these kernels, ``burkes`` achieves a good balance between
performance and quality, and is probably the one you want to try first.
``--gpu-debug``
Enables GPU debugging. What this means depends on the API type. For OpenGL,
it calls ``glGetError()``, and requests a debug context. For Vulkan, it

View File

@ -0,0 +1,288 @@
/*
* This file is part of mpv.
*
* mpv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* mpv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
#include "error_diffusion.h"
#include "common/common.h"
// Shorthands that append printf-style formatted text to the shader being
// built. Both expand to a call on a variable named `sc`, so a shader cache
// pointer with exactly that name must be in scope wherever they are used.
// In this file GLSLH is used for the `shared` array declarations and GLSL for
// the statements of the shader body.
#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__)
#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__)
// Returns the largest column offset, measured after the
// (y, x) -> (y, x + y * shift) mapping, that receives error from the pixel
// currently being processed. Returns 0 if the kernel has no non-zero weight.
static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k)
{
    int rightmost = 0;

    for (int dy = 0; dy <= EF_MAX_DELTA_Y; dy++) {
        for (int dx = EF_MIN_DELTA_X; dx <= EF_MAX_DELTA_X; dx++) {
            // Offsets without a weight never receive any error.
            if (k->pattern[dy][dx - EF_MIN_DELTA_X] == 0)
                continue;

            int shifted_dx = dx + dy * k->shift;

            // With a correctly chosen shift every target lies strictly to the
            // right of the current column.
            assert(shifted_dx > 0);

            if (shifted_dx > rightmost)
                rightmost = shifted_dx;
        }
    }

    return rightmost;
}
// Looks up an error diffusion kernel by its exact (case-sensitive) name.
// Returns NULL if name is NULL or if no kernel carries that name.
const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name)
{
    if (name) {
        // The kernel table ends with an entry whose name is NULL.
        const struct error_diffusion_kernel *cur = mp_error_diffusion_kernels;
        while (cur->name) {
            if (!strcmp(cur->name, name))
                return cur;
            cur++;
        }
    }
    return NULL;
}
// Returns the number of bytes of shared memory that the error diffusion
// shader needs for kernel k when processing an image of the given height.
int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k,
                                     int height)
{
    // Each buffer column holds the image rows plus EF_MAX_DELTA_Y spare rows
    // that absorb the errors pushed out past the bottom edge.
    int column_len = height + EF_MAX_DELTA_Y;

    // The current column plus every later column it can push error into.
    int column_count = compute_rightmost_shifted_column(k) + 1;

    // Every cell stores three ints (one per RGB component) of 4 bytes each.
    int bytes_per_cell = 3 * 4;

    return column_len * column_count * bytes_per_cell;
}
// Emits the GLSL code of the error diffusion dithering pass into sc.
//
//   k           kernel describing where, and with which weights, the
//               quantization error of a pixel is spread
//   tex         index N of the input sampler, referenced as textureN
//   width       image width in pixels
//   height      image height in pixels
//   depth       target bit depth; colors are quantized to multiples of
//               1 / ((1 << depth) - 1)
//   block_size  size of the single work group that processes the image;
//               must not exceed height
//
// The generated code writes its result with imageStore() to out_image.
void pass_error_diffusion(struct gl_shader_cache *sc,
                          const struct error_diffusion_kernel *k,
                          int tex, int width, int height, int depth, int block_size)
{
    assert(block_size <= height);

    static const char channels[] = "rgb";

    // Error diffusion is made parallel by first applying the shift mapping
    // (y, x) -> (y, x + y * shift). Taking Floyd-Steinberg (shift=2) as the
    // example, every error lands in one of the following columns after the
    // mapping, so all pixels of one shifted column can be processed at once:
    //
    //       X   7/16                 X   7/16
    // 3/16 5/16 1/16    ==>     0    0   3/16 5/16 1/16

    // Width of the rectangle that contains all shifted pixels. Its height is
    // the same as the image height.
    const int shifted_width = width + (height - 1) * k->shift;

    // The shifted rectangle is walked column by column by one work group of
    // block_size invocations. The block count has to be known explicitly so
    // that the number of barrier() calls matches across invocations.
    const int total_cells = height * shifted_width;
    const int blocks = (total_cells + block_size - 1) / block_size;

    // Only the few columns that the current column can still push error into
    // have to be stored, so the errors are kept in a ring buffer in shared
    // memory; moving on to the next column then costs nothing extra.
    const int ring_rows = height + EF_MAX_DELTA_Y;
    const int ring_cols = compute_rightmost_shifted_column(k) + 1;
    const int ring_size = ring_rows * ring_cols;

    // Declare the ring buffer: one shared int array per color channel.
    for (int c = 0; c < 3; c++)
        GLSLH("shared int err_%c[%d];\n", channels[c], ring_size);

    // Clear the ring buffer, with the invocations striding over it.
    GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) {\n",
         ring_size, block_size);
    for (int c = 0; c < 3; c++)
        GLSL("err_%c[i] = 0;\n", channels[c]);
    GLSL("}\n");

    GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks);

    // The previous block must be completely processed before the next one
    // starts, hence the barrier at the top of every iteration.
    GLSL("groupMemoryBarrier();\n");
    GLSL("barrier();\n");

    // Coordinates of the pixel handled by this invocation, after (x_shifted)
    // and before (x) the shift mapping.
    GLSL("int id = int(gl_LocalInvocationIndex) + block_id * %d;\n", block_size);
    GLSL("int y = id %% %d, x_shifted = id / %d;\n", height, height);
    GLSL("int x = x_shifted - y * %d;\n", k->shift);

    // Cells of the shifted rectangle that do not map to a real pixel are
    // skipped.
    GLSL("if (0 <= x && x < %d) {\n", width);

    // Slot of the current pixel in the ring buffer.
    GLSL("int idx = (x_shifted * %d + y) %% %d;\n", ring_rows, ring_size);

    // Load the current pixel.
    GLSL("vec3 pix = texelFetch(texture%d, ivec2(x, y), 0).rgb;\n", tex);

    // Pixel values get quantized to multiples of 1/quant_levels.
    const int quant_levels = (1 << depth) - 1;
    // Propagated errors are smaller than 1/quant_levels in magnitude. Scaling
    // them by err_scale lets them be accumulated as ints with 24 bits of
    // precision.
    const double err_scale = (double)(1 << 24) * quant_levels;

    // Apply the error accumulated for this pixel so far and release its slot
    // in the ring buffer.
    GLSL("pix += vec3(err_r[idx], err_g[idx], err_b[idx]) / %f;\n", err_scale);
    for (int c = 0; c < 3; c++)
        GLSL("err_%c[idx] = 0;\n", channels[c]);

    // Quantize to the target depth and compute the remaining error.
    // NOTE(review): the ivec3() conversion presumably truncates toward zero,
    // in which case "+ 0.5" rounds to nearest only for non-negative errors.
    GLSL("vec3 dithered = floor(pix * %d.0 + 0.5) / %d.0;\n", quant_levels, quant_levels);
    GLSL("ivec3 err = ivec3((pix - dithered) * %f + 0.5);\n", err_scale);

    // Store the dithered pixel.
    GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered, 0.0));\n");

    // Spread the error to every neighbor that has a weight in the kernel.
    GLSL("int nidx;\n");
    for (int dy = 0; dy <= EF_MAX_DELTA_Y; dy++) {
        for (int dx = EF_MIN_DELTA_X; dx <= EF_MAX_DELTA_X; dx++) {
            const int weight = k->pattern[dy][dx - EF_MIN_DELTA_X];
            if (weight == 0)
                continue;

            // Errors pushed out over the right border are harmless, but those
            // pushed out over the left border would stay in the ring buffer
            // and cause visible artifacts near the left border (especially
            // with shift=3 kernels). Targets left of the pixel are therefore
            // guarded by a bounds check in the shader.
            const bool guard_left = dx < 0;
            if (guard_left)
                GLSL("if (x >= %d) {\n", -dx);

            // Distance in the ring buffer from the current slot to the slot
            // of the target pixel.
            const int ring_delta = (dx + dy * k->shift) * ring_rows + dy;
            GLSL("nidx = (idx + %d) %% %d;\n", ring_delta, ring_size);

            // Several invocations may target the same slot, so the error is
            // added atomically.
            for (int c = 0; c < 3; c++) {
                GLSL("atomicAdd(err_%c[nidx], err.%c * %d / %d);\n",
                     channels[c], channels[c], weight, k->divisor);
            }

            if (guard_left)
                GLSL("}\n");
        }
    }

    GLSL("}\n"); // if (0 <= x && x < width)

    GLSL("}\n"); // block_id
}
// Table of the supported error diffusion kernels, terminated by an all-zero
// entry (name == NULL).
// Patterns are from http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT
//
// Row y of a pattern lists the weights for x = EF_MIN_DELTA_X..EF_MAX_DELTA_X,
// so the pixel being processed corresponds to x = 0 in row 0.
const struct error_diffusion_kernel mp_error_diffusion_kernels[] = {
    {
        .name = "simple",
        .shift = 1,
        .divisor = 2,
        .pattern = {{0, 0, 0, 1, 0},
                    {0, 0, 1, 0, 0},
                    {0, 0, 0, 0, 0}},
    },
    {
        // The "false" Floyd-Steinberg kernel
        .name = "false-fs",
        .shift = 1,
        .divisor = 8,
        .pattern = {{0, 0, 0, 3, 0},
                    {0, 0, 3, 2, 0},
                    {0, 0, 0, 0, 0}},
    },
    {
        .name = "sierra-lite",
        .shift = 2,
        .divisor = 4,
        .pattern = {{0, 0, 0, 2, 0},
                    {0, 1, 1, 0, 0},
                    {0, 0, 0, 0, 0}},
    },
    {
        .name = "floyd-steinberg",
        .shift = 2,
        .divisor = 16,
        .pattern = {{0, 0, 0, 7, 0},
                    {0, 3, 5, 1, 0},
                    {0, 0, 0, 0, 0}},
    },
    {
        // Weights sum to 6 of 8: part of the error is deliberately dropped.
        .name = "atkinson",
        .shift = 2,
        .divisor = 8,
        .pattern = {{0, 0, 0, 1, 1},
                    {0, 1, 1, 1, 0},
                    {0, 0, 1, 0, 0}},
    },

    // All kernels below have shift value of 3, and probably are too heavy for
    // low end GPU.
    {
        .name = "jarvis-judice-ninke",
        .shift = 3,
        .divisor = 48,
        .pattern = {{0, 0, 0, 7, 5},
                    {3, 5, 7, 5, 3},
                    {1, 3, 5, 3, 1}},
    },
    {
        .name = "stucki",
        .shift = 3,
        .divisor = 42,
        .pattern = {{0, 0, 0, 8, 4},
                    {2, 4, 8, 4, 2},
                    {1, 2, 4, 2, 1}},
    },
    {
        .name = "burkes",
        .shift = 3,
        .divisor = 32,
        .pattern = {{0, 0, 0, 8, 4},
                    {2, 4, 8, 4, 2},
                    {0, 0, 0, 0, 0}},
    },
    {
        .name = "sierra-3",
        .shift = 3,
        .divisor = 32,
        .pattern = {{0, 0, 0, 5, 3},
                    {2, 4, 5, 4, 2},
                    {0, 2, 3, 2, 0}},
    },
    {
        .name = "sierra-2",
        .shift = 3,
        .divisor = 16,
        .pattern = {{0, 0, 0, 4, 3},
                    {1, 2, 3, 2, 1},
                    {0, 0, 0, 0, 0}},
    },

    // Terminator.
    {0}
};

View File

@ -0,0 +1,48 @@
/*
* This file is part of mpv.
*
* mpv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* mpv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef MP_GL_ERROR_DIFFUSION
#define MP_GL_ERROR_DIFFUSION
#include "shader_cache.h"
// Bounds of the neighborhood that an error diffusion kernel may touch,
// relative to the pixel being processed:
//   EF_MIN_DELTA_X <= dx <= EF_MAX_DELTA_X  and  0 <= dy <= EF_MAX_DELTA_Y
#define EF_MIN_DELTA_X (-2)
#define EF_MAX_DELTA_X (2)
#define EF_MAX_DELTA_Y (2)

// Description of one error diffusion kernel.
struct error_diffusion_kernel {
    // Name by which the kernel is looked up.
    const char *name;

    // The minimum value such that a (y, x) -> (y, x + y * shift) mapping
    // makes every error pushing operation affect only the next column and
    // the ones after it.
    int shift;

    // Integer weights. The fraction of the error diffused to offset (y, x)
    // is pattern[y][x - EF_MIN_DELTA_X] / divisor.
    int pattern[EF_MAX_DELTA_Y + 1][EF_MAX_DELTA_X - EF_MIN_DELTA_X + 1];
    int divisor;
};
// All available kernels. The array is terminated by an entry whose name is
// NULL.
extern const struct error_diffusion_kernel mp_error_diffusion_kernels[];
// Returns the kernel with exactly the given name, or NULL if name is NULL or
// no such kernel exists.
const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name);
// Returns the size in bytes of the shared memory that the shader generated by
// pass_error_diffusion() declares for kernel k and an image of the given
// height.
int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, int height);
// Appends the error diffusion compute shader code to sc. tex is the index of
// the bound input texture, width/height the image size, depth the target bit
// depth and block_size the work group size, which must not exceed height.
void pass_error_diffusion(struct gl_shader_cache *sc,
const struct error_diffusion_kernel *k,
int tex, int width, int height, int depth, int block_size);
#endif /* MP_GL_ERROR_DIFFUSION */

View File

@ -38,6 +38,7 @@
#include "stream/stream.h"
#include "video_shaders.h"
#include "user_shaders.h"
#include "error_diffusion.h"
#include "video/out/filter_kernels.h"
#include "video/out/aspect.h"
#include "video/out/dither.h"
@ -211,6 +212,7 @@ struct gl_video {
struct ra_tex *integer_tex[4];
struct ra_tex *indirect_tex;
struct ra_tex *blend_subs_tex;
struct ra_tex *error_diffusion_tex;
struct ra_tex *screen_tex;
struct ra_tex *output_tex;
struct ra_tex *vdpau_deinterleave_tex[2];
@ -295,6 +297,7 @@ static const struct gl_video_opts gl_video_opts_def = {
.dither_depth = -1,
.dither_size = 6,
.temporal_dither_period = 1,
.error_diffusion = "sierra-lite",
.fbo_format = "auto",
.sigmoid_center = 0.75,
.sigmoid_slope = 6.5,
@ -334,6 +337,9 @@ static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt,
static int validate_window_opt(struct mp_log *log, const m_option_t *opt,
struct bstr name, struct bstr param);
static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt,
struct bstr name, struct bstr param);
#define OPT_BASE_STRUCT struct gl_video_opts
#define SCALER_OPTS(n, i) \
@ -402,10 +408,13 @@ const struct m_sub_options gl_video_conf = {
OPT_CHOICE("dither", dither_algo, 0,
({"fruit", DITHER_FRUIT},
{"ordered", DITHER_ORDERED},
{"error-diffusion", DITHER_ERROR_DIFFUSION},
{"no", DITHER_NONE})),
OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8),
OPT_FLAG("temporal-dither", temporal_dither, 0),
OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128),
OPT_STRING_VALIDATE("error-diffusion", error_diffusion, 0,
validate_error_diffusion_opt),
OPT_CHOICE("alpha", alpha_mode, 0,
({"no", ALPHA_NO},
{"yes", ALPHA_YES},
@ -544,6 +553,7 @@ static void uninit_rendering(struct gl_video *p)
ra_tex_free(p->ra, &p->indirect_tex);
ra_tex_free(p->ra, &p->blend_subs_tex);
ra_tex_free(p->ra, &p->error_diffusion_tex);
ra_tex_free(p->ra, &p->screen_tex);
ra_tex_free(p->ra, &p->output_tex);
@ -2595,6 +2605,51 @@ static void pass_dither(struct gl_video *p)
if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE)
return;
if (p->opts.dither_algo == DITHER_ERROR_DIFFUSION) {
const struct error_diffusion_kernel *kernel =
mp_find_error_diffusion_kernel(p->opts.error_diffusion);
int o_w = p->dst_rect.x1 - p->dst_rect.x0,
o_h = p->dst_rect.y1 - p->dst_rect.y0;
int shmem_req = mp_ef_compute_shared_memory_size(kernel, o_h);
if (shmem_req > p->ra->max_shmem) {
MP_WARN(p, "Fallback to dither=fruit because there is no enough "
"shared memory (%d/%d).\n",
shmem_req, (int)p->ra->max_shmem);
p->opts.dither_algo = DITHER_FRUIT;
} else {
finish_pass_tex(p, &p->screen_tex, o_w, o_h);
struct image img = image_wrap(p->screen_tex, PLANE_RGB, p->components);
// 1024 is minimal required number of invocation allowed in single
// work group in OpenGL. Use it for maximal performance.
int block_size = MPMIN(1024, o_h);
pass_describe(p, "dither=error-diffusion (kernel=%s, depth=%d)",
kernel->name, dst_depth);
p->pass_compute = (struct compute_info) {
.active = true,
.threads_w = block_size,
.threads_h = 1,
.directly_writes = true
};
int tex_id = pass_bind(p, img);
pass_error_diffusion(p->sc, kernel, tex_id, o_w, o_h,
dst_depth, block_size);
finish_pass_tex(p, &p->error_diffusion_tex, o_w, o_h);
img = image_wrap(p->error_diffusion_tex, PLANE_RGB, p->components);
copy_image(p, &(int){0}, img);
return;
}
}
if (!p->dither_texture) {
MP_VERBOSE(p, "Dither to %d.\n", dst_depth);
@ -3632,6 +3687,12 @@ static void check_gl_features(struct gl_video *p)
"available! See your FBO format configuration!\n");
}
if (!have_compute && p->opts.dither_algo == DITHER_ERROR_DIFFUSION) {
MP_WARN(p, "Disabling error diffusion dithering because compute shader "
"was not supported. Fallback to dither=fruit instead.\n");
p->opts.dither_algo = DITHER_FRUIT;
}
bool have_compute_peak = have_compute && have_ssbo;
if (!have_compute_peak && p->opts.tone_map.compute_peak >= 0) {
int msgl = p->opts.tone_map.compute_peak == 1 ? MSGL_WARN : MSGL_V;
@ -3663,6 +3724,7 @@ static void check_gl_features(struct gl_video *p)
.dither_algo = p->opts.dither_algo,
.dither_depth = p->opts.dither_depth,
.dither_size = p->opts.dither_size,
.error_diffusion = p->opts.error_diffusion,
.temporal_dither = p->opts.temporal_dither,
.temporal_dither_period = p->opts.temporal_dither_period,
.tex_pad_x = p->opts.tex_pad_x,
@ -4011,6 +4073,29 @@ static int validate_window_opt(struct mp_log *log, const m_option_t *opt,
return r;
}
static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt,
struct bstr name, struct bstr param)
{
char s[20] = {0};
int r = 1;
if (bstr_equals0(param, "help")) {
r = M_OPT_EXIT;
} else {
snprintf(s, sizeof(s), "%.*s", BSTR_P(param));
const struct error_diffusion_kernel *k = mp_find_error_diffusion_kernel(s);
if (!k)
r = M_OPT_INVALID;
}
if (r < 1) {
mp_info(log, "Available error diffusion kernels:\n");
for (int n = 0; mp_error_diffusion_kernels[n].name; n++)
mp_info(log, " %s\n", mp_error_diffusion_kernels[n].name);
if (s[0])
mp_fatal(log, "No error diffusion kernel named '%s' found!\n", s);
}
return r;
}
float gl_video_scale_ambient_lux(float lmin, float lmax,
float rmin, float rmax, float lux)
{

View File

@ -71,6 +71,7 @@ enum dither_algo {
DITHER_NONE = 0,
DITHER_FRUIT,
DITHER_ORDERED,
DITHER_ERROR_DIFFUSION,
};
enum alpha_mode {
@ -131,6 +132,7 @@ struct gl_video_opts {
int dither_size;
int temporal_dither;
int temporal_dither_period;
char *error_diffusion;
char *fbo_format;
int alpha_mode;
int use_rectangle;

View File

@ -433,6 +433,7 @@ def build(ctx):
( "video/out/filter_kernels.c" ),
( "video/out/gpu/context.c" ),
( "video/out/gpu/d3d11_helpers.c", "d3d11 || egl-angle-win32" ),
( "video/out/gpu/error_diffusion.c" ),
( "video/out/gpu/hwdec.c" ),
( "video/out/gpu/lcms.c" ),
( "video/out/gpu/libmpv_gpu.c" ),