postgresql/src/fe_utils/mbprint.c

406 lines
8.6 KiB
C

/*-------------------------------------------------------------------------
*
* Multibyte character printing support for frontend code
*
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/fe_utils/mbprint.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include "fe_utils/mbprint.h"
#include "libpq-fe.h"
/*
* To avoid version-skew problems, this file must not use declarations
* from pg_wchar.h: the encoding IDs we are dealing with are determined
* by the libpq.so we are linked with, and that might not match the
* numbers we see at compile time. (If this file were inside libpq,
* the problem would go away...)
*
* Hence, we have our own definition of pg_wchar, and we get the values
* of any needed encoding IDs on-the-fly.
*/
typedef unsigned int pg_wchar;
static int
pg_get_utf8_id(void)
{
static int utf8_id = -1;
if (utf8_id < 0)
utf8_id = pg_char_to_encoding("utf8");
return utf8_id;
}
#define PG_UTF8 pg_get_utf8_id()
/*
* Convert a UTF-8 character to a Unicode code point.
* This is a one-character version of pg_utf2wchar_with_len.
*
* No error checks here, c must point to a long-enough string.
*/
static pg_wchar
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
return (pg_wchar) c[0];
else if ((*c & 0xe0) == 0xc0)
return (pg_wchar) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
return (pg_wchar) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
return (pg_wchar) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
else
/* that is an invalid code on purpose */
return 0xffffffff;
}
/*
* Unicode 3.1 compliant validation : for each category, it checks the
* combination of each byte to make sure it maps to a valid range. It also
* returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe =
* 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates)
*/
static int
utf_charcheck(const unsigned char *c)
{
if ((*c & 0x80) == 0)
return 1;
else if ((*c & 0xe0) == 0xc0)
{
/* two-byte char */
if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
return 2;
return -1;
}
else if ((*c & 0xf0) == 0xe0)
{
/* three-byte char */
if (((c[1] & 0xc0) == 0x80) &&
(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
((c[2] & 0xc0) == 0x80))
{
int z = c[0] & 0x0f;
int yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f);
int lx = yx & 0x7f;
/* check 0xfffe/0xffff, 0xfdd0..0xfedf range, surrogates */
if (((z == 0x0f) &&
(((yx & 0xffe) == 0xffe) ||
(((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) ||
((z == 0x0d) && ((yx & 0xb00) == 0x800)))
return -1;
return 3;
}
return -1;
}
else if ((*c & 0xf8) == 0xf0)
{
int u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
/* four-byte char */
if (((c[1] & 0xc0) == 0x80) &&
(u > 0x00) && (u <= 0x10) &&
((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
{
/* test for 0xzzzzfffe/0xzzzzfffff */
if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
((c[3] & 0x3e) == 0x3e))
return -1;
return 4;
}
return -1;
}
return -1;
}
static void
mb_utf_validate(unsigned char *pwcs)
{
unsigned char *p = pwcs;
while (*pwcs)
{
int len;
if ((len = utf_charcheck(pwcs)) > 0)
{
if (p != pwcs)
{
int i;
for (i = 0; i < len; i++)
*p++ = *pwcs++;
}
else
{
pwcs += len;
p += len;
}
}
else
/* we skip the char */
pwcs++;
}
if (p != pwcs)
*p = '\0';
}
/*
* public functions : wcswidth and mbvalidate
*/
/*
* pg_wcswidth is the dumb display-width function.
* It assumes that everything will appear on one line.
* OTOH it is easier to use than pg_wcssize if this applies to you.
*/
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
int width = 0;
while (len > 0)
{
int chlen,
chwidth;
chlen = PQmblen(pwcs, encoding);
if (len < (size_t) chlen)
break; /* Invalid string */
chwidth = PQdsplen(pwcs, encoding);
if (chwidth > 0)
width += chwidth;
pwcs += chlen;
len -= chlen;
}
return width;
}
/*
* pg_wcssize takes the given string in the given encoding and returns three
* values:
* result_width: Width in display characters of the longest line in string
* result_height: Number of lines in display output
* result_format_size: Number of bytes required to store formatted
* representation of string
*
* This MUST be kept in sync with pg_wcsformat!
*/
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
int *result_width, int *result_height, int *result_format_size)
{
int w,
chlen = 0,
linewidth = 0;
int width = 0;
int height = 1;
int format_size = 0;
for (; *pwcs && len > 0; pwcs += chlen)
{
chlen = PQmblen((const char *) pwcs, encoding);
if (len < (size_t) chlen)
break;
w = PQdsplen((const char *) pwcs, encoding);
if (chlen == 1) /* single-byte char */
{
if (*pwcs == '\n') /* Newline */
{
if (linewidth > width)
width = linewidth;
linewidth = 0;
height += 1;
format_size += 1; /* For NUL char */
}
else if (*pwcs == '\r') /* Linefeed */
{
linewidth += 2;
format_size += 2;
}
else if (*pwcs == '\t') /* Tab */
{
do
{
linewidth++;
format_size++;
} while (linewidth % 8 != 0);
}
else if (w < 0) /* Other control char */
{
linewidth += 4;
format_size += 4;
}
else /* Output it as-is */
{
linewidth += w;
format_size += 1;
}
}
else if (w < 0) /* Non-ascii control char */
{
linewidth += 6; /* \u0000 */
format_size += 6;
}
else /* All other chars */
{
linewidth += w;
format_size += chlen;
}
len -= chlen;
}
if (linewidth > width)
width = linewidth;
format_size += 1; /* For NUL char */
/* Set results */
if (result_width)
*result_width = width;
if (result_height)
*result_height = height;
if (result_format_size)
*result_format_size = format_size;
}
/*
* Format a string into one or more "struct lineptr" lines.
* lines[i].ptr == NULL indicates the end of the array.
*
* This MUST be kept in sync with pg_wcssize!
*/
void
pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
struct lineptr *lines, int count)
{
int w,
chlen = 0;
int linewidth = 0;
unsigned char *ptr = lines->ptr; /* Pointer to data area */
for (; *pwcs && len > 0; pwcs += chlen)
{
chlen = PQmblen((const char *) pwcs, encoding);
if (len < (size_t) chlen)
break;
w = PQdsplen((const char *) pwcs, encoding);
if (chlen == 1) /* single-byte char */
{
if (*pwcs == '\n') /* Newline */
{
*ptr++ = '\0';
lines->width = linewidth;
linewidth = 0;
lines++;
count--;
if (count <= 0)
exit(1); /* Screwup */
/* make next line point to remaining memory */
lines->ptr = ptr;
}
else if (*pwcs == '\r') /* Linefeed */
{
strcpy((char *) ptr, "\\r");
linewidth += 2;
ptr += 2;
}
else if (*pwcs == '\t') /* Tab */
{
do
{
*ptr++ = ' ';
linewidth++;
} while (linewidth % 8 != 0);
}
else if (w < 0) /* Other control char */
{
sprintf((char *) ptr, "\\x%02X", *pwcs);
linewidth += 4;
ptr += 4;
}
else /* Output it as-is */
{
linewidth += w;
*ptr++ = *pwcs;
}
}
else if (w < 0) /* Non-ascii control char */
{
if (encoding == PG_UTF8)
sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
else
{
/*
* This case cannot happen in the current code because only
* UTF-8 signals multibyte control characters. But we may need
* to support it at some stage
*/
sprintf((char *) ptr, "\\u????");
}
ptr += 6;
linewidth += 6;
}
else /* All other chars */
{
int i;
for (i = 0; i < chlen; i++)
*ptr++ = pwcs[i];
linewidth += w;
}
len -= chlen;
}
lines->width = linewidth;
*ptr++ = '\0'; /* Terminate formatted string */
if (count <= 0)
exit(1); /* Screwup */
(lines + 1)->ptr = NULL; /* terminate line array */
}
/*
* Encoding validation: delete any unvalidatable characters from the string
*
* This seems redundant with existing functionality elsewhere?
*/
unsigned char *
mbvalidate(unsigned char *pwcs, int encoding)
{
if (encoding == PG_UTF8)
mb_utf_validate(pwcs);
else
{
/*
* other encodings needing validation should add their own routines
* here
*/
}
return pwcs;
}