Add unicode_strtitle() for Unicode Default Case Conversion.

This brings the titlecasing implementation for the builtin provider
out of formatting.c and into unicode_case.c, along with
unicode_strlower() and unicode_strupper(). Accepts an arbitrary word
boundary callback.

Simple for now, but can be extended to support the Unicode Default
Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis 2024-03-29 17:35:07 -07:00
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions

View File

@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
return result;
}
/*
 * Iteration state for initcap_wbnext(), the word boundary callback handed to
 * unicode_strtitle() by str_initcap().
 *
 * The caller fills this in before the first callback invocation; to rescan
 * the same string, reset offset to 0 and init to false.
 */
struct WordBoundaryState
{
/* string being scanned; decoded as UTF-8 by the iterator */
const char *str;
/* upper bound on the scan; the iterator never starts a character at or past it */
size_t len;
/* position in str of the next character to examine */
size_t offset;
/* false until the first boundary has been reported */
bool init;
/* pg_u_isalnum() result recorded when the most recent boundary was reported */
bool prev_alnum;
};
/*
 * Word boundary callback used for initcap.
 *
 * Walks the string one UTF-8 character at a time, starting at the saved
 * offset, and reports a boundary at the first character examined as well as
 * at every character whose pg_u_isalnum() classification differs from the
 * one recorded at the previously reported boundary.
 *
 * Returns the offset of the character that starts the next run.  Once the
 * scan reaches the end of the string (offset >= len, or a NUL byte), returns
 * the string length to signal the final boundary.
 *
 * "state" must point to a struct WordBoundaryState; it is advanced past the
 * character at which the boundary was found.
 */
static size_t
initcap_wbnext(void *state)
{
	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;

	for (;;)
	{
		size_t		start = wb->offset;
		pg_wchar	cp;
		bool		is_alnum;
		bool		at_boundary;

		/* ran off the end of the input: no more boundaries to report */
		if (start >= wb->len || wb->str[start] == '\0')
			break;

		cp = utf8_to_unicode((unsigned char *) wb->str + start);
		is_alnum = pg_u_isalnum(cp, true);

		/* the very first character always opens a run */
		at_boundary = !wb->init || is_alnum != wb->prev_alnum;

		/* step over this character whether or not it is a boundary */
		wb->offset = start + unicode_utf8len(cp);

		if (at_boundary)
		{
			wb->init = true;
			wb->prev_alnum = is_alnum;
			return start;
		}
	}

	return wb->len;
}
/*
* collation-aware, wide-character-aware initcap function
*
@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
#endif
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
{
const unsigned char *src = (unsigned char *) buff;
const char *src = buff;
size_t srclen = nbytes;
unsigned char *dst;
size_t dstsize;
int srcoff = 0;
int dstoff = 0;
char *dst;
size_t needed;
struct WordBoundaryState wbstate = {
.str = src,
.len = srclen,
.offset = 0,
.init = false,
.prev_alnum = false,
};
Assert(GetDatabaseEncoding() == PG_UTF8);
/* overflow paranoia */
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
/* first try buffer of equal size plus terminating NUL */
dstsize = srclen + 1;
dst = palloc(dstsize);
/* result is at most srclen codepoints plus terminating NUL */
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
dst = (unsigned char *) palloc(dstsize);
while (srcoff < nbytes)
needed = unicode_strtitle(dst, dstsize, src, srclen,
initcap_wbnext, &wbstate);
if (needed + 1 > dstsize)
{
pg_wchar u1 = utf8_to_unicode(src + srcoff);
pg_wchar u2;
int u1len = unicode_utf8len(u1);
int u2len;
/* reset iterator */
wbstate.offset = 0;
wbstate.init = false;
if (wasalnum)
u2 = unicode_lowercase_simple(u1);
else
u2 = unicode_uppercase_simple(u1);
u2len = unicode_utf8len(u2);
Assert(dstoff + u2len + 1 <= dstsize);
wasalnum = pg_u_isalnum(u2, true);
unicode_to_utf8(u2, dst + dstoff);
srcoff += u1len;
dstoff += u2len;
/* grow buffer if needed and retry */
dstsize = needed + 1;
dst = repalloc(dst, dstsize);
needed = unicode_strtitle(dst, dstsize, src, srclen,
initcap_wbnext, &wbstate);
Assert(needed + 1 == dstsize);
}
Assert(dstoff + 1 <= dstsize);
*(dst + dstoff) = '\0';
dstoff++;
/* allocate result buffer of the right size and free workspace */
result = palloc(dstoff);
memcpy(result, dst, dstoff);
pfree(dst);
result = dst;
}
else
{

View File

@ -21,8 +21,9 @@
#include "mb/pg_wchar.h"
static const pg_case_map *find_case_map(pg_wchar ucs);
static size_t convert_case(char *dst, size_t dstsize, const char *src,
ssize_t srclen, CaseKind casekind);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, WordBoundaryNext wbnext,
void *wbstate);
pg_wchar
unicode_lowercase_simple(pg_wchar code)
@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{
return convert_case(dst, dstsize, src, srclen, CaseLower);
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
}
/*
 * unicode_strtitle()
 *
 * Titlecase the string src and return the length of the converted result,
 * not counting any terminating NUL.
 *
 * src must be UTF-8 encoded.  srclen is its length; if srclen < 0, src must
 * be NUL-terminated instead.
 *
 * The result is written to dst and truncated if it is larger than dstsize.
 * dst is NUL-terminated only when dstsize is greater than the result length.
 *
 * dst may be NULL when dstsize is zero, which lets a caller compute the
 * required buffer size before allocating.
 *
 * Titlecasing depends on word boundaries, which are supplied by the wbnext
 * callback.  A word boundary is either the offset at which a word starts or
 * the offset of the character immediately following a word.
 *
 * The caller owns the callback state wbstate: it must initialize it before
 * the call and free it afterwards.  The callback is expected to return
 * offset 0 as the first boundary, then the offset of each later boundary in
 * turn, and finally the total length of the string as the last boundary.
 */
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 WordBoundaryNext wbnext, void *wbstate)
{
	size_t		result_len;

	/* all case conversions share convert_case(); CaseTitle selects titlecasing */
	result_len = convert_case(dst, dstsize, src, srclen, CaseTitle,
							  wbnext, wbstate);

	return result_len;
}
/*
@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{
return convert_case(dst, dstsize, src, srclen, CaseUpper);
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
}
/*
* Implement Unicode Default Case Conversion algorithm.
* If str_casekind is CaseLower or CaseUpper, map each character in the string
* for which a mapping is available.
*
* Map each character in the string for which a mapping is available.
* If str_casekind is CaseTitle, maps characters found on a word boundary to
* uppercase and other characters to lowercase.
*/
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind casekind)
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
{
/* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind;
size_t srcoff = 0;
size_t result_len = 0;
size_t boundary = 0;
Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
(str_casekind != CaseTitle && !wbnext && !wbstate));
if (str_casekind == CaseTitle)
{
boundary = wbnext(wbstate);
Assert(boundary == 0); /* start of text is always a boundary */
}
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{
@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
int u1len = unicode_utf8len(u1);
const pg_case_map *casemap = find_case_map(u1);
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
{
chr_casekind = CaseUpper;
boundary = wbnext(wbstate);
}
else
chr_casekind = CaseLower;
}
/* perform mapping, update result_len, and write to dst */
if (casemap)
{
pg_wchar u2 = casemap->simplemap[casekind];
pg_wchar u2 = casemap->simplemap[chr_casekind];
pg_wchar u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize)

View File

@ -16,11 +16,16 @@
#include "mb/pg_wchar.h"
typedef size_t (*WordBoundaryNext) (void *wbstate);
pg_wchar unicode_lowercase_simple(pg_wchar ucs);
pg_wchar unicode_titlecase_simple(pg_wchar ucs);
pg_wchar unicode_uppercase_simple(pg_wchar ucs);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
ssize_t srclen, WordBoundaryNext wbnext,
void *wbstate);
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
ssize_t srclen);