Add unicode_strtitle() for Unicode Default Case Conversion.

This brings the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, alongside the existing unicode_strlower() and unicode_strupper(). The new function accepts an arbitrary word boundary callback. It is simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping. Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com Reviewed-by: Peter Eisentraut
2024-03-29 17:35:07 -07:00 · 2024-03-29 17:35:07 -07:00 · 46e5441fa5
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 	return result;
 }

+struct WordBoundaryState		/* iteration state for initcap_wbnext() */
+{
+	const char *str;			/* string being scanned (decoded as UTF-8) */
+	size_t		len;			/* length of str, in bytes */
+	size_t		offset;			/* byte offset of next character to examine */
+	bool		init;			/* has the first boundary been returned yet? */
+	bool		prev_alnum;		/* pg_u_isalnum() result at the last boundary */
+};
+
+/*
+ * Simple word boundary iterator: returns the byte offset of the first
+ * character, then of each character where pg_u_isalnum() changes, then len.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;

+	while (wbstate->offset < wbstate->len &&
+		   wbstate->str[wbstate->offset] != '\0')	/* stop at len or a NUL byte */
+	{
+		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+										wbstate->offset);	/* code point at offset */
+		bool		curr_alnum = pg_u_isalnum(u, true);

+		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)	/* first call, or class changed */
+		{
+			size_t		prev_offset = wbstate->offset;	/* boundary = start of this character */

+			wbstate->init = true;
+			wbstate->offset += unicode_utf8len(u);	/* resume after this character next call */
+			wbstate->prev_alnum = curr_alnum;
+			return prev_offset;
+		}

+		wbstate->offset += unicode_utf8len(u);	/* same class as before: keep scanning */
+	}

+	return wbstate->len;		/* no further boundary: report end of string */
+}
+
 /*
 * collation-aware, wide-character-aware initcap function
 *
@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 #endif
 		if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
 		{
-			const unsigned char *src = (unsigned char *) buff;
+			const char *src = buff;
 			size_t		srclen = nbytes;
-			unsigned char *dst;
 			size_t		dstsize;
-			int			srcoff = 0;
-			int			dstoff = 0;
+			char	   *dst;
+			size_t		needed;
+			struct WordBoundaryState wbstate = {
+				.str = src,
+				.len = srclen,
+				.offset = 0,
+				.init = false,
+				.prev_alnum = false,
+			};

 			Assert(GetDatabaseEncoding() == PG_UTF8);

-			/* overflow paranoia */
-			if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
-				ereport(ERROR,
-						(errcode(ERRCODE_OUT_OF_MEMORY),
-						 errmsg("out of memory")));
+			/* first try buffer of equal size plus terminating NUL */
+			dstsize = srclen + 1;
+			dst = palloc(dstsize);

-			/* result is at most srclen codepoints plus terminating NUL */
-			dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
-			dst = (unsigned char *) palloc(dstsize);
-
-			while (srcoff < nbytes)
+			needed = unicode_strtitle(dst, dstsize, src, srclen,
+									  initcap_wbnext, &wbstate);
+			if (needed + 1 > dstsize)
 			{
-				pg_wchar	u1 = utf8_to_unicode(src + srcoff);
-				pg_wchar	u2;
-				int			u1len = unicode_utf8len(u1);
-				int			u2len;
+				/* reset iterator */
+				wbstate.offset = 0;
+				wbstate.init = false;

-				if (wasalnum)
-					u2 = unicode_lowercase_simple(u1);
-				else
-					u2 = unicode_uppercase_simple(u1);
-
-				u2len = unicode_utf8len(u2);
-
-				Assert(dstoff + u2len + 1 <= dstsize);
-
-				wasalnum = pg_u_isalnum(u2, true);
-
-				unicode_to_utf8(u2, dst + dstoff);
-				srcoff += u1len;
-				dstoff += u2len;
+				/* grow buffer if needed and retry */
+				dstsize = needed + 1;
+				dst = repalloc(dst, dstsize);
+				needed = unicode_strtitle(dst, dstsize, src, srclen,
+										  initcap_wbnext, &wbstate);
+				Assert(needed + 1 == dstsize);
 			}

-			Assert(dstoff + 1 <= dstsize);
-			*(dst + dstoff) = '\0';
-			dstoff++;
-
-			/* allocate result buffer of the right size and free workspace */
-			result = palloc(dstoff);
-			memcpy(result, dst, dstoff);
-			pfree(dst);
+			result = dst;
 		}
 		else
 		{
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@ -21,8 +21,9 @@
 #include "mb/pg_wchar.h"

 static const pg_case_map *find_case_map(pg_wchar ucs);
-static size_t convert_case(char *dst, size_t dstsize, const char *src,
-						   ssize_t srclen, CaseKind casekind);
+static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+						   CaseKind str_casekind, WordBoundaryNext wbnext,
+						   void *wbstate);

 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);	/* no boundary callback: only CaseTitle uses one */
+}
+
+/*
+ * unicode_strtitle()
+ *
+ * Convert src to titlecase (characters at word boundaries are uppercased,
+ * others lowercased); return the result length, excluding terminating NUL.
+ *
+ * String src must be encoded in UTF-8. If srclen < 0, src must be
+ * NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * Titlecasing requires knowledge about word boundaries, which is provided by
+ * the callback wbnext. A word boundary is the offset of the start of a word
+ * or the offset of the character immediately following a word.
+ *
+ * The caller is expected to initialize and free the callback state
+ * wbstate. The callback should first return offset 0 for the first boundary;
+ * then the offset of each subsequent word boundary; then the total length of
+ * the string to indicate the final boundary.
+ */
+size_t
+unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 WordBoundaryNext wbnext, void *wbstate)
+{
+	return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+						wbstate);
 }

 /*
@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);	/* no boundary callback: only CaseTitle uses one */
 }

 /*
- * Implement Unicode Default Case Conversion algorithm.
+ * If str_casekind is CaseLower or CaseUpper, map each character in the string
+ * for which a mapping is available.
 *
- * Map each character in the string for which a mapping is available.
+ * If str_casekind is CaseTitle, maps characters found on a word boundary to
+ * uppercase and other characters to lowercase.
 */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind casekind)
+			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 {
+	/* character CaseKind varies while titlecasing */
+	CaseKind	chr_casekind = str_casekind;
 	size_t		srcoff = 0;
 	size_t		result_len = 0;
+	size_t		boundary = 0;
+
+	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
+		   (str_casekind != CaseTitle && !wbnext && !wbstate));
+
+	if (str_casekind == CaseTitle)
+	{
+		boundary = wbnext(wbstate);
+		Assert(boundary == 0);	/* start of text is always a boundary */
+	}

 	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 	{
@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		int			u1len = unicode_utf8len(u1);
 		const		pg_case_map *casemap = find_case_map(u1);

+		if (str_casekind == CaseTitle)
+		{
+			if (srcoff == boundary)
+			{
+				chr_casekind = CaseUpper;
+				boundary = wbnext(wbstate);
+			}
+			else
+				chr_casekind = CaseLower;
+		}
+
+		/* perform mapping, update result_len, and write to dst */
 		if (casemap)
 		{
-			pg_wchar	u2 = casemap->simplemap[casekind];
+			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);

 			if (result_len + u2len <= dstsize)
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@ -16,11 +16,16 @@

 #include "mb/pg_wchar.h"

+typedef size_t (*WordBoundaryNext) (void *wbstate);	/* returns offset of the next word boundary; caller owns wbstate */
+
 pg_wchar	unicode_lowercase_simple(pg_wchar ucs);
 pg_wchar	unicode_titlecase_simple(pg_wchar ucs);
 pg_wchar	unicode_uppercase_simple(pg_wchar ucs);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen);
+size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
+							 ssize_t srclen, WordBoundaryNext wbnext,
+							 void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen);