postgresql/src/common/unicode/generate-unicode_case_table.pl

#!/usr/bin/perl
#
# Generate Unicode character case mappings. Does not include tailoring
# or locale-specific mappings.
#
# Input: UnicodeData.txt
# Output: unicode_case_table.h
#
# Copyright (c) 2000-2023, PostgreSQL Global Development Group

use strict;
use warnings;
use Getopt::Long;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";

my $output_path = '.';

GetOptions('outdir:s' => \$output_path);

my $output_table_file = "$output_path/unicode_case_table.h";

my $FH;

my %simple = ();

open($FH, '<', "$output_path/UnicodeData.txt")
  or die "Could not open $output_path/UnicodeData.txt: $!.";
while (my $line = <$FH>)
{
	my @elts = split(';', $line);
	my $code = hex($elts[0]);
	my $simple_uppercase = hex($elts[12] =~ s/^\s+|\s+$//rg);
	my $simple_lowercase = hex($elts[13] =~ s/^\s+|\s+$//rg);
	my $simple_titlecase = hex($elts[14] =~ s/^\s+|\s+$//rg);

	die "codepoint $code out of range" if $code > 0x10FFFF;
	die "Simple_Lowercase $code out of range" if $simple_lowercase > 0x10FFFF;
	die "Simple_Titlecase $code out of range" if $simple_titlecase > 0x10FFFF;
	die "Simple_Uppercase $code out of range" if $simple_uppercase > 0x10FFFF;

	if ($simple_lowercase || $simple_titlecase || $simple_uppercase)
	{
		$simple{$code} = {
			Simple_Lowercase => ($simple_lowercase || $code),
			Simple_Titlecase => ($simple_titlecase || $code),
			Simple_Uppercase => ($simple_uppercase || $code)
		};
	}
}
close $FH;

# Start writing out the output files
open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";

# determine size of array given that codepoints <= 0x80 are dense and
# the rest of the entries are sparse
my $num_simple = 0x80;
foreach my $code (sort { $a <=> $b } (keys %simple))
{
	$num_simple++ unless $code < 0x80;
}

print $OT <<"EOS";
/*-------------------------------------------------------------------------
 *
 * unicode_case_table.h
 *	  Case mapping and information table.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_case_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_case_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H
 * here.
 */

#include "common/unicode_case.h"
#include "mb/pg_wchar.h"

typedef enum
{
	CaseLower = 0,
	CaseTitle = 1,
	CaseUpper = 2,
	NCaseKind
}			CaseKind;

typedef struct
{
	pg_wchar	codepoint;		/* Unicode codepoint */
	pg_wchar	simplemap[NCaseKind];
}			pg_case_map;

/*
 * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
 * sparse for higher codepoints (requiring scan or binary search).
 */
static const pg_case_map case_map[$num_simple] =
{
EOS

printf $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
for (my $code = 0; $code < 0x80; $code++)
{
	my $lc = ($simple{$code}{Simple_Lowercase} || $code);
	my $tc = ($simple{$code}{Simple_Titlecase} || $code);
	my $uc = ($simple{$code}{Simple_Uppercase} || $code);
	printf $OT
	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
	  $code, $lc, $tc, $uc;
}
printf $OT "\n";

printf $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
foreach my $code (sort { $a <=> $b } (keys %simple))
{
	next unless $code >= 0x80;    # already output above

	my $map = $simple{$code};
	printf $OT
	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
	  $map->{Simple_Uppercase};
}
print $OT "};\n";