postgresql/src/common/unicode/generate-unicode_east_asian...

77 lines
1.7 KiB
Perl

#!/usr/bin/perl
#
# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
# and East Asian Fullwidth (F) characters, using Unicode data files as input.
# Pass EastAsianWidth.txt as argument. The output is on stdout.
#
# Copyright (c) 2019-2024, PostgreSQL Global Development Group
use strict;
use warnings FATAL => 'all';
# First code point of the run of wide characters currently being accumulated,
# or undef when no run is open.
my $range_start = undef;

# First and last code point (as integers) covered by the current input line.
my ($first, $last);

# Last code point of the most recently parsed entry.  When the current line
# does not extend the open run, this is where that run ends.
my $prev_last;

print
  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";

print "static const struct mbinterval east_asian_fw[] = {\n";

# Read the input one line at a time; iterating over <ARGV> with foreach would
# evaluate it in list context and pull the whole file into memory first.
while (my $line = <ARGV>)
{
	# Drop any trailing comment, then trim whitespace on both ends.  Trimming
	# (instead of a bare chomp) also removes the "\r" left behind by CRLF line
	# endings; otherwise it would stick to the width field, every comparison
	# against 'F'/'W' below would fail, and the table would silently come out
	# empty.
	$line =~ s/#.*//s;
	$line =~ s/^\s+|\s+$//g;
	next if $line eq '';

	# Each data line is "<codepoint>;<width>" or "<first>..<last>;<width>",
	# optionally with whitespace around the semicolon.
	my ($codepoint, $width) = split /\s*;\s*/, $line;

	# Fail with a clear location instead of letting hex() quietly turn an
	# empty field into U+0000 or dying on an unrelated-looking warning.
	die "$ARGV:$.: unrecognized line: $line\n"
	  unless defined($width)
	  && $codepoint =~ /^[0-9A-Fa-f]+(?:\.\.[0-9A-Fa-f]+)?$/;

	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	($first, $last) = map(hex, ($first, $last));

	if ($width eq 'F' || $width eq 'W')
	{
		# fullwidth/wide characters
		if (!defined($range_start))
		{
			# save for start of range if one hasn't been started yet
			$range_start = $first;
		}
		elsif ($first != $prev_last + 1)
		{
			# ranges aren't contiguous; emit the last and start a new one
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = $first;
		}
	}
	else
	{
		# not wide characters, print out previous range if any
		if (defined($range_start))
		{
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = undef;
		}
	}
}
continue
{
	# Runs on every iteration, including ones that bail out early via "next";
	# in that case $last still holds the value from the previous data line,
	# so $prev_last is unchanged.
	$prev_last = $last;
}

# don't forget any ranges at the very end of the database (though there are none
# as of Unicode 13.0)
if (defined($range_start))
{
	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
}

print "};\n";