postgresql/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl

678 lines
13 KiB
Perl
Executable File

#! /usr/bin/perl
#
# Copyright (c) 2001-2020, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
#
# Generate UTF-8 <--> EUC_JP code conversion tables from
# map files provided by the Unicode organization.
# Unfortunately, the organization prohibits redistribution of
# the map files, so to use this script you have to obtain
# CP932.TXT and JIS0212.TXT from the organization's ftp site.
use strict;
use warnings;
use convutils;
my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl';

# Load JIS0212.TXT and seed @mapping with its entries.
#
# Each entry is a hash reference; this loop reads its {code} and {ucs}
# fields and may set {direction}.  Entries whose {ucs} is below 0x80 are
# not added to @mapping.
my $jis0212 = &read_source("JIS0212.TXT");

my @mapping;

foreach my $entry (@$jis0212)
{
    my $jis = $entry->{code};

    # We have a different mapping for this in the EUC_JP to UTF-8 direction.
    $entry->{direction} = FROM_UNICODE if ($jis == 0x2243);

    $entry->{direction} = TO_UNICODE if ($jis == 0x2271);

    # Entries with a UCS value below 0x80 are dropped.
    next unless ($entry->{ucs} >= 0x080);

    # OR in 0x8f8080 (presumably the 0x8f-prefixed three-byte EUC_JP form
    # of the JIS code; confirm against the EUC_JP encoding rules).
    $entry->{code} |= 0x8f8080;

    push @mapping, $entry;
}
# Load CP932.TXT.
#
# Entries with an SJIS code of 0xa1 or above are converted with sjis2jis()
# and appended to @mapping; everything else is left out.
my $ct932 = &read_source("CP932.TXT");

foreach my $entry (@$ct932)
{
    my $sjis = $entry->{code};

    # We have a different mapping for this in the EUC_JP to UTF-8 direction.
    next if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc);

    # Codes below 0xa1 are not taken from this file.
    next unless ($sjis >= 0xa1);

    my $jis = &sjis2jis($sjis);

    # Choose the bits to OR into the converted code:
    #   - results below 0x100 get 0x8e00,
    #   - SJIS codes from 0xeffd upward get 0x8f8080,
    #   - everything else gets 0x8080.
    my $euc_bits;
    if ($jis < 0x100)
    {
        $euc_bits = 0x8e00;
    }
    elsif ($sjis >= 0xeffd)
    {
        $euc_bits = 0x8f8080;
    }
    else
    {
        $euc_bits = 0x8080;
    }
    $entry->{code} = $jis | $euc_bits;

    # Remember the SJIS code for later.
    $entry->{sjis} = $sjis;

    push @mapping, $entry;
}
# Adjust the conversion direction of entries according to the SJIS code
# that was remembered in {sjis} when they were loaded from CP932.TXT.
foreach my $i (@mapping)
{
    my $sjis = $i->{sjis};

    # Only entries with a recorded SJIS code are subject to the rules
    # below.  Entries pushed by the JIS0212.TXT loop get no {sjis} key in
    # this file, so without this guard every comparison would be made
    # against undef: that matches none of the ranges (undef compares as 0)
    # but emits an "uninitialized value" warning each time under
    # "use warnings".
    next unless defined $sjis;

    # These SJIS characters are excluded completely.
    if (   ($sjis >= 0xed00 && $sjis <= 0xeef9)
        || ($sjis >= 0xfa54 && $sjis <= 0xfa56)
        || ($sjis >= 0xfa58 && $sjis <= 0xfc4b))
    {
        $i->{direction} = NONE;
        next;
    }

    # These SJIS characters are only in the UTF-8 to EUC_JP table
    # NOTE(review): the CP932.TXT loading loop skips these three codes
    # before pushing to @mapping, so this branch looks unreachable --
    # confirm before relying on or removing it.
    if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc)
    {
        $i->{direction} = FROM_UNICODE;
        next;
    }

    # These SJIS characters are only converted in the TO_UNICODE direction.
    if (   $sjis == 0x8790
        || $sjis == 0x8791
        || $sjis == 0x8792
        || $sjis == 0x8795
        || $sjis == 0x8796
        || $sjis == 0x8797
        || $sjis == 0x879a
        || $sjis == 0x879b
        || $sjis == 0x879c
        || ($sjis >= 0xfa4a && $sjis <= 0xfa53))
    {
        $i->{direction} = TO_UNICODE;
        next;
    }
}
# Hand-maintained mappings appended after the entries loaded from the map
# files.  Each row is [ direction, ucs, code, comment ] and is expanded
# into the same hash layout used by the other @mapping entries, in the
# order listed here.
foreach my $row (
    [ BOTH, 0x4efc, 0x8ff4af, '# CJK(4EFC)' ],
    [ BOTH, 0x50f4, 0x8ff4b0, '# CJK(50F4)' ],
    [ BOTH, 0x51EC, 0x8ff4b1, '# CJK(51EC)' ],
    [ BOTH, 0x5307, 0x8ff4b2, '# CJK(5307)' ],
    [ BOTH, 0x5324, 0x8ff4b3, '# CJK(5324)' ],
    [ BOTH, 0x548A, 0x8ff4b5, '# CJK(548A)' ],
    [ BOTH, 0x5759, 0x8ff4b6, '# CJK(5759)' ],
    [ BOTH, 0x589E, 0x8ff4b9, '# CJK(589E)' ],
    [ BOTH, 0x5BEC, 0x8ff4ba, '# CJK(5BEC)' ],
    [ BOTH, 0x5CF5, 0x8ff4bb, '# CJK(5CF5)' ],
    [ BOTH, 0x5D53, 0x8ff4bc, '# CJK(5D53)' ],
    [ BOTH, 0x5FB7, 0x8ff4be, '# CJK(5FB7)' ],
    [ BOTH, 0x6085, 0x8ff4bf, '# CJK(6085)' ],
    [ BOTH, 0x6120, 0x8ff4c0, '# CJK(6120)' ],
    [ BOTH, 0x654E, 0x8ff4c1, '# CJK(654E)' ],
    [ BOTH, 0x663B, 0x8ff4c2, '# CJK(663B)' ],
    [ BOTH, 0x6665, 0x8ff4c3, '# CJK(6665)' ],
    [ BOTH, 0x6801, 0x8ff4c6, '# CJK(6801)' ],
    [ BOTH, 0x6A6B, 0x8ff4c9, '# CJK(6A6B)' ],
    [ BOTH, 0x6AE2, 0x8ff4ca, '# CJK(6AE2)' ],
    [ BOTH, 0x6DF2, 0x8ff4cc, '# CJK(6DF2)' ],
    [ BOTH, 0x6DF8, 0x8ff4cb, '# CJK(6DF8)' ],
    [ BOTH, 0x7028, 0x8ff4cd, '# CJK(7028)' ],
    [ BOTH, 0x70BB, 0x8ff4ae, '# CJK(70BB)' ],
    [ BOTH, 0x7501, 0x8ff4d0, '# CJK(7501)' ],
    [ BOTH, 0x7682, 0x8ff4d1, '# CJK(7682)' ],
    [ BOTH, 0x769E, 0x8ff4d2, '# CJK(769E)' ],
    [ BOTH, 0x7930, 0x8ff4d4, '# CJK(7930)' ],
    [ BOTH, 0x7AE7, 0x8ff4d9, '# CJK(7AE7)' ],
    [ BOTH, 0x7DA0, 0x8ff4dc, '# CJK(7DA0)' ],
    [ BOTH, 0x7DD6, 0x8ff4dd, '# CJK(7DD6)' ],
    [ BOTH, 0x8362, 0x8ff4df, '# CJK(8362)' ],
    [ BOTH, 0x85B0, 0x8ff4e1, '# CJK(85B0)' ],
    [ BOTH, 0x8807, 0x8ff4e4, '# CJK(8807)' ],
    [ BOTH, 0x8B7F, 0x8ff4e6, '# CJK(8B7F)' ],
    [ BOTH, 0x8CF4, 0x8ff4e7, '# CJK(8CF4)' ],
    [ BOTH, 0x8D76, 0x8ff4e8, '# CJK(8D76)' ],
    [ BOTH, 0x90DE, 0x8ff4ec, '# CJK(90DE)' ],
    [ BOTH, 0x9115, 0x8ff4ee, '# CJK(9115)' ],
    [ BOTH, 0x9592, 0x8ff4f1, '# CJK(9592)' ],
    [ BOTH, 0x973B, 0x8ff4f4, '# CJK(973B)' ],
    [ BOTH, 0x974D, 0x8ff4f5, '# CJK(974D)' ],
    [ BOTH, 0x9751, 0x8ff4f6, '# CJK(9751)' ],
    [ BOTH, 0x999E, 0x8ff4fa, '# CJK(999E)' ],
    [ BOTH, 0x9AD9, 0x8ff4fb, '# CJK(9AD9)' ],
    [ BOTH, 0x9B72, 0x8ff4fc, '# CJK(9B72)' ],
    [ BOTH, 0x9ED1, 0x8ff4fe, '# CJK(9ED1)' ],
    [ BOTH, 0xF929, 0x8ff4c5, '# CJK COMPATIBILITY IDEOGRAPH-F929' ],
    [ BOTH, 0xF9DC, 0x8ff4f2, '# CJK COMPATIBILITY IDEOGRAPH-F9DC' ],
    [ BOTH, 0xFA0E, 0x8ff4b4, '# CJK COMPATIBILITY IDEOGRAPH-FA0E' ],
    [ BOTH, 0xFA0F, 0x8ff4b7, '# CJK COMPATIBILITY IDEOGRAPH-FA0F' ],
    [ BOTH, 0xFA10, 0x8ff4b8, '# CJK COMPATIBILITY IDEOGRAPH-FA10' ],
    [ BOTH, 0xFA11, 0x8ff4bd, '# CJK COMPATIBILITY IDEOGRAPH-FA11' ],
    [ BOTH, 0xFA12, 0x8ff4c4, '# CJK COMPATIBILITY IDEOGRAPH-FA12' ],
    [ BOTH, 0xFA13, 0x8ff4c7, '# CJK COMPATIBILITY IDEOGRAPH-FA13' ],
    [ BOTH, 0xFA14, 0x8ff4c8, '# CJK COMPATIBILITY IDEOGRAPH-FA14' ],
    [ BOTH, 0xFA15, 0x8ff4ce, '# CJK COMPATIBILITY IDEOGRAPH-FA15' ],
    [ BOTH, 0xFA16, 0x8ff4cf, '# CJK COMPATIBILITY IDEOGRAPH-FA16' ],
    [ BOTH, 0xFA17, 0x8ff4d3, '# CJK COMPATIBILITY IDEOGRAPH-FA17' ],
    [ BOTH, 0xFA18, 0x8ff4d5, '# CJK COMPATIBILITY IDEOGRAPH-FA18' ],
    [ BOTH, 0xFA19, 0x8ff4d6, '# CJK COMPATIBILITY IDEOGRAPH-FA19' ],
    [ BOTH, 0xFA1A, 0x8ff4d7, '# CJK COMPATIBILITY IDEOGRAPH-FA1A' ],
    [ BOTH, 0xFA1B, 0x8ff4d8, '# CJK COMPATIBILITY IDEOGRAPH-FA1B' ],
    [ BOTH, 0xFA1C, 0x8ff4da, '# CJK COMPATIBILITY IDEOGRAPH-FA1C' ],
    [ BOTH, 0xFA1D, 0x8ff4db, '# CJK COMPATIBILITY IDEOGRAPH-FA1D' ],
    [ BOTH, 0xFA1E, 0x8ff4de, '# CJK COMPATIBILITY IDEOGRAPH-FA1E' ],
    [ BOTH, 0xFA1F, 0x8ff4e0, '# CJK COMPATIBILITY IDEOGRAPH-FA1F' ],
    [ BOTH, 0xFA20, 0x8ff4e2, '# CJK COMPATIBILITY IDEOGRAPH-FA20' ],
    [ BOTH, 0xFA21, 0x8ff4e3, '# CJK COMPATIBILITY IDEOGRAPH-FA21' ],
    [ BOTH, 0xFA22, 0x8ff4e5, '# CJK COMPATIBILITY IDEOGRAPH-FA22' ],
    [ BOTH, 0xFA23, 0x8ff4e9, '# CJK COMPATIBILITY IDEOGRAPH-FA23' ],
    [ BOTH, 0xFA24, 0x8ff4ea, '# CJK COMPATIBILITY IDEOGRAPH-FA24' ],
    [ BOTH, 0xFA25, 0x8ff4eb, '# CJK COMPATIBILITY IDEOGRAPH-FA25' ],
    [ BOTH, 0xFA26, 0x8ff4ed, '# CJK COMPATIBILITY IDEOGRAPH-FA26' ],
    [ BOTH, 0xFA27, 0x8ff4ef, '# CJK COMPATIBILITY IDEOGRAPH-FA27' ],
    [ BOTH, 0xFA28, 0x8ff4f0, '# CJK COMPATIBILITY IDEOGRAPH-FA28' ],
    [ BOTH, 0xFA29, 0x8ff4f3, '# CJK COMPATIBILITY IDEOGRAPH-FA29' ],
    [ BOTH, 0xFA2A, 0x8ff4f7, '# CJK COMPATIBILITY IDEOGRAPH-FA2A' ],
    [ BOTH, 0xFA2B, 0x8ff4f8, '# CJK COMPATIBILITY IDEOGRAPH-FA2B' ],
    [ BOTH, 0xFA2C, 0x8ff4f9, '# CJK COMPATIBILITY IDEOGRAPH-FA2C' ],
    [ BOTH, 0xFA2D, 0x8ff4fd, '# CJK COMPATIBILITY IDEOGRAPH-FA2D' ],
    [ BOTH, 0xFF07, 0x8ff4a9, '# FULLWIDTH APOSTROPHE' ],
    [ BOTH, 0xFFE4, 0x8fa2c3, '# FULLWIDTH BROKEN BAR' ],

    # additional conversions for EUC_JP -> UTF-8 conversion
    [ TO_UNICODE, 0x2116, 0x8ff4ac, '# NUMERO SIGN' ],
    [ TO_UNICODE, 0x2121, 0x8ff4ad, '# TELEPHONE SIGN' ],
    [ TO_UNICODE, 0x3231, 0x8ff4ab, '# PARENTHESIZED IDEOGRAPH STOCK' ])
{
    my ($direction, $ucs, $code, $comment) = @$row;

    push @mapping,
      {
        direction => $direction,
        ucs       => $ucs,
        code      => $code,
        comment   => $comment
      };
}

# Emit the conversion tables for EUC_JP from the assembled mapping list.
print_conversion_tables($this_script, "EUC_JP", \@mapping);
#######################################################################
# sjis2jis ; SJIS => JIS conversion
#
# Takes a numeric SJIS code and returns the corresponding numeric code
# built from a row byte and a cell byte, each offset by 0x21.  Codes no
# larger than 0x100 are returned unchanged.
sub sjis2jis
{
    my ($sjis) = @_;

    return $sjis if ($sjis <= 0x100);

    my $lead  = $sjis >> 8;
    my $trail = $sjis & 0xff;

    # Rebase the trail byte to start at zero; values from 0x80 upward are
    # moved down by one extra step before the 0x40 offset is removed.
    $trail -= ($trail >= 0x80) ? 0x41 : 0x40;

    # Rebase the lead byte likewise; values from 0xe0 upward first drop
    # by 0x40, then the common 0x81 offset is removed.
    $lead -= ($lead >= 0xe0) ? 0xc1 : 0x81;

    # Linear position, with 0xbc trail positions per lead byte.
    my $pos = $trail + $lead * 0xbc;

    if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b)
    {
        # This region (115-ku) is outside the range of JIS codes, but to
        # make it convenient to generate codes in EUC CODESET 3 it is
        # moved to a seemingly duplicate region (83-84-ku).
        $pos -= (31 * 0x5e) + 12;

        # Positions after 85-ku 82-ten need to be moved by 2 codepoints.
        # NOTE(review): this threshold multiplies by 0x5c while the rest of
        # the routine uses a row width of 0x5e -- confirm before changing,
        # since the generated tables depend on the current value.
        $pos -= 2 if ($pos >= 84 * 0x5c + 82);
    }

    # Split the position into a row and a cell of width 0x5e and offset
    # both by 0x21.  The quotient may be fractional; int() makes explicit
    # the truncation that the shift below would otherwise apply.
    my $jis_hi = int($pos / 0x5e + 0x21);
    my $jis_lo = $pos % 0x5e + 0x21;

    return ($jis_hi << 8) + $jis_lo;
}