Merge branch 'utf8refactor' into psr2

* utf8refactor:
  replaced deprecated utf8 functions
  formatting cleanup
  mark old utf8 functions deprecated
  Some cleanup for the UTF-8 stuff
  Moved all utf8 methods to their own namespaced classes
  Create separate table files for UTF-8 handling
This commit is contained in:
Andreas Gohr 2019-07-14 21:11:04 +02:00
commit b47790f975
48 changed files with 4288 additions and 1785 deletions

View File

@ -88,6 +88,7 @@
<exclude-pattern>*/inc/Mailer.class.php</exclude-pattern>
<exclude-pattern>*/doku.php</exclude-pattern>
<exclude-pattern>*/install.php</exclude-pattern>
<exclude-pattern>*/inc/utf8.php</exclude-pattern>
<exclude-pattern>*/feed.php</exclude-pattern>
<exclude-pattern>*/inc/load.php</exclude-pattern>
<exclude-pattern>*/bin/*.php</exclude-pattern>

View File

@ -124,7 +124,7 @@ class media_searchlist_test extends DokuWikiTest {
$info = array();
$info['id'] = $this->upload_ns . ':' . $rel_id;
$info['perm'] = auth_quickaclcheck(getNS($info['id']).':*');
$info['file'] = utf8_basename($file);
$info['file'] = \dokuwiki\Utf8\PhpString::basename($file);
$info['size'] = filesize($file);
$info['mtime'] = filemtime($file);
$info['writable'] = is_writable($file);

View File

@ -84,8 +84,8 @@ class utf8_basename_test extends DokuWikiTest {
);
foreach($data as $test){
$this->assertEquals($test[2], utf8_basename($test[0], $test[1]), "input: ('".$test[0]."', '".$test[1]."')");
$this->assertEquals($test[2], \dokuwiki\Utf8\PhpString::basename($test[0], $test[1]), "input: ('".$test[0]."', '".$test[1]."')");
}
}
}
}

View File

@ -15,7 +15,7 @@ class utf8_correctidx_test extends DokuWikiTest {
$tests[] = array('aaживπά우리をあöä',1,true,1);
foreach($tests as $test){
$this->assertEquals(utf8_correctIdx($test[0],$test[1],$test[2]),$test[3]);
$this->assertEquals(\dokuwiki\Utf8\Clean::correctIdx($test[0],$test[1],$test[2]),$test[3]);
}
}
@ -33,7 +33,7 @@ class utf8_correctidx_test extends DokuWikiTest {
$tests[] = array('aaживπά우리をあöä',4,true,4);
foreach($tests as $test){
$this->assertEquals(utf8_correctIdx($test[0],$test[1],$test[2]),$test[3]);
$this->assertEquals(\dokuwiki\Utf8\Clean::correctIdx($test[0],$test[1],$test[2]),$test[3]);
}
}
@ -53,7 +53,7 @@ class utf8_correctidx_test extends DokuWikiTest {
$tests[] = array('aaживπά우리をあöä',13,true,13);
foreach($tests as $test){
$this->assertEquals(utf8_correctIdx($test[0],$test[1],$test[2]),$test[3]);
$this->assertEquals(\dokuwiki\Utf8\Clean::correctIdx($test[0],$test[1],$test[2]),$test[3]);
}
}
@ -69,7 +69,7 @@ class utf8_correctidx_test extends DokuWikiTest {
$tests[] = array('aaживπά우리をあöä',128,true,29);
foreach($tests as $test){
$this->assertEquals(utf8_correctIdx($test[0],$test[1],$test[2]),$test[3]);
$this->assertEquals(\dokuwiki\Utf8\Clean::correctIdx($test[0],$test[1],$test[2]),$test[3]);
}
}

View File

@ -8,61 +8,61 @@ class utf8_html_test extends DokuWikiTest {
function test_from_1byte(){
$in = 'a';
$out = 'a';
$this->assertEquals(utf8_tohtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::toHtml($in),$out);
}
function test_from_2byte(){
$in = "\xc3\xbc";
$out = '&#252;';
$this->assertEquals(utf8_tohtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::toHtml($in),$out);
}
function test_from_3byte(){
$in = "\xe2\x99\x8a";
$out = '&#x264a;';
$this->assertEquals(utf8_tohtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::toHtml($in),$out);
}
function test_from_4byte(){
$in = "\xf4\x80\x80\x81";
$out = '&#x100001;';
$this->assertEquals(utf8_tohtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::toHtml($in),$out);
}
function test_to_1byte(){
$out = 'a';
$in = 'a';
$this->assertEquals(utf8_unhtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::fromHtml($in),$out);
}
function test_to_2byte(){
$out = "\xc3\xbc";
$in = '&#252;';
$this->assertEquals(utf8_unhtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::fromHtml($in),$out);
}
function test_to_3byte(){
$out = "\xe2\x99\x8a";
$in = '&#x264a;';
$this->assertEquals(utf8_unhtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::fromHtml($in),$out);
}
function test_to_4byte(){
$out = "\xf4\x80\x80\x81";
$in = '&#x100001;';
$this->assertEquals(utf8_unhtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::fromHtml($in),$out);
}
function test_without_entities(){
$out = '&amp;#38;&amp;#38;';
$in = '&amp;#38;&#38;amp;#38;';
$this->assertEquals(utf8_unhtml($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::fromHtml($in),$out);
}
function test_with_entities(){
$out = '&#38;&amp;#38;';
$in = '&amp;#38;&#38;amp;#38;';
$this->assertEquals(utf8_unhtml($in,HTML_ENTITIES),$out);
$this->assertEquals(\dokuwiki\Utf8\Conversion::fromHtml($in,HTML_ENTITIES),$out);
}
}

View File

@ -18,7 +18,7 @@ class utf8_romanize_test extends DokuWikiTest {
foreach($tests as $test){
list($jap,$rom) = explode(';',trim($test));
$chk = utf8_romanize($jap);
$chk = \dokuwiki\Utf8\Clean::romanize($jap);
$this->assertEquals($rom,$chk,"$jap\t->\t$chk\t!=\t$rom\t($line)");
$line++;
}
@ -31,7 +31,7 @@ class utf8_romanize_test extends DokuWikiTest {
* @author Andreas Gohr <andi@splitbrain.org>
*/
function test_deaccented(){
$this->assertEquals("a A a A a o O",utf8_romanize("å Å ä Ä ä ö Ö"));
$this->assertEquals("a A a A a o O",\dokuwiki\Utf8\Clean::romanize("å Å ä Ä ä ö Ö"));
}
}
//Setup VIM: ex: et ts=4 :

View File

@ -19,7 +19,7 @@ class utf8_stripspecials extends DokuWikiTest {
$tests[] = array('string with nbsps','_','\*','string_with_nbsps');
foreach($tests as $test){
$this->assertEquals(utf8_stripspecials($test[0],$test[1],$test[2]),$test[3]);
$this->assertEquals(\dokuwiki\Utf8\Clean::stripspecials($test[0],$test[1],$test[2]),$test[3]);
}
}

View File

@ -10,7 +10,7 @@ class utf8_strtolower_test extends DokuWikiTest {
);
foreach($data as $input => $expected) {
$this->assertEquals($expected, utf8_strtolower($input));
$this->assertEquals($expected, \dokuwiki\Utf8\PhpString::strtolower($input));
}
// just make sure our data was correct
@ -20,4 +20,4 @@ class utf8_strtolower_test extends DokuWikiTest {
}
}
}
}
}

View File

@ -21,7 +21,7 @@ class utf8_substr_test extends DokuWikiTest {
$tests[] = array('живπά우리をあöä',-6,-2,'우리をあ');
foreach($tests as $test){
$this->assertEquals(utf8_substr($test[0],$test[1],$test[2]),$test[3]);
$this->assertEquals(\dokuwiki\Utf8\PhpString::substr($test[0],$test[1],$test[2]),$test[3]);
}
}
@ -34,7 +34,7 @@ class utf8_substr_test extends DokuWikiTest {
$tests[] = array($str,0,66002,$str);
foreach($tests as $test){
$this->assertEquals(utf8_substr($test[0],$test[1],$test[2]),$test[3]);
$this->assertEquals(\dokuwiki\Utf8\PhpString::substr($test[0],$test[1],$test[2]),$test[3]);
}
}

View File

@ -8,49 +8,49 @@ class utf8_unicode_test extends DokuWikiTest {
function test_from_1byte(){
$in = 'a';
$out = array(97);
$this->assertEquals(utf8_to_unicode($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::fromUtf8($in),$out);
}
function test_from_2byte(){
$in = "\xc3\xbc";
$out = array(252);
$this->assertEquals(utf8_to_unicode($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::fromUtf8($in),$out);
}
function test_from_3byte(){
$in = "\xe2\x99\x8a";
$out = array(9802);
$this->assertEquals(utf8_to_unicode($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::fromUtf8($in),$out);
}
function test_from_4byte(){
$in = "\xf4\x80\x80\x81";
$out = array(1048577);
$this->assertEquals(utf8_to_unicode($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::fromUtf8($in),$out);
}
function test_to_1byte(){
$out = 'a';
$in = array(97);
$this->assertEquals(unicode_to_utf8($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::toUtf8($in),$out);
}
function test_to_2byte(){
$out = "\xc3\xbc";
$in = array(252);
$this->assertEquals(unicode_to_utf8($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::toUtf8($in),$out);
}
function test_to_3byte(){
$out = "\xe2\x99\x8a";
$in = array(9802);
$this->assertEquals(unicode_to_utf8($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::toUtf8($in),$out);
}
function test_to_4byte(){
$out = "\xf4\x80\x80\x81";
$in = array(1048577);
$this->assertEquals(unicode_to_utf8($in),$out);
$this->assertEquals(\dokuwiki\Utf8\Unicode::toUtf8($in),$out);
}
}

View File

@ -12,14 +12,14 @@ class utf8_utf16be_test extends DokuWikiTest {
* Convert from UTF-8 to UTF-16BE
*/
function test_to16be(){
$this->assertEquals(utf8_to_utf16be($this->utf8), $this->utf16);
$this->assertEquals(\dokuwiki\Utf8\Conversion::toUtf16Be($this->utf8), $this->utf16);
}
/**
* Convert from UTF-16BE to UTF-8
*/
function test_from16be(){
$this->assertEquals(utf16be_to_utf8($this->utf16),$this->utf8);
$this->assertEquals(\dokuwiki\Utf8\Conversion::fromUtf16Be($this->utf16),$this->utf8);
}
}

View File

@ -182,7 +182,7 @@ class PageCLI extends CLI {
}
if(empty($localfile)) {
$localfile = getcwd() . '/' . utf8_basename($wiki_fn);
$localfile = getcwd() . '/' . \dokuwiki\Utf8\PhpString::basename($wiki_fn);
}
if(!file_exists(dirname($localfile))) {

View File

@ -47,7 +47,7 @@ class Sitemap extends AbstractAction {
if(is_readable($sitemap)) {
// Send headers
header('Content-Type: ' . $mime);
header('Content-Disposition: attachment; filename=' . utf8_basename($sitemap));
header('Content-Disposition: attachment; filename=' . \dokuwiki\Utf8\PhpString::basename($sitemap));
http_conditionalRequest(filemtime($sitemap));

View File

@ -1302,7 +1302,7 @@ class JpegMeta {
function _parseFileInfo() {
if (file_exists($this->_fileName) && is_file($this->_fileName)) {
$this->_info['file'] = array();
$this->_info['file']['Name'] = utf8_decodeFN(utf8_basename($this->_fileName));
$this->_info['file']['Name'] = utf8_decodeFN(\dokuwiki\Utf8\PhpString::basename($this->_fileName));
$this->_info['file']['Path'] = fullpath($this->_fileName);
$this->_info['file']['Size'] = filesize($this->_fileName);
if ($this->_info['file']['Size'] < 1024) {
@ -1393,7 +1393,7 @@ class JpegMeta {
}
} else {
$this->_info['file'] = array();
$this->_info['file']['Name'] = utf8_basename($this->_fileName);
$this->_info['file']['Name'] = \dokuwiki\Utf8\PhpString::basename($this->_fileName);
$this->_info['file']['Url'] = $this->_fileName;
}

View File

@ -80,7 +80,7 @@ class Mailer {
*/
public function attachFile($path, $mime, $name = '', $embed = '') {
if(!$name) {
$name = utf8_basename($path);
$name = \dokuwiki\Utf8\PhpString::basename($path);
}
$this->attach[] = array(
@ -387,7 +387,7 @@ class Mailer {
}
// FIXME: is there a way to encode the localpart of a emailaddress?
if(!utf8_isASCII($addr)) {
if(!\dokuwiki\Utf8\Clean::isASCII($addr)) {
msg(hsc("E-Mail address <$addr> is not ASCII"), -1);
continue;
}
@ -403,11 +403,11 @@ class Mailer {
$addr = "<$addr>";
if(defined('MAILHEADER_ASCIIONLY')) {
$text = utf8_deaccent($text);
$text = utf8_strip($text);
$text = \dokuwiki\Utf8\Clean::deaccent($text);
$text = \dokuwiki\Utf8\Clean::strip($text);
}
if(strpos($text, ',') !== false || !utf8_isASCII($text)) {
if(strpos($text, ',') !== false || !\dokuwiki\Utf8\Clean::isASCII($text)) {
$text = '=?UTF-8?B?'.base64_encode($text).'?=';
}
} else {
@ -553,10 +553,10 @@ class Mailer {
if(isset($this->headers['Subject'])) {
// add prefix to subject
if(empty($conf['mailprefix'])) {
if(utf8_strlen($conf['title']) < 20) {
if(\dokuwiki\Utf8\PhpString::strlen($conf['title']) < 20) {
$prefix = '['.$conf['title'].']';
} else {
$prefix = '['.utf8_substr($conf['title'], 0, 20).'...]';
$prefix = '['.\dokuwiki\Utf8\PhpString::substr($conf['title'], 0, 20).'...]';
}
} else {
$prefix = '['.$conf['mailprefix'].']';
@ -568,10 +568,10 @@ class Mailer {
// encode subject
if(defined('MAILHEADER_ASCIIONLY')) {
$this->headers['Subject'] = utf8_deaccent($this->headers['Subject']);
$this->headers['Subject'] = utf8_strip($this->headers['Subject']);
$this->headers['Subject'] = \dokuwiki\Utf8\Clean::deaccent($this->headers['Subject']);
$this->headers['Subject'] = \dokuwiki\Utf8\Clean::strip($this->headers['Subject']);
}
if(!utf8_isASCII($this->headers['Subject'])) {
if(!\dokuwiki\Utf8\Clean::isASCII($this->headers['Subject'])) {
$this->headers['Subject'] = '=?UTF-8?B?'.base64_encode($this->headers['Subject']).'?=';
}
}

View File

@ -45,7 +45,7 @@ class SafeFN {
* @author Christopher Smith <chris@jalakai.co.uk>
*/
public static function encode($filename) {
return self::unicodeToSafe(utf8_to_unicode($filename));
return self::unicodeToSafe(\dokuwiki\Utf8\Unicode::fromUtf8($filename));
}
/**
@ -74,7 +74,7 @@ class SafeFN {
* @author Christopher Smith <chris@jalakai.co.uk>
*/
public static function decode($filename) {
return unicode_to_utf8(self::safeToUnicode(strtolower($filename)));
return \dokuwiki\Utf8\Unicode::toUtf8(self::safeToUnicode(strtolower($filename)));
}
public static function validatePrintableUtf8($printable_utf8) {

View File

@ -500,7 +500,7 @@ class Search extends Ui
public function createPagenameFromQuery($parsedQuery)
{
$cleanedQuery = cleanID($parsedQuery['query']); // already strtolowered
if ($cleanedQuery === utf8_strtolower($parsedQuery['query'])) {
if ($cleanedQuery === \dokuwiki\Utf8\PhpString::strtolower($parsedQuery['query'])) {
return ':' . $cleanedQuery;
}
$pagename = '';

204
inc/Utf8/Clean.php Normal file
View File

@ -0,0 +1,204 @@
<?php
namespace dokuwiki\Utf8;
/**
 * Methods to assess and clean UTF-8 strings
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no byte above 0x7F occurs in the string
     */
    public static function isASCII($str)
    {
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte has to be followed by
     * the announced number of continuation bytes. The legacy 5 and 6 byte
     * lead forms are accepted; overlong encodings and surrogates are not
     * rejected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);
            if ($b < 0x80) continue; # 0bbbbbbb
            elseif (($b & 0xE0) === 0xC0) $n = 1; # 110bbbbb
            elseif (($b & 0xF0) === 0xE0) $n = 2; # 1110bbbb
            elseif (($b & 0xF8) === 0xF0) $n = 3; # 11110bbb
            elseif (($b & 0xFC) === 0xF8) $n = 4; # 111110bb
            elseif (($b & 0xFE) === 0xFC) $n = 5; # 1111110b
            else return false; # Does not match any model

            for ($j = 0; $j < $n; $j++) { # n bytes matching 10bbbbbb follow ?
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80))
                    return false;
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string the input with every byte >= 0x80 removed
     */
    public static function strip($str)
    {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square brackets: the curly brace offset syntax no longer compiles on PHP 8
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the array of
     * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string The UTF8 string to strip of special chars
     * @param  string $repl Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class,
     *                            inserted as-is, so the caller has to escape them)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted list never changes, so build it only once per request
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        simply drops them) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        $UTF8_BAD =
            '([\x00-\x7F]' .                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
            '|(.{1}))';                               # invalid byte

        // Every byte is matched by one of the alternatives, so a single pass
        // consumes the string front to back. Only the last alternative fills
        // the second group, which marks the match as a bad byte.
        return (string)preg_replace_callback(
            '/' . $UTF8_BAD . '/S',
            static function ($matches) use ($replace) {
                return isset($matches[2]) ? $replace : $matches[0];
            },
            $str
        );
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) return $string; //nothing to do

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for boundary,
     *                   false = move back to the start of the current character,
     *                   true = move forward to the start of the next character
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i >= $limit) return $limit;

        // bytes of the form 10bbbbbb are continuation bytes, skip over them
        if ($next) {
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) $i++;
        } else {
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) $i--;
        }

        return $i;
    }
}

161
inc/Utf8/Conversion.php Normal file
View File

@ -0,0 +1,161 @@
<?php
namespace dokuwiki\Utf8;
/**
 * Methods to convert from and to UTF-8 strings
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII is passed through, code points below 0x100 become decimal
     * entities, everything else becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @return string
     */
    public static function toHtml($str)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80) {
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                                           -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  boolean $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Unknown named entities are returned unchanged.
     *
     * @param array $ent The match of an entity as passed in by preg_replace_callback()
     * @return string
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table
        static $table = null;
        if ($table === null) {
            // Ask for the UTF-8 table explicitly: its characters already are
            // complete UTF-8 sequences. Running them through ord() would only
            // look at their first byte and produce a wrong character.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * @param array $ent The match of a numeric entity: index 2 holds the optional
     *                   hex marker (x or X), index 3 the digits
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        switch ($ent[2]) {
            case 'X':
            case 'x':
                $cp = hexdec($ent[3]);
                break;
            default:
                $cp = intval($ent[3]);
                break;
        }
        return Unicode::toUtf8(array($cp));
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_string when available. The fallback encodes code points above
     * the Basic Multilingual Plane as surrogate pairs, just like mb_string does.
     *
     * @param string $str
     * @param bool $bom prepend the big endian byte order mark?
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        $uni = Unicode::fromUtf8($str);
        foreach ($uni as $cp) {
            if ($cp > 0xFFFF) {
                // does not fit into 16 bit, split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * Each 16 bit unit is handed to Unicode::toUtf8() as a code point of its own.
     * NOTE(review): surrogate pairs are not combined here - confirm how
     * Unicode::toUtf8() treats them if characters outside the BMP are expected.
     *
     * @param string $str
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $uni = unpack('n*', $str);
        return Unicode::toUtf8($uni);
    }
}

381
inc/Utf8/PhpString.php Normal file
View File

@ -0,0 +1,381 @@
<?php
namespace dokuwiki\Utf8;
/**
 * UTF-8 aware equivalents to PHP's string functions
 */
class PhpString
{
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * @param string $path A path
     * @param string $suffix If the name component ends in suffix this will also be cut off
     * @return string
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @see basename()
     */
    public static function basename($path, $suffix = '')
    {
        $path = trim($path, '\\/');
        // position of the last separator, whichever kind it is
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        if ($rpos) {
            $path = substr($path, $rpos + 1);
        }

        $suflen = strlen($suffix);
        if ($suflen && (substr($path, -$suflen) === $suffix)) {
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }

    /**
     * Unicode aware replacement for strlen()
     *
     * Uses mb_string when available. Without it the characters are counted
     * by counting all bytes that are not UTF-8 continuation bytes (10bbbbbb),
     * which needs no extension and avoids the deprecated utf8_decode().
     *
     * @param string $string
     * @return int number of UTF-8 characters
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     */
    public static function strlen($string)
    {
        if (UTF8_MBSTRING) {
            return mb_strlen($string, 'UTF-8');
        }

        // every character contributes exactly one byte that is no continuation byte
        return (int)preg_match_all('/[^\x80-\xBF]/', $string);
    }

    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     */
    public static function substr($str, $offset, $length = null)
    {
        if (UTF8_MBSTRING) {
            // be explicit about the encoding instead of relying on mb_internal_encoding(),
            // a null length means "up to the end of the string"
            return mb_substr($str, $offset, $length, 'UTF-8');
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = self::strlen($str);        // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset / 65535);
            $Oy = $offset % 65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){' . $Ox . '}';
            $offset_pattern = '^(?:' . $offset_pattern . '.{' . $Oy . '})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if ($length === null) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {
            if (!isset($strlen)) $strlen = self::strlen($str);    // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {
                // reduce any length that would go past the end of the string
                $length = min($strlen - $offset, $length);

                $Lx = (int)($length / 65535);
                $Ly = $length % 65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
                $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
            } elseif ($length < 0) {
                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length) / 65535);
                $Ly = (-$length) % 65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
                $length_pattern = '(.*)(?:' . $length_pattern . '.{' . $Ly . '})$';
            }
        }

        if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
        return $match[1];
    }

    /**
     * Unicode aware replacement for substr_replace()
     *
     * Unlike the native function, the default length is 0, i.e. the
     * replacement is inserted at $start.
     *
     * @param string $string input string
     * @param string $replacement the replacement
     * @param int $start the replacing will begin at the start'th offset into string.
     * @param int $length If given and is positive, it represents the length of the portion of string which is
     *                            to be replaced. If length is zero then this function will have the effect of inserting
     *                            replacement into string at the given start offset.
     * @return string
     * @see    substr_replace()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function substr_replace($string, $replacement, $start, $length = 0)
    {
        $ret = '';
        if ($start > 0) $ret .= self::substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= self::substr($string, $start + $length);
        return $ret;
    }

    /**
     * Unicode aware replacement for ltrim()
     *
     * @param string $str
     * @param string $charlist characters to strip, all of them are taken literally
     * @return string
     * @see    ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function ltrim($str, $charlist = '')
    {
        if ($charlist === '') return ltrim($str);

        // quote charlist for use in a character class, this also covers a
        // leading ^ which would otherwise negate the class
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
    }

    /**
     * Unicode aware replacement for rtrim()
     *
     * @param string $str
     * @param string $charlist characters to strip, all of them are taken literally
     * @return string
     * @see    rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function rtrim($str, $charlist = '')
    {
        if ($charlist === '') return rtrim($str);

        // quote charlist for use in a character class, this also covers a
        // leading ^ which would otherwise negate the class
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/[' . $charlist . ']+$/u', '', $str);
    }

    /**
     * Unicode aware replacement for trim()
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function trim($str, $charlist = '')
    {
        if ($charlist === '') return trim($str);

        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }

    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtoupper()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     */
    public static function strtolower($string)
    {
        if (UTF8_MBSTRING) {
            // normalize the result when the intl extension is loaded, never trigger autoloading for it
            if (class_exists('Normalizer', false)) {
                return \Normalizer::normalize(mb_strtolower($string, 'utf-8'));
            }
            return (mb_strtolower($string, 'utf-8'));
        }
        return strtr($string, Table::upperCaseToLowerCase());
    }

    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtolower()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     */
    public static function strtoupper($string)
    {
        if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');

        return strtr($string, Table::lowerCaseToUpperCase());
    }

    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     * @author Harry Fuecks
     *
     */
    public static function ucfirst($str)
    {
        switch (self::strlen($str)) {
            case 0:
                return '';
            case 1:
                return self::strtoupper($str);
            default:
                preg_match('/^(.{1})(.*)$/us', $str, $matches);
                return self::strtoupper($matches[1]) . $matches[2];
        }
    }

    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @param string $str
     * @return string with first char of each word uppercase
     * @author Harry Fuecks
     * @see    http://php.net/ucwords
     *
     */
    public static function ucwords($str)
    {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback(
            $pattern,
            function ($matches) {
                $leadingws = $matches[2];
                $ucfirst = self::strtoupper($matches[3]);
                $ucword = self::substr_replace(ltrim($matches[0]), $ucfirst, 0, 1);
                return $leadingws . $ucword;
            },
            $str
        );
    }

    /**
     * This is an Unicode aware replacement for strpos
     *
     * @param string $haystack
     * @param string $needle
     * @param integer $offset character offset to start searching at
     * @return integer|false character position of the first match, false when there is none
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     */
    public static function strpos($haystack, $needle, $offset = 0)
    {
        $comp = 0;
        $length = null;

        // search bytewise and translate the hit into a character position; as
        // long as that position is still before the requested character offset,
        // move the byte offset forward and search again
        while ($length === null || $length < $offset) {
            $pos = strpos($haystack, $needle, $offset + $comp);

            if ($pos === false)
                return false;

            $length = self::strlen(substr($haystack, 0, $pos));

            if ($length < $offset)
                $comp = $pos - $length;
        }

        return $length;
    }
}

93
inc/Utf8/Table.php Normal file
View File

@ -0,0 +1,93 @@
<?php
namespace dokuwiki\Utf8;
/**
 * Static accessors for the UTF-8 conversion tables
 *
 * Each table lives in its own file below the tables directory and is only
 * read when it is asked for the first time. Later calls are answered from
 * a per-method static cache.
 */
class Table
{
    /**
     * The table mapping upper case characters to their lower case form
     *
     * @return array
     */
    public static function upperCaseToLowerCase()
    {
        static $map;
        if (!isset($map)) {
            $map = include __DIR__ . '/tables/case.php';
        }
        return $map;
    }

    /**
     * The table mapping lower case characters to their upper case form
     *
     * Built by flipping the upper to lower case table.
     *
     * @return array
     */
    public static function lowerCaseToUpperCase()
    {
        static $map;
        if (!isset($map)) {
            $map = array_flip(self::upperCaseToLowerCase());
        }
        return $map;
    }

    /**
     * The accent replacement table for lower case characters
     *
     * @return array
     */
    public static function lowerAccents()
    {
        static $map;
        if (!isset($map)) {
            $map = include __DIR__ . '/tables/loweraccents.php';
        }
        return $map;
    }

    /**
     * The accent replacement table for upper case characters
     *
     * @return array
     */
    public static function upperAccents()
    {
        static $map;
        if (!isset($map)) {
            $map = include __DIR__ . '/tables/upperaccents.php';
        }
        return $map;
    }

    /**
     * The romanization table
     *
     * @return array
     */
    public static function romanization()
    {
        static $map;
        if (!isset($map)) {
            $map = include __DIR__ . '/tables/romanization.php';
        }
        return $map;
    }

    /**
     * All special characters joined into a single UTF-8 string
     *
     * The table file is passed through Unicode::toUtf8() to build the string.
     *
     * @return string
     */
    public static function specialChars()
    {
        static $joined;
        if (!isset($joined)) {
            // FIXME should we cache this to file system?
            $codepoints = include __DIR__ . '/tables/specials.php';
            $joined = Unicode::toUtf8($codepoints);
        }
        return $joined;
    }
}

277
inc/Utf8/Unicode.php Normal file
View File

@ -0,0 +1,277 @@
<?php
namespace dokuwiki\Utf8;
/**
 * Convert between UTF-8 and a list of Unicode Code Points
 */
class Unicode
{
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. This includes input that ends in the middle of a
     * multi-octet sequence.
     *
     * If $strict is false, octets that do not fit at their position are
     * skipped, while completed sequences that are illegal (non-shortest form,
     * surrogates, out of range) are still added to the result.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see Unicode::toUtf8()
     * @link http://hsivonen.iki.fi/php-utf8/
     * @link http://sourceforge.net/projects/phputf8/
     * @todo break into less complex chunks
     * @todo use exceptions instead of user errors
     *
     * @param string $str UTF-8 encoded string
     * @param boolean $strict Check for invalid sequences?
     * @return array|false array of unicode code points or false if UTF-8 invalid (strict mode only)
     */
    public static function fromUtf8($str, $strict = false)
    {
        $mState = 0; // cached expected number of octets after the current octet
        // until the beginning of the next UTF8 character sequence
        $mUcs4 = 0; // cached Unicode character
        $mBytes = 1; // cached expected number of octets in the current sequence

        $out = array();
        $len = strlen($str);

        for ($i = 0; $i < $len; $i++) {
            // square brackets: the curly brace offset syntax is gone since PHP 8.0
            $in = ord($str[$i]);

            if ($mState === 0) {
                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 === (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;
                } elseif (0xC0 === (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 === (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 === (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 === (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC === (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } elseif ($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier ' .
                        'in UTF-8 at byte ' . $i,
                        E_USER_WARNING
                    );
                    return false;
                }
            } else {
                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 === (0xC0 & $in)) {
                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 === --$mState) {
                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {
                            if ($strict) {
                                trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint ' .
                                    'in UTF-8 at byte ' . $i,
                                    E_USER_WARNING
                                );
                                return false;
                            }
                        }

                        if (0xFEFF !== $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4 = 0;
                        $mBytes = 1;
                    }
                } elseif ($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet ' .
                        ' sequence in UTF-8 at byte ' . $i,
                        E_USER_WARNING
                    );
                    return false;
                }
            }
        }

        // the input ended while continuation octets were still outstanding
        if ($strict && $mState !== 0) {
            trigger_error(
                'utf8_to_unicode: Incomplete multi-octet sequence ' .
                'at end of UTF-8 input, byte ' . $len,
                E_USER_WARNING
            );
            return false;
        }

        return $out;
    }

    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING. If
     * $strict is false such ints are silently skipped.
     *
     * The result is built by plain string concatenation, so no output buffer
     * is opened and nothing can be left behind on the output buffer stack
     * when the function bails out early.
     *
     * @param array $arr of unicode code points representing a string
     * @param boolean $strict Check for invalid sequences?
     * @return string|false UTF-8 string or false if array contains invalid code points
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see Unicode::fromUtf8()
     * @link http://hsivonen.iki.fi/php-utf8/
     * @link http://sourceforge.net/projects/phputf8/
     * @todo use exceptions instead of user errors
     */
    public static function toUtf8($arr, $strict = false)
    {
        if (!is_array($arr)) return '';

        $utf8 = '';
        foreach ($arr as $k => $cp) {
            if ($cp < 0 || $cp > 0x10ffff) {
                // negative or above the last code point: out of range
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range ' .
                        'at index: ' . $k . ', value: ' . $cp,
                        E_USER_WARNING
                    );
                    return false;
                }
            } elseif ($cp <= 0x007f) {
                // ASCII range (including control chars)
                $utf8 .= chr($cp);
            } elseif ($cp <= 0x07ff) {
                // 2 byte sequence
                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));
            } elseif ($cp == 0xFEFF) {
                // Byte order mark (skip)
            } elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {
                // found a surrogate, those are illegal
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate ' .
                        'at index: ' . $k . ', value: ' . $cp,
                        E_USER_WARNING
                    );
                    return false;
                }
            } elseif ($cp <= 0xffff) {
                // 3 byte sequence
                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));
            } else {
                // 4 byte sequence
                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}

567
inc/Utf8/tables/case.php Normal file
View File

@ -0,0 +1,567 @@
<?php
/**
* UTF-8 Case lookup table
*
* This lookup table maps upper case letters to their corresponding
* lower case letter in UTF-8
*
* @author Andreas Gohr <andi@splitbrain.org>
*/
return [
'A' => 'a',
'' => '',
'Á' => 'á',
'À' => 'à',
'Ă' => 'ă',
'Ắ' => 'ắ',
'Ẵ' => 'ẵ',
'Ẳ' => 'ẳ',
'Â' => 'â',
'Ấ' => 'ấ',
'Ầ' => 'ầ',
'Ẫ' => 'ẫ',
'Ǎ' => 'ǎ',
'Å' => 'å',
'Ǻ' => 'ǻ',
'Ä' => 'ä',
'Ǟ' => 'ǟ',
'Ã' => 'ã',
'Ǡ' => 'ǡ',
'Ą' => 'ą',
'Ā' => 'ā',
'Ả' => 'ả',
'Ȁ' => 'ȁ',
'Ȃ' => 'ȃ',
'Ặ' => 'ặ',
'Ậ' => 'ậ',
'Ḁ' => 'ḁ',
'Æ' => 'æ',
'Ǽ' => 'ǽ',
'Ǣ' => 'ǣ',
'' => '',
'Ḃ' => 'ḃ',
'Ḅ' => 'ḅ',
'Ḇ' => 'ḇ',
'Ɓ' => 'ɓ',
'Ƃ' => 'ƃ',
'' => '',
'Ć' => 'ć',
'Ĉ' => 'ĉ',
'Č' => 'č',
'Ċ' => 'ċ',
'Ç' => 'ç',
'Ƈ' => 'ƈ',
'D' => 'd',
'' => '',
'Ď' => 'ď',
'Ḋ' => 'ḋ',
'Ḑ' => 'ḑ',
'Ḍ' => 'ḍ',
'Ḓ' => 'ḓ',
'Ḏ' => 'ḏ',
'Ð' => 'ð',
'Dz' => 'dz', //FIXME
'Dž' => 'dž', //FIXME
'Ɗ' => 'ɗ',
'Ƌ' => 'ƌ',
'E' => 'e',
'' => '',
'É' => 'é',
'È' => 'è',
'Ê' => 'ê',
'Ế' => 'ế',
'Ề' => 'ề',
'Ễ' => 'ễ',
'Ể' => 'ể',
'Ě' => 'ě',
'Ẽ' => 'ẽ',
'Ė' => 'ė',
'Ȩ' => 'ȩ',
'Ḝ' => 'ḝ',
'Ę' => 'ę',
'Ē' => 'ē',
'Ḕ' => 'ḕ',
'Ẻ' => 'ẻ',
'Ȅ' => 'ȅ',
'Ȇ' => 'ȇ',
'Ẹ' => 'ẹ',
'Ệ' => 'ệ',
'Ḛ' => 'ḛ',
'Ǝ' => 'ǝ',
'Ə' => 'ə',
'Ɛ' => 'ɛ',
'F' => 'f',
'' => '',
'Ƒ' => 'ƒ',
'G' => 'g',
'' => '',
'Ǵ' => 'ǵ',
'Ğ' => 'ğ',
'Ĝ' => 'ĝ',
'Ġ' => 'ġ',
'Ģ' => 'ģ',
'Ḡ' => 'ḡ',
'Ǥ' => 'ǥ',
'Ɣ' => 'ɣ',
'Ƣ' => 'ƣ',
'' => '',
'Ĥ' => 'ĥ',
'Ȟ' => 'ȟ',
'Ḧ' => 'ḧ',
'Ḣ' => 'ḣ',
'Ḩ' => 'ḩ',
'Ḥ' => 'ḥ',
'Ḫ' => 'ḫ',
'Ƕ' => 'ƕ',
'I' => 'i',
'' => '',
'Í' => 'í',
'Ĭ' => 'ĭ',
'Î' => 'î',
'Ǐ' => 'ǐ',
'Ï' => 'ï',
'Ḯ' => 'ḯ',
'Ĩ' => 'ĩ',
'Ī' => 'ī',
'Ỉ' => 'ỉ',
'Ȉ' => 'ȉ',
'Ȋ' => 'ȋ',
'Ị' => 'ị',
'Ḭ' => 'ḭ',
'Ɨ' => 'ɨ',
'Ɩ' => 'ɩ',
'J' => 'j',
'' => '',
'Ĵ' => 'ĵ',
'K' => 'k',
'Ḱ' => 'ḱ',
'Ǩ' => 'ǩ',
'Ķ' => 'ķ',
'Ḳ' => 'ḳ',
'Ḵ' => 'ḵ',
'Ƙ' => 'ƙ',
'' => '',
'Ĺ' => 'ĺ',
'Ľ' => 'ľ',
'Ļ' => 'ļ',
'Ł' => 'ł',
'Ḷ' => 'ḷ',
'Ḽ' => 'ḽ',
'Ḻ' => 'ḻ',
'Ŀ' => 'ŀ',
'Lj' => 'lj', // FIXME
'M' => 'm',
'' => '',
'Ṁ' => 'ṁ',
'Ṃ' => 'ṃ',
'N' => 'n',
'' => '',
'Ń' => 'ń',
'Ǹ' => 'ǹ',
'Ñ' => 'ñ',
'Ṅ' => 'ṅ',
'Ņ' => 'ņ',
'Ṇ' => 'ṇ',
'Ṋ' => 'ṋ',
'Ṉ' => 'ṉ',
'Ɲ' => 'ɲ',
'Ƞ' => 'ƞ',
'Ŋ' => 'ŋ',
'O' => 'o',
'' => '',
'Ó' => 'ó',
'Ŏ' => 'ŏ',
'Ô' => 'ô',
'Ố' => 'ố',
'Ồ' => 'ồ',
'Ỗ' => 'ỗ',
'Ổ' => 'ổ',
'Ö' => 'ö',
'Ȫ' => 'ȫ',
'Ő' => 'ő',
'Õ' => 'õ',
'Ṍ' => 'ṍ',
'Ṏ' => 'ṏ',
'Ȯ' => 'ȯ',
'Ȱ' => 'ȱ',
'Ø' => 'ø',
'Ǿ' => 'ǿ',
'Ǫ' => 'ǫ',
'Ǭ' => 'ǭ',
'Ṓ' => 'ṓ',
'Ṑ' => 'ṑ',
'Ỏ' => 'ỏ',
'Ȍ' => 'ȍ',
'Ȏ' => 'ȏ',
'Ơ' => 'ơ',
'Ờ' => 'ờ',
'Ỡ' => 'ỡ',
'Ở' => 'ở',
'Ợ' => 'ợ',
'Ọ' => 'ọ',
'Ộ' => 'ộ',
'Ɔ' => 'ɔ',
'Ɵ' => 'ɵ',
'Ȣ' => 'ȣ',
'P' => 'p',
'' => '',
'Ṕ' => 'ṕ',
'Ƥ' => 'ƥ',
'Q' => 'q',
'' => '',
'R' => 'r',
'' => '',
'Ŕ' => 'ŕ',
'Ṙ' => 'ṙ',
'Ŗ' => 'ŗ',
'Ȑ' => 'ȑ',
'Ȓ' => 'ȓ',
'Ṛ' => 'ṛ',
'Ṝ' => 'ṝ',
'Ʀ' => 'ʀ',
'S' => 's',
'' => '',
'Ś' => 'ś',
'Ṥ' => 'ṥ',
'Ŝ' => 'ŝ',
'Ṧ' => 'ṧ',
'Ṡ' => 'ṡ',
'Ş' => 'ş',
'Ṣ' => 'ṣ',
'Ṩ' => 'ṩ',
'Ș' => 'ș',
'T' => 't',
'' => '',
'Ť' => 'ť',
'Ṫ' => 'ṫ',
'Ţ' => 'ţ',
'Ṭ' => 'ṭ',
'Ṱ' => 'ṱ',
'Ṯ' => 'ṯ',
'Ŧ' => 'ŧ',
'Ƭ' => 'ƭ',
'Ʈ' => 'ʈ',
'U' => 'u',
'Ú' => 'ú',
'Ù' => 'ù',
'Ŭ' => 'ŭ',
'Û' => 'û',
'Ǔ' => 'ǔ',
'Ů' => 'ů',
'Ǘ' => 'ǘ',
'Ǜ' => 'ǜ',
'Ǚ' => 'ǚ',
'Ǖ' => 'ǖ',
'Ű' => 'ű',
'Ũ' => 'ũ',
'Ų' => 'ų',
'Ū' => 'ū',
'Ṻ' => 'ṻ',
'Ủ' => 'ủ',
'Ȕ' => 'ȕ',
'Ȗ' => 'ȗ',
'Ứ' => 'ứ',
'Ừ' => 'ừ',
'Ữ' => 'ữ',
'Ử' => 'ử',
'Ự' => 'ự',
'Ụ' => 'ụ',
'Ṷ' => 'ṷ',
'Ṵ' => 'ṵ',
'Ɯ' => 'ɯ',
'Ʊ' => 'ʊ',
'V' => 'v',
'' => '',
'Ṿ' => 'ṿ',
'Ʋ' => 'ʋ',
'W' => 'w',
'' => '',
'Ẃ' => 'ẃ',
'Ẁ' => 'ẁ',
'Ẅ' => 'ẅ',
'Ẇ' => 'ẇ',
'Ẉ' => 'ẉ',
'X' => 'x',
'' => '',
'Ẍ' => 'ẍ',
'Y' => 'y',
'' => '',
'Ý' => 'ý',
'Ỳ' => 'ỳ',
'Ŷ' => 'ŷ',
'Ÿ' => 'ÿ',
'Ẏ' => 'ẏ',
'Ȳ' => 'ȳ',
'Ỷ' => 'ỷ',
'Ỵ' => 'ỵ',
'Ƴ' => 'ƴ',
'Ȝ' => 'ȝ',
'' => '',
'Ź' => 'ź',
'Ẑ' => 'ẑ',
'Ž' => 'ž',
'Ż' => 'ż',
'Ẓ' => 'ẓ',
'Ƶ' => 'ƶ',
'Ȥ' => 'ȥ',
'Ʒ' => 'ʒ',
'Ǯ' => 'ǯ',
'Ƹ' => 'ƹ',
'Þ' => 'þ',
'Ƨ' => 'ƨ',
'Ƽ' => 'ƽ',
'Ƅ' => 'ƅ',
'Α' => 'α',
'Ἀ' => 'ἀ',
'Ἄ' => 'ἄ',
'Ἂ' => 'ἂ',
'ᾊ' => 'ᾂ',
'Ἆ' => 'ἆ',
'ᾎ' => 'ᾆ',
'ᾈ' => 'ᾀ',
'Ἁ' => 'ἁ',
'ᾍ' => 'ᾅ',
'Ἃ' => 'ἃ',
'ᾋ' => 'ᾃ',
'Ἇ' => 'ἇ',
'ᾏ' => 'ᾇ',
'ᾉ' => 'ᾁ',
'Ὰ' => 'ὰ',
'Ᾰ' => 'ᾰ',
'Ᾱ' => 'ᾱ',
'ᾼ' => 'ᾳ',
'Β' => 'β',
'Γ' => 'γ',
'Ε' => 'ε',
'Ἐ' => 'ἐ',
'Ἔ' => 'ἔ',
'Ἒ' => 'ἒ',
'Ἑ' => 'ἑ',
'Ἕ' => 'ἕ',
'Έ' => 'έ',
'Ὲ' => 'ὲ',
'Ϝ' => 'ϝ',
'Ϛ' => 'ϛ',
'Ζ' => 'ζ',
'Η' => 'η',
'ᾜ' => 'ᾔ',
'Ἢ' => 'ἢ',
'ᾚ' => 'ᾒ',
'Ἦ' => 'ἦ',
'ᾞ' => 'ᾖ',
'ᾘ' => 'ᾐ',
'Ἥ' => 'ἥ',
'ᾝ' => 'ᾕ',
'Ἣ' => 'ἣ',
'ᾛ' => 'ᾓ',
'Ἧ' => 'ἧ',
'ᾟ' => 'ᾗ',
'Ή' => 'ή',
'Ὴ' => 'ὴ',
'ῌ' => 'ῃ',
'Θ' => 'θ',
'Ι' => 'ι',
'Ἰ' => 'ἰ',
'Ἲ' => 'ἲ',
'Ἶ' => 'ἶ',
'Ἱ' => 'ἱ',
'Ἵ' => 'ἵ',
'Ἳ' => 'ἳ',
'Ἷ' => 'ἷ',
'Ὶ' => 'ὶ',
'Ῐ' => 'ῐ',
'Ϊ' => 'ϊ',
'Ῑ' => 'ῑ',
'Κ' => 'κ',
'Λ' => 'λ',
'Ν' => 'ν',
'Ξ' => 'ξ',
'Ο' => 'ο',
'Ὀ' => 'ὀ',
'Ὄ' => 'ὄ',
'Ὂ' => 'ὂ',
'Ὅ' => 'ὅ',
'Ὃ' => 'ὃ',
'Ό' => 'ό',
'Ὸ' => 'ὸ',
'Π' => 'π',
'Ϟ' => 'ϟ',
'Ρ' => 'ρ',
'Ῥ' => 'ῥ',
'Σ' => 'ς',
'Τ' => 'τ',
'Υ' => 'υ',
'Ὑ' => 'ὑ',
'Ὓ' => 'ὓ',
'Ὗ' => 'ὗ',
'Ύ' => 'ύ',
'Ὺ' => 'ὺ',
'Ϋ' => 'ϋ',
'Ῡ' => 'ῡ',
'Χ' => 'χ',
'Ψ' => 'ψ',
'Ω' => 'ω',
'Ὤ' => 'ὤ',
'ᾬ' => 'ᾤ',
'Ὢ' => 'ὢ',
'Ὦ' => 'ὦ',
'ᾮ' => 'ᾦ',
'Ὡ' => 'ὡ',
'Ὥ' => 'ὥ',
'ᾭ' => 'ᾥ',
'Ὣ' => 'ὣ',
'Ὧ' => 'ὧ',
'ᾯ' => 'ᾧ',
'ᾩ' => 'ᾡ',
'Ώ' => 'ώ',
'Ὼ' => 'ὼ',
'ῼ' => 'ῳ',
'Ϣ' => 'ϣ',
'Ϥ' => 'ϥ',
'Ϧ' => 'ϧ',
'Ϩ' => 'ϩ',
'Ϫ' => 'ϫ',
'Ϭ' => 'ϭ',
'А' => 'а',
'Ӑ' => 'ӑ',
'Ӓ' => 'ӓ',
'Ә' => 'ә',
'Ӛ' => 'ӛ',
'Ӕ' => 'ӕ',
'В' => 'в',
'Г' => 'г',
'Ѓ' => 'ѓ',
'Ґ' => 'ґ',
'Ғ' => 'ғ',
'Ҕ' => 'ҕ',
'Ԁ' => 'ԁ',
'Ђ' => 'ђ',
'Ԃ' => 'ԃ',
'Ҙ' => 'ҙ',
'Е' => 'е',
'Ѐ' => 'ѐ',
'Ё' => 'ё',
'Є' => 'є',
'Ж' => 'ж',
'Ӂ' => 'ӂ',
'Ӝ' => 'ӝ',
'Җ' => 'җ',
'Ӟ' => 'ӟ',
'Ԅ' => 'ԅ',
'Ѕ' => 'ѕ',
'Ӡ' => 'ӡ',
'Ԇ' => 'ԇ',
'И' => 'и',
'Ӥ' => 'ӥ',
'Ӣ' => 'ӣ',
'Ҋ' => 'ҋ',
'І' => 'і',
'Ї' => 'ї',
'Й' => 'й',
'К' => 'к',
'Ќ' => 'ќ',
'Қ' => 'қ',
'Ӄ' => 'ӄ',
'Ҡ' => 'ҡ',
'Ҟ' => 'ҟ',
'Л' => 'л',
'Ӆ' => 'ӆ',
'Љ' => 'љ',
'Ԉ' => 'ԉ',
'М' => 'м',
'Ӎ' => 'ӎ',
'Ӊ' => 'ӊ',
'Ң' => 'ң',
'Ӈ' => 'ӈ',
'Ҥ' => 'ҥ',
'Њ' => 'њ',
'Ԋ' => 'ԋ',
'Ӧ' => 'ӧ',
'Ө' => 'ө',
'Ӫ' => 'ӫ',
'П' => 'п',
'Ҧ' => 'ҧ',
'Ҁ' => 'ҁ',
'Ҏ' => 'ҏ',
'С' => 'с',
'Ԍ' => 'ԍ',
'Ҫ' => 'ҫ',
'Т' => 'т',
'Ԏ' => 'ԏ',
'Ћ' => 'ћ',
'У' => 'у',
'Ў' => 'ў',
'Ӱ' => 'ӱ',
'Ӳ' => 'ӳ',
'Ӯ' => 'ӯ',
'Ұ' => 'ұ',
'Ѹ' => 'ѹ',
'Ф' => 'ф',
'Х' => 'х',
'Ҳ' => 'ҳ',
'Һ' => 'һ',
'Ѿ' => 'ѿ',
'Ѽ' => 'ѽ',
'Ѻ' => 'ѻ',
'Ц' => 'ц',
'Ҵ' => 'ҵ',
'Ч' => 'ч',
'Ҷ' => 'ҷ',
'Ӌ' => 'ӌ',
'Ҹ' => 'ҹ',
'Ҽ' => 'ҽ',
'Ҿ' => 'ҿ',
'Џ' => 'џ',
'Щ' => 'щ',
'Ъ' => 'ъ',
'Ы' => 'ы',
'Ӹ' => 'ӹ',
'Ь' => 'ь',
'Ҍ' => 'ҍ',
'Э' => 'э',
'Ӭ' => 'ӭ',
'Ю' => 'ю',
'Я' => 'я',
'Ѥ' => 'ѥ',
'Ѧ' => 'ѧ',
'Ѩ' => 'ѩ',
'Ѭ' => 'ѭ',
'Ѯ' => 'ѯ',
'Ѱ' => 'ѱ',
'Ѳ' => 'ѳ',
'Ѵ' => 'ѵ',
'Ҩ' => 'ҩ',
'Ա' => 'ա',
'Բ' => 'բ',
'Գ' => 'գ',
'Դ' => 'դ',
'Ե' => 'ե',
'Է' => 'է',
'Ը' => 'ը',
'Թ' => 'թ',
'Ժ' => 'ժ',
'Ի' => 'ի',
'Լ' => 'լ',
'Ծ' => 'ծ',
'Կ' => 'կ',
'Հ' => 'հ',
'Ձ' => 'ձ',
'Ղ' => 'ղ',
'Ճ' => 'ճ',
'Յ' => 'յ',
'Ն' => 'ն',
'Շ' => 'շ',
'Ո' => 'ո',
'Չ' => 'չ',
'Պ' => 'պ',
'Ռ' => 'ռ',
'Ս' => 'ս',
'Վ' => 'վ',
'Տ' => 'տ',
'Ր' => 'ր',
'Ց' => 'ց',
'Փ' => 'փ',
'Ք' => 'ք',
'Օ' => 'օ',
'Ֆ' => 'ֆ',
];

View File

@ -0,0 +1,116 @@
<?php
/**
* UTF-8 lookup table for lower case accented letters
*
* This lookup table defines replacements from the ASCII-7 range for accented
* characters. These are lower case letters only.
*
* @author Andreas Gohr <andi@splitbrain.org>
* @see \dokuwiki\Utf8\Clean::deaccent()
*/
return [
'á' => 'a',
'à' => 'a',
'ă' => 'a',
'â' => 'a',
'å' => 'a',
'ä' => 'ae',
'ã' => 'a',
'ą' => 'a',
'ā' => 'a',
'æ' => 'ae',
'ḃ' => 'b',
'ć' => 'c',
'ĉ' => 'c',
'č' => 'c',
'ċ' => 'c',
'ç' => 'c',
'ď' => 'd',
'ḋ' => 'd',
'đ' => 'd',
'ð' => 'dh',
'é' => 'e',
'è' => 'e',
'ĕ' => 'e',
'ê' => 'e',
'ě' => 'e',
'ë' => 'e',
'ė' => 'e',
'ę' => 'e',
'ē' => 'e',
'ḟ' => 'f',
'ƒ' => 'f',
'ğ' => 'g',
'ĝ' => 'g',
'ġ' => 'g',
'ģ' => 'g',
'ĥ' => 'h',
'ħ' => 'h',
'í' => 'i',
'ì' => 'i',
'î' => 'i',
'ï' => 'i',
'ĩ' => 'i',
'į' => 'i',
'ī' => 'i',
'ĵ' => 'j',
'ķ' => 'k',
'ĺ' => 'l',
'ľ' => 'l',
'ļ' => 'l',
'ł' => 'l',
'ṁ' => 'm',
'ń' => 'n',
'ň' => 'n',
'ñ' => 'n',
'ņ' => 'n',
'ó' => 'o',
'ò' => 'o',
'ô' => 'o',
'ö' => 'oe',
'ő' => 'o',
'õ' => 'o',
'ø' => 'o',
'ō' => 'o',
'ơ' => 'o',
'ṗ' => 'p',
'ŕ' => 'r',
'ř' => 'r',
'ŗ' => 'r',
'ś' => 's',
'ŝ' => 's',
'š' => 's',
'ṡ' => 's',
'ş' => 's',
'ș' => 's',
'ß' => 'ss',
'ť' => 't',
'ṫ' => 't',
'ţ' => 't',
'ț' => 't',
'ŧ' => 't',
'ú' => 'u',
'ù' => 'u',
'ŭ' => 'u',
'û' => 'u',
'ů' => 'u',
'ü' => 'ue',
'ű' => 'u',
'ũ' => 'u',
'ų' => 'u',
'ū' => 'u',
'ư' => 'u',
'ẃ' => 'w',
'ẁ' => 'w',
'ŵ' => 'w',
'ẅ' => 'w',
'ý' => 'y',
'ỳ' => 'y',
'ŷ' => 'y',
'ÿ' => 'y',
'ź' => 'z',
'ž' => 'z',
'ż' => 'z',
'þ' => 'th',
'µ' => 'u',
];

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,620 @@
<?php
/**
* UTF-8 array of common special characters
*
* This array should contain all special characters (not a letter or digit)
* defined in the various local charsets - it's not a complete list of non-alphanum
* characters in UTF-8. It's not perfect but should match most cases of special
* chars.
*
* The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
* These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
*
* @author Andreas Gohr <andi@splitbrain.org>
* @see \dokuwiki\Utf8\Clean::stripspecials()
*/
return [
0x1a, // 
0x1b, // 
0x1c, // 
0x1d, // 
0x1e, // 
0x1f, // 
0x20, // <space>
0x21, // !
0x22, // "
0x23, // #
0x24, // $
0x25, // %
0x26, // &
0x27, // '
0x28, // (
0x29, // )
0x2b, // +
0x2c, // ,
0x2f, // /
0x3b, // ;
0x3c, // <
0x3d, // =
0x3e, // >
0x3f, // ?
0x40, // @
0x5b, // [
0x5c, // \
0x5d, // ]
0x5e, // ^
0x60, // `
0x7b, // {
0x7c, // |
0x7d, // }
0x7e, // ~
0x7f, // 
0x80, // €
0x81, // 
0x82, // ‚
0x83, // ƒ
0x84, // „
0x85, // …
0x86, // †
0x87, // ‡
0x88, // ˆ
0x89, // ‰
0x8a, // Š
0x8b, // ‹
0x8c, // Œ
0x8d, // 
0x8e, // Ž
0x8f, // 
0x90, // 
0x91, // ‘
0x92, // ’
0x93, // “
0x94, // ”
0x95, // •
0x96, // –
0x97, // —
0x98, // ˜
0x99, // ™
0x9a, // š
0x9b, // ›
0x9c, // œ
0x9d, // 
0x9e, // ž
0x9f, // Ÿ
0xa0, //  
0xa1, // ¡
0xa2, // ¢
0xa3, // £
0xa4, // ¤
0xa5, // ¥
0xa6, // ¦
0xa7, // §
0xa8, // ¨
0xa9, // ©
0xaa, // ª
0xab, // «
0xac, // ¬
0xad, // ­
0xae, // ®
0xaf, // ¯
0xb0, // °
0xb1, // ±
0xb2, // ²
0xb3, // ³
0xb4, // ´
0xb5, // µ
0xb6, // ¶
0xb7, // ·
0xb8, // ¸
0xb9, // ¹
0xba, // º
0xbb, // »
0xbc, // ¼
0xbd, // ½
0xbe, // ¾
0xbf, // ¿
0xd7, // ×
0xf7, // ÷
0x2c7, // ˇ
0x2d8, // ˘
0x2d9, // ˙
0x2da, // ˚
0x2db, // ˛
0x2dc, // ˜
0x2dd, // ˝
0x300, // ̀
0x301, // ́
0x303, // ̃
0x309, // ̉
0x323, // ̣
0x384, // ΄
0x385, // ΅
0x387, // ·
0x3c6, // φ
0x3d1, // ϑ
0x3d2, // ϒ
0x3d5, // ϕ
0x3d6, // ϖ
0x5b0, // ְ
0x5b1, // ֱ
0x5b2, // ֲ
0x5b3, // ֳ
0x5b4, // ִ
0x5b5, // ֵ
0x5b6, // ֶ
0x5b7, // ַ
0x5b8, // ָ
0x5b9, // ֹ
0x5bb, // ֻ
0x5bc, // ּ
0x5bd, // ֽ
0x5be, // ־
0x5bf, // ֿ
0x5c0, // ׀
0x5c1, // ׁ
0x5c2, // ׂ
0x5c3, // ׃
0x5f3, // ׳
0x5f4, // ״
0x60c, // ،
0x61b, // ؛
0x61f, // ؟
0x640, // ـ
0x64b, // ً
0x64c, // ٌ
0x64d, // ٍ
0x64e, // َ
0x64f, // ُ
0x650, // ِ
0x651, // ّ
0x652, // ْ
0x66a, // ٪
0xe3f, // ฿
0x200c, //
0x200d, //
0x200e, //
0x200f, //
0x2013, //
0x2014, // —
0x2015, // ―
0x2017, // ‗
0x2018, //
0x2019, //
0x201a, //
0x201c, // “
0x201d, // ”
0x201e, // „
0x2020, // †
0x2021, // ‡
0x2022, // •
0x2026, // …
0x2030, // ‰
0x2032, //
0x2033, // ″
0x2039, //
0x203a, //
0x2044, //
0x20a7, // ₧
0x20aa, // ₪
0x20ab, // ₫
0x20ac, // €
0x2116, // №
0x2118, // ℘
0x2122, // ™
0x2126, // Ω
0x2135, // ℵ
0x2190, // ←
0x2191, // ↑
0x2192, // →
0x2193, // ↓
0x2194, // ↔
0x2195, // ↕
0x21b5, // ↵
0x21d0, // ⇐
0x21d1, // ⇑
0x21d2, // ⇒
0x21d3, // ⇓
0x21d4, // ⇔
0x2200, // ∀
0x2202, // ∂
0x2203, // ∃
0x2205, // ∅
0x2206, // ∆
0x2207, // ∇
0x2208, // ∈
0x2209, // ∉
0x220b, // ∋
0x220f, // ∏
0x2211, // ∑
0x2212, //
0x2215, //
0x2217, //
0x2219, // ∙
0x221a, // √
0x221d, // ∝
0x221e, // ∞
0x2220, // ∠
0x2227, // ∧
0x2228, //
0x2229, // ∩
0x222a, //
0x222b, // ∫
0x2234, // ∴
0x223c, //
0x2245, // ≅
0x2248, // ≈
0x2260, // ≠
0x2261, // ≡
0x2264, // ≤
0x2265, // ≥
0x2282, // ⊂
0x2283, // ⊃
0x2284, // ⊄
0x2286, // ⊆
0x2287, // ⊇
0x2295, // ⊕
0x2297, // ⊗
0x22a5, // ⊥
0x22c5, // ⋅
0x2310, // ⌐
0x2320, // ⌠
0x2321, // ⌡
0x2329, // 〈
0x232a, // 〉
0x2469, // ⑩
0x2500, // ─
0x2502, // │
0x250c, // ┌
0x2510, // ┐
0x2514, // └
0x2518, // ┘
0x251c, // ├
0x2524, // ┤
0x252c, // ┬
0x2534, // ┴
0x253c, // ┼
0x2550, // ═
0x2551, // ║
0x2552, // ╒
0x2553, // ╓
0x2554, // ╔
0x2555, // ╕
0x2556, // ╖
0x2557, // ╗
0x2558, // ╘
0x2559, // ╙
0x255a, // ╚
0x255b, // ╛
0x255c, // ╜
0x255d, // ╝
0x255e, // ╞
0x255f, // ╟
0x2560, // ╠
0x2561, // ╡
0x2562, // ╢
0x2563, // ╣
0x2564, // ╤
0x2565, // ╥
0x2566, // ╦
0x2567, // ╧
0x2568, // ╨
0x2569, // ╩
0x256a, // ╪
0x256b, // ╫
0x256c, // ╬
0x2580, // ▀
0x2584, // ▄
0x2588, // █
0x258c, // ▌
0x2590, // ▐
0x2591, // ░
0x2592, // ▒
0x2593, // ▓
0x25a0, // ■
0x25b2, // ▲
0x25bc, // ▼
0x25c6, // ◆
0x25ca, // ◊
0x25cf, // ●
0x25d7, // ◗
0x2605, // ★
0x260e, // ☎
0x261b, // ☛
0x261e, // ☞
0x2660, // ♠
0x2663, // ♣
0x2665, // ♥
0x2666, // ♦
0x2701, // ✁
0x2702, // ✂
0x2703, // ✃
0x2704, // ✄
0x2706, // ✆
0x2707, // ✇
0x2708, // ✈
0x2709, // ✉
0x270c, // ✌
0x270d, // ✍
0x270e, // ✎
0x270f, // ✏
0x2710, // ✐
0x2711, // ✑
0x2712, // ✒
0x2713, // ✓
0x2714, // ✔
0x2715, // ✕
0x2716, // ✖
0x2717, // ✗
0x2718, // ✘
0x2719, // ✙
0x271a, // ✚
0x271b, // ✛
0x271c, // ✜
0x271d, // ✝
0x271e, // ✞
0x271f, // ✟
0x2720, // ✠
0x2721, // ✡
0x2722, // ✢
0x2723, // ✣
0x2724, // ✤
0x2725, // ✥
0x2726, // ✦
0x2727, // ✧
0x2729, // ✩
0x272a, // ✪
0x272b, // ✫
0x272c, // ✬
0x272d, // ✭
0x272e, // ✮
0x272f, // ✯
0x2730, // ✰
0x2731, // ✱
0x2732, // ✲
0x2733, // ✳
0x2734, // ✴
0x2735, // ✵
0x2736, // ✶
0x2737, // ✷
0x2738, // ✸
0x2739, // ✹
0x273a, // ✺
0x273b, // ✻
0x273c, // ✼
0x273d, // ✽
0x273e, // ✾
0x273f, // ✿
0x2740, // ❀
0x2741, // ❁
0x2742, // ❂
0x2743, // ❃
0x2744, // ❄
0x2745, // ❅
0x2746, // ❆
0x2747, // ❇
0x2748, // ❈
0x2749, // ❉
0x274a, // ❊
0x274b, // ❋
0x274d, // ❍
0x274f, // ❏
0x2750, // ❐
0x2751, // ❑
0x2752, // ❒
0x2756, // ❖
0x2758, // ❘
0x2759, // ❙
0x275a, // ❚
0x275b, // ❛
0x275c, // ❜
0x275d, // ❝
0x275e, // ❞
0x2761, // ❡
0x2762, // ❢
0x2763, // ❣
0x2764, // ❤
0x2765, // ❥
0x2766, // ❦
0x2767, // ❧
0x277f, // ❿
0x2789, // ➉
0x2793, // ➓
0x2794, // ➔
0x2798, // ➘
0x2799, // ➙
0x279a, // ➚
0x279b, // ➛
0x279c, // ➜
0x279d, // ➝
0x279e, // ➞
0x279f, // ➟
0x27a0, // ➠
0x27a1, // ➡
0x27a2, // ➢
0x27a3, // ➣
0x27a4, // ➤
0x27a5, // ➥
0x27a6, // ➦
0x27a7, // ➧
0x27a8, // ➨
0x27a9, // ➩
0x27aa, // ➪
0x27ab, // ➫
0x27ac, // ➬
0x27ad, // ➭
0x27ae, // ➮
0x27af, // ➯
0x27b1, // ➱
0x27b2, // ➲
0x27b3, // ➳
0x27b4, // ➴
0x27b5, // ➵
0x27b6, // ➶
0x27b7, // ➷
0x27b8, // ➸
0x27b9, // ➹
0x27ba, // ➺
0x27bb, // ➻
0x27bc, // ➼
0x27bd, // ➽
0x27be, // ➾
0x3000, //  
0x3001, // 、
0x3002, // 。
0x3003, // 〃
0x3008, // 〈
0x3009, // 〉
0x300a, // 《
0x300b, // 》
0x300c, // 「
0x300d, // 」
0x300e, // 『
0x300f, // 』
0x3010, // 【
0x3011, // 】
0x3012, // 〒
0x3014, //
0x3015, //
0x3016, // 〖
0x3017, // 〗
0x3018, // 〘
0x3019, // 〙
0x301a, // 〚
0x301b, // 〛
0x3036, // 〶
0xf6d9, // 
0xf6da, // 
0xf6db, // 
0xf8d7, // 
0xf8d8, // 
0xf8d9, // 
0xf8da, // 
0xf8db, // 
0xf8dc, // 
0xf8dd, // 
0xf8de, // 
0xf8df, // 
0xf8e0, // 
0xf8e1, // 
0xf8e2, // 
0xf8e3, // 
0xf8e4, // 
0xf8e5, // 
0xf8e6, // 
0xf8e7, // 
0xf8e8, // 
0xf8e9, // 
0xf8ea, // 
0xf8eb, // 
0xf8ec, // 
0xf8ed, // 
0xf8ee, // 
0xf8ef, // 
0xf8f0, // 
0xf8f1, // 
0xf8f2, // 
0xf8f3, // 
0xf8f4, // 
0xf8f5, // 
0xf8f6, // 
0xf8f7, // 
0xf8f8, // 
0xf8f9, // 
0xf8fa, // 
0xf8fb, // 
0xf8fc, // 
0xf8fd, // 
0xf8fe, // 
0xfe7c, // ﹼ
0xfe7d, // ﹽ
0xff01, //
0xff02, //
0xff03, //
0xff04, //
0xff05, //
0xff06, //
0xff07, //
0xff08, //
0xff09, //
0xff09, //
0xff0a, //
0xff0b, //
0xff0c, //
0xff0d, //
0xff0e, //
0xff0f, //
0xff1a, //
0xff1b, //
0xff1c, //
0xff1d, //
0xff1e, //
0xff1f, //
0xff20, //
0xff3b, //
0xff3c, //
0xff3d, //
0xff3e, //
0xff40, //
0xff5b, //
0xff5c, //
0xff5d, //
0xff5e, //
0xff5f, // ⦅
0xff60, // ⦆
0xff61, // 。
0xff62, // 「
0xff63, // 」
0xff64, // 、
0xff65, // ・
0xffe0, // ¢
0xffe1, // £
0xffe2, // ¬
0xffe3, //  ̄
0xffe4, // ¦
0xffe5, // ¥
0xffe6, // ₩
0xffe8, //
0xffe9, // ←
0xffea, // ↑
0xffeb, // →
0xffec, // ↓
0xffed, // ■
0xffee, // ○
0x1d6fc, // 𝛼
0x1d6fd, // 𝛽
0x1d6fe, // 𝛾
0x1d6ff, // 𝛿
0x1d700, // 𝜀
0x1d701, // 𝜁
0x1d702, // 𝜂
0x1d703, // 𝜃
0x1d704, // 𝜄
0x1d705, // 𝜅
0x1d706, // 𝜆
0x1d707, // 𝜇
0x1d708, // 𝜈
0x1d709, // 𝜉
0x1d70a, // 𝜊
0x1d70b, // 𝜋
0x1d70c, // 𝜌
0x1d70d, // 𝜍
0x1d70e, // 𝜎
0x1d70f, // 𝜏
0x1d710, // 𝜐
0x1d711, // 𝜑
0x1d712, // 𝜒
0x1d713, // 𝜓
0x1d714, // 𝜔
0x1d715, // 𝜕
0x1d716, // 𝜖
0x1d717, // 𝜗
0x1d718, // 𝜘
0x1d719, // 𝜙
0x1d71a, // 𝜚
0x1d71b, // 𝜛
0xc2a0, // 슠
0xe28087, //
0xe280af, //
0xe281a0, //
0xefbbbf, //
];

View File

@ -0,0 +1,114 @@
<?php
/**
* UTF-8 lookup table for upper case accented letters
*
* This lookup table defines replacements from the ASCII-7 range for accented
* characters. These are upper case letters only.
*
* @author Andreas Gohr <andi@splitbrain.org>
* @see \dokuwiki\Utf8\Clean::deaccent()
*/
return [
'Á' => 'A',
'À' => 'A',
'Ă' => 'A',
'Â' => 'A',
'Å' => 'A',
'Ä' => 'Ae',
'Ã' => 'A',
'Ą' => 'A',
'Ā' => 'A',
'Æ' => 'Ae',
'Ḃ' => 'B',
'Ć' => 'C',
'Ĉ' => 'C',
'Č' => 'C',
'Ċ' => 'C',
'Ç' => 'C',
'Ď' => 'D',
'Ḋ' => 'D',
'Đ' => 'D',
'Ð' => 'Dh',
'É' => 'E',
'È' => 'E',
'Ĕ' => 'E',
'Ê' => 'E',
'Ě' => 'E',
'Ë' => 'E',
'Ė' => 'E',
'Ę' => 'E',
'Ē' => 'E',
'Ḟ' => 'F',
'Ƒ' => 'F',
'Ğ' => 'G',
'Ĝ' => 'G',
'Ġ' => 'G',
'Ģ' => 'G',
'Ĥ' => 'H',
'Ħ' => 'H',
'Í' => 'I',
'Ì' => 'I',
'Î' => 'I',
'Ï' => 'I',
'Ĩ' => 'I',
'Į' => 'I',
'Ī' => 'I',
'Ĵ' => 'J',
'Ķ' => 'K',
'Ĺ' => 'L',
'Ľ' => 'L',
'Ļ' => 'L',
'Ł' => 'L',
'Ṁ' => 'M',
'Ń' => 'N',
'Ň' => 'N',
'Ñ' => 'N',
'Ņ' => 'N',
'Ó' => 'O',
'Ò' => 'O',
'Ô' => 'O',
'Ö' => 'Oe',
'Ő' => 'O',
'Õ' => 'O',
'Ø' => 'O',
'Ō' => 'O',
'Ơ' => 'O',
'Ṗ' => 'P',
'Ŕ' => 'R',
'Ř' => 'R',
'Ŗ' => 'R',
'Ś' => 'S',
'Ŝ' => 'S',
'Š' => 'S',
'Ṡ' => 'S',
'Ş' => 'S',
'Ș' => 'S',
'Ť' => 'T',
'Ṫ' => 'T',
'Ţ' => 'T',
'Ț' => 'T',
'Ŧ' => 'T',
'Ú' => 'U',
'Ù' => 'U',
'Ŭ' => 'U',
'Û' => 'U',
'Ů' => 'U',
'Ü' => 'Ue',
'Ű' => 'U',
'Ũ' => 'U',
'Ų' => 'U',
'Ū' => 'U',
'Ư' => 'U',
'Ẃ' => 'W',
'Ẁ' => 'W',
'Ŵ' => 'W',
'Ẅ' => 'W',
'Ý' => 'Y',
'Ỳ' => 'Y',
'Ŷ' => 'Y',
'Ÿ' => 'Y',
'Ź' => 'Z',
'Ž' => 'Z',
'Ż' => 'Z',
'Þ' => 'Th',
];

View File

@ -518,7 +518,7 @@ function auth_isMember($memberlist, $user, array $groups) {
// clean user and groups
if(!$auth->isCaseSensitive()) {
$user = utf8_strtolower($user);
$user = \dokuwiki\Utf8\PhpString::strtolower($user);
$groups = array_map('utf8_strtolower', $groups);
}
$user = $auth->cleanUser($user);
@ -533,7 +533,7 @@ function auth_isMember($memberlist, $user, array $groups) {
// compare cleaned values
foreach($members as $member) {
if($member == '@ALL' ) return true;
if(!$auth->isCaseSensitive()) $member = utf8_strtolower($member);
if(!$auth->isCaseSensitive()) $member = \dokuwiki\Utf8\PhpString::strtolower($member);
if($member[0] == '@') {
$member = $auth->cleanGroup(substr($member, 1));
if(in_array($member, $groups)) return true;
@ -621,7 +621,7 @@ function auth_aclcheck_cb($data) {
}
if(!$auth->isCaseSensitive()) {
$user = utf8_strtolower($user);
$user = \dokuwiki\Utf8\PhpString::strtolower($user);
$groups = array_map('utf8_strtolower', $groups);
}
$user = auth_nameencode($auth->cleanUser($user));
@ -648,7 +648,7 @@ function auth_aclcheck_cb($data) {
$match = preg_replace('/#.*$/', '', $match); //ignore comments
$acl = preg_split('/[ \t]+/', $match);
if(!$auth->isCaseSensitive() && $acl[1] !== '@ALL') {
$acl[1] = utf8_strtolower($acl[1]);
$acl[1] = \dokuwiki\Utf8\PhpString::strtolower($acl[1]);
}
if(!in_array($acl[1], $groups)) {
continue;
@ -678,7 +678,7 @@ function auth_aclcheck_cb($data) {
$match = preg_replace('/#.*$/', '', $match); //ignore comments
$acl = preg_split('/[ \t]+/', $match);
if(!$auth->isCaseSensitive() && $acl[1] !== '@ALL') {
$acl[1] = utf8_strtolower($acl[1]);
$acl[1] = \dokuwiki\Utf8\PhpString::strtolower($acl[1]);
}
if(!in_array($acl[1], $groups)) {
continue;

View File

@ -93,7 +93,7 @@ function addLogEntry($date, $id, $type=DOKU_CHANGE_TYPE_EDIT, $summary='', $extr
'type' => str_replace($strip, '', $type),
'id' => $id,
'user' => $user,
'sum' => utf8_substr(str_replace($strip, '', $summary), 0, 255),
'sum' => \dokuwiki\Utf8\PhpString::substr(str_replace($strip, '', $summary), 0, 255),
'extra' => str_replace($strip, '', $extra),
'sizechange' => $sizechange
);
@ -180,7 +180,7 @@ function addMediaLogEntry(
'type' => str_replace($strip, '', $type),
'id' => $id,
'user' => $user,
'sum' => utf8_substr(str_replace($strip, '', $summary), 0, 255),
'sum' => \dokuwiki\Utf8\PhpString::substr(str_replace($strip, '', $summary), 0, 255),
'extra' => str_replace($strip, '', $extra),
'sizechange' => $sizechange
);

View File

@ -1017,7 +1017,7 @@ function cleanText($text) {
// if the text is not valid UTF-8 we simply assume latin1
// this won't break any worse than it breaks with the wrong encoding
// but might actually fix the problem in many cases
if(!utf8_check($text)) $text = utf8_encode($text);
if(!\dokuwiki\Utf8\Clean::isUtf8($text)) $text = utf8_encode($text);
return $text;
}
@ -1173,12 +1173,12 @@ function parsePageTemplate(&$data) {
utf8_ucwords(curNS($id)),
utf8_strtoupper(curNS($id)),
$file,
utf8_ucfirst($file),
utf8_strtoupper($file),
\dokuwiki\Utf8\PhpString::ucfirst($file),
\dokuwiki\Utf8\PhpString::strtoupper($file),
$page,
utf8_ucfirst($page),
utf8_ucwords($page),
utf8_strtoupper($page),
\dokuwiki\Utf8\PhpString::ucfirst($page),
\dokuwiki\Utf8\PhpString::ucwords($page),
\dokuwiki\Utf8\PhpString::strtoupper($page),
$INPUT->server->str('REMOTE_USER'),
$USERINFO['name'],
$USERINFO['mail'],
@ -1741,12 +1741,12 @@ function preg_quote_cb($string) {
* @return string
*/
function shorten($keep, $short, $max, $min = 9, $char = '…') {
$max = $max - utf8_strlen($keep);
$max = $max - \dokuwiki\Utf8\PhpString::strlen($keep);
if($max < $min) return $keep;
$len = utf8_strlen($short);
$len = \dokuwiki\Utf8\PhpString::strlen($short);
if($len <= $max) return $keep.$short;
$half = floor($max / 2);
return $keep.utf8_substr($short, 0, $half - 1).$char.utf8_substr($short, $len - $half);
return $keep.\dokuwiki\Utf8\PhpString::substr($short, 0, $half - 1).$char.\dokuwiki\Utf8\PhpString::substr($short, $len - $half);
}
/**

View File

@ -69,9 +69,9 @@ function sendFile($file, $mime, $dl, $cache, $public = false, $orig = null) {
//download or display?
if($dl) {
header('Content-Disposition: attachment;'.rfc2231_encode('filename', utf8_basename($orig)).';');
header('Content-Disposition: attachment;'.rfc2231_encode('filename', \dokuwiki\Utf8\PhpString::basename($orig)).';');
} else {
header('Content-Disposition: inline;'.rfc2231_encode('filename', utf8_basename($orig)).';');
header('Content-Disposition: inline;'.rfc2231_encode('filename', \dokuwiki\Utf8\PhpString::basename($orig)).';');
}
//use x-sendfile header to pass the delivery to compatible webservers

View File

@ -97,7 +97,7 @@ function _ft_pageSearch(&$data) {
);
$evt = new Event('FULLTEXT_PHRASE_MATCH',$evdata);
if ($evt->advise_before() && $evt->result !== true) {
$text = utf8_strtolower($evdata['text']);
$text = \dokuwiki\Utf8\PhpString::strtolower($evdata['text']);
if (strpos($text, $phrase) !== false) {
$evt->result = true;
}
@ -412,7 +412,7 @@ function ft_snippet($id,$highlight){
$match = array();
$snippets = array();
$utf8_offset = $offset = $end = 0;
$len = utf8_strlen($text);
$len = \dokuwiki\Utf8\PhpString::strlen($text);
// build a regexp from the phrases to highlight
$re1 = '(' .
@ -442,8 +442,8 @@ function ft_snippet($id,$highlight){
list($str,$idx) = $match[0];
// convert $idx (a byte offset) into a utf8 character offset
$utf8_idx = utf8_strlen(substr($text,0,$idx));
$utf8_len = utf8_strlen($str);
$utf8_idx = \dokuwiki\Utf8\PhpString::strlen(substr($text,0,$idx));
$utf8_len = \dokuwiki\Utf8\PhpString::strlen($str);
// establish context, 100 bytes surrounding the match string
// first look to see if we can go 100 either side,
@ -472,9 +472,9 @@ function ft_snippet($id,$highlight){
$end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context
if ($append) {
$snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
$snippets[count($snippets)-1] .= \dokuwiki\Utf8\PhpString::substr($text,$append,$end-$append);
} else {
$snippets[] = utf8_substr($text,$start,$end-$start);
$snippets[] = \dokuwiki\Utf8\PhpString::substr($text,$start,$end-$start);
}
// set $offset for next match attempt
@ -483,8 +483,8 @@ function ft_snippet($id,$highlight){
// this prevents further matching of this snippet but for possible matches of length
// smaller than match length + context (at least 50 characters) this match is part of the context
$utf8_offset = $utf8_idx + $utf8_len;
$offset = $idx + strlen(utf8_substr($text,$utf8_idx,$utf8_len));
$offset = utf8_correctIdx($text,$offset);
$offset = $idx + strlen(\dokuwiki\Utf8\PhpString::substr($text,$utf8_idx,$utf8_len));
$offset = \dokuwiki\Utf8\Clean::correctIdx($text,$offset);
}
$m = "\1";
@ -674,7 +674,7 @@ function ft_queryParser($Indexer, $query){
*/
$parsed_query = '';
$parens_level = 0;
$terms = preg_split('/(-?".*?")/u', utf8_strtolower($query), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
$terms = preg_split('/(-?".*?")/u', \dokuwiki\Utf8\PhpString::strtolower($query), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
foreach ($terms as $term) {
$parsed = '';

View File

@ -355,7 +355,7 @@ function html_hilight($html,$phrases){
$regex = join('|',$phrases);
if ($regex === '') return $html;
if (!utf8_check($regex)) return $html;
if (!\dokuwiki\Utf8\Clean::isUtf8($regex)) return $html;
$html = @preg_replace_callback("/((<[^>]*)|$regex)/ui",'html_hilight_callback',$html);
return $html;
}

View File

@ -605,12 +605,12 @@ class Doku_Indexer {
)
);
if (preg_match('/[^0-9A-Za-z ]/u', $text))
$text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);
$text = \dokuwiki\Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
$wordlist = explode(' ', $text);
foreach ($wordlist as $i => $word) {
$wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
utf8_strtolower($word) : strtolower($word);
\dokuwiki\Utf8\PhpString::strtolower($word) : strtolower($word);
}
foreach ($wordlist as $i => $word) {
@ -1603,7 +1603,7 @@ function idx_indexLengths($filter) {
* @return string
*/
function idx_cleanName($name) {
$name = utf8_romanize(trim((string)$name));
$name = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
$name = preg_replace('#[ \./\\:-]+#', '_', $name);
$name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
return strtolower($name);

View File

@ -342,7 +342,7 @@ function msg($message,$lvl=0,$line='',$file='',$allow=MSG_PUBLIC){
$errors[1] = 'success';
$errors[2] = 'notify';
if($line || $file) $message.=' ['.utf8_basename($file).':'.$line.']';
if($line || $file) $message.=' ['.\dokuwiki\Utf8\PhpString::basename($file).':'.$line.']';
if(!isset($MSG)) $MSG = array();
$MSG[]=array('lvl' => $errors[$lvl], 'msg' => $message, 'allow' => $allow);

View File

@ -623,7 +623,7 @@ function io_download($url,$file,$useAttachment=false,$defaultName='',$maxSize=20
if (is_string($content_disposition) &&
preg_match('/attachment;\s*filename\s*=\s*"([^"]*)"/i', $content_disposition, $match)) {
$name = utf8_basename($match[1]);
$name = \dokuwiki\Utf8\PhpString::basename($match[1]);
}
}

View File

@ -134,11 +134,11 @@ function _mail_send_action($data) {
// end additional code to support event ... original mail_send() code from here
if(defined('MAILHEADER_ASCIIONLY')){
$subject = utf8_deaccent($subject);
$subject = utf8_strip($subject);
$subject = \dokuwiki\Utf8\Clean::deaccent($subject);
$subject = \dokuwiki\Utf8\Clean::strip($subject);
}
if(!utf8_isASCII($subject)) {
if(!\dokuwiki\Utf8\Clean::isASCII($subject)) {
$enc_subj = '=?UTF-8?Q?'.mail_quotedprintable_encode($subject,0).'?=';
// Spaces must be encoded according to rfc2047. Use the "_" shorthand
$enc_subj = preg_replace('/ /', '_', $enc_subj);
@ -212,7 +212,7 @@ function mail_encode_address($string,$header='',$names=true){
}
// FIXME: is there a way to encode the localpart of a emailaddress?
if(!utf8_isASCII($addr)){
if(!\dokuwiki\Utf8\Clean::isASCII($addr)){
msg(hsc("E-Mail address <$addr> is not ASCII"),-1);
continue;
}
@ -228,11 +228,11 @@ function mail_encode_address($string,$header='',$names=true){
$addr = "<$addr>";
if(defined('MAILHEADER_ASCIIONLY')){
$text = utf8_deaccent($text);
$text = utf8_strip($text);
$text = \dokuwiki\Utf8\Clean::deaccent($text);
$text = \dokuwiki\Utf8\Clean::strip($text);
}
if(!utf8_isASCII($text)){
if(!\dokuwiki\Utf8\Clean::isASCII($text)){
// put the quotes outside as in =?UTF-8?Q?"Elan Ruusam=C3=A4e"?= vs "=?UTF-8?Q?Elan Ruusam=C3=A4e?="
if (preg_match('/^"(.+)"$/', $text, $matches)) {
$text = '"=?UTF-8?Q?'.mail_quotedprintable_encode($matches[1], 0).'?="';

View File

@ -261,7 +261,7 @@ function media_delete($id,$auth){
// trigger an event - MEDIA_DELETE_FILE
$data = array();
$data['id'] = $id;
$data['name'] = utf8_basename($file);
$data['name'] = \dokuwiki\Utf8\PhpString::basename($file);
$data['path'] = $file;
$data['size'] = (file_exists($file)) ? filesize($file) : 0;
@ -1762,7 +1762,7 @@ function media_printimgdetail($item, $fullscreen=false){
$d = $item['meta']->getField(array('IPTC.Caption','EXIF.UserComment',
'EXIF.TIFFImageDescription',
'EXIF.TIFFUserComment'));
if(utf8_strlen($d) > 250) $d = utf8_substr($d,0,250).'...';
if(\dokuwiki\Utf8\PhpString::strlen($d) > 250) $d = \dokuwiki\Utf8\PhpString::substr($d,0,250).'...';
$k = $item['meta']->getField(array('IPTC.Keywords','IPTC.Category','xmp.dc:subject'));
// print EXIF/IPTC data

View File

@ -44,7 +44,7 @@ function getID($param='id',$clean=true){
if($param != 'id') {
$relpath = 'lib/exe/';
}
$script = $conf['basedir'].$relpath.utf8_basename($INPUT->server->str('SCRIPT_FILENAME'));
$script = $conf['basedir'].$relpath.\dokuwiki\Utf8\PhpString::basename($INPUT->server->str('SCRIPT_FILENAME'));
}elseif($INPUT->server->str('PATH_INFO')){
$request = $INPUT->server->str('PATH_INFO');
@ -127,7 +127,7 @@ function cleanID($raw_id,$ascii=false){
$sepcharpat = '#\\'.$sepchar.'+#';
$id = trim((string)$raw_id);
$id = utf8_strtolower($id);
$id = \dokuwiki\Utf8\PhpString::strtolower($id);
//alternative namespace seperator
if($conf['useslash']){
@ -136,13 +136,13 @@ function cleanID($raw_id,$ascii=false){
$id = strtr($id,';/',':'.$sepchar);
}
if($conf['deaccent'] == 2 || $ascii) $id = utf8_romanize($id);
if($conf['deaccent'] || $ascii) $id = utf8_deaccent($id,-1);
if($conf['deaccent'] == 2 || $ascii) $id = \dokuwiki\Utf8\Clean::romanize($id);
if($conf['deaccent'] || $ascii) $id = \dokuwiki\Utf8\Clean::deaccent($id,-1);
//remove specials
$id = utf8_stripspecials($id,$sepchar,'\*');
$id = \dokuwiki\Utf8\Clean::stripspecials($id,$sepchar,'\*');
if($ascii) $id = utf8_strip($id);
if($ascii) $id = \dokuwiki\Utf8\Clean::strip($id);
//clean up
$id = preg_replace($sepcharpat,$sepchar,$id);

View File

@ -21,8 +21,8 @@ class Doku_Renderer_code extends Doku_Renderer {
if(!$language) $language = 'txt';
$language = preg_replace(PREG_PATTERN_VALID_LANGUAGE, '', $language);
if(!$filename) $filename = 'snippet.'.$language;
$filename = utf8_basename($filename);
$filename = utf8_stripspecials($filename, '_');
$filename = \dokuwiki\Utf8\PhpString::basename($filename);
$filename = \dokuwiki\Utf8\Clean::stripspecials($filename, '_');
// send CRLF to Windows clients
if(strpos($INPUT->server->str('HTTP_USER_AGENT'), 'Windows') !== false) {

View File

@ -93,7 +93,7 @@ class Doku_Renderer_metadata extends Doku_Renderer
// cut off too long abstracts
$this->doc = trim($this->doc);
if (strlen($this->doc) > self::ABSTRACT_MAX) {
$this->doc = utf8_substr($this->doc, 0, self::ABSTRACT_MAX).'…';
$this->doc = \dokuwiki\Utf8\PhpString::substr($this->doc, 0, self::ABSTRACT_MAX).'…';
}
$this->meta['description']['abstract'] = $this->doc;
}

View File

@ -1639,7 +1639,7 @@ class Doku_Renderer_xhtml extends Doku_Renderer {
// return the title of the picture
if(!$title) {
// just show the sourcename
$title = $this->_xmlEntities(utf8_basename(noNS($src)));
$title = $this->_xmlEntities(\dokuwiki\Utf8\PhpString::basename(noNS($src)));
}
return $title;
}
@ -1675,7 +1675,7 @@ class Doku_Renderer_xhtml extends Doku_Renderer {
if(!$render) {
// if the file is not supposed to be rendered
// return the title of the file (just the sourcename if there is no title)
return $title ? $title : $this->_xmlEntities(utf8_basename(noNS($src)));
return $title ? $title : $this->_xmlEntities(\dokuwiki\Utf8\PhpString::basename(noNS($src)));
}
$att = array();
@ -1699,7 +1699,7 @@ class Doku_Renderer_xhtml extends Doku_Renderer {
// return the title of the flash
if(!$title) {
// just show the sourcename
$title = utf8_basename(noNS($src));
$title = \dokuwiki\Utf8\PhpString::basename(noNS($src));
}
return $this->_xmlEntities($title);
}
@ -1720,7 +1720,7 @@ class Doku_Renderer_xhtml extends Doku_Renderer {
$ret .= $this->_xmlEntities($title);
} else {
// just show the sourcename
$ret .= $this->_xmlEntities(utf8_basename(noNS($src)));
$ret .= $this->_xmlEntities(\dokuwiki\Utf8\PhpString::basename(noNS($src)));
}
return $ret;
@ -1882,7 +1882,7 @@ class Doku_Renderer_xhtml extends Doku_Renderer {
$url = ml($file, '', true, '&');
$linkType = 'internalmedia';
}
$title = $atts['title'] ? $atts['title'] : $this->_xmlEntities(utf8_basename(noNS($file)));
$title = $atts['title'] ? $atts['title'] : $this->_xmlEntities(\dokuwiki\Utf8\PhpString::basename(noNS($file)));
$out .= '<source src="'.hsc($url).'" type="'.$mime.'" />'.NL;
// alternative content (just a link to the file)
@ -1949,7 +1949,7 @@ class Doku_Renderer_xhtml extends Doku_Renderer {
$url = ml($file, '', true, '&');
$linkType = 'internalmedia';
}
$title = $atts['title'] ? $atts['title'] : $this->_xmlEntities(utf8_basename(noNS($file)));
$title = $atts['title'] ? $atts['title'] : $this->_xmlEntities(\dokuwiki\Utf8\PhpString::basename(noNS($file)));
$out .= '<source src="'.hsc($url).'" type="'.$mime.'" />'.NL;
// alternative content (just a link to the file)

View File

@ -211,7 +211,7 @@ function search_media(&$data,$base,$file,$type,$lvl,$opts){
return false;
}
$info['file'] = utf8_basename($file);
$info['file'] = \dokuwiki\Utf8\PhpString::basename($file);
$info['size'] = filesize($base.'/'.$file);
$info['mtime'] = filemtime($base.'/'.$file);
$info['writable'] = is_writable($base.'/'.$file);
@ -497,7 +497,7 @@ function search_universal(&$data,$base,$file,$type,$lvl,$opts){
$item['open'] = $return;
if(!empty($opts['meta'])){
$item['file'] = utf8_basename($file);
$item['file'] = \dokuwiki\Utf8\PhpString::basename($file);
$item['size'] = filesize($base.'/'.$file);
$item['mtime'] = filemtime($base.'/'.$file);
$item['rev'] = $item['mtime'];

File diff suppressed because it is too large Load Diff

View File

@ -176,7 +176,7 @@ function js_load($file){
// is it a include_once?
if($match[1]){
$base = utf8_basename($ifile);
$base = \dokuwiki\Utf8\PhpString::basename($ifile);
if(array_key_exists($base, $loaded) && $loaded[$base] === true){
$data = str_replace($match[0], '' ,$data);
continue;

View File

@ -101,7 +101,7 @@ class auth_plugin_authad extends DokuWiki_Auth_Plugin
// make sure the right encoding is used
if ($this->getConf('sso_charset')) {
$_SERVER['REMOTE_USER'] = iconv($this->getConf('sso_charset'), 'UTF-8', $_SERVER['REMOTE_USER']);
} elseif (!utf8_check($_SERVER['REMOTE_USER'])) {
} elseif (!\dokuwiki\Utf8\Clean::isUtf8($_SERVER['REMOTE_USER'])) {
$_SERVER['REMOTE_USER'] = utf8_encode($_SERVER['REMOTE_USER']);
}
@ -297,7 +297,7 @@ class auth_plugin_authad extends DokuWiki_Auth_Plugin
$group = str_replace('\\', '', $group);
$group = str_replace('#', '', $group);
$group = preg_replace('[\s]', '_', $group);
$group = utf8_strtolower(trim($group));
$group = \dokuwiki\Utf8\PhpString::strtolower(trim($group));
return $group;
}
@ -322,8 +322,8 @@ class auth_plugin_authad extends DokuWiki_Auth_Plugin
if ($dom) $domain = $dom;
// clean up both
$domain = utf8_strtolower(trim($domain));
$user = utf8_strtolower(trim($user));
$domain = \dokuwiki\Utf8\PhpString::strtolower(trim($domain));
$user = \dokuwiki\Utf8\PhpString::strtolower(trim($user));
// is this a known, valid domain? if not discard
if (!is_array($this->conf[$domain])) {

View File

@ -913,7 +913,7 @@ class helper_plugin_extension_extension extends DokuWiki_Plugin
if (is_string($content_disposition) &&
preg_match('/attachment;\s*filename\s*=\s*"([^"]*)"/i', $content_disposition, $match)) {
$name = utf8_basename($match[1]);
$name = \dokuwiki\Utf8\PhpString::basename($match[1]);
}
}
@ -953,7 +953,7 @@ class helper_plugin_extension_extension extends DokuWiki_Plugin
if (is_null($file)) {
$file = md5($url);
} else {
$file = utf8_basename($file);
$file = \dokuwiki\Utf8\PhpString::basename($file);
}
// create tmp directory for download

View File

@ -1080,7 +1080,7 @@ class admin_plugin_usermanager extends DokuWiki_Admin_Plugin
$fd = fopen($_FILES['import']['tmp_name'], 'r');
if ($fd) {
while ($csv = fgets($fd)) {
if (!utf8_check($csv)) {
if (!\dokuwiki\Utf8\Clean::isUtf8($csv)) {
$csv = utf8_encode($csv);
}
$raw = str_getcsv($csv);