384 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			384 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| namespace dokuwiki\Utf8;
 | |
| 
 | |
| /**
 | |
|  * UTF-8 aware equivalents to PHP's string functions
 | |
|  */
 | |
| class PhpString
 | |
| {
 | |
| 
 | |
|     /**
 | |
|      * A locale independent basename() implementation
 | |
|      *
 | |
|      * works around a bug in PHP's basename() implementation
 | |
|      *
 | |
|      * @param string $path A path
 | |
|      * @param string $suffix If the name component ends in suffix this will also be cut off
 | |
|      * @return string
 | |
|      * @link   https://bugs.php.net/bug.php?id=37738
 | |
|      *
 | |
|      * @see basename()
 | |
|      */
 | |
|     public static function basename($path, $suffix = '')
 | |
|     {
 | |
|         $path = trim($path, '\\/');
 | |
|         $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
 | |
|         if ($rpos) {
 | |
|             $path = substr($path, $rpos + 1);
 | |
|         }
 | |
| 
 | |
|         $suflen = strlen($suffix);
 | |
|         if ($suflen && (substr($path, -$suflen) === $suffix)) {
 | |
|             $path = substr($path, 0, -$suflen);
 | |
|         }
 | |
| 
 | |
|         return $path;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Unicode aware replacement for strlen()
 | |
|      *
 | |
|      * utf8_decode() converts characters that are not in ISO-8859-1
 | |
|      * to '?', which, for the purpose of counting, is alright - It's
 | |
|      * even faster than mb_strlen.
 | |
|      *
 | |
|      * @param string $string
 | |
|      * @return int
 | |
|      * @see    utf8_decode()
 | |
|      *
 | |
|      * @author <chernyshevsky at hotmail dot com>
 | |
|      * @see    strlen()
 | |
|      */
 | |
|     public static function strlen($string)
 | |
|     {
 | |
|         if (function_exists('utf8_decode')) {
 | |
|             return strlen(utf8_decode($string));
 | |
|         }
 | |
| 
 | |
|         if (UTF8_MBSTRING) {
 | |
|             return mb_strlen($string, 'UTF-8');
 | |
|         }
 | |
| 
 | |
|         if (function_exists('iconv_strlen')) {
 | |
|             return iconv_strlen($string, 'UTF-8');
 | |
|         }
 | |
| 
 | |
|         return strlen($string);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * UTF-8 aware alternative to substr
 | |
|      *
 | |
|      * Return part of a string given character offset (and optionally length)
 | |
|      *
 | |
|      * @param string $str
 | |
|      * @param int $offset number of UTF-8 characters offset (from left)
 | |
|      * @param int $length (optional) length in UTF-8 characters from offset
 | |
|      * @return string
 | |
|      * @author Harry Fuecks <hfuecks@gmail.com>
 | |
|      * @author Chris Smith <chris@jalakai.co.uk>
 | |
|      *
 | |
|      */
 | |
|     public static function substr($str, $offset, $length = null)
 | |
|     {
 | |
|         if (UTF8_MBSTRING) {
 | |
|             if ($length === null) {
 | |
|                 return mb_substr($str, $offset);
 | |
|             }
 | |
| 
 | |
|             return mb_substr($str, $offset, $length);
 | |
|         }
 | |
| 
 | |
|         /*
 | |
|          * Notes:
 | |
|          *
 | |
|          * no mb string support, so we'll use pcre regex's with 'u' flag
 | |
|          * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
 | |
|          * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
 | |
|          *
 | |
|          * substr documentation states false can be returned in some cases (e.g. offset > string length)
 | |
|          * mb_substr never returns false, it will return an empty string instead.
 | |
|          *
 | |
|          * calculating the number of characters in the string is a relatively expensive operation, so
 | |
|          * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
 | |
|          */
 | |
| 
 | |
|         // cast parameters to appropriate types to avoid multiple notices/warnings
 | |
|         $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
 | |
|         $offset = (int)$offset;
 | |
|         if ($length !== null) $length = (int)$length;
 | |
| 
 | |
|         // handle trivial cases
 | |
|         if ($length === 0) return '';
 | |
|         if ($offset < 0 && $length < 0 && $length < $offset) return '';
 | |
| 
 | |
|         $offset_pattern = '';
 | |
|         $length_pattern = '';
 | |
| 
 | |
|         // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
 | |
|         if ($offset < 0) {
 | |
|             $strlen = self::strlen($str);        // see notes
 | |
|             $offset = $strlen + $offset;
 | |
|             if ($offset < 0) $offset = 0;
 | |
|         }
 | |
| 
 | |
|         // establish a pattern for offset, a non-captured group equal in length to offset
 | |
|         if ($offset > 0) {
 | |
|             $Ox = (int)($offset / 65535);
 | |
|             $Oy = $offset % 65535;
 | |
| 
 | |
|             if ($Ox) $offset_pattern = '(?:.{65535}){' . $Ox . '}';
 | |
|             $offset_pattern = '^(?:' . $offset_pattern . '.{' . $Oy . '})';
 | |
|         } else {
 | |
|             $offset_pattern = '^';                      // offset == 0; just anchor the pattern
 | |
|         }
 | |
| 
 | |
|         // establish a pattern for length
 | |
|         if ($length === null) {
 | |
|             $length_pattern = '(.*)$';                  // the rest of the string
 | |
|         } else {
 | |
| 
 | |
|             if (!isset($strlen)) $strlen = self::strlen($str);    // see notes
 | |
|             if ($offset > $strlen) return '';           // another trivial case
 | |
| 
 | |
|             if ($length > 0) {
 | |
| 
 | |
|                 // reduce any length that would go past the end of the string
 | |
|                 $length = min($strlen - $offset, $length);
 | |
| 
 | |
|                 $Lx = (int)($length / 65535);
 | |
|                 $Ly = $length % 65535;
 | |
| 
 | |
|                 // +ve length requires ... a captured group of length characters
 | |
|                 if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
 | |
|                 $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
 | |
| 
 | |
|             } else if ($length < 0) {
 | |
| 
 | |
|                 if ($length < ($offset - $strlen)) return '';
 | |
| 
 | |
|                 $Lx = (int)((-$length) / 65535);
 | |
|                 $Ly = (-$length) % 65535;
 | |
| 
 | |
|                 // -ve length requires ... capture everything except a group of -length characters
 | |
|                 //                         anchored at the tail-end of the string
 | |
|                 if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
 | |
|                 $length_pattern = '(.*)(?:' . $length_pattern . '.{' . $Ly . '})$';
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
 | |
|         return $match[1];
 | |
|     }
 | |
| 
 | |
|     // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
 | |
|     /**
 | |
|      * Unicode aware replacement for substr_replace()
 | |
|      *
 | |
|      * @param string $string input string
 | |
|      * @param string $replacement the replacement
 | |
|      * @param int $start the replacing will begin at the start'th offset into string.
 | |
|      * @param int $length If given and is positive, it represents the length of the portion of string which is
 | |
|      *                            to be replaced. If length is zero then this function will have the effect of inserting
 | |
|      *                            replacement into string at the given start offset.
 | |
|      * @return string
 | |
|      * @see    substr_replace()
 | |
|      *
 | |
|      * @author Andreas Gohr <andi@splitbrain.org>
 | |
|      */
 | |
|     public static function substr_replace($string, $replacement, $start, $length = 0)
 | |
|     {
 | |
|         $ret = '';
 | |
|         if ($start > 0) $ret .= self::substr($string, 0, $start);
 | |
|         $ret .= $replacement;
 | |
|         $ret .= self::substr($string, $start + $length);
 | |
|         return $ret;
 | |
|     }
 | |
|     // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
 | |
| 
 | |
|     /**
 | |
|      * Unicode aware replacement for ltrim()
 | |
|      *
 | |
|      * @param string $str
 | |
|      * @param string $charlist
 | |
|      * @return string
 | |
|      * @see    ltrim()
 | |
|      *
 | |
|      * @author Andreas Gohr <andi@splitbrain.org>
 | |
|      */
 | |
|     public static function ltrim($str, $charlist = '')
 | |
|     {
 | |
|         if ($charlist === '') return ltrim($str);
 | |
| 
 | |
|         //quote charlist for use in a characterclass
 | |
|         $charlist = preg_replace('!([\\\\\\-\\]\\[/])!', '\\\${1}', $charlist);
 | |
| 
 | |
|         return preg_replace('/^[' . $charlist . ']+/u', '', $str);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Unicode aware replacement for rtrim()
 | |
|      *
 | |
|      * @param string $str
 | |
|      * @param string $charlist
 | |
|      * @return string
 | |
|      * @see    rtrim()
 | |
|      *
 | |
|      * @author Andreas Gohr <andi@splitbrain.org>
 | |
|      */
 | |
|     public static function rtrim($str, $charlist = '')
 | |
|     {
 | |
|         if ($charlist === '') return rtrim($str);
 | |
| 
 | |
|         //quote charlist for use in a characterclass
 | |
|         $charlist = preg_replace('!([\\\\\\-\\]\\[/])!', '\\\${1}', $charlist);
 | |
| 
 | |
|         return preg_replace('/[' . $charlist . ']+$/u', '', $str);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Unicode aware replacement for trim()
 | |
|      *
 | |
|      * @param string $str
 | |
|      * @param string $charlist
 | |
|      * @return string
 | |
|      * @see    trim()
 | |
|      *
 | |
|      * @author Andreas Gohr <andi@splitbrain.org>
 | |
|      */
 | |
|     public static function trim($str, $charlist = '')
 | |
|     {
 | |
|         if ($charlist === '') return trim($str);
 | |
| 
 | |
|         return self::ltrim(self::rtrim($str, $charlist), $charlist);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * This is a unicode aware replacement for strtolower()
 | |
|      *
 | |
|      * Uses mb_string extension if available
 | |
|      *
 | |
|      * @param string $string
 | |
|      * @return string
 | |
|      * @see    \dokuwiki\Utf8\PhpString::strtoupper()
 | |
|      *
 | |
|      * @author Leo Feyer <leo@typolight.org>
 | |
|      * @see    strtolower()
 | |
|      */
 | |
|     public static function strtolower($string)
 | |
|     {
 | |
|         if (UTF8_MBSTRING) {
 | |
|             if (class_exists('Normalizer', $autoload = false)) {
 | |
|                 return \Normalizer::normalize(mb_strtolower($string, 'utf-8'));
 | |
|             }
 | |
|             return (mb_strtolower($string, 'utf-8'));
 | |
|         }
 | |
|         return strtr($string, Table::upperCaseToLowerCase());
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * This is a unicode aware replacement for strtoupper()
 | |
|      *
 | |
|      * Uses mb_string extension if available
 | |
|      *
 | |
|      * @param string $string
 | |
|      * @return string
 | |
|      * @see    \dokuwiki\Utf8\PhpString::strtoupper()
 | |
|      *
 | |
|      * @author Leo Feyer <leo@typolight.org>
 | |
|      * @see    strtoupper()
 | |
|      */
 | |
|     public static function strtoupper($string)
 | |
|     {
 | |
|         if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');
 | |
| 
 | |
|         return strtr($string, Table::lowerCaseToUpperCase());
 | |
|     }
 | |
| 
 | |
| 
 | |
|     /**
 | |
|      * UTF-8 aware alternative to ucfirst
 | |
|      * Make a string's first character uppercase
 | |
|      *
 | |
|      * @param string $str
 | |
|      * @return string with first character as upper case (if applicable)
 | |
|      * @author Harry Fuecks
 | |
|      *
 | |
|      */
 | |
|     public static function ucfirst($str)
 | |
|     {
 | |
|         switch (self::strlen($str)) {
 | |
|             case 0:
 | |
|                 return '';
 | |
|             case 1:
 | |
|                 return self::strtoupper($str);
 | |
|             default:
 | |
|                 preg_match('/^(.{1})(.*)$/us', $str, $matches);
 | |
|                 return self::strtoupper($matches[1]) . $matches[2];
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * UTF-8 aware alternative to ucwords
 | |
|      * Uppercase the first character of each word in a string
 | |
|      *
 | |
|      * @param string $str
 | |
|      * @return string with first char of each word uppercase
 | |
|      * @author Harry Fuecks
 | |
|      * @see http://php.net/ucwords
 | |
|      *
 | |
|      */
 | |
|     public static function ucwords($str)
 | |
|     {
 | |
|         // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
 | |
|         // form feeds, horizontal tabs, vertical tabs, linefeeds and carriage returns
 | |
|         // This corresponds to the definition of a "word" defined at http://php.net/ucwords
 | |
|         $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';
 | |
| 
 | |
|         return preg_replace_callback(
 | |
|             $pattern,
 | |
|             function ($matches) {
 | |
|                 $leadingws = $matches[2];
 | |
|                 $ucfirst = self::strtoupper($matches[3]);
 | |
|                 $ucword = self::substr_replace(ltrim($matches[0]), $ucfirst, 0, 1);
 | |
|                 return $leadingws . $ucword;
 | |
|             },
 | |
|             $str
 | |
|         );
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * This is an Unicode aware replacement for strpos
 | |
|      *
 | |
|      * @param string $haystack
 | |
|      * @param string $needle
 | |
|      * @param integer $offset
 | |
|      * @return integer
 | |
|      * @author Leo Feyer <leo@typolight.org>
 | |
|      * @see    strpos()
 | |
|      *
 | |
|      */
 | |
|     public static function strpos($haystack, $needle, $offset = 0)
 | |
|     {
 | |
|         $comp = 0;
 | |
|         $length = null;
 | |
| 
 | |
|         while ($length === null || $length < $offset) {
 | |
|             $pos = strpos($haystack, $needle, $offset + $comp);
 | |
| 
 | |
|             if ($pos === false)
 | |
|                 return false;
 | |
| 
 | |
|             $length = self::strlen(substr($haystack, 0, $pos));
 | |
| 
 | |
|             if ($length < $offset)
 | |
|                 $comp = $pos - $length;
 | |
|         }
 | |
| 
 | |
|         return $length;
 | |
|     }
 | |
| 
 | |
| 
 | |
| }
 |