<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * Unicode normalization routines working directly on UTF-8 byte strings.
 *
 * The UTF8_* and UNICODE_HANGUL_* constants used below are not defined in
 * this file; presumably they come from UtfNormalUtil.php -- TODO confirm.
 */
require_once dirname(__FILE__).'/UtfNormalUtil.php';

# Lookup tables, left unset here and filled in on demand by
# UtfNormal::loadData() (which pulls in UtfNormalData.inc).
global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp;
$utfCombiningClass = NULL;
$utfCanonicalComp = NULL;
$utfCanonicalDecomp = NULL;

# Load compatibility decompositions on demand if they are needed.
global $utfCompatibilityDecomp;
$utfCompatibilityDecomp = NULL;

/**
 * Mode constants handed to utf8_normalize() as its second argument.
 * NOTE(review): the values look like they mirror the normalization-mode
 * enum of the ICU-backed extension -- confirm before changing any of them.
 */
define( 'UNORM_NONE', 1 );
define( 'UNORM_NFD',  2 );
define( 'UNORM_NFKD', 3 );
define( 'UNORM_NFC',  4 );
define( 'UNORM_DEFAULT', UNORM_NFC );
define( 'UNORM_NFKC', 5 );
define( 'UNORM_FCD',  6 );

# True when a native utf8_normalize() function is available; the to*() and
# cleanUp() entry points then delegate to it instead of the PHP code below.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );

/**
 * Static helpers for converting UTF-8 strings to the Unicode normalization
 * forms C, D, KC and KD, plus a validating clean-up entry point.
 *
 * Public entry points: cleanUp(), toNFC(), toNFD(), toNFKC(), toNFKD().
 * The remaining methods are the pure-PHP implementation used when no native
 * utf8_normalize() is available.
 */
class UtfNormal {
	/**
	 * Replace invalid input with UTF8_REPLACEMENT and return the string
	 * in normalization form C.
	 *
	 * With the native extension, the control characters matched below and
	 * U+FFFE / U+FFFF are replaced first, then utf8_normalize() is called.
	 * Otherwise quickIsNFCVerify() repairs the string in place and the
	 * slow NFC() path only runs when that check reports it is needed.
	 *
	 * @param string $string input bytes, possibly invalid UTF-8
	 * @return string cleaned-up string in form C
	 */
	static function cleanUp( $string ) {
		if( NORMALIZE_ICU ) {
			# We exclude a few chars that ICU would not.
			$string = preg_replace(
				'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
				UTF8_REPLACEMENT,
				$string );
			$string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
			$string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );

			# UnicodeString constructor fails if the string ends with a
			# head byte. Add a junk char at the end, we'll strip it off.
			# (Any \x01 in the input was already replaced above, so the
			# rtrim() can only remove the byte appended here.)
			return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
		} elseif( self::quickIsNFCVerify( $string ) ) {
			# Side effect -- $string has had UTF-8 errors cleaned up.
			return $string;
		} else {
			return self::NFC( $string );
		}
	}

	/**
	 * Convert a string to normalization form C.
	 * The pure-PHP path performs no UTF-8 validation.
	 *
	 * @param string $string
	 * @return string
	 */
	static function toNFC( $string ) {
		if( NORMALIZE_ICU )
			return utf8_normalize( $string, UNORM_NFC );
		elseif( self::quickIsNFC( $string ) )
			return $string;
		else
			return self::NFC( $string );
	}

	/**
	 * Convert a string to normalization form D.
	 * Pure-ASCII input is returned untouched in the pure-PHP path.
	 *
	 * @param string $string
	 * @return string
	 */
	static function toNFD( $string ) {
		if( NORMALIZE_ICU )
			return utf8_normalize( $string, UNORM_NFD );
		elseif( preg_match( '/[\x80-\xff]/', $string ) )
			return self::NFD( $string );
		else
			return $string;
	}

	/**
	 * Convert a string to normalization form KC.
	 * Pure-ASCII input is returned untouched in the pure-PHP path.
	 *
	 * @param string $string
	 * @return string
	 */
	static function toNFKC( $string ) {
		if( NORMALIZE_ICU )
			return utf8_normalize( $string, UNORM_NFKC );
		elseif( preg_match( '/[\x80-\xff]/', $string ) )
			return self::NFKC( $string );
		else
			return $string;
	}

	/**
	 * Convert a string to normalization form KD.
	 * Pure-ASCII input is returned untouched in the pure-PHP path.
	 *
	 * @param string $string
	 * @return string
	 */
	static function toNFKD( $string ) {
		if( NORMALIZE_ICU )
			return utf8_normalize( $string, UNORM_NFKD );
		elseif( preg_match( '/[\x80-\xff]/', $string ) )
			return self::NFKD( $string );
		else
			return $string;
	}

	/**
	 * Include UtfNormalData.inc (from this file's directory) unless
	 * $utfCombiningClass has already been populated.
	 */
	static function loadData() {
		global $utfCombiningClass;
		if( !isset( $utfCombiningClass ) ) {
			require_once( dirname(__FILE__) . '/UtfNormalData.inc' );
		}
	}

	/**
	 * Quick, conservative test for "already in form C".
	 *
	 * Returns false as soon as any character appears in $utfCheckNFC or
	 * $utfCombiningClass; a false result therefore means "needs the slow
	 * path", not necessarily "is not normalized". No UTF-8 validation.
	 *
	 * @param string $string
	 * @return bool
	 */
	static function quickIsNFC( $string ) {
		# ASCII is always valid NFC!
		# If it's pure ASCII, let it through.
		if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

		self::loadData();
		global $utfCheckNFC, $utfCombiningClass;
		$len = strlen( $string );
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				continue;
			} elseif( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if( isset( $utfCheckNFC[$c] ) ) {
				# If it's NO or MAYBE, bail and do the slow check.
				return false;
			}
			if( isset( $utfCombiningClass[$c] ) ) {
				# Combining character? We might have to do sorting, at least.
				return false;
			}
		}
		return true;
	}

	/**
	 * Validate UTF-8, repair it in place, and report whether the result
	 * looks like it is already in form C.
	 *
	 * Invalid sequences and the control characters matched below are
	 * replaced by UTF8_REPLACEMENT directly in the caller's variable.
	 *
	 * @param string &$string modified in place when repairs are needed
	 * @return bool false if a character listed in $utfCheckNFC or
	 *   $utfCombiningClass was seen, i.e. the slow NFC path is required
	 */
	static function quickIsNFCVerify( &$string ) {
		# Screen out some characters that eg won't be allowed in XML
		$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

		# ASCII is always valid NFC!
		# If we're only ever given plain ASCII, we can avoid the overhead
		# of initializing the decomposition tables by skipping out early.
		if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

		static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
		if( !isset( $checkit ) ) {
			# Load/build some scary lookup tables...
			self::loadData();
			global $utfCheckNFC, $utfCombiningClass;

			$utfCheckOrCombining = array_merge( $utfCheckNFC, $utfCombiningClass );

			# Head bytes for sequences which we should do further validity checks
			$checkit = array_flip( array_map( 'chr',
					array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
						   0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
						   0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

			# Each UTF-8 head byte is followed by a certain
			# number of tail bytes.
			$tailBytes = array();
			for( $n = 0; $n < 256; $n++ ) {
				if( $n < 0xc0 ) {
					$remaining = 0;
				} elseif( $n < 0xe0 ) {
					$remaining = 1;
				} elseif( $n < 0xf0 ) {
					$remaining = 2;
				} elseif( $n < 0xf8 ) {
					$remaining = 3;
				} elseif( $n < 0xfc ) {
					$remaining = 4;
				} elseif( $n < 0xfe ) {
					$remaining = 5;
				} else {
					$remaining = 0;
				}
				$tailBytes[chr($n)] = $remaining;
			}
		}

		# Chop the text into pure-ASCII and non-ASCII areas;
		# large ASCII parts can be handled much more quickly.
		# Don't chop up Unicode areas for punctuation, though,
		# that wastes energy.
		$matches = array();
		preg_match_all(
			'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
			$string, $matches );

		$looksNormal = true;
		$base = 0;           # byte offset of the current chunk within $string
		$replace = array();  # list of array( replacement, start, length )
		foreach( $matches[1] as $str ) {
			$chunk = strlen( $str );

			if( $str[0] < "\x80" ) {
				# ASCII chunk: guaranteed to be valid UTF-8
				# and in normal form C, so skip over it.
				$base += $chunk;
				continue;
			}

			# We'll have to examine the chunk byte by byte to ensure
			# that it consists of valid UTF-8 sequences, and to see
			# if any of them might not be normalized.
			#
			# Since PHP is not the fastest language on earth, some of
			# this code is a little ugly with inner loop optimizations.

			$head = '';
			$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

			for( $i = -1; --$len; ) {
				if( $remaining = $tailBytes[$c = $str[++$i]] ) {
					# UTF-8 head byte!
					$sequence = $head = $c;
					do {
						# Look for the defined number of tail bytes...
						if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
							# Legal tail bytes are nice.
							$sequence .= $c;
						} else {
							if( 0 == $len ) {
								# Premature end of string!
								# Drop a replacement character into output to
								# represent the invalid UTF-8 sequence.
								$replace[] = array( UTF8_REPLACEMENT,
													$base + $i + 1 - strlen( $sequence ),
													strlen( $sequence ) );
								break 2;
							} else {
								# Illegal tail byte; abandon the sequence.
								$replace[] = array( UTF8_REPLACEMENT,
													$base + $i - strlen( $sequence ),
													strlen( $sequence ) );
								# Back up and reprocess this byte; it may itself
								# be a legal ASCII or UTF-8 sequence head.
								--$i;
								++$len;
								continue 2;
							}
						}
					} while( --$remaining );

					if( isset( $checkit[$head] ) ) {
						# Do some more detailed validity checks, for
						# invalid characters and illegal sequences.
						if( $head == "\xed" ) {
							# 0xed is relatively frequent in Korean, which
							# abuts the surrogate area, so we're doing
							# this check separately to speed things up.

							if( $sequence >= UTF8_SURROGATE_FIRST ) {
								# Surrogates are legal only in UTF-16 code.
								# They are totally forbidden here in UTF-8
								# utopia.
								$replace[] = array( UTF8_REPLACEMENT,
											 $base + $i + 1 - strlen( $sequence ),
											 strlen( $sequence ) );
								$head = '';
								continue;
							}
						} else {
							# Slower, but rarer checks...
							$n = ord( $head );
							if(
								# "Overlong sequences" are those that are syntactically
								# correct but use more UTF-8 bytes than are necessary to
								# encode a character. Naive string comparisons can be
								# tricked into failing to see a match for an ASCII
								# character, for instance, which can be a security hole
								# if blacklist checks are being used.
								   ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
								|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
								|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)

								# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
								# Grouping is written out exactly as PHP's precedence
								# (&& binds tighter than ||) always evaluated it.
								|| ($n == 0xef && $sequence == UTF8_FFFE)
								|| ($sequence == UTF8_FFFF)

								# Unicode has been limited to 21 bits; longer
								# sequences are not allowed.
								|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {

								$replace[] = array( UTF8_REPLACEMENT,
													$base + $i + 1 - strlen( $sequence ),
													strlen( $sequence ) );
								$head = '';
								continue;
							}
						}
					}

					if( isset( $utfCheckOrCombining[$sequence] ) ) {
						# If it's NO or MAYBE, we'll have to rip
						# the string apart and put it back together.
						# That's going to be mighty slow.
						$looksNormal = false;
					}

					# The sequence is legal!
					$head = '';
				} elseif( $c < "\x80" ) {
					# ASCII byte.
					$head = '';
				} elseif( $c < "\xc0" ) {
					# Illegal tail bytes
					if( $head == '' ) {
						# Out of the blue!
						$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					} else {
						# Don't add if we're continuing a broken sequence;
						# we already put a replacement character when we looked
						# at the broken sequence.
						$replace[] = array( '', $base + $i, 1 );
					}
				} else {
					# Miscellaneous freaks.
					$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					$head = '';
				}
			}
			$base += $chunk;
		}
		if( count( $replace ) ) {
			# There were illegal UTF-8 sequences we need to fix up.
			# Stitch the untouched spans and the replacements together.
			$out = '';
			$last = 0;
			foreach( $replace as $rep ) {
				list( $replacement, $start, $length ) = $rep;
				if( $last < $start ) {
					$out .= substr( $string, $last, $start - $last );
				}
				$out .= $replacement;
				$last = $start + $length;
			}
			if( $last < strlen( $string ) ) {
				$out .= substr( $string, $last );
			}
			$string = $out;
		}
		return $looksNormal;
	}

	# These take a string and run the normalization on them, without
	# checking for validity or any optimization etc. Input must be
	# VALID UTF-8!

	/**
	 * Canonical decomposition followed by composition.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	static function NFC( $string ) {
		return self::fastCompose( self::NFD( $string ) );
	}

	/**
	 * Canonical decomposition (using $utfCanonicalDecomp) followed by
	 * reordering of combining characters.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	static function NFD( $string ) {
		self::loadData();
		global $utfCanonicalDecomp;
		return self::fastCombiningSort(
			self::fastDecompose( $string, $utfCanonicalDecomp ) );
	}

	/**
	 * Compatibility decomposition followed by composition.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	static function NFKC( $string ) {
		return self::fastCompose( self::NFKD( $string ) );
	}

	/**
	 * Compatibility decomposition (using $utfCompatibilityDecomp) followed
	 * by reordering of combining characters.
	 *
	 * The compatibility table is loaded on first use. The include is
	 * anchored to this file's directory, like loadData(), so it does not
	 * depend on include_path or the current working directory.
	 * NOTE(review): assumes UtfNormalDataK.inc sits next to this file and
	 * assigns $utfCompatibilityDecomp -- confirm.
	 *
	 * @param string $string valid UTF-8
	 * @return string
	 */
	static function NFKD( $string ) {
		global $utfCompatibilityDecomp;
		if( !isset( $utfCompatibilityDecomp ) ) {
			require_once( dirname(__FILE__) . '/UtfNormalDataK.inc' );
		}
		return self::fastCombiningSort(
			self::fastDecompose( $string, $utfCompatibilityDecomp ) );
	}


	/**
	 * Replace every character that has an entry in $map with that entry,
	 * and split Hangul syllables into their jamo arithmetically.
	 * Does not reorder combining characters; see fastCombiningSort().
	 *
	 * @param string $string valid UTF-8
	 * @param array $map UTF-8 character => decomposed UTF-8 string
	 * @return string
	 */
	static function fastDecompose( $string, $map ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				# ASCII chars never decompose
				# THEY ARE IMMORTAL
				$out .= $c;
				continue;
			} elseif( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if( isset( $map[$c] ) ) {
				$out .= $map[$c];
				continue;
			} else {
				if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
					# Decompose a hangul syllable into jamo;
					# hardcoded for three-byte UTF-8 sequence.
					# A lookup table would be slightly faster,
					# but adds a lot of memory & disk needs.
					#
					$index = ( (ord( $c[0] ) & 0x0f) << 12
							 | (ord( $c[1] ) & 0x3f) <<  6
							 | (ord( $c[2] ) & 0x3f) )
						   - UNICODE_HANGUL_FIRST;
					$l = intval( $index / UNICODE_HANGUL_NCOUNT );
					$v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
					$t = $index % UNICODE_HANGUL_TCOUNT;
					$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
					# Trailing jamo straddle two UTF-8 lead-byte pairs,
					# hence the split at index 25.
					if( $t >= 25 ) {
						$out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
					} elseif( $t ) {
						$out .= "\xe1\x86" . chr( 0xa7 + $t );
					}
					continue;
				}
			}
			$out .= $c;
		}
		return $out;
	}

	/**
	 * Reorder each run of combining characters by ascending combining
	 * class (as listed in $utfCombiningClass); characters sharing a class
	 * keep their relative order.
	 *
	 * @param string $string valid UTF-8, already decomposed
	 * @return string
	 */
	static function fastCombiningSort( $string ) {
		self::loadData();
		global $utfCombiningClass;
		$len = strlen( $string );
		$out = '';
		$combiners = array();  # combining class => concatenated characters
		$lastClass = -1;
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n >= 0x80 ) {
				if( $n >= 0xf0 ) {
					$c = substr( $string, $i, 4 );
					$i += 3;
				} elseif( $n >= 0xe0 ) {
					$c = substr( $string, $i, 3 );
					$i += 2;
				} elseif( $n >= 0xc0 ) {
					$c = substr( $string, $i, 2 );
					$i++;
				}
				if( isset( $utfCombiningClass[$c] ) ) {
					$lastClass = $utfCombiningClass[$c];
					if( isset( $combiners[$lastClass] ) ) {
						$combiners[$lastClass] .= $c;
					} else {
						$combiners[$lastClass] = $c;
					}
					continue;
				}
			}
			# Non-combining character: flush any pending combiners first.
			if( $lastClass ) {
				ksort( $combiners );
				$out .= implode( '', $combiners );
				$combiners = array();
			}
			$out .= $c;
			$lastClass = 0;
		}
		if( $lastClass ) {
			ksort( $combiners );
			$out .= implode( '', $combiners );
		}
		return $out;
	}

	/**
	 * Recombine a decomposed string: pairs found in $utfCanonicalComp are
	 * merged, and Hangul jamo are recombined into syllables arithmetically.
	 *
	 * @param string $string valid UTF-8, decomposed and sorted
	 * @return string
	 */
	static function fastCompose( $string ) {
		self::loadData();
		global $utfCanonicalComp, $utfCombiningClass;
		$len = strlen( $string );
		$out = '';
		$lastClass = -1;
		$lastHangul = 0;
		$startChar = '';  # pending base character, not yet written to $out
		$combining = '';  # combining characters that did not merge into it
		# Lead-byte range used as a cheap pre-filter for the Hangul branch.
		$x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
		$x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				# No combining characters here...
				$out .= $startChar;
				$out .= $combining;
				$startChar = $c;
				$combining = '';
				$lastClass = 0;
				continue;
			} elseif( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			$pair = $startChar . $c;
			if( $n > 0x80 ) {
				if( isset( $utfCombiningClass[$c] ) ) {
					# A combining char; see what we can do with it
					$class = $utfCombiningClass[$c];
					# NOTE(review): empty() also treats the base "0" as
					# empty; kept as-is to preserve existing behavior.
					if( !empty( $startChar ) &&
						$lastClass < $class &&
						$class > 0 &&
						isset( $utfCanonicalComp[$pair] ) ) {
						$startChar = $utfCanonicalComp[$pair];
						$class = 0;
					} else {
						$combining .= $c;
					}
					$lastClass = $class;
					$lastHangul = 0;
					continue;
				}
			}
			# New start char
			if( $lastClass == 0 ) {
				if( isset( $utfCanonicalComp[$pair] ) ) {
					$startChar = $utfCanonicalComp[$pair];
					$lastHangul = 0;
					continue;
				}
				if( $n >= $x1 && $n <= $x2 ) {
					# WARNING: Hangul code is painfully slow.
					# I apologize for this ugly, ugly code; however
					# performance is even more teh suck if we call
					# out to nice clean functions. Lookup tables are
					# marginally faster, but require a lot of space.
					#
					if( $c >= UTF8_HANGUL_VBASE &&
						$c <= UTF8_HANGUL_VEND &&
						$startChar >= UTF8_HANGUL_LBASE &&
						$startChar <= UTF8_HANGUL_LEND ) {
						#
						#$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
						#$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
						$lIndex = ord( $startChar[2] ) - 0x80;
						$vIndex = ord( $c[2] ) - 0xa1;

						$hangulPoint = UNICODE_HANGUL_FIRST +
							UNICODE_HANGUL_TCOUNT *
							(UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

						# Hardcode the limited-range UTF-8 conversion:
						$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
									 chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
									 chr( $hangulPoint       & 0x3f | 0x80 );
						$lastHangul = 0;
						continue;
					} elseif( $c >= UTF8_HANGUL_TBASE &&
							  $c <= UTF8_HANGUL_TEND &&
							  $startChar >= UTF8_HANGUL_FIRST &&
							  $startChar <= UTF8_HANGUL_LAST &&
							  !$lastHangul ) {
						# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
						$tIndex = ord( $c[2] ) - 0xa7;
						if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);

						# Increment the code point by $tIndex, without
						# the function overhead of decoding and recoding UTF-8
						#
						$tail = ord( $startChar[2] ) + $tIndex;
						if( $tail > 0xbf ) {
							# Carry out of the low six bits into the middle byte,
							# and from there into the lead byte if needed.
							$tail -= 0x40;
							$mid = ord( $startChar[1] ) + 1;
							if( $mid > 0xbf ) {
								$startChar[0] = chr( ord( $startChar[0] ) + 1 );
								$mid -= 0x40;
							}
							$startChar[1] = chr( $mid );
						}
						$startChar[2] = chr( $tail );

						# If there's another jamo char after this, *don't* try to merge it.
						$lastHangul = 1;
						continue;
					}
				}
			}
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			$lastHangul = 0;
		}
		$out .= $startChar . $combining;
		return $out;
	}

	/**
	 * Copy a string one byte at a time and return the copy unchanged.
	 * NOTE(review): looks like a baseline for timing the per-byte loop
	 * overhead of the routines above -- confirm intended use.
	 *
	 * @param string $string
	 * @return string
	 */
	static function placebo( $string ) {
		$len = strlen( $string );
		$out = '';
		for( $i = 0; $i < $len; $i++ ) {
			$out .= $string[$i];
		}
		return $out;
	}
}