<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

# Command-line generator: reads DerivedNormalizationProps.txt,
# CompositionExclusions.txt and UnicodeData.txt from the current directory
# and writes the serialized lookup tables UtfNormalData.inc and
# UtfNormalDataK.inc.

if( php_sapi_name() != 'cli' ) {
	die( "Run me from the command line please.\n" );
}

require_once 'UtfNormalUtil.php';

# --- NFC quick-check table ---------------------------------------------
# $checkNFC is keyed by the string codepointToUtf8() returns for a code
# point; the value is the NFC_QC property value, 'N' or 'M'.
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if( !$in ) {
	print "Can't open DerivedNormalizationProps.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
	exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while( false !== ( $line = fgets( $in ) ) ) {
	$matches = array();
	# Accepts "XXXX ; NFC_QC ; V" and "XXXX..YYYY ; NFC_QC ; V" entries.
	# The range separator is a literal "..", so the dots are escaped.
	if( preg_match( '/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches ) ) {
		$first = hexdec( $matches[1] );
		# An unmatched optional group comes back as '', meaning the entry
		# covers a single code point rather than a range.
		$last = ( $matches[2] === '' ) ? $first : hexdec( $matches[2] );
		$value = $matches[4];
		for( $i = $first; $i <= $last; $i++ ) {
			$checkNFC[codepointToUtf8( $i )] = $value;
		}
	}
}
# Done with DerivedNormalizationProps.txt.
fclose( $in );

# --- Composition exclusions --------------------------------------------
# $exclude is a set (key => true) of characters that must not be entered
# into the canonical composition table.
$in = fopen( "CompositionExclusions.txt", "rt" );
if( !$in ) {
	print "Can't open CompositionExclusions.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
	exit(-1);
}
$exclude = array();
while( false !== ( $line = fgets( $in ) ) ) {
	# Only lines that begin with a hex code point are data lines.
	if( preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
		$codepoint = $matches[1];
		$source = codepointToUtf8( hexdec( $codepoint ) );
		$exclude[$source] = true;
	}
}
fclose( $in );

# --- Character definitions ---------------------------------------------
$in = fopen( "UnicodeData.txt", "rt" );
if( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit(-1);
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while( false !== ( $line = fgets( $in ) ) ) {
	# explode() instead of split(): split() was removed in PHP 7 and the
	# delimiter here is a plain string anyway.
	$columns = explode( ';', $line );
	if( count( $columns ) < 6 ) {
		# Blank or truncated line: the fields read below are missing.
		continue;
	}
	$codepoint = $columns[0];
	$canonicalCombiningClass = intval( $columns[3] );
	$decompositionMapping = $columns[5];

	$source = codepointToUtf8( hexdec( $codepoint ) );

	# Only non-zero combining classes are stored.
	if( $canonicalCombiningClass != 0 ) {
		$combiningClass[$source] = $canonicalCombiningClass;
	}

	if( $decompositionMapping === '' ) continue;
	if( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
		# Compatibility decomposition: strip the leading <tag>
		$canonical = false;
		$decompositionMapping = $matches[2];
		$compat++;
	} else {
		$canonical = true;
		$canon++;
	}
	$total++;
	$dest = hexSequenceToUtf8( $decompositionMapping );

	# The compatibility table receives every mapping, canonical or not.
	$compatibilityDecomp[$source] = $dest;
	if( $canonical ) {
		$canonicalDecomp[$source] = $dest;
		# Excluded characters decompose but are never composed back.
		if( empty( $exclude[$source] ) ) {
			$canonicalComp[$dest] = $source;
		}
	}
}
fclose( $in );

# --- Recursive expansion -----------------------------------------------
# Each pass replaces every multi-byte sequence in a mapping's target with
# that sequence's own mapping (if it has one). Passes repeat until one
# completes without changing anything.
print "Recursively expanding canonical mappings...\n";
$changed = 42; # any non-zero value, just to enter the loop
$pass = 1;
while( $changed > 0 ) {
	print "pass $pass\n";
	$changed = 0;
	foreach( $canonicalDecomp as $source => $dest ) {
		$newDest = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCanonical',
			$dest );
		if( $newDest === $dest ) continue;
		$changed++;
		$canonicalDecomp[$source] = $newDest;
	}
	$pass++;
}

print "Recursively expanding compatibility mappings...\n";
$changed = 42; # any non-zero value, just to enter the loop
$pass = 1;
while( $changed > 0 ) {
	print "pass $pass\n";
	$changed = 0;
	foreach( $compatibilityDecomp as $source => $dest ) {
		$newDest = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCompat',
			$dest );
		if( $newDest === $dest ) continue;
		$changed++;
		$compatibilityDecomp[$source] = $newDest;
	}
	$pass++;
}

print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";

# --- Output: UtfNormalData.inc -----------------------------------------
# The tables are written as serialize()d strings embedded in a small PHP
# file that unserialize()s them into globals when included.
$out = fopen( "UtfNormalData.inc", "wt" );
if( $out ) {
	$serCombining = escapeSingleString( serialize( $combiningClass ) );
	$serComp = escapeSingleString( serialize( $canonicalComp ) );
	$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
	$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
	# The open/close tags are split across concatenations so that they are
	# never seen as real PHP tags in this file.
	$outdata = "<" . "?php
global \$utfCombiningClass, \$utfCanonicalComp, \$utfCanonicalDecomp, \$utfCheckNFC;
\$utfCombiningClass = unserialize( '$serCombining' );
\$utfCanonicalComp = unserialize( '$serComp' );
\$utfCanonicalDecomp = unserialize( '$serCanon' );
\$utfCheckNFC = unserialize( '$serCheckNFC' );
?" . ">\n";
	fputs( $out, $outdata );
	fclose( $out );
	print "Wrote out UtfNormalData.inc\n";
} else {
	print "Can't create file UtfNormalData.inc\n";
	exit(-1);
}


# --- Output: UtfNormalDataK.inc ----------------------------------------
$out = fopen( "UtfNormalDataK.inc", "wt" );
if( $out ) {
	$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
	$outdata = "<" . "?php
global \$utfCompatibilityDecomp;
\$utfCompatibilityDecomp = unserialize( '$serCompat' );
?" . ">\n";
	fputs( $out, $outdata );
	fclose( $out );
	print "Wrote out UtfNormalDataK.inc\n";
	exit(0);
} else {
	print "Can't create file UtfNormalDataK.inc\n";
	exit(-1);
}

# ---------------

# preg_replace_callback() handler for the canonical expansion pass.
# $matches[1] is one multi-byte sequence taken from a mapping target.
# Returns its entry in the global $canonicalDecomp table, or the sequence
# unchanged when it has no entry.
function callbackCanonical( $matches ) {
	global $canonicalDecomp;
	if( isset( $canonicalDecomp[$matches[1]] ) ) {
		return $canonicalDecomp[$matches[1]];
	}
	return $matches[1];
}

# preg_replace_callback() handler for the compatibility expansion pass.
# $matches[1] is one multi-byte sequence taken from a mapping target.
# Returns its entry in the global $compatibilityDecomp table, or the
# sequence unchanged when it has no entry.
function callbackCompat( $matches ) {
	global $compatibilityDecomp;
	if( isset( $compatibilityDecomp[$matches[1]] ) ) {
		return $compatibilityDecomp[$matches[1]];
	}
	return $matches[1];
}