1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 5 |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1997-2009 The PHP Group |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 3.01 of the PHP license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at the following url: |
10 : | http://www.php.net/license/3_01.txt |
11 : | If you did not receive a copy of the PHP license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@php.net so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Author: Wez Furlong (wez@thebrainroom.com) |
16 : +----------------------------------------------------------------------+
17 :
18 : Based on code from ucdata-2.5, which has the following Copyright:
19 :
20 : Copyright 2001 Computing Research Labs, New Mexico State University
21 :
22 : Permission is hereby granted, free of charge, to any person obtaining a
23 : copy of this software and associated documentation files (the "Software"),
24 : to deal in the Software without restriction, including without limitation
25 : the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 : and/or sell copies of the Software, and to permit persons to whom the
27 : Software is furnished to do so, subject to the following conditions:
28 :
29 : The above copyright notice and this permission notice shall be included in
30 : all copies or substantial portions of the Software.
31 : */
32 :
33 : #ifdef HAVE_CONFIG_H
34 : #include "config.h"
35 : #endif
36 :
37 : #include "php.h"
38 : #include "php_ini.h"
39 :
40 : #if HAVE_MBSTRING
41 :
42 : /* include case folding data generated from the official UnicodeData.txt file */
43 : #include "mbstring.h"
44 : #include "php_unicode.h"
45 : #include "unicode_data.h"
46 :
47 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48 :
/*
 * Single-bit masks for a 32-bit word: masks32[i] has only bit i set.
 * Used to test whether a given property bit is present in a property mask.
 * The table is never written, so it is declared const.
 */
static const unsigned long masks32[32] = {
	0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL,
	0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL,
	0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL,
	0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL,
	0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL,
	0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL,
	0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
	0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL
};
60 :
61 :
62 : static int prop_lookup(unsigned long code, unsigned long n)
63 19293 : {
64 : long l, r, m;
65 :
66 : /*
67 : * There is an extra node on the end of the offsets to allow this routine
68 : * to work right. If the index is 0xffff, then there are no nodes for the
69 : * property.
70 : */
71 19293 : if ((l = _ucprop_offsets[n]) == 0xffff)
72 0 : return 0;
73 :
74 : /*
75 : * Locate the next offset that is not 0xffff. The sentinel at the end of
76 : * the array is the max index value.
77 : */
78 19293 : for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
79 : ;
80 :
81 19293 : r = _ucprop_offsets[n + m] - 1;
82 :
83 196313 : while (l <= r) {
84 : /*
85 : * Determine a "mid" point and adjust to make sure the mid point is at
86 : * the beginning of a range pair.
87 : */
88 165139 : m = (l + r) >> 1;
89 165139 : m -= (m & 1);
90 165139 : if (code > _ucprop_ranges[m + 1])
91 41248 : l = m + 2;
92 123891 : else if (code < _ucprop_ranges[m])
93 116479 : r = m - 2;
94 7412 : else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
95 7412 : return 1;
96 : }
97 11881 : return 0;
98 :
99 : }
100 :
101 : MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
102 : unsigned long mask2)
103 19269 : {
104 : unsigned long i;
105 :
106 19269 : if (mask1 == 0 && mask2 == 0)
107 0 : return 0;
108 :
109 507485 : for (i = 0; mask1 && i < 32; i++) {
110 495628 : if ((mask1 & masks32[i]) && prop_lookup(code, i))
111 7412 : return 1;
112 : }
113 :
114 11857 : for (i = 32; mask2 && i < _ucprop_size; i++) {
115 0 : if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
116 0 : return 1;
117 : }
118 :
119 11857 : return 0;
120 : }
121 :
122 : static unsigned long case_lookup(unsigned long code, long l, long r, int field)
123 8496 : {
124 : long m;
125 :
126 : /*
127 : * Do the binary search.
128 : */
129 57336 : while (l <= r) {
130 : /*
131 : * Determine a "mid" point and adjust to make sure the mid point is at
132 : * the beginning of a case mapping triple.
133 : */
134 45481 : m = (l + r) >> 1;
135 45481 : m -= (m % 3);
136 45481 : if (code > _uccase_map[m])
137 11237 : l = m + 3;
138 34244 : else if (code < _uccase_map[m])
139 29107 : r = m - 3;
140 5137 : else if (code == _uccase_map[m])
141 5137 : return _uccase_map[m + field];
142 : }
143 :
144 3359 : return code;
145 : }
146 :
147 : MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
148 0 : {
149 0 : if (code == 0x0069L) {
150 0 : return 0x0130L;
151 : }
152 0 : return case_lookup(code, l, r, field);
153 : }
154 :
155 : MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
156 0 : {
157 0 : if (code == 0x0049L) {
158 0 : return 0x0131L;
159 : }
160 0 : return case_lookup(code, l, r, field);
161 : }
162 :
163 : MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
164 10433 : {
165 : int field;
166 : long l, r;
167 :
168 10433 : if (php_unicode_is_upper(code))
169 2211 : return code;
170 :
171 8222 : if (php_unicode_is_lower(code)) {
172 : /*
173 : * The character is lower case.
174 : */
175 4959 : field = 2;
176 4959 : l = _uccase_len[0];
177 4959 : r = (l + _uccase_len[1]) - 3;
178 :
179 4959 : if (enc == mbfl_no_encoding_8859_9) {
180 0 : return php_turkish_toupper(code, l, r, field);
181 : }
182 :
183 : } else {
184 : /*
185 : * The character is title case.
186 : */
187 3263 : field = 1;
188 3263 : l = _uccase_len[0] + _uccase_len[1];
189 3263 : r = _uccase_size - 3;
190 : }
191 8222 : return case_lookup(code, l, r, field);
192 : }
193 :
194 : MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
195 335 : {
196 : int field;
197 : long l, r;
198 :
199 335 : if (php_unicode_is_lower(code))
200 62 : return code;
201 :
202 273 : if (php_unicode_is_upper(code)) {
203 : /*
204 : * The character is upper case.
205 : */
206 177 : field = 1;
207 177 : l = 0;
208 177 : r = _uccase_len[0] - 3;
209 :
210 177 : if (enc == mbfl_no_encoding_8859_9) {
211 0 : return php_turkish_tolower(code, l, r, field);
212 : }
213 :
214 : } else {
215 : /*
216 : * The character is title case.
217 : */
218 96 : field = 2;
219 96 : l = _uccase_len[0] + _uccase_len[1];
220 96 : r = _uccase_size - 3;
221 : }
222 273 : return case_lookup(code, l, r, field);
223 : }
224 :
225 : MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
226 1 : {
227 : int field;
228 : long l, r;
229 :
230 1 : if (php_unicode_is_title(code))
231 0 : return code;
232 :
233 : /*
234 : * The offset will always be the same for converting to title case.
235 : */
236 1 : field = 2;
237 :
238 1 : if (php_unicode_is_upper(code)) {
239 : /*
240 : * The character is upper case.
241 : */
242 0 : l = 0;
243 0 : r = _uccase_len[0] - 3;
244 : } else {
245 : /*
246 : * The character is lower case.
247 : */
248 1 : l = _uccase_len[0];
249 1 : r = (l + _uccase_len[1]) - 3;
250 : }
251 1 : return case_lookup(code, l, r, field);
252 :
253 : }
254 :
255 :
/*
 * Assemble an unsigned 32-bit value from the four bytes at `ptr`, most
 * significant byte first.
 *
 * Each byte is widened to unsigned int before it is shifted: a bare
 * unsigned char promotes to signed int, so shifting a byte >= 0x80 left by
 * 24 would overflow it and the result would sign-extend when converted to
 * unsigned long.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned int)((const unsigned char *)(ptr))[0] << 24) |\
	((unsigned int)((const unsigned char *)(ptr))[1] << 16) |\
	((unsigned int)((const unsigned char *)(ptr))[2] <<  8) |\
	((unsigned int)((const unsigned char *)(ptr))[3]      ))

/*
 * Store the low 32 bits of `val` into the four bytes at `ptr`, most
 * significant byte first.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); `ptr` and `val` are each evaluated once.
 */
#define UINT32_TO_BE_ARY(ptr,val) do { \
	unsigned char *be_dst_ = (unsigned char *)(ptr); \
	unsigned int be_val_ = (unsigned int)(val); \
	be_dst_[0] = (unsigned char)((be_val_ >> 24) & 0xff); \
	be_dst_[1] = (unsigned char)((be_val_ >> 16) & 0xff); \
	be_dst_[2] = (unsigned char)((be_val_ >>  8) & 0xff); \
	be_dst_[3] = (unsigned char)((be_val_      ) & 0xff); \
} while (0)
269 :
270 : MBSTRING_API char *php_unicode_convert_case(int case_mode, char *srcstr, size_t srclen, size_t *ret_len,
271 : char *src_encoding TSRMLS_DC)
272 1267 : {
273 : char *unicode, *newstr;
274 : size_t unicode_len;
275 : unsigned char *unicode_ptr;
276 : size_t i;
277 1267 : enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
278 :
279 1267 : if (_src_encoding == mbfl_no_encoding_invalid) {
280 90 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
281 90 : return NULL;
282 : }
283 :
284 1177 : unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len TSRMLS_CC);
285 1177 : if (unicode == NULL)
286 0 : return NULL;
287 :
288 1177 : unicode_ptr = (unsigned char *)unicode;
289 :
290 1177 : switch(case_mode) {
291 : case PHP_UNICODE_CASE_UPPER:
292 11573 : for (i = 0; i < unicode_len; i+=4) {
293 10433 : UINT32_TO_BE_ARY(&unicode_ptr[i],
294 : php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
295 : }
296 1140 : break;
297 :
298 : case PHP_UNICODE_CASE_LOWER:
299 369 : for (i = 0; i < unicode_len; i+=4) {
300 333 : UINT32_TO_BE_ARY(&unicode_ptr[i],
301 : php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
302 : }
303 36 : break;
304 :
305 : case PHP_UNICODE_CASE_TITLE: {
306 1 : int mode = 0;
307 :
308 5 : for (i = 0; i < unicode_len; i+=4) {
309 : int res = php_unicode_is_prop(
310 : BE_ARY_TO_UINT32(&unicode_ptr[i]),
311 4 : UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
312 4 : if (mode) {
313 3 : if (res) {
314 2 : UINT32_TO_BE_ARY(&unicode_ptr[i],
315 : php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
316 : } else {
317 1 : mode = 0;
318 : }
319 : } else {
320 1 : if (res) {
321 1 : mode = 1;
322 1 : UINT32_TO_BE_ARY(&unicode_ptr[i],
323 : php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
324 : }
325 : }
326 : }
327 : } break;
328 :
329 : }
330 :
331 1177 : newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len TSRMLS_CC);
332 1177 : efree(unicode);
333 :
334 1177 : return newstr;
335 : }
336 :
337 :
338 : #endif /* HAVE_MBSTRING */
339 :
340 : /*
341 : * Local variables:
342 : * tab-width: 4
343 : * c-basic-offset: 4
344 : * End:
345 : * vim600: sw=4 ts=4 fdm=marker
346 : * vim<600: sw=4 ts=4
347 : */
|