1 : /*
2 : +----------------------------------------------------------------------+
3 : | Zend Engine |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1998-2009 Zend Technologies Ltd. (http://www.zend.com) |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 2.00 of the Zend license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at |
10 : | http://www.zend.com/license/2_00.txt. |
11 : | If you did not receive a copy of the Zend license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@zend.com so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Authors: Andrei Zmievski <andrei@php.net> |
16 : +----------------------------------------------------------------------+
17 : */
18 :
19 : #include "zend.h"
20 : #include "zend_globals.h"
21 : #include "zend_operators.h"
22 : #include "zend_exceptions.h"
23 : #include "zend_API.h"
24 : #include "zend_unicode.h"
25 : #include <unicode/unorm.h>
26 :
27 : #ifdef ZTS
28 : ZEND_API ts_rsrc_id unicode_globals_id;
29 : #else
30 : ZEND_API zend_unicode_globals unicode_globals;
31 : #endif
32 :
33 : ZEND_API zend_class_entry *unicodeConversionException;
34 :
35 : /* {{{ zend_set_converter_error_mode */
36 : void zend_set_converter_error_mode(UConverter *conv, zend_conv_direction direction, uint16_t error_mode)
37 189662 : {
38 189662 : UErrorCode status = U_ZERO_ERROR;
39 :
40 189662 : switch (error_mode & 0xff) {
41 : case ZEND_CONV_ERROR_STOP:
42 114516 : if (direction == ZEND_FROM_UNICODE)
43 17007 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
44 : else
45 97509 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
46 114516 : break;
47 :
48 : case ZEND_CONV_ERROR_SKIP:
49 1 : if (direction == ZEND_FROM_UNICODE)
50 0 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SKIP, NULL, NULL, NULL, &status);
51 : else
52 1 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &status);
53 1 : break;
54 :
55 : case ZEND_CONV_ERROR_SUBST:
56 73703 : if (direction == ZEND_FROM_UNICODE)
57 73703 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
58 : else
59 0 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
60 73703 : break;
61 :
62 : case ZEND_CONV_ERROR_ESCAPE_UNICODE:
63 1396 : if (direction == ZEND_FROM_UNICODE)
64 0 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, NULL, NULL, &status);
65 : else
66 1396 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, NULL, NULL, &status);
67 1396 : break;
68 :
69 : case ZEND_CONV_ERROR_ESCAPE_ICU:
70 0 : if (direction == ZEND_FROM_UNICODE)
71 0 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, NULL, NULL, &status);
72 : else
73 0 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, NULL, NULL, &status);
74 0 : break;
75 :
76 : case ZEND_CONV_ERROR_ESCAPE_JAVA:
77 46 : if (direction == ZEND_FROM_UNICODE)
78 46 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, NULL, NULL, &status);
79 : else
80 : /*
81 : * use C escape, even though JAVA is requested, so that we don't
82 : * have to expose another constant
83 : */
84 0 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, NULL, NULL, &status);
85 46 : break;
86 :
87 : case ZEND_CONV_ERROR_ESCAPE_XML_DEC:
88 0 : if (direction == ZEND_FROM_UNICODE)
89 0 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, NULL, NULL, &status);
90 : else
91 0 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, NULL, NULL, &status);
92 0 : break;
93 :
94 : case ZEND_CONV_ERROR_ESCAPE_XML_HEX:
95 0 : if (direction == ZEND_FROM_UNICODE)
96 0 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, NULL, NULL, &status);
97 : else
98 0 : ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, NULL, NULL, &status);
99 0 : break;
100 :
101 : default:
102 0 : assert(0);
103 : break;
104 : }
105 189662 : }
106 : /* }}} */
107 :
108 : /* {{{ zend_set_converter_subst_char */
109 : void zend_set_converter_subst_char(UConverter *conv, UChar *subst_char)
110 73657 : {
111 : char dest[8], *dest_ptr;
112 73657 : int8_t dest_len = 8;
113 73657 : UErrorCode status = U_ZERO_ERROR;
114 73657 : UErrorCode temp = U_ZERO_ERROR;
115 : const void *old_context;
116 : UConverterFromUCallback old_cb;
117 73657 : int32_t subst_char_len = u_strlen(subst_char);
118 :
119 73657 : if (!subst_char_len)
120 0 : return;
121 :
122 73657 : ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_STOP, NULL, &old_cb, &old_context, &temp);
123 73657 : dest_len = ucnv_fromUChars(conv, dest, dest_len, subst_char, subst_char_len, &status);
124 73657 : ucnv_setFromUCallBack(conv, old_cb, old_context, NULL, NULL, &temp);
125 73657 : if (U_FAILURE(status)) {
126 0 : zend_error(E_WARNING, "Could not set substitution character for the converter");
127 0 : return;
128 : }
129 :
130 : /* skip BOM for UTF-16/32 converters */
131 73657 : switch (ucnv_getType(conv)) {
132 : case UCNV_UTF16:
133 7 : dest_ptr = dest + 2;
134 7 : dest_len -= 2;
135 7 : break;
136 :
137 : case UCNV_UTF32:
138 0 : dest_ptr = dest + 4;
139 0 : dest_len -= 4;
140 0 : break;
141 :
142 : default:
143 73650 : dest_ptr = dest;
144 : break;
145 : }
146 :
147 73657 : ucnv_setSubstChars(conv, dest_ptr, dest_len, &status);
148 73657 : if (status == U_ILLEGAL_ARGUMENT_ERROR) {
149 0 : zend_error(E_WARNING, "Substitution character byte sequence is too short or long for this converter");
150 0 : return;
151 : }
152 : }
153 : /* }}} */
154 :
155 : /* {{{ zend_set_converter_encoding */
156 : int zend_set_converter_encoding(UConverter **converter, const char *encoding)
157 161106 : {
158 161106 : UErrorCode status = U_ZERO_ERROR;
159 161106 : UConverter *new_converter = NULL;
160 :
161 161106 : if (!converter) {
162 0 : return FAILURE;
163 : }
164 :
165 : /*
166 : * The specified encoding might be the same as converter's existing one,
167 : * which results in a no-op.
168 : */
169 161106 : if (*converter && encoding && encoding[0]) {
170 60 : const char *current = ucnv_getName(*converter, &status);
171 60 : status = U_ZERO_ERROR; /* reset error */
172 60 : if (!ucnv_compareNames(current, encoding)) {
173 2 : return SUCCESS;
174 : }
175 : }
176 :
177 : /*
178 : * If encoding is NULL, ucnv_open() will return a converter based on
179 : * the default platform encoding as determined by ucnv_getDefaultName().
180 : */
181 161104 : new_converter = ucnv_open(encoding, &status);
182 161104 : if (U_FAILURE(status)) {
183 7 : return FAILURE;
184 : }
185 :
186 161097 : if (*converter) {
187 58 : ucnv_close(*converter);
188 : }
189 161097 : *converter = new_converter;
190 :
191 161097 : return SUCCESS;
192 : }
193 : /* }}} */
194 :
195 : /* {{{ zend_copy_converter */
196 : int zend_copy_converter(UConverter **target, UConverter *source)
197 0 : {
198 0 : UErrorCode status = U_ZERO_ERROR;
199 : const char *encoding;
200 :
201 0 : assert(source != NULL);
202 :
203 0 : encoding = ucnv_getName(source, &status);
204 0 : if (U_FAILURE(status)) {
205 0 : return FAILURE;
206 : }
207 :
208 0 : return zend_set_converter_encoding(target, encoding);
209 : }
210 : /* }}} */
211 :
212 : /* {{{ zend_string_to_unicode_ex */
213 : ZEND_API int zend_string_to_unicode_ex(UConverter *conv, UChar **target, int *target_len, const char *source, int source_len, UErrorCode *status)
214 3792131 : {
215 3792131 : UChar *buffer = NULL;
216 : UChar *output;
217 3792131 : int32_t buffer_len = 0;
218 3792131 : int32_t converted = 0;
219 3792131 : const char *input = source;
220 : UConverterType conv_type;
221 :
222 3792131 : if (U_FAILURE(*status)) {
223 0 : return 0;
224 : }
225 :
226 3792131 : ucnv_resetToUnicode(conv);
227 3792131 : conv_type = ucnv_getType(conv);
228 :
229 3792131 : switch (conv_type) {
230 : case UCNV_SBCS:
231 : case UCNV_LATIN_1:
232 : case UCNV_US_ASCII:
233 : /*
234 : * For single-byte charsets, 1 input byte = 1 output UChar
235 : */
236 1694125 : buffer_len = source_len;
237 1694125 : break;
238 :
239 : default:
240 : /*
241 : * Initial estimate: 1.25 UChar's for every 2 source bytes + 2 (past a
242 : * certain limit (2)). The rationale behind this is that (atleast
243 : * in the case of GB2312) it is possible that there are single byte
244 : * characters in the input string. By using an GD2312 text as
245 : * example it seemed that a value of 1.25 allowed for as little
246 : * re-allocations as possible without over estimating the buffer
247 : * too much. In case there is a lot of single-byte characters
248 : * around a single multi-byte character this estimation is too low,
249 : * and then the re-allocation routines in the loop below kick in.
250 : * There we multiply by 1.33 and add 1 so that it's quite efficient
251 : * for smaller input strings without causing too many iterations of
252 : * this loop.
253 : */
254 2098006 : buffer_len = (source_len > 2) ? ((source_len >> 1) + (source_len >> 3) + 2) : source_len;
255 : break;
256 : }
257 :
258 : while (1) {
259 5303210 : buffer = eurealloc(buffer, buffer_len + 1);
260 5303210 : output = buffer + converted;
261 5303210 : ucnv_toUnicode(conv, &output, buffer + buffer_len, &input, source + source_len, NULL, TRUE, status);
262 5303210 : converted = (int32_t) (output - buffer);
263 5303210 : if (*status == U_BUFFER_OVERFLOW_ERROR) {
264 1511079 : buffer_len = (buffer_len * 1.33) + 1;
265 1511079 : *status = U_ZERO_ERROR;
266 : } else {
267 3792131 : break;
268 : }
269 1511079 : }
270 :
271 : /*
272 : * We return the buffer in case of failure anyway. The caller may want to
273 : * use partially converted string for something.
274 : */
275 :
276 3792131 : buffer[converted] = 0;
277 3792131 : *target = buffer;
278 3792131 : *target_len = converted;
279 :
280 3792131 : return input - source;
281 : }
282 : /* }}} */
283 :
284 : /* {{{ zend_unicode_to_string_ex */
285 : ZEND_API int zend_unicode_to_string_ex(UConverter *conv, char **target, int *target_len, const UChar *source, int source_len, UErrorCode *status)
286 12635333 : {
287 12635333 : char *buffer = NULL;
288 : char *output;
289 12635333 : int32_t buffer_len = 0;
290 12635333 : int32_t converted = 0;
291 12635333 : const UChar *input = source;
292 :
293 12635333 : if (U_FAILURE(*status)) {
294 0 : return 0;
295 : }
296 :
297 12635333 : ucnv_resetFromUnicode(conv);
298 :
299 12635333 : buffer_len = ucnv_getMaxCharSize(conv) * source_len;
300 :
301 : while (1) {
302 12635333 : buffer = erealloc(buffer, buffer_len + 1);
303 12635333 : output = buffer + converted;
304 12635333 : ucnv_fromUnicode(conv, &output, buffer + buffer_len, &input, source + source_len, NULL, TRUE, status);
305 12635333 : converted = (int32_t) (output - buffer);
306 12635333 : if (*status == U_BUFFER_OVERFLOW_ERROR) {
307 0 : buffer_len += 64;
308 0 : *status = U_ZERO_ERROR;
309 : } else {
310 12635333 : break;
311 : }
312 0 : }
313 :
314 : /*
315 : * We return the buffer in case of failure anyway. The caller may want to
316 : * use partially converted string for something.
317 : */
318 :
319 12635333 : buffer[converted] = 0; /* NULL-terminate the output string */
320 12635333 : *target = buffer;
321 12635333 : *target_len = converted;
322 :
323 12635333 : return input - source;
324 : }
325 : /* }}} */
326 :
327 : /* {{{ zend_convert_encodings */
328 : ZEND_API void zend_convert_encodings(UConverter *target_conv, UConverter *source_conv,
329 : char **target, int *target_len,
330 : const char *source, int source_len, UErrorCode *status)
331 121527 : {
332 121527 : char *buffer = NULL;
333 : char *output;
334 121527 : const char *input = source;
335 121527 : int32_t allocated = 0;
336 121527 : int32_t converted = 0;
337 : int8_t null_size;
338 : UChar pivot_buf[1024], *pivot, *pivot2;
339 :
340 121527 : if (U_FAILURE(*status)) {
341 0 : return;
342 : }
343 :
344 121527 : null_size = ucnv_getMinCharSize(target_conv);
345 121527 : allocated = source_len + null_size;
346 :
347 121527 : ucnv_resetToUnicode(source_conv);
348 121527 : ucnv_resetFromUnicode(target_conv);
349 121527 : pivot = pivot2 = pivot_buf;
350 :
351 : while (1) {
352 121527 : buffer = (char *) erealloc(buffer, allocated);
353 121527 : output = buffer + converted;
354 121527 : ucnv_convertEx(target_conv, source_conv, &output, buffer + allocated - null_size,
355 : &input, source + source_len, pivot_buf, &pivot, &pivot2, pivot_buf + 1024, FALSE, TRUE, status);
356 121527 : converted = (int32_t) (output - buffer);
357 121527 : if (*status == U_BUFFER_OVERFLOW_ERROR) {
358 0 : allocated += 1024;
359 0 : *status = U_ZERO_ERROR;
360 : } else {
361 121527 : break;
362 : }
363 0 : }
364 :
365 121527 : memset(buffer + converted, 0, null_size); /* NULL-terminate the output string */
366 121527 : *target = buffer;
367 121527 : *target_len = converted;
368 : }
369 : /* }}} */
370 :
371 : /* {{{ zend_unicode_to_ascii */
372 : ZEND_API char* zend_unicode_to_ascii(const UChar *us, int us_len TSRMLS_DC)
373 307522 : {
374 : char *cs;
375 : int cs_len;
376 307522 : UErrorCode status = U_ZERO_ERROR;
377 :
378 307522 : zend_unicode_to_string_ex(UG(ascii_conv), &cs, &cs_len, us, us_len, &status);
379 307522 : if (U_FAILURE(status)) {
380 7 : efree(cs);
381 7 : return NULL;
382 : }
383 307515 : return cs;
384 : }
385 : /* }}} */
386 :
387 : /* {{{ zend_default_conversion_error_handler */
388 : static void zend_default_conversion_error_handler(char *message, UConverter *conv, zend_conv_direction dir, int error_char_offset TSRMLS_DC)
389 18 : {
390 : const char *conv_name;
391 18 : UErrorCode status = U_ZERO_ERROR;
392 :
393 18 : if (!message)
394 0 : return;
395 :
396 18 : conv_name = ucnv_getName(conv, &status);
397 : /*
398 : * UTODO
399 : * use some other standard than MIME? or fallback onto IANA? or use
400 : * internal converter name? ponder
401 : */
402 18 : conv_name = ucnv_getStandardName(conv_name, "MIME", &status);
403 18 : status = U_ZERO_ERROR;
404 :
405 18 : if (dir == ZEND_FROM_UNICODE) {
406 : UChar err_char[U16_MAX_LENGTH];
407 5 : int8_t err_char_len = sizeof(err_char);
408 : UChar32 codepoint;
409 5 : char *message_fmt = "%s (converter %s failed on character {U+%04X} at offset %d)";
410 :
411 5 : memset(&err_char, '\0', U16_MAX_LENGTH);
412 5 : ucnv_getInvalidUChars(conv, err_char, &err_char_len, &status);
413 5 : codepoint = (err_char_len < 2) ? err_char[0] : U16_GET_SUPPLEMENTARY(err_char[0], err_char[1]);
414 :
415 5 : zend_error(E_WARNING, message_fmt, message, conv_name?conv_name:"", codepoint, error_char_offset-1);
416 : } else {
417 : char err_char[8]; /* UTF-8 uses up to 8 bytes */
418 : char buf[40]; /* 4x number of error bytes + 7 separators + 1 for safety */
419 13 : int8_t err_char_len = sizeof(err_char);
420 13 : char *message_fmt = "%s (converter %s failed on bytes (%s) at offset %d)";
421 : char *p;
422 : int i;
423 :
424 13 : memset(&err_char, '\0', 8);
425 13 : ucnv_getInvalidChars(conv, err_char, &err_char_len, &status);
426 13 : p = buf;
427 26 : for (i = 0; i < err_char_len; i++) {
428 13 : sprintf(p, "0x%02X%s", (unsigned char)err_char[i], (i+1<err_char_len)?",":"");
429 13 : p += 4 + (i+1<err_char_len?1:0);
430 : }
431 13 : *p = 0;
432 :
433 13 : zend_error(E_WARNING, message_fmt, message, conv_name?conv_name:"", buf, error_char_offset-err_char_len);
434 : }
435 : }
436 : /* }}} */
437 :
438 : /* {{{ zend_call_conversion_error_handler */
439 : static void zend_call_conversion_error_handler(char *message, UConverter *conv, zend_conv_direction dir, int error_char_offset TSRMLS_DC)
440 0 : {
441 : zval *z_message, *z_dir, *z_encoding, *z_char, *z_offset;
442 : zval ***params;
443 : zval *retval;
444 : zval *orig_user_error_handler;
445 : const char *conv_name;
446 0 : UErrorCode status = U_ZERO_ERROR;
447 :
448 0 : ALLOC_INIT_ZVAL(z_message);
449 0 : ALLOC_INIT_ZVAL(z_dir);
450 0 : ALLOC_INIT_ZVAL(z_encoding);
451 0 : ALLOC_INIT_ZVAL(z_char);
452 0 : ALLOC_INIT_ZVAL(z_offset);
453 :
454 0 : if (message) {
455 0 : ZVAL_STRING(z_message, message, 1);
456 : } else {
457 0 : ZVAL_NULL(z_message);
458 : }
459 :
460 0 : ZVAL_LONG(z_dir, dir);
461 :
462 0 : conv_name = ucnv_getName(conv, &status);
463 : /*
464 : * UTODO
465 : * use some other standard than MIME? or fallback onto IANA? or use
466 : * internal converter name? ponder
467 : * maybe pass Converter object, when it's implemented?
468 : */
469 0 : conv_name = ucnv_getStandardName(conv_name, "MIME", &status);
470 0 : ZVAL_STRING(z_encoding, (char *) conv_name, 1);
471 :
472 0 : if (dir == ZEND_FROM_UNICODE) {
473 : UChar err_char[U16_MAX_LENGTH];
474 0 : int8_t err_char_len = sizeof(err_char);
475 :
476 0 : memset(&err_char, '\0', U16_MAX_LENGTH);
477 0 : ucnv_getInvalidUChars(conv, err_char, &err_char_len, &status);
478 0 : ZVAL_UNICODEL(z_char, err_char, err_char_len, 1);
479 0 : ZVAL_LONG(z_offset, error_char_offset-1);
480 : } else {
481 : char err_char[8]; /* UTF-8 uses up to 8 bytes */
482 0 : int8_t err_char_len = sizeof(err_char);
483 :
484 0 : memset(&err_char, '\0', 8);
485 0 : ucnv_getInvalidChars(conv, err_char, &err_char_len, &status);
486 0 : ZVAL_STRINGL(z_char, err_char, err_char_len, 1);
487 0 : ZVAL_LONG(z_offset, error_char_offset-err_char_len);
488 : }
489 :
490 0 : params = (zval ***) emalloc(sizeof(zval **) * 6);
491 0 : params[0] = &z_dir;
492 0 : params[1] = &z_encoding;
493 0 : params[2] = &z_char;
494 0 : params[3] = &z_offset;
495 0 : params[4] = &z_message;
496 :
497 0 : orig_user_error_handler = UG(conv_error_handler);
498 0 : UG(conv_error_handler) = NULL;
499 :
500 0 : if (call_user_function_ex(EG(function_table), NULL, orig_user_error_handler, &retval, 5, params, 1, NULL TSRMLS_CC)==SUCCESS) {
501 0 : if (retval) {
502 : /* user error handler returned 'false', use built-in error handler */
503 0 : if (Z_TYPE_P(retval) == IS_BOOL && Z_LVAL_P(retval) == 0) {
504 0 : zend_default_conversion_error_handler(message, conv, dir, error_char_offset TSRMLS_CC);
505 : }
506 0 : zval_ptr_dtor(&retval);
507 : }
508 0 : } else if (!EG(exception)) {
509 : /* The user error handler failed, use built-in error handler */
510 0 : zend_default_conversion_error_handler(message, conv, dir, error_char_offset TSRMLS_CC);
511 : }
512 :
513 0 : if (!UG(conv_error_handler)) {
514 0 : UG(conv_error_handler) = orig_user_error_handler;
515 : } else {
516 0 : zval_ptr_dtor(&orig_user_error_handler);
517 : }
518 :
519 0 : efree(params);
520 0 : zval_ptr_dtor(&z_dir);
521 0 : zval_ptr_dtor(&z_encoding);
522 0 : zval_ptr_dtor(&z_char);
523 0 : zval_ptr_dtor(&z_offset);
524 0 : zval_ptr_dtor(&z_message);
525 0 : }
526 : /* }}} */
527 :
528 : /* {{{ zend_raise_conversion_error_ex */
529 : ZEND_API void zend_raise_conversion_error_ex(char *message, UConverter *conv, zend_conv_direction dir, int error_char_offset TSRMLS_DC)
530 18 : {
531 18 : if (UG(conv_error_handler)) {
532 0 : zend_call_conversion_error_handler(message, conv, dir, error_char_offset TSRMLS_CC);
533 : } else {
534 18 : zend_default_conversion_error_handler(message, conv, dir, error_char_offset TSRMLS_CC);
535 : }
536 18 : }
537 : /* }}} */
538 :
539 : /* {{{ zend_unicode_to_string */
540 : ZEND_API int zend_unicode_to_string(UConverter *conv, char **s, int *s_len, const UChar *u, int u_len TSRMLS_DC)
541 3588269 : {
542 3588269 : UErrorCode status = U_ZERO_ERROR;
543 : int num_conv;
544 :
545 3588269 : if (conv == NULL) {
546 0 : conv = UG(runtime_encoding_conv);
547 : }
548 :
549 3588269 : num_conv = zend_unicode_to_string_ex(conv, s, s_len, u, u_len, &status);
550 :
551 3588269 : if (U_FAILURE(status)) {
552 5 : int32_t offset = u_countChar32(u, num_conv);
553 :
554 5 : zend_raise_conversion_error_ex("Could not convert Unicode string to binary string", conv, ZEND_FROM_UNICODE, offset TSRMLS_CC);
555 5 : if (*s) {
556 5 : efree(*s);
557 : }
558 5 : *s = NULL;
559 5 : *s_len = 0;
560 5 : return FAILURE;
561 : }
562 3588264 : return SUCCESS;
563 : }
564 : /* }}} */
565 :
566 : /* {{{ zval_unicode_to_string_ex */
567 : ZEND_API int zval_unicode_to_string_ex(zval *string, UConverter *conv TSRMLS_DC)
568 3573825 : {
569 3573825 : char *s = NULL;
570 : int s_len;
571 :
572 3573825 : UChar *u = Z_USTRVAL_P(string);
573 3573825 : int u_len = Z_USTRLEN_P(string);
574 :
575 3573825 : if (zend_unicode_to_string(conv, &s, &s_len, u, u_len TSRMLS_CC) == SUCCESS) {
576 3573820 : ZVAL_STRINGL(string, s, s_len, 0);
577 3573820 : efree((UChar*)u);
578 3573820 : return SUCCESS;
579 : } else {
580 5 : ZVAL_EMPTY_STRING(string);
581 5 : efree((UChar*)u);
582 5 : return FAILURE;
583 : }
584 : }
585 : /* }}} */
586 :
587 : /* {{{ zval_unicode_to_string */
588 : ZEND_API int zval_unicode_to_string(zval *string TSRMLS_DC)
589 92 : {
590 92 : return zval_unicode_to_string_ex(string, ZEND_U_CONVERTER(UG(runtime_encoding_conv)) TSRMLS_CC);
591 : }
592 : /* }}} */
593 :
594 : /* {{{ zend_string_to_unicode */
595 : ZEND_API int zend_string_to_unicode(UConverter *conv, UChar **u, int *u_len, char *s, int s_len TSRMLS_DC)
596 1013172 : {
597 1013172 : UErrorCode status = U_ZERO_ERROR;
598 : int num_conv;
599 :
600 1013172 : if (conv == NULL) {
601 4176 : conv = ZEND_U_CONVERTER(UG(runtime_encoding_conv));
602 : }
603 :
604 1013172 : num_conv = zend_string_to_unicode_ex(conv, u, u_len, s, s_len, &status);
605 :
606 1013172 : if (U_FAILURE(status)) {
607 13 : zend_raise_conversion_error_ex("Could not convert binary string to Unicode string", conv, ZEND_TO_UNICODE, num_conv TSRMLS_CC);
608 13 : if (*u) {
609 13 : efree(*u);
610 : }
611 13 : *u = NULL;
612 13 : *u_len = 0;
613 13 : return FAILURE;
614 : }
615 1013159 : return SUCCESS;
616 : }
617 : /* }}} */
618 :
619 : /* {{{ zval_string_to_unicode_ex */
620 : ZEND_API int zval_string_to_unicode_ex(zval *string, UConverter *conv TSRMLS_DC)
621 304917 : {
622 304917 : UChar *u = NULL;
623 : int u_len;
624 :
625 304917 : char *s = Z_STRVAL_P(string);
626 304917 : int s_len = Z_STRLEN_P(string);
627 :
628 304917 : if (zend_string_to_unicode(conv, &u, &u_len, s, s_len TSRMLS_CC) == SUCCESS) {
629 304914 : ZVAL_UNICODEL(string, u, u_len, 0);
630 304914 : efree(s);
631 304914 : return SUCCESS;
632 : } else {
633 3 : ZVAL_EMPTY_UNICODE(string);
634 3 : efree(s);
635 3 : return FAILURE;
636 : }
637 : }
638 : /* }}} */
639 :
640 : /* {{{ zval_string_to_unicode */
641 : ZEND_API int zval_string_to_unicode(zval *string TSRMLS_DC)
642 6 : {
643 6 : return zval_string_to_unicode_ex(string, ZEND_U_CONVERTER(UG(runtime_encoding_conv)) TSRMLS_CC);
644 : }
645 : /* }}} */
646 :
647 : /* {{{ zend_cmp_unicode_and_string */
648 : ZEND_API int zend_cmp_unicode_and_string(UChar *ustr, char* str, uint len)
649 4176 : {
650 4176 : UChar *u = NULL;
651 : int u_len;
652 4176 : int retval = TRUE;
653 : TSRMLS_FETCH();
654 :
655 4176 : if (zend_string_to_unicode(NULL, &u, &u_len, str, len TSRMLS_CC) == FAILURE) {
656 0 : return FAILURE;
657 : }
658 4176 : retval = u_memcmp(ustr, u, u_len);
659 4176 : efree(u);
660 4176 : return retval;
661 : }
662 : /* }}} */
663 :
664 : /* {{{ zend_cmp_unicode_and_literal */
665 : /*
666 : * Compare a Unicode string and an ASCII literal. Because ASCII maps nicely onto Unicode
667 : * range U+0000 .. U+007F, we can simply cast ASCII chars to Unicode values and avoid
668 : * memory allocation.
669 : */
670 : ZEND_API int zend_cmp_unicode_and_literal(UChar *ustr, int ulen, char *str, int slen)
671 1159203 : {
672 : int result;
673 1159203 : uint len = MIN(ulen, slen);
674 :
675 : /* UTODO: make sure we're only comparing against ASCII values here (< 0x80) */
676 7336559 : while (len--) {
677 5732516 : result = (int)(uint16_t)*ustr - (int)(uint16_t)*str;
678 5732516 : if (result != 0)
679 714363 : return result;
680 5018153 : ustr++;
681 5018153 : str++;
682 : }
683 :
684 444840 : return ulen - slen;
685 : }
686 : /* }}} */
687 :
688 : /* {{{ zend_is_valid_identifier */
689 : ZEND_API int zend_is_valid_identifier(UChar *ident, int len)
690 938434 : {
691 : UChar32 codepoint;
692 : int32_t i;
693 938434 : int32_t ident_len = len;
694 938434 : UProperty id_prop = UCHAR_XID_START;
695 :
696 18894636 : for (i = 0; i < ident_len; ) {
697 17017789 : U16_NEXT(ident, i, ident_len, codepoint);
698 17017789 : if (!u_hasBinaryProperty(codepoint, id_prop) &&
699 : codepoint != 0x5f) { /* special case for starting '_' */
700 21 : return 0;
701 : }
702 17017768 : id_prop = UCHAR_XID_CONTINUE;
703 : }
704 :
705 938413 : return 1;
706 : }
707 : /* }}} */
708 :
709 : /* {{{ zend_normalize_string */
710 : static inline void zend_normalize_string(UChar **dest, int32_t *dest_len, UChar *src, int src_len, UErrorCode *status)
711 5 : {
712 5 : UChar *buffer = NULL;
713 : int32_t buffer_len;
714 :
715 5 : buffer_len = src_len;
716 : while (1) {
717 5 : *status = U_ZERO_ERROR;
718 5 : buffer = eurealloc(buffer, buffer_len+1);
719 5 : buffer_len = unorm_normalize(src, src_len, UNORM_NFKC, 0, buffer, buffer_len, status);
720 5 : if (*status != U_BUFFER_OVERFLOW_ERROR) {
721 5 : break;
722 : }
723 0 : }
724 5 : if (U_SUCCESS(*status)) {
725 5 : buffer[buffer_len] = 0;
726 5 : *dest = buffer;
727 5 : *dest_len = buffer_len;
728 : } else {
729 0 : efree(buffer);
730 : }
731 5 : }
732 : /* }}} */
733 :
734 : /* {{{ zend_case_fold_string */
735 : ZEND_API void zend_case_fold_string(UChar **dest, int *dest_len, UChar *src, int src_len, uint32_t options, UErrorCode *status)
736 29748695 : {
737 29748695 : UChar *buffer = NULL;
738 : int32_t buffer_len;
739 :
740 29748695 : buffer_len = src_len;
741 : while (1) {
742 29748695 : *status = U_ZERO_ERROR;
743 29748695 : buffer = eurealloc(buffer, buffer_len+1);
744 29748695 : buffer_len = u_strFoldCase(buffer, buffer_len, src, src_len, options, status);
745 29748695 : if (*status != U_BUFFER_OVERFLOW_ERROR) {
746 29748695 : break;
747 : }
748 0 : }
749 29748695 : if (U_SUCCESS(*status)) {
750 29748695 : buffer[buffer_len] = 0;
751 29748695 : *dest = buffer;
752 29748695 : *dest_len = buffer_len;
753 : } else {
754 0 : efree(buffer);
755 : }
756 29748695 : }
757 : /* }}} */
758 :
759 : /* {{{ zend_normalize_identifier */
760 : ZEND_API int zend_normalize_identifier(UChar **dest, int *dest_len, UChar *ident, int ident_len, zend_bool fold_case)
761 2425061 : {
762 2425061 : UChar *buffer = NULL;
763 2425061 : UChar *orig_ident = ident;
764 : int32_t buffer_len;
765 2425061 : UErrorCode status = U_ZERO_ERROR;
766 :
767 2425061 : if (unorm_quickCheck(ident, ident_len, UNORM_NFKC, &status) != UNORM_YES) {
768 5 : zend_normalize_string(&buffer, &buffer_len, ident, ident_len, &status);
769 5 : if (U_FAILURE(status)) {
770 0 : return FAILURE;
771 : }
772 5 : ident = buffer;
773 5 : ident_len = buffer_len;
774 : }
775 :
776 2425061 : if (fold_case) {
777 1470375 : zend_case_fold_string(&buffer, &buffer_len, ident, ident_len, U_FOLD_CASE_DEFAULT, &status);
778 1470375 : if (ident != orig_ident) {
779 0 : efree(ident);
780 : }
781 1470375 : if (U_FAILURE(status)) {
782 0 : return FAILURE;
783 : }
784 1470375 : ident = buffer;
785 1470375 : ident_len = buffer_len;
786 :
787 1470375 : if (unorm_quickCheck(ident, ident_len, UNORM_NFKC, &status) != UNORM_YES) {
788 0 : zend_normalize_string(&buffer, &buffer_len, ident, ident_len, &status);
789 0 : if (ident != orig_ident) {
790 0 : efree(ident);
791 : }
792 0 : if (U_FAILURE(status)) {
793 0 : return FAILURE;
794 : }
795 0 : ident = buffer;
796 0 : ident_len = buffer_len;
797 : }
798 : }
799 :
800 2425061 : *dest = ident;
801 2425061 : *dest_len = ident_len;
802 2425061 : return SUCCESS;
803 : }
804 : /* }}} */
805 :
806 : /* {{{ zend_register_unicode_exceptions */
807 : void zend_register_unicode_exceptions(TSRMLS_D)
808 17007 : {
809 : zend_class_entry ce;
810 :
811 17007 : INIT_CLASS_ENTRY(ce, "UnicodeConversionException", NULL);
812 17007 : unicodeConversionException = zend_register_internal_class_ex(&ce, zend_exception_get_default(TSRMLS_C), NULL TSRMLS_CC);
813 17007 : }
814 : /* }}} */
815 :
816 : zend_collator* zend_collator_create(UCollator *coll) /* {{{ */
817 17017 : {
818 17017 : zend_collator *zcoll = NULL;
819 :
820 17017 : zcoll = emalloc(sizeof(zend_collator));
821 17017 : zcoll->coll = coll;
822 17017 : zcoll->refcount = 1;
823 :
824 17017 : return zcoll;
825 : }
826 : /* }}} */
827 :
828 : void zend_collator_destroy(zend_collator *zcoll) /* {{{ */
829 17049 : {
830 17049 : zcoll->refcount--;
831 17049 : if (zcoll->refcount == 0) {
832 17049 : ucol_close(zcoll->coll);
833 17049 : efree(zcoll);
834 : }
835 17049 : }
836 : /* }}} */
837 :
838 : /*
839 : * Local variables:
840 : * tab-width: 4
841 : * c-basic-offset: 4
842 : * indent-tabs-mode: t
843 : * End:
844 : * vim: noet sw=4 ts=4 fdm=marker
845 : */
|