1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 5 |
4 : +----------------------------------------------------------------------+
5 : | This source file is subject to version 3.01 of the PHP license, |
6 : | that is bundled with this package in the file LICENSE, and is |
7 : | available through the world-wide-web at the following url: |
8 : | http://www.php.net/license/3_01.txt |
9 : | If you did not receive a copy of the PHP license and are unable to |
10 : | obtain it through the world-wide-web, please send a note to |
11 : | license@php.net so we can mail you a copy immediately. |
12 : +----------------------------------------------------------------------+
13 : | Authors: Ed Batutis <ed@batutis.com> |
14 : +----------------------------------------------------------------------+
15 : */
16 :
17 : #ifdef HAVE_CONFIG_H
18 : #include "config.h"
19 : #endif
20 :
21 : #include "php_intl.h"
22 : #include "unicode/unorm.h"
23 : #include "normalizer.h"
24 : #include "normalizer_class.h"
25 : #include "normalizer_normalize.h"
26 : #include "intl_convert.h"
27 :
28 : /* {{{ proto string Normalizer::normalize( string $input [, int $form = FORM_C] )
29 : * Normalize a string to the given normalization form. }}} */
30 : /* {{{ proto string normalizer_normalize( string $input [, int $form = FORM_C] )
31 : * Normalize a string to the given normalization form.
32 : */
33 : PHP_FUNCTION( normalizer_normalize )
34 60 : {
35 60 : char* input = NULL;
36 : /* form is optional, defaults to FORM_C */
37 60 : long form = NORMALIZER_DEFAULT;
38 60 : int input_len = 0;
39 :
40 60 : UChar* uinput = NULL;
41 60 : int uinput_len = 0;
42 60 : int expansion_factor = 1;
43 60 : UErrorCode status = U_ZERO_ERROR;
44 :
45 60 : UChar* uret_buf = NULL;
46 60 : int uret_len = 0;
47 :
48 60 : char* ret_buf = NULL;
49 60 : int32_t ret_len = 0;
50 :
51 : int32_t size_needed;
52 :
53 60 : NORMALIZER_METHOD_INIT_VARS
54 :
55 60 : intl_error_reset( NULL TSRMLS_CC );
56 :
57 : /* Parse parameters. */
58 60 : if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
59 : &input, &input_len, &form ) == FAILURE )
60 : {
61 0 : intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
62 : "normalizer_normalize: unable to parse input params", 1 TSRMLS_CC );
63 :
64 0 : RETURN_NULL();
65 : }
66 :
67 60 : expansion_factor = 1;
68 :
69 60 : switch(form) {
70 : case NORMALIZER_NONE:
71 12 : break;
72 : case NORMALIZER_FORM_D:
73 12 : expansion_factor = 3;
74 12 : break;
75 : case NORMALIZER_FORM_KD:
76 12 : expansion_factor = 3;
77 12 : break;
78 : case NORMALIZER_FORM_C:
79 : case NORMALIZER_FORM_KC:
80 24 : break;
81 : default:
82 0 : intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
83 : "normalizer_normalize: illegal normalization form", 1 TSRMLS_CC );
84 0 : RETURN_NULL();
85 : }
86 :
87 : /*
88 : * Normalize string (converting it to UTF-16 first).
89 : */
90 :
91 : /* First convert the string to UTF-16. */
92 60 : intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
93 :
94 60 : if( U_FAILURE( status ) )
95 : {
96 : /* Set global error code. */
97 0 : intl_error_set_code( NULL, status TSRMLS_CC );
98 :
99 : /* Set error messages. */
100 0 : intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC );
101 0 : efree( uinput );
102 0 : RETURN_NULL();
103 : }
104 :
105 :
106 : /* Allocate memory for the destination buffer for normalization */
107 60 : uret_len = uinput_len * expansion_factor;
108 60 : uret_buf = eumalloc( uret_len + 1 );
109 :
110 : /* normalize */
111 60 : size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
112 :
113 : /* Bail out if an unexpected error occured.
114 : * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
115 : * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
116 : */
117 60 : if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
118 0 : efree( uret_buf );
119 0 : efree( uinput );
120 0 : RETURN_NULL();
121 : }
122 :
123 60 : if ( size_needed > uret_len ) {
124 : /* realloc does not seem to work properly - memory is corrupted
125 : * uret_buf = eurealloc(uret_buf, size_needed + 1);
126 : */
127 2 : efree( uret_buf );
128 2 : uret_buf = eumalloc( size_needed + 1 );
129 2 : uret_len = size_needed;
130 :
131 2 : status = U_ZERO_ERROR;
132 :
133 : /* try normalize again */
134 2 : size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
135 :
136 : /* Bail out if an unexpected error occured. */
137 2 : if( U_FAILURE(status) ) {
138 : /* Set error messages. */
139 0 : intl_error_set_custom_msg( NULL,"Error normalizing string", 1 TSRMLS_CC );
140 0 : efree( uret_buf );
141 0 : efree( uinput );
142 0 : RETURN_NULL();
143 : }
144 : }
145 :
146 60 : efree( uinput );
147 :
148 : /* the buffer we actually used */
149 60 : uret_len = size_needed;
150 :
151 : /* Convert normalized string from UTF-16 to UTF-8. */
152 60 : intl_convert_utf16_to_utf8( &ret_buf, &ret_len, uret_buf, uret_len, &status );
153 60 : efree( uret_buf );
154 60 : if( U_FAILURE( status ) )
155 : {
156 0 : intl_error_set( NULL, status,
157 : "normalizer_normalize: error converting normalized text UTF-8", 1 TSRMLS_CC );
158 0 : RETURN_NULL();
159 : }
160 :
161 : /* Return it. */
162 60 : RETVAL_STRINGL( ret_buf, ret_len, FALSE );
163 : }
164 : /* }}} */
165 :
166 : /* {{{ proto bool Normalizer::isNormalized( string $input [, int $form = FORM_C] )
167 : * Test if a string is in a given normalization form. }}} */
168 : /* {{{ proto bool normalizer_is_normalized( string $input [, int $form = FORM_C] )
169 : * Test if a string is in a given normalization form.
170 : */
171 : PHP_FUNCTION( normalizer_is_normalized )
172 60 : {
173 60 : char* input = NULL;
174 : /* form is optional, defaults to FORM_C */
175 60 : long form = NORMALIZER_DEFAULT;
176 60 : int input_len = 0;
177 :
178 60 : UChar* uinput = NULL;
179 60 : int uinput_len = 0;
180 60 : UErrorCode status = U_ZERO_ERROR;
181 :
182 60 : UBool uret = FALSE;
183 :
184 60 : NORMALIZER_METHOD_INIT_VARS
185 :
186 60 : intl_error_reset( NULL TSRMLS_CC );
187 :
188 : /* Parse parameters. */
189 60 : if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
190 : &input, &input_len, &form) == FAILURE )
191 : {
192 0 : intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
193 : "normalizer_is_normalized: unable to parse input params", 1 TSRMLS_CC );
194 :
195 0 : RETURN_FALSE;
196 : }
197 :
198 60 : switch(form) {
199 : /* case NORMALIZER_NONE: not allowed - doesn't make sense */
200 :
201 : case NORMALIZER_FORM_D:
202 : case NORMALIZER_FORM_KD:
203 : case NORMALIZER_FORM_C:
204 : case NORMALIZER_FORM_KC:
205 : break;
206 : default:
207 12 : intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
208 : "normalizer_normalize: illegal normalization form", 1 TSRMLS_CC );
209 12 : RETURN_NULL();
210 : }
211 :
212 :
213 : /*
214 : * Test normalization of string (converting it to UTF-16 first).
215 : */
216 :
217 : /* First convert the string to UTF-16. */
218 48 : intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
219 :
220 48 : if( U_FAILURE( status ) )
221 : {
222 : /* Set global error code. */
223 0 : intl_error_set_code( NULL, status TSRMLS_CC );
224 :
225 : /* Set error messages. */
226 0 : intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 1 TSRMLS_CC );
227 0 : efree( uinput );
228 0 : RETURN_FALSE;
229 : }
230 :
231 :
232 : /* test string */
233 48 : uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
234 :
235 48 : efree( uinput );
236 :
237 : /* Bail out if an unexpected error occured. */
238 48 : if( U_FAILURE(status) ) {
239 : /* Set error messages. */
240 0 : intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 1 TSRMLS_CC );
241 0 : RETURN_FALSE;
242 : }
243 :
244 48 : if ( uret )
245 18 : RETURN_TRUE;
246 :
247 30 : RETURN_FALSE;
248 : }
249 : /* }}} */
250 :
251 : /*
252 : * Local variables:
253 : * tab-width: 4
254 : * c-basic-offset: 4
255 : * End:
256 : * vim600: noet sw=4 ts=4 fdm=marker
257 : * vim<600: noet sw=4 ts=4
258 : */
|