1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 6 |
4 : +----------------------------------------------------------------------+
5 : | This source file is subject to version 3.01 of the PHP license, |
6 : | that is bundled with this package in the file LICENSE, and is |
7 : | available through the world-wide-web at the following url: |
8 : | http://www.php.net/license/3_01.txt |
9 : | If you did not receive a copy of the PHP license and are unable to |
10 : | obtain it through the world-wide-web, please send a note to |
11 : | license@php.net so we can mail you a copy immediately. |
12 : +----------------------------------------------------------------------+
13 : | Authors: Andrei Zmievski <andrei@php.net> |
14 : +----------------------------------------------------------------------+
15 : */
16 :
17 : /* $Id: unicode_iterators.c 284435 2009-07-20 12:58:17Z pajoye $ */
18 :
19 : /*
20 : * TODO
21 : *
22 : * - test with empty and 1 character strings
23 : * - optimize current() to pass return_value to the handler so that it fills it
24 : * in directly instead of creating a new zval
25 : * - implement Countable (or count_elements handler) and Seekable interfaces
26 : */
27 :
28 : #include "php.h"
29 : #include "zend_interfaces.h"
30 : #include "zend_exceptions.h"
31 : #include <unicode/ubrk.h>
32 :
/*
 * Kinds of text unit an iterator can step over.  The numeric value is used
 * directly as an index into iter_ops[], and the values from ITER_CHARACTER
 * upwards are mapped onto brk_type_map[] by subtracting ITER_CHARACTER.
 */
typedef enum {
	ITER_CODE_POINT = 0,	/* one code point at a time */
	ITER_COMB_SEQUENCE,	/* combining sequences */
	ITER_CHARACTER,		/* first of the UBreakIterator-backed types */
	ITER_WORD,
	ITER_LINE,
	ITER_SENTENCE,
	ITER_TYPE_LAST		/* number of types; not a valid type itself */
} text_iter_type;
42 :
/* Flag bit: the iterator walks from the end of the text towards the start. */
static const uint32_t ITER_REVERSE   = 0x0100;
/* Low byte of the flags word: holds the text_iter_type value. */
static const uint32_t ITER_TYPE_MASK = 0x00FF;
45 :
46 : typedef struct {
47 : zend_object std;
48 : UChar* text;
49 : uint32_t text_len;
50 : text_iter_type type;
51 : zval* current;
52 : size_t current_alloc;
53 : long flags;
54 : union {
55 : struct {
56 : int32_t offset;
57 : int32_t cp_offset;
58 : int32_t index;
59 : } cp;
60 : struct {
61 : int32_t start;
62 : int32_t end;
63 : int32_t index;
64 : int32_t start_cp_offset;
65 : int32_t end_cp_offset;
66 : } cs;
67 : struct {
68 : int32_t bound;
69 : int32_t next;
70 : int32_t index;
71 : int32_t cp_offset;
72 : UBreakIterator *iter;
73 : UBreakIterator *n_iter;
74 : } brk;
75 : } u;
76 : zend_object_iterator iter;
77 : } text_iter_obj;
78 :
79 : static inline text_iter_obj* text_iter_to_obj(zend_object_iterator *iter)
80 54 : {
81 54 : return (text_iter_obj *)((char*)iter - offsetof(text_iter_obj, iter));
82 : }
83 :
84 : typedef struct {
85 : int (*valid) (text_iter_obj* object, long flags TSRMLS_DC);
86 : void (*current) (text_iter_obj* object, long flags TSRMLS_DC);
87 : int (*key) (text_iter_obj* object, long flags TSRMLS_DC);
88 : int (*offset) (text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC);
89 : void (*next) (text_iter_obj* object, long flags TSRMLS_DC);
90 : void (*rewind) (text_iter_obj* object, long flags TSRMLS_DC);
91 : void (*following) (text_iter_obj* object, int32_t offset, long flags TSRMLS_DC);
92 : zend_bool (*isBoundary)(text_iter_obj* object, int32_t offset, long flags TSRMLS_DC);
93 : } text_iter_ops;
94 :
95 : enum UBreakIteratorType brk_type_map[] = {
96 : UBRK_CHARACTER,
97 : UBRK_WORD,
98 : UBRK_LINE,
99 : UBRK_SENTENCE,
100 : };
101 :
102 : PHPAPI zend_class_entry* text_iterator_ce;
103 : PHPAPI zend_class_entry* rev_text_iterator_ce;
104 :
105 : /* Code point ops */
106 :
107 : static int text_iter_cp_valid(text_iter_obj* object, long flags TSRMLS_DC)
108 16 : {
109 16 : if (object->u.cp.offset == UBRK_DONE) {
110 0 : return 0;
111 : }
112 :
113 16 : if (flags & ITER_REVERSE) {
114 8 : return (object->u.cp.offset != 0);
115 : } else {
116 8 : return (object->u.cp.offset != object->text_len);
117 : }
118 : }
119 :
120 : static void text_iter_cp_current(text_iter_obj* object, long flags TSRMLS_DC)
121 12 : {
122 12 : UChar32 cp = 0;
123 12 : int32_t tmp, buf_len = 0;
124 :
125 12 : tmp = object->u.cp.offset;
126 :
127 12 : if (flags & ITER_REVERSE) {
128 6 : if (object->u.cp.offset != UBRK_DONE && object->u.cp.offset > 0) {
129 6 : U16_PREV(object->text, 0, tmp, cp);
130 6 : buf_len = zend_codepoint_to_uchar(cp, Z_USTRVAL_P(object->current));
131 : }
132 : } else {
133 6 : if (object->u.cp.offset != UBRK_DONE && ((uint32_t)object->u.cp.offset) < object->text_len) {
134 6 : U16_NEXT(object->text, tmp, (int32_t)object->text_len, cp);
135 6 : buf_len = zend_codepoint_to_uchar(cp, Z_USTRVAL_P(object->current));
136 : }
137 : }
138 12 : Z_USTRVAL_P(object->current)[buf_len] = 0;
139 12 : Z_USTRLEN_P(object->current) = buf_len;
140 12 : }
141 :
142 : static int text_iter_cp_key(text_iter_obj* object, long flags TSRMLS_DC)
143 6 : {
144 6 : return object->u.cp.index;
145 : }
146 :
147 : static int text_iter_cp_offset(text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC)
148 7 : {
149 7 : if (cp_offset) {
150 1 : *cp_offset = object->u.cp.cp_offset;
151 : }
152 7 : return object->u.cp.offset;
153 : }
154 :
155 : static void text_iter_cp_next(text_iter_obj* object, long flags TSRMLS_DC)
156 18 : {
157 18 : if (object->u.cp.offset == UBRK_DONE) {
158 0 : return;
159 : }
160 :
161 18 : if (object->text == NULL) {
162 1 : object->u.cp.offset = object->u.cp.cp_offset = UBRK_DONE;
163 1 : return;
164 : }
165 :
166 17 : if (flags & ITER_REVERSE) {
167 6 : U16_BACK_1(object->text, 0, object->u.cp.offset);
168 6 : if ((uint32_t)object->u.cp.offset <= object->text_len) {
169 6 : object->u.cp.cp_offset--;
170 : } else {
171 0 : object->u.cp.offset = object->u.cp.cp_offset = UBRK_DONE;
172 : }
173 : } else {
174 11 : U16_FWD_1(object->text, object->u.cp.offset, (int32_t) object->text_len);
175 11 : if ((uint32_t) object->u.cp.offset <= object->text_len) {
176 10 : object->u.cp.cp_offset++;
177 : } else {
178 1 : object->u.cp.offset = object->u.cp.cp_offset = UBRK_DONE;
179 : }
180 : }
181 17 : object->u.cp.index++;
182 : }
183 :
184 : static void text_iter_cp_rewind(text_iter_obj *object, long flags TSRMLS_DC)
185 10 : {
186 10 : if (flags & ITER_REVERSE) {
187 4 : object->u.cp.offset = object->text_len;
188 4 : object->u.cp.cp_offset = u_countChar32(object->text, object->text_len);
189 : } else {
190 6 : object->u.cp.offset = 0;
191 6 : object->u.cp.cp_offset = 0;
192 : }
193 10 : object->u.cp.index = 0;
194 10 : }
195 :
196 : static void text_iter_cp_following(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC)
197 0 : {
198 : int32_t k;
199 :
200 0 : if (offset < 0) {
201 0 : offset = 0;
202 : }
203 :
204 : /*
205 : * On invalid iterator we always want to start looking for the code unit
206 : * offset from the beginning of the string.
207 : */
208 0 : if (object->u.cp.cp_offset == UBRK_DONE) {
209 0 : object->u.cp.cp_offset = 0;
210 0 : object->u.cp.offset = 0;
211 : }
212 :
213 : /*
214 : * Try to locate the code unit position relative to the last known codepoint
215 : * offset.
216 : */
217 0 : k = object->u.cp.offset;
218 0 : if (offset > object->u.cp.cp_offset) {
219 0 : U16_FWD_N(object->text, k, (int32_t) object->text_len, offset - object->u.cp.cp_offset);
220 : } else {
221 0 : U16_BACK_N(object->text, 0, k, object->u.cp.cp_offset - offset);
222 : }
223 :
224 : /*
225 : * Locate the actual boundary.
226 : */
227 0 : if (flags & ITER_REVERSE) {
228 0 : if (k == 0) {
229 0 : object->u.cp.cp_offset = UBRK_DONE;
230 0 : object->u.cp.offset = UBRK_DONE;
231 0 : return;
232 : } else {
233 0 : U16_BACK_1(object->text, 0, k);
234 : }
235 : } else {
236 0 : if (k == object->text_len) {
237 0 : object->u.cp.cp_offset = UBRK_DONE;
238 0 : object->u.cp.offset = UBRK_DONE;
239 0 : return;
240 : } else {
241 0 : U16_FWD_1(object->text, k, (int32_t) object->text_len);
242 : }
243 : }
244 :
245 : /*
246 : * If boundary is the same one as where we were at before, simply return.
247 : */
248 0 : if (k == object->u.cp.offset) {
249 0 : return;
250 : }
251 :
252 : /*
253 : * Adjust the internal codepoint offset based on how far we've moved.
254 : */
255 0 : if (k > object->u.cp.offset) {
256 0 : if (k - object->u.cp.offset > 1) {
257 0 : object->u.cp.cp_offset += u_countChar32(object->text + object->u.cp.offset, k - object->u.cp.offset);
258 : } else {
259 0 : object->u.cp.cp_offset++;
260 : }
261 : } else {
262 0 : if (object->u.cp.offset - k > 1) {
263 0 : object->u.cp.cp_offset -= u_countChar32(object->text + k, object->u.cp.offset - k);
264 : } else {
265 0 : object->u.cp.cp_offset--;
266 : }
267 : }
268 :
269 0 : object->u.cp.offset = k;
270 : }
271 :
272 : static zend_bool text_iter_cp_isBoundary(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC)
273 0 : {
274 : int32_t k;
275 :
276 0 : if (offset < 0) {
277 0 : offset = 0;
278 : }
279 :
280 : /*
281 : * On invalid iterator we always want to start looking for the code unit
282 : * offset from the beginning of the string.
283 : */
284 0 : if (object->u.cp.cp_offset == UBRK_DONE) {
285 0 : object->u.cp.cp_offset = 0;
286 0 : object->u.cp.offset = 0;
287 : }
288 :
289 : /*
290 : * Try to locate the code unit position relative to the last known codepoint
291 : * offset.
292 : */
293 0 : k = object->u.cp.offset;
294 0 : if (offset > object->u.cp.cp_offset) {
295 0 : U16_FWD_N(object->text, k, (int32_t) object->text_len, offset - object->u.cp.cp_offset);
296 : } else {
297 0 : U16_BACK_N(object->text, 0, k, object->u.cp.cp_offset - offset);
298 : }
299 :
300 0 : if (k == object->text_len) {
301 0 : object->u.cp.cp_offset += u_countChar32(object->text + object->u.cp.offset, k - object->u.cp.offset);
302 : } else {
303 0 : object->u.cp.cp_offset = offset;
304 : }
305 0 : object->u.cp.offset = k;
306 :
307 : /*
308 : * Every codepoint is a boundary.
309 : */
310 0 : return TRUE;
311 : }
312 :
313 : static text_iter_ops text_iter_cp_ops = {
314 : text_iter_cp_valid,
315 : text_iter_cp_current,
316 : text_iter_cp_key,
317 : text_iter_cp_offset,
318 : text_iter_cp_next,
319 : text_iter_cp_rewind,
320 : text_iter_cp_following,
321 : text_iter_cp_isBoundary,
322 : };
323 :
324 : /* Combining sequence ops */
325 :
326 : static void text_iter_helper_move(zend_bool forward, UChar *text, int32_t text_len, int32_t *offset, int32_t *cp_offset)
327 0 : {
328 : UChar32 cp;
329 : int32_t tmp, tmp2;
330 :
331 0 : if (*offset == UBRK_DONE) {
332 0 : return;
333 : }
334 :
335 0 : if (forward) {
336 0 : if (*offset == text_len) {
337 0 : *offset = UBRK_DONE;
338 0 : *cp_offset = UBRK_DONE;
339 : } else {
340 0 : U16_NEXT(text, (*offset), text_len, cp);
341 0 : (*cp_offset)++;
342 :
343 0 : if (u_getCombiningClass(cp) == 0) {
344 0 : tmp = *offset;
345 0 : tmp2 = *cp_offset;
346 : /*
347 : * At the end of the string cp will be 0 because of the NULL
348 : * terminating NULL, so combining class will be 0 as well.
349 : */
350 0 : while (tmp < text_len) {
351 0 : U16_NEXT(text, tmp, text_len, cp);
352 0 : tmp2++;
353 0 : if (u_getCombiningClass(cp) == 0) {
354 0 : break;
355 : } else {
356 0 : *offset = tmp;
357 0 : *cp_offset = tmp2;
358 : }
359 : }
360 : }
361 : }
362 : } else {
363 0 : if (*offset == 0) {
364 0 : *offset = UBRK_DONE;
365 0 : *cp_offset = UBRK_DONE;
366 : } else {
367 0 : U16_PREV(text, 0, (*offset), cp);
368 0 : (*cp_offset)--;
369 0 : if (u_getCombiningClass(cp) != 0) {
370 : do {
371 0 : U16_PREV(text, 0, (*offset), cp);
372 0 : (*cp_offset)--;
373 0 : } while (*offset > 0 && u_getCombiningClass(cp) != 0);
374 : }
375 : }
376 : }
377 : }
378 :
379 : static int text_iter_cs_valid(text_iter_obj* object, long flags TSRMLS_DC)
380 0 : {
381 0 : if (object->u.cs.start == UBRK_DONE) {
382 0 : return 0;
383 : }
384 :
385 0 : if (flags & ITER_REVERSE) {
386 0 : return (object->u.cs.start != 0);
387 : } else {
388 0 : return (object->u.cs.start != object->text_len);
389 : }
390 : }
391 :
392 : static void text_iter_cs_current(text_iter_obj* object, long flags TSRMLS_DC)
393 0 : {
394 : UChar *start;
395 0 : int32_t length = -1;
396 :
397 0 : if (object->u.cs.start != UBRK_DONE) {
398 0 : if (flags & ITER_REVERSE) {
399 0 : if (object->u.cs.end == object->u.cs.start) {
400 0 : text_iter_helper_move(0, object->text, object->text_len,
401 : &object->u.cs.end, &object->u.cs.end_cp_offset);
402 : }
403 0 : start = object->text + object->u.cs.end;
404 : } else {
405 0 : if (object->u.cs.end == object->u.cs.start) {
406 0 : text_iter_helper_move(1, object->text, object->text_len,
407 : &object->u.cs.end, &object->u.cs.end_cp_offset);
408 : }
409 0 : start = object->text + object->u.cs.start;
410 : }
411 :
412 0 : if (object->u.cs.end == UBRK_DONE) {
413 0 : length = 0;
414 : } else {
415 0 : length = abs(object->u.cs.end - object->u.cs.start);
416 : }
417 : } else {
418 0 : length = 0;
419 : }
420 :
421 0 : if (length != 0) {
422 0 : if (length+1 > (int32_t) object->current_alloc) {
423 0 : object->current_alloc = length+1;
424 0 : Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc);
425 : }
426 0 : u_memcpy(Z_USTRVAL_P(object->current), start, length);
427 : }
428 :
429 0 : Z_USTRVAL_P(object->current)[length] = 0;
430 0 : Z_USTRLEN_P(object->current) = length;
431 0 : }
432 :
433 : static int text_iter_cs_key(text_iter_obj* object, long flags TSRMLS_DC)
434 0 : {
435 0 : return object->u.cs.index;
436 : }
437 :
438 : static int text_iter_cs_offset(text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC)
439 0 : {
440 0 : if (cp_offset) {
441 0 : *cp_offset = object->u.cs.start_cp_offset;
442 : }
443 0 : return object->u.cs.start;
444 : }
445 :
446 : static void text_iter_cs_next(text_iter_obj* object, long flags TSRMLS_DC)
447 0 : {
448 0 : if (object->u.cs.start == UBRK_DONE) {
449 0 : return;
450 : }
451 :
452 0 : if (flags & ITER_REVERSE) {
453 0 : text_iter_helper_move(0, object->text, object->text_len,
454 : &object->u.cs.start, &object->u.cs.start_cp_offset);
455 : } else {
456 0 : text_iter_helper_move(1, object->text, object->text_len,
457 : &object->u.cs.start, &object->u.cs.start_cp_offset);
458 : }
459 0 : object->u.cs.end = object->u.cs.start;
460 0 : object->u.cs.end_cp_offset = object->u.cs.start_cp_offset;
461 0 : object->u.cs.index++;
462 : }
463 :
464 : static void text_iter_cs_rewind(text_iter_obj *object, long flags TSRMLS_DC)
465 0 : {
466 0 : if (flags & ITER_REVERSE) {
467 0 : object->u.cs.start = object->u.cs.end = object->text_len;
468 0 : object->u.cs.start_cp_offset = object->u.cs.end_cp_offset =
469 : u_countChar32(object->text, object->text_len);
470 : } else {
471 0 : object->u.cs.start = object->u.cs.end = 0;
472 0 : object->u.cs.start_cp_offset = object->u.cs.end_cp_offset = 0;
473 : }
474 0 : object->u.cs.index = 0; /* because _next increments index */
475 0 : }
476 :
477 : static void text_iter_cs_following(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC)
478 0 : {
479 : int32_t k;
480 :
481 0 : if (offset < 0) {
482 0 : offset = 0;
483 : }
484 :
485 : /*
486 : * On invalid iterator we always want to start looking for the code unit
487 : * offset from the beginning of the string.
488 : */
489 0 : if (object->u.cs.start_cp_offset == UBRK_DONE) {
490 0 : object->u.cs.start_cp_offset = 0;
491 0 : object->u.cs.start = 0;
492 : }
493 :
494 : /*
495 : * Try to locate the code unit position relative to the last known codepoint
496 : * offset.
497 : */
498 0 : k = object->u.cs.start;
499 0 : if (offset > object->u.cs.start_cp_offset) {
500 0 : U16_FWD_N(object->text, k, (int32_t) object->text_len, offset - object->u.cs.start_cp_offset);
501 : } else {
502 0 : U16_BACK_N(object->text, 0, k, object->u.cs.start_cp_offset - offset);
503 : }
504 :
505 : /*
506 : * Locate the actual boundary.
507 : */
508 0 : if (flags & ITER_REVERSE) {
509 : /*
510 : * If offset was at or beyond the length of text, we need to adjust it
511 : * to the number of codepoints in the text.
512 : */
513 0 : if (k == object->text_len) {
514 0 : offset = u_countChar32(object->text, object->text_len);
515 : }
516 0 : text_iter_helper_move(0, object->text, object->text_len, &k, &offset);
517 : } else {
518 0 : text_iter_helper_move(1, object->text, object->text_len, &k, &offset);
519 : }
520 :
521 0 : if (k == object->u.cs.start) {
522 0 : return;
523 : }
524 :
525 0 : object->u.cs.start = k;
526 0 : object->u.cs.start_cp_offset = offset;
527 0 : object->u.cs.end = object->u.cs.start;
528 : }
529 :
530 : static zend_bool text_iter_cs_isBoundary(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC)
531 0 : {
532 : UChar32 cp;
533 : int32_t k, tmp;
534 : zend_bool result;
535 :
536 0 : if (offset < 0) {
537 0 : offset = 0;
538 : }
539 :
540 : /*
541 : * On invalid iterator we always want to start looking for the code unit
542 : * offset from the beginning of the string.
543 : */
544 0 : if (object->u.cs.start_cp_offset == UBRK_DONE) {
545 0 : object->u.cs.start_cp_offset = 0;
546 0 : object->u.cs.start = 0;
547 : }
548 :
549 : /*
550 : * Try to locate the code unit position relative to the last known codepoint
551 : * offset.
552 : */
553 0 : k = object->u.cs.start;
554 0 : if (offset > object->u.cs.start_cp_offset) {
555 0 : U16_FWD_N(object->text, k, (int32_t) object->text_len, offset - object->u.cs.start_cp_offset);
556 : } else {
557 0 : U16_BACK_N(object->text, 0, k, object->u.cs.start_cp_offset - offset);
558 : }
559 :
560 : /* end of the text is always a boundary */
561 0 : if (k == object->text_len) {
562 0 : offset = u_countChar32(object->text, object->text_len);
563 0 : result = 1;
564 : } else {
565 : /* if the next codepoint is a base character, it's a boundary */
566 0 : tmp = k;
567 0 : U16_NEXT(object->text, tmp, (int32_t) object->text_len, cp);
568 0 : result = (u_getCombiningClass(cp) == 0);
569 : }
570 :
571 0 : if (k == object->u.cs.start) {
572 0 : return result;
573 : }
574 :
575 0 : object->u.cs.start = k;
576 0 : object->u.cs.start_cp_offset = offset;
577 0 : object->u.cs.end = object->u.cs.start;
578 :
579 0 : return result;
580 : }
581 :
582 : static text_iter_ops text_iter_cs_ops = {
583 : text_iter_cs_valid,
584 : text_iter_cs_current,
585 : text_iter_cs_key,
586 : text_iter_cs_offset,
587 : text_iter_cs_next,
588 : text_iter_cs_rewind,
589 : text_iter_cs_following,
590 : text_iter_cs_isBoundary,
591 : };
592 :
593 :
594 : /* UBreakIterator Ops */
595 :
596 : static int text_iter_brk_valid(text_iter_obj* object, long flags TSRMLS_DC)
597 0 : {
598 0 : if (object->u.brk.bound == UBRK_DONE) {
599 0 : return 0;
600 : }
601 :
602 0 : if (flags & ITER_REVERSE) {
603 0 : return (object->u.brk.bound != 0);
604 : } else {
605 0 : return (object->u.brk.bound != object->text_len);
606 : }
607 : }
608 :
609 : static void text_iter_brk_current(text_iter_obj* object, long flags TSRMLS_DC)
610 0 : {
611 : UChar *start;
612 0 : int32_t length = -1;
613 :
614 0 : if (object->u.brk.bound != UBRK_DONE) {
615 0 : if (flags & ITER_REVERSE) {
616 0 : if (object->u.brk.next == object->u.brk.bound) {
617 0 : object->u.brk.next = ubrk_preceding(object->u.brk.n_iter, object->u.brk.bound);
618 : }
619 0 : start = object->text + object->u.brk.next;
620 : } else {
621 0 : if (object->u.brk.next == object->u.brk.bound) {
622 0 : object->u.brk.next = ubrk_following(object->u.brk.n_iter, object->u.brk.bound);
623 : }
624 0 : start = object->text + object->u.brk.bound;
625 : }
626 :
627 0 : if (object->u.brk.next == UBRK_DONE) {
628 0 : length = 0;
629 : } else {
630 0 : length = abs(object->u.brk.next - object->u.brk.bound);
631 : }
632 : } else {
633 0 : length = 0;
634 : }
635 :
636 0 : if (length != 0) {
637 0 : if (length+1 > (int32_t) object->current_alloc) {
638 0 : object->current_alloc = length+1;
639 0 : Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc);
640 : }
641 0 : u_memcpy(Z_USTRVAL_P(object->current), start, length);
642 : }
643 :
644 0 : Z_USTRVAL_P(object->current)[length] = 0;
645 0 : Z_USTRLEN_P(object->current) = length;
646 0 : }
647 :
648 : static int text_iter_brk_key(text_iter_obj* object, long flags TSRMLS_DC)
649 0 : {
650 0 : return object->u.brk.index;
651 : }
652 :
653 : static int text_iter_brk_offset(text_iter_obj* object, long flags, int32_t *cp_offset TSRMLS_DC)
654 0 : {
655 0 : if (cp_offset) {
656 0 : *cp_offset = object->u.brk.cp_offset;
657 : }
658 0 : return object->u.brk.bound;
659 : }
660 :
661 : static void text_iter_brk_next(text_iter_obj* object, long flags TSRMLS_DC)
662 0 : {
663 0 : int32_t tmp = object->u.brk.bound;
664 :
665 0 : if (object->u.brk.bound == UBRK_DONE) {
666 0 : return;
667 : }
668 :
669 0 : if (flags & ITER_REVERSE) {
670 0 : object->u.brk.bound = ubrk_previous(object->u.brk.iter);
671 0 : object->u.brk.next = object->u.brk.bound;
672 0 : if (object->u.brk.bound != UBRK_DONE) {
673 0 : if (tmp - object->u.brk.bound > 1) {
674 0 : object->u.brk.cp_offset -= u_countChar32(object->text + object->u.brk.bound, tmp - object->u.brk.bound);
675 : } else {
676 0 : object->u.brk.cp_offset--;
677 : }
678 : } else {
679 0 : object->u.brk.cp_offset = UBRK_DONE;
680 : }
681 : } else {
682 0 : object->u.brk.bound = ubrk_next(object->u.brk.iter);
683 0 : object->u.brk.next = object->u.brk.bound;
684 0 : if (object->u.brk.bound != UBRK_DONE) {
685 0 : if (object->u.brk.bound - tmp > 1) {
686 0 : object->u.brk.cp_offset += u_countChar32(object->text + tmp, object->u.brk.bound - tmp);
687 : } else {
688 0 : object->u.brk.cp_offset++;
689 : }
690 : } else {
691 0 : object->u.brk.cp_offset = UBRK_DONE;
692 : }
693 : }
694 0 : object->u.brk.index++;
695 : }
696 :
697 : static void text_iter_brk_rewind(text_iter_obj *object, long flags TSRMLS_DC)
698 0 : {
699 0 : if (flags & ITER_REVERSE) {
700 0 : object->u.brk.bound = ubrk_last(object->u.brk.iter);
701 0 : object->u.brk.next = ubrk_last(object->u.brk.n_iter);
702 0 : object->u.brk.cp_offset = u_countChar32(object->text, object->u.brk.bound);
703 : } else {
704 0 : object->u.brk.bound = ubrk_first(object->u.brk.iter);
705 0 : object->u.brk.next = ubrk_first(object->u.brk.n_iter);
706 0 : object->u.brk.cp_offset = 0;
707 : }
708 0 : object->u.brk.index = 0;
709 0 : }
710 :
711 : static void text_iter_brk_following(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC)
712 0 : {
713 : int32_t k, tmp;
714 :
715 0 : if (offset < 0) {
716 0 : offset = 0;
717 : }
718 :
719 : /*
720 : * On invalid iterator we always want to start looking for the code unit
721 : * offset from the beginning of the string.
722 : */
723 0 : if (object->u.brk.cp_offset == UBRK_DONE) {
724 0 : object->u.brk.cp_offset = 0;
725 0 : object->u.brk.bound = 0;
726 : }
727 :
728 : /*
729 : * Try to locate the code unit position relative to the last known codepoint
730 : * offset.
731 : */
732 0 : k = tmp = object->u.brk.bound;
733 0 : if (offset > object->u.brk.cp_offset) {
734 0 : U16_FWD_N(object->text, k, (int32_t) object->text_len, offset - object->u.brk.cp_offset);
735 : } else {
736 0 : U16_BACK_N(object->text, 0, k, object->u.brk.cp_offset - offset);
737 : }
738 :
739 : /*
740 : * Locate the actual boundary.
741 : */
742 0 : if (flags & ITER_REVERSE) {
743 0 : object->u.brk.bound = ubrk_preceding(object->u.brk.iter, k);
744 : } else {
745 0 : object->u.brk.bound = ubrk_following(object->u.brk.iter, k);
746 : }
747 :
748 : /*
749 : * If boundary is the same one as where we were at before, simply return.
750 : */
751 0 : if (object->u.brk.bound == tmp) {
752 0 : return;
753 : }
754 :
755 0 : object->u.brk.next = object->u.brk.bound;
756 :
757 : /*
758 : * Adjust the internal codepoint offset based on how far we've moved.
759 : */
760 0 : if (object->u.brk.bound != UBRK_DONE) {
761 0 : if (object->u.brk.bound > tmp) {
762 0 : if (object->u.brk.bound - tmp > 1) {
763 0 : object->u.brk.cp_offset += u_countChar32(object->text + tmp, object->u.brk.bound - tmp);
764 : } else {
765 0 : object->u.brk.cp_offset++;
766 : }
767 : } else {
768 0 : if (tmp - object->u.brk.bound > 1) {
769 0 : object->u.brk.cp_offset -= u_countChar32(object->text + object->u.brk.bound, tmp - object->u.brk.bound);
770 : } else {
771 0 : object->u.brk.cp_offset--;
772 : }
773 : }
774 : } else {
775 0 : object->u.brk.cp_offset = UBRK_DONE;
776 : }
777 : }
778 :
779 : static zend_bool text_iter_brk_isBoundary(text_iter_obj *object, int32_t offset, long flags TSRMLS_DC)
780 0 : {
781 : int32_t k, tmp;
782 : UBool result;
783 :
784 0 : if (offset < 0) {
785 0 : offset = 0;
786 : }
787 :
788 : /*
789 : * On invalid iterator we always want to start looking for the code unit
790 : * offset from the beginning of the string.
791 : */
792 0 : if (object->u.brk.cp_offset == UBRK_DONE) {
793 0 : object->u.brk.cp_offset = 0;
794 0 : object->u.brk.bound = 0;
795 : }
796 :
797 : /*
798 : * Try to locate the code unit position relative to the last known codepoint
799 : * offset.
800 : */
801 0 : k = tmp = object->u.brk.bound;
802 0 : if (offset > object->u.brk.cp_offset) {
803 0 : U16_FWD_N(object->text, k, (int32_t) object->text_len, offset - object->u.brk.cp_offset);
804 : } else {
805 0 : U16_BACK_N(object->text, 0, k, object->u.brk.cp_offset - offset);
806 : }
807 :
808 0 : result = ubrk_isBoundary(object->u.brk.iter, k);
809 :
810 0 : object->u.brk.bound = ubrk_current(object->u.brk.iter);
811 0 : object->u.brk.next = object->u.brk.bound;
812 :
813 : /*
814 : * If boundary is the same one as where we were at before, simply return.
815 : */
816 0 : if (object->u.brk.bound == tmp) {
817 0 : return result;
818 : }
819 :
820 : /*
821 : * Adjust the internal codepoint offset based on how far we've moved.
822 : */
823 0 : if (object->u.brk.bound != UBRK_DONE) {
824 0 : if (object->u.brk.bound > tmp) {
825 0 : if (object->u.brk.bound - tmp > 1) {
826 0 : object->u.brk.cp_offset += u_countChar32(object->text + tmp, object->u.brk.bound - tmp);
827 : } else {
828 0 : object->u.brk.cp_offset++;
829 : }
830 : } else {
831 0 : if (tmp - object->u.brk.bound > 1) {
832 0 : object->u.brk.cp_offset -= u_countChar32(object->text + object->u.brk.bound, tmp - object->u.brk.bound);
833 : } else {
834 0 : object->u.brk.cp_offset--;
835 : }
836 : }
837 : } else {
838 0 : object->u.brk.cp_offset = UBRK_DONE;
839 : }
840 :
841 0 : return result;
842 : }
843 :
844 : static text_iter_ops text_iter_brk_ops = {
845 : text_iter_brk_valid,
846 : text_iter_brk_current,
847 : text_iter_brk_key,
848 : text_iter_brk_offset,
849 : text_iter_brk_next,
850 : text_iter_brk_rewind,
851 : text_iter_brk_following,
852 : text_iter_brk_isBoundary,
853 : };
854 :
855 :
856 : /* Ops array */
857 :
858 : static text_iter_ops* iter_ops[] = {
859 : &text_iter_cp_ops,
860 : &text_iter_cs_ops,
861 : &text_iter_brk_ops,
862 : &text_iter_brk_ops,
863 : &text_iter_brk_ops,
864 : &text_iter_brk_ops,
865 : };
866 :
867 : /* Iterator Funcs */
868 :
869 : static void text_iter_dtor(zend_object_iterator* iter TSRMLS_DC)
870 4 : {
871 4 : text_iter_obj* obj = text_iter_to_obj(iter);
872 4 : zval *object = obj->iter.data;
873 :
874 4 : zval_ptr_dtor(&object);
875 4 : }
876 :
877 : static int text_iter_valid(zend_object_iterator* iter TSRMLS_DC)
878 16 : {
879 16 : text_iter_obj* obj = text_iter_to_obj(iter);
880 :
881 16 : if (iter_ops[obj->type]->valid(obj, obj->flags TSRMLS_CC)) {
882 12 : return SUCCESS;
883 : } else {
884 4 : return FAILURE;
885 : }
886 : }
887 :
888 : static void text_iter_get_current_data(zend_object_iterator* iter, zval*** data TSRMLS_DC)
889 12 : {
890 12 : text_iter_obj* obj = text_iter_to_obj(iter);
891 :
892 12 : iter_ops[obj->type]->current(obj, obj->flags TSRMLS_CC);
893 12 : *data = &obj->current;
894 12 : }
895 :
896 : static int text_iter_get_current_key(zend_object_iterator* iter, zstr *str_key, uint *str_key_len, ulong *int_key TSRMLS_DC)
897 6 : {
898 6 : text_iter_obj* obj = text_iter_to_obj(iter);
899 :
900 6 : *int_key = iter_ops[obj->type]->key(obj, obj->flags TSRMLS_CC);
901 6 : return HASH_KEY_IS_LONG;
902 : }
903 :
904 : static void text_iter_move_forward(zend_object_iterator* iter TSRMLS_DC)
905 12 : {
906 12 : text_iter_obj* obj = text_iter_to_obj(iter);
907 :
908 12 : iter_ops[obj->type]->next(obj, obj->flags TSRMLS_CC);
909 12 : }
910 :
911 : static void text_iter_rewind(zend_object_iterator* iter TSRMLS_DC)
912 4 : {
913 4 : text_iter_obj* obj = text_iter_to_obj(iter);
914 :
915 4 : iter_ops[obj->type]->rewind(obj, obj->flags TSRMLS_CC);
916 4 : }
917 :
918 : zend_object_iterator_funcs text_iter_funcs = {
919 : text_iter_dtor,
920 : text_iter_valid,
921 : text_iter_get_current_data,
922 : text_iter_get_current_key,
923 : text_iter_move_forward,
924 : text_iter_rewind,
925 : };
926 :
927 : static zend_object_iterator* text_iter_get_iterator(zend_class_entry *ce, zval *object, int by_ref TSRMLS_DC)
928 4 : {
929 : text_iter_obj* iter_object;
930 :
931 4 : if (by_ref) {
932 0 : zend_error(E_ERROR, "An iterator cannot be used with foreach by reference");
933 : }
934 4 : iter_object = (text_iter_obj *) zend_object_store_get_object(object TSRMLS_CC);
935 :
936 4 : Z_ADDREF_P(object);
937 4 : iter_object->iter.data = (void *) object;
938 4 : iter_object->iter.funcs = &text_iter_funcs;
939 :
940 4 : return (zend_object_iterator *) &iter_object->iter;
941 : }
942 :
943 : static void text_iterator_free_storage(void *object TSRMLS_DC)
944 6 : {
945 6 : text_iter_obj *intern = (text_iter_obj *) object;
946 :
947 6 : zend_hash_destroy(intern->std.properties);
948 6 : FREE_HASHTABLE(intern->std.properties);
949 :
950 6 : if (intern->text) {
951 5 : efree(intern->text);
952 : }
953 6 : if (intern->type > ITER_CHARACTER) {
954 0 : if (intern->u.brk.iter) {
955 0 : ubrk_close(intern->u.brk.iter);
956 : }
957 0 : if (intern->u.brk.n_iter) {
958 0 : ubrk_close(intern->u.brk.n_iter);
959 : }
960 : }
961 6 : zval_ptr_dtor(&intern->current);
962 6 : efree(object);
963 6 : }
964 :
965 : static zend_object_value text_iterator_new(zend_class_entry *class_type TSRMLS_DC)
966 6 : {
967 : zend_object_value retval;
968 : text_iter_obj *intern;
969 : zval *tmp;
970 :
971 6 : intern = emalloc(sizeof(text_iter_obj));
972 6 : memset(intern, 0, sizeof(text_iter_obj));
973 6 : intern->std.ce = class_type;
974 :
975 6 : ALLOC_HASHTABLE(intern->std.properties);
976 6 : zend_hash_init(intern->std.properties, 0, NULL, ZVAL_PTR_DTOR, 0);
977 6 : zend_hash_copy(intern->std.properties, &class_type->default_properties, (copy_ctor_func_t) zval_add_ref, (void *) &tmp, sizeof(zval *));
978 :
979 6 : intern->type = ITER_CODE_POINT;
980 6 : MAKE_STD_ZVAL(intern->current); /* pre-allocate buffer for codepoint */
981 6 : intern->current_alloc = 3;
982 6 : Z_USTRVAL_P(intern->current) = eumalloc(3);
983 6 : Z_USTRVAL_P(intern->current)[0] = 0;
984 6 : Z_USTRLEN_P(intern->current) = 0;
985 6 : Z_TYPE_P(intern->current) = IS_UNICODE;
986 :
987 6 : retval.handle = zend_objects_store_put(intern, (zend_objects_store_dtor_t)zend_objects_destroy_object, (zend_objects_free_object_storage_t) text_iterator_free_storage, NULL TSRMLS_CC);
988 6 : retval.handlers = zend_get_std_object_handlers();
989 :
990 6 : return retval;
991 : }
992 :
993 : /* {{{ proto void TextIterator::__construct(unicode text [, int flags = TextIterator::CODEPOINT [, string locale ]]) U
994 : TextIterator constructor */
995 : PHP_METHOD(TextIterator, __construct)
996 6 : {
997 : UChar *text;
998 : int32_t text_len;
999 6 : zval *object = getThis();
1000 : text_iter_obj *intern;
1001 : text_iter_type ti_type;
1002 6 : char *locale = NULL;
1003 : int locale_len;
1004 6 : long flags = 0;
1005 :
1006 6 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "u|ls", &text, &text_len, &flags, &locale, &locale_len) == FAILURE) {
1007 1 : return;
1008 : }
1009 :
1010 5 : intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1011 :
1012 5 : intern->text = eustrndup(text, text_len);
1013 5 : intern->text_len = text_len;
1014 5 : if (ZEND_NUM_ARGS() > 1) {
1015 0 : ti_type = flags & ITER_TYPE_MASK;
1016 0 : if (ti_type < 0 || ti_type >= ITER_TYPE_LAST) {
1017 0 : php_error(E_WARNING, "Invalid iterator type in TextIterator constructor");
1018 0 : ti_type = ITER_CODE_POINT;
1019 : }
1020 0 : intern->type = ti_type;
1021 0 : intern->flags = flags;
1022 : }
1023 :
1024 5 : if (Z_OBJCE_P(this_ptr) == rev_text_iterator_ce) {
1025 2 : intern->flags |= ITER_REVERSE;
1026 : }
1027 :
1028 5 : if (intern->type >= ITER_CHARACTER && intern->type < ITER_TYPE_LAST) {
1029 0 : UErrorCode status = U_ZERO_ERROR;
1030 0 : UErrorCode status2 = U_ZERO_ERROR;
1031 0 : locale = locale ? locale : UG(default_locale);
1032 0 : intern->u.brk.iter = ubrk_open(brk_type_map[intern->type - ITER_CHARACTER], locale, intern->text, intern->text_len, &status);
1033 0 : intern->u.brk.n_iter = ubrk_open(brk_type_map[intern->type - ITER_CHARACTER], locale, intern->text, intern->text_len, &status);
1034 0 : if (!U_SUCCESS(status) || !U_SUCCESS(status2)) {
1035 0 : php_error(E_RECOVERABLE_ERROR, "Could not create UBreakIterator for '%s' locale: %s", locale, u_errorName(status));
1036 0 : return;
1037 : }
1038 : }
1039 :
1040 5 : iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC);
1041 : }
1042 : /* }}} */
1043 :
1044 : /* {{{ proto unicode TextIterator::current() U
1045 : Returns the element at the current boundary */
1046 : PHP_METHOD(TextIterator, current)
1047 0 : {
1048 0 : zval *object = getThis();
1049 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1050 :
1051 0 : if (zend_parse_parameters_none() == FAILURE) {
1052 0 : return;
1053 : }
1054 :
1055 0 : iter_ops[intern->type]->current(intern, intern->flags TSRMLS_CC);
1056 0 : RETURN_UNICODEL(Z_USTRVAL_P(intern->current), Z_USTRLEN_P(intern->current), 1);
1057 : }
1058 : /* }}} */
1059 :
1060 : /* {{{ proto int TextIterator::next([int n]) U
1061 : Advances to the n'th text boundary following the current one and returns its offset */
1062 : PHP_METHOD(TextIterator, next)
1063 1 : {
1064 1 : long i, step = 1;
1065 : int32_t cp_offset;
1066 1 : zval *object = getThis();
1067 1 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1068 :
1069 1 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|l", &step) == FAILURE) {
1070 0 : return;
1071 : }
1072 :
1073 1 : if (step <= 0) {
1074 0 : step = 1;
1075 : }
1076 :
1077 2 : for (i = 0; i < step; i++) {
1078 1 : iter_ops[intern->type]->next(intern, intern->flags TSRMLS_CC);
1079 : }
1080 :
1081 1 : if (return_value_used) {
1082 1 : iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC);
1083 1 : RETURN_LONG(cp_offset);
1084 : }
1085 : }
1086 : /* }}} */
1087 :
1088 : /* {{{ proto int TextIterator::key() U
1089 : Returns the number boundaries iterated through */
1090 : PHP_METHOD(TextIterator, key)
1091 0 : {
1092 0 : zval *object = getThis();
1093 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1094 :
1095 0 : if (zend_parse_parameters_none() == FAILURE) {
1096 0 : return;
1097 : }
1098 :
1099 0 : RETURN_LONG(iter_ops[intern->type]->key(intern, intern->flags TSRMLS_CC));
1100 : }
1101 : /* }}} */
1102 :
1103 : /* {{{ proto bool TextIterator::valid() U
1104 : Determines validity of the iterator */
1105 : PHP_METHOD(TextIterator, valid)
1106 0 : {
1107 0 : zval *object = getThis();
1108 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1109 :
1110 0 : if (zend_parse_parameters_none() == FAILURE) {
1111 0 : return;
1112 : }
1113 :
1114 0 : RETURN_BOOL(iter_ops[intern->type]->valid(intern, intern->flags TSRMLS_CC));
1115 : }
1116 : /* }}} */
1117 :
1118 : /* {{{ proto int TextIterator::first() U
1119 : Positions iterator at the first character in the text and returns the offset */
1120 : PHP_METHOD(TextIterator, rewind)
1121 0 : {
1122 : int32_t cp_offset;
1123 0 : zval *object = getThis();
1124 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1125 :
1126 0 : if (zend_parse_parameters_none() == FAILURE) {
1127 0 : return;
1128 : }
1129 :
1130 0 : iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC);
1131 :
1132 0 : if (return_value_used) {
1133 0 : iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC);
1134 0 : RETURN_LONG(cp_offset);
1135 : }
1136 : }
1137 : /* }}} */
1138 :
1139 : /* {{{ proto int TextIterator::last() U
1140 : Positions iterator beyond the last character in the text and returns the offset */
1141 : PHP_METHOD(TextIterator, last)
1142 0 : {
1143 : long flags;
1144 : int32_t cp_offset;
1145 0 : zval *object = getThis();
1146 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1147 :
1148 0 : if (zend_parse_parameters_none() == FAILURE) {
1149 0 : return;
1150 : }
1151 :
1152 0 : flags = intern->flags ^ ITER_REVERSE;
1153 0 : iter_ops[intern->type]->rewind(intern, flags TSRMLS_CC);
1154 :
1155 0 : if (return_value_used) {
1156 0 : iter_ops[intern->type]->offset(intern, flags, &cp_offset TSRMLS_CC);
1157 0 : RETURN_LONG(cp_offset);
1158 : }
1159 : }
1160 : /* }}} */
1161 :
1162 : /* {{{ proto int TextIterator::offset() U
1163 : Returns the offset of the current text boundary */
1164 : PHP_METHOD(TextIterator, offset)
1165 0 : {
1166 : int32_t cp_offset;
1167 0 : zval *object = getThis();
1168 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1169 :
1170 0 : if (zend_parse_parameters_none() == FAILURE) {
1171 0 : return;
1172 : }
1173 :
1174 0 : iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC);
1175 0 : RETURN_LONG(cp_offset);
1176 : }
1177 : /* }}} */
1178 :
1179 : /* {{{ proto int TextIterator::previous([int n]) U
1180 : Advances to the n'th text boundary preceding the current one and returns its offset */
1181 : PHP_METHOD(TextIterator, previous)
1182 0 : {
1183 0 : long flags, i, step = 1;
1184 : int32_t cp_offset;
1185 0 : zval *object = getThis();
1186 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1187 :
1188 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|l", &step) == FAILURE) {
1189 0 : return;
1190 : }
1191 :
1192 0 : if (step <= 0) {
1193 0 : step = 1;
1194 : }
1195 0 : flags = intern->flags ^ ITER_REVERSE;
1196 :
1197 0 : for (i = 0; i < step; i++) {
1198 0 : iter_ops[intern->type]->next(intern, flags TSRMLS_CC);
1199 : }
1200 :
1201 0 : if (return_value_used) {
1202 0 : iter_ops[intern->type]->offset(intern, flags, &cp_offset TSRMLS_CC);
1203 0 : RETURN_LONG(cp_offset);
1204 : }
1205 : }
1206 : /* }}} */
1207 :
1208 : /* {{{ proto int TextIterator::following(int offset) U
1209 : Advances to the text boundary following the specified offset and returns its offset */
1210 : PHP_METHOD(TextIterator, following)
1211 0 : {
1212 : long offset;
1213 : int32_t cp_offset;
1214 0 : zval *object = getThis();
1215 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1216 :
1217 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &offset) == FAILURE) {
1218 0 : return;
1219 : }
1220 :
1221 0 : iter_ops[intern->type]->following(intern, offset, intern->flags TSRMLS_CC);
1222 0 : iter_ops[intern->type]->offset(intern, intern->flags, &cp_offset TSRMLS_CC);
1223 0 : RETURN_LONG(cp_offset);
1224 : }
1225 : /* }}} */
1226 :
1227 : /* {{{ proto int TextIterator::preceding(int offset) U
1228 : Advances to the text boundary preceding the specified offset and returns its offset */
1229 : PHP_METHOD(TextIterator, preceding)
1230 0 : {
1231 : long flags, offset;
1232 : int32_t cp_offset;
1233 0 : zval *object = getThis();
1234 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1235 :
1236 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &offset) == FAILURE) {
1237 0 : return;
1238 : }
1239 :
1240 : /*
1241 : * ReverseTextIterator will behave in the same way as the normal one.
1242 : */
1243 0 : flags = intern->flags | ITER_REVERSE;
1244 0 : iter_ops[intern->type]->following(intern, offset, flags TSRMLS_CC);
1245 0 : iter_ops[intern->type]->offset(intern, flags, &cp_offset TSRMLS_CC);
1246 0 : RETURN_LONG(cp_offset);
1247 : }
1248 : /* }}} */
1249 :
1250 : /* {{{ proto bool TextIterator::isBoundary(int offset) U
1251 : Determines whether specified offset is a text boundary */
1252 : PHP_METHOD(TextIterator, isBoundary)
1253 0 : {
1254 : long offset;
1255 0 : zval *object = getThis();
1256 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1257 :
1258 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &offset) == FAILURE) {
1259 0 : return;
1260 : }
1261 :
1262 : /*
1263 : * ReverseTextIterator will behave in the same way as the normal one.
1264 : */
1265 0 : RETURN_BOOL(iter_ops[intern->type]->isBoundary(intern, offset, intern->flags TSRMLS_CC));
1266 : }
1267 : /* }}} */
1268 :
1269 : /* {{{ proto array TextIterator::getAvailableLocales() U
1270 : Returns locales for which text boundary information is available */
1271 : PHP_METHOD(TextIterator, getAvailableLocales)
1272 0 : {
1273 : int32_t count, i;
1274 :
1275 0 : if (zend_parse_parameters_none() == FAILURE) {
1276 0 : return;
1277 : }
1278 :
1279 0 : array_init(return_value);
1280 0 : count = ubrk_countAvailable();
1281 0 : for (i = 0; i < count; i++) {
1282 0 : add_next_index_ascii_string(return_value, (char*)ubrk_getAvailable(i), ZSTR_DUPLICATE);
1283 : }
1284 : }
1285 : /* }}} */
1286 :
1287 : /* {{{ proto int TextIterator::getRuleStatus() U
1288 : Return the status from the break rule that determined the most recent boundary */
1289 : PHP_METHOD(TextIterator, getRuleStatus)
1290 0 : {
1291 0 : zval *object = getThis();
1292 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1293 :
1294 0 : if (zend_parse_parameters_none() == FAILURE) {
1295 0 : return;
1296 : }
1297 :
1298 0 : if (intern->type >= ITER_CHARACTER && intern->type < ITER_TYPE_LAST) {
1299 0 : RETURN_LONG(ubrk_getRuleStatus(intern->u.brk.iter));
1300 : } else {
1301 0 : RETURN_LONG(0);
1302 : }
1303 : }
1304 : /* }}} */
1305 :
1306 : /* {{{ proto array TextIterator::getRuleStatusArray() U
1307 : Return the statuses from the break rules that determined the most recent boundary */
1308 : PHP_METHOD(TextIterator, getRuleStatusArray)
1309 0 : {
1310 0 : int32_t status_vec[32], *vec_ptr = status_vec;
1311 : int32_t vec_size, i;
1312 0 : zval *object = getThis();
1313 0 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1314 0 : UErrorCode status = U_ZERO_ERROR;
1315 :
1316 0 : if (zend_parse_parameters_none() == FAILURE) {
1317 0 : return;
1318 : }
1319 :
1320 0 : array_init(return_value);
1321 0 : if (intern->type < ITER_CHARACTER) {
1322 0 : add_next_index_long(return_value, 0);
1323 : } else {
1324 0 : vec_size = sizeof(status_vec) / sizeof(status_vec[0]);
1325 0 : vec_size = ubrk_getRuleStatusVec(intern->u.brk.iter, vec_ptr, vec_size, &status);
1326 0 : if (status == U_BUFFER_OVERFLOW_ERROR) {
1327 0 : vec_ptr = safe_emalloc(vec_size, sizeof(int32_t), 0);
1328 0 : status = U_ZERO_ERROR;
1329 0 : vec_size = ubrk_getRuleStatusVec(intern->u.brk.iter, vec_ptr, vec_size, &status);
1330 : }
1331 :
1332 0 : for (i = 0; i < vec_size; i++) {
1333 0 : add_next_index_long(return_value, vec_ptr[i]);
1334 : }
1335 :
1336 0 : if (vec_ptr != status_vec) {
1337 0 : efree(vec_ptr);
1338 : }
1339 : }
1340 : }
1341 : /* }}} */
1342 :
1343 : /* {{{ proto array TextIterator::getAll() U
1344 : Return all text pieces determined by the text boundaries */
1345 : PHP_METHOD(TextIterator, getAll)
1346 1 : {
1347 : int32_t start, end;
1348 1 : zval *object = getThis();
1349 1 : text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC);
1350 1 : text_iter_ops *ops = iter_ops[intern->type];
1351 :
1352 1 : if (zend_parse_parameters_none() == FAILURE) {
1353 0 : return;
1354 : }
1355 :
1356 1 : array_init(return_value);
1357 1 : ops->rewind(intern, intern->flags TSRMLS_CC);
1358 1 : start = ops->offset(intern, intern->flags, NULL TSRMLS_CC);
1359 1 : for (ops->next(intern, intern->flags TSRMLS_CC), end = ops->offset(intern, intern->flags, NULL TSRMLS_CC);
1360 6 : end != UBRK_DONE;
1361 4 : start = end, ops->next(intern, intern->flags TSRMLS_CC), end = ops->offset(intern, intern->flags, NULL TSRMLS_CC)) {
1362 4 : if (end > start) {
1363 4 : add_next_index_unicodel(return_value, intern->text + start, end - start, 1);
1364 : } else {
1365 0 : add_next_index_unicodel(return_value, intern->text + end, start - end, 1);
1366 : }
1367 : }
1368 : }
1369 : /* }}} */
1370 :
/* {{{ TextIterator function entry table
   Method table used when registering both TextIterator and
   ReverseTextIterator in php_register_unicode_iterators(). */
static const zend_function_entry text_iterator_funcs[] = {

	PHP_ME(TextIterator, __construct, NULL, ZEND_ACC_PUBLIC)

	/* Iterator interface methods */
	PHP_ME(TextIterator, current, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, next, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, key, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, valid, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, rewind, NULL, ZEND_ACC_PUBLIC)

	/* positioning and boundary queries */
	PHP_ME(TextIterator, offset, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, previous, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, last, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, following, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, preceding, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, isBoundary, NULL, ZEND_ACC_PUBLIC)

	/* bulk extraction of all text pieces */
	PHP_ME(TextIterator, getAll, NULL, ZEND_ACC_PUBLIC)

	/* break rule status queries */
	PHP_ME(TextIterator, getRuleStatus, NULL, ZEND_ACC_PUBLIC)
	PHP_ME(TextIterator, getRuleStatusArray, NULL, ZEND_ACC_PUBLIC)

	/* static: does not operate on an iterator instance */
	PHP_ME(TextIterator, getAvailableLocales, NULL, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC)

	/* first() is an alias of rewind() */
	PHP_MALIAS(TextIterator, first, rewind, NULL, ZEND_ACC_PUBLIC)
	{NULL, NULL, NULL}
};
/* }}} */
1401 :
1402 : void php_register_unicode_iterators(TSRMLS_D)
1403 17007 : {
1404 : zend_class_entry ce;
1405 :
1406 17007 : INIT_CLASS_ENTRY(ce, "TextIterator", text_iterator_funcs);
1407 17007 : text_iterator_ce = zend_register_internal_class(&ce TSRMLS_CC);
1408 17007 : text_iterator_ce->create_object = text_iterator_new;
1409 17007 : text_iterator_ce->get_iterator = text_iter_get_iterator;
1410 17007 : text_iterator_ce->ce_flags |= ZEND_ACC_FINAL_CLASS;
1411 17007 : zend_class_implements(text_iterator_ce TSRMLS_CC, 1, zend_ce_iterator);
1412 :
1413 17007 : INIT_CLASS_ENTRY(ce, "ReverseTextIterator", text_iterator_funcs);
1414 17007 : rev_text_iterator_ce = zend_register_internal_class(&ce TSRMLS_CC);
1415 17007 : rev_text_iterator_ce->create_object = text_iterator_new;
1416 17007 : rev_text_iterator_ce->get_iterator = text_iter_get_iterator;
1417 17007 : rev_text_iterator_ce->ce_flags |= ZEND_ACC_FINAL_CLASS;
1418 17007 : zend_class_implements(rev_text_iterator_ce TSRMLS_CC, 1, zend_ce_iterator);
1419 :
1420 17007 : zend_declare_class_constant_long(text_iterator_ce, "CODE_POINT", sizeof("CODE_POINT")-1, ITER_CODE_POINT TSRMLS_CC);
1421 17007 : zend_declare_class_constant_long(text_iterator_ce, "COMB_SEQUENCE", sizeof("COMB_SEQUENCE")-1, ITER_COMB_SEQUENCE TSRMLS_CC);
1422 17007 : zend_declare_class_constant_long(text_iterator_ce, "CHARACTER", sizeof("CHARACTER")-1, ITER_CHARACTER TSRMLS_CC);
1423 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD", sizeof("WORD")-1, ITER_WORD TSRMLS_CC);
1424 17007 : zend_declare_class_constant_long(text_iterator_ce, "LINE", sizeof("LINE")-1, ITER_LINE TSRMLS_CC);
1425 17007 : zend_declare_class_constant_long(text_iterator_ce, "SENTENCE", sizeof("SENTENCE")-1, ITER_SENTENCE TSRMLS_CC);
1426 :
1427 17007 : zend_declare_class_constant_long(text_iterator_ce, "DONE", sizeof("DONE")-1, UBRK_DONE TSRMLS_CC);
1428 :
1429 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_NONE", sizeof("WORD_NONE")-1, UBRK_WORD_NONE TSRMLS_CC);
1430 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_NONE_LIMIT", sizeof("WORD_NONE_LIMIT")-1, UBRK_WORD_NONE_LIMIT TSRMLS_CC);
1431 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_NUMBER", sizeof("WORD_NUMBER")-1, UBRK_WORD_NUMBER TSRMLS_CC);
1432 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_NUMBER_LIMIT", sizeof("WORD_NUMBER_LIMIT")-1, UBRK_WORD_NUMBER_LIMIT TSRMLS_CC);
1433 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_LETTER", sizeof("WORD_LETTER")-1, UBRK_WORD_LETTER TSRMLS_CC);
1434 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_LETTER_LIMIT", sizeof("WORD_LETTER_LIMIT")-1, UBRK_WORD_LETTER_LIMIT TSRMLS_CC);
1435 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_KANA", sizeof("WORD_KANA")-1, UBRK_WORD_KANA TSRMLS_CC);
1436 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_KANA_LIMIT", sizeof("WORD_KANA_LIMIT")-1, UBRK_WORD_KANA_LIMIT TSRMLS_CC);
1437 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_IDEO", sizeof("WORD_IDEO")-1, UBRK_WORD_IDEO TSRMLS_CC);
1438 17007 : zend_declare_class_constant_long(text_iterator_ce, "WORD_IDEO_LIMIT", sizeof("WORD_IDEO_LIMIT")-1, UBRK_WORD_IDEO_LIMIT TSRMLS_CC);
1439 :
1440 17007 : zend_declare_class_constant_long(text_iterator_ce, "LINE_SOFT", sizeof("LINE_SOFT")-1, UBRK_LINE_SOFT TSRMLS_CC);
1441 17007 : zend_declare_class_constant_long(text_iterator_ce, "LINE_SOFT_LIMIT", sizeof("LINE_SOFT_LIMIT")-1, UBRK_LINE_SOFT_LIMIT TSRMLS_CC);
1442 17007 : zend_declare_class_constant_long(text_iterator_ce, "LINE_HARD", sizeof("LINE_HARD")-1, UBRK_LINE_HARD TSRMLS_CC);
1443 17007 : zend_declare_class_constant_long(text_iterator_ce, "LINE_HARD_LIMIT", sizeof("LINE_HARD_LIMIT")-1, UBRK_LINE_HARD_LIMIT TSRMLS_CC);
1444 :
1445 17007 : zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_TERM", sizeof("SENTENCE_TERM")-1, UBRK_SENTENCE_TERM TSRMLS_CC);
1446 17007 : zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_TERM_LIMIT", sizeof("SENTENCE_TERM_LIMIT")-1, UBRK_SENTENCE_TERM_LIMIT TSRMLS_CC);
1447 17007 : zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_SEP", sizeof("SENTENCE_SEP")-1, UBRK_SENTENCE_SEP TSRMLS_CC);
1448 17007 : zend_declare_class_constant_long(text_iterator_ce, "SENTENCE_SEP_LIMIT", sizeof("SENTENCE_SEP_LIMIT")-1, UBRK_SENTENCE_SEP_LIMIT TSRMLS_CC);
1449 17007 : }
1450 :
1451 : /*
1452 : * Local variables:
1453 : * tab-width: 4
1454 : * c-basic-offset: 4
1455 : * End:
1456 : * vim600: sw=4 ts=4 fdm=marker
1457 : * vim<600: sw=4 ts=4
1458 : */
1459 :
|