PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LCOV - code coverage report
Current view: top level - ext/intl/grapheme - grapheme_string.c (source / functions) Hit Total Coverage
Test: PHP Code Coverage Lines: 328 372 88.2 %
Date: 2022-01-18 Functions: 14 14 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    +----------------------------------------------------------------------+
       3             :    | PHP Version 7                                                                                                                |
       4             :    +----------------------------------------------------------------------+
       5             :    | This source file is subject to version 3.01 of the PHP license,      |
       6             :    | that is bundled with this package in the file LICENSE, and is                |
       7             :    | available through the world-wide-web at the following url:                   |
       8             :    | http://www.php.net/license/3_01.txt                                                                  |
       9             :    | If you did not receive a copy of the PHP license and are unable to   |
      10             :    | obtain it through the world-wide-web, please send a note to                  |
      11             :    | license@php.net so we can mail you a copy immediately.                               |
      12             :    +----------------------------------------------------------------------+
      13             :    | Author: Ed Batutis <ed@batutis.com>                                                            |
      14             :    +----------------------------------------------------------------------+
      15             :  */
      16             : 
      17             : /* {{{ includes */
      18             : #ifdef HAVE_CONFIG_H
      19             : #include "config.h"
      20             : #endif
      21             : 
      22             : #include <php.h>
      23             : #include "grapheme.h"
      24             : #include "grapheme_util.h"
      25             : 
      26             : #include <unicode/utypes.h>
      27             : #if U_ICU_VERSION_MAJOR_NUM >= 49
      28             : #include <unicode/utf8.h>
      29             : #endif
      30             : #include <unicode/ucol.h>
      31             : #include <unicode/ustring.h>
      32             : #include <unicode/ubrk.h>
      33             : 
      34             : #include "ext/standard/php_string.h"
      35             : 
      36             : /* }}} */
      37             : 
      38             : #define GRAPHEME_EXTRACT_TYPE_COUNT             0
      39             : #define GRAPHEME_EXTRACT_TYPE_MAXBYTES  1
      40             : #define GRAPHEME_EXTRACT_TYPE_MAXCHARS  2
      41             : #define GRAPHEME_EXTRACT_TYPE_MIN       GRAPHEME_EXTRACT_TYPE_COUNT
      42             : #define GRAPHEME_EXTRACT_TYPE_MAX       GRAPHEME_EXTRACT_TYPE_MAXCHARS
      43             : 
      44             : 
      45             : /* {{{ grapheme_register_constants
      46             :  * Register API constants
      47             :  */
      48       26000 : void grapheme_register_constants( INIT_FUNC_ARGS )
      49             : {
      50       26000 :         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
      51       26000 :         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
      52       26000 :         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
      53       26000 : }
      54             : /* }}} */
      55             : 
      56             : /* {{{ proto size_t grapheme_strlen(string str)
      57             :    Get number of graphemes in a string */
      58           8 : PHP_FUNCTION(grapheme_strlen)
      59             : {
      60             :         char* string;
      61             :         size_t string_len;
      62           8 :         UChar* ustring = NULL;
      63           8 :         int ustring_len = 0;
      64             :         zend_long ret_len;
      65             :         UErrorCode status;
      66             : 
      67           8 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
      68           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
      69             :                          "grapheme_strlen: unable to parse input param", 0 );
      70           4 :                 RETURN_FALSE;
      71             :         }
      72             : 
      73           7 :         ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
      74             : 
      75           7 :         if ( ret_len >= 0 )
      76           2 :                 RETURN_LONG(string_len);
      77             : 
      78             :         /* convert the string to UTF-16. */
      79           5 :         status = U_ZERO_ERROR;
      80           5 :         intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
      81             : 
      82           5 :         if ( U_FAILURE( status ) ) {
      83             :                 /* Set global error code. */
      84           0 :                 intl_error_set_code( NULL, status );
      85             : 
      86             :                 /* Set error messages. */
      87           0 :                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
      88           0 :                 if (ustring) {
      89           0 :                         efree( ustring );
      90             :                 }
      91           0 :                 RETURN_NULL();
      92             :         }
      93             : 
      94           5 :         ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
      95             : 
      96           5 :         if (ustring) {
      97           5 :                 efree( ustring );
      98             :         }
      99             : 
     100           5 :         if (ret_len >= 0) {
     101           5 :                 RETVAL_LONG(ret_len);
     102             :         } else {
     103           0 :                 RETVAL_FALSE;
     104             :         }
     105             : }
     106             : /* }}} */
     107             : 
     108             : /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
     109             :    Find position of first occurrence of a string within another */
     110          44 : PHP_FUNCTION(grapheme_strpos)
     111             : {
     112             :         char *haystack, *needle;
     113             :         size_t haystack_len, needle_len;
     114             :         const char *found;
     115          44 :         zend_long loffset = 0;
     116          44 :         int32_t offset = 0;
     117          44 :         size_t noffset = 0;
     118             :         zend_long ret_pos;
     119             : 
     120          44 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
     121           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     122             :                          "grapheme_strpos: unable to parse input param", 0 );
     123           1 :                 RETURN_FALSE;
     124             :         }
     125             : 
     126          43 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     127           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
     128           1 :                 RETURN_FALSE;
     129             :         }
     130             : 
     131             :         /* we checked that it will fit: */
     132          42 :         offset = (int32_t) loffset;
     133          42 :         noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
     134             : 
     135             :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     136             : 
     137          42 :         if (needle_len == 0) {
     138           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
     139           0 :                 RETURN_FALSE;
     140             :         }
     141             : 
     142          42 :         if (offset >= 0) {
     143             :                 /* quick check to see if the string might be there
     144             :                  * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
     145             :                 */
     146          72 :                 found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
     147             : 
     148             :                 /* if it isn't there the we are done */
     149          36 :                 if (!found) {
     150           9 :                         RETURN_FALSE;
     151             :                 }
     152             : 
     153             :                 /* if it is there, and if the haystack is ascii, we are all done */
     154          27 :                 if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
     155          12 :                         RETURN_LONG(found - haystack);
     156             :                 }
     157             :         }
     158             : 
     159             :         /* do utf16 part of the strpos */
     160          21 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
     161             : 
     162          21 :         if ( ret_pos >= 0 ) {
     163          18 :                 RETURN_LONG(ret_pos);
     164             :         } else {
     165           3 :                 RETURN_FALSE;
     166             :         }
     167             : 
     168             : }
     169             : /* }}} */
     170             : 
     171             : /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
     172             :    Find position of first occurrence of a string within another, ignoring case differences */
     173          44 : PHP_FUNCTION(grapheme_stripos)
     174             : {
     175             :         char *haystack, *needle;
     176             :         size_t haystack_len, needle_len;
     177             :         const char *found;
     178          44 :         zend_long loffset = 0;
     179          44 :         int32_t offset = 0;
     180             :         zend_long ret_pos;
     181             :         int is_ascii;
     182             : 
     183          44 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
     184           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     185             :                          "grapheme_stripos: unable to parse input param", 0 );
     186           1 :                 RETURN_FALSE;
     187             :         }
     188             : 
     189          43 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     190           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
     191           1 :                 RETURN_FALSE;
     192             :         }
     193             : 
     194             :         /* we checked that it will fit: */
     195          42 :         offset = (int32_t) loffset;
     196             : 
     197             :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     198             : 
     199          42 :         if (needle_len == 0) {
     200           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
     201           0 :                 RETURN_FALSE;
     202             :         }
     203             : 
     204          42 :         is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
     205             : 
     206          42 :         if ( is_ascii ) {
     207             :                 char *haystack_dup, *needle_dup;
     208          21 :                 int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
     209          21 :                 needle_dup = estrndup(needle, needle_len);
     210          21 :                 php_strtolower(needle_dup, needle_len);
     211          21 :                 haystack_dup = estrndup(haystack, haystack_len);
     212          21 :                 php_strtolower(haystack_dup, haystack_len);
     213             : 
     214          42 :                 found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
     215             : 
     216          21 :                 efree(haystack_dup);
     217          21 :                 efree(needle_dup);
     218             : 
     219          21 :                 if (found) {
     220          14 :                         RETURN_LONG(found - haystack_dup);
     221             :                 }
     222             : 
     223             :                 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
     224           7 :                 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
     225           5 :                         RETURN_FALSE;
     226             :                 }
     227             :         }
     228             : 
     229             :         /* do utf16 part of the strpos */
     230          23 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
     231             : 
     232          23 :         if ( ret_pos >= 0 ) {
     233          17 :                 RETURN_LONG(ret_pos);
     234             :         } else {
     235           6 :                 RETURN_FALSE;
     236             :         }
     237             : 
     238             : }
     239             : /* }}} */
     240             : 
     241             : /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
     242             :    Find position of last occurrence of a string within another */
     243          38 : PHP_FUNCTION(grapheme_strrpos)
     244             : {
     245             :         char *haystack, *needle;
     246             :         size_t haystack_len, needle_len;
     247          38 :         zend_long loffset = 0;
     248          38 :         int32_t offset = 0;
     249             :         zend_long ret_pos;
     250             :         int is_ascii;
     251             : 
     252          38 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
     253           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     254             :                          "grapheme_strrpos: unable to parse input param", 0 );
     255           1 :                 RETURN_FALSE;
     256             :         }
     257             : 
     258          37 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     259           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
     260           0 :                 RETURN_FALSE;
     261             :         }
     262             : 
     263             :         /* we checked that it will fit: */
     264          37 :         offset = (int32_t) loffset;
     265             : 
     266             :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     267             : 
     268          37 :         if (needle_len == 0) {
     269           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
     270           0 :                 RETURN_FALSE;
     271             :         }
     272             : 
     273          37 :         is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
     274             : 
     275          37 :         if ( is_ascii ) {
     276             : 
     277          19 :                 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
     278             : 
     279          19 :                 if ( ret_pos >= 0 ) {
     280          12 :                         RETURN_LONG(ret_pos);
     281             :                 }
     282             : 
     283             :                 /* if the needle was ascii too, we are done */
     284             : 
     285           7 :                 if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
     286           5 :                         RETURN_FALSE;
     287             :                 }
     288             : 
     289             :                 /* else we need to continue via utf16 */
     290             :         }
     291             : 
     292          20 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
     293             : 
     294          20 :         if ( ret_pos >= 0 ) {
     295          14 :                 RETURN_LONG(ret_pos);
     296             :         } else {
     297           6 :                 RETURN_FALSE;
     298             :         }
     299             : 
     300             : 
     301             : }
     302             : /* }}} */
     303             : 
     304             : /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
     305             :    Find position of last occurrence of a string within another, ignoring case */
     306          37 : PHP_FUNCTION(grapheme_strripos)
     307             : {
     308             :         char *haystack, *needle;
     309             :         size_t haystack_len, needle_len;
     310          37 :         zend_long loffset = 0;
     311          37 :         int32_t offset = 0;
     312             :         zend_long ret_pos;
     313             :         int is_ascii;
     314             : 
     315          37 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
     316           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     317             :                          "grapheme_strrpos: unable to parse input param", 0 );
     318           1 :                 RETURN_FALSE;
     319             :         }
     320             : 
     321          36 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     322           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
     323           0 :                 RETURN_FALSE;
     324             :         }
     325             : 
     326             :         /* we checked that it will fit: */
     327          36 :         offset = (int32_t) loffset;
     328             : 
     329             :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     330             : 
     331          36 :         if (needle_len == 0) {
     332           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
     333           0 :                 RETURN_FALSE;
     334             :         }
     335             : 
     336          36 :         is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
     337             : 
     338          36 :         if ( is_ascii ) {
     339             :                 char *needle_dup, *haystack_dup;
     340             : 
     341          19 :                 needle_dup = estrndup(needle, needle_len);
     342          19 :                 php_strtolower(needle_dup, needle_len);
     343          19 :                 haystack_dup = estrndup(haystack, haystack_len);
     344          19 :                 php_strtolower(haystack_dup, haystack_len);
     345             : 
     346          19 :                 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
     347             : 
     348          19 :                 efree(haystack_dup);
     349          19 :                 efree(needle_dup);
     350             : 
     351          19 :                 if ( ret_pos >= 0 ) {
     352          12 :                         RETURN_LONG(ret_pos);
     353             :                 }
     354             : 
     355             :                 /* if the needle was ascii too, we are done */
     356             : 
     357           7 :                 if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
     358           5 :                         RETURN_FALSE;
     359             :                 }
     360             : 
     361             :                 /* else we need to continue via utf16 */
     362             :         }
     363             : 
     364          19 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
     365             : 
     366          19 :         if ( ret_pos >= 0 ) {
     367          13 :                 RETURN_LONG(ret_pos);
     368             :         } else {
     369           6 :                 RETURN_FALSE;
     370             :         }
     371             : 
     372             : 
     373             : }
     374             : /* }}} */
     375             : 
     376             : /* {{{ proto string grapheme_substr(string str, int start [, int length])
     377             :    Returns part of a string */
     378          72 : PHP_FUNCTION(grapheme_substr)
     379             : {
     380             :         char *str;
     381             :         zend_string *u8_sub_str;
     382             :         UChar *ustr;
     383             :         size_t str_len;
     384             :         int32_t ustr_len;
     385          72 :         zend_long lstart = 0, length = 0;
     386          72 :         int32_t start = 0;
     387             :         int iter_val;
     388             :         UErrorCode status;
     389             :         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
     390          72 :         UBreakIterator* bi = NULL;
     391             :         int sub_str_start_pos, sub_str_end_pos;
     392             :         int32_t (*iter_func)(UBreakIterator *);
     393          72 :         zend_bool no_length = 1;
     394             : 
     395          72 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
     396           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     397             :                          "grapheme_substr: unable to parse input param", 0 );
     398          45 :                 RETURN_FALSE;
     399             :         }
     400             : 
     401          71 :         if ( OUTSIDE_STRING(lstart, str_len)) {
     402           5 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
     403           5 :                 RETURN_FALSE;
     404             :         }
     405             : 
     406             :         /* we checked that it will fit: */
     407          66 :         start = (int32_t) lstart;
     408             : 
     409          66 :         if(no_length) {
     410          23 :                 length = str_len;
     411             :         }
     412             : 
     413          66 :         if(length < INT32_MIN) {
     414           0 :                 length = INT32_MIN;
     415          66 :         } else if(length > INT32_MAX) {
     416           0 :                 length = INT32_MAX;
     417             :         }
     418             : 
     419             :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     420             : 
     421          66 :         if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
     422             :                 int32_t asub_str_len;
     423             :                 char *sub_str;
     424           9 :                 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
     425             : 
     426           9 :                 if ( NULL == sub_str ) {
     427           1 :                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
     428           1 :                         RETURN_FALSE;
     429             :                 }
     430             : 
     431          16 :                 RETURN_STRINGL(sub_str, asub_str_len);
     432             :         }
     433             : 
     434          57 :         ustr = NULL;
     435          57 :         ustr_len = 0;
     436          57 :         status = U_ZERO_ERROR;
     437          57 :         intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
     438             : 
     439          57 :         if ( U_FAILURE( status ) ) {
     440             :                 /* Set global error code. */
     441           0 :                 intl_error_set_code( NULL, status );
     442             : 
     443             :                 /* Set error messages. */
     444           0 :                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
     445           0 :                 if (ustr) {
     446           0 :                         efree( ustr );
     447             :                 }
     448           0 :                 RETURN_FALSE;
     449             :         }
     450             : 
     451          57 :         bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
     452             : 
     453          57 :         if( U_FAILURE(status) ) {
     454           0 :                 RETURN_FALSE;
     455             :         }
     456             : 
     457          57 :         ubrk_setText(bi, ustr, ustr_len,        &status);
     458             : 
     459          57 :         if ( start < 0 ) {
     460          28 :                 iter_func = ubrk_previous;
     461          28 :                 ubrk_last(bi);
     462          28 :                 iter_val = 1;
     463             :         }
     464             :         else {
     465          29 :                 iter_func = ubrk_next;
     466          29 :                 iter_val = -1;
     467             :         }
     468             : 
     469          57 :         sub_str_start_pos = 0;
     470             : 
     471         384 :         while ( start ) {
     472         270 :                 sub_str_start_pos = iter_func(bi);
     473             : 
     474         270 :                 if ( UBRK_DONE == sub_str_start_pos ) {
     475           0 :                         break;
     476             :                 }
     477             : 
     478         270 :                 start += iter_val;
     479             :         }
     480             : 
     481          57 :         if ( 0 != start || sub_str_start_pos >= ustr_len ) {
     482             : 
     483           3 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
     484             : 
     485           3 :                 if (ustr) {
     486           3 :                         efree(ustr);
     487             :                 }
     488           3 :                 ubrk_close(bi);
     489           3 :                 RETURN_FALSE;
     490             :         }
     491             : 
     492             :         /* OK to convert here since if str_len were big, convert above would fail */
     493          54 :         if (length >= (int32_t)str_len) {
     494             : 
     495             :                 /* no length supplied or length is too big, return the rest of the string */
     496             : 
     497          19 :                 status = U_ZERO_ERROR;
     498          19 :                 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
     499             : 
     500          19 :                 if (ustr) {
     501          19 :                         efree( ustr );
     502             :                 }
     503          19 :                 ubrk_close( bi );
     504             : 
     505          19 :                 if ( !u8_sub_str ) {
     506             :                         /* Set global error code. */
     507           0 :                         intl_error_set_code( NULL, status );
     508             : 
     509             :                         /* Set error messages. */
     510           0 :                         intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
     511             : 
     512           0 :                         RETURN_FALSE;
     513             :                 }
     514             : 
     515             :                 /* return the allocated string, not a duplicate */
     516          19 :                 RETVAL_NEW_STR(u8_sub_str);
     517          19 :                 return;
     518             :         }
     519             : 
     520          35 :         if(length == 0) {
     521             :                 /* empty length - we've validated start, we can return "" now */
     522           2 :                 if (ustr) {
     523           2 :                         efree(ustr);
     524             :                 }
     525           2 :                 ubrk_close(bi);
     526           2 :                 RETURN_EMPTY_STRING();
     527             :         }
     528             : 
     529             :         /* find the end point of the string to return */
     530             : 
     531          33 :         if ( length < 0 ) {
     532          24 :                 iter_func = ubrk_previous;
     533          24 :                 ubrk_last(bi);
     534          24 :                 iter_val = 1;
     535             :         }
     536             :         else {
     537           9 :                 iter_func = ubrk_next;
     538           9 :                 iter_val = -1;
     539             :         }
     540             : 
     541          33 :         sub_str_end_pos = 0;
     542             : 
     543         220 :         while ( length ) {
     544         158 :                 sub_str_end_pos = iter_func(bi);
     545             : 
     546         158 :                 if ( UBRK_DONE == sub_str_end_pos ) {
     547           4 :                         break;
     548             :                 }
     549             : 
     550         154 :                 length += iter_val;
     551             :         }
     552             : 
     553          33 :         ubrk_close(bi);
     554             : 
     555          33 :         if ( UBRK_DONE == sub_str_end_pos) {
     556           4 :                 if(length < 0) {
     557           3 :                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
     558             : 
     559           3 :                         efree(ustr);
     560           3 :                         RETURN_FALSE;
     561             :                 } else {
     562           1 :                         sub_str_end_pos = ustr_len;
     563             :                 }
     564             :         }
     565             : 
     566          30 :         if(sub_str_start_pos > sub_str_end_pos) {
     567           2 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
     568             : 
     569           2 :                 efree(ustr);
     570           2 :                 RETURN_FALSE;
     571             :         }
     572             : 
     573          28 :         status = U_ZERO_ERROR;
     574          28 :         u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
     575             : 
     576          28 :         efree( ustr );
     577             : 
     578          28 :         if ( !u8_sub_str ) {
     579             :                 /* Set global error code. */
     580           0 :                 intl_error_set_code( NULL, status );
     581             : 
     582             :                 /* Set error messages. */
     583           0 :                 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
     584             : 
     585           0 :                 RETURN_FALSE;
     586             :         }
     587             : 
     588             :          /* return the allocated string, not a duplicate */
     589          28 :         RETVAL_NEW_STR(u8_sub_str);
     590             : }
     591             : /* }}} */
     592             : 
     593             : /* {{{  strstr_common_handler */
     594          73 : static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
     595             : {
     596             :         char *haystack, *needle;
     597             :         const char *found;
     598             :         size_t haystack_len, needle_len;
     599             :         int32_t ret_pos, uchar_pos;
     600          73 :         zend_bool part = 0;
     601             : 
     602          73 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
     603             : 
     604           2 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     605             :                          "grapheme_strstr: unable to parse input param", 0 );
     606             : 
     607           2 :                 RETURN_FALSE;
     608             :         }
     609             : 
     610          71 :         if (needle_len == 0) {
     611             : 
     612           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
     613             : 
     614           0 :                 RETURN_FALSE;
     615             :         }
     616             : 
     617             : 
     618          71 :         if ( !f_ignore_case ) {
     619             : 
     620             :                 /* ASCII optimization: quick check to see if the string might be there
     621             :                  * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
     622             :                 */
     623          70 :                 found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
     624             : 
     625             :                 /* if it isn't there the we are done */
     626          35 :                 if ( !found ) {
     627           3 :                         RETURN_FALSE;
     628             :                 }
     629             : 
     630             :                 /* if it is there, and if the haystack is ascii, we are all done */
     631          32 :                 if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
     632          13 :                         size_t found_offset = found - haystack;
     633             : 
     634          13 :                         if (part) {
     635          10 :                                 RETURN_STRINGL(haystack, found_offset);
     636             :                         } else {
     637          16 :                                 RETURN_STRINGL(found, haystack_len - found_offset);
     638             :                         }
     639             :                 }
     640             : 
     641             :         }
     642             : 
     643             :         /* need to work in utf16 */
     644          55 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
     645             : 
     646          55 :         if ( ret_pos < 0 ) {
     647           9 :                 RETURN_FALSE;
     648             :         }
     649             : 
     650             :         /* uchar_pos is the 'nth' Unicode character position of the needle */
     651             : 
     652          46 :         ret_pos = 0;
     653          46 :         U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
     654             : 
     655          46 :         if (part) {
     656          30 :                 RETURN_STRINGL(haystack, ret_pos);
     657             :         } else {
     658          62 :                 RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
     659             :         }
     660             : 
     661             : }
     662             : /* }}} */
     663             : 
     664             : /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
     665             :    Finds first occurrence of a string within another */
     666          36 : PHP_FUNCTION(grapheme_strstr)
     667             : {
     668          36 :         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
     669          36 : }
     670             : /* }}} */
     671             : 
     672             : /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
     673             :    Finds first occurrence of a string within another */
     674          37 : PHP_FUNCTION(grapheme_stristr)
     675             : {
     676          37 :         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
     677          37 : }
     678             : /* }}} */
     679             : 
     680             : /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
     681             : static inline int32_t
     682          21 : grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
     683             : {
     684          21 :         int pos = 0;
     685          21 :         int ret_pos = 0;
     686             :         int break_pos, prev_break_pos;
     687          21 :         int count = 0;
     688             : 
     689             :         while ( 1 ) {
     690         183 :                 pos = ubrk_next(bi);
     691             : 
     692         102 :                 if ( UBRK_DONE == pos ) {
     693           7 :                         break;
     694             :                 }
     695             : 
     696         311 :                 for ( break_pos = ret_pos; break_pos < pos; ) {
     697         121 :                         count++;
     698         121 :                         prev_break_pos = break_pos;
     699         121 :                         U8_FWD_1(pstr, break_pos, str_len);
     700             : 
     701         121 :                         if ( prev_break_pos == break_pos ) {
     702             :                                 /* something wrong - malformed utf8? */
     703           0 :                                 csize = 0;
     704           0 :                                 break;
     705             :                         }
     706             :                 }
     707             : 
     708             :                 /* if we are beyond our limit, then the loop is done */
     709          95 :                 if ( count > csize ) {
     710          14 :                         break;
     711             :                 }
     712             : 
     713          81 :                 ret_pos = break_pos;
     714             :         }
     715             : 
     716          21 :         return ret_pos;
     717             : }
     718             : /* }}} */
     719             : 
     720             : /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
     721             : static inline int32_t
     722          26 : grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
     723             : {
     724          26 :         int pos = 0;
     725          26 :         int ret_pos = 0;
     726             : 
     727             :         while ( 1 ) {
     728         112 :                 pos = ubrk_next(bi);
     729             : 
     730          69 :                 if ( UBRK_DONE == pos ) {
     731           8 :                         break;
     732             :                 }
     733             : 
     734          61 :                 if ( pos > bsize ) {
     735          18 :                         break;
     736             :                 }
     737             : 
     738          43 :                 ret_pos = pos;
     739             :         }
     740             : 
     741          26 :         return ret_pos;
     742             : }
     743             : /* }}} */
     744             : 
     745             : /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
     746             : static inline int32_t
     747          26 : grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
     748             : {
     749          26 :         int next_pos = 0;
     750          26 :         int ret_pos = 0;
     751             : 
     752         100 :         while ( size ) {
     753          50 :                 next_pos = ubrk_next(bi);
     754             : 
     755          50 :                 if ( UBRK_DONE == next_pos ) {
     756           2 :                         break;
     757             :                 }
     758          48 :                 ret_pos = next_pos;
     759          48 :                 size--;
     760             :         }
     761             : 
     762          26 :         return ret_pos;
     763             : }
     764             : /* }}} */
     765             : 
     766             : /* {{{ grapheme extract iter function pointer array */
     767             : typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
     768             : 
     769             : static grapheme_extract_iter grapheme_extract_iters[] = {
     770             :         &grapheme_extract_count_iter,
     771             :         &grapheme_extract_bytecount_iter,
     772             :         &grapheme_extract_charcount_iter,
     773             : };
     774             : /* }}} */
     775             : 
     776             : /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
     777             :         Function to extract a sequence of default grapheme clusters */
     778         119 : PHP_FUNCTION(grapheme_extract)
     779             : {
     780             :         char *str, *pstr;
     781         119 :         UText ut = UTEXT_INITIALIZER;
     782             :         size_t str_len;
     783             :         zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
     784         119 :         zend_long lstart = 0; /* starting position in str in bytes */
     785         119 :         int32_t start = 0;
     786         119 :         zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
     787             :         UErrorCode status;
     788             :         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
     789         119 :         UBreakIterator* bi = NULL;
     790             :         int ret_pos;
     791         119 :         zval *next = NULL; /* return offset of next part of the string */
     792             : 
     793         119 :         if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
     794           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     795             :                          "grapheme_extract: unable to parse input param", 0 );
     796           1 :                 RETURN_FALSE;
     797             :         }
     798             : 
     799         118 :         if (lstart < 0) {
     800           5 :                 lstart += str_len;
     801             :         }
     802             : 
     803         118 :         if ( NULL != next ) {
     804          40 :                 if ( !Z_ISREF_P(next) ) {
     805           0 :                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     806             :                                  "grapheme_extract: 'next' was not passed by reference", 0 );
     807           0 :                         RETURN_FALSE;
     808             :                 } else {
     809          40 :                         ZVAL_DEREF(next);
     810             :                         /* initialize next */
     811          20 :                         zval_ptr_dtor(next);
     812          20 :             ZVAL_LONG(next, lstart);
     813             :                 }
     814             :         }
     815             : 
     816         118 :         if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
     817           1 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     818             :                          "grapheme_extract: unknown extract type param", 0 );
     819           1 :                 RETURN_FALSE;
     820             :         }
     821             : 
     822         117 :         if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
     823           5 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
     824           5 :                 RETURN_FALSE;
     825             :         }
     826             : 
     827         112 :         if ( size > INT32_MAX || size < 0) {
     828           0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
     829           0 :                 RETURN_FALSE;
     830             :         }
     831         112 :         if (size == 0) {
     832           8 :                 RETURN_EMPTY_STRING();
     833             :         }
     834             : 
     835             :         /* we checked that it will fit: */
     836         104 :         start = (int32_t) lstart;
     837             : 
     838         104 :         pstr = str + start;
     839             : 
     840             :         /* just in case pstr points in the middle of a character, move forward to the start of the next char */
     841         104 :         if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
     842           9 :                 char *str_end = str + str_len;
     843             : 
     844          25 :                 while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
     845           9 :                         pstr++;
     846           9 :                         if ( pstr >= str_end ) {
     847           2 :                                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     848             :                                                                 "grapheme_extract: invalid input string", 0 );
     849             : 
     850           2 :                                 RETURN_FALSE;
     851             :                         }
     852             :                 }
     853             :         }
     854             : 
     855         102 :         str_len -= (pstr - str);
     856             : 
     857             :         /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
     858             :                 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
     859             :          */
     860             : 
     861         102 :         if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
     862          29 :         size_t nsize = MIN(size, str_len);
     863          29 :                 if ( NULL != next ) {
     864          11 :                         ZVAL_LONG(next, start+nsize);
     865             :                 }
     866          58 :                 RETURN_STRINGL(pstr, nsize);
     867             :         }
     868             : 
     869          73 :         status = U_ZERO_ERROR;
     870          73 :         utext_openUTF8(&ut, pstr, str_len, &status);
     871             : 
     872          73 :         if ( U_FAILURE( status ) ) {
     873             :                 /* Set global error code. */
     874           0 :                 intl_error_set_code( NULL, status );
     875             : 
     876             :                 /* Set error messages. */
     877           0 :                 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
     878             : 
     879           0 :                 RETURN_FALSE;
     880             :         }
     881             : 
     882          73 :         bi = NULL;
     883          73 :         status = U_ZERO_ERROR;
     884          73 :         bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
     885             : 
     886          73 :         ubrk_setUText(bi, &ut, &status);
     887             :         /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
     888             :                 can't back up. So, we will not do anything. */
     889             : 
     890             :         /* now we need to find the end of the chunk the user wants us to return */
     891             :         /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
     892          73 :         ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
     893             : 
     894          73 :         utext_close(&ut);
     895          73 :         ubrk_close(bi);
     896             : 
     897          73 :         if ( NULL != next ) {
     898           6 :                 ZVAL_LONG(next, start+ret_pos);
     899             :         }
     900             : 
     901         146 :         RETURN_STRINGL(((char *)pstr), ret_pos);
     902             : }
     903             : 
     904             : /* }}} */
     905             : 
     906             : /*
     907             :  * Local variables:
     908             :  * tab-width: 4
     909             :  * c-basic-offset: 4
     910             :  * End:
     911             :  * vim600: fdm=marker
     912             :  * vim: noet sw=4 ts=4
     913             :  */

Generated by: LCOV version 1.10

Generated at Wed, 19 Jan 2022 00:14:12 +0000 (3 days ago)

Copyright © 2005-2022 The PHP Group
All rights reserved.