PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LTP GCOV extension - code coverage report
Current view: directory - intl/grapheme - grapheme_string.c
Test: PHP Code Coverage
Date: 2009-11-21 Instrumented lines: 348
Code covered: 82.2 % Executed lines: 286
Legend: not executed executed

       1                 : /*
       2                 :    +----------------------------------------------------------------------+
       3                 :    | PHP Version 5                                                                                                                |
       4                 :    +----------------------------------------------------------------------+
       5                 :    | This source file is subject to version 3.01 of the PHP license,      |
       6                 :    | that is bundled with this package in the file LICENSE, and is                |
       7                 :    | available through the world-wide-web at the following url:                   |
       8                 :    | http://www.php.net/license/3_01.txt                                                                  |
       9                 :    | If you did not receive a copy of the PHP license and are unable to   |
      10                 :    | obtain it through the world-wide-web, please send a note to                  |
      11                 :    | license@php.net so we can mail you a copy immediately.                               |
      12                 :    +----------------------------------------------------------------------+
      13                 :    | Author: Ed Batutis <ed@batutis.com>                                                            |
      14                 :    +----------------------------------------------------------------------+
      15                 :  */
      16                 : 
      17                 : /* {{{ includes */
      18                 : #ifdef HAVE_CONFIG_H
      19                 : #include "config.h"
      20                 : #endif
      21                 : 
      22                 : #include <php.h>
      23                 : #include "grapheme.h"
      24                 : #include "grapheme_util.h"
      25                 : 
      26                 : #include <unicode/utypes.h>
      27                 : #include <unicode/ucol.h>
      28                 : #include <unicode/ustring.h>
      29                 : #include <unicode/ubrk.h>
      30                 : 
      31                 : #include "ext/standard/php_string.h"
      32                 : 
      33                 : /* }}} */
      34                 : 
      35                 : #define GRAPHEME_EXTRACT_TYPE_COUNT             0 /* exposed to PHP as GRAPHEME_EXTR_COUNT */
      36                 : #define GRAPHEME_EXTRACT_TYPE_MAXBYTES  1 /* exposed to PHP as GRAPHEME_EXTR_MAXBYTES */
      37                 : #define GRAPHEME_EXTRACT_TYPE_MAXCHARS  2 /* exposed to PHP as GRAPHEME_EXTR_MAXCHARS */
      38                 : #define GRAPHEME_EXTRACT_TYPE_MIN       GRAPHEME_EXTRACT_TYPE_COUNT /* smallest extract-type value */
      39                 : #define GRAPHEME_EXTRACT_TYPE_MAX       GRAPHEME_EXTRACT_TYPE_MAXCHARS /* largest extract-type value */
      40                 : 
      41                 : 
      42                 : /* {{{ grapheme_register_constants
      43                 :  * Register the GRAPHEME_EXTR_* API constants.
                         :  *
                         :  * Registers GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES and
                         :  * GRAPHEME_EXTR_MAXCHARS as long constants. Their values are the
                         :  * GRAPHEME_EXTRACT_TYPE_* macros defined in this file, and each one is
                         :  * registered with the CONST_CS | CONST_PERSISTENT flags.
                         :  * The function returns nothing.
      44                 :  */
      45                 : void grapheme_register_constants( INIT_FUNC_ARGS )
      46           17633 : {
      47           17633 :         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
      48           17633 :         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
      49           17633 :         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
      50           17633 : }
      51                 : /* }}} */
      52                 : 
      53                 : /* {{{ proto int grapheme_strlen(string str)
      54                 :    Get number of graphemes in a string
                         : 
                         :    Flow:
                         :    - If the single string argument cannot be parsed, an
                         :      U_ILLEGAL_ARGUMENT_ERROR is recorded and FALSE is returned.
                         :    - If grapheme_ascii_check() yields a non-negative value, that value is
                         :      returned as the length and no UTF-16 conversion takes place.
                         :    - Otherwise the input is converted to UTF-16 and counted by
                         :      grapheme_split_string(), called with NULL and 0 as its third and
                         :      fourth arguments.
                         :    - A failed UTF-16 conversion records the ICU status and a custom
                         :      message and returns NULL (note: NULL, not FALSE).
                         :    - A negative result from grapheme_split_string() returns FALSE.
                         :  */
      55                 : PHP_FUNCTION(grapheme_strlen)
      56               6 : {
      57                 :         unsigned char* string;
      58                 :         int string_len;
      59               6 :         UChar* ustring = NULL;
      60               6 :         int ustring_len = 0;
      61                 :         int ret_len;
      62                 :         UErrorCode status;
      63                 : 
      64               6 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
      65                 : 
      66               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
      67                 :                          "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
      68                 : 
      69               0 :                 RETURN_FALSE;
      70                 :         }
      71                 : 
      72               6 :         ret_len = grapheme_ascii_check(string, string_len);
      73                 :         
      74               6 :         if ( ret_len >= 0 ) 
      75               2 :                 RETURN_LONG(ret_len);
      76                 : 
      77                 :         /* convert the string to UTF-16; ustring is released with efree() on both the failure and the success path below. */
      78               4 :         status = U_ZERO_ERROR;
      79               4 :         intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
      80                 : 
      81               4 :         if ( U_FAILURE( status ) ) {
      82                 :                 /* Set global error code. */
      83               0 :                 intl_error_set_code( NULL, status TSRMLS_CC );
      84                 : 
      85                 :                 /* Set error messages. */
      86               0 :                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC );
      87               0 :                 efree( ustring );
      88               0 :                 RETURN_NULL();
      89                 :         }
      90                 :         
      91               4 :         ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
      92                 : 
      93               4 :         efree( ustring );
      94                 : 
      95               4 :         if (ret_len >= 0) {
      96               4 :                 RETVAL_LONG(ret_len);
      97                 :         } else {
      98               0 :                 RETVAL_FALSE;
      99                 :         }
     100                 : }
     101                 : /* }}} */
     102                 : 
     103                 : /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
     104                 :    Find position of first occurrence of a string within another
                         : 
                         :    Flow:
                         :    - FALSE is returned (with U_ILLEGAL_ARGUMENT_ERROR recorded) when the
                         :      arguments cannot be parsed, when OUTSIDE_STRING() rejects the
                         :      offset, or when the needle is empty.
                         :    - A byte-level php_memnstr() search runs first; if it finds nothing
                         :      the function returns FALSE without any Unicode work.
                         :    - If grapheme_ascii_check() accepts the haystack, the byte distance
                         :      found - haystack is returned.
                         :    - Otherwise grapheme_strpos_utf16() is called with fIgnoreCase = 0;
                         :      a non-negative result is returned as ret_pos + offset, a negative
                         :      one as FALSE.
                         :  */
     105                 : PHP_FUNCTION(grapheme_strpos)
     106              36 : {
     107                 :         unsigned char *haystack, *needle;
     108                 :         int haystack_len, needle_len;
     109                 :         unsigned char *found;
     110              36 :         long loffset = 0;
     111              36 :         int32_t offset = 0;
     112                 :         int ret_pos, uchar_pos;
     113                 :         
     114              36 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
     115                 :         
     116               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     117                 :                          "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
     118                 :                          
     119               0 :                 RETURN_FALSE;
     120                 :         }
     121                 : 
     122              36 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     123                 :         
     124               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
     125                 :                 
     126               0 :                 RETURN_FALSE;
     127                 :         }
     128                 : 
     129                 :         /* we checked that it will fit: */      
     130              36 :         offset = (int32_t) loffset;
     131                 : 
     132                 :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     133                 : 
     134              36 :         if (needle_len == 0) {
     135                 :         
     136               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
     137                 :                 
     138               0 :                 RETURN_FALSE;
     139                 :         }
     140                 : 
     141                 : 
     142                 :         /* quick check to see if the string might be there
     143                 :          * I realize that 'offset' is 'grapheme count offset' but will work in spite of that 
                         :          *
                         :          * NOTE(review): grapheme_substr in this file still handles negative
                         :          * values after the same OUTSIDE_STRING() check, so a negative offset
                         :          * presumably reaches this line as well; haystack + offset would then
                         :          * point before the start of the buffer. Confirm against the
                         :          * OUTSIDE_STRING definition and guard the quick check if so.
     144                 :         */
     145              36 :         found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
     146                 : 
     147                 :         /* if it isn't there then we are done */
     148              36 :         if (!found) {
     149               9 :                 RETURN_FALSE;
     150                 :         }
     151                 : 
     152                 :         /* if it is there, and if the haystack is ascii, we are all done */
     153              27 :         if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
     154                 : 
     155              12 :                 RETURN_LONG(found - haystack);
     156                 :         }
     157                 : 
     158                 :         /* do utf16 part of the strpos */
     159              15 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC );
     160                 : 
     161              15 :         if ( ret_pos >= 0 ) {
     162              13 :                 RETURN_LONG(ret_pos + offset);
     163                 :         } else {
     164               2 :                 RETURN_FALSE;
     165                 :         }
     166                 : 
     167                 : }
     168                 : /* }}} */
     169                 : 
     170                 : /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
     171                 :    Find position of first occurrence of a string within another, ignoring case differences
                         : 
                         :    Flow:
                         :    - FALSE is returned (with U_ILLEGAL_ARGUMENT_ERROR recorded) when the
                         :      arguments cannot be parsed, when OUTSIDE_STRING() rejects the
                         :      offset, or when the needle is empty.
                         :    - If grapheme_ascii_check() accepts the haystack, lower-cased copies
                         :      of haystack and needle are searched with php_memnstr(). A hit
                         :      returns its byte distance from the start of the haystack copy; a
                         :      miss returns FALSE when the needle is accepted by
                         :      grapheme_ascii_check() too.
                         :    - In every other case grapheme_strpos_utf16() is called on the
                         :      original strings with fIgnoreCase = 1; a non-negative result is
                         :      returned as ret_pos + offset, a negative one as FALSE.
                         : 
                         :    NOTE(review): grapheme_substr in this file still handles negative
                         :    values after the same OUTSIDE_STRING() check, so a negative offset
                         :    presumably reaches the php_memnstr() call; haystack_dup + offset would
                         :    then point before the start of the copy. Confirm against the
                         :    OUTSIDE_STRING definition.
                         :  */
     172                 : PHP_FUNCTION(grapheme_stripos)
     173              36 : {
     174                 :         unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
     175                 :         int haystack_len, needle_len;
     176                 :         unsigned char *found;
     177              36 :         long loffset = 0;
     178              36 :         int32_t offset = 0;
     179                 :         int ret_pos, uchar_pos;
     180                 :         int is_ascii;
     181                 :         
     182              36 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
     183                 :         
     184               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     185                 :                          "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
     186                 :                          
     187               0 :                 RETURN_FALSE;
     188                 :         }
     189                 : 
     190              36 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     191                 :         
     192               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
     193                 :                 
     194               0 :                 RETURN_FALSE;
     195                 :         }
     196                 :         
     197                 :         /* we checked that it will fit: */
     198              36 :         offset = (int32_t) loffset;
     199                 : 
     200                 :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     201                 : 
     202              36 :         if (needle_len == 0) {
     203                 :         
     204               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
     205                 :                 
     206               0 :                 RETURN_FALSE;
     207                 :         }
     208                 : 
     209                 : 
     210              36 :         is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
     211                 : 
     212              36 :         if ( is_ascii ) {
     213              19 :                 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
     214              19 :                 php_strtolower((char *)needle_dup, needle_len);
     215              19 :                 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
     216              19 :                 php_strtolower((char *)haystack_dup, haystack_len);
     217                 : 
     218              19 :                 found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
     219                 : 
                         :                 /* 'found' points into haystack_dup, so take the distance while that
                         :                  * buffer is still alive; neither pointer is used after efree(). */
                         :                 ret_pos = found ? (int)(found - haystack_dup) : -1;
                         : 
     220              19 :                 efree(haystack_dup);
     221              19 :                 efree(needle_dup);
     222                 : 
     223              19 :                 if ( ret_pos >= 0 ) {
     224              12 :                         RETURN_LONG(ret_pos);
     225                 :                 }
     226                 : 
     227                 :                 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
     228               7 :                 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
     229               5 :                         RETURN_FALSE;
     230                 :                 }
     231                 :         }
     232                 : 
     233                 :         /* do utf16 part of the strpos */
     234              19 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC );
     235                 : 
     236              19 :         if ( ret_pos >= 0 ) {
     237              13 :                 RETURN_LONG(ret_pos + offset);
     238                 :         } else {
     239               6 :                 RETURN_FALSE;
     240                 :         }
     241                 : 
     242                 : }
     243                 : /* }}} */
     244                 : 
     245                 : /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
     246                 :    Find position of last occurrence of a string within another
                         : 
                         :    Flow:
                         :    - FALSE is returned (with U_ILLEGAL_ARGUMENT_ERROR recorded) when the
                         :      arguments cannot be parsed, when OUTSIDE_STRING() rejects the
                         :      offset, or when the needle is empty.
                         :    - If grapheme_ascii_check() accepts the haystack,
                         :      grapheme_strrpos_ascii() is tried first. A non-negative result is
                         :      returned; a miss returns FALSE when the needle is accepted by
                         :      grapheme_ascii_check() too.
                         :    - In every other case grapheme_strrpos_utf16() is called with
                         :      f_ignore_case = 0; a non-negative result is returned, a negative
                         :      one becomes FALSE.
                         : 
                         :    The error messages carry this function's own name so that callers
                         :    reading the intl error message can tell which call failed.
                         :  */
     247                 : PHP_FUNCTION(grapheme_strrpos)
     248              36 : {
     249                 :         unsigned char *haystack, *needle;
     250                 :         int haystack_len, needle_len;
     251              36 :         long loffset = 0;
     252              36 :         int32_t offset = 0;
     253                 :         int32_t ret_pos;
     254                 :         int is_ascii;
     255                 :         
     256              36 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
     257                 :         
     258               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     259                 :                          "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
     260                 :                          
     261               0 :                 RETURN_FALSE;
     262                 :         }
     263                 : 
     264              36 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     265                 :         
     266               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strrpos: Offset not contained in string", 1 TSRMLS_CC );
     267                 :                 
     268               0 :                 RETURN_FALSE;
     269                 :         }
     270                 :         
     271                 :         /* we checked that it will fit: */
     272              36 :         offset = (int32_t) loffset;
     273                 : 
     274                 :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     275                 : 
     276              36 :         if (needle_len == 0) {
     277                 :         
     278               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strrpos: Empty delimiter", 1 TSRMLS_CC );
     279                 :                 
     280               0 :                 RETURN_FALSE;
     281                 :         }
     282                 : 
     283              36 :         is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
     284                 : 
     285              36 :         if ( is_ascii ) {
     286                 :         
     287              19 :                 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
     288                 :                 
     289                 : 
     290              19 :                 if ( ret_pos >= 0 ) {
     291              12 :                         RETURN_LONG(ret_pos);
     292                 :                 }
     293                 : 
     294                 :                 /* if the needle was ascii too, we are done */
     295                 : 
     296               7 :                 if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
     297               5 :                         RETURN_FALSE;
     298                 :                 }
     299                 : 
     300                 :                 /* else we need to continue via utf16 */
     301                 :         }
     302                 : 
     303              19 :         ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC);
     304                 : 
     305              19 :         if ( ret_pos >= 0 ) {
     306              13 :                 RETURN_LONG(ret_pos);
     307                 :         } else {
     308               6 :                 RETURN_FALSE;
     309                 :         }
     310                 :         
     311                 : 
     312                 : }
     313                 : /* }}} */
     314                 : 
     315                 : /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
     316                 :    Find position of last occurrence of a string within another, ignoring case
                         : 
                         :    Flow:
                         :    - FALSE is returned (with U_ILLEGAL_ARGUMENT_ERROR recorded) when the
                         :      arguments cannot be parsed, when OUTSIDE_STRING() rejects the
                         :      offset, or when the needle is empty.
                         :    - If grapheme_ascii_check() accepts the haystack, lower-cased copies
                         :      of haystack and needle are passed to grapheme_strrpos_ascii() and
                         :      freed right after. A non-negative result is returned; a miss
                         :      returns FALSE when the needle is accepted by grapheme_ascii_check()
                         :      too.
                         :    - In every other case grapheme_strrpos_utf16() is called on the
                         :      original (not lower-cased) strings with f_ignore_case = 1; a
                         :      non-negative result is returned, a negative one becomes FALSE.
                         : 
                         :    The error messages carry this function's own name so that callers
                         :    reading the intl error message can tell which call failed.
                         :  */
     317                 : PHP_FUNCTION(grapheme_strripos)
     318              36 : {
     319                 :         unsigned char *haystack, *needle;
     320                 :         int haystack_len, needle_len;
     321              36 :         long loffset = 0;
     322              36 :         int32_t offset = 0;
     323                 :         int32_t ret_pos;
     324                 :         int is_ascii;
     325                 :         
     326              36 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
     327                 :         
     328               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     329                 :                          "grapheme_strripos: unable to parse input param", 0 TSRMLS_CC );
     330                 :                          
     331               0 :                 RETURN_FALSE;
     332                 :         }
     333                 : 
     334              36 :         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
     335                 :         
     336               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strripos: Offset not contained in string", 1 TSRMLS_CC );
     337                 :                 
     338               0 :                 RETURN_FALSE;
     339                 :         }
     340                 : 
     341                 :         /* we checked that it will fit: */
     342              36 :         offset = (int32_t) loffset;
     343                 :         
     344                 :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     345                 : 
     346              36 :         if (needle_len == 0) {
     347                 :         
     348               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strripos: Empty delimiter", 1 TSRMLS_CC );
     349                 :                 
     350               0 :                 RETURN_FALSE;
     351                 :         }
     352                 : 
     353              36 :         is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
     354                 : 
     355              36 :         if ( is_ascii ) {
     356                 :                 unsigned char *needle_dup, *haystack_dup;
     357                 : 
     358              19 :                 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
     359              19 :                 php_strtolower((char *)needle_dup, needle_len);
     360              19 :                 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
     361              19 :                 php_strtolower((char *)haystack_dup, haystack_len);
     362                 : 
     363              19 :                 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
     364                 :                 
     365              19 :                 efree(haystack_dup);
     366              19 :                 efree(needle_dup);
     367                 : 
     368              19 :                 if ( ret_pos >= 0 ) {
     369              12 :                         RETURN_LONG(ret_pos);
     370                 :                 }
     371                 : 
     372                 :                 /* if the needle was ascii too, we are done */
     373                 : 
     374               7 :                 if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
     375               5 :                         RETURN_FALSE;
     376                 :                 }
     377                 : 
     378                 :                 /* else we need to continue via utf16 */
     379                 :         }
     380                 : 
     381              19 :         ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC);
     382                 : 
     383              19 :         if ( ret_pos >= 0 ) {
     384              13 :                 RETURN_LONG(ret_pos);
     385                 :         } else {
     386               6 :                 RETURN_FALSE;
     387                 :         }
     388                 :         
     389                 : 
     390                 : }
     391                 : /* }}} */
     392                 : 
     393                 : /* {{{ proto string grapheme_substr(string str, int start [, int length])
     394                 :    Returns part of a string */
     395                 : PHP_FUNCTION(grapheme_substr)
     396              63 : {
     397                 :         unsigned char *str, *sub_str;
     398                 :         UChar *ustr;
     399                 :         int str_len, sub_str_len, ustr_len;
     400              63 :         long lstart = 0, length = 0;
     401              63 :         int32_t start = 0;
     402                 :         int iter_val;
     403                 :         UErrorCode status;
     404                 :         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
     405              63 :         UBreakIterator* bi = NULL;
     406                 :         int sub_str_start_pos, sub_str_end_pos;
     407                 :         int32_t (*iter_func)(UBreakIterator *);
     408                 : 
     409              63 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
     410                 :         
     411               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     412                 :                          "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
     413                 :                          
     414               0 :                 RETURN_FALSE;
     415                 :         }
     416                 : 
     417              63 :         if ( OUTSIDE_STRING(lstart, str_len) ) {
     418                 :         
     419               5 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
     420                 :                 
     421               5 :                 RETURN_FALSE;
     422                 :         }
     423                 : 
     424                 :         /* we checked that it will fit: */
     425              58 :         start = (int32_t) lstart;
     426                 : 
     427                 :         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
     428                 : 
     429              58 :         if ( grapheme_ascii_check(str, str_len) >= 0 ) {
     430               6 :                 grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
     431                 : 
     432               6 :                 if ( NULL == sub_str ) {
     433               0 :                         RETURN_FALSE;
     434                 :                 }
     435                 : 
     436               6 :                 RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
     437                 :         }
     438                 : 
     439              52 :         ustr = NULL;
     440              52 :         ustr_len = 0;
     441              52 :         status = U_ZERO_ERROR;
     442              52 :         intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
     443                 : 
     444              52 :         if ( U_FAILURE( status ) ) {
     445                 :                 /* Set global error code. */
     446               0 :                 intl_error_set_code( NULL, status TSRMLS_CC );
     447                 : 
     448                 :                 /* Set error messages. */
     449               0 :                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC );
     450               0 :                 efree( ustr );
     451               0 :                 RETURN_FALSE;
     452                 :         }
     453                 : 
     454              52 :         bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
     455                 : 
     456              52 :         if( U_FAILURE(status) ) {
     457               0 :                 RETURN_FALSE;
     458                 :         }
     459                 :         
     460              52 :         ubrk_setText(bi, ustr, ustr_len,        &status);
     461                 : 
     462              52 :         if ( start < 0 ) {
     463              27 :                 iter_func = ubrk_previous;
     464              27 :                 ubrk_last(bi);
     465              27 :                 iter_val = 1;
     466                 :         }
     467                 :         else {
     468              25 :                 iter_func = ubrk_next;
     469              25 :                 iter_val = -1;
     470                 :         }
     471                 : 
     472              52 :         sub_str_start_pos = 0;
     473                 : 
     474             346 :         while ( start ) {
     475             242 :                 sub_str_start_pos = iter_func(bi);
     476                 : 
     477             242 :                 if ( UBRK_DONE == sub_str_start_pos ) {
     478               0 :                         break;
     479                 :                 }
     480                 : 
     481             242 :                 start += iter_val;
     482                 :         }
     483                 : 
     484              52 :         if ( 0 != start || sub_str_start_pos >= ustr_len ) {
     485                 :         
     486               3 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
     487                 :         
     488               3 :                 efree(ustr);
     489               3 :                 ubrk_close(bi);
     490               3 :                 RETURN_FALSE;
     491                 :         }
     492                 : 
     493              49 :         if (ZEND_NUM_ARGS() <= 2) {
     494                 : 
     495                 :                 /* no length supplied, return the rest of the string */
     496                 : 
     497              16 :                 sub_str = NULL;
     498              16 :                 sub_str_len = 0;
     499              16 :                 status = U_ZERO_ERROR;
     500              16 :                 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
     501                 : 
     502              16 :                 efree( ustr );
     503              16 :                 ubrk_close( bi );
     504                 : 
     505              16 :                 if ( U_FAILURE( status ) ) {
     506                 :                         /* Set global error code. */
     507               0 :                         intl_error_set_code( NULL, status TSRMLS_CC );
     508                 : 
     509                 :                         /* Set error messages. */
     510               0 :                         intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 1 TSRMLS_CC );
     511                 : 
     512               0 :                         efree( sub_str );
     513                 : 
     514               0 :                         RETURN_FALSE;
     515                 :                 }
     516                 : 
     517                 :                 /* return the allocated string, not a duplicate */
     518              16 :                 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
     519                 :         }
     520                 : 
     521                 :         /* find the end point of the string to return */
     522                 : 
     523              33 :         if ( length < 0 ) {
     524              23 :                 iter_func = ubrk_previous;
     525              23 :                 ubrk_last(bi);
     526              23 :                 iter_val = 1;
     527                 :         }
     528                 :         else {
     529              10 :                 iter_func = ubrk_next;
     530              10 :                 iter_val = -1;
     531                 :         }
     532                 : 
     533              33 :         sub_str_end_pos = 0;
     534                 : 
     535             216 :         while ( length ) {
     536             154 :                 sub_str_end_pos = iter_func(bi);
     537                 : 
     538             154 :                 if ( UBRK_DONE == sub_str_end_pos ) {
     539               4 :                         break;
     540                 :                 }
     541                 : 
     542             150 :                 length += iter_val;
     543                 :         }
     544                 :         
     545              33 :         if ( UBRK_DONE == sub_str_end_pos && length < 0) {
     546                 :         
     547               3 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
     548                 :         
     549               3 :                 efree(ustr);
     550               3 :                 ubrk_close(bi);
     551               3 :                 RETURN_FALSE;
     552                 :         }
     553                 : 
     554              30 :         sub_str = NULL;
     555              30 :         status = U_ZERO_ERROR;
     556              30 :         intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
     557                 : 
     558              30 :         efree( ustr );
     559              30 :         ubrk_close( bi );
     560                 : 
     561              30 :         if ( U_FAILURE( status ) ) {
     562                 :                 /* Set global error code. */
     563               1 :                 intl_error_set_code( NULL, status TSRMLS_CC );
     564                 : 
     565                 :                 /* Set error messages. */
     566               1 :                 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 1 TSRMLS_CC );
     567                 : 
     568               1 :                 if ( NULL != sub_str )
     569               0 :                         efree( sub_str );
     570                 : 
     571               1 :                 RETURN_FALSE;
     572                 :         }
     573                 : 
     574                 :          /* return the allocated string, not a duplicate */
     575              29 :         RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
     576                 : 
     577                 : }
     578                 : /* }}} */
     579                 : 
     580                 : /* {{{  strstr_common_handler */
     581                 : static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
     582              70 : {
     583                 :         unsigned char *haystack, *needle, *found;
     584                 :         int haystack_len, needle_len;
     585                 :         int ret_pos, uchar_pos;
     586              70 :         zend_bool part = 0;
     587                 : 
     588              70 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
     589                 :         
     590               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     591                 :                          "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
     592                 :                          
     593               0 :                 RETURN_FALSE;
     594                 :         }
     595                 : 
     596              70 :         if (needle_len == 0) {
     597                 :         
     598               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
     599                 :                 
     600               0 :                 RETURN_FALSE;
     601                 :         }
     602                 : 
     603                 : 
     604              70 :         if ( !f_ignore_case ) {
     605                 : 
     606                 :                 /* ASCII optimization: quick check to see if the string might be there
     607                 :                  * I realize that 'offset' is 'grapheme count offset' but will work in spite of that 
     608                 :                 */
     609              35 :                 found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
     610                 : 
     611                 :                 /* if it isn't there then we are done */
     612              35 :                 if ( !found ) {
     613               3 :                         RETURN_FALSE;
     614                 :                 }
     615                 : 
     616                 :                 /* if it is there, and if the haystack is ascii, we are all done */
     617              32 :                 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
     618              13 :                         size_t found_offset = found - haystack;
     619                 : 
     620              13 :                         if (part) {
     621               5 :                                 RETURN_STRINGL(((char *)haystack) , found_offset, 1);
     622                 :                         } else {
     623               8 :                                 RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
     624                 :                         }
     625                 :                 }
     626                 : 
     627                 :         }
     628                 : 
     629                 :         /* need to work in utf16 */
     630              54 :         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC );
     631                 : 
     632              54 :         if ( ret_pos < 0 ) {
     633               9 :                 RETURN_FALSE;
     634                 :         }
     635                 : 
     636                 :         /* uchar_pos is the 'nth' Unicode character position of the needle */
     637                 : 
     638              45 :         ret_pos = 0;
     639              45 :         U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
     640                 : 
     641              45 :         if (part) {
     642              15 :                 RETURN_STRINGL(((char *)haystack), ret_pos, 1);
     643                 :         } 
     644                 :         else {
     645              30 :                 RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
     646                 :         }
     647                 : 
     648                 : }
     649                 : /* }}} */
     650                 : 
     651                 : /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
     652                 :    Finds first occurrence of a string within another */
     653                 : PHP_FUNCTION(grapheme_strstr)
     654              35 : {
     655              35 :         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
     656              35 : }
     657                 : /* }}} */
     658                 : 
     659                 : /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
     660                 :    Finds first occurrence of a string within another, ignoring case */
     661                 : PHP_FUNCTION(grapheme_stristr)
     662              35 : {
     663              35 :         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
     664              35 : }
     665                 : /* }}} */
     666                 : 
     667                 : /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
     668                 : inline int32_t
     669                 : grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
     670              18 : {
     671              18 :         int pos = 0, prev_pos = 0;
     672              18 :         int ret_pos = 0, prev_ret_pos = 0;
     673                 : 
     674                 :         while ( 1 ) {
     675              96 :                 pos = ubrk_next(bi);
     676                 : 
     677              96 :                 if ( UBRK_DONE == pos ) {
     678               7 :                         break;
     679                 :                 }
     680                 : 
     681                 :                 /* if we are beyond our limit, then the loop is done */
     682              89 :                 if ( pos > csize ) {
     683              11 :                         break;
     684                 :                 }
     685                 : 
     686                 :                 /* update our pointer in the original UTF-8 buffer by as many characters
     687                 :                    as ubrk_next iterated over */
     688                 : 
     689              78 :                 prev_ret_pos = ret_pos;
     690              78 :                 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
     691                 : 
     692              78 :                 if ( prev_ret_pos == ret_pos ) {
     693                 :                         /* something wrong - malformed utf8? */
     694               0 :                         break;
     695                 :                 }
     696                 : 
     697              78 :                 prev_pos = pos;
     698              78 :         }
     699                 : 
     700              18 :         return ret_pos;
     701                 : }
     702                 : /* }}} */
     703                 : 
     704                 : /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
     705                 : inline int32_t
     706                 : grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
     707              23 : {
     708              23 :         int pos = 0, prev_pos = 0;
     709              23 :         int ret_pos = 0, prev_ret_pos = 0;
     710                 : 
     711                 :         while ( 1 ) {
     712              63 :                 pos = ubrk_next(bi);
     713                 : 
     714              63 :                 if ( UBRK_DONE == pos ) {
     715               8 :                         break;
     716                 :                 }
     717                 : 
     718              55 :                 prev_ret_pos = ret_pos;
     719              55 :                 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
     720                 : 
     721              55 :                 if ( ret_pos > bsize ) {
     722              15 :                         ret_pos = prev_ret_pos;
     723              15 :                         break;
     724                 :                 }
     725                 : 
     726              40 :                 if ( prev_ret_pos == ret_pos ) {
     727                 :                         /* something wrong - malformed utf8? */
     728               0 :                         break;
     729                 :                 }
     730                 : 
     731              40 :                 prev_pos = pos;
     732              40 :         }
     733                 : 
     734              23 :         return ret_pos;
     735                 : }
     736                 : /* }}} */
     737                 : 
     738                 : /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
     739                 : inline int32_t
     740                 : grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
     741              21 : {
     742              21 :         int pos = 0, next_pos = 0;
     743              21 :         int ret_pos = 0;
     744                 : 
     745              80 :         while ( size ) {
     746              40 :                 next_pos = ubrk_next(bi);
     747                 : 
     748              40 :                 if ( UBRK_DONE == next_pos ) {
     749               2 :                         break;
     750                 :                 }
     751              38 :                 pos = next_pos;
     752              38 :                 size--;
     753                 :         }
     754                 : 
     755                 :         /* pos is one past the last UChar - and represents the number of code units to
     756                 :                 advance in the utf-8 buffer
     757                 :         */
     758                 : 
     759              21 :         U8_FWD_N(pstr, ret_pos, str_len, pos);
     760                 : 
     761              21 :         return ret_pos;
     762                 : }
     763                 : /* }}} */
     764                 : 
     765                 : /* {{{ grapheme extract iter function pointer array */
     766                 : typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
     767                 : 
     768                 : static grapheme_extract_iter grapheme_extract_iters[] = {
     769                 :         &grapheme_extract_count_iter,
     770                 :         &grapheme_extract_bytecount_iter,
     771                 :         &grapheme_extract_charcount_iter,
     772                 : };
     773                 : /* }}} */
     774                 : 
     775                 : /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
     776                 :         Function to extract a sequence of default grapheme clusters */
     777                 : PHP_FUNCTION(grapheme_extract)
     778             104 : {
     779                 :         unsigned char *str, *pstr;
     780                 :         UChar *ustr;
     781                 :         int str_len, ustr_len;
     782                 :         long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
     783             104 :         long lstart = 0; /* starting position in str in bytes */
     784             104 :         int32_t start = 0;
     785             104 :         long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
     786                 :         UErrorCode status;
     787                 :         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
     788             104 :         UBreakIterator* bi = NULL;
     789                 :         int ret_pos;
     790             104 :         zval *next = NULL; /* return offset of next part of the string */
     791                 : 
     792             104 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
     793                 :         
     794               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     795                 :                          "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
     796                 :                          
     797               0 :                 RETURN_FALSE;
     798                 :         }
     799                 : 
     800             104 :         if ( NULL != next ) {
     801              15 :                 if ( !PZVAL_IS_REF(next) ) {
     802               0 :                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     803                 :                                  "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
     804                 :                          
     805               0 :                         RETURN_FALSE;
     806                 :                 }
     807                 :                 else {
     808                 :                         /* initialize next */
     809              15 :             ZVAL_LONG(next, start);
     810                 :                 }
     811                 :         }
     812                 : 
     813             104 :         if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
     814                 : 
     815               0 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     816                 :                          "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
     817                 :                          
     818               0 :                 RETURN_FALSE;
     819                 :         }
     820                 : 
     821             104 :         if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
     822                 : 
     823               5 :                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 1 TSRMLS_CC );
     824                 : 
     825               5 :                 RETURN_FALSE;
     826                 :         }
     827                 : 
     828                 :         /* we checked that it will fit: */
     829              99 :         start = (int32_t) lstart;
     830                 : 
     831              99 :         pstr = str + start;
     832                 : 
     833                 :         /* just in case pstr points in the middle of a character, move forward to the start of the next char */
     834              99 :         if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
     835               9 :                 unsigned char *str_end = str + str_len;
     836                 : 
     837              25 :                 while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
     838               9 :                         pstr++;
     839               9 :                         if ( pstr >= str_end ) {
     840               2 :                                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
     841                 :                                                                 "grapheme_extract: invalid input string", 0 TSRMLS_CC );
     842                 :                          
     843               2 :                                 RETURN_FALSE;
     844                 :                         }
     845                 :                 }
     846                 :         }
     847                 : 
     848              97 :         str_len -= (pstr - str);
     849                 : 
     850                 :         /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
     851                 :                 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
     852                 :          */
     853                 :         
     854              97 :         if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
     855              35 :         long nsize = ( size < str_len ? size : str_len ); 
     856              35 :                 if ( NULL != next ) {
     857              11 :                         ZVAL_LONG(next, start+nsize);
     858                 :                 }
     859              35 :                 RETURN_STRINGL(((char *)pstr), nsize, 1);
     860                 :         }
     861                 : 
     862                 :         /* convert the string to UTF-16. */
     863              62 :         ustr = NULL;
     864              62 :         ustr_len = 0;
     865              62 :         status = U_ZERO_ERROR;
     866              62 :         intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
     867                 : 
     868              62 :         if ( U_FAILURE( status ) ) {
     869                 :                 /* Set global error code. */
     870               0 :                 intl_error_set_code( NULL, status TSRMLS_CC );
     871                 : 
     872                 :                 /* Set error messages. */
     873               0 :                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC );
     874                 : 
     875               0 :                 if ( NULL != ustr )
     876               0 :                         efree( ustr );
     877                 : 
     878               0 :                 RETURN_FALSE;
     879                 :         }
     880                 : 
     881              62 :         bi = NULL;
     882              62 :         status = U_ZERO_ERROR;
     883              62 :         bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
     884                 : 
     885              62 :         ubrk_setText(bi, ustr, ustr_len, &status);
     886                 : 
     887                 :         /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
     888                 :                 can't back up. So, we will not do anything. */
     889                 : 
     890                 :         /* now we need to find the end of the chunk the user wants us to return */
     891                 : 
     892              62 :         ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
     893                 : 
     894              62 :         efree(ustr);
     895              62 :         ubrk_close(bi);
     896                 : 
     897              62 :         if ( NULL != next ) {
     898               4 :                 ZVAL_LONG(next, start+ret_pos);
     899                 :         }
     900                 : 
     901              62 :         RETURN_STRINGL(((char *)pstr), ret_pos, 1);
     902                 : }
     903                 : 
     904                 : /* }}} */
     905                 : 
     906                 : /*
     907                 :  * Local variables:
     908                 :  * tab-width: 4
     909                 :  * c-basic-offset: 4
     910                 :  * End:
     911                 :  * vim600: fdm=marker
     912                 :  * vim: noet sw=4 ts=4
     913                 :  */
     914                 : 

Generated by: LTP GCOV extension version 1.5

Generated at Sat, 21 Nov 2009 12:27:01 +0000 (3 days ago)

Copyright © 2005-2009 The PHP Group
All rights reserved.