PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LTP GCOV extension - code coverage report
Current view: directory - standard - metaphone.c
Test: PHP Code Coverage
Date: 2009-11-21 Instrumented lines: 141
Code covered: 84.4 % Executed lines: 119
Legend: not executed executed

       1                 : /*
       2                 :    +----------------------------------------------------------------------+
       3                 :    | PHP Version 5                                                        |
       4                 :    +----------------------------------------------------------------------+
       5                 :    | Copyright (c) 1997-2009 The PHP Group                                |
       6                 :    +----------------------------------------------------------------------+
       7                 :    | This source file is subject to version 3.01 of the PHP license,      |
       8                 :    | that is bundled with this package in the file LICENSE, and is        |
       9                 :    | available through the world-wide-web at the following url:           |
      10                 :    | http://www.php.net/license/3_01.txt                                  |
      11                 :    | If you did not receive a copy of the PHP license and are unable to   |
      12                 :    | obtain it through the world-wide-web, please send a note to          |
      13                 :    | license@php.net so we can mail you a copy immediately.               |
      14                 :    +----------------------------------------------------------------------+
      15                 :    | Author: Thies C. Arntzen <thies@thieso.net>                          |
      16                 :    +----------------------------------------------------------------------+
      17                 : */
      18                 : 
      19                 : /* $Id: metaphone.c 283127 2009-06-30 11:46:20Z felipe $ */
      20                 : 
      21                 : /*
      22                 :         Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> 
      23                 : */
      24                 : 
      25                 : #include "php.h"
      26                 : #include "php_metaphone.h"
      27                 : 
      28                 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
      29                 : 
      30                 : /* {{{ proto string metaphone(string text[, int phones])
      31                 :    Break English phrases down into their phonemes; yields FALSE when the internal metaphone() call does not return 0 */
      32                 : PHP_FUNCTION(metaphone)
      33              25 : {
      34                 :         char *str;
      35              25 :         char *result = 0; /* set by metaphone(); stays NULL when it rejects its arguments before allocating */
      36                 :         int str_len;
      37              25 :         long phones = 0; /* optional; 0 makes metaphone() size the buffer from str_len */
      38                 : 
      39              25 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, /* text is required, phones is optional (see proto) */
      40                 :                                                           &phones) == FAILURE) {
      41               1 :                 return;
      42                 :         }
      43                 : 
      44              24 :         if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) { /* last argument: traditional = 1 */
      45              22 :                 RETVAL_STRING(result, 0); /* NOTE(review): the 0 presumably means "do not duplicate", handing result over to the return value - confirm against the Zend API */
      46                 :         } else {
      47               2 :                 if (result) { /* only non-NULL if metaphone() failed after allocating */
      48               0 :                         efree(result);
      49                 :                 }
      50               2 :                 RETURN_FALSE;
      51                 :         }
      52                 : }
      53                 : /* }}} */
      54                 : 
      55                 : /* 
      56                 :    this is now the original code by Michael G Schwern:
      57                 :    I've changed it just slightly (use emalloc, 
      58                 :    get rid of includes etc) 
      59                 :         - thies - 13.09.1999
      60                 : */
      61                 : 
      62                 : /*-----------------------------  */
      63                 : /* this used to be "metaphone.h" */
      64                 : /*-----------------------------  */
      65                 : 
      66                 : /* Special encodings: single-character codes emitted for two-letter sounds */
      67                 : #define  SH     'X' /* code emitted for the 'sh' sound */
      68                 : #define  TH             '0' /* code emitted for the 'th' sound (the digit zero) */
      69                 : 
      70                 : /*-----------------------------  */
      71                 : /* end of "metaphone.h"          */
      72                 : /*-----------------------------  */
      73                 : 
      74                 : /*----------------------------- */
      75                 : /* this used to be "metachar.h" */
      76                 : /*----------------------------- */
      77                 : 
      78                 : /* Metachar.h ... little bits about characters for metaphone */
      79                 : /*-- Character encoding array & accessing macros: one flag byte per letter, indexed by (uppercase letter - 'A') --*/
      80                 : /* Flag bits as tested by the macros below: 1 = vowel, 2 = passed through unchanged, 4 = affects a following H, 8 = makes C and G soft, 16 = prevents GH from becoming F */
      81                 : char _codes[26] =
      82                 : {
      83                 :         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
      84                 : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
      85                 : };
      86                 : 
      87                 : 
      88                 : #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0) /* flag byte for a letter, 0 for anything isalpha() rejects; NOTE(review): assumes isalpha() only accepts A-Z/a-z, otherwise the index leaves _codes - confirm for non-C locales */
      89                 : 
      90                 : #define isvowel(c)  (ENCODE(c) & 1)         /* AEIOU */
      91                 : 
      92                 : /* These letters are passed through unchanged */
      93                 : #define NOCHANGE(c) (ENCODE(c) & 2)         /* FJLMNR */
      94                 : 
      95                 : /* These form diphthongs when preceding H */
      96                 : #define AFFECTH(c)  (ENCODE(c) & 4)         /* CGPST */
      97                 : 
      98                 : /* These make C and G soft */
      99                 : #define MAKESOFT(c) (ENCODE(c) & 8)         /* EIY */
     100                 : 
     101                 : /* These prevent GH from becoming F */
     102                 : #define NOGHTOF(c)  (ENCODE(c) & 16)        /* BDH */
     103                 : 
     104                 : /*----------------------------- */
     105                 : /* end of "metachar.h"          */
     106                 : /*----------------------------- */
     107                 : 
     108                 : /* I suppose I could have been using a character pointer instead of
     109                 :  * accessing the array directly... These macros expect `word` and `w_idx` to be in scope where they expand. */
     110                 : 
     111                 : /* Look at the next letter in the word (uppercased) */
     112                 : #define Next_Letter (toupper(word[w_idx+1]))
     113                 : /* Look at the current letter in the word (uppercased) */
     114                 : #define Curr_Letter (toupper(word[w_idx]))
     115                 : /* Go N letters back; yields '\0' when fewer than N letters precede the current one. */
     116                 : #define Look_Back_Letter(n)     (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
     117                 : /* Previous letter, or '\0' when the current letter is the first one. */
     118                 : #define Prev_Letter (Look_Back_Letter(1))
     119                 : /* Look two letters down.  Only reads word[w_idx+2] when the next letter is not '\0', so you don't walk off the string. */
     120                 : #define After_Next_Letter       (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
     121                 :                                                                                              : '\0')
     122                 : #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n)))
     123                 : 
     124                 : 
     125                 : /* Allows us to safely look ahead an arbitrary # of letters: returns word[how_far], or '\0' if the string ends before that position */
     126                 : /* I probably could have just used strlen... */
     127                 : static char Lookahead(char *word, int how_far)
     128               0 : {
     129               0 :         char letter_ahead = '\0';       /* null by default */
     130                 :         int idx;
     131               0 :         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     132                 :         /* Edge forward in the string; the loop body is intentionally empty and stops at the terminator */
     133                 : 
     134               0 :         letter_ahead = word[idx];       /* idx will be either == to how_far or
     135                 :                                                                  * at the end of the string
     136                 :                                                                  */
     137               0 :         return letter_ahead;
     138                 : }
     139                 : 
     140                 : 
     141                 : /* phonize one letter: append c to *phoned_word and advance p_idx.
     142                 :  * We don't know the buffer's size in advance. One way to solve this is to just
     143                 :  * re-allocate the buffer when p_idx reaches max_buffer_len. We're using an extra of 2 characters (this
     144                 :  * could be one though; or more too). Expects phoned_word, p_idx and max_buffer_len in scope. */
     145                 : #define Phonize(c)      { \
     146                 :                                                 if (p_idx >= max_buffer_len) { \
     147                 :                                                         *phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
     148                 :                                                         max_buffer_len += 2; \
     149                 :                                                 } \
     150                 :                                                 (*phoned_word)[p_idx++] = c; \
     151                 :                                         }
     152                 : /* Slap a null character on the end of the phoned word, re-allocating first when p_idx == max_buffer_len; p_idx itself is not advanced */
     153                 : #define End_Phoned_Word { \
     154                 :                                                         if (p_idx == max_buffer_len) { \
     155                 :                                                                 *phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
     156                 :                                                         } \
     157                 :                                                         (*phoned_word)[p_idx] = '\0'; \
     158                 :                                                 }
     159                 : /* How long is the phoned word? (number of codes written so far) */
     160                 : #define Phone_Len       (p_idx)
     161                 : 
     162                 : /* Note if a letter is a 'break' in the word, i.e. anything that is not alphabetic */
     163                 : #define Isbreak(c)  (!isalpha(c))
     164                 : 
     165                 : /* {{{ metaphone - encode word into a newly allocated, NUL-terminated *phoned_word; returns -1 (before allocating) for a negative max_phonemes or a NULL word, 0 once encoding finishes; max_phonemes == 0 means no limit
     166                 :  */
     167                 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
     168              24 : {
     169              24 :         int w_idx = 0;                          /* point in the phonization we're at. */
     170              24 :         int p_idx = 0;                          /* end of the phoned phrase */
     171              24 :         int max_buffer_len = 0;         /* maximum length of the destination buffer */
     172                 : 
     173                 : /*-- Parameter checks --*/
     174                 :         /* Negative phoneme length is meaningless */
     175                 : 
     176              24 :         if (max_phonemes < 0)
     177               2 :                 return -1;
     178                 : 
     179                 :         /* Empty/null string is meaningless */
     180                 :         /* Overly paranoid */
     181                 :         /* assert(word != NULL && word[0] != '\0'); */
     182                 : 
     183              22 :         if (word == NULL)
     184               0 :                 return -1;
     185                 : 
     186                 : /*-- Allocate memory for our phoned_phrase --*/
     187              22 :         if (max_phonemes == 0) {        /* Assume largest possible */
     188              21 :                 max_buffer_len = word_len;
     189              21 :                 *phoned_word = safe_emalloc(sizeof(char), word_len, 1); /* NOTE(review): presumably word_len chars plus 1 for the terminator - confirm safe_emalloc argument order */
     190                 :         } else {
     191               1 :                 max_buffer_len = max_phonemes;
     192               1 :                 *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
     193                 :         }
     194                 : 
     195                 : 
     196                 : /*-- The first phoneme has to be processed specially. --*/
     197                 :         /* Find our first letter */
     198              24 :         for (; !isalpha(Curr_Letter); w_idx++) {
     199                 :                 /* On the off chance we were given nothing but crap: terminate the (empty) result and stop */
     200               4 :                 if (Curr_Letter == '\0') {
     201               2 :                         End_Phoned_Word
     202               2 :                                 return SUCCESS; /* For testing; NOTE(review): SUCCESS is presumably 0, since the caller tests == 0 - confirm */
     203                 :                 }
     204                 :         }
     205                 : 
     206              20 :         switch (Curr_Letter) {
     207                 :                 /* AE becomes E */
     208                 :         case 'A':
     209               2 :                 if (Next_Letter == 'E') {
     210               1 :                         Phonize('E');
     211               1 :                         w_idx += 2;
     212                 :                 }
     213                 :                 /* Remember, preserve vowels at the beginning */
     214                 :                 else {
     215               1 :                         Phonize('A');
     216               1 :                         w_idx++;
     217                 :                 }
     218               2 :                 break;
     219                 :                 /* [GKP]N becomes N; otherwise w_idx is not advanced and the letter is left for the main loop */
     220                 :         case 'G':
     221                 :         case 'K':
     222                 :         case 'P':
     223               3 :                 if (Next_Letter == 'N') {
     224               3 :                         Phonize('N');
     225               3 :                         w_idx += 2;
     226                 :                 }
     227               3 :                 break;
     228                 :                 /* WH becomes W, 
     229                 :                    WR becomes R 
     230                 :                    W if followed by a vowel */
     231                 :         case 'W':
     232               3 :                 if (Next_Letter == 'R') {
     233               1 :                         Phonize(Next_Letter);
     234               1 :                         w_idx += 2;
     235               2 :                 } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
     236               2 :                         Phonize('W');
     237               2 :                         w_idx += 2;
     238                 :                 }
     239                 :                 /* else left for the main loop, where a W not followed by a vowel is dropped */
     240               3 :                 break;
     241                 :                 /* X becomes S */
     242                 :         case 'X':
     243               1 :                 Phonize('S');
     244               1 :                 w_idx++;
     245               1 :                 break;
     246                 :                 /* Vowels are kept */
     247                 :                 /* We did A already
     248                 :                    case 'A':
     249                 :                    case 'a':
     250                 :                  */
     251                 :         case 'E':
     252                 :         case 'I':
     253                 :         case 'O':
     254                 :         case 'U':
     255               0 :                 Phonize(Curr_Letter);
     256               0 :                 w_idx++;
     257                 :                 break;
     258                 :         default:
     259                 :                 /* any other first letter is handled by the main loop below */
     260                 :                 break;
     261                 :         }
     262                 : 
     263                 : 
     264                 : 
     265                 :         /* On to the metaphoning: runs to the end of the word or, when max_phonemes is non-zero, until Phone_Len reaches it */
     266             554 :         for (; Curr_Letter != '\0' &&
     267                 :                  (max_phonemes == 0 || Phone_Len < max_phonemes);
     268             514 :                  w_idx++) {
     269                 :                 /* How many letters to skip because an earlier encoding handled     
     270                 :                  * multiple letters */
     271             514 :                 unsigned short int skip_letter = 0;
     272                 : 
     273                 : 
     274                 :                 /* THOUGHT:  It would be nice if, rather than having things like...
     275                 :                  * well, SCI.  For SCI you encode the S, then have to remember
     276                 :                  * to skip the C.  So the phoneme SCI invades both S and C.  It would
     277                 :                  * be better, IMHO, to skip the C from the S part of the encoding.
     278                 :                  * Hell, I'm trying it.
     279                 :                  */
     280                 : 
     281                 :                 /* Ignore non-alphas */
     282             514 :                 if (!isalpha(Curr_Letter))
     283              98 :                         continue;
     284                 : 
     285                 :                 /* Drop duplicates, except CC */
     286             416 :                 if (Curr_Letter == Prev_Letter &&
     287                 :                         Curr_Letter != 'C')
     288              12 :                         continue;
     289                 : 
     290             404 :                 switch (Curr_Letter) {
     291                 :                         /* B -> B unless in MB */
     292                 :                 case 'B':
     293               6 :                         if (Prev_Letter != 'M')
     294               6 :                                 Phonize('B');
     295               6 :                         break;
     296                 :                         /* 'sh' if -CIA- or -CH-; when not traditional, -CH- is K
     297                 :                          * before R or after S (SCHW is handled in S)
     298                 :                          *  S if -CI-, -CE- or -CY-
     299                 :                          *  dropped if -SCI-, -SCE-, -SCY- (checked here via Prev_Letter)
     300                 :                          *  else K
     301                 :                          */
     302                 :                 case 'C':
     303              15 :                         if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
     304               2 :                                 if (After_Next_Letter == 'A' &&
     305                 :                                         Next_Letter == 'I') {   /* CIA */
     306               0 :                                         Phonize(SH);
     307                 :                                 }
     308                 :                                 /* SC[IEY] */
     309               2 :                                 else if (Prev_Letter == 'S') {
     310                 :                                         /* Dropped */
     311                 :                                 } else {
     312               2 :                                         Phonize('S');
     313                 :                                 }
     314              11 :                         } else if (Next_Letter == 'H') {
     315               1 :                                 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {       /* Christ, School */
     316               0 :                                         Phonize('K');
     317                 :                                 } else {
     318               1 :                                         Phonize(SH);
     319                 :                                 }
     320               1 :                                 skip_letter++; /* consume the H */
     321                 :                         } else {
     322              10 :                                 Phonize('K');
     323                 :                         }
     324              13 :                         break;
     325                 :                         /* J if in -DGE-, -DGI- or -DGY-
     326                 :                          * else T
     327                 :                          */
     328                 :                 case 'D':
     329              24 :                         if (Next_Letter == 'G' &&
     330                 :                                 MAKESOFT(After_Next_Letter)) {
     331               0 :                                 Phonize('J');
     332               0 :                                 skip_letter++; /* consume the G */
     333                 :                         } else
     334              24 :                                 Phonize('T');
     335              24 :                         break;
     336                 :                         /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
     337                 :                          * else dropped if -GNED, -GN, 
     338                 :                          * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
     339                 :                          * else J if in -GE-, -GI, -GY and not GG
     340                 :                          * else K
     341                 :                          */
     342                 :                 case 'G':
     343              11 :                         if (Next_Letter == 'H') {
     344               3 :                                 if (!(NOGHTOF(Look_Back_Letter(3)) ||
     345                 :                                           Look_Back_Letter(4) == 'H')) {
     346               3 :                                         Phonize('F');
     347               3 :                                         skip_letter++; /* consume the H */
     348                 :                                 } else {
     349                 :                                         /* silent; the H is not skipped, but the H case drops it because it follows G */
     350                 :                                 }
     351               8 :                         } else if (Next_Letter == 'N') {
     352               0 :                                 if (Isbreak(After_Next_Letter) ||
     353                 :                                         (After_Next_Letter == 'E' &&
     354                 :                                          Look_Ahead_Letter(3) == 'D')) {
     355                 :                                         /* dropped */
     356                 :                                 } else
     357               0 :                                         Phonize('K');
     358               8 :                         } else if (MAKESOFT(Next_Letter) &&
     359                 :                                            Prev_Letter != 'G') {
     360               0 :                                 Phonize('J');
     361                 :                         } else {
     362               8 :                                 Phonize('K');
     363                 :                         }
     364              11 :                         break;
     365                 :                         /* H if before a vowel and not after C,G,P,S,T */
     366                 :                 case 'H':
     367               6 :                         if (isvowel(Next_Letter) &&
     368                 :                                 !AFFECTH(Prev_Letter))
     369               4 :                                 Phonize('H');
     370               6 :                         break;
     371                 :                         /* dropped if after C
     372                 :                          * else K
     373                 :                          */
     374                 :                 case 'K':
     375               4 :                         if (Prev_Letter != 'C')
     376               3 :                                 Phonize('K');
     377               4 :                         break;
     378                 :                         /* F if before H
     379                 :                          * else P
     380                 :                          */
     381                 :                 case 'P':
     382               7 :                         if (Next_Letter == 'H') {
     383               2 :                                 Phonize('F'); /* the H is not skipped here; the H case drops it because it follows P */
     384                 :                         } else {
     385               5 :                                 Phonize('P');
     386                 :                         }
     387               7 :                         break;
     388                 :                         /* K
     389                 :                          */
     390                 :                 case 'Q':
     391               0 :                         Phonize('K');
     392               0 :                         break;
     393                 :                         /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
     394                 :                          * else S
     395                 :                          */
     396                 :                 case 'S':
     397              23 :                         if (Next_Letter == 'I' &&
     398                 :                                 (After_Next_Letter == 'O' ||
     399                 :                                  After_Next_Letter == 'A')) {
     400               0 :                                 Phonize(SH);
     401              23 :                         } else if (Next_Letter == 'H') {
     402               2 :                                 Phonize(SH);
     403               2 :                                 skip_letter++; /* consume the H */
     404              21 :                         } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
     405               0 :                                 Phonize(SH);
     406               0 :                                 skip_letter += 2; /* consume the C and the H */
     407                 :                         } else {
     408              21 :                                 Phonize('S');
     409                 :                         }
     410              23 :                         break;
     411                 :                         /* 'sh' in -TIA- or -TIO-
     412                 :                          * else 'th' before H
     413                 :                          * else dropped in -TCH-, else T
     414                 :                          */
     415                 :                 case 'T':
     416              38 :                         if (Next_Letter == 'I' &&
     417                 :                                 (After_Next_Letter == 'O' ||
     418                 :                                  After_Next_Letter == 'A')) {
     419               0 :                                 Phonize(SH);
     420              38 :                         } else if (Next_Letter == 'H') {
     421              21 :                                 Phonize(TH);
     422              21 :                                 skip_letter++; /* consume the H */
     423              17 :                         } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
     424              16 :                                 Phonize('T');
     425                 :                         }
     426              38 :                         break;
     427                 :                         /* F */
     428                 :                 case 'V':
     429               7 :                         Phonize('F');
     430               7 :                         break;
     431                 :                         /* W before a vowel, else dropped */
     432                 :                 case 'W':
     433              16 :                         if (isvowel(Next_Letter))
     434              12 :                                 Phonize('W');
     435              16 :                         break;
     436                 :                         /* KS */
     437                 :                 case 'X':
     438               7 :                         Phonize('K');
     439               7 :                         Phonize('S'); /* the limit is only checked between letters, so this second code can take Phone_Len one past max_phonemes; Phonize grows the buffer if needed */
     440               7 :                         break;
     441                 :                         /* Y if followed by a vowel */
     442                 :                 case 'Y':
     443               6 :                         if (isvowel(Next_Letter))
     444               2 :                                 Phonize('Y');
     445               6 :                         break;
     446                 :                         /* S */
     447                 :                 case 'Z':
     448               3 :                         Phonize('S');
     449               3 :                         break;
     450                 :                         /* No transformation */
     451                 :                 case 'F':
     452                 :                 case 'J':
     453                 :                 case 'L':
     454                 :                 case 'M':
     455                 :                 case 'N':
     456                 :                 case 'R':
     457              92 :                         Phonize(Curr_Letter);
     458                 :                         break;
     459                 :                 default:
     460                 :                         /* A, E, I, O and U have no case above, so vowels past the first letter produce no code */
     461                 :                         break;
     462                 :                 }                                               /* END SWITCH */
     463                 : 
     464             404 :                 w_idx += skip_letter;
     465                 :         }                                                       /* END FOR */
     466                 : 
     467              20 :         End_Phoned_Word;
     468                 : 
     469              20 :         return 0;
     470                 : }                                                               /* END metaphone */
     471                 : /* }}} */
     472                 : 
     473                 : /*
     474                 :  * Local variables:
     475                 :  * tab-width: 4
     476                 :  * c-basic-offset: 4
     477                 :  * End:
     478                 :  * vim600: sw=4 ts=4 fdm=marker
     479                 :  * vim<600: sw=4 ts=4
     480                 :  */

Generated by: LTP GCOV extension version 1.5

Generated at Sat, 21 Nov 2009 12:27:10 +0000 (3 days ago)

Copyright © 2005-2009 The PHP Group
All rights reserved.