PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LTP GCOV extension - code coverage report
Current view: directory - standard - metaphone.c
Test: PHP Code Coverage
Date: 2009-11-23 Instrumented lines: 141
Code covered: 84.4 % Executed lines: 119
Legend: not executed executed

       1                 : /*
       2                 :    +----------------------------------------------------------------------+
       3                 :    | PHP Version 6                                                        |
       4                 :    +----------------------------------------------------------------------+
       5                 :    | Copyright (c) 1997-2009 The PHP Group                                |
       6                 :    +----------------------------------------------------------------------+
       7                 :    | This source file is subject to version 3.01 of the PHP license,      |
       8                 :    | that is bundled with this package in the file LICENSE, and is        |
       9                 :    | available through the world-wide-web at the following url:           |
      10                 :    | http://www.php.net/license/3_01.txt                                  |
      11                 :    | If you did not receive a copy of the PHP license and are unable to   |
      12                 :    | obtain it through the world-wide-web, please send a note to          |
      13                 :    | license@php.net so we can mail you a copy immediately.               |
      14                 :    +----------------------------------------------------------------------+
      15                 :    | Author: Thies C. Arntzen <thies@thieso.net>                          |
      16                 :    +----------------------------------------------------------------------+
      17                 : */
      18                 : 
      19                 : /* $Id: metaphone.c 282958 2009-06-28 18:41:20Z felipe $ */
      20                 : 
      21                 : /*
      22                 :         Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> 
      23                 : */
      24                 : 
      25                 : #include "php.h"
      26                 : #include "php_metaphone.h"
      27                 : 
      28                 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
      29                 : 
      30                 : /* {{{ proto string metaphone(string text[, int phones]) U
      31                 :    Break English phrases down into their phonemes. Returns the phonetic key as a string, or FALSE when the internal metaphone() call reports failure (e.g. a negative phones value). */
      32                 : PHP_FUNCTION(metaphone)
      33              25 : {
      34                 :         char *str;
      35              25 :         char *result = 0;       /* filled in by metaphone() once it has allocated the output buffer */
      36                 :         int str_len;
      37              25 :         long phones = 0;        /* 0 = no limit on the number of phonemes */
      38                 : 
      39              25 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
      40                 :                                                           &phones) == FAILURE) {
      41               1 :                 return; /* argument parsing failed */
      42                 :         }
      43                 : 
      44              24 :         if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {        /* last argument: traditional = 1 */
      45              22 :                 RETVAL_ASCII_STRING(result, ZSTR_AUTOFREE);     /* NOTE(review): ZSTR_AUTOFREE presumably hands ownership of result to the return value -- confirm */
      46                 :         } else {
      47               2 :                 if (result) {   /* the -1 returns visible in metaphone() happen before it allocates, so result is normally still 0 here */
      48               0 :                         efree(result);
      49                 :                 }
      50               2 :                 RETURN_FALSE;
      51                 :         }
      52                 : }
      53                 : /* }}} */
      54                 : 
      55                 : /* 
      56                 :    this is now the original code by Michael G Schwern:
      57                 :    i've changed it just slightly (use emalloc, 
      58                 :    get rid of includes etc) 
      59                 :         - thies - 13.09.1999
      60                 : */
      61                 : 
      62                 : /*-----------------------------  */
      63                 : /* this used to be "metaphone.h" */
      64                 : /*-----------------------------  */
      65                 : 
      66                 : /* Special encodings: single output characters that stand for two-letter sounds */
      67                 : #define  SH     'X'     /* emitted for the 'sh' sound */
      68                 : #define  TH             '0'     /* emitted for the 'th' sound; this is the digit zero, not the letter O */
      69                 : 
      70                 : /*-----------------------------  */
      71                 : /* end of "metaphone.h"          */
      72                 : /*-----------------------------  */
      73                 : 
      74                 : /*----------------------------- */
      75                 : /* this used to be "metachar.h" */
      76                 : /*----------------------------- */
      77                 : 
      78                 : /* Metachar.h ... little bits about characters for metaphone */
      79                 : /*-- Character encoding array & accessing macros: one flag byte per letter, indexed by (uppercase letter - 'A') --*/
      80                 : /* Stolen directly out of the book... Bits: 1 = vowel, 2 = passed through unchanged, 4 = affects a following H, 8 = makes C and G soft, 16 = prevents GH from becoming F */
      81                 : char _codes[26] =
      82                 : {
      83                 :         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
      84                 : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
      85                 : };
      86                 : 
      87                 : 
      88                 : #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)       /* flag byte for c, or 0 when !isalpha(c); c is evaluated twice. NOTE(review): the index assumes isalpha() only accepts A-Z/a-z -- confirm for non-"C" locales */
      89                 : 
      90                 : #define isvowel(c)  (ENCODE(c) & 1)         /* AEIOU */
      91                 : 
      92                 : /* These letters are passed through unchanged */
      93                 : #define NOCHANGE(c) (ENCODE(c) & 2)         /* FJLMNR */
      94                 : 
      95                 : /* These form diphthongs when preceding H */
      96                 : #define AFFECTH(c)  (ENCODE(c) & 4)         /* CGPST */
      97                 : 
      98                 : /* These make C and G soft */
      99                 : #define MAKESOFT(c) (ENCODE(c) & 8)         /* EIY */
     100                 : 
     101                 : /* These prevent GH from becoming F */
     102                 : #define NOGHTOF(c)  (ENCODE(c) & 16)        /* BDH */
     103                 : 
     104                 : /*----------------------------- */
     105                 : /* end of "metachar.h"          */
     106                 : /*----------------------------- */
     107                 : 
     108                 : /* I suppose I could have been using a character pointer instead of
     109                 :  * accessing the array directly... All of these macros expand to code that uses the enclosing function's `word` and `w_idx`. */
     110                 : 
     111                 : /* Look at the next letter in the word (uppercased); reads word[w_idx+1] without a bounds check */
     112                 : #define Next_Letter (toupper(word[w_idx+1]))
     113                 : /* Look at the current letter in the word (uppercased) */
     114                 : #define Curr_Letter (toupper(word[w_idx]))
     115                 : /* Go N letters back; yields '\0' when fewer than n letters precede the current one. n is not parenthesized, so pass a simple value. */
     116                 : #define Look_Back_Letter(n)     (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
     117                 : /* Previous letter, or '\0' when the current letter is the first one (see Look_Back_Letter). */
     118                 : #define Prev_Letter (Look_Back_Letter(1))
     119                 : /* Look two letters down.  It makes sure you don't walk off the string: word[w_idx+2] is read only when Next_Letter is not '\0'. */
     120                 : #define After_Next_Letter       (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
     121                 :                                                                                              : '\0')
     122                 : #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n)))        /* uppercased letter n positions ahead, as returned by Lookahead() */
     123                 : 
     124                 : 
     125                 : /* Allows us to safely look ahead an arbitrary # of letters: returns word[how_far], or the terminating '\0' if the string ends before that position */
     126                 : /* I probably could have just used strlen... */
     127                 : 
     128                 : static char Lookahead(char *word, int how_far) /* {{{ */
     129               0 : {
     130               0 :         char letter_ahead = '\0';       /* null by default */
     131                 :         int idx;
     132               0 :         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);       /* empty body: the loop only advances idx */
     133                 :         /* Edge forward in the string, stopping at the first '\0' or after how_far steps */
     134                 : 
     135               0 :         letter_ahead = word[idx];       /* idx will be either == to how_far or
     136                 :                                                                  * at the end of the string
     137                 :                                                                  */
     138               0 :         return letter_ahead;
     139                 : }
     140                 : /* }}} */
     141                 : 
     142                 : /* phonize one letter: append c to *phoned_word at p_idx and advance p_idx.
     143                 :  * We don't know the buffer's size in advance. One way to solve this is to just
     144                 :  * re-allocate the buffer when it is full. We're using an extra of 2 characters (this
     145                 :  * could be one though; or more too). Expands to a bare { } block that uses the enclosing function's p_idx, max_buffer_len and phoned_word, so it is not safe directly before an `else`. */
     146                 : #define Phonize(c)      { \
     147                 :                                                 if (p_idx >= max_buffer_len) { \
     148                 :                                                         *phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
     149                 :                                                         max_buffer_len += 2; \
     150                 :                                                 } \
     151                 :                                                 (*phoned_word)[p_idx++] = c; \
     152                 :                                         }
     153                 : /* Slap a null character on the end of the phoned word, re-allocating first when p_idx == max_buffer_len; max_buffer_len itself is left unchanged */
     154                 : #define End_Phoned_Word { \
     155                 :                                                         if (p_idx == max_buffer_len) { \
     156                 :                                                                 *phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
     157                 :                                                         } \
     158                 :                                                         (*phoned_word)[p_idx] = '\0'; \
     159                 :                                                 }
     160                 : /* How long is the phoned word? (number of characters written so far) */
     161                 : #define Phone_Len       (p_idx)
     162                 : 
     163                 : /* Note if a letter is a 'break' in the word, i.e. anything isalpha() rejects, including '\0' */
     164                 : #define Isbreak(c)  (!isalpha(c))
     165                 : 
     166                 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional) /* {{{ Encode word into a newly allocated, NUL-terminated *phoned_word. Returns 0 on success, -1 when max_phonemes is negative or word is NULL (nothing is allocated in that case). max_phonemes == 0 means no limit. */
     167              24 : {
     168              24 :         int w_idx = 0;                          /* point in the phonization we're at. */
     169              24 :         int p_idx = 0;                          /* end of the phoned phrase */
     170              24 :         int max_buffer_len = 0;         /* maximum length of the destination buffer */
     171                 : 
     172                 : /*-- Parameter checks --*/
     173                 :         /* Negative phoneme length is meaningless */
     174                 : 
     175              24 :         if (max_phonemes < 0)
     176               2 :                 return -1;
     177                 : 
     178                 :         /* Empty/null string is meaningless */
     179                 :         /* Overly paranoid */
     180                 :         /* assert(word != NULL && word[0] != '\0'); */
     181                 : 
     182              22 :         if (word == NULL)
     183               0 :                 return -1;
     184                 : 
     185                 : /*-- Allocate memory for our phoned_phrase --*/
     186              22 :         if (max_phonemes == 0) {        /* Assume largest possible */
     187              21 :                 max_buffer_len = word_len;
     188              21 :                 *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
     189                 :         } else {
     190               1 :                 max_buffer_len = max_phonemes;  /* NOTE(review): long narrowed to int here -- confirm very large limits are acceptable */
     191               1 :                 *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
     192                 :         }
     193                 : 
     194                 : 
     195                 : /*-- The first phoneme has to be processed specially. --*/
     196                 :         /* Find our first letter */
     197              24 :         for (; !isalpha(Curr_Letter); w_idx++) {
     198                 :                 /* On the off chance we were given nothing but crap... */
     199               4 :                 if (Curr_Letter == '\0') {
     200               2 :                         End_Phoned_Word
     201               2 :                                 return SUCCESS; /* For testing; the result is an empty string */
     202                 :                 }
     203                 :         }
     204                 : 
     205              20 :         switch (Curr_Letter) {
     206                 :                 /* AE becomes E */
     207                 :         case 'A':
     208               2 :                 if (Next_Letter == 'E') {
     209               1 :                         Phonize('E');
     210               1 :                         w_idx += 2;
     211                 :                 }
     212                 :                 /* Remember, preserve vowels at the beginning */
     213                 :                 else {
     214               1 :                         Phonize('A');
     215               1 :                         w_idx++;
     216                 :                 }
     217               2 :                 break;
     218                 :                 /* [GKP]N becomes N; otherwise w_idx is not advanced and the letter is handled by the main loop */
     219                 :         case 'G':
     220                 :         case 'K':
     221                 :         case 'P':
     222               3 :                 if (Next_Letter == 'N') {
     223               3 :                         Phonize('N');
     224               3 :                         w_idx += 2;
     225                 :                 }
     226               3 :                 break;
     227                 :                 /* WH becomes W, 
     228                 :                    WR becomes R 
     229                 :                    W if followed by a vowel */
     230                 :         case 'W':
     231               3 :                 if (Next_Letter == 'R') {
     232               1 :                         Phonize(Next_Letter);
     233               1 :                         w_idx += 2;
     234               2 :                 } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
     235               2 :                         Phonize('W');
     236               2 :                         w_idx += 2;
     237                 :                 }
     238                 :                 /* else nothing is emitted here and w_idx stays put, so the main loop sees the W */
     239               3 :                 break;
     240                 :                 /* X becomes S */
     241                 :         case 'X':
     242               1 :                 Phonize('S');
     243               1 :                 w_idx++;
     244               1 :                 break;
     245                 :                 /* Vowels are kept */
     246                 :                 /* We did A already
     247                 :                    case 'A':
     248                 :                    case 'a':
     249                 :                  */
     250                 :         case 'E':
     251                 :         case 'I':
     252                 :         case 'O':
     253                 :         case 'U':
     254               0 :                 Phonize(Curr_Letter);
     255               0 :                 w_idx++;
     256                 :                 break;
     257                 :         default:
     258                 :                 /* do nothing */
     259                 :                 break;
     260                 :         }
     261                 : 
     262                 : 
     263                 : 
     264                 :         /* On to the metaphoning: stop at the end of the word, or once max_phonemes (when non-zero) has been reached */
     265             554 :         for (; Curr_Letter != '\0' &&
     266                 :                  (max_phonemes == 0 || Phone_Len < max_phonemes);
     267             514 :                  w_idx++) {
     268                 :                 /* How many letters to skip because an earlier encoding handled     
     269                 :                  * multiple letters */
     270             514 :                 unsigned short int skip_letter = 0;
     271                 : 
     272                 : 
     273                 :                 /* THOUGHT:  It would be nice if, rather than having things like...
     274                 :                  * well, SCI.  For SCI you encode the S, then have to remember
     275                 :                  * to skip the C.  So the phoneme SCI invades both S and C.  It would
     276                 :                  * be better, IMHO, to skip the C from the S part of the encoding.
     277                 :                  * Hell, I'm trying it.
     278                 :                  */
     279                 : 
     280                 :                 /* Ignore non-alphas */
     281             514 :                 if (!isalpha(Curr_Letter))
     282              98 :                         continue;
     283                 : 
     284                 :                 /* Drop duplicates, except CC */
     285             416 :                 if (Curr_Letter == Prev_Letter &&
     286                 :                         Curr_Letter != 'C')
     287              12 :                         continue;
     288                 : 
     289             404 :                 switch (Curr_Letter) {
     290                 :                         /* B -> B unless in MB */
     291                 :                 case 'B':
     292               6 :                         if (Prev_Letter != 'M')
     293               6 :                                 Phonize('B');
     294               6 :                         break;
     295                 :                         /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
     296                 :                          * (SCHW is handled in S)
     297                 :                          *  S if -CI-, -CE- or -CY-
     298                 :                          *  dropped if -SCI-, SCE-, -SCY- (handled in S)
     299                 :                          *  else K
     300                 :                          */
     301                 :                 case 'C':
     302              15 :                         if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
     303               2 :                                 if (After_Next_Letter == 'A' &&
     304                 :                                         Next_Letter == 'I') {   /* CIA */
     305               0 :                                         Phonize(SH);
     306                 :                                 }
     307                 :                                 /* SC[IEY] */
     308               2 :                                 else if (Prev_Letter == 'S') {
     309                 :                                         /* Dropped */
     310                 :                                 } else {
     311               2 :                                         Phonize('S');
     312                 :                                 }
     313              11 :                         } else if (Next_Letter == 'H') {
     314               1 :                                 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {       /* Christ, School */
     315               0 :                                         Phonize('K');
     316                 :                                 } else {
     317               1 :                                         Phonize(SH);
     318                 :                                 }
     319               1 :                                 skip_letter++;
     320                 :                         } else {
     321              10 :                                 Phonize('K');
     322                 :                         }
     323              13 :                         break;
     324                 :                         /* J if in -DGE-, -DGI- or -DGY-
     325                 :                          * else T
     326                 :                          */
     327                 :                 case 'D':
     328              24 :                         if (Next_Letter == 'G' &&
     329                 :                                 MAKESOFT(After_Next_Letter)) {
     330               0 :                                 Phonize('J');
     331               0 :                                 skip_letter++;
     332                 :                         } else
     333              24 :                                 Phonize('T');
     334              24 :                         break;
     335                 :                         /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
     336                 :                          * else dropped if -GNED, -GN, 
     337                 :                          * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
     338                 :                          * else J if in -GE-, -GI, -GY and not GG
     339                 :                          * else K
     340                 :                          */
     341                 :                 case 'G':
     342              11 :                         if (Next_Letter == 'H') {
     343               3 :                                 if (!(NOGHTOF(Look_Back_Letter(3)) ||
     344                 :                                           Look_Back_Letter(4) == 'H')) {
     345               3 :                                         Phonize('F');
     346               3 :                                         skip_letter++;
     347                 :                                 } else {
     348                 :                                         /* silent */
     349                 :                                 }
     350               8 :                         } else if (Next_Letter == 'N') {
     351               0 :                                 if (Isbreak(After_Next_Letter) ||
     352                 :                                         (After_Next_Letter == 'E' &&
     353                 :                                          Look_Ahead_Letter(3) == 'D')) {
     354                 :                                         /* dropped */
     355                 :                                 } else
     356               0 :                                         Phonize('K');
     357               8 :                         } else if (MAKESOFT(Next_Letter) &&
     358                 :                                            Prev_Letter != 'G') {
     359               0 :                                 Phonize('J');
     360                 :                         } else {
     361               8 :                                 Phonize('K');
     362                 :                         }
     363              11 :                         break;
     364                 :                         /* H if before a vowel and not after C,G,P,S,T */
     365                 :                 case 'H':
     366               6 :                         if (isvowel(Next_Letter) &&
     367                 :                                 !AFFECTH(Prev_Letter))
     368               4 :                                 Phonize('H');
     369               6 :                         break;
     370                 :                         /* dropped if after C
     371                 :                          * else K
     372                 :                          */
     373                 :                 case 'K':
     374               4 :                         if (Prev_Letter != 'C')
     375               3 :                                 Phonize('K');
     376               4 :                         break;
     377                 :                         /* F if before H (the H is not skipped here; the H rule drops it because P is an AFFECTH letter)
     378                 :                          * else P
     379                 :                          */
     380                 :                 case 'P':
     381               7 :                         if (Next_Letter == 'H') {
     382               2 :                                 Phonize('F');
     383                 :                         } else {
     384               5 :                                 Phonize('P');
     385                 :                         }
     386               7 :                         break;
     387                 :                         /* K
     388                 :                          */
     389                 :                 case 'Q':
     390               0 :                         Phonize('K');
     391               0 :                         break;
     392                 :                         /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- (SCHW only when !traditional)
     393                 :                          * else S
     394                 :                          */
     395                 :                 case 'S':
     396              23 :                         if (Next_Letter == 'I' &&
     397                 :                                 (After_Next_Letter == 'O' ||
     398                 :                                  After_Next_Letter == 'A')) {
     399               0 :                                 Phonize(SH);
     400              23 :                         } else if (Next_Letter == 'H') {
     401               2 :                                 Phonize(SH);
     402               2 :                                 skip_letter++;
     403              21 :                         } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
     404               0 :                                 Phonize(SH);
     405               0 :                                 skip_letter += 2;
     406                 :                         } else {
     407              21 :                                 Phonize('S');
     408                 :                         }
     409              23 :                         break;
     410                 :                         /* 'sh' in -TIA- or -TIO-
     411                 :                          * else 'th' before H
     412                 :                          * else dropped in -TCH-
     413                 :                          * else T
     414                 :                          */
     414                 :                 case 'T':
     415              38 :                         if (Next_Letter == 'I' &&
     416                 :                                 (After_Next_Letter == 'O' ||
     417                 :                                  After_Next_Letter == 'A')) {
     418               0 :                                 Phonize(SH);
     419              38 :                         } else if (Next_Letter == 'H') {
     420              21 :                                 Phonize(TH);
     421              21 :                                 skip_letter++;
     422              17 :                         } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
     423              16 :                                 Phonize('T');
     424                 :                         }
     425              38 :                         break;
     426                 :                         /* F */
     427                 :                 case 'V':
     428               7 :                         Phonize('F');
     429               7 :                         break;
     430                 :                         /* W before a vowel, else dropped */
     431                 :                 case 'W':
     432              16 :                         if (isvowel(Next_Letter))
     433              12 :                                 Phonize('W');
     434              16 :                         break;
     435                 :                         /* KS -- two phonemes in one step; the max_phonemes check in the loop condition only runs between letters, so the result can exceed the limit by one here */
     436                 :                 case 'X':
     437               7 :                         Phonize('K');
     438               7 :                         Phonize('S');
     439               7 :                         break;
     440                 :                         /* Y if followed by a vowel */
     441                 :                 case 'Y':
     442               6 :                         if (isvowel(Next_Letter))
     443               2 :                                 Phonize('Y');
     444               6 :                         break;
     445                 :                         /* S */
     446                 :                 case 'Z':
     447               3 :                         Phonize('S');
     448               3 :                         break;
     449                 :                         /* No transformation */
     450                 :                 case 'F':
     451                 :                 case 'J':
     452                 :                 case 'L':
     453                 :                 case 'M':
     454                 :                 case 'N':
     455                 :                 case 'R':
     456              92 :                         Phonize(Curr_Letter);
     457                 :                         break;
     458                 :                 default:
     459                 :                         /* nothing: the remaining letters (vowels after the first position) produce no output */
     460                 :                         break;
     461                 :                 }                                               /* END SWITCH */
     462                 : 
     463             404 :                 w_idx += skip_letter;
     464                 :         }                                                       /* END FOR */
     465                 : 
     466              20 :         End_Phoned_Word;
     467                 : 
     468              20 :         return 0;
     469                 : }                                                               /* END metaphone */
     470                 : /* }}} */
     471                 : 
     472                 : /*
     473                 :  * Local variables:
     474                 :  * tab-width: 4
     475                 :  * c-basic-offset: 4
     476                 :  * End:
     477                 :  * vim600: sw=4 ts=4 fdm=marker
     478                 :  * vim<600: sw=4 ts=4
     479                 :  */

Generated by: LTP GCOV extension version 1.5

Generated at Mon, 23 Nov 2009 17:39:41 +0000 (35 hours ago)

Copyright © 2005-2009 The PHP Group
All rights reserved.