PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LCOV - code coverage report
Current view: top level - ext/standard - metaphone.c (source / functions) Hit Total Coverage
Test: PHP Code Coverage Lines: 123 154 79.9 %
Date: 2014-12-15 Functions: 2 3 66.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    +----------------------------------------------------------------------+
       3             :    | PHP Version 5                                                        |
       4             :    +----------------------------------------------------------------------+
       5             :    | Copyright (c) 1997-2014 The PHP Group                                |
       6             :    +----------------------------------------------------------------------+
       7             :    | This source file is subject to version 3.01 of the PHP license,      |
       8             :    | that is bundled with this package in the file LICENSE, and is        |
       9             :    | available through the world-wide-web at the following url:           |
      10             :    | http://www.php.net/license/3_01.txt                                  |
      11             :    | If you did not receive a copy of the PHP license and are unable to   |
      12             :    | obtain it through the world-wide-web, please send a note to          |
      13             :    | license@php.net so we can mail you a copy immediately.               |
      14             :    +----------------------------------------------------------------------+
      15             :    | Author: Thies C. Arntzen <thies@thieso.net>                          |
      16             :    +----------------------------------------------------------------------+
      17             : */
      18             : 
      19             : /* $Id$ */
      20             : 
      21             : /*
      22             :         Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      23             : */
      24             : 
      25             : #include "php.h"
      26             : #include "php_metaphone.h"
      27             : 
      28             : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
      29             : 
      30             : /* {{{ proto string metaphone(string text[, int phones])
      31             :    Break English phrases down into their phonemes; yields FALSE when the encoder rejects its arguments (e.g. negative phones) */
      32          25 : PHP_FUNCTION(metaphone)
      33             : {
      34             :         char *str;
      35          25 :         char *result = 0;       /* set by metaphone() once it has allocated the output buffer */
      36             :         int str_len;
      37          25 :         long phones = 0;        /* optional argument; 0 is treated by metaphone() as "no limit" */
      38             : 
      39          25 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
      40             :                                                           &phones) == FAILURE) {
      41           1 :                 return;
      42             :         }
      43             : 
      44          24 :         if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {        /* last argument: traditional = 1 */
      45          22 :                 RETVAL_STRING(result, 0);       /* NOTE(review): 0 presumably means "do not duplicate", i.e. result is handed over as-is - confirm */
      46             :         } else {
      47           2 :                 if (result) {   /* defensive: the -1 returns visible in metaphone() happen before it allocates */
      48           0 :                         efree(result);
      49             :                 }
      50           2 :                 RETURN_FALSE;
      51             :         }
      52             : }
      53             : /* }}} */
      54             : 
      55             : /* 
      56             :    This is the original code by Michael G Schwern.
      57             :    I've changed it only slightly (use emalloc,
      58             :    get rid of includes, etc.)
      59             :         - thies - 13.09.1999
      60             : */
      61             : 
      62             : /*-----------------------------  */
      63             : /* this used to be "metaphone.h" */
      64             : /*-----------------------------  */
      65             : 
      66             : /* Special encodings */
      67             : #define  SH     'X'
      68             : #define  TH             '0'
      69             : 
      70             : /*-----------------------------  */
      71             : /* end of "metaphone.h"          */
      72             : /*-----------------------------  */
      73             : 
      74             : /*----------------------------- */
      75             : /* this used to be "metachar.h" */
      76             : /*----------------------------- */
      77             : 
      78             : /* Metachar.h ... little bits about characters for metaphone */
      79             : /*-- Character encoding array & accessing macros --*/
      80             : /* Stolen directly out of the book... */
      81             : char _codes[26] =       /* one flag byte per letter a..z; bits: 1 vowel, 2 passed through unchanged, 4 affects a following H, 8 softens C/G, 16 blocks GH -> F */
      82             : {
      83             :         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
      84             : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
      85             : };
      86             : 
      87             : 
      88             : #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)       /* flag byte for a letter (case-insensitive); 0 for non-letters */
      89             : 
      90             : #define isvowel(c)  (ENCODE(c) & 1)         /* AEIOU */
      91             : 
      92             : /* These letters are passed through unchanged */
      93             : #define NOCHANGE(c) (ENCODE(c) & 2)         /* FJLMNR */
      94             : 
      95             : /* These form diphthongs when preceding H */
      96             : #define AFFECTH(c)  (ENCODE(c) & 4)         /* CGPST */
      97             : 
      98             : /* These make C and G soft */
      99             : #define MAKESOFT(c) (ENCODE(c) & 8)         /* EIY */
     100             : 
     101             : /* These prevent GH from becoming F */
     102             : #define NOGHTOF(c)  (ENCODE(c) & 16)        /* BDH */
     103             : 
     104             : /*----------------------------- */
     105             : /* end of "metachar.h"          */
     106             : /*----------------------------- */
     107             : 
     108             : /* I suppose I could have been using a character pointer instead of
     109             :  * accessing the array directly...  All of these macros read the locals `word` and `w_idx` of the enclosing function. */
     110             : 
     111             : /* Look at the next letter in the word */
     112             : #define Next_Letter (toupper(word[w_idx+1]))
     113             : /* Look at the current letter in the word */
     114             : #define Curr_Letter (toupper(word[w_idx]))
     115             : /* Go N letters back; yields '\0' when fewer than n letters precede the current one. */
     116             : #define Look_Back_Letter(n)     (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
     117             : /* Previous letter, or '\0' when the current letter is the first one. */
     118             : #define Prev_Letter (Look_Back_Letter(1))
     119             : /* Look two letters down.  It makes sure you don't walk off the string. */
     120             : #define After_Next_Letter       (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
     121             :                                                                                              : '\0')
     122             : #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n)))        /* letter n positions ahead, or '\0' if the string ends first */
     123             : 
     124             : 
     125             : /* Returns the character how_far positions into word, stopping early at the terminating '\0' so we never read past the end of the string. */
     126             : /* I probably could have just used strlen...  NOTE(review): takes char * while Look_Ahead_Letter passes unsigned char * - confirm this is intended. */
     127           0 : static char Lookahead(char *word, int how_far)
     128             : {
     129           0 :         char letter_ahead = '\0';       /* null by default */
     130             :         int idx;
     131           0 :         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);       /* empty body: the loop only advances idx */
     132             :         /* Edge forward in the string... */
     133             : 
     134           0 :         letter_ahead = word[idx];       /* idx is now either equal to how_far or
     135             :                                                                  * the index of the terminating '\0'
     136             :                                                                  */
     137           0 :         return letter_ahead;
     138             : }
     139             : 
     140             : 
     141             : /* Phonize one letter: append c to *phoned_word.
     142             :  * We don't know the buffer's size in advance. One way to solve this is to just
     143             :  * re-allocate the buffer when it is full. We're using an extra of 2 characters (this
     144             :  * could be one though; or more too). */
     145             : #define Phonize(c)      { \
     146             :                                                 if (p_idx >= max_buffer_len) { \
     147             :                                                         *phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
     148             :                                                         max_buffer_len += 2; \
     149             :                                                 } \
     150             :                                                 (*phoned_word)[p_idx++] = c; \
     151             :                                         }
     152             : /* Slap a null character on the end of the phoned word, growing the buffer first if it is exactly full */
     153             : #define End_Phoned_Word { \
     154             :                                                         if (p_idx == max_buffer_len) { \
     155             :                                                                 *phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
     156             :                                                         } \
     157             :                                                         (*phoned_word)[p_idx] = '\0'; \
     158             :                                                 }
     159             : /* How long is the phoned word? */
     160             : #define Phone_Len       (p_idx)
     161             : 
     162             : /* Note if a letter is a 'break' in the word (anything that is not alphabetic) */
     163             : #define Isbreak(c)  (!isalpha(c))
     164             : 
     165             : /* {{{ metaphone
     166             :  * Encodes word into a freshly allocated, NUL-terminated *phoned_word. max_phonemes == 0 means no limit. Returns -1 (before allocating) if max_phonemes < 0 or word is NULL, otherwise 0 (SUCCESS on the no-letters early exit). A non-zero traditional disables the CHR/SCH -> K and SCHW -> 'X' rules. */
     167          24 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
     168             : {
     169          24 :         int w_idx = 0;                          /* point in the phonization we're at. */
     170          24 :         int p_idx = 0;                          /* end of the phoned phrase */
     171          24 :         int max_buffer_len = 0;         /* maximum length of the destination buffer */
     172             : 
     173             : /*-- Parameter checks --*/
     174             :         /* Negative phoneme length is meaningless */
     175             : 
     176          24 :         if (max_phonemes < 0)
     177           2 :                 return -1;
     178             : 
     179             :         /* Empty/null string is meaningless */
     180             :         /* Overly paranoid */
     181             :         /* assert(word != NULL && word[0] != '\0'); */
     182             : 
     183          22 :         if (word == NULL)
     184           0 :                 return -1;
     185             : 
     186             : /*-- Allocate memory for our phoned_phrase --*/
     187          22 :         if (max_phonemes == 0) {        /* Assume largest possible */
     188          21 :                 max_buffer_len = word_len;
     189          21 :                 *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
     190             :         } else {
     191           1 :                 max_buffer_len = max_phonemes;  /* NOTE(review): long is narrowed to int here */
     192           1 :                 *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
     193             :         }
     194             : 
     195             : 
     196             : /*-- The first phoneme has to be processed specially. --*/
     197             :         /* Find our first letter */
     198          24 :         for (; !isalpha(Curr_Letter); w_idx++) {
     199             :                 /* On the off chance we were given nothing but crap... */
     200           4 :                 if (Curr_Letter == '\0') {
     201           2 :                         End_Phoned_Word
     202           2 :                                 return SUCCESS; /* For testing */
     203             :                 }
     204             :         }
     205             : 
     206          20 :         switch (Curr_Letter) {
     207             :                 /* AE becomes E */
     208             :         case 'A':
     209           2 :                 if (Next_Letter == 'E') {
     210           1 :                         Phonize('E');
     211           1 :                         w_idx += 2;
     212             :                 }
     213             :                 /* Remember, preserve vowels at the beginning */
     214             :                 else {
     215           1 :                         Phonize('A');
     216           1 :                         w_idx++;
     217             :                 }
     218           2 :                 break;
     219             :                 /* [GKP]N becomes N */
     220             :         case 'G':
     221             :         case 'K':
     222             :         case 'P':
     223           3 :                 if (Next_Letter == 'N') {
     224           3 :                         Phonize('N');
     225           3 :                         w_idx += 2;
     226             :                 }       /* otherwise w_idx is left alone and the letter is handled by the main loop */
     227           3 :                 break;
     228             :                 /* WH becomes W,
     229             :                    WR becomes R
     230             :                    W if followed by a vowel */
     231             :         case 'W':
     232           3 :                 if (Next_Letter == 'R') {
     233           1 :                         Phonize(Next_Letter);
     234           1 :                         w_idx += 2;
     235           2 :                 } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
     236           2 :                         Phonize('W');
     237           2 :                         w_idx += 2;
     238             :                 }
     239             :                 /* else ignore */
     240           3 :                 break;
     241             :                 /* X becomes S */
     242             :         case 'X':
     243           1 :                 Phonize('S');
     244           1 :                 w_idx++;
     245           1 :                 break;
     246             :                 /* Vowels are kept */
     247             :                 /* We did A already
     248             :                    case 'A':
     249             :                    case 'a':
     250             :                  */
     251             :         case 'E':
     252             :         case 'I':
     253             :         case 'O':
     254             :         case 'U':
     255           0 :                 Phonize(Curr_Letter);
     256           0 :                 w_idx++;
     257             :                 break;
     258             :         default:
     259             :                 /* do nothing */
     260             :                 break;
     261             :         }
     262             : 
     263             : 
     264             : 
     265             :         /* On to the metaphoning */
     266         566 :         for (; Curr_Letter != '\0' &&
     267          12 :                  (max_phonemes == 0 || Phone_Len < max_phonemes);
     268         514 :                  w_idx++) {     /* the limit is tested once per input letter, so X -> "KS" can exceed max_phonemes by one */
     269             :                 /* How many letters to skip because an earlier encoding handled
     270             :                  * multiple letters */
     271         514 :                 unsigned short int skip_letter = 0;
     272             : 
     273             : 
     274             :                 /* THOUGHT:  It would be nice if, rather than having things like...
     275             :                  * well, SCI.  For SCI you encode the S, then have to remember
     276             :                  * to skip the C.  So the phoneme SCI invades both S and C.  It would
     277             :                  * be better, IMHO, to skip the C from the S part of the encoding.
     278             :                  * Hell, I'm trying it.
     279             :                  */
     280             : 
     281             :                 /* Ignore non-alphas */
     282         514 :                 if (!isalpha(Curr_Letter))
     283          98 :                         continue;
     284             : 
     285             :                 /* Drop duplicates, except CC */
     286         428 :                 if (Curr_Letter == Prev_Letter &&
     287          12 :                         Curr_Letter != 'C')
     288          12 :                         continue;
     289             : 
     290         404 :                 switch (Curr_Letter) {
     291             :                         /* B -> B unless in MB */
     292             :                 case 'B':
     293           6 :                         if (Prev_Letter != 'M')
     294           6 :                                 Phonize('B');
     295           6 :                         break;
     296             :                         /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
     297             :                          * (SCHW is handled in S)
     298             :                          *  S if -CI-, -CE- or -CY-
     299             :                          *  dropped if -SCI-, SCE-, -SCY- (handled in S)
     300             :                          *  else K
     301             :                          */
     302             :                 case 'C':
     303          15 :                         if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
     304           2 :                                 if (After_Next_Letter == 'A' &&
     305           0 :                                         Next_Letter == 'I') {   /* CIA */
     306           0 :                                         Phonize(SH);
     307             :                                 }
     308             :                                 /* SC[IEY] */
     309           2 :                                 else if (Prev_Letter == 'S') {
     310             :                                         /* Dropped */
     311             :                                 } else {
     312           2 :                                         Phonize('S');
     313             :                                 }
     314          11 :                         } else if (Next_Letter == 'H') {
     315           1 :                                 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {       /* Christ, School */
     316           0 :                                         Phonize('K');
     317             :                                 } else {
     318           1 :                                         Phonize(SH);
     319             :                                 }
     320           1 :                                 skip_letter++;
     321             :                         } else {
     322          10 :                                 Phonize('K');
     323             :                         }
     324          13 :                         break;
     325             :                         /* J if in -DGE-, -DGI- or -DGY-
     326             :                          * else T
     327             :                          */
     328             :                 case 'D':
     329          24 :                         if (Next_Letter == 'G' &&
     330           0 :                                 MAKESOFT(After_Next_Letter)) {
     331           0 :                                 Phonize('J');
     332           0 :                                 skip_letter++;
     333             :                         } else
     334          24 :                                 Phonize('T');
     335          24 :                         break;
     336             :                         /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
     337             :                          * else dropped if -GNED, -GN,
     338             :                          * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
     339             :                          * else J if in -GE-, -GI, -GY and not GG
     340             :                          * else K
     341             :                          */
     342             :                 case 'G':
     343          11 :                         if (Next_Letter == 'H') {
     344           6 :                                 if (!(NOGHTOF(Look_Back_Letter(3)) ||
     345           3 :                                           Look_Back_Letter(4) == 'H')) {
     346           3 :                                         Phonize('F');
     347           3 :                                         skip_letter++;
     348             :                                 } else {
     349             :                                         /* silent */
     350             :                                 }
     351           8 :                         } else if (Next_Letter == 'N') {
     352           0 :                                 if (Isbreak(After_Next_Letter) ||
     353           0 :                                         (After_Next_Letter == 'E' &&
     354           0 :                                          Look_Ahead_Letter(3) == 'D')) {
     355             :                                         /* dropped */
     356             :                                 } else
     357           0 :                                         Phonize('K');
     358           8 :                         } else if (MAKESOFT(Next_Letter) &&
     359           0 :                                            Prev_Letter != 'G') {
     360           0 :                                 Phonize('J');
     361             :                         } else {
     362           8 :                                 Phonize('K');
     363             :                         }
     364          11 :                         break;
     365             :                         /* H if before a vowel and not after C,G,P,S,T */
     366             :                 case 'H':
     367          12 :                         if (isvowel(Next_Letter) &&
     368           6 :                                 !AFFECTH(Prev_Letter))
     369           4 :                                 Phonize('H');
     370           6 :                         break;
     371             :                         /* dropped if after C
     372             :                          * else K
     373             :                          */
     374             :                 case 'K':
     375           4 :                         if (Prev_Letter != 'C')
     376           3 :                                 Phonize('K');
     377           4 :                         break;
     378             :                         /* F if before H
     379             :                          * else P
     380             :                          */
     381             :                 case 'P':
     382           7 :                         if (Next_Letter == 'H') {
     383           2 :                                 Phonize('F');
     384             :                         } else {
     385           5 :                                 Phonize('P');
     386             :                         }
     387           7 :                         break;
     388             :                         /* K
     389             :                          */
     390             :                 case 'Q':
     391           0 :                         Phonize('K');
     392           0 :                         break;
     393             :                         /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
     394             :                          * else S
     395             :                          */
     396             :                 case 'S':
     397          23 :                         if (Next_Letter == 'I' &&
     398           0 :                                 (After_Next_Letter == 'O' ||
     399           0 :                                  After_Next_Letter == 'A')) {
     400           0 :                                 Phonize(SH);
     401          23 :                         } else if (Next_Letter == 'H') {
     402           2 :                                 Phonize(SH);
     403           2 :                                 skip_letter++;
     404          21 :                         } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
     405           0 :                                 Phonize(SH);
     406           0 :                                 skip_letter += 2;
     407             :                         } else {
     408          21 :                                 Phonize('S');
     409             :                         }
     410          23 :                         break;
     411             :                         /* 'sh' in -TIA- or -TIO-
     412             :                          * else 'th' before H
     413             :                          * else T
     414             :                          */
     415             :                 case 'T':
     416          38 :                         if (Next_Letter == 'I' &&
     417           0 :                                 (After_Next_Letter == 'O' ||
     418           0 :                                  After_Next_Letter == 'A')) {
     419           0 :                                 Phonize(SH);
     420          38 :                         } else if (Next_Letter == 'H') {
     421          21 :                                 Phonize(TH);
     422          21 :                                 skip_letter++;
     423          17 :                         } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
     424          16 :                                 Phonize('T');
     425             :                         }
     426          38 :                         break;
     427             :                         /* F */
     428             :                 case 'V':
     429           7 :                         Phonize('F');
     430           7 :                         break;
     431             :                         /* W before a vowel, else dropped */
     432             :                 case 'W':
     433          16 :                         if (isvowel(Next_Letter))
     434          12 :                                 Phonize('W');
     435          16 :                         break;
     436             :                         /* KS */
     437             :                 case 'X':
     438           7 :                         Phonize('K');
     439           7 :                         Phonize('S');
     440           7 :                         break;
     441             :                         /* Y if followed by a vowel */
     442             :                 case 'Y':
     443           6 :                         if (isvowel(Next_Letter))
     444           2 :                                 Phonize('Y');
     445           6 :                         break;
     446             :                         /* S */
     447             :                 case 'Z':
     448           3 :                         Phonize('S');
     449           3 :                         break;
     450             :                         /* No transformation */
     451             :                 case 'F':
     452             :                 case 'J':
     453             :                 case 'L':
     454             :                 case 'M':
     455             :                 case 'N':
     456             :                 case 'R':
     457          92 :                         Phonize(Curr_Letter);
     458             :                         break;
     459             :                 default:
     460             :                         /* nothing */
     461             :                         break;
     462             :                 }                                               /* END SWITCH */
     463             : 
     464         404 :                 w_idx += skip_letter;
     465             :         }                                                       /* END FOR */
     466             : 
     467          20 :         End_Phoned_Word;
     468             : 
     469          20 :         return 0;
     470             : }                                                               /* END metaphone */
     471             : /* }}} */
     472             : 
     473             : /*
     474             :  * Local variables:
     475             :  * tab-width: 4
     476             :  * c-basic-offset: 4
     477             :  * End:
     478             :  * vim600: sw=4 ts=4 fdm=marker
     479             :  * vim<600: sw=4 ts=4
     480             :  */

Generated by: LCOV version 1.10

Generated at Mon, 15 Dec 2014 17:02:52 +0000 (5 days ago)

Copyright © 2005-2014 The PHP Group
All rights reserved.