PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LCOV - code coverage report
Current view: top level - ext/standard - metaphone.c (source / functions) Hit Total Coverage
Test: PHP Code Coverage Lines: 123 154 79.9 %
Date: 2014-11-22 Functions: 2 3 66.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    +----------------------------------------------------------------------+
       3             :    | PHP Version 7                                                        |
       4             :    +----------------------------------------------------------------------+
       5             :    | Copyright (c) 1997-2014 The PHP Group                                |
       6             :    +----------------------------------------------------------------------+
       7             :    | This source file is subject to version 3.01 of the PHP license,      |
       8             :    | that is bundled with this package in the file LICENSE, and is        |
       9             :    | available through the world-wide-web at the following url:           |
      10             :    | http://www.php.net/license/3_01.txt                                  |
      11             :    | If you did not receive a copy of the PHP license and are unable to   |
      12             :    | obtain it through the world-wide-web, please send a note to          |
      13             :    | license@php.net so we can mail you a copy immediately.               |
      14             :    +----------------------------------------------------------------------+
      15             :    | Author: Thies C. Arntzen <thies@thieso.net>                          |
      16             :    +----------------------------------------------------------------------+
      17             : */
      18             : 
      19             : /* $Id$ */
      20             : 
      21             : /*
      22             :         Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      23             : */
      24             : 
      25             : #include "php.h"
      26             : #include "php_metaphone.h"
      27             : 
      28             : static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional);
      29             : 
      30             : /* {{{ proto string metaphone(string text[, int phones])
      31             :    Break English phrases down into their phonemes; yields false when the internal metaphone() call reports failure (e.g. a negative phones value) */
      32          25 : PHP_FUNCTION(metaphone)
      33             : {
      34             :         zend_string *str;
      35          25 :         zend_string *result = NULL;
      36          25 :         zend_long phones = 0;
      37             : 
      38          25 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S|l", &str, &phones) == FAILURE) {
      39           1 :                 return;
      40             :         }
      41             : 
      42          24 :         if (metaphone((unsigned char *)str->val, str->len, phones, &result, 1) == 0) { /* the trailing 1 is the "traditional" flag */
      43          22 :                 RETVAL_STR(result);
      44             :         } else {
      45           2 :                 if (result) { /* release a buffer that a failed call may have left behind */
      46           0 :                         zend_string_free(result);
      47             :                 }
      48           2 :                 RETURN_FALSE;
      49             :         }
      50             : }
      51             : /* }}} */
      52             : 
      53             : /*
      54             :    This is now the original code by Michael G Schwern.
      55             :    I've changed it only slightly (use emalloc,
      56             :    get rid of includes, etc.)
      57             :         - thies - 13.09.1999
      58             : */
      59             : 
      60             : /*-----------------------------  */
      61             : /* this used to be "metaphone.h" */
      62             : /*-----------------------------  */
      63             : 
      64             : /* Special encodings: single-character placeholders emitted for the 'sh' and 'th' sounds */
      65             : #define  SH     'X'
      66             : #define  TH             '0'
      67             : 
      68             : /*-----------------------------  */
      69             : /* end of "metaphone.h"          */
      70             : /*-----------------------------  */
      71             : 
      72             : /*----------------------------- */
      73             : /* this used to be "metachar.h" */
      74             : /*----------------------------- */
      75             : 
      76             : /* Metachar.h ... little bits about characters for metaphone */
      77             : /*-- Character encoding array & accessing macros --*/
      78             : /* Taken directly out of the book.  One entry per letter a..z; each entry is a set of bit flags tested by the macros below: 1 = vowel, 2 = passed through unchanged, 4 = affects a following H, 8 = softens C and G, 16 = keeps GH from becoming F */
      79             : char _codes[26] =
      80             : {
      81             :         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
      82             : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
      83             : };
      84             : 
      85             : /* Flag lookup for a character; anything isalpha() rejects yields 0.  NOTE(review): isalpha()/toupper() are locale-dependent -- if a locale accepts a byte whose upper-case form lies outside 'A'..'Z', the index would fall outside _codes[26]; confirm the locale assumptions. */
      86             : #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
      87             : 
      88             : #define isvowel(c)  (ENCODE(c) & 1)         /* AEIOU */
      89             : 
      90             : /* These letters are passed through unchanged */
      91             : #define NOCHANGE(c) (ENCODE(c) & 2)         /* FJLMNR */
      92             : 
      93             : /* These form diphthongs when preceding H */
      94             : #define AFFECTH(c)  (ENCODE(c) & 4)         /* CGPST */
      95             : 
      96             : /* These make C and G soft */
      97             : #define MAKESOFT(c) (ENCODE(c) & 8)         /* EIY */
      98             : 
      99             : /* These prevent GH from becoming F */
     100             : #define NOGHTOF(c)  (ENCODE(c) & 16)        /* BDH */
     101             : 
     102             : /*----------------------------- */
     103             : /* end of "metachar.h"          */
     104             : /*----------------------------- */
     105             : 
     106             : /* I suppose I could have been using a character pointer instead of
     107             :  * accessing the array directly...  All of these macros expect `word` and `w_idx` to be in scope. */
     108             : 
     109             : /* Look at the next letter in the word */
     110             : #define Next_Letter (toupper(word[w_idx+1]))
     111             : /* Look at the current letter in the word */
     112             : #define Curr_Letter (toupper(word[w_idx]))
     113             : /* Go N letters back; yields '\0' when fewer than N letters precede the current position. */
     114             : #define Look_Back_Letter(n)     (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
     115             : /* Previous letter, or '\0' at the very start of the word (see Look_Back_Letter). */
     116             : #define Prev_Letter (Look_Back_Letter(1))
     117             : /* Look two letters down.  It only reads word[w_idx+2] when the next letter is not the terminator, so you don't walk off the string. */
     118             : #define After_Next_Letter       (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
     119             :                                                                                              : '\0')
     120             : #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n))) /* upper-cased letter N positions ahead, via Lookahead() */
     121             : 
     122             : 
     123             : /* Allows us to safely look ahead an arbitrary # of letters: returns the character how_far positions past word, or the terminating '\0' if the string ends sooner */
     124             : /* I probably could have just used strlen...  NOTE(review): the only caller visible here (Look_Ahead_Letter) passes an unsigned char * (word + w_idx) to this char * parameter -- confirm the pointer-sign conversion is intended. */
     125           0 : static char Lookahead(char *word, int how_far)
     126             : {
     127           0 :         char letter_ahead = '\0';       /* null by default */
     128             :         int idx;
     129           0 :         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     130             :         /* The empty-bodied loop above edges forward in the string, stopping at the terminator or after how_far steps */
     131             : 
     132           0 :         letter_ahead = word[idx];       /* idx will be either == to how_far or
     133             :                                                                  * at the end of the string
     134             :                                                                  */
     135           0 :         return letter_ahead;
     136             : }
     137             : 
     138             : 
     139             : /* Phonize one letter: append c to *phoned_word and update its length.
     140             :  * We don't know the buffer's size in advance. One way to solve this is to just
     141             :  * re-allocate the buffer when it is full. We're growing it by 2 characters each time (this
     142             :  * could be one though; or more too).  Expects p_idx, max_buffer_len and phoned_word in scope. */
     143             : #define Phonize(c)      { \
     144             :                                                 if (p_idx >= max_buffer_len) { \
     145             :                                                         *phoned_word = zend_string_realloc(*phoned_word, 2 * sizeof(char) + max_buffer_len, 0); \
     146             :                                                         max_buffer_len += 2; \
     147             :                                                 } \
     148             :                                                 (*phoned_word)->val[p_idx++] = c; \
     149             :                                                 (*phoned_word)->len = p_idx; \
     150             :                                         }
     151             : /* Slap a null character on the end of the phoned word, growing the buffer by one character first if it is exactly full */
     152             : #define End_Phoned_Word { \
     153             :                                                         if (p_idx == max_buffer_len) { \
     154             :                                                                 *phoned_word = zend_string_realloc(*phoned_word, 1 * sizeof(char) + max_buffer_len, 0); \
     155             :                                                                 max_buffer_len += 1; \
     156             :                                                         } \
     157             :                                                         (*phoned_word)->val[p_idx] = '\0'; \
     158             :                                                         (*phoned_word)->len = p_idx; \
     159             :                                                 }
     160             : /* How long is the phoned word? */
     161             : #define Phone_Len       (p_idx)
     162             : 
     163             : /* Note if a letter is a 'break' in the word (anything isalpha() rejects) */
     164             : #define Isbreak(c)  (!isalpha(c))
     165             : 
     166             : /* {{{ metaphone
     167             :  * Encodes word phonetically into a newly allocated *phoned_word.  max_phonemes == 0 means no limit on the output length.  Returns -1 (leaving *phoned_word untouched) when max_phonemes is negative or word is NULL; otherwise returns SUCCESS / 0. */
     168          24 : static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional)
     169             : {
     170          24 :         int w_idx = 0;                          /* point in the phonization we're at. */
     171          24 :         int p_idx = 0;                          /* end of the phoned phrase */
     172          24 :         size_t max_buffer_len = 0;              /* maximum length of the destination buffer */
     173             : 
     174             : /*-- Parameter checks --*/
     175             :         /* Negative phoneme length is meaningless */
     176             : 
     177          24 :         if (max_phonemes < 0)
     178           2 :                 return -1;
     179             : 
     180             :         /* Empty/null string is meaningless */
     181             :         /* Overly paranoid */
     182             :         /* assert(word != NULL && word[0] != '\0'); */
     183             : 
     184          22 :         if (word == NULL)
     185           0 :                 return -1;
     186             : 
     187             : /*-- Allocate memory for our phoned_phrase --*/
     188          22 :         if (max_phonemes == 0) {        /* Assume largest possible */
     189          21 :                 max_buffer_len = word_len;
     190          42 :                 *phoned_word = zend_string_alloc(sizeof(char) * word_len + 1, 0);
     191             :         } else {
     192           1 :                 max_buffer_len = max_phonemes;
     193           2 :                 *phoned_word = zend_string_alloc(sizeof(char) * max_phonemes + 1, 0);
     194             :         }
     195             : 
     196             : 
     197             : /*-- The first phoneme has to be processed specially. --*/
     198             :         /* Find our first letter */
     199          24 :         for (; !isalpha(Curr_Letter); w_idx++) {
     200             :                 /* On the off chance we were given nothing but crap... */
     201           4 :                 if (Curr_Letter == '\0') {
     202           3 :                         End_Phoned_Word
     203           2 :                                 return SUCCESS; /* For testing */
     204             :                 }
     205             :         }
     206             : 
     207          20 :         switch (Curr_Letter) {
     208             :                 /* AE becomes E */
     209             :         case 'A':
     210           2 :                 if (Next_Letter == 'E') {
     211           1 :                         Phonize('E');
     212           1 :                         w_idx += 2;
     213             :                 }
     214             :                 /* Remember, preserve vowels at the beginning */
     215             :                 else {
     216           1 :                         Phonize('A');
     217           1 :                         w_idx++;
     218             :                 }
     219           2 :                 break;
     220             :                 /* [GKP]N becomes N */
     221             :         case 'G':
     222             :         case 'K':
     223             :         case 'P':
     224           3 :                 if (Next_Letter == 'N') {
     225           3 :                         Phonize('N');
     226           3 :                         w_idx += 2;
     227             :                 } /* otherwise w_idx is left alone, so the main loop below encodes this letter */
     228           3 :                 break;
     229             :                 /* WH becomes W,
     230             :                    WR becomes R
     231             :                    W if followed by a vowel */
     232             :         case 'W':
     233           3 :                 if (Next_Letter == 'R') {
     234           1 :                         Phonize(Next_Letter);
     235           1 :                         w_idx += 2;
     236           2 :                 } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
     237           2 :                         Phonize('W');
     238           2 :                         w_idx += 2;
     239             :                 }
     240             :                 /* else ignore */
     241           3 :                 break;
     242             :                 /* X becomes S */
     243             :         case 'X':
     244           1 :                 Phonize('S');
     245           1 :                 w_idx++;
     246           1 :                 break;
     247             :                 /* Vowels are kept */
     248             :                 /* We did A already
     249             :                    case 'A':
     250             :                    case 'a':
     251             :                  */
     252             :         case 'E':
     253             :         case 'I':
     254             :         case 'O':
     255             :         case 'U':
     256           0 :                 Phonize(Curr_Letter);
     257           0 :                 w_idx++;
     258             :                 break;
     259             :         default:
     260             :                 /* do nothing */
     261             :                 break;
     262             :         }
     263             : 
     264             : 
     265             : 
     266             :         /* On to the metaphoning */
     267         566 :         for (; Curr_Letter != '\0' &&
     268          12 :                  (max_phonemes == 0 || Phone_Len < max_phonemes);
     269         514 :                  w_idx++) {
     270             :                 /* How many letters to skip because an earlier encoding handled
     271             :                  * multiple letters */
     272         514 :                 unsigned short int skip_letter = 0;
     273             : 
     274             : 
     275             :                 /* THOUGHT:  It would be nice if, rather than having things like...
     276             :                  * well, SCI.  For SCI you encode the S, then have to remember
     277             :                  * to skip the C.  So the phoneme SCI invades both S and C.  It would
     278             :                  * be better, IMHO, to skip the C from the S part of the encoding.
     279             :                  * Hell, I'm trying it.
     280             :                  */
     281             : 
     282             :                 /* Ignore non-alphas */
     283         514 :                 if (!isalpha(Curr_Letter))
     284          98 :                         continue;
     285             : 
     286             :                 /* Drop duplicates, except CC */
     287         428 :                 if (Curr_Letter == Prev_Letter &&
     288          12 :                         Curr_Letter != 'C')
     289          12 :                         continue;
     290             : 
     291         404 :                 switch (Curr_Letter) {
     292             :                         /* B -> B unless in MB */
     293             :                 case 'B':
     294           6 :                         if (Prev_Letter != 'M')
     295           6 :                                 Phonize('B');
     296           6 :                         break;
     297             :                         /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
     298             :                          * (SCHW is handled in S)
     299             :                          *  S if -CI-, -CE- or -CY-
     300             :                          *  dropped if -SCI-, SCE-, -SCY- (handled in S)
     301             :                          *  else K
     302             :                          */
     303             :                 case 'C':
     304          15 :                         if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
     305           2 :                                 if (After_Next_Letter == 'A' &&
     306           0 :                                         Next_Letter == 'I') {   /* CIA */
     307           0 :                                         Phonize(SH);
     308             :                                 }
     309             :                                 /* SC[IEY] */
     310           2 :                                 else if (Prev_Letter == 'S') {
     311             :                                         /* Dropped */
     312             :                                 } else {
     313           2 :                                         Phonize('S');
     314             :                                 }
     315          11 :                         } else if (Next_Letter == 'H') {
     316           1 :                                 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {       /* Christ, School */
     317           0 :                                         Phonize('K');
     318             :                                 } else {
     319           1 :                                         Phonize(SH);
     320             :                                 }
     321           1 :                                 skip_letter++;
     322             :                         } else {
     323          10 :                                 Phonize('K');
     324             :                         }
     325          13 :                         break;
     326             :                         /* J if in -DGE-, -DGI- or -DGY-
     327             :                          * else T
     328             :                          */
     329             :                 case 'D':
     330          24 :                         if (Next_Letter == 'G' &&
     331           0 :                                 MAKESOFT(After_Next_Letter)) {
     332           0 :                                 Phonize('J');
     333           0 :                                 skip_letter++;
     334             :                         } else
     335          24 :                                 Phonize('T');
     336          24 :                         break;
     337             :                         /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
     338             :                          * else dropped if -GNED, -GN,
     339             :                          * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
     340             :                          * else J if in -GE-, -GI, -GY and not GG
     341             :                          * else K
     342             :                          */
     343             :                 case 'G':
     344          11 :                         if (Next_Letter == 'H') {
     345           6 :                                 if (!(NOGHTOF(Look_Back_Letter(3)) ||
     346           3 :                                           Look_Back_Letter(4) == 'H')) {
     347           3 :                                         Phonize('F');
     348           3 :                                         skip_letter++;
     349             :                                 } else {
     350             :                                         /* silent */
     351             :                                 }
     352           8 :                         } else if (Next_Letter == 'N') {
     353           0 :                                 if (Isbreak(After_Next_Letter) ||
     354           0 :                                         (After_Next_Letter == 'E' &&
     355           0 :                                          Look_Ahead_Letter(3) == 'D')) {
     356             :                                         /* dropped */
     357             :                                 } else
     358           0 :                                         Phonize('K');
     359           8 :                         } else if (MAKESOFT(Next_Letter) &&
     360           0 :                                            Prev_Letter != 'G') {
     361           0 :                                 Phonize('J');
     362             :                         } else {
     363           8 :                                 Phonize('K');
     364             :                         }
     365          11 :                         break;
     366             :                         /* H if before a vowel and not after C,G,P,S,T */
     367             :                 case 'H':
     368          12 :                         if (isvowel(Next_Letter) &&
     369           6 :                                 !AFFECTH(Prev_Letter))
     370           4 :                                 Phonize('H');
     371           6 :                         break;
     372             :                         /* dropped if after C
     373             :                          * else K
     374             :                          */
     375             :                 case 'K':
     376           4 :                         if (Prev_Letter != 'C')
     377           3 :                                 Phonize('K');
     378           4 :                         break;
     379             :                         /* F if before H
     380             :                          * else P
     381             :                          */
     382             :                 case 'P':
     383           7 :                         if (Next_Letter == 'H') {
     384           2 :                                 Phonize('F');
     385             :                         } else {
     386           5 :                                 Phonize('P');
     387             :                         }
     388           7 :                         break;
     389             :                         /* K
     390             :                          */
     391             :                 case 'Q':
     392           0 :                         Phonize('K');
     393           0 :                         break;
     394             :                         /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
     395             :                          * else S
     396             :                          */
     397             :                 case 'S':
     398          23 :                         if (Next_Letter == 'I' &&
     399           0 :                                 (After_Next_Letter == 'O' ||
     400           0 :                                  After_Next_Letter == 'A')) {
     401           0 :                                 Phonize(SH);
     402          23 :                         } else if (Next_Letter == 'H') {
     403           2 :                                 Phonize(SH);
     404           2 :                                 skip_letter++;
     405          21 :                         } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
     406           0 :                                 Phonize(SH);
     407           0 :                                 skip_letter += 2;
     408             :                         } else {
     409          21 :                                 Phonize('S');
     410             :                         }
     411          23 :                         break;
     412             :                         /* 'sh' in -TIA- or -TIO-
     413             :                          * else 'th' before H
     414             :                          * else T
     415             :                          */
     416             :                 case 'T':
     417          38 :                         if (Next_Letter == 'I' &&
     418           0 :                                 (After_Next_Letter == 'O' ||
     419           0 :                                  After_Next_Letter == 'A')) {
     420           0 :                                 Phonize(SH);
     421          38 :                         } else if (Next_Letter == 'H') {
     422          21 :                                 Phonize(TH);
     423          21 :                                 skip_letter++;
     424          17 :                         } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
     425          16 :                                 Phonize('T');
     426             :                         }
     427          38 :                         break;
     428             :                         /* F */
     429             :                 case 'V':
     430           7 :                         Phonize('F');
     431           7 :                         break;
     432             :                         /* W before a vowel, else dropped */
     433             :                 case 'W':
     434          16 :                         if (isvowel(Next_Letter))
     435          12 :                                 Phonize('W');
     436          16 :                         break;
     437             :                         /* KS */
     438             :                 case 'X':
     439           8 :                         Phonize('K');
     440           9 :                         Phonize('S');
     441           7 :                         break;
     442             :                         /* Y if followed by a vowel */
     443             :                 case 'Y':
     444           6 :                         if (isvowel(Next_Letter))
     445           2 :                                 Phonize('Y');
     446           6 :                         break;
     447             :                         /* S */
     448             :                 case 'Z':
     449           4 :                         Phonize('S');
     450           3 :                         break;
     451             :                         /* No transformation */
     452             :                 case 'F':
     453             :                 case 'J':
     454             :                 case 'L':
     455             :                 case 'M':
     456             :                 case 'N':
     457             :                 case 'R':
     458          92 :                         Phonize(Curr_Letter);
     459             :                         break;
     460             :                 default:
     461             :                         /* nothing */
     462             :                         break;
     463             :                 }                                               /* END SWITCH */
     464             : 
     465         404 :                 w_idx += skip_letter;
     466             :         }                                                       /* END FOR */
     467             : 
     468          23 :         End_Phoned_Word;
     469             : 
     470          20 :         return 0;
     471             : }                                                               /* END metaphone */
     472             : /* }}} */
     473             : 
     474             : /*
     475             :  * Local variables:
     476             :  * tab-width: 4
     477             :  * c-basic-offset: 4
     478             :  * End:
     479             :  * vim600: sw=4 ts=4 fdm=marker
     480             :  * vim<600: sw=4 ts=4
     481             :  */

Generated by: LCOV version 1.10

Generated at Sat, 22 Nov 2014 23:01:26 +0000 (3 days ago)

Copyright © 2005-2014 The PHP Group
All rights reserved.