1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 5 |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1997-2009 The PHP Group |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 3.01 of the PHP license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at the following url: |
10 : | http://www.php.net/license/3_01.txt |
11 : | If you did not receive a copy of the PHP license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@php.net so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Author: Thies C. Arntzen <thies@thieso.net> |
16 : +----------------------------------------------------------------------+
17 : */
18 :
19 : /* $Id: metaphone.c 283127 2009-06-30 11:46:20Z felipe $ */
20 :
21 : /*
22 : Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
23 : */
24 :
25 : #include "php.h"
26 : #include "php_metaphone.h"
27 :
28 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
29 :
30 : /* {{{ proto string metaphone(string text[, int phones])
31 : Break english phrases down into their phonemes */
32 : PHP_FUNCTION(metaphone)
33 25 : {
34 : char *str;
35 25 : char *result = 0;
36 : int str_len;
37 25 : long phones = 0;
38 :
39 25 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
40 : &phones) == FAILURE) {
41 1 : return;
42 : }
43 :
44 24 : if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {
45 22 : RETVAL_STRING(result, 0);
46 : } else {
47 2 : if (result) {
48 0 : efree(result);
49 : }
50 2 : RETURN_FALSE;
51 : }
52 : }
53 : /* }}} */
54 :
55 : /*
56 : this is now the original code by Michael G Schwern:
57 : I've changed it only slightly (use emalloc,
58 : get rid of includes, etc.)
59 : - thies - 13.09.1999
60 : */
61 :
62 : /*----------------------------- */
63 : /* this used to be "metaphone.h" */
64 : /*----------------------------- */
65 :
/* Special encodings: single output characters standing in for two-letter sounds */
#define  SH 	'X'		/* emitted for the "sh" sound */
#define  TH		'0'		/* emitted for the "th" sound (the digit zero) */

/*----------------------------- */
/* end of "metaphone.h"         */
/*----------------------------- */

/*----------------------------- */
/* this used to be "metachar.h" */
/*----------------------------- */

/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */
/* One bit-flag byte per letter A..Z.  The bits are tested by the macros below:
 *   1  vowel (AEIOU)
 *   2  passed through unchanged (FJLMNR)
 *   4  forms a digraph with a following H (CGPST)
 *   8  makes a preceding C or G soft (EIY)
 *  16  prevents GH from becoming F (BDH)
 */
char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
/*  a   b  c   d  e  f  g   h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z */
};


/* Flag byte for character c, or 0 when c is not a letter A..Z / a..z.
 *
 * The range is checked explicitly after upper-casing instead of relying on
 * isalpha(): isalpha() is locale dependent, and in a locale that classifies
 * bytes above 'Z' as alphabetic the subscript would run past the end of the
 * 26-entry table.
 *
 * The argument is evaluated more than once, so it must be free of side
 * effects. */
#define ENCODE(c) \
	((toupper(c) >= 'A' && toupper(c) <= 'Z') ? _codes[toupper(c) - 'A'] : 0)

#define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */
103 :
104 : /*----------------------------- */
105 : /* end of "metachar.h" */
106 : /*----------------------------- */
107 :
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* All of the macros below expect two locals to be in scope at the point of
 * use: `word` (the input string) and `w_idx` (the current index into it).
 * Each yields the upper-cased character as an int. */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start of the
 * word.  N is parenthesised so that an expression argument keeps its meaning. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter   (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
											     : '\0')
/* Look N letters down via Lookahead(), which stops at the terminator.
 * Lookahead() returns a plain char; it is converted to unsigned char before
 * being given to toupper(), because passing a negative value other than EOF
 * to the <ctype.h> functions is undefined. */
#define Look_Ahead_Letter(n) \
	(toupper((unsigned char) Lookahead((char *) (word + w_idx), (n))))
123 :
124 :
/* Safe look-ahead: returns the character `how_far` positions past `word`,
 * or the terminating '\0' if the string ends before that position is
 * reached.  A how_far of zero or less returns the first character. */
static char Lookahead(char *word, int how_far)
{
	int steps = 0;

	/* Advance one position at a time so the terminator is never stepped over. */
	while (steps < how_far && word[steps] != '\0') {
		steps++;
	}

	/* steps is now either how_far or the index of the terminator. */
	return word[steps];
}
139 :
140 :
/* The macros below expect these locals to be in scope at the point of use:
 *   phoned_word     char **, address of the output buffer pointer
 *   p_idx           int, number of characters written so far
 *   max_buffer_len  int, number of characters the buffer is known to hold
 */

/* phonize one letter
 * We don't know the buffer's size in advance.  One way to solve this is to
 * just re-allocate the buffer.  We grow by 2 characters at a time (this
 * could be one though; or more too).
 *
 * Wrapped in do { } while (0) so that `Phonize(x);` behaves as one statement,
 * including directly before an `else`.  The argument is parenthesised and
 * evaluated exactly once. */
#define Phonize(c)  do { \
						if (p_idx >= max_buffer_len) { \
							*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
							max_buffer_len += 2; \
						} \
						(*phoned_word)[p_idx++] = (c); \
					} while (0)
/* Slap a null character on the end of the phoned word.
 * When the buffer is exactly full, one more byte is requested for the
 * terminator; max_buffer_len is not updated here.
 * Deliberately left as a plain brace block (not do/while) so that it stays
 * valid whether or not the call site follows it with a semicolon. */
#define End_Phoned_Word	{ \
							if (p_idx == max_buffer_len) { \
								*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
							} \
							(*phoned_word)[p_idx] = '\0'; \
						}
/* How long is the phoned word? */
#define Phone_Len	(p_idx)

/* Note if a letter is a 'break' in the word (anything that is not alphabetic) */
#define Isbreak(c)  (!isalpha(c))
164 :
165 : /* {{{ metaphone
166 : */
167 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
168 24 : {
169 24 : int w_idx = 0; /* point in the phonization we're at. */
170 24 : int p_idx = 0; /* end of the phoned phrase */
171 24 : int max_buffer_len = 0; /* maximum length of the destination buffer */
172 :
173 : /*-- Parameter checks --*/
174 : /* Negative phoneme length is meaningless */
175 :
176 24 : if (max_phonemes < 0)
177 2 : return -1;
178 :
179 : /* Empty/null string is meaningless */
180 : /* Overly paranoid */
181 : /* assert(word != NULL && word[0] != '\0'); */
182 :
183 22 : if (word == NULL)
184 0 : return -1;
185 :
186 : /*-- Allocate memory for our phoned_phrase --*/
187 22 : if (max_phonemes == 0) { /* Assume largest possible */
188 21 : max_buffer_len = word_len;
189 21 : *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
190 : } else {
191 1 : max_buffer_len = max_phonemes;
192 1 : *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
193 : }
194 :
195 :
196 : /*-- The first phoneme has to be processed specially. --*/
197 : /* Find our first letter */
198 24 : for (; !isalpha(Curr_Letter); w_idx++) {
199 : /* On the off chance we were given nothing but crap... */
200 4 : if (Curr_Letter == '\0') {
201 2 : End_Phoned_Word
202 2 : return SUCCESS; /* For testing */
203 : }
204 : }
205 :
206 20 : switch (Curr_Letter) {
207 : /* AE becomes E */
208 : case 'A':
209 2 : if (Next_Letter == 'E') {
210 1 : Phonize('E');
211 1 : w_idx += 2;
212 : }
213 : /* Remember, preserve vowels at the beginning */
214 : else {
215 1 : Phonize('A');
216 1 : w_idx++;
217 : }
218 2 : break;
219 : /* [GKP]N becomes N */
220 : case 'G':
221 : case 'K':
222 : case 'P':
223 3 : if (Next_Letter == 'N') {
224 3 : Phonize('N');
225 3 : w_idx += 2;
226 : }
227 3 : break;
228 : /* WH becomes W,
229 : WR becomes R
230 : W if followed by a vowel */
231 : case 'W':
232 3 : if (Next_Letter == 'R') {
233 1 : Phonize(Next_Letter);
234 1 : w_idx += 2;
235 2 : } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
236 2 : Phonize('W');
237 2 : w_idx += 2;
238 : }
239 : /* else ignore */
240 3 : break;
241 : /* X becomes S */
242 : case 'X':
243 1 : Phonize('S');
244 1 : w_idx++;
245 1 : break;
246 : /* Vowels are kept */
247 : /* We did A already
248 : case 'A':
249 : case 'a':
250 : */
251 : case 'E':
252 : case 'I':
253 : case 'O':
254 : case 'U':
255 0 : Phonize(Curr_Letter);
256 0 : w_idx++;
257 : break;
258 : default:
259 : /* do nothing */
260 : break;
261 : }
262 :
263 :
264 :
265 : /* On to the metaphoning */
266 554 : for (; Curr_Letter != '\0' &&
267 : (max_phonemes == 0 || Phone_Len < max_phonemes);
268 514 : w_idx++) {
269 : /* How many letters to skip because an eariler encoding handled
270 : * multiple letters */
271 514 : unsigned short int skip_letter = 0;
272 :
273 :
274 : /* THOUGHT: It would be nice if, rather than having things like...
275 : * well, SCI. For SCI you encode the S, then have to remember
276 : * to skip the C. So the phonome SCI invades both S and C. It would
277 : * be better, IMHO, to skip the C from the S part of the encoding.
278 : * Hell, I'm trying it.
279 : */
280 :
281 : /* Ignore non-alphas */
282 514 : if (!isalpha(Curr_Letter))
283 98 : continue;
284 :
285 : /* Drop duplicates, except CC */
286 416 : if (Curr_Letter == Prev_Letter &&
287 : Curr_Letter != 'C')
288 12 : continue;
289 :
290 404 : switch (Curr_Letter) {
291 : /* B -> B unless in MB */
292 : case 'B':
293 6 : if (Prev_Letter != 'M')
294 6 : Phonize('B');
295 6 : break;
296 : /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
297 : * (SCHW is handled in S)
298 : * S if -CI-, -CE- or -CY-
299 : * dropped if -SCI-, SCE-, -SCY- (handed in S)
300 : * else K
301 : */
302 : case 'C':
303 15 : if (MAKESOFT(Next_Letter)) { /* C[IEY] */
304 2 : if (After_Next_Letter == 'A' &&
305 : Next_Letter == 'I') { /* CIA */
306 0 : Phonize(SH);
307 : }
308 : /* SC[IEY] */
309 2 : else if (Prev_Letter == 'S') {
310 : /* Dropped */
311 : } else {
312 2 : Phonize('S');
313 : }
314 11 : } else if (Next_Letter == 'H') {
315 1 : if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */
316 0 : Phonize('K');
317 : } else {
318 1 : Phonize(SH);
319 : }
320 1 : skip_letter++;
321 : } else {
322 10 : Phonize('K');
323 : }
324 13 : break;
325 : /* J if in -DGE-, -DGI- or -DGY-
326 : * else T
327 : */
328 : case 'D':
329 24 : if (Next_Letter == 'G' &&
330 : MAKESOFT(After_Next_Letter)) {
331 0 : Phonize('J');
332 0 : skip_letter++;
333 : } else
334 24 : Phonize('T');
335 24 : break;
336 : /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
337 : * else dropped if -GNED, -GN,
338 : * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
339 : * else J if in -GE-, -GI, -GY and not GG
340 : * else K
341 : */
342 : case 'G':
343 11 : if (Next_Letter == 'H') {
344 3 : if (!(NOGHTOF(Look_Back_Letter(3)) ||
345 : Look_Back_Letter(4) == 'H')) {
346 3 : Phonize('F');
347 3 : skip_letter++;
348 : } else {
349 : /* silent */
350 : }
351 8 : } else if (Next_Letter == 'N') {
352 0 : if (Isbreak(After_Next_Letter) ||
353 : (After_Next_Letter == 'E' &&
354 : Look_Ahead_Letter(3) == 'D')) {
355 : /* dropped */
356 : } else
357 0 : Phonize('K');
358 8 : } else if (MAKESOFT(Next_Letter) &&
359 : Prev_Letter != 'G') {
360 0 : Phonize('J');
361 : } else {
362 8 : Phonize('K');
363 : }
364 11 : break;
365 : /* H if before a vowel and not after C,G,P,S,T */
366 : case 'H':
367 6 : if (isvowel(Next_Letter) &&
368 : !AFFECTH(Prev_Letter))
369 4 : Phonize('H');
370 6 : break;
371 : /* dropped if after C
372 : * else K
373 : */
374 : case 'K':
375 4 : if (Prev_Letter != 'C')
376 3 : Phonize('K');
377 4 : break;
378 : /* F if before H
379 : * else P
380 : */
381 : case 'P':
382 7 : if (Next_Letter == 'H') {
383 2 : Phonize('F');
384 : } else {
385 5 : Phonize('P');
386 : }
387 7 : break;
388 : /* K
389 : */
390 : case 'Q':
391 0 : Phonize('K');
392 0 : break;
393 : /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
394 : * else S
395 : */
396 : case 'S':
397 23 : if (Next_Letter == 'I' &&
398 : (After_Next_Letter == 'O' ||
399 : After_Next_Letter == 'A')) {
400 0 : Phonize(SH);
401 23 : } else if (Next_Letter == 'H') {
402 2 : Phonize(SH);
403 2 : skip_letter++;
404 21 : } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
405 0 : Phonize(SH);
406 0 : skip_letter += 2;
407 : } else {
408 21 : Phonize('S');
409 : }
410 23 : break;
411 : /* 'sh' in -TIA- or -TIO-
412 : * else 'th' before H
413 : * else T
414 : */
415 : case 'T':
416 38 : if (Next_Letter == 'I' &&
417 : (After_Next_Letter == 'O' ||
418 : After_Next_Letter == 'A')) {
419 0 : Phonize(SH);
420 38 : } else if (Next_Letter == 'H') {
421 21 : Phonize(TH);
422 21 : skip_letter++;
423 17 : } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
424 16 : Phonize('T');
425 : }
426 38 : break;
427 : /* F */
428 : case 'V':
429 7 : Phonize('F');
430 7 : break;
431 : /* W before a vowel, else dropped */
432 : case 'W':
433 16 : if (isvowel(Next_Letter))
434 12 : Phonize('W');
435 16 : break;
436 : /* KS */
437 : case 'X':
438 7 : Phonize('K');
439 7 : Phonize('S');
440 7 : break;
441 : /* Y if followed by a vowel */
442 : case 'Y':
443 6 : if (isvowel(Next_Letter))
444 2 : Phonize('Y');
445 6 : break;
446 : /* S */
447 : case 'Z':
448 3 : Phonize('S');
449 3 : break;
450 : /* No transformation */
451 : case 'F':
452 : case 'J':
453 : case 'L':
454 : case 'M':
455 : case 'N':
456 : case 'R':
457 92 : Phonize(Curr_Letter);
458 : break;
459 : default:
460 : /* nothing */
461 : break;
462 : } /* END SWITCH */
463 :
464 404 : w_idx += skip_letter;
465 : } /* END FOR */
466 :
467 20 : End_Phoned_Word;
468 :
469 20 : return 0;
470 : } /* END metaphone */
471 : /* }}} */
472 :
473 : /*
474 : * Local variables:
475 : * tab-width: 4
476 : * c-basic-offset: 4
477 : * End:
478 : * vim600: sw=4 ts=4 fdm=marker
479 : * vim<600: sw=4 ts=4
480 : */
|