1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 6 |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1997-2009 The PHP Group |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 3.01 of the PHP license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at the following url: |
10 : | http://www.php.net/license/3_01.txt |
11 : | If you did not receive a copy of the PHP license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@php.net so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Author: Thies C. Arntzen <thies@thieso.net> |
16 : +----------------------------------------------------------------------+
17 : */
18 :
19 : /* $Id: metaphone.c 282958 2009-06-28 18:41:20Z felipe $ */
20 :
21 : /*
22 : Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
23 : */
24 :
25 : #include "php.h"
26 : #include "php_metaphone.h"
27 :
28 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
29 :
30 : /* {{{ proto string metaphone(string text[, int phones]) U
31 : Break english phrases down into their phonemes */
32 : PHP_FUNCTION(metaphone)
33 25 : {
34 : char *str;
35 25 : char *result = 0;
36 : int str_len;
37 25 : long phones = 0;
38 :
39 25 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
40 : &phones) == FAILURE) {
41 1 : return;
42 : }
43 :
44 24 : if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {
45 22 : RETVAL_ASCII_STRING(result, ZSTR_AUTOFREE);
46 : } else {
47 2 : if (result) {
48 0 : efree(result);
49 : }
50 2 : RETURN_FALSE;
51 : }
52 : }
53 : /* }}} */
54 :
/*
	What follows is the original code by Michael G Schwern.
	I have changed it only slightly (use emalloc,
	get rid of includes, etc.)
	- thies - 13.09.1999
*/
61 :
62 : /*----------------------------- */
63 : /* this used to be "metaphone.h" */
64 : /*----------------------------- */
65 :
/* Special encodings: single output characters that stand in for the
 * two-letter sounds "sh" and "th" in the generated key. */
#define  SH 	'X'		/* emitted for the 'sh' sound */
#define  TH		'0'		/* emitted for the 'th' sound (the digit zero) */
69 :
70 : /*----------------------------- */
71 : /* end of "metaphone.h" */
72 : /*----------------------------- */
73 :
74 : /*----------------------------- */
75 : /* this used to be "metachar.h" */
76 : /*----------------------------- */
77 :
/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */

/* One bit mask per letter, indexed by (letter - 'A'):
 *    1  vowel
 *    2  passed through unchanged
 *    4  forms a diphthong when preceding H
 *    8  makes a preceding C or G soft
 *   16  prevents GH from becoming F
 */
char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
/*  a   b  c   d  e  f  g   h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z */
};

/* Look up the mask for c; anything that is not an ASCII letter yields 0.
 *
 * The range is tested explicitly instead of with isalpha()/toupper():
 * those are locale dependent, and a byte the locale considers alphabetic
 * but that lies outside A-Z/a-z would index past the end of _codes.
 * The argument is evaluated more than once, so it must have no side
 * effects. */
#define ENCODE(c) (((c) >= 'A' && (c) <= 'Z') ? _codes[(c) - 'A'] : \
				   (((c) >= 'a' && (c) <= 'z') ? _codes[(c) - 'a'] : 0))

#define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */
103 :
104 : /*----------------------------- */
105 : /* end of "metachar.h" */
106 : /*----------------------------- */
107 :
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* All of these expand in a scope that provides `word` (the input string)
 * and `w_idx` (the index of the letter being encoded). Each yields the
 * upper-cased letter, or '\0' where noted. */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; '\0' when that would be before the start of the word.
 * N is parenthesized so that expressions such as (a + b) index correctly. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter   (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
											     : '\0')
/* Look N letters ahead without running past the terminating NUL.
 * Lookahead() works on plain char, so the pointer is cast explicitly and
 * the result is converted to unsigned char before it reaches toupper(),
 * which must not be handed a negative value. */
#define Look_Ahead_Letter(n) (toupper((unsigned char) Lookahead((char *) (word+w_idx), (n))))
123 :
124 :
/* Safely peek ahead in a NUL-terminated string.
 *
 * Returns the character how_far positions past word, or the terminating
 * '\0' when the string ends first. A how_far of zero or less returns the
 * first character. */
static char Lookahead(char *word, int how_far) /* {{{ */
{
	int pos = 0;

	/* Step forward one character at a time, stopping at the terminator. */
	while (word[pos] != '\0' && pos < how_far) {
		pos++;
	}

	/* pos is now either how_far or the index of the terminator. */
	return word[pos];
}
/* }}} */
141 :
/* These macros expand in a scope that provides `phoned_word` (char **,
 * the output buffer), `p_idx` (next write position) and `max_buffer_len`
 * (the capacity tracked for the buffer). */

/* phonize one letter
 * We don't know the buffer's size in advance. One way to solve this is to
 * just re-allocate the buffer. We're growing it by 2 characters at a time
 * (this could be one though; or more too).
 * Wrapped in do { } while (0) so that `Phonize(c);` is a single statement
 * and is safe in an unbraced if/else. */
#define Phonize(c)  do { \
						if (p_idx >= max_buffer_len) { \
							*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
							max_buffer_len += 2; \
						} \
						(*phoned_word)[p_idx++] = (c); \
					} while (0)
/* Slap a null character on the end of the phoned word, growing the buffer
 * by one byte first when it is exactly full.
 * Kept as a plain brace block so that it works whether or not the call
 * site follows it with a semicolon. */
#define End_Phoned_Word	{ \
							if (p_idx == max_buffer_len) { \
								*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
							} \
							(*phoned_word)[p_idx] = '\0'; \
						}
/* How long is the phoned word? */
#define Phone_Len	(p_idx)

/* Note if a letter is a 'break' in the word */
#define Isbreak(c)  (!isalpha(c))
165 :
166 : static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional) /* {{{ */
167 24 : {
168 24 : int w_idx = 0; /* point in the phonization we're at. */
169 24 : int p_idx = 0; /* end of the phoned phrase */
170 24 : int max_buffer_len = 0; /* maximum length of the destination buffer */
171 :
172 : /*-- Parameter checks --*/
173 : /* Negative phoneme length is meaningless */
174 :
175 24 : if (max_phonemes < 0)
176 2 : return -1;
177 :
178 : /* Empty/null string is meaningless */
179 : /* Overly paranoid */
180 : /* assert(word != NULL && word[0] != '\0'); */
181 :
182 22 : if (word == NULL)
183 0 : return -1;
184 :
185 : /*-- Allocate memory for our phoned_phrase --*/
186 22 : if (max_phonemes == 0) { /* Assume largest possible */
187 21 : max_buffer_len = word_len;
188 21 : *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
189 : } else {
190 1 : max_buffer_len = max_phonemes;
191 1 : *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
192 : }
193 :
194 :
195 : /*-- The first phoneme has to be processed specially. --*/
196 : /* Find our first letter */
197 24 : for (; !isalpha(Curr_Letter); w_idx++) {
198 : /* On the off chance we were given nothing but crap... */
199 4 : if (Curr_Letter == '\0') {
200 2 : End_Phoned_Word
201 2 : return SUCCESS; /* For testing */
202 : }
203 : }
204 :
205 20 : switch (Curr_Letter) {
206 : /* AE becomes E */
207 : case 'A':
208 2 : if (Next_Letter == 'E') {
209 1 : Phonize('E');
210 1 : w_idx += 2;
211 : }
212 : /* Remember, preserve vowels at the beginning */
213 : else {
214 1 : Phonize('A');
215 1 : w_idx++;
216 : }
217 2 : break;
218 : /* [GKP]N becomes N */
219 : case 'G':
220 : case 'K':
221 : case 'P':
222 3 : if (Next_Letter == 'N') {
223 3 : Phonize('N');
224 3 : w_idx += 2;
225 : }
226 3 : break;
227 : /* WH becomes W,
228 : WR becomes R
229 : W if followed by a vowel */
230 : case 'W':
231 3 : if (Next_Letter == 'R') {
232 1 : Phonize(Next_Letter);
233 1 : w_idx += 2;
234 2 : } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
235 2 : Phonize('W');
236 2 : w_idx += 2;
237 : }
238 : /* else ignore */
239 3 : break;
240 : /* X becomes S */
241 : case 'X':
242 1 : Phonize('S');
243 1 : w_idx++;
244 1 : break;
245 : /* Vowels are kept */
246 : /* We did A already
247 : case 'A':
248 : case 'a':
249 : */
250 : case 'E':
251 : case 'I':
252 : case 'O':
253 : case 'U':
254 0 : Phonize(Curr_Letter);
255 0 : w_idx++;
256 : break;
257 : default:
258 : /* do nothing */
259 : break;
260 : }
261 :
262 :
263 :
264 : /* On to the metaphoning */
265 554 : for (; Curr_Letter != '\0' &&
266 : (max_phonemes == 0 || Phone_Len < max_phonemes);
267 514 : w_idx++) {
268 : /* How many letters to skip because an eariler encoding handled
269 : * multiple letters */
270 514 : unsigned short int skip_letter = 0;
271 :
272 :
273 : /* THOUGHT: It would be nice if, rather than having things like...
274 : * well, SCI. For SCI you encode the S, then have to remember
275 : * to skip the C. So the phonome SCI invades both S and C. It would
276 : * be better, IMHO, to skip the C from the S part of the encoding.
277 : * Hell, I'm trying it.
278 : */
279 :
280 : /* Ignore non-alphas */
281 514 : if (!isalpha(Curr_Letter))
282 98 : continue;
283 :
284 : /* Drop duplicates, except CC */
285 416 : if (Curr_Letter == Prev_Letter &&
286 : Curr_Letter != 'C')
287 12 : continue;
288 :
289 404 : switch (Curr_Letter) {
290 : /* B -> B unless in MB */
291 : case 'B':
292 6 : if (Prev_Letter != 'M')
293 6 : Phonize('B');
294 6 : break;
295 : /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
296 : * (SCHW is handled in S)
297 : * S if -CI-, -CE- or -CY-
298 : * dropped if -SCI-, SCE-, -SCY- (handed in S)
299 : * else K
300 : */
301 : case 'C':
302 15 : if (MAKESOFT(Next_Letter)) { /* C[IEY] */
303 2 : if (After_Next_Letter == 'A' &&
304 : Next_Letter == 'I') { /* CIA */
305 0 : Phonize(SH);
306 : }
307 : /* SC[IEY] */
308 2 : else if (Prev_Letter == 'S') {
309 : /* Dropped */
310 : } else {
311 2 : Phonize('S');
312 : }
313 11 : } else if (Next_Letter == 'H') {
314 1 : if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */
315 0 : Phonize('K');
316 : } else {
317 1 : Phonize(SH);
318 : }
319 1 : skip_letter++;
320 : } else {
321 10 : Phonize('K');
322 : }
323 13 : break;
324 : /* J if in -DGE-, -DGI- or -DGY-
325 : * else T
326 : */
327 : case 'D':
328 24 : if (Next_Letter == 'G' &&
329 : MAKESOFT(After_Next_Letter)) {
330 0 : Phonize('J');
331 0 : skip_letter++;
332 : } else
333 24 : Phonize('T');
334 24 : break;
335 : /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
336 : * else dropped if -GNED, -GN,
337 : * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
338 : * else J if in -GE-, -GI, -GY and not GG
339 : * else K
340 : */
341 : case 'G':
342 11 : if (Next_Letter == 'H') {
343 3 : if (!(NOGHTOF(Look_Back_Letter(3)) ||
344 : Look_Back_Letter(4) == 'H')) {
345 3 : Phonize('F');
346 3 : skip_letter++;
347 : } else {
348 : /* silent */
349 : }
350 8 : } else if (Next_Letter == 'N') {
351 0 : if (Isbreak(After_Next_Letter) ||
352 : (After_Next_Letter == 'E' &&
353 : Look_Ahead_Letter(3) == 'D')) {
354 : /* dropped */
355 : } else
356 0 : Phonize('K');
357 8 : } else if (MAKESOFT(Next_Letter) &&
358 : Prev_Letter != 'G') {
359 0 : Phonize('J');
360 : } else {
361 8 : Phonize('K');
362 : }
363 11 : break;
364 : /* H if before a vowel and not after C,G,P,S,T */
365 : case 'H':
366 6 : if (isvowel(Next_Letter) &&
367 : !AFFECTH(Prev_Letter))
368 4 : Phonize('H');
369 6 : break;
370 : /* dropped if after C
371 : * else K
372 : */
373 : case 'K':
374 4 : if (Prev_Letter != 'C')
375 3 : Phonize('K');
376 4 : break;
377 : /* F if before H
378 : * else P
379 : */
380 : case 'P':
381 7 : if (Next_Letter == 'H') {
382 2 : Phonize('F');
383 : } else {
384 5 : Phonize('P');
385 : }
386 7 : break;
387 : /* K
388 : */
389 : case 'Q':
390 0 : Phonize('K');
391 0 : break;
392 : /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
393 : * else S
394 : */
395 : case 'S':
396 23 : if (Next_Letter == 'I' &&
397 : (After_Next_Letter == 'O' ||
398 : After_Next_Letter == 'A')) {
399 0 : Phonize(SH);
400 23 : } else if (Next_Letter == 'H') {
401 2 : Phonize(SH);
402 2 : skip_letter++;
403 21 : } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
404 0 : Phonize(SH);
405 0 : skip_letter += 2;
406 : } else {
407 21 : Phonize('S');
408 : }
409 23 : break;
410 : /* 'sh' in -TIA- or -TIO-
411 : * else 'th' before H
412 : * else T
413 : */
414 : case 'T':
415 38 : if (Next_Letter == 'I' &&
416 : (After_Next_Letter == 'O' ||
417 : After_Next_Letter == 'A')) {
418 0 : Phonize(SH);
419 38 : } else if (Next_Letter == 'H') {
420 21 : Phonize(TH);
421 21 : skip_letter++;
422 17 : } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
423 16 : Phonize('T');
424 : }
425 38 : break;
426 : /* F */
427 : case 'V':
428 7 : Phonize('F');
429 7 : break;
430 : /* W before a vowel, else dropped */
431 : case 'W':
432 16 : if (isvowel(Next_Letter))
433 12 : Phonize('W');
434 16 : break;
435 : /* KS */
436 : case 'X':
437 7 : Phonize('K');
438 7 : Phonize('S');
439 7 : break;
440 : /* Y if followed by a vowel */
441 : case 'Y':
442 6 : if (isvowel(Next_Letter))
443 2 : Phonize('Y');
444 6 : break;
445 : /* S */
446 : case 'Z':
447 3 : Phonize('S');
448 3 : break;
449 : /* No transformation */
450 : case 'F':
451 : case 'J':
452 : case 'L':
453 : case 'M':
454 : case 'N':
455 : case 'R':
456 92 : Phonize(Curr_Letter);
457 : break;
458 : default:
459 : /* nothing */
460 : break;
461 : } /* END SWITCH */
462 :
463 404 : w_idx += skip_letter;
464 : } /* END FOR */
465 :
466 20 : End_Phoned_Word;
467 :
468 20 : return 0;
469 : } /* END metaphone */
470 : /* }}} */
471 :
472 : /*
473 : * Local variables:
474 : * tab-width: 4
475 : * c-basic-offset: 4
476 : * End:
477 : * vim600: sw=4 ts=4 fdm=marker
478 : * vim<600: sw=4 ts=4
479 : */
|