1 : /**********************************************************************
2 : gb18030.c - Oniguruma (regular expression library)
3 : **********************************************************************/
4 : /*-
5 : * Copyright (c) 2005 KUBO Takehiro <kubo AT jiubao DOT org>
6 : * K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
7 : * All rights reserved.
8 : *
9 : * Redistribution and use in source and binary forms, with or without
10 : * modification, are permitted provided that the following conditions
11 : * are met:
12 : * 1. Redistributions of source code must retain the above copyright
13 : * notice, this list of conditions and the following disclaimer.
14 : * 2. Redistributions in binary form must reproduce the above copyright
15 : * notice, this list of conditions and the following disclaimer in the
16 : * documentation and/or other materials provided with the distribution.
17 : *
18 : * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 : * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 : * SUCH DAMAGE.
29 : */
30 :
31 : #include "regenc.h"
32 :
/*
 * Trace hook for the GB18030 scanner.
 *
 * Call sites pass a complete, parenthesized printf argument list, e.g.
 *   DEBUG_GB18030(("state %d\n", state));
 *
 * Tracing is compiled out by default, in which case the argument is
 * discarded by the preprocessor and never evaluated.  Define
 * GB18030_DEBUG at build time to turn it on; <stdio.h> is included in
 * that configuration so that printf has a proper declaration.
 */
#ifdef GB18030_DEBUG
#include <stdio.h>
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
38 :
/*
 * Byte classes stored in GB18030_MAP.  Each class names the positions a
 * byte value may occupy inside a GB18030 sequence.
 */
enum {
  C1 = 0, /* can only stand alone as a one-byte char */
  C2 = 1, /* one-byte char, or second byte of a two-byte char */
  C4 = 2, /* one-byte char, or second/fourth byte of a four-byte char */
  CM = 3  /* first byte of a two- or four-byte char, or second byte of a
             two-byte char (it also shows up as the third byte in the
             "CM C4 CM C4" patterns scanned by
             gb18030_left_adjust_char_head) */
};
45 :
46 : static const char GB18030_MAP[] = {
47 : C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48 : C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49 : C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50 : C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51 : C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52 : C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53 : C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54 : C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55 : C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56 : CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57 : CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58 : CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59 : CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60 : CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61 : CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62 : CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63 : };
64 :
65 : static int
66 : gb18030_mbc_enc_len(const UChar* p)
67 0 : {
68 0 : if (GB18030_MAP[*p] != CM)
69 0 : return 1;
70 0 : p++;
71 0 : if (GB18030_MAP[*p] == C4)
72 0 : return 4;
73 0 : if (GB18030_MAP[*p] == C1)
74 0 : return 1; /* illegal sequence */
75 0 : return 2;
76 : }
77 :
78 : static OnigCodePoint
79 : gb18030_mbc_to_code(const UChar* p, const UChar* end)
80 0 : {
81 0 : return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
82 : }
83 :
84 : static int
85 : gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
86 0 : {
87 0 : return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
88 : }
89 :
90 : static int
91 : gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
92 : UChar* lower)
93 0 : {
94 0 : return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag,
95 : pp, end, lower);
96 : }
97 :
98 : static int
99 : gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
100 0 : {
101 0 : return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
102 : }
103 :
104 : static int
105 : gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
106 0 : {
107 0 : return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
108 : }
109 :
/*
 * States of the backward scanner in gb18030_left_adjust_char_head.
 *
 * A state name describes the byte classes seen so far, read left to
 * right in memory (the scanner itself moves right to left).  "CX" stands
 * for the single trailing byte of any class, "CMC4" for a "CM C4" pair
 * and "C4CM" for a "C4 CM" pair; odd/even counts how many of the named
 * unit have been seen.
 */
enum state {
  S_START,                /* nothing scanned yet */
  S_one_C2,               /* C2 */
  S_one_C4,               /* C4 */
  S_one_CM,               /* CM */

  S_odd_CM_one_CX,        /* CM C2, CM CM, CM CM CM C4 */
  S_even_CM_one_CX,       /* CM CM C2, CM CM CM, CM CM C4 */

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,             /* CM C4 */
  S_odd_CMC4,             /* CM C4 CM C4 CM C4 */
  S_one_C4_odd_CMC4,      /* C4 CM C4 */
  S_even_CMC4,            /* CM C4 CM C4 */
  S_one_C4_even_CMC4,     /* C4 CM C4 CM C4 */

  S_odd_CM_odd_CMC4,      /* CM CM C4 CM C4 CM C4 */
  S_even_CM_odd_CMC4,     /* CM CM CM C4 CM C4 CM C4 */

  S_odd_CM_even_CMC4,     /* CM CM C4 CM C4 */
  S_even_CM_even_CMC4,    /* CM CM CM C4 CM C4 */

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,             /* C4 CM, C4 CM C4 CM C4 CM */
  S_one_CM_odd_C4CM,      /* CM C4 CM, CM C4 CM C4 CM C4 CM */
  S_even_C4CM,            /* C4 CM C4 CM */
  S_one_CM_even_C4CM,     /* CM C4 CM C4 CM */

  S_even_CM_odd_C4CM,     /* CM CM C4 CM */
  S_odd_CM_odd_C4CM,      /* CM CM CM C4 CM */
  S_even_CM_even_C4CM,    /* CM CM C4 CM C4 CM */
  S_odd_CM_even_C4CM,     /* CM CM CM C4 CM C4 CM */
};
143 :
144 : static UChar*
145 : gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
146 0 : {
147 : const UChar *p;
148 0 : enum state state = S_START;
149 :
150 : DEBUG_GB18030(("----------------\n"));
151 0 : for (p = s; p >= start; p--) {
152 : DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
153 0 : switch (state) {
154 : case S_START:
155 0 : switch (GB18030_MAP[*p]) {
156 : case C1:
157 0 : return (UChar *)s;
158 : case C2:
159 0 : state = S_one_C2; /* C2 */
160 0 : break;
161 : case C4:
162 0 : state = S_one_C4; /* C4 */
163 0 : break;
164 : case CM:
165 0 : state = S_one_CM; /* CM */
166 : break;
167 : }
168 0 : break;
169 : case S_one_C2: /* C2 */
170 0 : switch (GB18030_MAP[*p]) {
171 : case C1:
172 : case C2:
173 : case C4:
174 0 : return (UChar *)s;
175 : case CM:
176 0 : state = S_odd_CM_one_CX; /* CM C2 */
177 : break;
178 : }
179 0 : break;
180 : case S_one_C4: /* C4 */
181 0 : switch (GB18030_MAP[*p]) {
182 : case C1:
183 : case C2:
184 : case C4:
185 0 : return (UChar *)s;
186 : case CM:
187 0 : state = S_one_CMC4;
188 : break;
189 : }
190 0 : break;
191 : case S_one_CM: /* CM */
192 0 : switch (GB18030_MAP[*p]) {
193 : case C1:
194 : case C2:
195 0 : return (UChar *)s;
196 : case C4:
197 0 : state = S_odd_C4CM;
198 0 : break;
199 : case CM:
200 0 : state = S_odd_CM_one_CX; /* CM CM */
201 : break;
202 : }
203 0 : break;
204 :
205 : case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
206 0 : switch (GB18030_MAP[*p]) {
207 : case C1:
208 : case C2:
209 : case C4:
210 0 : return (UChar *)(s - 1);
211 : case CM:
212 0 : state = S_even_CM_one_CX;
213 : break;
214 : }
215 0 : break;
216 : case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
217 0 : switch (GB18030_MAP[*p]) {
218 : case C1:
219 : case C2:
220 : case C4:
221 0 : return (UChar *)s;
222 : case CM:
223 0 : state = S_odd_CM_one_CX;
224 : break;
225 : }
226 0 : break;
227 :
228 : case S_one_CMC4: /* CM C4 */
229 0 : switch (GB18030_MAP[*p]) {
230 : case C1:
231 : case C2:
232 0 : return (UChar *)(s - 1);
233 : case C4:
234 0 : state = S_one_C4_odd_CMC4; /* C4 CM C4 */
235 0 : break;
236 : case CM:
237 0 : state = S_even_CM_one_CX; /* CM CM C4 */
238 : break;
239 : }
240 0 : break;
241 : case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
242 0 : switch (GB18030_MAP[*p]) {
243 : case C1:
244 : case C2:
245 0 : return (UChar *)(s - 1);
246 : case C4:
247 0 : state = S_one_C4_odd_CMC4;
248 0 : break;
249 : case CM:
250 0 : state = S_odd_CM_odd_CMC4;
251 : break;
252 : }
253 0 : break;
254 : case S_one_C4_odd_CMC4: /* C4 CM C4 */
255 0 : switch (GB18030_MAP[*p]) {
256 : case C1:
257 : case C2:
258 : case C4:
259 0 : return (UChar *)(s - 1);
260 : case CM:
261 0 : state = S_even_CMC4; /* CM C4 CM C4 */
262 : break;
263 : }
264 0 : break;
265 : case S_even_CMC4: /* CM C4 CM C4 */
266 0 : switch (GB18030_MAP[*p]) {
267 : case C1:
268 : case C2:
269 0 : return (UChar *)(s - 3);
270 : case C4:
271 0 : state = S_one_C4_even_CMC4;
272 0 : break;
273 : case CM:
274 0 : state = S_odd_CM_even_CMC4;
275 : break;
276 : }
277 0 : break;
278 : case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
279 0 : switch (GB18030_MAP[*p]) {
280 : case C1:
281 : case C2:
282 : case C4:
283 0 : return (UChar *)(s - 3);
284 : case CM:
285 0 : state = S_odd_CMC4;
286 : break;
287 : }
288 0 : break;
289 :
290 : case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
291 0 : switch (GB18030_MAP[*p]) {
292 : case C1:
293 : case C2:
294 : case C4:
295 0 : return (UChar *)(s - 3);
296 : case CM:
297 0 : state = S_even_CM_odd_CMC4;
298 : break;
299 : }
300 0 : break;
301 : case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
302 0 : switch (GB18030_MAP[*p]) {
303 : case C1:
304 : case C2:
305 : case C4:
306 0 : return (UChar *)(s - 1);
307 : case CM:
308 0 : state = S_odd_CM_odd_CMC4;
309 : break;
310 : }
311 0 : break;
312 :
313 : case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
314 0 : switch (GB18030_MAP[*p]) {
315 : case C1:
316 : case C2:
317 : case C4:
318 0 : return (UChar *)(s - 1);
319 : case CM:
320 0 : state = S_even_CM_even_CMC4;
321 : break;
322 : }
323 0 : break;
324 : case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
325 0 : switch (GB18030_MAP[*p]) {
326 : case C1:
327 : case C2:
328 : case C4:
329 0 : return (UChar *)(s - 3);
330 : case CM:
331 0 : state = S_odd_CM_even_CMC4;
332 : break;
333 : }
334 0 : break;
335 :
336 : case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
337 0 : switch (GB18030_MAP[*p]) {
338 : case C1:
339 : case C2:
340 : case C4:
341 0 : return (UChar *)s;
342 : case CM:
343 0 : state = S_one_CM_odd_C4CM; /* CM C4 CM */
344 : break;
345 : }
346 0 : break;
347 : case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
348 0 : switch (GB18030_MAP[*p]) {
349 : case C1:
350 : case C2:
351 0 : return (UChar *)(s - 2); /* |CM C4 CM */
352 : case C4:
353 0 : state = S_even_C4CM;
354 0 : break;
355 : case CM:
356 0 : state = S_even_CM_odd_C4CM;
357 : break;
358 : }
359 0 : break;
360 : case S_even_C4CM: /* C4 CM C4 CM */
361 0 : switch (GB18030_MAP[*p]) {
362 : case C1:
363 : case C2:
364 : case C4:
365 0 : return (UChar *)(s - 2); /* C4|CM C4 CM */
366 : case CM:
367 0 : state = S_one_CM_even_C4CM;
368 : break;
369 : }
370 0 : break;
371 : case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
372 0 : switch (GB18030_MAP[*p]) {
373 : case C1:
374 : case C2:
375 0 : return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
376 : case C4:
377 0 : state = S_odd_C4CM;
378 0 : break;
379 : case CM:
380 0 : state = S_even_CM_even_C4CM;
381 : break;
382 : }
383 0 : break;
384 :
385 : case S_even_CM_odd_C4CM: /* CM CM C4 CM */
386 0 : switch (GB18030_MAP[*p]) {
387 : case C1:
388 : case C2:
389 : case C4:
390 0 : return (UChar *)(s - 0); /* |CM CM|C4|CM */
391 : case CM:
392 0 : state = S_odd_CM_odd_C4CM;
393 : break;
394 : }
395 0 : break;
396 : case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
397 0 : switch (GB18030_MAP[*p]) {
398 : case C1:
399 : case C2:
400 : case C4:
401 0 : return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
402 : case CM:
403 0 : state = S_even_CM_odd_C4CM;
404 : break;
405 : }
406 0 : break;
407 :
408 : case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
409 0 : switch (GB18030_MAP[*p]) {
410 : case C1:
411 : case C2:
412 : case C4:
413 0 : return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
414 : case CM:
415 0 : state = S_odd_CM_even_C4CM;
416 : break;
417 : }
418 0 : break;
419 : case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
420 0 : switch (GB18030_MAP[*p]) {
421 : case C1:
422 : case C2:
423 : case C4:
424 0 : return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
425 : case CM:
426 0 : state = S_even_CM_even_C4CM;
427 : break;
428 : }
429 : break;
430 : }
431 : }
432 :
433 : DEBUG_GB18030(("state %d\n", state));
434 0 : switch (state) {
435 0 : case S_START: return (UChar *)(s - 0);
436 0 : case S_one_C2: return (UChar *)(s - 0);
437 0 : case S_one_C4: return (UChar *)(s - 0);
438 0 : case S_one_CM: return (UChar *)(s - 0);
439 :
440 0 : case S_odd_CM_one_CX: return (UChar *)(s - 1);
441 0 : case S_even_CM_one_CX: return (UChar *)(s - 0);
442 :
443 0 : case S_one_CMC4: return (UChar *)(s - 1);
444 0 : case S_odd_CMC4: return (UChar *)(s - 1);
445 0 : case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
446 0 : case S_even_CMC4: return (UChar *)(s - 3);
447 0 : case S_one_C4_even_CMC4: return (UChar *)(s - 3);
448 :
449 0 : case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
450 0 : case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
451 :
452 0 : case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
453 0 : case S_even_CM_even_CMC4: return (UChar *)(s - 3);
454 :
455 0 : case S_odd_C4CM: return (UChar *)(s - 0);
456 0 : case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
457 0 : case S_even_C4CM: return (UChar *)(s - 2);
458 0 : case S_one_CM_even_C4CM: return (UChar *)(s - 0);
459 :
460 0 : case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
461 0 : case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
462 0 : case S_even_CM_even_C4CM: return (UChar *)(s - 2);
463 0 : case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
464 : }
465 :
466 0 : return (UChar* )s; /* never come here. (escape warning) */
467 : }
468 :
469 : static int
470 : gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end)
471 0 : {
472 0 : return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
473 : }
474 :
475 : OnigEncodingType OnigEncodingGB18030 = {
476 : gb18030_mbc_enc_len,
477 : "GB18030", /* name */
478 : 4, /* max enc length */
479 : 1, /* min enc length */
480 : ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
481 : {
482 : (OnigCodePoint )'\\' /* esc */
483 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
484 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
485 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
486 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
487 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
488 : },
489 : onigenc_is_mbc_newline_0x0a,
490 : gb18030_mbc_to_code,
491 : onigenc_mb4_code_to_mbclen,
492 : gb18030_code_to_mbc,
493 : gb18030_mbc_to_normalize,
494 : gb18030_is_mbc_ambiguous,
495 : onigenc_ascii_get_all_pair_ambig_codes,
496 : onigenc_nothing_get_all_comp_ambig_codes,
497 : gb18030_is_code_ctype,
498 : onigenc_not_support_get_ctype_code_range,
499 : gb18030_left_adjust_char_head,
500 : gb18030_is_allowed_reverse_match
501 : };
|