1 : /**********************************************************************
2 : big5.c - Oniguruma (regular expression library)
3 : **********************************************************************/
4 : /*-
5 : * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 : * All rights reserved.
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 : * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 : * SUCH DAMAGE.
28 : */
29 :
30 : #include "regenc.h"
31 :
/*
 * Per-byte length table used by big5_mbc_enc_len() and BIG5_ISMB_FIRST().
 * The index is a byte value; the entry is 2 for bytes 0xa1-0xfe and 1 for
 * every other byte.
 *
 * The size is spelled out (256, one entry per possible byte value) to match
 * BIG5_CAN_BE_TRAIL_TABLE and so that surplus initializers are rejected at
 * compile time.
 */
static const int EncLen_BIG5[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
50 :
51 : static int
52 : big5_mbc_enc_len(const UChar* p)
53 0 : {
54 0 : return EncLen_BIG5[*p];
55 : }
56 :
57 : static OnigCodePoint
58 : big5_mbc_to_code(const UChar* p, const UChar* end)
59 0 : {
60 0 : return onigenc_mbn_mbc_to_code(ONIG_ENCODING_BIG5, p, end);
61 : }
62 :
63 : static int
64 : big5_code_to_mbc(OnigCodePoint code, UChar *buf)
65 0 : {
66 0 : return onigenc_mb2_code_to_mbc(ONIG_ENCODING_BIG5, code, buf);
67 : }
68 :
69 : static int
70 : big5_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
71 : UChar* lower)
72 0 : {
73 0 : return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_BIG5, flag,
74 : pp, end, lower);
75 : }
76 :
77 : static int
78 : big5_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
79 0 : {
80 0 : return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_BIG5, flag, pp, end);
81 : }
82 :
83 : static int
84 : big5_is_code_ctype(OnigCodePoint code, unsigned int ctype)
85 0 : {
86 0 : return onigenc_mb2_is_code_ctype(ONIG_ENCODING_BIG5, code, ctype);
87 : }
88 :
/*
 * Per-byte flag table read through BIG5_ISMB_TRAIL().
 * The index is a byte value; the entry is 1 for bytes 0x40-0x7e and
 * 0x80-0xfe, and 0 for every other byte.
 */
static const char BIG5_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
107 :
/* True when the EncLen_BIG5 entry for `byte` is greater than 1, i.e. the
   byte begins a multibyte character.  The parameter is parenthesized so any
   expression may be passed safely. */
#define BIG5_ISMB_FIRST(byte)  (EncLen_BIG5[(byte)] > 1)
/* Value of the BIG5_CAN_BE_TRAIL_TABLE entry for `byte` (nonzero when the
   byte is flagged there).  The whole expansion is parenthesized so it
   composes safely inside larger expressions. */
#define BIG5_ISMB_TRAIL(byte)  (BIG5_CAN_BE_TRAIL_TABLE[(byte)])
110 :
111 : static UChar*
112 : big5_left_adjust_char_head(const UChar* start, const UChar* s)
113 0 : {
114 : const UChar *p;
115 : int len;
116 :
117 0 : if (s <= start) return (UChar* )s;
118 0 : p = s;
119 :
120 0 : if (BIG5_ISMB_TRAIL(*p)) {
121 0 : while (p > start) {
122 0 : if (! BIG5_ISMB_FIRST(*--p)) {
123 0 : p++;
124 0 : break;
125 : }
126 : }
127 : }
128 0 : len = enc_len(ONIG_ENCODING_BIG5, p);
129 0 : if (p + len > s) return (UChar* )p;
130 0 : p += len;
131 0 : return (UChar* )(p + ((s - p) & ~1));
132 : }
133 :
134 : static int
135 : big5_is_allowed_reverse_match(const UChar* s, const UChar* end)
136 0 : {
137 0 : const UChar c = *s;
138 :
139 0 : return (BIG5_ISMB_TRAIL(c) ? FALSE : TRUE);
140 : }
141 :
142 : OnigEncodingType OnigEncodingBIG5 = {
143 : big5_mbc_enc_len,
144 : "Big5", /* name */
145 : 2, /* max enc length */
146 : 1, /* min enc length */
147 : ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
148 : {
149 : (OnigCodePoint )'\\' /* esc */
150 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
151 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
152 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
153 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
154 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
155 : },
156 : onigenc_is_mbc_newline_0x0a,
157 : big5_mbc_to_code,
158 : onigenc_mb2_code_to_mbclen,
159 : big5_code_to_mbc,
160 : big5_mbc_to_normalize,
161 : big5_is_mbc_ambiguous,
162 : onigenc_ascii_get_all_pair_ambig_codes,
163 : onigenc_nothing_get_all_comp_ambig_codes,
164 : big5_is_code_ctype,
165 : onigenc_not_support_get_ctype_code_range,
166 : big5_left_adjust_char_head,
167 : big5_is_allowed_reverse_match
168 : };
|