1 : /**********************************************************************
2 : euc_jp.c - Oniguruma (regular expression library)
3 : **********************************************************************/
4 : /*-
5 : * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 : * All rights reserved.
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 : * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 : * SUCH DAMAGE.
28 : */
29 :
30 : #include "regenc.h"
31 :
/*
 * Nonzero when byte `c` lies OUTSIDE the range 0xa1..0xfe.
 *
 * Despite the name, this does not test for "first byte of a multibyte
 * character": it is true for every byte that is not in 0xa1..0xfe.
 * The difference is narrowed to UChar so that a single comparison covers
 * both ends of the range (assumes UChar is an unsigned 8-bit type, so
 * values below 0xa1 wrap to large numbers -- TODO confirm in regenc.h).
 * 0x5d is 0xfe - 0xa1.  The argument is evaluated exactly once.
 */
#define eucjp_islead(c)    ((UChar )((c) - 0xa1) > 0x5d)
33 :
/*
 * Byte length of a character, indexed by the value of its first byte.
 * Entries are 1 everywhere except: 0x8e -> 2, 0x8f -> 3, 0xa1..0xfe -> 2.
 */
static const int EncLen_EUCJP[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
52 :
53 : static int
54 : eucjp_mbc_enc_len(const UChar* p)
55 738 : {
56 738 : return EncLen_EUCJP[*p];
57 : }
58 :
59 : static OnigCodePoint
60 : eucjp_mbc_to_code(const UChar* p, const UChar* end)
61 283 : {
62 : int c, i, len;
63 : OnigCodePoint n;
64 :
65 283 : len = enc_len(ONIG_ENCODING_EUC_JP, p);
66 283 : n = (OnigCodePoint )*p++;
67 283 : if (len == 1) return n;
68 :
69 98 : for (i = 1; i < len; i++) {
70 49 : if (p >= end) break;
71 49 : c = *p++;
72 49 : n <<= 8; n += c;
73 : }
74 49 : return n;
75 : }
76 :
77 : static int
78 : eucjp_code_to_mbclen(OnigCodePoint code)
79 23 : {
80 23 : if (ONIGENC_IS_CODE_ASCII(code)) return 1;
81 8 : else if ((code & 0xff0000) != 0) return 3;
82 8 : else if ((code & 0xff00) != 0) return 2;
83 0 : else return 0;
84 : }
85 :
#if 0
/*
 * Disabled (compiled out).
 * Return the most significant non-zero byte of `code` among bits 8..23:
 * bits 16..23 if any is set, else bits 8..15 if any is set, else `code`
 * itself converted to int.
 */
static int
eucjp_code_to_mbc_first(OnigCodePoint code)
{
  if ((code & 0xff0000) != 0)
    return (int )((code >> 16) & 0xff);

  if ((code & 0xff00) != 0)
    return (int )((code >> 8) & 0xff);

  return (int )code;
}
#endif
104 :
105 : static int
106 : eucjp_code_to_mbc(OnigCodePoint code, UChar *buf)
107 0 : {
108 0 : UChar *p = buf;
109 :
110 0 : if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
111 0 : if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
112 0 : *p++ = (UChar )(code & 0xff);
113 :
114 : #if 1
115 0 : if (enc_len(ONIG_ENCODING_EUC_JP, buf) != (p - buf))
116 0 : return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
117 : #endif
118 0 : return p - buf;
119 : }
120 :
121 : static int
122 : eucjp_mbc_to_normalize(OnigAmbigType flag,
123 : const UChar** pp, const UChar* end, UChar* lower)
124 0 : {
125 : int len;
126 0 : const UChar* p = *pp;
127 :
128 0 : if (ONIGENC_IS_MBC_ASCII(p)) {
129 0 : if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
130 0 : *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
131 : }
132 : else {
133 0 : *lower = *p;
134 : }
135 :
136 0 : (*pp)++;
137 0 : return 1;
138 : }
139 : else {
140 0 : len = enc_len(ONIG_ENCODING_EUC_JP, p);
141 0 : if (lower != p) {
142 : int i;
143 0 : for (i = 0; i < len; i++) {
144 0 : *lower++ = *p++;
145 : }
146 : }
147 0 : (*pp) += len;
148 0 : return len; /* return byte length of converted char to lower */
149 : }
150 : }
151 :
152 : static int
153 : eucjp_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
154 0 : {
155 0 : return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_JP, flag, pp, end);
156 : }
157 :
158 : static int
159 : eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype)
160 768 : {
161 768 : if (code < 128)
162 384 : return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
163 : else {
164 384 : if ((ctype & (ONIGENC_CTYPE_WORD |
165 : ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
166 0 : return (eucjp_code_to_mbclen(code) > 1 ? TRUE : FALSE);
167 : }
168 : }
169 :
170 384 : return FALSE;
171 : }
172 :
173 : static UChar*
174 : eucjp_left_adjust_char_head(const UChar* start, const UChar* s)
175 58 : {
176 : /* In this encoding
177 : mb-trail bytes doesn't mix with single bytes.
178 : */
179 : const UChar *p;
180 : int len;
181 :
182 58 : if (s <= start) return (UChar* )s;
183 44 : p = s;
184 :
185 44 : while (!eucjp_islead(*p) && p > start) p--;
186 44 : len = enc_len(ONIG_ENCODING_EUC_JP, p);
187 44 : if (p + len > s) return (UChar* )p;
188 7 : p += len;
189 7 : return (UChar* )(p + ((s - p) & ~1));
190 : }
191 :
192 : static int
193 : eucjp_is_allowed_reverse_match(const UChar* s, const UChar* end)
194 4 : {
195 4 : const UChar c = *s;
196 4 : if (c <= 0x7e || c == 0x8e || c == 0x8f)
197 3 : return TRUE;
198 : else
199 1 : return FALSE;
200 : }
201 :
202 : OnigEncodingType OnigEncodingEUC_JP = {
203 : eucjp_mbc_enc_len,
204 : "EUC-JP", /* name */
205 : 3, /* max enc length */
206 : 1, /* min enc length */
207 : ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
208 : {
209 : (OnigCodePoint )'\\' /* esc */
210 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
211 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
212 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
213 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
214 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
215 : },
216 : onigenc_is_mbc_newline_0x0a,
217 : eucjp_mbc_to_code,
218 : eucjp_code_to_mbclen,
219 : eucjp_code_to_mbc,
220 : eucjp_mbc_to_normalize,
221 : eucjp_is_mbc_ambiguous,
222 : onigenc_ascii_get_all_pair_ambig_codes,
223 : onigenc_nothing_get_all_comp_ambig_codes,
224 : eucjp_is_code_ctype,
225 : onigenc_not_support_get_ctype_code_range,
226 : eucjp_left_adjust_char_head,
227 : eucjp_is_allowed_reverse_match
228 : };
|