1 : /* utf8_decode.c */
2 :
3 : /* 2005-12-25 */
4 :
5 : /*
6 : Copyright (c) 2005 JSON.org
7 :
8 : Permission is hereby granted, free of charge, to any person obtaining a copy
9 : of this software and associated documentation files (the "Software"), to deal
10 : in the Software without restriction, including without limitation the rights
11 : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 : copies of the Software, and to permit persons to whom the Software is
13 : furnished to do so, subject to the following conditions:
14 :
15 : The above copyright notice and this permission notice shall be included in all
16 : copies or substantial portions of the Software.
17 :
18 : The Software shall be used for Good, not Evil.
19 :
20 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 : SOFTWARE.
27 : */
28 :
29 : #include "utf8_decode.h"
30 :
31 : /*
32 : Very Strict UTF-8 Decoder
33 :
34 : UTF-8 is a multibyte character encoding of Unicode. A character can be
35 : represented by 1-4 bytes. The bit pattern of the first byte indicates the
36 : number of continuation bytes.
37 :
38 : Most UTF-8 decoders tend to be lenient, attempting to recover as much
39 : information as possible, even from badly encoded input. This UTF-8
40 : decoder is not lenient. It will reject input which does not include
41 : proper continuation bytes. It will reject aliases (or suboptimal
42 : codings). It will reject surrogates. (Surrogate encoding should only be
43 : used with UTF-16.)
44 :
45 : Code Continuation Minimum Maximum
46 : 0xxxxxxx 0 0 127
47 : 10xxxxxx error
48 : 110xxxxx 1 128 2047
49 : 1110xxxx 2 2048 65535 excluding 55296 - 57343
50 : 11110xxx 3 65536 1114111
51 : 11111xxx error
52 : */
53 :
54 :
55 : /*
56 : Get the next byte. It returns UTF8_END if there are no more bytes.
57 : */
58 : static int get(json_utf8_decode *utf8) /* {{{ */
59 4800 : {
60 : int c;
61 4800 : if (utf8->the_index >= utf8->the_length) {
62 6 : return UTF8_END;
63 : }
64 4794 : c = utf8->the_input[utf8->the_index] & 0xFF;
65 4794 : utf8->the_index += 1;
66 4794 : return c;
67 : }
68 : /* }}} */
69 :
70 : /*
71 : Get the 6-bit payload of the next continuation byte.
72 : Return UTF8_ERROR if it is not a contination byte.
73 : */
74 : static int cont(json_utf8_decode *utf8) /* {{{ */
75 40 : {
76 40 : int c = get(utf8);
77 40 : return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
78 : }
79 : /* }}} */
80 :
81 : /*
82 : Initialize the UTF-8 decoder. The decoder is not reentrant,
83 : */
84 : void utf8_decode_init(json_utf8_decode *utf8, char p[], int length) /* {{{ */
85 19 : {
86 19 : utf8->the_index = 0;
87 19 : utf8->the_input = p;
88 19 : utf8->the_length = length;
89 19 : utf8->the_char = 0;
90 19 : utf8->the_byte = 0;
91 19 : }
92 : /* }}} */
93 :
94 : /*
95 : Get the current byte offset. This is generally used in error reporting.
96 : */
97 : int utf8_decode_at_byte(json_utf8_decode *utf8) /* {{{ */
98 0 : {
99 0 : return utf8->the_byte;
100 : }
101 : /* }}} */
102 :
103 : /*
104 : Get the current character offset. This is generally used in error reporting.
105 : The character offset matches the byte offset if the text is strictly ASCII.
106 : */
107 : int utf8_decode_at_character(json_utf8_decode *utf8) /* {{{ */
108 0 : {
109 0 : return utf8->the_char > 0 ? utf8->the_char - 1 : 0;
110 : }
111 : /* }}} */
112 :
113 : /*
114 : Extract the next character.
115 : Returns: the character (between 0 and 1114111)
116 : or UTF8_END (the end)
117 : or UTF8_ERROR (error)
118 : */
119 : int utf8_decode_next(json_utf8_decode *utf8) /* {{{ */
120 4775 : {
121 : int c; /* the first byte of the character */
122 : int r; /* the result */
123 :
124 4775 : if (utf8->the_index >= utf8->the_length) {
125 15 : return utf8->the_index == utf8->the_length ? UTF8_END : UTF8_ERROR;
126 : }
127 4760 : utf8->the_byte = utf8->the_index;
128 4760 : utf8->the_char += 1;
129 4760 : c = get(utf8);
130 : /*
131 : Zero continuation (0 to 127)
132 : */
133 4760 : if ((c & 0x80) == 0) {
134 4740 : return c;
135 : }
136 : /*
137 : One contination (128 to 2047)
138 : */
139 20 : if ((c & 0xE0) == 0xC0) {
140 0 : int c1 = cont(utf8);
141 0 : if (c1 < 0) {
142 0 : return UTF8_ERROR;
143 : }
144 0 : r = ((c & 0x1F) << 6) | c1;
145 0 : return r >= 128 ? r : UTF8_ERROR;
146 : }
147 : /*
148 : Two continuation (2048 to 55295 and 57344 to 65535)
149 : */
150 20 : if ((c & 0xF0) == 0xE0) {
151 20 : int c1 = cont(utf8);
152 20 : int c2 = cont(utf8);
153 20 : if (c1 < 0 || c2 < 0) {
154 4 : return UTF8_ERROR;
155 : }
156 16 : r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
157 16 : return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
158 : }
159 : /*
160 : Three continuation (65536 to 1114111)
161 : */
162 0 : if ((c & 0xF8) == 0xF0) {
163 0 : int c1 = cont(utf8);
164 0 : int c2 = cont(utf8);
165 0 : int c3 = cont(utf8);
166 0 : if (c1 < 0 || c2 < 0 || c3 < 0) {
167 0 : return UTF8_ERROR;
168 : }
169 0 : r = ((c & 0x0F) << 18) | (c1 << 12) | (c2 << 6) | c3;
170 0 : return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
171 : }
172 0 : return UTF8_ERROR;
173 : }
174 : /* }}} */
175 :
176 : /*
177 : * Local variables:
178 : * tab-width: 4
179 : * c-basic-offset: 4
180 : * End:
181 : * vim600: noet sw=4 ts=4
182 : * vim<600: noet sw=4 ts=4
183 : */
|