1 : /*************************************************
2 : * Perl-Compatible Regular Expressions *
3 : *************************************************/
4 :
5 : /* PCRE is a library of functions to support regular expressions whose syntax
6 : and semantics are as close as possible to those of the Perl 5 language.
7 :
8 : Written by Philip Hazel
9 : Copyright (c) 1997-2009 University of Cambridge
10 :
11 : -----------------------------------------------------------------------------
12 : Redistribution and use in source and binary forms, with or without
13 : modification, are permitted provided that the following conditions are met:
14 :
15 : * Redistributions of source code must retain the above copyright notice,
16 : this list of conditions and the following disclaimer.
17 :
18 : * Redistributions in binary form must reproduce the above copyright
19 : notice, this list of conditions and the following disclaimer in the
20 : documentation and/or other materials provided with the distribution.
21 :
22 : * Neither the name of the University of Cambridge nor the names of its
23 : contributors may be used to endorse or promote products derived from
24 : this software without specific prior written permission.
25 :
26 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 : AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 : IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 : ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 : LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 : CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 : SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 : INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 : CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 : ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 : POSSIBILITY OF SUCH DAMAGE.
37 : -----------------------------------------------------------------------------
38 : */
39 :
40 :
41 : /* This module contains an internal function for validating UTF-8 character
42 : strings. */
43 :
44 :
45 : #include "config.h"
46 :
47 : #include "pcre_internal.h"
48 :
49 :
50 : /*************************************************
51 : * Validate a UTF-8 string *
52 : *************************************************/
53 :
54 : /* This function is called (optionally) at the start of compile or match, to
55 : validate that a supposed UTF-8 string is actually valid. The early check means
56 : that subsequent code can assume it is dealing with a valid string. The check
57 : can be turned off for maximum performance, but the consequences of supplying
58 : an invalid string are then undefined.
59 :
60 : Originally, this function checked according to RFC 2279, allowing for values in
61 : the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
62 : the canonical format. Once somebody had pointed out RFC 3629 to me (it
63 : obsoletes 2279), additional restrictions were applied. The values are now
64 : limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
65 : subrange 0xd800 to 0xdfff (the UTF-16 surrogates) is excluded.
66 :
67 : Arguments:
68 : string points to the string
69 : length length of string, or -1 if the string is zero-terminated
70 :
71 : Returns: < 0 if the string is a valid UTF-8 string
72 : >= 0 otherwise; the value is the offset of the bad byte
73 : */
74 :
75 : int
76 : _pcre_valid_utf8(USPTR string, int length)
77 29 : {
78 : #ifdef SUPPORT_UTF8
79 : register USPTR p;
80 :
81 29 : if (length < 0)
82 : {
83 14 : for (p = string; *p != 0; p++);
84 14 : length = p - string;
85 : }
86 :
87 137 : for (p = string; length-- > 0; p++)
88 : {
89 : register int ab;
90 109 : register int c = *p;
91 109 : if (c < 128) continue;
92 7 : if (c < 0xc0) return p - string;
93 7 : ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
94 7 : if (length < ab || ab > 3) return p - string;
95 6 : length -= ab;
96 :
97 : /* Check top bits in the second byte */
98 6 : if ((*(++p) & 0xc0) != 0x80) return p - string;
99 :
100 : /* Check for overlong sequences for each different length, and for the
101 : excluded range 0xd000 to 0xdfff. */
102 :
103 6 : switch (ab)
104 : {
105 : /* Check for xx00 000x (overlong sequence) */
106 :
107 : case 1:
108 2 : if ((c & 0x3e) == 0) return p - string;
109 2 : continue; /* We know there aren't any more bytes to check */
110 :
111 : /* Check for 1110 0000, xx0x xxxx (overlong sequence) or
112 : 1110 1101, 1010 xxxx (0xd000 - 0xdfff) */
113 :
114 : case 2:
115 4 : if ((c == 0xe0 && (*p & 0x20) == 0) ||
116 : (c == 0xed && *p >= 0xa0))
117 0 : return p - string;
118 4 : break;
119 :
120 : /* Check for 1111 0000, xx00 xxxx (overlong sequence) or
121 : greater than 0x0010ffff (f4 8f bf bf) */
122 :
123 : case 3:
124 0 : if ((c == 0xf0 && (*p & 0x30) == 0) ||
125 : (c > 0xf4 ) ||
126 : (c == 0xf4 && *p > 0x8f))
127 0 : return p - string;
128 : break;
129 :
130 : #if 0
131 : /* These cases can no longer occur, as we restrict to a maximum of four
132 : bytes nowadays. Leave the code here in case we ever want to add an option
133 : for longer sequences. */
134 :
135 : /* Check for 1111 1000, xx00 0xxx */
136 : case 4:
137 : if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
138 : break;
139 :
140 : /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
141 : case 5:
142 : if (c == 0xfe || c == 0xff ||
143 : (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
144 : break;
145 : #endif
146 :
147 : }
148 :
149 : /* Check for valid bytes after the 2nd, if any; all must start 10 */
150 12 : while (--ab > 0)
151 : {
152 4 : if ((*(++p) & 0xc0) != 0x80) return p - string;
153 : }
154 : }
155 : #else
156 : (void)(string); /* Keep picky compilers happy */
157 : (void)(length);
158 : #endif
159 :
160 28 : return -1;
161 : }
162 :
163 : /* End of pcre_valid_utf8.c */
|