1 : /*************************************************
2 : * Perl-Compatible Regular Expressions *
3 : *************************************************/
4 :
5 : /* PCRE is a library of functions to support regular expressions whose syntax
6 : and semantics are as close as possible to those of the Perl 5 language.
7 :
8 : Written by Philip Hazel
9 : Copyright (c) 1997-2009 University of Cambridge
10 :
11 : -----------------------------------------------------------------------------
12 : Redistribution and use in source and binary forms, with or without
13 : modification, are permitted provided that the following conditions are met:
14 :
15 : * Redistributions of source code must retain the above copyright notice,
16 : this list of conditions and the following disclaimer.
17 :
18 : * Redistributions in binary form must reproduce the above copyright
19 : notice, this list of conditions and the following disclaimer in the
20 : documentation and/or other materials provided with the distribution.
21 :
22 : * Neither the name of the University of Cambridge nor the names of its
23 : contributors may be used to endorse or promote products derived from
24 : this software without specific prior written permission.
25 :
26 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 : AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 : IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 : ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 : LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 : CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 : SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 : INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 : CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 : ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 : POSSIBILITY OF SUCH DAMAGE.
37 : -----------------------------------------------------------------------------
38 : */
39 :
40 :
41 : /* This module contains the external function pcre_compile(), along with
42 : supporting internal functions that are not used by other modules. */
43 :
44 :
45 : #include "config.h"
46 :
47 : #define NLBLOCK cd /* Block containing newline information */
48 : #define PSSTART start_pattern /* Field containing processed string start */
49 : #define PSEND end_pattern /* Field containing processed string end */
50 :
51 : #include "pcre_internal.h"
52 :
53 :
54 : /* When DEBUG is defined, we need the pcre_printint() function, which is also
55 : used by pcretest. DEBUG is not defined when building a production library. */
56 :
57 : #ifdef DEBUG
58 : #include "pcre_printint.src"
59 : #endif
60 :
61 :
/* Macro for setting an individual bit in a class bitmap. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments are parenthesized so
that an expression argument such as SETBIT(map, c + 1) addresses the intended
bit, and the whole expansion is parenthesized so that it behaves as a single
expression. Note that "b" is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)


/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
91 :
92 :
93 : /* Table for handling escaped characters in the range '0'-'z'. Positive returns
94 : are simple data values; negative values are for special things like \d and so
95 : on. Zero means further processing is needed (for things like \x), or the escape
96 : is invalid. */
97 :
98 : #ifndef EBCDIC
99 :
100 : /* This is the "normal" table for ASCII systems or for EBCDIC systems running
101 : in UTF-8 mode. */
102 :
103 : static const short int escapes[] = {
104 : 0, 0,
105 : 0, 0,
106 : 0, 0,
107 : 0, 0,
108 : 0, 0,
109 : CHAR_COLON, CHAR_SEMICOLON,
110 : CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
111 : CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
112 : CHAR_COMMERCIAL_AT, -ESC_A,
113 : -ESC_B, -ESC_C,
114 : -ESC_D, -ESC_E,
115 : 0, -ESC_G,
116 : -ESC_H, 0,
117 : 0, -ESC_K,
118 : 0, 0,
119 : 0, 0,
120 : -ESC_P, -ESC_Q,
121 : -ESC_R, -ESC_S,
122 : 0, 0,
123 : -ESC_V, -ESC_W,
124 : -ESC_X, 0,
125 : -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
126 : CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
127 : CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
128 : CHAR_GRAVE_ACCENT, 7,
129 : -ESC_b, 0,
130 : -ESC_d, ESC_e,
131 : ESC_f, 0,
132 : -ESC_h, 0,
133 : 0, -ESC_k,
134 : 0, 0,
135 : ESC_n, 0,
136 : -ESC_p, 0,
137 : ESC_r, -ESC_s,
138 : ESC_tee, 0,
139 : -ESC_v, -ESC_w,
140 : 0, 0,
141 : -ESC_z
142 : };
143 :
144 : #else
145 :
146 : /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
147 :
148 : static const short int escapes[] = {
149 : /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
150 : /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
151 : /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
152 : /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
153 : /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
154 : /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
155 : /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
156 : /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
157 : /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
158 : /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
159 : /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
160 : /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
161 : /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
162 : /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
163 : /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
164 : /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
165 : /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
166 : /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
167 : /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
168 : /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
169 : /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
170 : /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171 : /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
172 : };
173 : #endif
174 :
175 :
176 : /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
177 : searched linearly. Put all the names into a single string, in order to reduce
178 : the number of relocations when a shared library is dynamically linked. The
179 : string is built from string macros so that it works in UTF-8 mode on EBCDIC
180 : platforms. */
181 :
182 : typedef struct verbitem {
183 : int len;
184 : int op;
185 : } verbitem;
186 :
187 : static const char verbnames[] =
188 : STRING_ACCEPT0
189 : STRING_COMMIT0
190 : STRING_F0
191 : STRING_FAIL0
192 : STRING_PRUNE0
193 : STRING_SKIP0
194 : STRING_THEN;
195 :
196 : static const verbitem verbs[] = {
197 : { 6, OP_ACCEPT },
198 : { 6, OP_COMMIT },
199 : { 1, OP_FAIL },
200 : { 4, OP_FAIL },
201 : { 5, OP_PRUNE },
202 : { 4, OP_SKIP },
203 : { 4, OP_THEN }
204 : };
205 :
206 : static const int verbcount = sizeof(verbs)/sizeof(verbitem);
207 :
208 :
209 : /* Tables of names of POSIX character classes and their lengths. The names are
210 : now all in a single string, to reduce the number of relocations when a shared
211 : library is dynamically loaded. The list of lengths is terminated by a zero
212 : length entry. The first three must be alpha, lower, upper, as this is assumed
213 : for handling case independence. */
214 :
215 : static const char posix_names[] =
216 : STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
217 : STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
218 : STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
219 : STRING_word0 STRING_xdigit;
220 :
221 : static const uschar posix_name_lengths[] = {
222 : 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
223 :
224 : /* Table of class bit maps for each POSIX class. Each class is formed from a
225 : base map, with an optional addition or removal of another map. Then, for some
226 : classes, there is some additional tweaking: for [:blank:] the vertical space
227 : characters are removed, and for [:alpha:] and [:alnum:] the underscore
228 : character is removed. The triples in the table consist of the base map offset,
229 : second map offset or -1 if no second map, and a non-negative value for map
230 : addition or a negative value for map subtraction (if there are two maps). The
231 : absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
232 : remove vertical space characters, 2 => remove underscore. */
233 :
234 : static const int posix_class_maps[] = {
235 : cbit_word, cbit_digit, -2, /* alpha */
236 : cbit_lower, -1, 0, /* lower */
237 : cbit_upper, -1, 0, /* upper */
238 : cbit_word, -1, 2, /* alnum - word without underscore */
239 : cbit_print, cbit_cntrl, 0, /* ascii */
240 : cbit_space, -1, 1, /* blank - a GNU extension */
241 : cbit_cntrl, -1, 0, /* cntrl */
242 : cbit_digit, -1, 0, /* digit */
243 : cbit_graph, -1, 0, /* graph */
244 : cbit_print, -1, 0, /* print */
245 : cbit_punct, -1, 0, /* punct */
246 : cbit_space, -1, 0, /* space */
247 : cbit_word, -1, 0, /* word - a Perl extension */
248 : cbit_xdigit,-1, 0 /* xdigit */
249 : };
250 :
251 :
/* Two-level stringification, so that a macro argument is expanded before it is
turned into a string. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)

/* The texts of the compile-time error messages. They are "char *" because they
are handed to the outside world. Error numbers are documented, so a number must
never be re-used: always add a new message at the end. Messages marked DEAD are
no longer used. All the messages live in one long string, each terminated by a
zero byte, in order to reduce the number of relocations needed when a shared
library is loaded dynamically. A table of offsets cannot be used because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known; instead the
wanted message is found by counting through the string. That is not a
performance issue, because it happens only after a compilation error. The
number in front of each message is its error number. */

static const char error_texts[] =
  /*  0 */ "no error\0"
  /*  1 */ "\\ at end of pattern\0"
  /*  2 */ "\\c at end of pattern\0"
  /*  3 */ "unrecognized character follows \\\0"
  /*  4 */ "numbers out of order in {} quantifier\0"
  /*  5 */ "number too big in {} quantifier\0"
  /*  6 */ "missing terminating ] for character class\0"
  /*  7 */ "invalid escape sequence in character class\0"
  /*  8 */ "range out of order in character class\0"
  /*  9 */ "nothing to repeat\0"
  /* 10 */ "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  /* 11 */ "internal error: unexpected repeat\0"
  /* 12 */ "unrecognized character after (? or (?-\0"
  /* 13 */ "POSIX named classes are supported only within a class\0"
  /* 14 */ "missing )\0"
  /* 15 */ "reference to non-existent subpattern\0"
  /* 16 */ "erroffset passed as NULL\0"
  /* 17 */ "unknown option bit(s) set\0"
  /* 18 */ "missing ) after comment\0"
  /* 19 */ "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */ "regular expression is too large\0"
  /* 21 */ "failed to get memory\0"
  /* 22 */ "unmatched parentheses\0"
  /* 23 */ "internal error: code overflow\0"
  /* 24 */ "unrecognized character after (?<\0"
  /* 25 */ "lookbehind assertion is not fixed length\0"
  /* 26 */ "malformed number or name after (?(\0"
  /* 27 */ "conditional group contains more than two branches\0"
  /* 28 */ "assertion expected after (?(\0"
  /* 29 */ "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */ "unknown POSIX class name\0"
  /* 31 */ "POSIX collating elements are not supported\0"
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  /* 33 */ "spare error\0"  /** DEAD **/
  /* 34 */ "character value in \\x{...} sequence is too large\0"
  /* 35 */ "invalid condition (?(0)\0"
  /* 36 */ "\\C not allowed in lookbehind assertion\0"
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  /* 38 */ "number after (?C is > 255\0"
  /* 39 */ "closing ) for (?C expected\0"
  /* 40 */ "recursive call could loop indefinitely\0"
  /* 41 */ "unrecognized character after (?P\0"
  /* 42 */ "syntax error in subpattern name (missing terminator)\0"
  /* 43 */ "two named subpatterns have the same name\0"
  /* 44 */ "invalid UTF-8 string\0"
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled\0"
  /* 46 */ "malformed \\P or \\p sequence\0"
  /* 47 */ "unknown property name after \\P or \\p\0"
  /* 48 */ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  /* 49 */ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */ "repeated subpattern is too long\0"    /** DEAD **/
  /* 51 */ "octal value is greater than \\377 (not in UTF-8 mode)\0"
  /* 52 */ "internal error: overran compiling workspace\0"
  /* 53 */ "internal error: previously-checked referenced subpattern not found\0"
  /* 54 */ "DEFINE group contains more than one branch\0"
  /* 55 */ "repeating a DEFINE group is not allowed\0"
  /* 56 */ "inconsistent NEWLINE options\0"
  /* 57 */ "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  /* 58 */ "a numbered reference must not be zero\0"
  /* 59 */ "(*VERB) with an argument is not supported\0"
  /* 60 */ "(*VERB) not recognized\0"
  /* 61 */ "number is too big\0"
  /* 62 */ "subpattern name expected\0"
  /* 63 */ "digit expected after (?+\0"
  /* 64 */ "] is an invalid data character in JavaScript compatibility mode";
343 :
344 :
345 : /* Table to identify digits and hex digits. This is used when compiling
346 : patterns. Note that the tables in chartables are dependent on the locale, and
347 : may mark arbitrary characters as digits - but the PCRE compiling code expects
348 : to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
349 : a private table here. It costs 256 bytes, but it is a lot faster than doing
350 : character value tests (at least in some simple cases I timed), and in some
351 : applications one wants PCRE to compile efficiently as well as match
352 : efficiently.
353 :
354 : For convenience, we use the same bit definitions as in chartables:
355 :
356 : 0x04 decimal digit
357 : 0x08 hexadecimal digit
358 :
359 : Then we can use ctype_digit and ctype_xdigit in the code. */
360 :
#ifndef EBCDIC

/* The "normal" case, for ASCII systems and for EBCDIC systems running in UTF-8
mode. There are sixteen entries per row; the label at the start of each row is
the code point of the row's first entry. */

static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 0 - 9 */
  /* 0x40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A - F */
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* a - f */
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else

/* The "abnormal" case, for EBCDIC systems not running in UTF-8 mode. Same
layout: sixteen entries per row, labelled with the first code point. */

static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* a - f */
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A - F */
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0 - 9 */
  };

/* Partial duplicate of the character table, for use on EBCDIC systems. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 0x00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
474 :
475 :
476 : /* Definition to allow mutual recursion */
477 :
478 : static BOOL
479 : compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
480 : int *, int *, branch_chain *, compile_data *, int *);
481 :
482 :
483 :
484 : /*************************************************
485 : * Find an error text *
486 : *************************************************/
487 :
488 : /* The error texts are now all in one long string, to save on relocations. As
489 : some of the text is of unknown length, we can't use a table of offsets.
490 : Instead, just count through the strings. This is not a performance issue
491 : because it happens only when there has been a compilation error.
492 :
493 : Argument: the error number
494 : Returns: pointer to the error string
495 : */
496 :
497 : static const char *
498 : find_error_text(int n)
499 4 : {
500 4 : const char *s = error_texts;
501 4 : for (; n > 0; n--) while (*s++ != 0) {};
502 4 : return s;
503 : }
504 :
505 :
506 : /*************************************************
507 : * Handle escapes *
508 : *************************************************/
509 :
510 : /* This function is called when a \ has been encountered. It either returns a
511 : positive value for a simple escape such as \n, or a negative value which
512 : encodes one of the more complicated things such as \d. A backreference to group
513 : n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
514 : UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
515 : ptr is pointing at the \. On exit, it is on the final character of the escape
516 : sequence.
517 :
518 : Arguments:
519 : ptrptr points to the pattern position pointer
520 : errorcodeptr points to the errorcode variable
521 : bracount number of previous extracting brackets
522 : options the options bits
523 : isclass TRUE if inside a character class
524 :
525 : Returns: zero or positive => a data character
526 : negative => a special escape sequence
527 : on error, errorcodeptr is set
528 : */
529 :
530 : static int
531 : check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
532 : int options, BOOL isclass)
533 1752830 : {
534 1752830 : BOOL utf8 = (options & PCRE_UTF8) != 0;
535 1752830 : const uschar *ptr = *ptrptr + 1;
536 : int c, i;
537 :
538 1752830 : GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
539 1752830 : ptr--; /* Set pointer back to the last byte */
540 :
541 : /* If backslash is at the end of the pattern, it's an error. */
542 :
543 1752830 : if (c == 0) *errorcodeptr = ERR1;
544 :
545 : /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
546 : in a table. A non-zero result is something that can be returned immediately.
547 : Otherwise further processing may be required. */
548 :
549 : #ifndef EBCDIC /* ASCII/UTF-8 coding */
550 1752830 : else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
551 749150 : else if ((i = escapes[c - CHAR_0]) != 0) c = i;
552 :
553 : #else /* EBCDIC coding */
554 : else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
555 : else if ((i = escapes[c - 0x48]) != 0) c = i;
556 : #endif
557 :
558 : /* Escapes that need further processing, or are illegal. */
559 :
560 : else
561 : {
562 : const uschar *oldptr;
563 : BOOL braced, negated;
564 :
565 2671 : switch (c)
566 : {
567 : /* A number of Perl escapes are not handled by PCRE. We give an explicit
568 : error. */
569 :
570 : case CHAR_l:
571 : case CHAR_L:
572 : case CHAR_N:
573 : case CHAR_u:
574 : case CHAR_U:
575 0 : *errorcodeptr = ERR37;
576 0 : break;
577 :
578 : /* \g must be followed by one of a number of specific things:
579 :
580 : (1) A number, either plain or braced. If positive, it is an absolute
581 : backreference. If negative, it is a relative backreference. This is a Perl
582 : 5.10 feature.
583 :
584 : (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
585 : is part of Perl's movement towards a unified syntax for back references. As
586 : this is synonymous with \k{name}, we fudge it up by pretending it really
587 : was \k.
588 :
589 : (3) For Oniguruma compatibility we also support \g followed by a name or a
590 : number either in angle brackets or in single quotes. However, these are
591 : (possibly recursive) subroutine calls, _not_ backreferences. Just return
592 : the -ESC_g code (cf \k). */
593 :
594 : case CHAR_g:
595 0 : if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
596 : {
597 0 : c = -ESC_g;
598 0 : break;
599 : }
600 :
601 : /* Handle the Perl-compatible cases */
602 :
603 0 : if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
604 : {
605 : const uschar *p;
606 0 : for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
607 0 : if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
608 0 : if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
609 : {
610 0 : c = -ESC_k;
611 0 : break;
612 : }
613 0 : braced = TRUE;
614 0 : ptr++;
615 : }
616 0 : else braced = FALSE;
617 :
618 0 : if (ptr[1] == CHAR_MINUS)
619 : {
620 0 : negated = TRUE;
621 0 : ptr++;
622 : }
623 0 : else negated = FALSE;
624 :
625 0 : c = 0;
626 0 : while ((digitab[ptr[1]] & ctype_digit) != 0)
627 0 : c = c * 10 + *(++ptr) - CHAR_0;
628 :
629 0 : if (c < 0) /* Integer overflow */
630 : {
631 0 : *errorcodeptr = ERR61;
632 0 : break;
633 : }
634 :
635 0 : if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
636 : {
637 0 : *errorcodeptr = ERR57;
638 0 : break;
639 : }
640 :
641 0 : if (c == 0)
642 : {
643 0 : *errorcodeptr = ERR58;
644 0 : break;
645 : }
646 :
647 0 : if (negated)
648 : {
649 0 : if (c > bracount)
650 : {
651 0 : *errorcodeptr = ERR15;
652 0 : break;
653 : }
654 0 : c = bracount - (c - 1);
655 : }
656 :
657 0 : c = -(ESC_REF + c);
658 0 : break;
659 :
660 : /* The handling of escape sequences consisting of a string of digits
661 : starting with one that is not zero is not straightforward. By experiment,
662 : the way Perl works seems to be as follows:
663 :
664 : Outside a character class, the digits are read as a decimal number. If the
665 : number is less than 10, or if there are that many previous extracting
666 : left brackets, then it is a back reference. Otherwise, up to three octal
667 : digits are read to form an escaped byte. Thus \123 is likely to be octal
668 : 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
669 : value is greater than 377, the least significant 8 bits are taken. Inside a
670 : character class, \ followed by a digit is always an octal number. */
671 :
672 : case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
673 : case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
674 :
675 410 : if (!isclass)
676 : {
677 410 : oldptr = ptr;
678 410 : c -= CHAR_0;
679 820 : while ((digitab[ptr[1]] & ctype_digit) != 0)
680 0 : c = c * 10 + *(++ptr) - CHAR_0;
681 410 : if (c < 0) /* Integer overflow */
682 : {
683 0 : *errorcodeptr = ERR61;
684 0 : break;
685 : }
686 410 : if (c < 10 || c <= bracount)
687 : {
688 410 : c = -(ESC_REF + c);
689 410 : break;
690 : }
691 0 : ptr = oldptr; /* Put the pointer back and fall through */
692 : }
693 :
694 : /* Handle an octal number following \. If the first digit is 8 or 9, Perl
695 : generates a binary zero byte and treats the digit as a following literal.
696 : Thus we have to pull back the pointer by one. */
697 :
698 0 : if ((c = *ptr) >= CHAR_8)
699 : {
700 0 : ptr--;
701 0 : c = 0;
702 0 : break;
703 : }
704 :
705 : /* \0 always starts an octal number, but we may drop through to here with a
706 : larger first octal digit. The original code used just to take the least
707 : significant 8 bits of octal numbers (I think this is what early Perls used
708 : to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
709 : than 3 octal digits. */
710 :
711 : case CHAR_0:
712 2254 : c -= CHAR_0;
713 8932 : while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
714 4424 : c = c * 8 + *(++ptr) - CHAR_0;
715 2254 : if (!utf8 && c > 255) *errorcodeptr = ERR51;
716 2254 : break;
717 :
718 : /* \x is complicated. \x{ddd} is a character number which can be greater
719 : than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
720 : treated as a data character. */
721 :
722 : case CHAR_x:
723 4 : if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
724 : {
725 0 : const uschar *pt = ptr + 2;
726 0 : int count = 0;
727 :
728 0 : c = 0;
729 0 : while ((digitab[*pt] & ctype_xdigit) != 0)
730 : {
731 0 : register int cc = *pt++;
732 0 : if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
733 0 : count++;
734 :
735 : #ifndef EBCDIC /* ASCII/UTF-8 coding */
736 0 : if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
737 0 : c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
738 : #else /* EBCDIC coding */
739 : if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
740 : c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
741 : #endif
742 : }
743 :
744 0 : if (*pt == CHAR_RIGHT_CURLY_BRACKET)
745 : {
746 0 : if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
747 0 : ptr = pt;
748 0 : break;
749 : }
750 :
751 : /* If the sequence of hex digits does not end with '}', then we don't
752 : recognize this construct; fall through to the normal \x handling. */
753 : }
754 :
755 : /* Read just a single-byte hex-defined char */
756 :
757 4 : c = 0;
758 16 : while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
759 : {
760 : int cc; /* Some compilers don't like */
761 8 : cc = *(++ptr); /* ++ in initializers */
762 : #ifndef EBCDIC /* ASCII/UTF-8 coding */
763 8 : if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
764 8 : c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
765 : #else /* EBCDIC coding */
766 : if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
767 : c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
768 : #endif
769 : }
770 4 : break;
771 :
772 : /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
773 : This coding is ASCII-specific, but then the whole concept of \cx is
774 : ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
775 :
776 : case CHAR_c:
777 0 : c = *(++ptr);
778 0 : if (c == 0)
779 : {
780 0 : *errorcodeptr = ERR2;
781 0 : break;
782 : }
783 :
784 : #ifndef EBCDIC /* ASCII/UTF-8 coding */
785 0 : if (c >= CHAR_a && c <= CHAR_z) c -= 32;
786 0 : c ^= 0x40;
787 : #else /* EBCDIC coding */
788 : if (c >= CHAR_a && c <= CHAR_z) c += 64;
789 : c ^= 0xC0;
790 : #endif
791 0 : break;
792 :
793 : /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
794 : other alphanumeric following \ is an error if PCRE_EXTRA was set;
795 : otherwise, for Perl compatibility, it is a literal. This code looks a bit
796 : odd, but there used to be some cases other than the default, and there may
797 : be again in future, so I haven't "optimized" it. */
798 :
799 : default:
800 3 : if ((options & PCRE_EXTRA) != 0) switch(c)
801 : {
802 : default:
803 1 : *errorcodeptr = ERR3;
804 : break;
805 : }
806 : break;
807 : }
808 : }
809 :
810 1752830 : *ptrptr = ptr;
811 1752830 : return c;
812 : }
813 :
814 :
815 :
816 : #ifdef SUPPORT_UCP
817 : /*************************************************
818 : * Handle \P and \p *
819 : *************************************************/
820 :
821 : /* This function is called after \P or \p has been encountered, provided that
822 : PCRE is compiled with support for Unicode properties. On entry, ptrptr is
823 : pointing at the P or p. On exit, it is pointing at the final character of the
824 : escape sequence.
825 :
826 : Argument:
827 : ptrptr points to the pattern position pointer
828 : negptr points to a boolean that is set TRUE for negation else FALSE
829 : dptr points to an int that is set to the detailed property value
830 : errorcodeptr points to the error code variable
831 :
832 : Returns: type value from ucp_type_table, or -1 for an invalid type
833 : */
834 :
835 : static int
836 : get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
837 40 : {
838 : int c, i, bot, top;
839 40 : const uschar *ptr = *ptrptr;
840 : char name[32];
841 :
842 40 : c = *(++ptr);
843 40 : if (c == 0) goto ERROR_RETURN;
844 :
845 40 : *negptr = FALSE;
846 :
847 : /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
848 : negation. */
849 :
850 40 : if (c == CHAR_LEFT_CURLY_BRACKET)
851 : {
852 38 : if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
853 : {
854 0 : *negptr = TRUE;
855 0 : ptr++;
856 : }
857 94 : for (i = 0; i < (int)sizeof(name) - 1; i++)
858 : {
859 94 : c = *(++ptr);
860 94 : if (c == 0) goto ERROR_RETURN;
861 94 : if (c == CHAR_RIGHT_CURLY_BRACKET) break;
862 56 : name[i] = c;
863 : }
864 38 : if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
865 38 : name[i] = 0;
866 : }
867 :
868 : /* Otherwise there is just one following character */
869 :
870 : else
871 : {
872 2 : name[0] = c;
873 2 : name[1] = 0;
874 : }
875 :
876 40 : *ptrptr = ptr;
877 :
878 : /* Search for a recognized property name using binary chop */
879 :
880 40 : bot = 0;
881 40 : top = _pcre_utt_size;
882 :
883 250 : while (bot < top)
884 : {
885 210 : i = (bot + top) >> 1;
886 210 : c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
887 210 : if (c == 0)
888 : {
889 40 : *dptr = _pcre_utt[i].value;
890 40 : return _pcre_utt[i].type;
891 : }
892 170 : if (c > 0) bot = i + 1; else top = i;
893 : }
894 :
895 0 : *errorcodeptr = ERR47;
896 0 : *ptrptr = ptr;
897 0 : return -1;
898 :
899 0 : ERROR_RETURN:
900 0 : *errorcodeptr = ERR46;
901 0 : *ptrptr = ptr;
902 0 : return -1;
903 : }
904 : #endif
905 :
906 :
907 :
908 :
909 : /*************************************************
910 : * Check for counted repeat *
911 : *************************************************/
912 :
913 : /* This function is called when a '{' is encountered in a place where it might
914 : start a quantifier. It looks ahead to see if it really is a quantifier or not.
915 : It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
916 : where the ddds are digits.
917 :
918 : Arguments:
919 : p pointer to the first char after '{'
920 :
921 : Returns: TRUE or FALSE
922 : */
923 :
924 : static BOOL
925 : is_counted_repeat(const uschar *p)
926 88 : {
927 88 : if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
928 54 : while ((digitab[*p] & ctype_digit) != 0) p++;
929 54 : if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
930 :
931 20 : if (*p++ != CHAR_COMMA) return FALSE;
932 20 : if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
933 :
934 12 : if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
935 12 : while ((digitab[*p] & ctype_digit) != 0) p++;
936 :
937 12 : return (*p == CHAR_RIGHT_CURLY_BRACKET);
938 : }
939 :
940 :
941 :
942 : /*************************************************
943 : * Read repeat counts *
944 : *************************************************/
945 :
946 : /* Read an item of the form {n,m} and return the values. This is called only
947 : after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
948 : so the syntax is guaranteed to be correct, but we need to check the values.
949 :
950 : Arguments:
951 : p pointer to first char after '{'
952 : minp pointer to int for min
953 : maxp pointer to int for max
954 : returned as -1 if no max
955 : errorcodeptr points to error code variable
956 :
957 : Returns: pointer to '}' on success;
958 : current ptr on error, with errorcodeptr set non-zero
959 : */
960 :
961 : static const uschar *
962 : read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
963 54 : {
964 54 : int min = 0;
965 54 : int max = -1;
966 :
967 : /* Read the minimum value and do a paranoid check: a negative value indicates
968 : an integer overflow. */
969 :
970 54 : while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
971 54 : if (min < 0 || min > 65535)
972 : {
973 0 : *errorcodeptr = ERR5;
974 0 : return p;
975 : }
976 :
977 : /* Read the maximum value if there is one, and again do a paranoid on its size.
978 : Also, max must not be less than min. */
979 :
980 54 : if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
981 : {
982 20 : if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
983 : {
984 12 : max = 0;
985 12 : while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
986 12 : if (max < 0 || max > 65535)
987 : {
988 0 : *errorcodeptr = ERR5;
989 0 : return p;
990 : }
991 12 : if (max < min)
992 : {
993 0 : *errorcodeptr = ERR4;
994 0 : return p;
995 : }
996 : }
997 : }
998 :
999 : /* Fill in the required variables, and pass back the pointer to the terminating
1000 : '}'. */
1001 :
1002 54 : *minp = min;
1003 54 : *maxp = max;
1004 54 : return p;
1005 : }
1006 :
1007 :
1008 :
1009 : /*************************************************
1010 : * Subroutine for finding forward reference *
1011 : *************************************************/
1012 :
1013 : /* This recursive function is called only from find_parens() below. The
1014 : top-level call starts at the beginning of the pattern. All other calls must
1015 : start at a parenthesis. It scans along a pattern's text looking for capturing
1016 : subpatterns, and counting them. If it finds a named pattern that matches the
1017 : name it is given, it returns its number. Alternatively, if the name is NULL, it
1018 : returns when it reaches a given numbered subpattern. We know that if (?P< is
1019 : encountered, the name will be terminated by '>' because that is checked in the
1020 : first pass. Recursion is used to keep track of subpatterns that reset the
1021 : capturing group numbers - the (?| feature.
1022 :
1023 : Arguments:
1024 : ptrptr address of the current character pointer (updated)
1025 : cd compile background data
1026 : name name to seek, or NULL if seeking a numbered subpattern
1027 : lorn name length, or subpattern number if name is NULL
1028 : xmode TRUE if we are in /x mode
1029 : count pointer to the current capturing subpattern number (updated)
1030 :
1031 : Returns: the number of the named subpattern, or -1 if not found
1032 : */
1033 :
1034 : static int
1035 : find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1036 : BOOL xmode, int *count)
1037 1014 : {
1038 1014 : uschar *ptr = *ptrptr;
1039 1014 : int start_count = *count;
1040 1014 : int hwm_count = start_count;
1041 1014 : BOOL dup_parens = FALSE;
1042 :
1043 : /* If the first character is a parenthesis, check on the type of group we are
1044 : dealing with. The very first call may not start with a parenthesis. */
1045 :
1046 1014 : if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1047 : {
1048 1012 : if (ptr[1] == CHAR_QUESTION_MARK &&
1049 : ptr[2] == CHAR_VERTICAL_LINE)
1050 : {
1051 0 : ptr += 3;
1052 0 : dup_parens = TRUE;
1053 : }
1054 :
1055 : /* Handle a normal, unnamed capturing parenthesis */
1056 :
1057 1622 : else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1058 : {
1059 610 : *count += 1;
1060 610 : if (name == NULL && *count == lorn) return *count;
1061 610 : ptr++;
1062 : }
1063 :
1064 : /* Handle a condition. If it is an assertion, just carry on so that it
1065 : is processed as normal. If not, skip to the closing parenthesis of the
1066 : condition (there can't be any nested parens. */
1067 :
1068 402 : else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1069 : {
1070 4 : ptr += 2;
1071 4 : if (ptr[1] != CHAR_QUESTION_MARK)
1072 : {
1073 4 : while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1074 4 : if (*ptr != 0) ptr++;
1075 : }
1076 : }
1077 :
1078 : /* We have either (? or (* and not a condition */
1079 :
1080 : else
1081 : {
1082 398 : ptr += 2;
1083 398 : if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1084 :
1085 : /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1086 :
1087 398 : if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1088 : ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1089 : {
1090 : int term;
1091 : const uschar *thisname;
1092 0 : *count += 1;
1093 0 : if (name == NULL && *count == lorn) return *count;
1094 0 : term = *ptr++;
1095 0 : if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1096 0 : thisname = ptr;
1097 0 : while (*ptr != term) ptr++;
1098 0 : if (name != NULL && lorn == ptr - thisname &&
1099 : strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1100 0 : return *count;
1101 : }
1102 : }
1103 : }
1104 :
1105 : /* Past any initial parenthesis handling, scan for parentheses or vertical
1106 : bars. */
1107 :
1108 8108 : for (; *ptr != 0; ptr++)
1109 : {
1110 : /* Skip over backslashed characters and also entire \Q...\E */
1111 :
1112 8106 : if (*ptr == CHAR_BACKSLASH)
1113 : {
1114 810 : if (*(++ptr) == 0) goto FAIL_EXIT;
1115 810 : if (*ptr == CHAR_Q) for (;;)
1116 : {
1117 0 : while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1118 0 : if (*ptr == 0) goto FAIL_EXIT;
1119 0 : if (*(++ptr) == CHAR_E) break;
1120 0 : }
1121 810 : continue;
1122 : }
1123 :
1124 : /* Skip over character classes; this logic must be similar to the way they
1125 : are handled for real. If the first character is '^', skip it. Also, if the
1126 : first few characters (either before or after ^) are \Q\E or \E we skip them
1127 : too. This makes for compatibility with Perl. Note the use of STR macros to
1128 : encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1129 :
1130 7296 : if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1131 : {
1132 8 : BOOL negate_class = FALSE;
1133 : for (;;)
1134 : {
1135 12 : int c = *(++ptr);
1136 12 : if (c == CHAR_BACKSLASH)
1137 : {
1138 0 : if (ptr[1] == CHAR_E)
1139 0 : ptr++;
1140 0 : else if (strncmp((const char *)ptr+1,
1141 : STR_Q STR_BACKSLASH STR_E, 3) == 0)
1142 0 : ptr += 3;
1143 : else
1144 0 : break;
1145 : }
1146 12 : else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1147 4 : negate_class = TRUE;
1148 : else break;
1149 4 : }
1150 :
1151 : /* If the next character is ']', it is a data character that must be
1152 : skipped, except in JavaScript compatibility mode. */
1153 :
1154 8 : if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1155 : (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1156 0 : ptr++;
1157 :
1158 38 : while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1159 : {
1160 22 : if (*ptr == 0) return -1;
1161 22 : if (*ptr == CHAR_BACKSLASH)
1162 : {
1163 4 : if (*(++ptr) == 0) goto FAIL_EXIT;
1164 4 : if (*ptr == CHAR_Q) for (;;)
1165 : {
1166 0 : while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1167 0 : if (*ptr == 0) goto FAIL_EXIT;
1168 0 : if (*(++ptr) == CHAR_E) break;
1169 0 : }
1170 4 : continue;
1171 : }
1172 : }
1173 8 : continue;
1174 : }
1175 :
1176 : /* Skip comments in /x mode */
1177 :
1178 7288 : if (xmode && *ptr == CHAR_NUMBER_SIGN)
1179 : {
1180 0 : while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1181 0 : if (*ptr == 0) goto FAIL_EXIT;
1182 0 : continue;
1183 : }
1184 :
1185 : /* Check for the special metacharacters */
1186 :
1187 7288 : if (*ptr == CHAR_LEFT_PARENTHESIS)
1188 : {
1189 1012 : int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1190 1012 : if (rc > 0) return rc;
1191 1012 : if (*ptr == 0) goto FAIL_EXIT;
1192 : }
1193 :
1194 6276 : else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1195 : {
1196 1012 : if (dup_parens && *count < hwm_count) *count = hwm_count;
1197 1012 : *ptrptr = ptr;
1198 1012 : return -1;
1199 : }
1200 :
1201 5264 : else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1202 : {
1203 0 : if (*count > hwm_count) hwm_count = *count;
1204 0 : *count = start_count;
1205 : }
1206 : }
1207 :
1208 2 : FAIL_EXIT:
1209 2 : *ptrptr = ptr;
1210 2 : return -1;
1211 : }
1212 :
1213 :
1214 :
1215 :
1216 : /*************************************************
1217 : * Find forward referenced subpattern *
1218 : *************************************************/
1219 :
1220 : /* This function scans along a pattern's text looking for capturing
1221 : subpatterns, and counting them. If it finds a named pattern that matches the
1222 : name it is given, it returns its number. Alternatively, if the name is NULL, it
1223 : returns when it reaches a given numbered subpattern. This is used for forward
1224 : references to subpatterns. We used to be able to start this scan from the
1225 : current compiling point, using the current count value from cd->bracount, and
1226 : do it all in a single loop, but the addition of the possibility of duplicate
1227 : subpattern numbers means that we have to scan from the very start, in order to
1228 : take account of such duplicates, and to use a recursive function to keep track
1229 : of the different types of group.
1230 :
1231 : Arguments:
1232 : cd compile background data
1233 : name name to seek, or NULL if seeking a numbered subpattern
1234 : lorn name length, or subpattern number if name is NULL
1235 : xmode TRUE if we are in /x mode
1236 :
1237 : Returns: the number of the found subpattern, or -1 if not found
1238 : */
1239 :
1240 : static int
1241 : find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1242 2 : {
1243 2 : uschar *ptr = (uschar *)cd->start_pattern;
1244 2 : int count = 0;
1245 : int rc;
1246 :
1247 : /* If the pattern does not start with an opening parenthesis, the first call
1248 : to find_parens_sub() will scan right to the end (if necessary). However, if it
1249 : does start with a parenthesis, find_parens_sub() will return when it hits the
1250 : matching closing parens. That is why we have to have a loop. */
1251 :
1252 : for (;;)
1253 : {
1254 2 : rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1255 2 : if (rc > 0 || *ptr++ == 0) break;
1256 0 : }
1257 :
1258 2 : return rc;
1259 : }
1260 :
1261 :
1262 :
1263 :
1264 : /*************************************************
1265 : * Find first significant op code *
1266 : *************************************************/
1267 :
1268 : /* This is called by several functions that scan a compiled expression looking
1269 : for a fixed first character, or an anchoring op code etc. It skips over things
1270 : that do not influence this. For some calls, a change of option is important.
1271 : For some calls, it makes sense to skip negative forward and all backward
1272 : assertions, and also the \b assertion; for others it does not.
1273 :
1274 : Arguments:
1275 : code pointer to the start of the group
1276 : options pointer to external options
1277 : optbit the option bit whose changing is significant, or
1278 : zero if none are
1279 : skipassert TRUE if certain assertions are to be skipped
1280 :
1281 : Returns: pointer to the first significant opcode
1282 : */
1283 :
1284 : static const uschar*
1285 : first_significant_code(const uschar *code, int *options, int optbit,
1286 : BOOL skipassert)
1287 18721 : {
1288 : for (;;)
1289 : {
1290 18721 : switch ((int)*code)
1291 : {
1292 : case OP_OPT:
1293 0 : if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1294 0 : *options = (int)code[1];
1295 0 : code += 2;
1296 0 : break;
1297 :
1298 : case OP_ASSERT_NOT:
1299 : case OP_ASSERTBACK:
1300 : case OP_ASSERTBACK_NOT:
1301 3 : if (!skipassert) return code;
1302 1 : do code += GET(code, 1); while (*code == OP_ALT);
1303 1 : code += _pcre_OP_lengths[*code];
1304 1 : break;
1305 :
1306 : case OP_WORD_BOUNDARY:
1307 : case OP_NOT_WORD_BOUNDARY:
1308 15 : if (!skipassert) return code;
1309 : /* Fall through */
1310 :
1311 : case OP_CALLOUT:
1312 : case OP_CREF:
1313 : case OP_RREF:
1314 : case OP_DEF:
1315 5 : code += _pcre_OP_lengths[*code];
1316 5 : break;
1317 :
1318 : default:
1319 18703 : return code;
1320 : }
1321 6 : }
1322 : /* Control never reaches here */
1323 : }
1324 :
1325 :
1326 :
1327 :
1328 : /*************************************************
1329 : * Find the fixed length of a pattern *
1330 : *************************************************/
1331 :
1332 : /* Scan a pattern and compute the fixed length of subject that will match it,
1333 : if the length is fixed. This is needed for dealing with backward assertions.
1334 : In UTF8 mode, the result is in characters rather than bytes.
1335 :
1336 : Arguments:
1337 : code points to the start of the pattern (the bracket)
1338 : options the compiling options
1339 :
1340 : Returns: the fixed length, or -1 if there is no fixed length,
1341 : or -2 if \C was encountered
1342 : */
1343 :
1344 : static int
1345 : find_fixedlength(uschar *code, int options)
1346 2 : {
1347 2 : int length = -1;
1348 :
1349 2 : register int branchlength = 0;
1350 2 : register uschar *cc = code + 1 + LINK_SIZE;
1351 :
1352 : /* Scan along the opcodes for this branch. If we get to the end of the
1353 : branch, check the length against that of the other branches. */
1354 :
1355 : for (;;)
1356 : {
1357 : int d;
1358 6 : register int op = *cc;
1359 6 : switch (op)
1360 : {
1361 : case OP_CBRA:
1362 : case OP_BRA:
1363 : case OP_ONCE:
1364 : case OP_COND:
1365 0 : d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1366 0 : if (d < 0) return d;
1367 0 : branchlength += d;
1368 0 : do cc += GET(cc, 1); while (*cc == OP_ALT);
1369 0 : cc += 1 + LINK_SIZE;
1370 0 : break;
1371 :
1372 : /* Reached end of a branch; if it's a ket it is the end of a nested
1373 : call. If it's ALT it is an alternation in a nested call. If it is
1374 : END it's the end of the outer call. All can be handled by the same code. */
1375 :
1376 : case OP_ALT:
1377 : case OP_KET:
1378 : case OP_KETRMAX:
1379 : case OP_KETRMIN:
1380 : case OP_END:
1381 2 : if (length < 0) length = branchlength;
1382 0 : else if (length != branchlength) return -1;
1383 2 : if (*cc != OP_ALT) return length;
1384 0 : cc += 1 + LINK_SIZE;
1385 0 : branchlength = 0;
1386 0 : break;
1387 :
1388 : /* Skip over assertive subpatterns */
1389 :
1390 : case OP_ASSERT:
1391 : case OP_ASSERT_NOT:
1392 : case OP_ASSERTBACK:
1393 : case OP_ASSERTBACK_NOT:
1394 0 : do cc += GET(cc, 1); while (*cc == OP_ALT);
1395 : /* Fall through */
1396 :
1397 : /* Skip over things that don't match chars */
1398 :
1399 : case OP_REVERSE:
1400 : case OP_CREF:
1401 : case OP_RREF:
1402 : case OP_DEF:
1403 : case OP_OPT:
1404 : case OP_CALLOUT:
1405 : case OP_SOD:
1406 : case OP_SOM:
1407 : case OP_EOD:
1408 : case OP_EODN:
1409 : case OP_CIRC:
1410 : case OP_DOLL:
1411 : case OP_NOT_WORD_BOUNDARY:
1412 : case OP_WORD_BOUNDARY:
1413 2 : cc += _pcre_OP_lengths[*cc];
1414 2 : break;
1415 :
1416 : /* Handle literal characters */
1417 :
1418 : case OP_CHAR:
1419 : case OP_CHARNC:
1420 : case OP_NOT:
1421 1 : branchlength++;
1422 1 : cc += 2;
1423 : #ifdef SUPPORT_UTF8
1424 1 : if ((options & PCRE_UTF8) != 0)
1425 : {
1426 0 : while ((*cc & 0xc0) == 0x80) cc++;
1427 : }
1428 : #endif
1429 1 : break;
1430 :
1431 : /* Handle exact repetitions. The count is already in characters, but we
1432 : need to skip over a multibyte character in UTF8 mode. */
1433 :
1434 : case OP_EXACT:
1435 0 : branchlength += GET2(cc,1);
1436 0 : cc += 4;
1437 : #ifdef SUPPORT_UTF8
1438 0 : if ((options & PCRE_UTF8) != 0)
1439 : {
1440 0 : while((*cc & 0x80) == 0x80) cc++;
1441 : }
1442 : #endif
1443 0 : break;
1444 :
1445 : case OP_TYPEEXACT:
1446 0 : branchlength += GET2(cc,1);
1447 0 : if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1448 0 : cc += 4;
1449 0 : break;
1450 :
1451 : /* Handle single-char matchers */
1452 :
1453 : case OP_PROP:
1454 : case OP_NOTPROP:
1455 0 : cc += 2;
1456 : /* Fall through */
1457 :
1458 : case OP_NOT_DIGIT:
1459 : case OP_DIGIT:
1460 : case OP_NOT_WHITESPACE:
1461 : case OP_WHITESPACE:
1462 : case OP_NOT_WORDCHAR:
1463 : case OP_WORDCHAR:
1464 : case OP_ANY:
1465 : case OP_ALLANY:
1466 1 : branchlength++;
1467 1 : cc++;
1468 1 : break;
1469 :
1470 : /* The single-byte matcher isn't allowed */
1471 :
1472 : case OP_ANYBYTE:
1473 0 : return -2;
1474 :
1475 : /* Check a class for variable quantification */
1476 :
1477 : #ifdef SUPPORT_UTF8
1478 : case OP_XCLASS:
1479 0 : cc += GET(cc, 1) - 33;
1480 : /* Fall through */
1481 : #endif
1482 :
1483 : case OP_CLASS:
1484 : case OP_NCLASS:
1485 0 : cc += 33;
1486 :
1487 0 : switch (*cc)
1488 : {
1489 : case OP_CRSTAR:
1490 : case OP_CRMINSTAR:
1491 : case OP_CRQUERY:
1492 : case OP_CRMINQUERY:
1493 0 : return -1;
1494 :
1495 : case OP_CRRANGE:
1496 : case OP_CRMINRANGE:
1497 0 : if (GET2(cc,1) != GET2(cc,3)) return -1;
1498 0 : branchlength += GET2(cc,1);
1499 0 : cc += 5;
1500 0 : break;
1501 :
1502 : default:
1503 0 : branchlength++;
1504 : }
1505 0 : break;
1506 :
1507 : /* Anything else is variable length */
1508 :
1509 : default:
1510 0 : return -1;
1511 : }
1512 4 : }
1513 : /* Control never gets here */
1514 : }
1515 :
1516 :
1517 :
1518 :
1519 : /*************************************************
1520 : * Scan compiled regex for numbered bracket *
1521 : *************************************************/
1522 :
1523 : /* This little function scans through a compiled pattern until it finds a
1524 : capturing bracket with the given number.
1525 :
1526 : Arguments:
1527 : code points to start of expression
1528 : utf8 TRUE in UTF-8 mode
1529 : number the required bracket number
1530 :
1531 : Returns: pointer to the opcode for the bracket, or NULL if not found
1532 : */
1533 :
1534 : static const uschar *
1535 : find_bracket(const uschar *code, BOOL utf8, int number)
1536 0 : {
1537 : for (;;)
1538 : {
1539 0 : register int c = *code;
1540 0 : if (c == OP_END) return NULL;
1541 :
1542 : /* XCLASS is used for classes that cannot be represented just by a bit
1543 : map. This includes negated single high-valued characters. The length in
1544 : the table is zero; the actual length is stored in the compiled code. */
1545 :
1546 0 : if (c == OP_XCLASS) code += GET(code, 1);
1547 :
1548 : /* Handle capturing bracket */
1549 :
1550 0 : else if (c == OP_CBRA)
1551 : {
1552 0 : int n = GET2(code, 1+LINK_SIZE);
1553 0 : if (n == number) return (uschar *)code;
1554 0 : code += _pcre_OP_lengths[c];
1555 : }
1556 :
1557 : /* Otherwise, we can get the item's length from the table, except that for
1558 : repeated character types, we have to test for \p and \P, which have an extra
1559 : two bytes of parameters. */
1560 :
1561 : else
1562 : {
1563 0 : switch(c)
1564 : {
1565 : case OP_TYPESTAR:
1566 : case OP_TYPEMINSTAR:
1567 : case OP_TYPEPLUS:
1568 : case OP_TYPEMINPLUS:
1569 : case OP_TYPEQUERY:
1570 : case OP_TYPEMINQUERY:
1571 : case OP_TYPEPOSSTAR:
1572 : case OP_TYPEPOSPLUS:
1573 : case OP_TYPEPOSQUERY:
1574 0 : if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1575 0 : break;
1576 :
1577 : case OP_TYPEUPTO:
1578 : case OP_TYPEMINUPTO:
1579 : case OP_TYPEEXACT:
1580 : case OP_TYPEPOSUPTO:
1581 0 : if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1582 : break;
1583 : }
1584 :
1585 : /* Add in the fixed length from the table */
1586 :
1587 0 : code += _pcre_OP_lengths[c];
1588 :
1589 : /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1590 : a multi-byte character. The length in the table is a minimum, so we have to
1591 : arrange to skip the extra bytes. */
1592 :
1593 : #ifdef SUPPORT_UTF8
1594 0 : if (utf8) switch(c)
1595 : {
1596 : case OP_CHAR:
1597 : case OP_CHARNC:
1598 : case OP_EXACT:
1599 : case OP_UPTO:
1600 : case OP_MINUPTO:
1601 : case OP_POSUPTO:
1602 : case OP_STAR:
1603 : case OP_MINSTAR:
1604 : case OP_POSSTAR:
1605 : case OP_PLUS:
1606 : case OP_MINPLUS:
1607 : case OP_POSPLUS:
1608 : case OP_QUERY:
1609 : case OP_MINQUERY:
1610 : case OP_POSQUERY:
1611 0 : if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1612 : break;
1613 : }
1614 : #else
1615 : (void)(utf8); /* Keep compiler happy by referencing function argument */
1616 : #endif
1617 : }
1618 0 : }
1619 : }
1620 :
1621 :
1622 :
1623 : /*************************************************
1624 : * Scan compiled regex for recursion reference *
1625 : *************************************************/
1626 :
1627 : /* This little function scans through a compiled pattern until it finds an
1628 : instance of OP_RECURSE.
1629 :
1630 : Arguments:
1631 : code points to start of expression
1632 : utf8 TRUE in UTF-8 mode
1633 :
1634 : Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1635 : */
1636 :
1637 : static const uschar *
1638 : find_recurse(const uschar *code, BOOL utf8)
1639 8167 : {
1640 : for (;;)
1641 : {
1642 8167 : register int c = *code;
1643 8167 : if (c == OP_END) return NULL;
1644 7357 : if (c == OP_RECURSE) return code;
1645 :
1646 : /* XCLASS is used for classes that cannot be represented just by a bit
1647 : map. This includes negated single high-valued characters. The length in
1648 : the table is zero; the actual length is stored in the compiled code. */
1649 :
1650 7356 : if (c == OP_XCLASS) code += GET(code, 1);
1651 :
1652 : /* Otherwise, we can get the item's length from the table, except that for
1653 : repeated character types, we have to test for \p and \P, which have an extra
1654 : two bytes of parameters. */
1655 :
1656 : else
1657 : {
1658 7355 : switch(c)
1659 : {
1660 : case OP_TYPESTAR:
1661 : case OP_TYPEMINSTAR:
1662 : case OP_TYPEPLUS:
1663 : case OP_TYPEMINPLUS:
1664 : case OP_TYPEQUERY:
1665 : case OP_TYPEMINQUERY:
1666 : case OP_TYPEPOSSTAR:
1667 : case OP_TYPEPOSPLUS:
1668 : case OP_TYPEPOSQUERY:
1669 654 : if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1670 654 : break;
1671 :
1672 : case OP_TYPEPOSUPTO:
1673 : case OP_TYPEUPTO:
1674 : case OP_TYPEMINUPTO:
1675 : case OP_TYPEEXACT:
1676 2 : if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1677 : break;
1678 : }
1679 :
1680 : /* Add in the fixed length from the table */
1681 :
1682 7355 : code += _pcre_OP_lengths[c];
1683 :
1684 : /* In UTF-8 mode, opcodes that are followed by a character may be followed
1685 : by a multi-byte character. The length in the table is a minimum, so we have
1686 : to arrange to skip the extra bytes. */
1687 :
1688 : #ifdef SUPPORT_UTF8
1689 7355 : if (utf8) switch(c)
1690 : {
1691 : case OP_CHAR:
1692 : case OP_CHARNC:
1693 : case OP_EXACT:
1694 : case OP_UPTO:
1695 : case OP_MINUPTO:
1696 : case OP_POSUPTO:
1697 : case OP_STAR:
1698 : case OP_MINSTAR:
1699 : case OP_POSSTAR:
1700 : case OP_PLUS:
1701 : case OP_MINPLUS:
1702 : case OP_POSPLUS:
1703 : case OP_QUERY:
1704 : case OP_MINQUERY:
1705 : case OP_POSQUERY:
1706 0 : if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1707 : break;
1708 : }
1709 : #else
1710 : (void)(utf8); /* Keep compiler happy by referencing function argument */
1711 : #endif
1712 : }
1713 7356 : }
1714 : }
1715 :
1716 :
1717 :
1718 : /*************************************************
1719 : * Scan compiled branch for non-emptiness *
1720 : *************************************************/
1721 :
1722 : /* This function scans through a branch of a compiled pattern to see whether it
1723 : can match the empty string or not. It is called from could_be_empty()
1724 : below and from compile_branch() when checking for an unlimited repeat of a
1725 : group that can match nothing. Note that first_significant_code() skips over
1726 : backward and negative forward assertions when its final argument is TRUE. If we
1727 : hit an unclosed bracket, we return "empty" - this means we've struck an inner
1728 : bracket whose current branch will already have been scanned.
1729 :
1730 : Arguments:
1731 : code points to start of search
1732 : endcode points to where to stop
1733 : utf8 TRUE if in UTF8 mode
1734 :
1735 : Returns: TRUE if what is matched could be empty
1736 : */
1737 :
1738 : static BOOL
1739 : could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1740 52 : {
1741 : register int c;
1742 52 : for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1743 122 : code < endcode;
1744 18 : code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1745 : {
1746 : const uschar *ccode;
1747 :
1748 62 : c = *code;
1749 :
1750 : /* Skip over forward assertions; the other assertions are skipped by
1751 : first_significant_code() with a TRUE final argument. */
1752 :
1753 62 : if (c == OP_ASSERT)
1754 : {
1755 0 : do code += GET(code, 1); while (*code == OP_ALT);
1756 0 : c = *code;
1757 0 : continue;
1758 : }
1759 :
1760 : /* Groups with zero repeats can of course be empty; skip them. */
1761 :
1762 62 : if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1763 : {
1764 4 : code += _pcre_OP_lengths[c];
1765 5 : do code += GET(code, 1); while (*code == OP_ALT);
1766 4 : c = *code;
1767 4 : continue;
1768 : }
1769 :
1770 : /* For other groups, scan the branches. */
1771 :
1772 58 : if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1773 : {
1774 : BOOL empty_branch;
1775 10 : if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1776 :
1777 : /* If a conditional group has only one branch, there is a second, implied,
1778 : empty branch, so just skip over the conditional, because it could be empty.
1779 : Otherwise, scan the individual branches of the group. */
1780 :
1781 8 : if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1782 0 : code += GET(code, 1);
1783 : else
1784 : {
1785 8 : empty_branch = FALSE;
1786 : do
1787 : {
1788 8 : if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1789 3 : empty_branch = TRUE;
1790 8 : code += GET(code, 1);
1791 : }
1792 8 : while (*code == OP_ALT);
1793 8 : if (!empty_branch) return FALSE; /* All branches are non-empty */
1794 : }
1795 :
1796 3 : c = *code;
1797 3 : continue;
1798 : }
1799 :
1800 : /* Handle the other opcodes */
1801 :
1802 48 : switch (c)
1803 : {
1804 : /* Check for quantifiers after a class. XCLASS is used for classes that
1805 : cannot be represented just by a bit map. This includes negated single
1806 : high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1807 : actual length is stored in the compiled code, so we must update "code"
1808 : here. */
1809 :
1810 : #ifdef SUPPORT_UTF8
1811 : case OP_XCLASS:
1812 0 : ccode = code += GET(code, 1);
1813 0 : goto CHECK_CLASS_REPEAT;
1814 : #endif
1815 :
1816 : case OP_CLASS:
1817 : case OP_NCLASS:
1818 12 : ccode = code + 33;
1819 :
1820 : #ifdef SUPPORT_UTF8
1821 12 : CHECK_CLASS_REPEAT:
1822 : #endif
1823 :
1824 12 : switch (*ccode)
1825 : {
1826 : case OP_CRSTAR: /* These could be empty; continue */
1827 : case OP_CRMINSTAR:
1828 : case OP_CRQUERY:
1829 : case OP_CRMINQUERY:
1830 1 : break;
1831 :
1832 : default: /* Non-repeat => class must match */
1833 : case OP_CRPLUS: /* These repeats aren't empty */
1834 : case OP_CRMINPLUS:
1835 11 : return FALSE;
1836 :
1837 : case OP_CRRANGE:
1838 : case OP_CRMINRANGE:
1839 0 : if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1840 : break;
1841 : }
1842 1 : break;
1843 :
1844 : /* Opcodes that must match a character */
1845 :
1846 : case OP_PROP:
1847 : case OP_NOTPROP:
1848 : case OP_EXTUNI:
1849 : case OP_NOT_DIGIT:
1850 : case OP_DIGIT:
1851 : case OP_NOT_WHITESPACE:
1852 : case OP_WHITESPACE:
1853 : case OP_NOT_WORDCHAR:
1854 : case OP_WORDCHAR:
1855 : case OP_ANY:
1856 : case OP_ALLANY:
1857 : case OP_ANYBYTE:
1858 : case OP_CHAR:
1859 : case OP_CHARNC:
1860 : case OP_NOT:
1861 : case OP_PLUS:
1862 : case OP_MINPLUS:
1863 : case OP_POSPLUS:
1864 : case OP_EXACT:
1865 : case OP_NOTPLUS:
1866 : case OP_NOTMINPLUS:
1867 : case OP_NOTPOSPLUS:
1868 : case OP_NOTEXACT:
1869 : case OP_TYPEPLUS:
1870 : case OP_TYPEMINPLUS:
1871 : case OP_TYPEPOSPLUS:
1872 : case OP_TYPEEXACT:
1873 22 : return FALSE;
1874 :
1875 : /* These are going to continue, as they may be empty, but we have to
1876 : fudge the length for the \p and \P cases. */
1877 :
1878 : case OP_TYPESTAR:
1879 : case OP_TYPEMINSTAR:
1880 : case OP_TYPEPOSSTAR:
1881 : case OP_TYPEQUERY:
1882 : case OP_TYPEMINQUERY:
1883 : case OP_TYPEPOSQUERY:
1884 4 : if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1885 4 : break;
1886 :
1887 : /* Same for these */
1888 :
1889 : case OP_TYPEUPTO:
1890 : case OP_TYPEMINUPTO:
1891 : case OP_TYPEPOSUPTO:
1892 0 : if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1893 0 : break;
1894 :
1895 : /* End of branch */
1896 :
1897 : case OP_KET:
1898 : case OP_KETRMAX:
1899 : case OP_KETRMIN:
1900 : case OP_ALT:
1901 4 : return TRUE;
1902 :
1903 : /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1904 : MINUPTO, and POSUPTO may be followed by a multibyte character */
1905 :
1906 : #ifdef SUPPORT_UTF8
1907 : case OP_STAR:
1908 : case OP_MINSTAR:
1909 : case OP_POSSTAR:
1910 : case OP_QUERY:
1911 : case OP_MINQUERY:
1912 : case OP_POSQUERY:
1913 : case OP_UPTO:
1914 : case OP_MINUPTO:
1915 : case OP_POSUPTO:
1916 3 : if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1917 : break;
1918 : #endif
1919 : }
1920 : }
1921 :
1922 8 : return TRUE;
1923 : }
1924 :
1925 :
1926 :
1927 : /*************************************************
1928 : * Scan compiled regex for non-emptiness *
1929 : *************************************************/
1930 :
1931 : /* This function is called to check for left recursive calls. We want to check
1932 : the current branch of the current pattern to see if it could match the empty
1933 : string. If it could, we must look outwards for branches at other levels,
1934 : stopping when we pass beyond the bracket which is the subject of the recursion.
1935 :
1936 : Arguments:
1937 : code points to start of the recursion
1938 : endcode points to where to stop (current RECURSE item)
1939 : bcptr points to the chain of current (unclosed) branch starts
1940 : utf8 TRUE if in UTF-8 mode
1941 :
1942 : Returns: TRUE if what is matched could be empty
1943 : */
1944 :
1945 : static BOOL /* Walks the branch chain outwards; FALSE as soon as one examined branch cannot be empty */
1946 : could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1947 : BOOL utf8)
1948 2 : {
1949 8 : while (bcptr != NULL && bcptr->current >= code) /* Stop at the end of the chain or once a branch start precedes "code" */
1950 : {
1951 6 : if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; /* This level must match something */
1952 4 : bcptr = bcptr->outer; /* Move to the enclosing level */
1953 : }
1954 0 : return TRUE; /* Every branch examined could be empty */
1955 : }
1956 :
1957 :
1958 :
1959 : /*************************************************
1960 : * Check for POSIX class syntax *
1961 : *************************************************/
1962 :
1963 : /* This function is called when the sequence "[:" or "[." or "[=" is
1964 : encountered in a character class. It checks whether this is followed by a
1965 : sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1966 : reach an unescaped ']' without the special preceding character, return FALSE.
1967 :
1968 : Originally, this function only recognized a sequence of letters between the
1969 : terminators, but it seems that Perl recognizes any sequence of characters,
1970 : though of course unknown POSIX names are subsequently rejected. Perl gives an
1971 : "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1972 : didn't consider this to be a POSIX class. Likewise for [:1234:].
1973 :
1974 : The problem in trying to be exactly like Perl is in the handling of escapes. We
1975 : have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1976 : class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1977 : below handles the special case of \], but does not try to do any other escape
1978 : processing. This makes it different from Perl for cases such as [:l\ower:]
1979 : where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1980 : "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1981 : I think.
1982 :
1983 : Arguments:
1984 : ptr pointer to the initial [
1985 : endptr where to return the end pointer
1986 :
1987 : Returns: TRUE or FALSE
1988 : */
1989 :
1990 : static BOOL /* Looks for a closing "<terminator>]" after "[<terminator>"; on success *endptr points at the terminator */
1991 : check_posix_syntax(const uschar *ptr, const uschar **endptr)
1992 12 : {
1993 : int terminator; /* Don't combine these lines; the Solaris cc */
1994 12 : terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1995 68 : for (++ptr; *ptr != 0; ptr++) /* Scan from the character after the terminator up to the NUL */
1996 : {
1997 68 : if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else /* Escaped ']': the loop increment then steps past the ']' itself */
1998 : {
1999 68 : if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; /* Unescaped ']' reached before a closing sequence */
2000 64 : if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) /* Terminator immediately followed by ']' */
2001 : {
2002 8 : *endptr = ptr; /* Report the position of the terminator character, not of the ']' */
2003 8 : return TRUE;
2004 : }
2005 : }
2006 : }
2007 0 : return FALSE; /* Ran into the end of the string without finding a closing sequence */
2008 : }
2009 :
2010 :
2011 :
2012 :
2013 : /*************************************************
2014 : * Check POSIX class name *
2015 : *************************************************/
2016 :
2017 : /* This function is called to check the name given in a POSIX-style class entry
2018 : such as [:alnum:].
2019 :
2020 : Arguments:
2021 : ptr points to the first letter
2022 : len the length of the name
2023 :
2024 : Returns: a value representing the name, or -1 if unknown
2025 : */
2026 :
2027 : static int /* Linear search of the posix_names table; returns the index of the matching entry or -1 */
2028 : check_posix_name(const uschar *ptr, int len)
2029 8 : {
2030 8 : const char *pn = posix_names; /* Cursor into the packed name table */
2031 8 : register int yield = 0; /* Index of the entry currently being compared */
2032 108 : while (posix_name_lengths[yield] != 0) /* A zero length entry ends the table */
2033 : {
2034 100 : if (len == posix_name_lengths[yield] &&
2035 8 : strncmp((const char *)ptr, pn, len) == 0) return yield; /* Lengths must agree before the bytes are compared */
2036 92 : pn += posix_name_lengths[yield] + 1; /* Skip this name plus one extra byte (presumably its terminating NUL - confirm against the table definition) */
2037 92 : yield++;
2038 : }
2039 0 : return -1; /* No table entry matched */
2040 : }
2041 :
2042 :
2043 : /*************************************************
2044 : * Adjust OP_RECURSE items in repeated group *
2045 : *************************************************/
2046 :
2047 : /* OP_RECURSE items contain an offset from the start of the regex to the group
2048 : that is referenced. This means that groups can be replicated for fixed
2049 : repetition simply by copying (because the recursion is allowed to refer to
2050 : earlier groups that are outside the current group). However, when a group is
2051 : optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2052 : inserted before it, after it has been compiled. This means that any OP_RECURSE
2053 : items within it that refer to the group itself or any contained groups have to
2054 : have their offsets adjusted. That is one of the jobs of this function. Before it
2055 : is called, the partially compiled regex must be temporarily terminated with
2056 : OP_END.
2057 :
2058 : This function has been extended with the possibility of forward references for
2059 : recursions and subroutine calls. It must also check the list of such references
2060 : for the group we are dealing with. If it finds that one of the recursions in
2061 : the current group is on this list, it adjusts the offset in the list, not the
2062 : value in the reference (which is a group number).
2063 :
2064 : Arguments:
2065 : group points to the start of the group
2066 : adjust the amount by which the group is to be moved
2067 : utf8 TRUE in UTF-8 mode
2068 : cd contains pointers to tables etc.
2069 : save_hwm the hwm forward reference pointer at the start of the group
2070 :
2071 : Returns: nothing
2072 : */
2073 :
2074 : static void /* Adds "adjust" to the offset of each recursion item found from "group" onwards, or to its forward reference list entry */
2075 : adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2076 : uschar *save_hwm)
2077 810 : {
2078 810 : uschar *ptr = group;
2079 :
2080 1621 : while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) /* Visit each item located by find_recurse(), starting at the group */
2081 : {
2082 : int offset;
2083 : uschar *hc;
2084 :
2085 : /* See if this recursion is on the forward reference list. If so, adjust the
2086 : reference. */
2087 :
2088 1 : for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) /* List entries are LINK_SIZE bytes apart */
2089 : {
2090 0 : offset = GET(hc, 0);
2091 0 : if (cd->start_code + offset == ptr + 1) /* Entry refers to the link field that follows this item's opcode byte */
2092 : {
2093 0 : PUT(hc, 0, offset + adjust); /* Shift the offset recorded in the list */
2094 0 : break;
2095 : }
2096 : }
2097 :
2098 : /* Otherwise, adjust the recursion offset if it's after the start of this
2099 : group. */
2100 :
2101 1 : if (hc >= cd->hwm) /* True only when the loop above finished without a match */
2102 : {
2103 1 : offset = GET(ptr, 1);
2104 1 : if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); /* Targets at or beyond the group start are shifted */
2105 : }
2106 :
2107 1 : ptr += 1 + LINK_SIZE; /* Step over the opcode byte and its link field */
2108 : }
2109 810 : }
2110 :
2111 :
2112 :
2113 : /*************************************************
2114 : * Insert an automatic callout point *
2115 : *************************************************/
2116 :
2117 : /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2118 : callout points before each pattern item.
2119 :
2120 : Arguments:
2121 : code current code pointer
2122 : ptr current pattern pointer
2123 : cd pointers to tables etc
2124 :
2125 : Returns: new code pointer
2126 : */
2127 :
2128 : static uschar * /* Writes OP_CALLOUT, a number byte and two LINK_SIZE fields at "code"; returns the address just past them */
2129 : auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2130 0 : {
2131 0 : *code++ = OP_CALLOUT;
2132 0 : *code++ = 255; /* Callout number byte (presumably 255 identifies an automatic callout - confirm in the callout documentation) */
2133 0 : PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2134 0 : PUT(code, LINK_SIZE, 0); /* Default length */
2135 0 : return code + 2*LINK_SIZE; /* Skip the two fields just written */
2136 : }
2137 :
2138 :
2139 :
2140 : /*************************************************
2141 : * Complete a callout item *
2142 : *************************************************/
2143 :
2144 : /* A callout item contains the length of the next item in the pattern, which
2145 : we can't fill in till after we have reached the relevant point. This is used
2146 : for both automatic and manual callouts.
2147 :
2148 : Arguments:
2149 : previous_callout points to previous callout item
2150 : ptr current pattern pointer
2151 : cd pointers to tables etc
2152 :
2153 : Returns: nothing
2154 : */
2155 :
2156 : static void /* Fills in the length field of an earlier callout item from the current pattern position */
2157 : complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2158 0 : {
2159 0 : int length = ptr - cd->start_pattern - GET(previous_callout, 2); /* Current pattern offset minus the offset stored at byte 2 of the item */
2160 0 : PUT(previous_callout, 2 + LINK_SIZE, length); /* Stored in the field that follows the pattern offset */
2161 0 : }
2162 :
2163 :
2164 :
2165 : #ifdef SUPPORT_UCP
2166 : /*************************************************
2167 : * Get othercase range *
2168 : *************************************************/
2169 :
2170 : /* This function is passed the start and end of a class range, in UTF-8 mode
2171 : with UCP support. It searches up the characters, looking for internal ranges of
2172 : characters in the "other" case. Each call returns the next one, updating the
2173 : start address.
2174 :
2175 : Arguments:
2176 : cptr points to starting character value; updated
2177 : d end value
2178 : ocptr where to put start of othercase range
2179 : odptr where to put end of othercase range
2180 :
2181 : Yield: TRUE when range returned; FALSE when no more
2182 : */
2183 :
2184 : static BOOL /* Returns the next run of consecutive other-case values for characters in [*cptr, d] and advances *cptr */
2185 : get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2186 : unsigned int *odptr)
2187 0 : {
2188 : unsigned int c, othercase, next;
2189 :
2190 0 : for (c = *cptr; c <= d; c++) /* Find the first character whose other case differs from itself */
2191 0 : { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2192 :
2193 0 : if (c > d) return FALSE; /* No such character is left in the range */
2194 :
2195 0 : *ocptr = othercase; /* Start of the other-case range */
2196 0 : next = othercase + 1; /* Value the following character's other case must have to extend the run */
2197 :
2198 0 : for (++c; c <= d; c++)
2199 : {
2200 0 : if (UCD_OTHERCASE(c) != next) break; /* Run of consecutive other-case values ends here */
2201 0 : next++;
2202 : }
2203 :
2204 0 : *odptr = next - 1; /* Last other-case value that was accepted */
2205 0 : *cptr = c; /* The next call resumes from this character */
2206 :
2207 0 : return TRUE;
2208 : }
2209 : #endif /* SUPPORT_UCP */
2210 :
2211 :
2212 :
2213 : /*************************************************
2214 : * Check if auto-possessifying is possible *
2215 : *************************************************/
2216 :
2217 : /* This function is called for unlimited repeats of certain items, to see
2218 : whether the next thing could possibly match the repeated item. If not, it makes
2219 : sense to automatically possessify the repeated item.
2220 :
2221 : Arguments:
2222 : op_code the repeated op code
2223 : this data for this item, depends on the opcode
2224 : utf8 TRUE in UTF-8 mode
2225 : utf8_char used for utf8 character bytes, NULL if not relevant
2226 : ptr next character in pattern
2227 : options options bits
2228 : cd contains pointers to tables etc.
2229 :
2230 : Returns: TRUE if possessifying is wanted
2231 : */
2232 :
2233 : static BOOL /* Decides whether the pattern item at "ptr" can never match what the repeated item (op_code, item) matches */
2234 : check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2235 : const uschar *ptr, int options, compile_data *cd)
2236 51510 : {
2237 : int next; /* The following pattern item: >= 0 is a character, < 0 is an escape value */
2238 :
2239 : /* Skip whitespace and comments in extended mode */
2240 :
2241 51510 : if ((options & PCRE_EXTENDED) != 0)
2242 : {
2243 : for (;;)
2244 : {
2245 0 : while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2246 0 : if (*ptr == CHAR_NUMBER_SIGN) /* Comment: runs to the next newline or the end of the pattern */
2247 : {
2248 0 : while (*(++ptr) != 0)
2249 0 : if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2250 : }
2251 0 : else break;
2252 0 : }
2253 : }
2254 :
2255 : /* If the next item is one that we can handle, get its value. A non-negative
2256 : value is a character, a negative value is an escape value. */
2257 :
2258 51510 : if (*ptr == CHAR_BACKSLASH)
2259 : {
2260 12590 : int temperrorcode = 0;
2261 12590 : next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2262 12590 : if (temperrorcode != 0) return FALSE; /* check_escape() reported an error: do not possessify */
2263 12590 : ptr++; /* Point after the escape sequence */
2264 : }
2265 :
2266 38920 : else if ((cd->ctypes[*ptr] & ctype_meta) == 0) /* Not a metacharacter: take it as a literal */
2267 : {
2268 : #ifdef SUPPORT_UTF8
2269 36354 : if (utf8) { GETCHARINC(next, ptr); } else
2270 : #endif
2271 36354 : next = *ptr++;
2272 : }
2273 :
2274 2566 : else return FALSE; /* A metacharacter follows: give up */
2275 :
2276 : /* Skip whitespace and comments in extended mode */
2277 :
2278 48944 : if ((options & PCRE_EXTENDED) != 0)
2279 : {
2280 : for (;;)
2281 : {
2282 0 : while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2283 0 : if (*ptr == CHAR_NUMBER_SIGN)
2284 : {
2285 0 : while (*(++ptr) != 0)
2286 0 : if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2287 : }
2288 0 : else break;
2289 0 : }
2290 : }
2291 :
2292 : /* If the next thing is itself optional, we have to give up. */
2293 :
2294 48944 : if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2295 : strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) /* Three-character prefix: left curly bracket, 0, comma */
2296 294 : return FALSE;
2297 :
2298 : /* Now compare the next item with the previous opcode. If the previous is a
2299 : positive single character match, "item" either contains the character or, if
2300 : "item" is greater than 127 in utf8 mode, the character's bytes are in
2301 : utf8_char. */
2302 :
2303 :
2304 : /* Handle cases when the next item is a character. */
2305 :
2306 48650 : if (next >= 0) switch(op_code) /* Every case in this switch returns */
2307 : {
2308 : case OP_CHAR:
2309 : #ifdef SUPPORT_UTF8
2310 34 : if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2311 : #else
2312 : (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2313 : #endif
2314 34 : return item != next; /* Possessify only when the two characters differ */
2315 :
2316 : /* For CHARNC (caseless character) we must check the other case. If we have
2317 : Unicode property support, we can use it to test the other case of
2318 : high-valued characters. */
2319 :
2320 : case OP_CHARNC:
2321 : #ifdef SUPPORT_UTF8
2322 0 : if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2323 : #endif
2324 0 : if (item == next) return FALSE;
2325 : #ifdef SUPPORT_UTF8
2326 0 : if (utf8)
2327 : {
2328 : unsigned int othercase;
2329 0 : if (next < 128) othercase = cd->fcc[next]; else
2330 : #ifdef SUPPORT_UCP
2331 0 : othercase = UCD_OTHERCASE((unsigned int)next);
2332 : #else
2333 : othercase = NOTACHAR;
2334 : #endif
2335 0 : return (unsigned int)item != othercase;
2336 : }
2337 : else
2338 : #endif /* SUPPORT_UTF8 */
2339 0 : return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2340 :
2341 : /* For OP_NOT, "item" must be a single-byte character. */
2342 :
2343 : case OP_NOT:
2344 8 : if (item == next) return TRUE; /* The next character is exactly the excluded one */
2345 0 : if ((options & PCRE_CASELESS) == 0) return FALSE; /* Different character, case-sensitive: no possessification */
2346 : #ifdef SUPPORT_UTF8
2347 0 : if (utf8)
2348 : {
2349 : unsigned int othercase;
2350 0 : if (next < 128) othercase = cd->fcc[next]; else
2351 : #ifdef SUPPORT_UCP
2352 0 : othercase = UCD_OTHERCASE(next);
2353 : #else
2354 : othercase = NOTACHAR;
2355 : #endif
2356 0 : return (unsigned int)item == othercase;
2357 : }
2358 : else
2359 : #endif /* SUPPORT_UTF8 */
2360 0 : return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2361 :
2362 : case OP_DIGIT:
2363 47936 : return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; /* The ctypes table is consulted only for values up to 127 */
2364 :
2365 : case OP_NOT_DIGIT:
2366 2 : return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2367 :
2368 : case OP_WHITESPACE:
2369 64 : return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2370 :
2371 : case OP_NOT_WHITESPACE:
2372 2 : return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2373 :
2374 : case OP_WORDCHAR:
2375 2 : return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2376 :
2377 : case OP_NOT_WORDCHAR:
2378 0 : return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2379 :
2380 : case OP_HSPACE:
2381 : case OP_NOT_HSPACE:
2382 0 : switch(next) /* Code points treated here as horizontal space */
2383 : {
2384 : case 0x09:
2385 : case 0x20:
2386 : case 0xa0:
2387 : case 0x1680:
2388 : case 0x180e:
2389 : case 0x2000:
2390 : case 0x2001:
2391 : case 0x2002:
2392 : case 0x2003:
2393 : case 0x2004:
2394 : case 0x2005:
2395 : case 0x2006:
2396 : case 0x2007:
2397 : case 0x2008:
2398 : case 0x2009:
2399 : case 0x200A:
2400 : case 0x202f:
2401 : case 0x205f:
2402 : case 0x3000:
2403 0 : return op_code != OP_HSPACE;
2404 : default:
2405 0 : return op_code == OP_HSPACE;
2406 : }
2407 :
2408 : case OP_VSPACE:
2409 : case OP_NOT_VSPACE:
2410 0 : switch(next) /* Code points treated here as vertical space */
2411 : {
2412 : case 0x0a:
2413 : case 0x0b:
2414 : case 0x0c:
2415 : case 0x0d:
2416 : case 0x85:
2417 : case 0x2028:
2418 : case 0x2029:
2419 0 : return op_code != OP_VSPACE;
2420 : default:
2421 0 : return op_code == OP_VSPACE;
2422 : }
2423 :
2424 : default:
2425 358 : return FALSE; /* Any other repeated opcode is left alone */
2426 : }
2427 :
2428 :
2429 : /* Handle the case when the next item is \d, \s, etc. */
2430 :
2431 244 : switch(op_code) /* Reached only when next < 0, because the switch above always returns */
2432 : {
2433 : case OP_CHAR:
2434 : case OP_CHARNC:
2435 : #ifdef SUPPORT_UTF8
2436 0 : if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2437 : #endif
2438 0 : switch(-next) /* Test the repeated character against the class named by the escape */
2439 : {
2440 : case ESC_d:
2441 0 : return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2442 :
2443 : case ESC_D:
2444 0 : return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2445 :
2446 : case ESC_s:
2447 0 : return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2448 :
2449 : case ESC_S:
2450 0 : return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2451 :
2452 : case ESC_w:
2453 0 : return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2454 :
2455 : case ESC_W:
2456 0 : return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2457 :
2458 : case ESC_h:
2459 : case ESC_H:
2460 0 : switch(item)
2461 : {
2462 : case 0x09:
2463 : case 0x20:
2464 : case 0xa0:
2465 : case 0x1680:
2466 : case 0x180e:
2467 : case 0x2000:
2468 : case 0x2001:
2469 : case 0x2002:
2470 : case 0x2003:
2471 : case 0x2004:
2472 : case 0x2005:
2473 : case 0x2006:
2474 : case 0x2007:
2475 : case 0x2008:
2476 : case 0x2009:
2477 : case 0x200A:
2478 : case 0x202f:
2479 : case 0x205f:
2480 : case 0x3000:
2481 0 : return -next != ESC_h;
2482 : default:
2483 0 : return -next == ESC_h;
2484 : }
2485 :
2486 : case ESC_v:
2487 : case ESC_V:
2488 0 : switch(item)
2489 : {
2490 : case 0x0a:
2491 : case 0x0b:
2492 : case 0x0c:
2493 : case 0x0d:
2494 : case 0x85:
2495 : case 0x2028:
2496 : case 0x2029:
2497 0 : return -next != ESC_v;
2498 : default:
2499 0 : return -next == ESC_v;
2500 : }
2501 :
2502 : default:
2503 0 : return FALSE;
2504 : }
2505 :
2506 : case OP_DIGIT: /* From here on, both the repeated item and the next item are character types */
2507 32 : return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2508 : next == -ESC_h || next == -ESC_v;
2509 :
2510 : case OP_NOT_DIGIT:
2511 0 : return next == -ESC_d;
2512 :
2513 : case OP_WHITESPACE:
2514 206 : return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2515 :
2516 : case OP_NOT_WHITESPACE:
2517 2 : return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2518 :
2519 : case OP_HSPACE:
2520 0 : return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2521 :
2522 : case OP_NOT_HSPACE:
2523 0 : return next == -ESC_h;
2524 :
2525 : /* Can't have \S in here because VT matches \S (Perl anomaly) */
2526 : case OP_VSPACE:
2527 0 : return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2528 :
2529 : case OP_NOT_VSPACE:
2530 0 : return next == -ESC_v;
2531 :
2532 : case OP_WORDCHAR:
2533 2 : return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2534 :
2535 : case OP_NOT_WORDCHAR:
2536 0 : return next == -ESC_w || next == -ESC_d;
2537 :
2538 : default:
2539 2 : return FALSE; /* Any other repeated opcode is left alone */
2540 : }
2541 :
2542 : /* Control does not reach here */
2543 : }
2544 :
2545 :
2546 :
2547 : /*************************************************
2548 : * Compile one branch *
2549 : *************************************************/
2550 :
2551 : /* Scan the pattern, compiling it into a vector. If the options are
2552 : changed during the branch, the pointer is used to change the external options
2553 : bits. This function is used during the pre-compile phase when we are trying
2554 : to find out the amount of memory needed, as well as during the real compile
2555 : phase. The value of lengthptr distinguishes the two phases.
2556 :
2557 : Arguments:
2558 : optionsptr pointer to the option bits
2559 : codeptr points to the pointer to the current code point
2560 : ptrptr points to the current pattern pointer
2561 : errorcodeptr points to error code variable
2562 : firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2563 : reqbyteptr set to the last literal character required, else < 0
2564 : bcptr points to current branch chain
2565 : cd contains pointers to tables etc.
2566 : lengthptr NULL during the real compile phase
2567 : points to length accumulator during pre-compile phase
2568 :
2569 : Returns: TRUE on success
2570 : FALSE, with *errorcodeptr set non-zero on error
2571 : */
2572 :
2573 : static BOOL
2574 : compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2575 : int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2576 : compile_data *cd, int *lengthptr)
2577 38602 : {
2578 : int repeat_type, op_type;
2579 38602 : int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2580 38602 : int bravalue = 0;
2581 : int greedy_default, greedy_non_default;
2582 : int firstbyte, reqbyte;
2583 : int zeroreqbyte, zerofirstbyte;
2584 : int req_caseopt, reqvary, tempreqvary;
2585 38602 : int options = *optionsptr;
2586 38602 : int after_manual_callout = 0;
2587 38602 : int length_prevgroup = 0;
2588 : register int c;
2589 38602 : register uschar *code = *codeptr;
2590 38602 : uschar *last_code = code;
2591 38602 : uschar *orig_code = code;
2592 : uschar *tempcode;
2593 38602 : BOOL inescq = FALSE;
2594 38602 : BOOL groupsetfirstbyte = FALSE;
2595 38602 : const uschar *ptr = *ptrptr;
2596 : const uschar *tempptr;
2597 38602 : uschar *previous = NULL;
2598 38602 : uschar *previous_callout = NULL;
2599 38602 : uschar *save_hwm = NULL;
2600 : uschar classbits[32];
2601 :
2602 : #ifdef SUPPORT_UTF8
2603 : BOOL class_utf8;
2604 38602 : BOOL utf8 = (options & PCRE_UTF8) != 0;
2605 : uschar *class_utf8data;
2606 : uschar *class_utf8data_base;
2607 : uschar utf8_char[6];
2608 : #else
2609 : BOOL utf8 = FALSE;
2610 : uschar *utf8_char = NULL;
2611 : #endif
2612 :
2613 : #ifdef DEBUG
2614 : if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2615 : #endif
2616 :
2617 : /* Set up the default and non-default settings for greediness */
2618 :
2619 38602 : greedy_default = ((options & PCRE_UNGREEDY) != 0);
2620 38602 : greedy_non_default = greedy_default ^ 1;
2621 :
2622 : /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2623 : matching encountered yet". It gets changed to REQ_NONE if we hit something that
2624 : matches a non-fixed char first char; reqbyte just remains unset if we never
2625 : find one.
2626 :
2627 : When we hit a repeat whose minimum is zero, we may have to adjust these values
2628 : to take the zero repeat into account. This is implemented by setting them to
2629 : zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2630 : item types that can be repeated set these backoff variables appropriately. */
2631 :
2632 38602 : firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2633 :
2634 : /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2635 : according to the current setting of the caseless flag. REQ_CASELESS is a bit
2636 : value > 255. It is added into the firstbyte or reqbyte variables to record the
2637 : case status of the value. This is used only for ASCII characters. */
2638 :
2639 38602 : req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2640 :
2641 : /* Switch on next character until the end of the branch */
2642 :
2643 29590902 : for (;; ptr++)
2644 : {
2645 : BOOL negate_class;
2646 : BOOL should_flip_negation;
2647 : BOOL possessive_quantifier;
2648 : BOOL is_quantifier;
2649 : BOOL is_recurse;
2650 : BOOL reset_bracount;
2651 : int class_charcount;
2652 : int class_lastchar;
2653 : int newoptions;
2654 : int recno;
2655 : int refsign;
2656 : int skipbytes;
2657 : int subreqbyte;
2658 : int subfirstbyte;
2659 : int terminator;
2660 : int mclength;
2661 : uschar mcbuffer[8];
2662 :
2663 : /* Get next byte in the pattern */
2664 :
2665 29629504 : c = *ptr;
2666 :
2667 : /* If we are in the pre-compile phase, accumulate the length used for the
2668 : previous cycle of this loop. */
2669 :
2670 29629504 : if (lengthptr != NULL)
2671 : {
2672 : #ifdef DEBUG
2673 : if (code > cd->hwm) cd->hwm = code; /* High water info */
2674 : #endif
2675 14814756 : if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2676 : {
2677 0 : *errorcodeptr = ERR52;
2678 0 : goto FAILED;
2679 : }
2680 :
2681 : /* There is at least one situation where code goes backwards: this is the
2682 : case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2683 : the class is simply eliminated. However, it is created first, so we have to
2684 : allow memory for it. Therefore, don't ever reduce the length at this point.
2685 : */
2686 :
2687 14814756 : if (code < last_code) code = last_code;
2688 :
2689 : /* Paranoid check for integer overflow */
2690 :
2691 14814756 : if (OFLOW_MAX - *lengthptr < code - last_code)
2692 : {
2693 0 : *errorcodeptr = ERR20;
2694 0 : goto FAILED;
2695 : }
2696 :
2697 14814756 : *lengthptr += code - last_code;
2698 : DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2699 :
2700 : /* If "previous" is set and it is not at the start of the work space, move
2701 : it back to there, in order to avoid filling up the work space. Otherwise,
2702 : if "previous" is NULL, reset the current code pointer to the start. */
2703 :
2704 14814756 : if (previous != NULL)
2705 : {
2706 14735115 : if (previous > orig_code)
2707 : {
2708 14673990 : memmove(orig_code, previous, code - previous);
2709 14673990 : code -= previous - orig_code;
2710 14673990 : previous = orig_code;
2711 : }
2712 : }
2713 79641 : else code = orig_code;
2714 :
2715 : /* Remember where this code item starts so we can pick up the length
2716 : next time round. */
2717 :
2718 14814756 : last_code = code;
2719 : }
2720 :
2721 : /* In the real compile phase, just check the workspace used by the forward
2722 : reference list. */
2723 :
2724 14814748 : else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2725 : {
2726 0 : *errorcodeptr = ERR52;
2727 0 : goto FAILED;
2728 : }
2729 :
2730 : /* If in \Q...\E, check for the end; if not, we have a literal */
2731 :
2732 29629504 : if (inescq && c != 0)
2733 : {
2734 0 : if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2735 : {
2736 0 : inescq = FALSE;
2737 0 : ptr++;
2738 0 : continue;
2739 : }
2740 : else
2741 : {
2742 0 : if (previous_callout != NULL)
2743 : {
2744 0 : if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2745 0 : complete_callout(previous_callout, ptr, cd);
2746 0 : previous_callout = NULL;
2747 : }
2748 0 : if ((options & PCRE_AUTO_CALLOUT) != 0)
2749 : {
2750 0 : previous_callout = code;
2751 0 : code = auto_callout(code, ptr, cd);
2752 : }
2753 0 : goto NORMAL_CHAR;
2754 : }
2755 : }
2756 :
2757 : /* Fill in length of a previous callout, except when the next thing is
2758 : a quantifier. */
2759 :
2760 29629504 : is_quantifier =
2761 : c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2762 : (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2763 :
2764 29629504 : if (!is_quantifier && previous_callout != NULL &&
2765 : after_manual_callout-- <= 0)
2766 : {
2767 0 : if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2768 0 : complete_callout(previous_callout, ptr, cd);
2769 0 : previous_callout = NULL;
2770 : }
2771 :
2772 : /* In extended mode, skip white space and comments */
2773 :
2774 29629504 : if ((options & PCRE_EXTENDED) != 0)
2775 : {
2776 222 : if ((cd->ctypes[c] & ctype_space) != 0) continue;
2777 136 : if (c == CHAR_NUMBER_SIGN)
2778 : {
2779 0 : while (*(++ptr) != 0)
2780 : {
2781 0 : if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2782 : }
2783 0 : if (*ptr != 0) continue;
2784 :
2785 : /* Else fall through to handle end of string */
2786 0 : c = 0;
2787 : }
2788 : }
2789 :
2790 : /* No auto callout for quantifiers. */
2791 :
2792 29629418 : if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2793 : {
2794 0 : previous_callout = code;
2795 0 : code = auto_callout(code, ptr, cd);
2796 : }
2797 :
2798 29629418 : switch(c)
2799 : {
2800 : /* ===================================================================*/
2801 : case 0: /* The branch terminates at string end */
2802 : case CHAR_VERTICAL_LINE: /* or | or ) */
2803 : case CHAR_RIGHT_PARENTHESIS:
2804 38598 : *firstbyteptr = firstbyte;
2805 38598 : *reqbyteptr = reqbyte;
2806 38598 : *codeptr = code;
2807 38598 : *ptrptr = ptr;
2808 38598 : if (lengthptr != NULL)
2809 : {
2810 19299 : if (OFLOW_MAX - *lengthptr < code - last_code)
2811 : {
2812 0 : *errorcodeptr = ERR20;
2813 0 : goto FAILED;
2814 : }
2815 19299 : *lengthptr += code - last_code; /* To include callout length */
2816 : DPRINTF((">> end branch\n"));
2817 : }
2818 38598 : return TRUE;
2819 :
2820 :
2821 : /* ===================================================================*/
2822 : /* Handle single-character metacharacters. In multiline mode, ^ disables
2823 : the setting of any following char as a first character. */
2824 :
2825 : case CHAR_CIRCUMFLEX_ACCENT:
2826 11502 : if ((options & PCRE_MULTILINE) != 0)
2827 : {
2828 6 : if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2829 : }
2830 11502 : previous = NULL;
2831 11502 : *code++ = OP_CIRC;
2832 11502 : break;
2833 :
2834 : case CHAR_DOLLAR_SIGN:
2835 11200 : previous = NULL;
2836 11200 : *code++ = OP_DOLL;
2837 11200 : break;
2838 :
2839 : /* There can never be a first char if '.' is first, whatever happens about
2840 : repeats. The value of reqbyte doesn't change either. */
2841 :
2842 : case CHAR_DOT:
2843 16790503 : if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2844 16790503 : zerofirstbyte = firstbyte;
2845 16790503 : zeroreqbyte = reqbyte;
2846 16790503 : previous = code;
2847 16790503 : *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2848 16790503 : break;
2849 :
2850 :
2851 : /* ===================================================================*/
2852 : /* Character classes. If the included characters are all < 256, we build a
2853 : 32-byte bitmap of the permitted characters, except in the special case
2854 : where there is only one such character. For negated classes, we build the
2855 : map as usual, then invert it at the end. However, we use a different opcode
2856 : so that data characters > 255 can be handled correctly.
2857 :
2858 : If the class contains characters outside the 0-255 range, a different
2859 : opcode is compiled. It may optionally have a bit map for characters < 256,
2860 : but those above are explicitly listed afterwards. A flag byte tells
2861 : whether the bitmap is present, and whether this is a negated class or not.
2862 :
2863 : In JavaScript compatibility mode, an isolated ']' causes an error. In
2864 : default (Perl) mode, it is treated as a data character. */
2865 :
2866 : case CHAR_RIGHT_SQUARE_BRACKET:
2867 6 : if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2868 : {
2869 0 : *errorcodeptr = ERR64;
2870 0 : goto FAILED;
2871 : }
2872 6 : goto NORMAL_CHAR;
2873 :
2874 : case CHAR_LEFT_SQUARE_BRACKET:
2875 46541 : previous = code;
2876 :
2877 : /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2878 : they are encountered at the top level, so we'll do that too. */
2879 :
2880 46541 : if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2881 : ptr[1] == CHAR_EQUALS_SIGN) &&
2882 : check_posix_syntax(ptr, &tempptr))
2883 : {
2884 0 : *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2885 0 : goto FAILED;
2886 : }
2887 :
2888 : /* If the first character is '^', set the negation flag and skip it. Also,
2889 : if the first few characters (either before or after ^) are \Q\E or \E we
2890 : skip them too. This makes for compatibility with Perl. */
2891 :
2892 46541 : negate_class = FALSE;
2893 : for (;;)
2894 : {
2895 89061 : c = *(++ptr);
2896 89061 : if (c == CHAR_BACKSLASH)
2897 : {
2898 42608 : if (ptr[1] == CHAR_E)
2899 0 : ptr++;
2900 42608 : else if (strncmp((const char *)ptr+1,
2901 : STR_Q STR_BACKSLASH STR_E, 3) == 0)
2902 0 : ptr += 3;
2903 : else
2904 42608 : break;
2905 : }
2906 46453 : else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2907 42520 : negate_class = TRUE;
2908 : else break;
2909 42520 : }
2910 :
2911 : /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2912 : an initial ']' is taken as a data character -- the code below handles
2913 : that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2914 : [^] must match any character, so generate OP_ALLANY. */
2915 :
2916 46541 : if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2917 : (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2918 : {
2919 0 : *code++ = negate_class? OP_ALLANY : OP_FAIL;
2920 0 : if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2921 0 : zerofirstbyte = firstbyte;
2922 0 : break;
2923 : }
2924 :
2925 : /* If a class contains a negative special such as \S, we need to flip the
2926 : negation flag at the end, so that support for characters > 255 works
2927 : correctly (they are all included in the class). */
2928 :
2929 46541 : should_flip_negation = FALSE;
2930 :
2931 : /* Keep a count of chars with values < 256 so that we can optimize the case
2932 : of just a single character (as long as it's < 256). However, for higher
2933 : valued UTF-8 characters, we don't yet do any optimization. */
2934 :
2935 46541 : class_charcount = 0;
2936 46541 : class_lastchar = -1;
2937 :
2938 : /* Initialize the 32-char bit map to all zeros. We build the map in a
2939 : temporary bit of memory, in case the class contains only 1 character (less
2940 : than 256), because in that case the compiled code doesn't use the bit map.
2941 : */
2942 :
2943 46541 : memset(classbits, 0, 32 * sizeof(uschar));
2944 :
2945 : #ifdef SUPPORT_UTF8
2946 46541 : class_utf8 = FALSE; /* No chars >= 256 */
2947 46541 : class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2948 46541 : class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2949 : #endif
2950 :
2951 : /* Process characters until ] is reached. By writing this as a "do" it
2952 : means that an initial ] is taken as a data character. At the start of the
2953 : loop, c contains the first byte of the character. */
2954 :
2955 46541 : if (c != 0) do
2956 : {
2957 : const uschar *oldptr;
2958 :
2959 : #ifdef SUPPORT_UTF8
2960 92820 : if (utf8 && c > 127)
2961 : { /* Braces are required because the */
2962 0 : GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2963 : }
2964 :
2965 : /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2966 : data and reset the pointer. This is so that very large classes that
2967 : contain a zillion UTF-8 characters no longer overwrite the work space
2968 : (which is on the stack). */
2969 :
2970 92820 : if (lengthptr != NULL)
2971 : {
2972 46410 : *lengthptr += class_utf8data - class_utf8data_base;
2973 46410 : class_utf8data = class_utf8data_base;
2974 : }
2975 :
2976 : #endif
2977 :
2978 : /* Inside \Q...\E everything is literal except \E */
2979 :
2980 92820 : if (inescq)
2981 : {
2982 0 : if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2983 : {
2984 0 : inescq = FALSE; /* Reset literal state */
2985 0 : ptr++; /* Skip the 'E' */
2986 0 : continue; /* Carry on with next */
2987 : }
2988 0 : goto CHECK_RANGE; /* Could be range if \E follows */
2989 : }
2990 :
2991 : /* Handle POSIX class names. Perl allows a negation extension of the
2992 : form [:^name:]. A square bracket that doesn't match the syntax is
2993 : treated as a literal. We also recognize the POSIX constructions
2994 : [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2995 : 5.6 and 5.8 do. */
2996 :
2997 92820 : if (c == CHAR_LEFT_SQUARE_BRACKET &&
2998 : (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2999 : ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3000 : {
3001 8 : BOOL local_negate = FALSE;
3002 : int posix_class, taboffset, tabopt;
3003 8 : register const uschar *cbits = cd->cbits;
3004 : uschar pbits[32];
3005 :
3006 8 : if (ptr[1] != CHAR_COLON)
3007 : {
3008 0 : *errorcodeptr = ERR31;
3009 0 : goto FAILED;
3010 : }
3011 :
3012 8 : ptr += 2;
3013 8 : if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3014 : {
3015 0 : local_negate = TRUE;
3016 0 : should_flip_negation = TRUE; /* Note negative special */
3017 0 : ptr++;
3018 : }
3019 :
3020 8 : posix_class = check_posix_name(ptr, tempptr - ptr);
3021 8 : if (posix_class < 0)
3022 : {
3023 0 : *errorcodeptr = ERR30;
3024 0 : goto FAILED;
3025 : }
3026 :
3027 : /* If matching is caseless, upper and lower are converted to
3028 : alpha. This relies on the fact that the class table starts with
3029 : alpha, lower, upper as the first 3 entries. */
3030 :
3031 8 : if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3032 0 : posix_class = 0;
3033 :
3034 : /* We build the bit map for the POSIX class in a chunk of local store
3035 : because we may be adding and subtracting from it, and we don't want to
3036 : subtract bits that may be in the main map already. At the end we or the
3037 : result into the bit map that is being built. */
3038 :
3039 8 : posix_class *= 3;
3040 :
3041 : /* Copy in the first table (always present) */
3042 :
3043 8 : memcpy(pbits, cbits + posix_class_maps[posix_class],
3044 : 32 * sizeof(uschar));
3045 :
3046 : /* If there is a second table, add or remove it as required. */
3047 :
3048 8 : taboffset = posix_class_maps[posix_class + 1];
3049 8 : tabopt = posix_class_maps[posix_class + 2];
3050 :
3051 8 : if (taboffset >= 0)
3052 : {
3053 0 : if (tabopt >= 0)
3054 0 : for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3055 : else
3056 0 : for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3057 : }
3058 :
3059 : /* Now see if we need to remove any special characters. An option
3060 : value of 1 removes vertical space and 2 removes underscore. */
3061 :
3062 8 : if (tabopt < 0) tabopt = -tabopt;
3063 8 : if (tabopt == 1) pbits[1] &= ~0x3c;
3064 8 : else if (tabopt == 2) pbits[11] &= 0x7f;
3065 :
3066 : /* Add the POSIX table or its complement into the main table that is
3067 : being built and we are done. */
3068 :
3069 8 : if (local_negate)
3070 0 : for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3071 : else
3072 8 : for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3073 :
3074 8 : ptr = tempptr + 1;
3075 8 : class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3076 8 : continue; /* End of POSIX syntax handling */
3077 : }
3078 :
3079 : /* Backslash may introduce a single character, or it may introduce one
3080 : of the specials, which just set a flag. The sequence \b is a special
3081 : case. Inside a class (and only there) it is treated as backspace.
3082 : Elsewhere it marks a word boundary. Other escapes have preset maps ready
3083 : to 'or' into the one we are building. We assume they have more than one
3084 : character in them, so set class_charcount bigger than one. */
3085 :
3086 92812 : if (c == CHAR_BACKSLASH)
3087 : {
3088 85722 : c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3089 85722 : if (*errorcodeptr != 0) goto FAILED;
3090 :
3091 85722 : if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3092 85712 : else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3093 85712 : else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3094 85712 : else if (-c == ESC_Q) /* Handle start of quoted string */
3095 : {
3096 0 : if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3097 : {
3098 0 : ptr += 2; /* avoid empty string */
3099 : }
3100 0 : else inescq = TRUE;
3101 0 : continue;
3102 : }
3103 85712 : else if (-c == ESC_E) continue; /* Ignore orphan \E */
3104 :
3105 85722 : if (c < 0)
3106 : {
3107 186 : register const uschar *cbits = cd->cbits;
3108 186 : class_charcount += 2; /* Greater than 1 is what matters */
3109 :
3110 : /* Save time by not doing this in the pre-compile phase. */
3111 :
3112 186 : if (lengthptr == NULL) switch (-c)
3113 : {
3114 : case ESC_d:
3115 1 : for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3116 1 : continue;
3117 :
3118 : case ESC_D:
3119 0 : should_flip_negation = TRUE;
3120 0 : for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3121 0 : continue;
3122 :
3123 : case ESC_w:
3124 57 : for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3125 57 : continue;
3126 :
3127 : case ESC_W:
3128 0 : should_flip_negation = TRUE;
3129 0 : for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3130 0 : continue;
3131 :
3132 : case ESC_s:
3133 3 : for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3134 3 : classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3135 3 : continue;
3136 :
3137 : case ESC_S:
3138 0 : should_flip_negation = TRUE;
3139 0 : for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3140 0 : classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3141 0 : continue;
3142 :
3143 : default: /* Not recognized; fall through */
3144 : break; /* Need "default" setting to stop compiler warning. */
3145 : }
3146 :
3147 : /* In the pre-compile phase, just do the recognition. */
3148 :
3149 93 : else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3150 : c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3151 :
3152 : /* We need to deal with \H, \h, \V, and \v in both phases because
3153 : they use extra memory. */
3154 :
3155 64 : if (-c == ESC_h)
3156 : {
3157 58 : SETBIT(classbits, 0x09); /* HT */
3158 58 : SETBIT(classbits, 0x20); /* SPACE */
3159 58 : SETBIT(classbits, 0xa0); /* NBSP */
3160 : #ifdef SUPPORT_UTF8
3161 58 : if (utf8)
3162 : {
3163 0 : class_utf8 = TRUE;
3164 0 : *class_utf8data++ = XCL_SINGLE;
3165 0 : class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3166 0 : *class_utf8data++ = XCL_SINGLE;
3167 0 : class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3168 0 : *class_utf8data++ = XCL_RANGE;
3169 0 : class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3170 0 : class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3171 0 : *class_utf8data++ = XCL_SINGLE;
3172 0 : class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3173 0 : *class_utf8data++ = XCL_SINGLE;
3174 0 : class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3175 0 : *class_utf8data++ = XCL_SINGLE;
3176 0 : class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3177 : }
3178 : #endif
3179 58 : continue;
3180 : }
3181 :
3182 6 : if (-c == ESC_H)
3183 : {
3184 0 : for (c = 0; c < 32; c++)
3185 : {
3186 0 : int x = 0xff;
3187 0 : switch (c)
3188 : {
3189 0 : case 0x09/8: x ^= 1 << (0x09%8); break;
3190 0 : case 0x20/8: x ^= 1 << (0x20%8); break;
3191 0 : case 0xa0/8: x ^= 1 << (0xa0%8); break;
3192 : default: break;
3193 : }
3194 0 : classbits[c] |= x;
3195 : }
3196 :
3197 : #ifdef SUPPORT_UTF8
3198 0 : if (utf8)
3199 : {
3200 0 : class_utf8 = TRUE;
3201 0 : *class_utf8data++ = XCL_RANGE;
3202 0 : class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3203 0 : class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3204 0 : *class_utf8data++ = XCL_RANGE;
3205 0 : class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3206 0 : class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3207 0 : *class_utf8data++ = XCL_RANGE;
3208 0 : class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3209 0 : class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3210 0 : *class_utf8data++ = XCL_RANGE;
3211 0 : class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3212 0 : class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3213 0 : *class_utf8data++ = XCL_RANGE;
3214 0 : class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3215 0 : class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3216 0 : *class_utf8data++ = XCL_RANGE;
3217 0 : class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3218 0 : class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3219 0 : *class_utf8data++ = XCL_RANGE;
3220 0 : class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3221 0 : class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3222 : }
3223 : #endif
3224 0 : continue;
3225 : }
3226 :
3227 6 : if (-c == ESC_v)
3228 : {
3229 0 : SETBIT(classbits, 0x0a); /* LF */
3230 0 : SETBIT(classbits, 0x0b); /* VT */
3231 0 : SETBIT(classbits, 0x0c); /* FF */
3232 0 : SETBIT(classbits, 0x0d); /* CR */
3233 0 : SETBIT(classbits, 0x85); /* NEL */
3234 : #ifdef SUPPORT_UTF8
3235 0 : if (utf8)
3236 : {
3237 0 : class_utf8 = TRUE;
3238 0 : *class_utf8data++ = XCL_RANGE;
3239 0 : class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3240 0 : class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3241 : }
3242 : #endif
3243 0 : continue;
3244 : }
3245 :
3246 6 : if (-c == ESC_V)
3247 : {
3248 0 : for (c = 0; c < 32; c++)
3249 : {
3250 0 : int x = 0xff;
3251 0 : switch (c)
3252 : {
3253 0 : case 0x0a/8: x ^= 1 << (0x0a%8);
3254 0 : x ^= 1 << (0x0b%8);
3255 0 : x ^= 1 << (0x0c%8);
3256 0 : x ^= 1 << (0x0d%8);
3257 0 : break;
3258 0 : case 0x85/8: x ^= 1 << (0x85%8); break;
3259 : default: break;
3260 : }
3261 0 : classbits[c] |= x;
3262 : }
3263 :
3264 : #ifdef SUPPORT_UTF8
3265 0 : if (utf8)
3266 : {
3267 0 : class_utf8 = TRUE;
3268 0 : *class_utf8data++ = XCL_RANGE;
3269 0 : class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3270 0 : class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3271 0 : *class_utf8data++ = XCL_RANGE;
3272 0 : class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3273 0 : class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3274 : }
3275 : #endif
3276 0 : continue;
3277 : }
3278 :
3279 : /* We need to deal with \P and \p in both phases. */
3280 :
3281 : #ifdef SUPPORT_UCP
3282 6 : if (-c == ESC_p || -c == ESC_P)
3283 : {
3284 : BOOL negated;
3285 : int pdata;
3286 6 : int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3287 6 : if (ptype < 0) goto FAILED;
3288 6 : class_utf8 = TRUE;
3289 6 : *class_utf8data++ = ((-c == ESC_p) != negated)?
3290 : XCL_PROP : XCL_NOTPROP;
3291 6 : *class_utf8data++ = ptype;
3292 6 : *class_utf8data++ = pdata;
3293 6 : class_charcount -= 2; /* Not a < 256 character */
3294 6 : continue;
3295 : }
3296 : #endif
3297 : /* Unrecognized escapes are faulted if PCRE is running in its
3298 : strict mode. By default, for compatibility with Perl, they are
3299 : treated as literals. */
3300 :
3301 0 : if ((options & PCRE_EXTRA) != 0)
3302 : {
3303 0 : *errorcodeptr = ERR7;
3304 0 : goto FAILED;
3305 : }
3306 :
3307 0 : class_charcount -= 2; /* Undo the default count from above */
3308 0 : c = *ptr; /* Get the final character and fall through */
3309 : }
3310 :
3311 : /* Fall through if we have a single character (c >= 0). This may be
3312 : greater than 256 in UTF-8 mode. */
3313 :
3314 : } /* End of backslash handling */
3315 :
3316 : /* A single character may be followed by '-' to form a range. However,
3317 : Perl does not permit ']' to be the end of the range. A '-' character
3318 : at the end is treated as a literal. Perl ignores orphaned \E sequences
3319 : entirely. The code for handling \Q and \E is messy. */
3320 :
3321 92626 : CHECK_RANGE:
3322 185252 : while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3323 : {
3324 0 : inescq = FALSE;
3325 0 : ptr += 2;
3326 : }
3327 :
3328 92626 : oldptr = ptr;
3329 :
3330 : /* Remember \r or \n */
3331 :
3332 92626 : if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3333 :
3334 : /* Check for range */
3335 :
3336 92626 : if (!inescq && ptr[1] == CHAR_MINUS)
3337 : {
3338 : int d;
3339 3734 : ptr += 2;
3340 3734 : while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3341 :
3342 : /* If we hit \Q (not followed by \E) at this point, go into escaped
3343 : mode. */
3344 :
3345 7468 : while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3346 : {
3347 0 : ptr += 2;
3348 0 : if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3349 0 : { ptr += 2; continue; }
3350 0 : inescq = TRUE;
3351 0 : break;
3352 : }
3353 :
3354 3734 : if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3355 : {
3356 1470 : ptr = oldptr;
3357 1470 : goto LONE_SINGLE_CHARACTER;
3358 : }
3359 :
3360 : #ifdef SUPPORT_UTF8
3361 2264 : if (utf8)
3362 : { /* Braces are required because the */
3363 4 : GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3364 : }
3365 : else
3366 : #endif
3367 2260 : d = *ptr; /* Not UTF-8 mode */
3368 :
3369 : /* The second part of a range can be a single-character escape, but
3370 : not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3371 : in such circumstances. */
3372 :
3373 2264 : if (!inescq && d == CHAR_BACKSLASH)
3374 : {
3375 0 : d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3376 0 : if (*errorcodeptr != 0) goto FAILED;
3377 :
3378 : /* \b is backspace; \X is literal X; \R is literal R; any other
3379 : special means the '-' was literal */
3380 :
3381 0 : if (d < 0)
3382 : {
3383 0 : if (d == -ESC_b) d = CHAR_BS;
3384 0 : else if (d == -ESC_X) d = CHAR_X;
3385 0 : else if (d == -ESC_R) d = CHAR_R; else
3386 : {
3387 0 : ptr = oldptr;
3388 0 : goto LONE_SINGLE_CHARACTER; /* A few lines below */
3389 : }
3390 : }
3391 : }
3392 :
3393 : /* Check that the two values are in the correct order. Optimize
3394 : one-character ranges */
3395 :
3396 2264 : if (d < c)
3397 : {
3398 0 : *errorcodeptr = ERR8;
3399 0 : goto FAILED;
3400 : }
3401 :
3402 2264 : if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3403 :
3404 : /* Remember \r or \n */
3405 :
3406 2264 : if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3407 :
3408 : /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3409 : matching, we have to use an XCLASS with extra data items. Caseless
3410 : matching for characters > 127 is available only if UCP support is
3411 : available. */
3412 :
3413 : #ifdef SUPPORT_UTF8
3414 2264 : if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3415 : {
3416 0 : class_utf8 = TRUE;
3417 :
3418 : /* With UCP support, we can find the other case equivalents of
3419 : the relevant characters. There may be several ranges. Optimize how
3420 : they fit with the basic range. */
3421 :
3422 : #ifdef SUPPORT_UCP
3423 0 : if ((options & PCRE_CASELESS) != 0)
3424 : {
3425 : unsigned int occ, ocd;
3426 0 : unsigned int cc = c;
3427 0 : unsigned int origd = d;
3428 0 : while (get_othercase_range(&cc, origd, &occ, &ocd))
3429 : {
3430 0 : if (occ >= (unsigned int)c &&
3431 : ocd <= (unsigned int)d)
3432 0 : continue; /* Skip embedded ranges */
3433 :
3434 0 : if (occ < (unsigned int)c &&
3435 : ocd >= (unsigned int)c - 1) /* Extend the basic range */
3436 : { /* if there is overlap, */
3437 0 : c = occ; /* noting that if occ < c */
3438 0 : continue; /* we can't have ocd > d */
3439 : } /* because a subrange is */
3440 0 : if (ocd > (unsigned int)d &&
3441 : occ <= (unsigned int)d + 1) /* always shorter than */
3442 : { /* the basic range. */
3443 0 : d = ocd;
3444 0 : continue;
3445 : }
3446 :
3447 0 : if (occ == ocd)
3448 : {
3449 0 : *class_utf8data++ = XCL_SINGLE;
3450 : }
3451 : else
3452 : {
3453 0 : *class_utf8data++ = XCL_RANGE;
3454 0 : class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3455 : }
3456 0 : class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3457 : }
3458 : }
3459 : #endif /* SUPPORT_UCP */
3460 :
3461 : /* Now record the original range, possibly modified for UCP caseless
3462 : overlapping ranges. */
3463 :
3464 0 : *class_utf8data++ = XCL_RANGE;
3465 0 : class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3466 0 : class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3467 :
3468 : /* With UCP support, we are done. Without UCP support, there is no
3469 : caseless matching for UTF-8 characters > 127; we can use the bit map
3470 : for the smaller ones. */
3471 :
3472 : #ifdef SUPPORT_UCP
3473 0 : continue; /* With next character in the class */
3474 : #else
3475 : if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3476 :
3477 : /* Adjust upper limit and fall through to set up the map */
3478 :
3479 : d = 127;
3480 :
3481 : #endif /* SUPPORT_UCP */
3482 : }
3483 : #endif /* SUPPORT_UTF8 */
3484 :
3485 : /* We use the bit map for all cases when not in UTF-8 mode; else
3486 : ranges that lie entirely within 0-127 when there is UCP support; else
3487 : for partial ranges without UCP support. */
3488 :
3489 2264 : class_charcount += d - c + 1;
3490 2264 : class_lastchar = d;
3491 :
3492 : /* We can save a bit of time by skipping this in the pre-compile. */
3493 :
3494 16398 : if (lengthptr == NULL) for (; c <= d; c++)
3495 : {
3496 14134 : classbits[c/8] |= (1 << (c&7));
3497 14134 : if ((options & PCRE_CASELESS) != 0)
3498 : {
3499 104 : int uc = cd->fcc[c]; /* flip case */
3500 104 : classbits[uc/8] |= (1 << (uc&7));
3501 : }
3502 : }
3503 :
3504 2264 : continue; /* Go get the next char in the class */
3505 : }
3506 :
3507 : /* Handle a lone single character - we can get here for a normal
3508 : non-escape char, or after \ that introduces a single character or for an
3509 : apparent range that isn't. */
3510 :
3511 90362 : LONE_SINGLE_CHARACTER:
3512 :
3513 : /* Handle a character that cannot go in the bit map */
3514 :
3515 : #ifdef SUPPORT_UTF8
3516 90362 : if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3517 : {
3518 0 : class_utf8 = TRUE;
3519 0 : *class_utf8data++ = XCL_SINGLE;
3520 0 : class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3521 :
3522 : #ifdef SUPPORT_UCP
3523 0 : if ((options & PCRE_CASELESS) != 0)
3524 : {
3525 : unsigned int othercase;
3526 0 : if ((othercase = UCD_OTHERCASE(c)) != c)
3527 : {
3528 0 : *class_utf8data++ = XCL_SINGLE;
3529 0 : class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3530 : }
3531 : }
3532 : #endif /* SUPPORT_UCP */
3533 :
3534 : }
3535 : else
3536 : #endif /* SUPPORT_UTF8 */
3537 :
3538 : /* Handle a single-byte character */
3539 : {
3540 90362 : classbits[c/8] |= (1 << (c&7));
3541 90362 : if ((options & PCRE_CASELESS) != 0)
3542 : {
3543 12 : c = cd->fcc[c]; /* flip case */
3544 12 : classbits[c/8] |= (1 << (c&7));
3545 : }
3546 90362 : class_charcount++;
3547 90362 : class_lastchar = c;
3548 : }
3549 : }
3550 :
3551 : /* Loop until ']' reached. This "while" is the end of the "do" above. */
3552 :
3553 92820 : while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3554 :
3555 46541 : if (c == 0) /* Missing terminating ']' */
3556 : {
3557 1 : *errorcodeptr = ERR6;
3558 1 : goto FAILED;
3559 : }
3560 :
3561 :
3562 : /* This code has been disabled because it would mean that \s counts as
3563 : an explicit \r or \n reference, and that's not really what is wanted. Now
3564 : we set the flag only if there is a literal "\r" or "\n" in the class. */
3565 :
3566 : #if 0
3567 : /* Remember whether \r or \n are in this class */
3568 :
3569 : if (negate_class)
3570 : {
3571 : if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3572 : }
3573 : else
3574 : {
3575 : if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3576 : }
3577 : #endif
3578 :
3579 :
3580 : /* If class_charcount is 1, we saw precisely one character whose value is
3581 : less than 256. As long as there were no characters >= 128 and there was no
3582 : use of \p or \P, in other words, no use of any XCLASS features, we can
3583 : optimize.
3584 :
3585 : In UTF-8 mode, we can optimize the negative case only if there were no
3586 : characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3587 : operate on single-bytes only. This is an historical hangover. Maybe one day
3588 : we can tidy these opcodes to handle multi-byte characters.
3589 :
3590 : The optimization throws away the bit map. We turn the item into a
3591 : 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3592 : that OP_NOT does not support multibyte characters. In the positive case, it
3593 : can cause firstbyte to be set. Otherwise, there can be no first char if
3594 : this item is first, whatever repeat count may follow. In the case of
3595 : reqbyte, save the previous value for reinstating. */
3596 :
3597 : #ifdef SUPPORT_UTF8
3598 46540 : if (class_charcount == 1 && !class_utf8 &&
3599 : (!utf8 || !negate_class || class_lastchar < 128))
3600 : #else
3601 : if (class_charcount == 1)
3602 : #endif
3603 : {
3604 148 : zeroreqbyte = reqbyte;
3605 :
3606 : /* The OP_NOT opcode works on one-byte characters only. */
3607 :
3608 148 : if (negate_class)
3609 : {
3610 24 : if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3611 24 : zerofirstbyte = firstbyte;
3612 24 : *code++ = OP_NOT;
3613 24 : *code++ = class_lastchar;
3614 24 : break;
3615 : }
3616 :
3617 : /* For a single, positive character, get the value into mcbuffer, and
3618 : then we can handle this with the normal one-character code. */
3619 :
3620 : #ifdef SUPPORT_UTF8
3621 124 : if (utf8 && class_lastchar > 127)
3622 0 : mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3623 : else
3624 : #endif
3625 : {
3626 124 : mcbuffer[0] = class_lastchar;
3627 124 : mclength = 1;
3628 : }
3629 124 : goto ONE_CHAR;
3630 : } /* End of 1-char optimization */
3631 :
3632 : /* The general case - not the one-char optimization. If this is the first
3633 : thing in the branch, there can be no first char setting, whatever the
3634 : repeat count. Any reqbyte setting must remain unchanged after any kind of
3635 : repeat. */
3636 :
3637 46392 : if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3638 46392 : zerofirstbyte = firstbyte;
3639 46392 : zeroreqbyte = reqbyte;
3640 :
3641 : /* If there are characters with values > 255, we have to compile an
3642 : extended class, with its own opcode, unless there was a negated special
3643 : such as \S in the class, because in that case all characters > 255 are in
3644 : the class, so any that were explicitly given as well can be ignored. If
3645 : (when there are explicit characters > 255 that must be listed) there are no
3646 : characters < 256, we can omit the bitmap in the actual compiled code. */
3647 :
3648 : #ifdef SUPPORT_UTF8
3649 46392 : if (class_utf8 && !should_flip_negation)
3650 : {
3651 6 : *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3652 6 : *code++ = OP_XCLASS;
3653 6 : code += LINK_SIZE;
3654 6 : *code = negate_class? XCL_NOT : 0;
3655 :
3656 : /* If the map is required, move up the extra data to make room for it;
3657 : otherwise just move the code pointer to the end of the extra data. */
3658 :
3659 6 : if (class_charcount > 0)
3660 : {
3661 0 : *code++ |= XCL_MAP;
3662 0 : memmove(code + 32, code, class_utf8data - code);
3663 0 : memcpy(code, classbits, 32);
3664 0 : code = class_utf8data + 32;
3665 : }
3666 6 : else code = class_utf8data;
3667 :
3668 : /* Now fill in the complete length of the item */
3669 :
3670 6 : PUT(previous, 1, code - previous);
3671 6 : break; /* End of class handling */
3672 : }
3673 : #endif
3674 :
3675 : /* If there are no characters > 255, set the opcode to OP_CLASS or
3676 : OP_NCLASS, depending on whether the whole class was negated and whether
3677 : there were negative specials such as \S in the class. Then copy the 32-byte
3678 : map into the code vector, negating it if necessary. */
3679 :
3680 46386 : *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3681 46386 : if (negate_class)
3682 : {
3683 42496 : if (lengthptr == NULL) /* Save time in the pre-compile phase */
3684 21248 : for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3685 : }
3686 : else
3687 : {
3688 3890 : memcpy(code, classbits, 32);
3689 : }
3690 46386 : code += 32;
3691 46386 : break;
3692 :
3693 :
3694 : /* ===================================================================*/
3695 : /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3696 : has been tested above. */
3697 :
3698 : case CHAR_LEFT_CURLY_BRACKET:
3699 88 : if (!is_quantifier) goto NORMAL_CHAR;
3700 54 : ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3701 54 : if (*errorcodeptr != 0) goto FAILED;
3702 54 : goto REPEAT;
3703 :
3704 : case CHAR_ASTERISK:
3705 1372 : repeat_min = 0;
3706 1372 : repeat_max = -1;
3707 1372 : goto REPEAT;
3708 :
3709 : case CHAR_PLUS:
3710 93566 : repeat_min = 1;
3711 93566 : repeat_max = -1;
3712 93566 : goto REPEAT;
3713 :
3714 : case CHAR_QUESTION_MARK:
3715 2920 : repeat_min = 0;
3716 2920 : repeat_max = 1;
3717 :
3718 97912 : REPEAT:
3719 97912 : if (previous == NULL)
3720 : {
3721 2 : *errorcodeptr = ERR9;
3722 2 : goto FAILED;
3723 : }
3724 :
3725 97910 : if (repeat_min == 0)
3726 : {
3727 4291 : firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3728 4291 : reqbyte = zeroreqbyte; /* Ditto */
3729 : }
3730 :
3731 : /* Remember whether this is a variable length repeat */
3732 :
3733 97910 : reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3734 :
3735 97910 : op_type = 0; /* Default single-char op codes */
3736 97910 : possessive_quantifier = FALSE; /* Default not possessive quantifier */
3737 :
3738 : /* Save start of previous item, in case we have to move it up to make space
3739 : for an inserted OP_ONCE for the additional '+' extension. */
3740 :
3741 97910 : tempcode = previous;
3742 :
3743 : /* If the next character is '+', we have a possessive quantifier. This
3744 : implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3745 : If the next character is '?' this is a minimizing repeat, by default,
3746 : but if PCRE_UNGREEDY is set, it works the other way round. We change the
3747 : repeat type to the non-default. */
3748 :
3749 97910 : if (ptr[1] == CHAR_PLUS)
3750 : {
3751 2 : repeat_type = 0; /* Force greedy */
3752 2 : possessive_quantifier = TRUE;
3753 2 : ptr++;
3754 : }
3755 97908 : else if (ptr[1] == CHAR_QUESTION_MARK)
3756 : {
3757 58 : repeat_type = greedy_non_default;
3758 58 : ptr++;
3759 : }
3760 97850 : else repeat_type = greedy_default;
3761 :
3762 : /* If previous was a character match, abolish the item and generate a
3763 : repeat item instead. If a char item has a minimum of more than one, ensure
3764 : that it is set in reqbyte - it might not be if a sequence such as x{3} is
3765 : the first thing in a branch because the x will have gone into firstbyte
3766 : instead. */
3767 :
3768 97910 : if (*previous == OP_CHAR || *previous == OP_CHARNC)
3769 : {
3770 : /* Deal with UTF-8 characters that take up more than one byte. It's
3771 : easier to write this out separately than try to macrify it. Use c to
3772 : hold the length of the character in bytes, plus 0x80 to flag that it's a
3773 : length rather than a small character. */
3774 :
3775 : #ifdef SUPPORT_UTF8
3776 714 : if (utf8 && (code[-1] & 0x80) != 0)
3777 : {
3778 0 : uschar *lastchar = code - 1;
3779 0 : while((*lastchar & 0xc0) == 0x80) lastchar--;
3780 0 : c = code - lastchar; /* Length of UTF-8 character */
3781 0 : memcpy(utf8_char, lastchar, c); /* Save the char */
3782 0 : c |= 0x80; /* Flag c as a length */
3783 : }
3784 : else
3785 : #endif
3786 :
3787 : /* Handle the case of a single byte - either with no UTF8 support, or
3788 : with UTF-8 disabled, or for a UTF-8 character < 128. */
3789 :
3790 : {
3791 714 : c = code[-1];
3792 714 : if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3793 : }
3794 :
3795 : /* If the repetition is unlimited, it pays to see if the next thing on
3796 : the line is something that cannot possibly match this character. If so,
3797 : automatically possessifying this item gains some performance in the case
3798 : where the match fails. */
3799 :
3800 714 : if (!possessive_quantifier &&
3801 : repeat_max < 0 &&
3802 : check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3803 : options, cd))
3804 : {
3805 34 : repeat_type = 0; /* Force greedy */
3806 34 : possessive_quantifier = TRUE;
3807 : }
3808 :
3809 714 : goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3810 : }
3811 :
3812 : /* If previous was a single negated character ([^a] or similar), we use
3813 : one of the special opcodes, replacing it. The code is shared with single-
3814 : character repeats by setting op_type to add a suitable offset into
3815 : repeat_type. We can also test for auto-possessification. OP_NOT is
3816 : currently used only for single-byte chars. */
3817 :
3818 97196 : else if (*previous == OP_NOT)
3819 : {
3820 18 : op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3821 18 : c = previous[1];
3822 18 : if (!possessive_quantifier &&
3823 : repeat_max < 0 &&
3824 : check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3825 : {
3826 8 : repeat_type = 0; /* Force greedy */
3827 8 : possessive_quantifier = TRUE;
3828 : }
3829 18 : goto OUTPUT_SINGLE_REPEAT;
3830 : }
3831 :
3832 : /* If previous was a character type match (\d or similar), abolish it and
3833 : create a suitable repeat item. The code is shared with single-character
3834 : repeats by setting op_type to add a suitable offset into repeat_type. Note
3835 : that the Unicode property types will be present only when SUPPORT_UCP is
3836 : defined, but we don't wrap the little bits of code here because it just
3837 : makes it horribly messy. */
3838 :
3839 97178 : else if (*previous < OP_EODN)
3840 : {
3841 : uschar *oldcode;
3842 : int prop_type, prop_value;
3843 51402 : op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3844 51402 : c = *previous;
3845 :
3846 51402 : if (!possessive_quantifier &&
3847 : repeat_max < 0 &&
3848 : check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3849 : {
3850 47988 : repeat_type = 0; /* Force greedy */
3851 47988 : possessive_quantifier = TRUE;
3852 : }
3853 :
3854 52134 : OUTPUT_SINGLE_REPEAT:
3855 52142 : if (*previous == OP_PROP || *previous == OP_NOTPROP)
3856 : {
3857 8 : prop_type = previous[1];
3858 8 : prop_value = previous[2];
3859 : }
3860 52126 : else prop_type = prop_value = -1;
3861 :
3862 52134 : oldcode = code;
3863 52134 : code = previous; /* Usually overwrite previous item */
3864 :
3865 : /* If the maximum is zero then the minimum must also be zero; Perl allows
3866 : this case, so we do too - by simply omitting the item altogether. */
3867 :
3868 52134 : if (repeat_max == 0) goto END_REPEAT;
3869 :
3870 : /* All real repeats make it impossible to handle partial matching (maybe
3871 : one day we will be able to remove this restriction). */
3872 :
3873 52134 : if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3874 :
3875 : /* Combine the op_type with the repeat_type */
3876 :
3877 52134 : repeat_type += op_type;
3878 :
3879 : /* A minimum of zero is handled either as the special case * or ?, or as
3880 : an UPTO, with the maximum given. */
3881 :
3882 52134 : if (repeat_min == 0)
3883 : {
3884 1303 : if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3885 582 : else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3886 : else
3887 : {
3888 0 : *code++ = OP_UPTO + repeat_type;
3889 0 : PUT2INC(code, 0, repeat_max);
3890 : }
3891 : }
3892 :
3893 : /* A repeat minimum of 1 is optimized into some special cases. If the
3894 : maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3895 : left in place and, if the maximum is greater than 1, we use OP_UPTO with
3896 : one less than the maximum. */
3897 :
3898 50831 : else if (repeat_min == 1)
3899 : {
3900 50797 : if (repeat_max == -1)
3901 50789 : *code++ = OP_PLUS + repeat_type;
3902 : else
3903 : {
3904 8 : code = oldcode; /* leave previous item in place */
3905 8 : if (repeat_max == 1) goto END_REPEAT;
3906 2 : *code++ = OP_UPTO + repeat_type;
3907 2 : PUT2INC(code, 0, repeat_max - 1);
3908 : }
3909 : }
3910 :
3911 : /* The case {n,n} is just an EXACT, while the general case {n,m} is
3912 : handled as an EXACT followed by an UPTO. */
3913 :
3914 : else
3915 : {
3916 34 : *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3917 34 : PUT2INC(code, 0, repeat_min);
3918 :
3919 : /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3920 : we have to insert the character for the previous code. For a repeated
3921 : Unicode property match, there are two extra bytes that define the
3922 : required property. In UTF-8 mode, long characters have their length in
3923 : c, with the 0x80 bit as a flag. */
3924 :
3925 34 : if (repeat_max < 0)
3926 : {
3927 : #ifdef SUPPORT_UTF8
3928 2 : if (utf8 && c >= 128)
3929 : {
3930 0 : memcpy(code, utf8_char, c & 7);
3931 0 : code += c & 7;
3932 : }
3933 : else
3934 : #endif
3935 : {
3936 2 : *code++ = c;
3937 2 : if (prop_type >= 0)
3938 : {
3939 0 : *code++ = prop_type;
3940 0 : *code++ = prop_value;
3941 : }
3942 : }
3943 2 : *code++ = OP_STAR + repeat_type;
3944 : }
3945 :
3946 : /* Else insert an UPTO if the max is greater than the min, again
3947 : preceded by the character, for the previously inserted code. If the
3948 : UPTO is just for 1 instance, we can use QUERY instead. */
3949 :
3950 32 : else if (repeat_max != repeat_min)
3951 : {
3952 : #ifdef SUPPORT_UTF8
3953 6 : if (utf8 && c >= 128)
3954 : {
3955 0 : memcpy(code, utf8_char, c & 7);
3956 0 : code += c & 7;
3957 : }
3958 : else
3959 : #endif
3960 6 : *code++ = c;
3961 6 : if (prop_type >= 0)
3962 : {
3963 0 : *code++ = prop_type;
3964 0 : *code++ = prop_value;
3965 : }
3966 6 : repeat_max -= repeat_min;
3967 :
3968 6 : if (repeat_max == 1)
3969 : {
3970 4 : *code++ = OP_QUERY + repeat_type;
3971 : }
3972 : else
3973 : {
3974 2 : *code++ = OP_UPTO + repeat_type;
3975 2 : PUT2INC(code, 0, repeat_max);
3976 : }
3977 : }
3978 : }
3979 :
3980 : /* The character or character type itself comes last in all cases. */
3981 :
3982 : #ifdef SUPPORT_UTF8
3983 52128 : if (utf8 && c >= 128)
3984 : {
3985 0 : memcpy(code, utf8_char, c & 7);
3986 0 : code += c & 7;
3987 : }
3988 : else
3989 : #endif
3990 52128 : *code++ = c;
3991 :
3992 : /* For a repeated Unicode property match, there are two extra bytes that
3993 : define the required property. */
3994 :
3995 : #ifdef SUPPORT_UCP
3996 52128 : if (prop_type >= 0)
3997 : {
3998 8 : *code++ = prop_type;
3999 8 : *code++ = prop_value;
4000 : }
4001 : #endif
4002 : }
4003 :
4004 : /* If previous was a character class or a back reference, we put the repeat
4005 : stuff after it, but just skip the item if the repeat was {0,0}. */
4006 :
4007 90712 : else if (*previous == OP_CLASS ||
4008 : *previous == OP_NCLASS ||
4009 : #ifdef SUPPORT_UTF8
4010 : *previous == OP_XCLASS ||
4011 : #endif
4012 : *previous == OP_REF)
4013 : {
4014 44936 : if (repeat_max == 0)
4015 : {
4016 0 : code = previous;
4017 0 : goto END_REPEAT;
4018 : }
4019 :
4020 : /* All real repeats make it impossible to handle partial matching (maybe
4021 : one day we will be able to remove this restriction). */
4022 :
4023 44936 : if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
4024 :
4025 45546 : if (repeat_min == 0 && repeat_max == -1)
4026 610 : *code++ = OP_CRSTAR + repeat_type;
4027 87078 : else if (repeat_min == 1 && repeat_max == -1)
4028 42752 : *code++ = OP_CRPLUS + repeat_type;
4029 3142 : else if (repeat_min == 0 && repeat_max == 1)
4030 1568 : *code++ = OP_CRQUERY + repeat_type;
4031 : else
4032 : {
4033 6 : *code++ = OP_CRRANGE + repeat_type;
4034 6 : PUT2INC(code, 0, repeat_min);
4035 6 : if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4036 6 : PUT2INC(code, 0, repeat_max);
4037 : }
4038 : }
4039 :
4040 : /* If previous was a bracket group, we may have to replicate it in certain
4041 : cases. */
4042 :
4043 1680 : else if (*previous == OP_BRA || *previous == OP_CBRA ||
4044 : *previous == OP_ONCE || *previous == OP_COND)
4045 : {
4046 : register int i;
4047 840 : int ketoffset = 0;
4048 840 : int len = code - previous;
4049 840 : uschar *bralink = NULL;
4050 :
4051 : /* Repeating a DEFINE group is pointless */
4052 :
4053 840 : if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4054 : {
4055 0 : *errorcodeptr = ERR55;
4056 0 : goto FAILED;
4057 : }
4058 :
4059 : /* If the maximum repeat count is unlimited, find the end of the bracket
4060 : by scanning through from the start, and compute the offset back to it
4061 : from the current code pointer. There may be an OP_OPT setting following
4062 : the final KET, so we can't find the end just by going back from the code
4063 : pointer. */
4064 :
4065 840 : if (repeat_max == -1)
4066 : {
4067 68 : register uschar *ket = previous;
4068 74 : do ket += GET(ket, 1); while (*ket != OP_KET);
4069 68 : ketoffset = code - ket;
4070 : }
4071 :
4072 : /* The case of a zero minimum is special because of the need to stick
4073 : OP_BRAZERO in front of it, and because the group appears once in the
4074 : data, whereas in other cases it appears the minimum number of times. For
4075 : this reason, it is simplest to treat this case separately, as otherwise
4076 : the code gets far too messy. There are several special subcases when the
4077 : minimum is zero. */
4078 :
4079 840 : if (repeat_min == 0)
4080 : {
4081 : /* If the maximum is also zero, we used to just omit the group from the
4082 : output altogether, like this:
4083 :
4084 : ** if (repeat_max == 0)
4085 : ** {
4086 : ** code = previous;
4087 : ** goto END_REPEAT;
4088 : ** }
4089 :
4090 : However, that fails when a group is referenced as a subroutine from
4091 : elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4092 : so that it is skipped on execution. As we don't have a list of which
4093 : groups are referenced, we cannot do this selectively.
4094 :
4095 : If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4096 : and do no more at this point. However, we do need to adjust any
4097 : OP_RECURSE calls inside the group that refer to the group itself or any
4098 : internal or forward referenced group, because the offset is from the
4099 : start of the whole regex. Temporarily terminate the pattern while doing
4100 : this. */
4101 :
4102 810 : if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4103 : {
4104 810 : *code = OP_END;
4105 810 : adjust_recurse(previous, 1, utf8, cd, save_hwm);
4106 810 : memmove(previous+1, previous, len);
4107 810 : code++;
4108 810 : if (repeat_max == 0)
4109 : {
4110 0 : *previous++ = OP_SKIPZERO;
4111 0 : goto END_REPEAT;
4112 : }
4113 810 : *previous++ = OP_BRAZERO + repeat_type;
4114 : }
4115 :
4116 : /* If the maximum is greater than 1 and limited, we have to replicate
4117 : in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4118 : The first one has to be handled carefully because it's the original
4119 : copy, which has to be moved up. The remainder can be handled by code
4120 : that is common with the non-zero minimum case below. We have to
4121 : adjust the value of repeat_max, since one less copy is required. Once
4122 : again, we may have to adjust any OP_RECURSE calls inside the group. */
4123 :
4124 : else
4125 : {
4126 : int offset;
4127 0 : *code = OP_END;
4128 0 : adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4129 0 : memmove(previous + 2 + LINK_SIZE, previous, len);
4130 0 : code += 2 + LINK_SIZE;
4131 0 : *previous++ = OP_BRAZERO + repeat_type;
4132 0 : *previous++ = OP_BRA;
4133 :
4134 : /* We chain together the bracket offset fields that have to be
4135 : filled in later when the ends of the brackets are reached. */
4136 :
4137 0 : offset = (bralink == NULL)? 0 : previous - bralink;
4138 0 : bralink = previous;
4139 0 : PUTINC(previous, 0, offset);
4140 : }
4141 :
4142 810 : repeat_max--;
4143 : }
4144 :
4145 : /* If the minimum is greater than zero, replicate the group as many
4146 : times as necessary, and adjust the maximum to the number of subsequent
4147 : copies that we need. If we set a first char from the group, and didn't
4148 : set a required char, copy the latter from the former. If there are any
4149 : forward reference subroutine calls in the group, there will be entries on
4150 : the workspace list; replicate these with an appropriate increment. */
4151 :
4152 : else
4153 : {
4154 30 : if (repeat_min > 1)
4155 : {
4156 : /* In the pre-compile phase, we don't actually do the replication. We
4157 : just adjust the length as if we had. Do some paranoid checks for
4158 : potential integer overflow. */
4159 :
4160 6 : if (lengthptr != NULL)
4161 : {
4162 3 : int delta = (repeat_min - 1)*length_prevgroup;
4163 3 : if ((double)(repeat_min - 1)*(double)length_prevgroup >
4164 : (double)INT_MAX ||
4165 : OFLOW_MAX - *lengthptr < delta)
4166 : {
4167 0 : *errorcodeptr = ERR20;
4168 0 : goto FAILED;
4169 : }
4170 3 : *lengthptr += delta;
4171 : }
4172 :
4173 : /* This is compiling for real */
4174 :
4175 : else
4176 : {
4177 3 : if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4178 6 : for (i = 1; i < repeat_min; i++)
4179 : {
4180 : uschar *hc;
4181 3 : uschar *this_hwm = cd->hwm;
4182 3 : memcpy(code, previous, len);
4183 3 : for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4184 : {
4185 0 : PUT(cd->hwm, 0, GET(hc, 0) + len);
4186 0 : cd->hwm += LINK_SIZE;
4187 : }
4188 3 : save_hwm = this_hwm;
4189 3 : code += len;
4190 : }
4191 : }
4192 : }
4193 :
4194 30 : if (repeat_max > 0) repeat_max -= repeat_min;
4195 : }
4196 :
4197 : /* This code is common to both the zero and non-zero minimum cases. If
4198 : the maximum is limited, it replicates the group in a nested fashion,
4199 : remembering the bracket starts on a stack. In the case of a zero minimum,
4200 : the first one was set up above. In all cases the repeat_max now specifies
4201 : the number of additional copies needed. Again, we must remember to
4202 : replicate entries on the forward reference list. */
4203 :
4204 840 : if (repeat_max >= 0)
4205 : {
4206 : /* In the pre-compile phase, we don't actually do the replication. We
4207 : just adjust the length as if we had. For each repetition we must add 1
4208 : to the length for BRAZERO and for all but the last repetition we must
4209 : add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4210 : paranoid checks to avoid integer overflow. */
4211 :
4212 772 : if (lengthptr != NULL && repeat_max > 0)
4213 : {
4214 : int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4215 0 : 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4216 0 : if ((double)repeat_max *
4217 : (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4218 : > (double)INT_MAX ||
4219 : OFLOW_MAX - *lengthptr < delta)
4220 : {
4221 0 : *errorcodeptr = ERR20;
4222 0 : goto FAILED;
4223 : }
4224 0 : *lengthptr += delta;
4225 : }
4226 :
4227 : /* This is compiling for real */
4228 :
4229 772 : else for (i = repeat_max - 1; i >= 0; i--)
4230 : {
4231 : uschar *hc;
4232 0 : uschar *this_hwm = cd->hwm;
4233 :
4234 0 : *code++ = OP_BRAZERO + repeat_type;
4235 :
4236 : /* All but the final copy start a new nesting, maintaining the
4237 : chain of brackets outstanding. */
4238 :
4239 0 : if (i != 0)
4240 : {
4241 : int offset;
4242 0 : *code++ = OP_BRA;
4243 0 : offset = (bralink == NULL)? 0 : code - bralink;
4244 0 : bralink = code;
4245 0 : PUTINC(code, 0, offset);
4246 : }
4247 :
4248 0 : memcpy(code, previous, len);
4249 0 : for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4250 : {
4251 0 : PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4252 0 : cd->hwm += LINK_SIZE;
4253 : }
4254 0 : save_hwm = this_hwm;
4255 0 : code += len;
4256 : }
4257 :
4258 : /* Now chain through the pending brackets, and fill in their length
4259 : fields (which are holding the chain links pro tem). */
4260 :
4261 1544 : while (bralink != NULL)
4262 : {
4263 : int oldlinkoffset;
4264 0 : int offset = code - bralink + 1;
4265 0 : uschar *bra = code - offset;
4266 0 : oldlinkoffset = GET(bra, 1);
4267 0 : bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4268 0 : *code++ = OP_KET;
4269 0 : PUTINC(code, 0, offset);
4270 0 : PUT(bra, 1, offset);
4271 : }
4272 : }
4273 :
4274 : /* If the maximum is unlimited, set a repeater in the final copy. We
4275 : can't just offset backwards from the current code point, because we
4276 : don't know if there's been an options resetting after the ket. The
4277 : correct offset was computed above.
4278 :
4279 : Then, when we are doing the actual compile phase, check to see whether
4280 : this group is a non-atomic one that could match an empty string. If so,
4281 : convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4282 : that runtime checking can be done. [This check is also applied to
4283 : atomic groups at runtime, but in a different way.] */
4284 :
4285 : else
4286 : {
4287 68 : uschar *ketcode = code - ketoffset;
4288 68 : uschar *bracode = ketcode - GET(ketcode, 1);
4289 68 : *ketcode = OP_KETRMAX + repeat_type;
4290 68 : if (lengthptr == NULL && *bracode != OP_ONCE)
4291 : {
4292 34 : uschar *scode = bracode;
4293 : do
4294 : {
4295 38 : if (could_be_empty_branch(scode, ketcode, utf8))
4296 : {
4297 7 : *bracode += OP_SBRA - OP_BRA;
4298 7 : break;
4299 : }
4300 31 : scode += GET(scode, 1);
4301 : }
4302 31 : while (*scode == OP_ALT);
4303 : }
4304 : }
4305 : }
4306 :
4307 : /* If previous is OP_FAIL, it was generated by an empty class [] in
4308 : JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4309 : by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4310 : error above. We can just ignore the repeat in JS case. */
4311 :
4312 0 : else if (*previous == OP_FAIL) goto END_REPEAT;
4313 :
4314 : /* Else there's some kind of shambles */
4315 :
4316 : else
4317 : {
4318 0 : *errorcodeptr = ERR11;
4319 0 : goto FAILED;
4320 : }
4321 :
4322 : /* If the character following a repeat is '+', or if certain optimization
4323 : tests above succeeded, possessive_quantifier is TRUE. For some of the
4324 : simpler opcodes, there is a special alternative opcode for this. For
4325 : anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4326 : The '+' notation is just syntactic sugar, taken from Sun's Java package,
4327 : but the special opcodes can optimize it a bit. The repeated item starts at
4328 : tempcode, not at previous, which might be the first part of a string whose
4329 : (former) last char we repeated.
4330 :
4331 : Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4332 : an 'upto' may follow. We skip over an 'exact' item, and then test the
4333 : length of what remains before proceeding. */
4334 :
4335 97904 : if (possessive_quantifier)
4336 : {
4337 : int len;
4338 48032 : if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4339 : *tempcode == OP_NOTEXACT)
4340 0 : tempcode += _pcre_OP_lengths[*tempcode] +
4341 : ((*tempcode == OP_TYPEEXACT &&
4342 : (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4343 48032 : len = code - tempcode;
4344 48032 : if (len > 0) switch (*tempcode)
4345 : {
4346 32 : case OP_STAR: *tempcode = OP_POSSTAR; break;
4347 2 : case OP_PLUS: *tempcode = OP_POSPLUS; break;
4348 0 : case OP_QUERY: *tempcode = OP_POSQUERY; break;
4349 0 : case OP_UPTO: *tempcode = OP_POSUPTO; break;
4350 :
4351 67 : case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4352 47923 : case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4353 0 : case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4354 0 : case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4355 :
4356 2 : case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4357 6 : case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4358 0 : case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4359 0 : case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4360 :
4361 : default:
4362 0 : memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4363 0 : code += 1 + LINK_SIZE;
4364 0 : len += 1 + LINK_SIZE;
4365 0 : tempcode[0] = OP_ONCE;
4366 0 : *code++ = OP_KET;
4367 0 : PUTINC(code, 0, len);
4368 0 : PUT(tempcode, 1, len);
4369 : break;
4370 : }
4371 : }
4372 :
4373 : /* In all cases we no longer have a previous item. We also set the
4374 : "follows varying string" flag for subsequently encountered reqbytes if
4375 : it isn't already set and we have just passed a varying length item. */
4376 :
4377 97910 : END_REPEAT:
4378 97910 : previous = NULL;
4379 97910 : cd->req_varyopt |= reqvary;
4380 97910 : break;
4381 :
4382 :
4383 : /* ===================================================================*/
4384 : /* Start of nested parenthesized sub-expression, or comment or lookahead or
4385 : lookbehind or option setting or condition or all the other extended
4386 : parenthesis forms. */
4387 :
4388 : case CHAR_LEFT_PARENTHESIS:
4389 17630 : newoptions = options;
4390 17630 : skipbytes = 0;
4391 17630 : bravalue = OP_CBRA;
4392 17630 : save_hwm = cd->hwm;
4393 17630 : reset_bracount = FALSE;
4394 :
4395 : /* First deal with various "verbs" that can be introduced by '*'. */
4396 :
4397 17630 : if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4398 : {
4399 : int i, namelen;
4400 0 : const char *vn = verbnames;
4401 0 : const uschar *name = ++ptr;
4402 0 : previous = NULL;
4403 0 : while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4404 0 : if (*ptr == CHAR_COLON)
4405 : {
4406 0 : *errorcodeptr = ERR59; /* Not supported */
4407 0 : goto FAILED;
4408 : }
4409 0 : if (*ptr != CHAR_RIGHT_PARENTHESIS)
4410 : {
4411 0 : *errorcodeptr = ERR60;
4412 0 : goto FAILED;
4413 : }
4414 0 : namelen = ptr - name;
4415 0 : for (i = 0; i < verbcount; i++)
4416 : {
4417 0 : if (namelen == verbs[i].len &&
4418 : strncmp((char *)name, vn, namelen) == 0)
4419 : {
4420 0 : *code = verbs[i].op;
4421 0 : if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4422 0 : break;
4423 : }
4424 0 : vn += verbs[i].len + 1;
4425 : }
4426 0 : if (i < verbcount) continue;
4427 0 : *errorcodeptr = ERR60;
4428 0 : goto FAILED;
4429 : }
4430 :
4431 : /* Deal with the extended parentheses; all are introduced by '?', and the
4432 : appearance of any of them means that this is not a capturing group. */
4433 :
4434 17630 : else if (*ptr == CHAR_QUESTION_MARK)
4435 : {
4436 : int i, set, unset, namelen;
4437 : int *optset;
4438 : const uschar *name;
4439 : uschar *slot;
4440 :
4441 2892 : switch (*(++ptr))
4442 : {
4443 : case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4444 0 : ptr++;
4445 0 : while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4446 0 : if (*ptr == 0)
4447 : {
4448 0 : *errorcodeptr = ERR18;
4449 0 : goto FAILED;
4450 : }
4451 0 : continue;
4452 :
4453 :
4454 : /* ------------------------------------------------------------ */
4455 : case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4456 0 : reset_bracount = TRUE;
4457 : /* Fall through */
4458 :
4459 : /* ------------------------------------------------------------ */
4460 : case CHAR_COLON: /* Non-capturing bracket */
4461 778 : bravalue = OP_BRA;
4462 778 : ptr++;
4463 778 : break;
4464 :
4465 :
4466 : /* ------------------------------------------------------------ */
4467 : case CHAR_LEFT_PARENTHESIS:
4468 4 : bravalue = OP_COND; /* Conditional group */
4469 :
4470 : /* A condition can be an assertion, a number (referring to a numbered
4471 : group), a name (referring to a named group), or 'R', referring to
4472 : recursion. R<digits> and R&name are also permitted for recursion tests.
4473 :
4474 : There are several syntaxes for testing a named group: (?(name)) is used
4475 : by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4476 :
4477 : There are two unfortunate ambiguities, caused by history. (a) 'R' can
4478 : be the recursive thing or the name 'R' (and similarly for 'R' followed
4479 : by digits), and (b) a number could be a name that consists of digits.
4480 : In both cases, we look for a name first; if not found, we try the other
4481 : cases. */
4482 :
4483 : /* For conditions that are assertions, check the syntax, and then exit
4484 : the switch. This will take control down to where bracketed groups,
4485 : including assertions, are processed. */
4486 :
4487 4 : if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4488 : ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4489 : break;
4490 :
4491 : /* Most other conditions use OP_CREF (a couple change to OP_RREF
4492 : below), and all need to skip 3 bytes at the start of the group. */
4493 :
4494 4 : code[1+LINK_SIZE] = OP_CREF;
4495 4 : skipbytes = 3;
4496 4 : refsign = -1;
4497 :
4498 : /* Check for a test for recursion in a named group. */
4499 :
4500 4 : if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4501 : {
4502 0 : terminator = -1;
4503 0 : ptr += 2;
4504 0 : code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4505 : }
4506 :
4507 : /* Check for a test for a named group's having been set, using the Perl
4508 : syntax (?(<name>) or (?('name') */
4509 :
4510 4 : else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4511 : {
4512 0 : terminator = CHAR_GREATER_THAN_SIGN;
4513 0 : ptr++;
4514 : }
4515 4 : else if (ptr[1] == CHAR_APOSTROPHE)
4516 : {
4517 0 : terminator = CHAR_APOSTROPHE;
4518 0 : ptr++;
4519 : }
4520 : else
4521 : {
4522 4 : terminator = 0;
4523 4 : if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4524 : }
4525 :
4526 : /* We now expect to read a name; anything else is an error */
4527 :
4528 4 : if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4529 : {
4530 0 : ptr += 1; /* To get the right offset */
4531 0 : *errorcodeptr = ERR28;
4532 0 : goto FAILED;
4533 : }
4534 :
4535 : /* Read the name, but also get it as a number if it's all digits */
4536 :
4537 4 : recno = 0;
4538 4 : name = ++ptr;
4539 12 : while ((cd->ctypes[*ptr] & ctype_word) != 0)
4540 : {
4541 4 : if (recno >= 0)
4542 4 : recno = ((digitab[*ptr] & ctype_digit) != 0)?
4543 : recno * 10 + *ptr - CHAR_0 : -1;
4544 4 : ptr++;
4545 : }
4546 4 : namelen = ptr - name;
4547 :
4548 4 : if ((terminator > 0 && *ptr++ != terminator) ||
4549 : *ptr++ != CHAR_RIGHT_PARENTHESIS)
4550 : {
4551 0 : ptr--; /* Error offset */
4552 0 : *errorcodeptr = ERR26;
4553 0 : goto FAILED;
4554 : }
4555 :
4556 : /* Do no further checking in the pre-compile phase. */
4557 :
4558 4 : if (lengthptr != NULL) break;
4559 :
4560 : /* In the real compile we do the work of looking for the actual
4561 : reference. If the string started with "+" or "-" we require the rest to
4562 : be digits, in which case recno will be set. */
4563 :
4564 2 : if (refsign > 0)
4565 : {
4566 0 : if (recno <= 0)
4567 : {
4568 0 : *errorcodeptr = ERR58;
4569 0 : goto FAILED;
4570 : }
4571 0 : recno = (refsign == CHAR_MINUS)?
4572 : cd->bracount - recno + 1 : recno +cd->bracount;
4573 0 : if (recno <= 0 || recno > cd->final_bracount)
4574 : {
4575 0 : *errorcodeptr = ERR15;
4576 0 : goto FAILED;
4577 : }
4578 0 : PUT2(code, 2+LINK_SIZE, recno);
4579 0 : break;
4580 : }
4581 :
4582 : /* Otherwise (did not start with "+" or "-"), start by looking for the
4583 : name. */
4584 :
4585 2 : slot = cd->name_table;
4586 2 : for (i = 0; i < cd->names_found; i++)
4587 : {
4588 0 : if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4589 0 : slot += cd->name_entry_size;
4590 : }
4591 :
4592 : /* Found a previous named subpattern */
4593 :
4594 2 : if (i < cd->names_found)
4595 : {
4596 0 : recno = GET2(slot, 0);
4597 0 : PUT2(code, 2+LINK_SIZE, recno);
4598 : }
4599 :
4600 : /* Search the pattern for a forward reference */
4601 :
4602 2 : else if ((i = find_parens(cd, name, namelen,
4603 : (options & PCRE_EXTENDED) != 0)) > 0)
4604 : {
4605 0 : PUT2(code, 2+LINK_SIZE, i);
4606 : }
4607 :
4608 : /* If terminator == 0 it means that the name followed directly after
4609 : the opening parenthesis [e.g. (?(abc)...] and in this case there are
4610 : some further alternatives to try. For the cases where terminator != 0
4611 : [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4612 : now checked all the possibilities, so give an error. */
4613 :
4614 2 : else if (terminator != 0)
4615 : {
4616 0 : *errorcodeptr = ERR15;
4617 0 : goto FAILED;
4618 : }
4619 :
4620 : /* Check for (?(R) for recursion. Allow digits after R to specify a
4621 : specific group number. */
4622 :
4623 2 : else if (*name == CHAR_R)
4624 : {
4625 0 : recno = 0;
4626 0 : for (i = 1; i < namelen; i++)
4627 : {
4628 0 : if ((digitab[name[i]] & ctype_digit) == 0)
4629 : {
4630 0 : *errorcodeptr = ERR15;
4631 0 : goto FAILED;
4632 : }
4633 0 : recno = recno * 10 + name[i] - CHAR_0;
4634 : }
4635 0 : if (recno == 0) recno = RREF_ANY;
4636 0 : code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4637 0 : PUT2(code, 2+LINK_SIZE, recno);
4638 : }
4639 :
4640 : /* Similarly, check for the (?(DEFINE) "condition", which is always
4641 : false. */
4642 :
4643 2 : else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4644 : {
4645 0 : code[1+LINK_SIZE] = OP_DEF;
4646 0 : skipbytes = 1;
4647 : }
4648 :
4649 : /* Check for the "name" actually being a subpattern number. We are
4650 : in the second pass here, so final_bracount is set. */
4651 :
4652 4 : else if (recno > 0 && recno <= cd->final_bracount)
4653 : {
4654 2 : PUT2(code, 2+LINK_SIZE, recno);
4655 : }
4656 :
4657 : /* Either an unidentified subpattern, or a reference to (?(0) */
4658 :
4659 : else
4660 : {
4661 0 : *errorcodeptr = (recno == 0)? ERR35: ERR15;
4662 0 : goto FAILED;
4663 : }
4664 2 : break;
4665 :
4666 :
4667 : /* ------------------------------------------------------------ */
4668 : case CHAR_EQUALS_SIGN: /* Positive lookahead */
4669 10 : bravalue = OP_ASSERT;
4670 10 : ptr++;
4671 10 : break;
4672 :
4673 :
4674 : /* ------------------------------------------------------------ */
4675 : case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4676 4 : ptr++;
4677 4 : if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4678 : {
4679 0 : *code++ = OP_FAIL;
4680 0 : previous = NULL;
4681 0 : continue;
4682 : }
4683 4 : bravalue = OP_ASSERT_NOT;
4684 4 : break;
4685 :
4686 :
4687 : /* ------------------------------------------------------------ */
4688 : case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4689 4 : switch (ptr[1])
4690 : {
4691 : case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4692 0 : bravalue = OP_ASSERTBACK;
4693 0 : ptr += 2;
4694 0 : break;
4695 :
4696 : case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4697 4 : bravalue = OP_ASSERTBACK_NOT;
4698 4 : ptr += 2;
4699 4 : break;
4700 :
4701 : default: /* Could be name define, else bad */
4702 0 : if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4703 0 : ptr++; /* Correct offset for error */
4704 0 : *errorcodeptr = ERR24;
4705 0 : goto FAILED;
4706 : }
4707 4 : break;
4708 :
4709 :
4710 : /* ------------------------------------------------------------ */
4711 : case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4712 4 : bravalue = OP_ONCE;
4713 4 : ptr++;
4714 4 : break;
4715 :
4716 :
4717 : /* ------------------------------------------------------------ */
4718 : case CHAR_C: /* Callout - may be followed by digits; */
4719 0 : previous_callout = code; /* Save for later completion */
4720 0 : after_manual_callout = 1; /* Skip one item before completing */
4721 0 : *code++ = OP_CALLOUT;
4722 : {
4723 0 : int n = 0;
4724 0 : while ((digitab[*(++ptr)] & ctype_digit) != 0)
4725 0 : n = n * 10 + *ptr - CHAR_0;
4726 0 : if (*ptr != CHAR_RIGHT_PARENTHESIS)
4727 : {
4728 0 : *errorcodeptr = ERR39;
4729 0 : goto FAILED;
4730 : }
4731 0 : if (n > 255)
4732 : {
4733 0 : *errorcodeptr = ERR38;
4734 0 : goto FAILED;
4735 : }
4736 0 : *code++ = n;
4737 0 : PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4738 0 : PUT(code, LINK_SIZE, 0); /* Default length */
4739 0 : code += 2 * LINK_SIZE;
4740 : }
4741 0 : previous = NULL;
4742 0 : continue;
4743 :
4744 :
4745 : /* ------------------------------------------------------------ */
4746 : case CHAR_P: /* Python-style named subpattern handling */
4747 2082 : if (*(++ptr) == CHAR_EQUALS_SIGN ||
4748 : *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4749 : {
4750 0 : is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4751 0 : terminator = CHAR_RIGHT_PARENTHESIS;
4752 0 : goto NAMED_REF_OR_RECURSE;
4753 : }
4754 2082 : else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4755 : {
4756 0 : *errorcodeptr = ERR41;
4757 0 : goto FAILED;
4758 : }
4759 : /* Fall through to handle (?P< as (?< is handled */
4760 :
4761 :
4762 : /* ------------------------------------------------------------ */
4763 2082 : DEFINE_NAME: /* Come here from (?< handling */
4764 : case CHAR_APOSTROPHE:
4765 : {
4766 2082 : terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4767 : CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4768 2082 : name = ++ptr;
4769 :
4770 2082 : while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4771 2082 : namelen = ptr - name;
4772 :
4773 : /* In the pre-compile phase, just do a syntax check. */
4774 :
4775 2082 : if (lengthptr != NULL)
4776 : {
4777 1041 : if (*ptr != terminator)
4778 : {
4779 0 : *errorcodeptr = ERR42;
4780 0 : goto FAILED;
4781 : }
4782 1041 : if (cd->names_found >= MAX_NAME_COUNT)
4783 : {
4784 0 : *errorcodeptr = ERR49;
4785 0 : goto FAILED;
4786 : }
4787 1041 : if (namelen + 3 > cd->name_entry_size)
4788 : {
4789 14 : cd->name_entry_size = namelen + 3;
4790 14 : if (namelen > MAX_NAME_SIZE)
4791 : {
4792 0 : *errorcodeptr = ERR48;
4793 0 : goto FAILED;
4794 : }
4795 : }
4796 : }
4797 :
4798 : /* In the real compile, create the entry in the table */
4799 :
4800 : else
4801 : {
4802 1041 : slot = cd->name_table;
4803 457021 : for (i = 0; i < cd->names_found; i++)
4804 : {
4805 456981 : int crc = memcmp(name, slot+2, namelen);
4806 456981 : if (crc == 0)
4807 : {
4808 0 : if (slot[2+namelen] == 0)
4809 : {
4810 0 : if ((options & PCRE_DUPNAMES) == 0)
4811 : {
4812 0 : *errorcodeptr = ERR43;
4813 0 : goto FAILED;
4814 : }
4815 : }
4816 0 : else crc = -1; /* Current name is substring */
4817 : }
4818 456981 : if (crc < 0)
4819 : {
4820 1001 : memmove(slot + cd->name_entry_size, slot,
4821 : (cd->names_found - i) * cd->name_entry_size);
4822 1001 : break;
4823 : }
4824 455980 : slot += cd->name_entry_size;
4825 : }
4826 :
4827 1041 : PUT2(slot, 0, cd->bracount + 1);
4828 1041 : memcpy(slot + 2, name, namelen);
4829 1041 : slot[2+namelen] = 0;
4830 : }
4831 : }
4832 :
4833 : /* In both cases, count the number of names we've encountered. */
4834 :
4835 2082 : ptr++; /* Move past > or ' */
4836 2082 : cd->names_found++;
4837 2082 : goto NUMBERED_GROUP;
4838 :
4839 :
4840 : /* ------------------------------------------------------------ */
4841 : case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4842 0 : terminator = CHAR_RIGHT_PARENTHESIS;
4843 0 : is_recurse = TRUE;
4844 : /* Fall through */
4845 :
4846 : /* We come here from the Python syntax above that handles both
4847 : references (?P=name) and recursion (?P>name), as well as falling
4848 : through from the Perl recursion syntax (?&name). We also come here from
4849 : the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4850 : .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4851 :
4852 0 : NAMED_REF_OR_RECURSE:
4853 0 : name = ++ptr;
4854 0 : while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4855 0 : namelen = ptr - name;
4856 :
4857 : /* In the pre-compile phase, do a syntax check and set a dummy
4858 : reference number. */
4859 :
4860 0 : if (lengthptr != NULL)
4861 : {
4862 0 : if (namelen == 0)
4863 : {
4864 0 : *errorcodeptr = ERR62;
4865 0 : goto FAILED;
4866 : }
4867 0 : if (*ptr != terminator)
4868 : {
4869 0 : *errorcodeptr = ERR42;
4870 0 : goto FAILED;
4871 : }
4872 0 : if (namelen > MAX_NAME_SIZE)
4873 : {
4874 0 : *errorcodeptr = ERR48;
4875 0 : goto FAILED;
4876 : }
4877 0 : recno = 0;
4878 : }
4879 :
4880 : /* In the real compile, seek the name in the table. We check the name
4881 : first, and then check that we have reached the end of the name in the
4882 : table. That way, if the name is longer than any in the table,
4883 : the comparison will fail without reading beyond the table entry. */
4884 :
4885 : else
4886 : {
4887 0 : slot = cd->name_table;
4888 0 : for (i = 0; i < cd->names_found; i++)
4889 : {
4890 0 : if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4891 : slot[2+namelen] == 0)
4892 0 : break;
4893 0 : slot += cd->name_entry_size;
4894 : }
4895 :
4896 0 : if (i < cd->names_found) /* Back reference */
4897 : {
4898 0 : recno = GET2(slot, 0);
4899 : }
4900 0 : else if ((recno = /* Forward back reference */
4901 : find_parens(cd, name, namelen,
4902 : (options & PCRE_EXTENDED) != 0)) <= 0)
4903 : {
4904 0 : *errorcodeptr = ERR15;
4905 0 : goto FAILED;
4906 : }
4907 : }
4908 :
4909 : /* In both phases, we can now go to the code that handles numerical
4910 : recursion or backreferences. */
4911 :
4912 0 : if (is_recurse) goto HANDLE_RECURSION;
4913 0 : else goto HANDLE_REFERENCE;
4914 :
4915 :
4916 : /* ------------------------------------------------------------ */
4917 : case CHAR_R: /* Recursion */
4918 4 : ptr++; /* Same as (?0) */
4919 : /* Fall through */
4920 :
4921 :
4922 : /* ------------------------------------------------------------ */
4923 : case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
4924 : case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4925 : case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4926 : {
4927 : const uschar *called;
4928 4 : terminator = CHAR_RIGHT_PARENTHESIS;
4929 :
4930 : /* Come here from the \g<...> and \g'...' code (Oniguruma
4931 : compatibility). However, the syntax has been checked to ensure that
4932 : the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4933 : be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4934 : ever be taken. */
4935 :
4936 4 : HANDLE_NUMERICAL_RECURSION:
4937 :
4938 4 : if ((refsign = *ptr) == CHAR_PLUS)
4939 : {
4940 0 : ptr++;
4941 0 : if ((digitab[*ptr] & ctype_digit) == 0)
4942 : {
4943 0 : *errorcodeptr = ERR63;
4944 0 : goto FAILED;
4945 : }
4946 : }
4947 4 : else if (refsign == CHAR_MINUS)
4948 : {
4949 0 : if ((digitab[ptr[1]] & ctype_digit) == 0)
4950 0 : goto OTHER_CHAR_AFTER_QUERY;
4951 0 : ptr++;
4952 : }
4953 :
4954 4 : recno = 0;
4955 8 : while((digitab[*ptr] & ctype_digit) != 0)
4956 0 : recno = recno * 10 + *ptr++ - CHAR_0;
4957 :
4958 4 : if (*ptr != terminator)
4959 : {
4960 0 : *errorcodeptr = ERR29;
4961 0 : goto FAILED;
4962 : }
4963 :
4964 4 : if (refsign == CHAR_MINUS)
4965 : {
4966 0 : if (recno == 0)
4967 : {
4968 0 : *errorcodeptr = ERR58;
4969 0 : goto FAILED;
4970 : }
4971 0 : recno = cd->bracount - recno + 1;
4972 0 : if (recno <= 0)
4973 : {
4974 0 : *errorcodeptr = ERR15;
4975 0 : goto FAILED;
4976 : }
4977 : }
4978 4 : else if (refsign == CHAR_PLUS)
4979 : {
4980 0 : if (recno == 0)
4981 : {
4982 0 : *errorcodeptr = ERR58;
4983 0 : goto FAILED;
4984 : }
4985 0 : recno += cd->bracount;
4986 : }
4987 :
4988 : /* Come here from code above that handles a named recursion */
4989 :
4990 4 : HANDLE_RECURSION:
4991 :
4992 4 : previous = code;
4993 4 : called = cd->start_code;
4994 :
4995 : /* When we are actually compiling, find the bracket that is being
4996 : referenced. Temporarily end the regex in case it doesn't exist before
4997 : this point. If we end up with a forward reference, first check that
4998 : the bracket does occur later so we can give the error (and position)
4999 : now. Then remember this forward reference in the workspace so it can
5000 : be filled in at the end. */
5001 :
5002 4 : if (lengthptr == NULL)
5003 : {
5004 2 : *code = OP_END;
5005 2 : if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
5006 :
5007 : /* Forward reference */
5008 :
5009 2 : if (called == NULL)
5010 : {
5011 0 : if (find_parens(cd, NULL, recno,
5012 : (options & PCRE_EXTENDED) != 0) < 0)
5013 : {
5014 0 : *errorcodeptr = ERR15;
5015 0 : goto FAILED;
5016 : }
5017 0 : called = cd->start_code + recno;
5018 0 : PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5019 : }
5020 :
5021 : /* If not a forward reference, and the subpattern is still open,
5022 : this is a recursive call. We check to see if this is a left
5023 : recursion that could loop for ever, and diagnose that case. */
5024 :
5025 2 : else if (GET(called, 1) == 0 &&
5026 : could_be_empty(called, code, bcptr, utf8))
5027 : {
5028 0 : *errorcodeptr = ERR40;
5029 0 : goto FAILED;
5030 : }
5031 : }
5032 :
5033 : /* Insert the recursion/subroutine item, automatically wrapped inside
5034 : "once" brackets. Set up a "previous group" length so that a
5035 : subsequent quantifier will work. */
5036 :
5037 4 : *code = OP_ONCE;
5038 4 : PUT(code, 1, 2 + 2*LINK_SIZE);
5039 4 : code += 1 + LINK_SIZE;
5040 :
5041 4 : *code = OP_RECURSE;
5042 4 : PUT(code, 1, called - cd->start_code);
5043 4 : code += 1 + LINK_SIZE;
5044 :
5045 4 : *code = OP_KET;
5046 4 : PUT(code, 1, 2 + 2*LINK_SIZE);
5047 4 : code += 1 + LINK_SIZE;
5048 :
5049 4 : length_prevgroup = 3 + 3*LINK_SIZE;
5050 : }
5051 :
5052 : /* Can't determine a first byte now */
5053 :
5054 4 : if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5055 4 : continue;
5056 :
5057 :
5058 : /* ------------------------------------------------------------ */
5059 : default: /* Other characters: check option setting */
5060 2 : OTHER_CHAR_AFTER_QUERY:
5061 2 : set = unset = 0;
5062 2 : optset = &set;
5063 :
5064 6 : while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5065 : {
5066 2 : switch (*ptr++)
5067 : {
5068 0 : case CHAR_MINUS: optset = &unset; break;
5069 :
5070 : case CHAR_J: /* Record that it changed in the external options */
5071 0 : *optset |= PCRE_DUPNAMES;
5072 0 : cd->external_flags |= PCRE_JCHANGED;
5073 0 : break;
5074 :
5075 0 : case CHAR_i: *optset |= PCRE_CASELESS; break;
5076 0 : case CHAR_m: *optset |= PCRE_MULTILINE; break;
5077 0 : case CHAR_s: *optset |= PCRE_DOTALL; break;
5078 0 : case CHAR_x: *optset |= PCRE_EXTENDED; break;
5079 2 : case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5080 0 : case CHAR_X: *optset |= PCRE_EXTRA; break;
5081 :
5082 0 : default: *errorcodeptr = ERR12;
5083 0 : ptr--; /* Correct the offset */
5084 0 : goto FAILED;
5085 : }
5086 : }
5087 :
5088 : /* Set up the changed option bits, but don't change anything yet. */
5089 :
5090 2 : newoptions = (options | set) & (~unset);
5091 :
5092 : /* If the options ended with ')' this is not the start of a nested
5093 : group with option changes, so the options change at this level. If this
5094 : item is right at the start of the pattern, the options can be
5095 : abstracted and made external in the pre-compile phase, and ignored in
5096 : the compile phase. This can be helpful when matching -- for instance in
5097 : caseless checking of required bytes.
5098 :
5099 : If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5100 : definitely *not* at the start of the pattern because something has been
5101 : compiled. In the pre-compile phase, however, the code pointer can have
5102 : that value after the start, because it gets reset as code is discarded
5103 : during the pre-compile. However, this can happen only at top level - if
5104 : we are within parentheses, the starting BRA will still be present. At
5105 : any parenthesis level, the length value can be used to test if anything
5106 : has been compiled at that level. Thus, a test for both these conditions
5107 : is necessary to ensure we correctly detect the start of the pattern in
5108 : both phases.
5109 :
5110 : If we are not at the pattern start, compile code to change the ims
5111 : options if this setting actually changes any of them, and reset the
5112 : greedy defaults and the case value for firstbyte and reqbyte. */
5113 :
5114 2 : if (*ptr == CHAR_RIGHT_PARENTHESIS)
5115 : {
5116 4 : if (code == cd->start_code + 1 + LINK_SIZE &&
5117 : (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5118 : {
5119 2 : cd->external_options = newoptions;
5120 : }
5121 : else
5122 : {
5123 0 : if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5124 : {
5125 0 : *code++ = OP_OPT;
5126 0 : *code++ = newoptions & PCRE_IMS;
5127 : }
5128 0 : greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5129 0 : greedy_non_default = greedy_default ^ 1;
5130 0 : req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5131 : }
5132 :
5133 : /* Change options at this level, and pass them back for use
5134 : in subsequent branches. When not at the start of the pattern, this
5135 : information is also necessary so that a resetting item can be
5136 : compiled at the end of a group (if we are in a group). */
5137 :
5138 2 : *optionsptr = options = newoptions;
5139 2 : previous = NULL; /* This item can't be repeated */
5140 2 : continue; /* It is complete */
5141 : }
5142 :
5143 : /* If the options ended with ':' we are heading into a nested group
5144 : with possible change of options. Such groups are non-capturing and are
5145 : not assertions of any kind. All we need to do is skip over the ':';
5146 : the newoptions value is handled below. */
5147 :
5148 0 : bravalue = OP_BRA;
5149 0 : ptr++;
5150 : } /* End of switch for character following (? */
5151 : } /* End of (? handling */
5152 :
5153 : /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5154 : all unadorned brackets become non-capturing and behave like (?:...)
5155 : brackets. */
5156 :
5157 14738 : else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5158 : {
5159 0 : bravalue = OP_BRA;
5160 : }
5161 :
5162 : /* Else we have a capturing group. */
5163 :
5164 : else
5165 : {
5166 16820 : NUMBERED_GROUP:
5167 16820 : cd->bracount += 1;
5168 16820 : PUT2(code, 1+LINK_SIZE, cd->bracount);
5169 16820 : skipbytes = 2;
5170 : }
5171 :
5172 : /* Process nested bracketed regex. Assertions may not be repeated, but
5173 : other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5174 : non-register variable in order to be able to pass its address because some
5175 : compilers complain otherwise. Pass in a new setting for the ims options if
5176 : they have changed. */
5177 :
5178 17624 : previous = (bravalue >= OP_ONCE)? code : NULL;
5179 17624 : *code = bravalue;
5180 17624 : tempcode = code;
5181 17624 : tempreqvary = cd->req_varyopt; /* Save value before bracket */
5182 17624 : length_prevgroup = 0; /* Initialize for pre-compile phase */
5183 :
5184 17624 : if (!compile_regex(
5185 : newoptions, /* The complete new option state */
5186 : options & PCRE_IMS, /* The previous ims option state */
5187 : &tempcode, /* Where to put code (updated) */
5188 : &ptr, /* Input pointer (updated) */
5189 : errorcodeptr, /* Where to put an error message */
5190 : (bravalue == OP_ASSERTBACK ||
5191 : bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5192 : reset_bracount, /* True if (?| group */
5193 : skipbytes, /* Skip over bracket number */
5194 : &subfirstbyte, /* For possible first char */
5195 : &subreqbyte, /* For possible last char */
5196 : bcptr, /* Current branch chain */
5197 : cd, /* Tables block */
5198 : (lengthptr == NULL)? NULL : /* Actual compile phase */
5199 : &length_prevgroup /* Pre-compile phase */
5200 : ))
5201 0 : goto FAILED;
5202 :
5203 : /* At the end of compiling, code is still pointing to the start of the
5204 : group, while tempcode has been updated to point past the end of the group
5205 : and any option resetting that may follow it. The pattern pointer (ptr)
5206 : is on the bracket. */
5207 :
5208 : /* If this is a conditional bracket, check that there are no more than
5209 : two branches in the group, or just one if it's a DEFINE group. We do this
5210 : in the real compile phase, not in the pre-pass, where the whole group may
5211 : not be available. */
5212 :
5213 17624 : if (bravalue == OP_COND && lengthptr == NULL)
5214 : {
5215 2 : uschar *tc = code;
5216 2 : int condcount = 0;
5217 :
5218 : do {
5219 4 : condcount++;
5220 4 : tc += GET(tc,1);
5221 : }
5222 4 : while (*tc != OP_KET);
5223 :
5224 : /* A DEFINE group is never obeyed inline (the "condition" is always
5225 : false). It must have only one branch. */
5226 :
5227 2 : if (code[LINK_SIZE+1] == OP_DEF)
5228 : {
5229 0 : if (condcount > 1)
5230 : {
5231 0 : *errorcodeptr = ERR54;
5232 0 : goto FAILED;
5233 : }
5234 0 : bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5235 : }
5236 :
5237 : /* A "normal" conditional group. If there is just one branch, we must not
5238 : make use of its firstbyte or reqbyte, because this is equivalent to an
5239 : empty second branch. */
5240 :
5241 : else
5242 : {
5243 2 : if (condcount > 2)
5244 : {
5245 0 : *errorcodeptr = ERR27;
5246 0 : goto FAILED;
5247 : }
5248 2 : if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5249 : }
5250 : }
5251 :
5252 : /* Error if hit end of pattern */
5253 :
5254 17624 : if (*ptr != CHAR_RIGHT_PARENTHESIS)
5255 : {
5256 0 : *errorcodeptr = ERR14;
5257 0 : goto FAILED;
5258 : }
5259 :
5260 : /* In the pre-compile phase, update the length by the length of the group,
5261 : less the brackets at either end. Then reduce the compiled code to just a
5262 : set of non-capturing brackets so that it doesn't use much memory if it is
5263 : duplicated by a quantifier.*/
5264 :
5265 17624 : if (lengthptr != NULL)
5266 : {
5267 8812 : if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5268 : {
5269 0 : *errorcodeptr = ERR20;
5270 0 : goto FAILED;
5271 : }
5272 8812 : *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5273 8812 : *code++ = OP_BRA;
5274 8812 : PUTINC(code, 0, 1 + LINK_SIZE);
5275 8812 : *code++ = OP_KET;
5276 8812 : PUTINC(code, 0, 1 + LINK_SIZE);
5277 8812 : break; /* No need to waste time with special character handling */
5278 : }
5279 :
5280 : /* Otherwise update the main code pointer to the end of the group. */
5281 :
5282 8812 : code = tempcode;
5283 :
5284 : /* For a DEFINE group, required and first character settings are not
5285 : relevant. */
5286 :
5287 8812 : if (bravalue == OP_DEF) break;
5288 :
5289 : /* Handle updating of the required and first characters for other types of
5290 : group. Update for normal brackets of all kinds, and conditions with two
5291 : branches (see code above). If the bracket is followed by a quantifier with
5292 : zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5293 : zerofirstbyte outside the main loop so that they can be accessed for the
5294 : back off. */
5295 :
5296 8812 : zeroreqbyte = reqbyte;
5297 8812 : zerofirstbyte = firstbyte;
5298 8812 : groupsetfirstbyte = FALSE;
5299 :
5300 8812 : if (bravalue >= OP_ONCE)
5301 : {
5302 : /* If we have not yet set a firstbyte in this branch, take it from the
5303 : subpattern, remembering that it was set here so that a repeat of more
5304 : than one can replicate it as reqbyte if necessary. If the subpattern has
5305 : no firstbyte, set "none" for the whole branch. In both cases, a zero
5306 |