1 : /*************************************************
2 : * Perl-Compatible Regular Expressions *
3 : *************************************************/
4 :
5 : /* PCRE is a library of functions to support regular expressions whose syntax
6 : and semantics are as close as possible to those of the Perl 5 language.
7 :
8 : Written by Philip Hazel
9 : Copyright (c) 1997-2009 University of Cambridge
10 :
11 : -----------------------------------------------------------------------------
12 : Redistribution and use in source and binary forms, with or without
13 : modification, are permitted provided that the following conditions are met:
14 :
15 : * Redistributions of source code must retain the above copyright notice,
16 : this list of conditions and the following disclaimer.
17 :
18 : * Redistributions in binary form must reproduce the above copyright
19 : notice, this list of conditions and the following disclaimer in the
20 : documentation and/or other materials provided with the distribution.
21 :
22 : * Neither the name of the University of Cambridge nor the names of its
23 : contributors may be used to endorse or promote products derived from
24 : this software without specific prior written permission.
25 :
26 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 : AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 : IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 : ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 : LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 : CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 : SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 : INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 : CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 : ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 : POSSIBILITY OF SUCH DAMAGE.
37 : -----------------------------------------------------------------------------
38 : */
39 :
40 :
41 : /* This module contains pcre_exec(), the externally visible function that does
42 : pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 : possible. There are also some static supporting functions. */
44 :
45 : #include "config.h"
46 :
47 : #define NLBLOCK md /* Block containing newline information */
48 : #define PSSTART start_subject /* Field containing processed string start */
49 : #define PSEND end_subject /* Field containing processed string end */
50 :
51 : #include "pcre_internal.h"
52 :
53 : /* Undefine some potentially clashing cpp symbols */
54 :
55 : #undef min
56 : #undef max
57 :
/* Bits that can be set in the "flags" argument of match() */

#define match_condassert   0x01   /* Call is checking a condition assertion */
#define match_cbegroup     0x02   /* Unlimited repeat group that could be empty */

/* Results returned by match() when no error occurred. Errors are reported
with the externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special results used internally by match(). They are made sufficiently
negative that they stay clear of the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Largest number of ints from the offset vector that is saved on the stack
for a recursive call; a bigger offset vector is saved with malloc instead.
Keep this a multiple of 3, because the length of the offset vector is always
a multiple of 3. */

#define REC_STACK_SAVE_MAX 30
82 :
/* Lower and upper bounds for the common repeats. The two tables are read with
the same index; an upper bound of 0 means that there is no upper limit. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0 };

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1 };
87 :
88 :
89 :
90 : #ifdef DEBUG
91 : /*************************************************
92 : * Debugging function to print chars *
93 : *************************************************/
94 :
95 : /* Print a sequence of chars in printable format, stopping at the end of the
96 : subject if requested.
97 :
98 : Arguments:
99 : p points to characters
100 : length number to print
101 : is_subject TRUE if printing from within md->start_subject
102 : md pointer to matching data block, if is_subject is TRUE
103 :
104 : Returns: nothing
105 : */
106 :
107 : static void
108 : pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
109 : {
110 : unsigned int c;
111 : if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
112 : while (length-- > 0)
113 : if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
114 : }
115 : #endif
116 :
117 :
118 :
119 : /*************************************************
120 : * Match a back-reference *
121 : *************************************************/
122 :
123 : /* If a back reference hasn't been set, the length that is passed is greater
124 : than the number of characters left in the string, so the match fails.
125 :
126 : Arguments:
127 : offset index into the offset vector
128 : eptr points into the subject
129 : length length to be matched
130 : md points to match data block
131 : ims the ims flags
132 :
133 : Returns: TRUE if matched
134 : */
135 :
136 : static BOOL
137 : match_ref(int offset, register USPTR eptr, int length, match_data *md,
138 : unsigned long int ims)
139 2796 : {
140 2796 : USPTR p = md->start_subject + md->offset_vector[offset];
141 :
142 : #ifdef DEBUG
143 : if (eptr >= md->end_subject)
144 : printf("matching subject <null>");
145 : else
146 : {
147 : printf("matching subject ");
148 : pchars(eptr, length, TRUE, md);
149 : }
150 : printf(" against backref ");
151 : pchars(p, length, FALSE, md);
152 : printf("\n");
153 : #endif
154 :
155 : /* Always fail if not enough characters left */
156 :
157 2796 : if (length > md->end_subject - eptr) return FALSE;
158 :
159 : /* Separate the caseless case for speed. In UTF-8 mode we can only do this
160 : properly if Unicode properties are supported. Otherwise, we can check only
161 : ASCII characters. */
162 :
163 2697 : if ((ims & PCRE_CASELESS) != 0)
164 : {
165 : #ifdef SUPPORT_UTF8
166 : #ifdef SUPPORT_UCP
167 380 : if (md->utf8)
168 : {
169 0 : USPTR endptr = eptr + length;
170 0 : while (eptr < endptr)
171 : {
172 : int c, d;
173 0 : GETCHARINC(c, eptr);
174 0 : GETCHARINC(d, p);
175 0 : if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
176 : }
177 : }
178 : else
179 : #endif
180 : #endif
181 :
182 : /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
183 : is no UCP support. */
184 :
185 760 : while (length-- > 0)
186 380 : { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
187 : }
188 :
189 : /* In the caseful case, we can just compare the bytes, whether or not we
190 : are in UTF-8 mode. */
191 :
192 : else
193 2317 : { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
194 :
195 424 : return TRUE;
196 : }
197 :
198 :
199 :
200 : /***************************************************************************
201 : ****************************************************************************
202 : RECURSION IN THE match() FUNCTION
203 :
204 : The match() function is highly recursive, though not every recursive call
205 : increases the recursive depth. Nevertheless, some regular expressions can cause
206 : it to recurse to a great depth. I was writing for Unix, so I just let it call
207 : itself recursively. This uses the stack for saving everything that has to be
208 : saved for a recursive call. On Unix, the stack can be large, and this works
209 : fine.
210 :
211 : It turns out that on some non-Unix-like systems there are problems with
212 : programs that use a lot of stack. (This despite the fact that every last chip
213 : has oodles of memory these days, and techniques for extending the stack have
214 : been known for decades.) So....
215 :
216 : There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
217 : calls by keeping local variables that need to be preserved in blocks of memory
218 : obtained from malloc() instead of on the stack. Macros are used to
219 : achieve this so that the actual code doesn't look very different to what it
220 : always used to.
221 :
222 : The original heap-recursive code used longjmp(). However, it seems that this
223 : can be very slow on some operating systems. Following a suggestion from Stan
224 : Switzer, the use of longjmp() has been abolished, at the cost of having to
225 : provide a unique number for each call to RMATCH. There is no way of generating
226 : a sequence of numbers at compile time in C. I have given them names, to make
227 : them stand out more clearly.
228 :
229 : Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
230 : FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
231 : tests. Furthermore, not using longjmp() means that local dynamic variables
232 : don't have indeterminate values; this has meant that the frame size can be
233 : reduced because the result can be "passed back" by straight setting of the
234 : variable instead of being passed in the frame.
235 : ****************************************************************************
236 : ***************************************************************************/
237 :
238 : /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
239 : below must be updated in sync. */
240 :
/* One identifier per RMATCH call site, numbered consecutively from 1. Each
row below starts with its explicit value to make the numbering easy to check. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54
};
247 :
248 : /* These versions of the macros use the stack, as normal. There are debugging
249 : versions and production versions. Note that the "rw" argument of RMATCH isn't
250 : actually used in this definition. */
251 :
252 : #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH performs a genuine recursive call of match(), one level deeper than
the current call, and leaves the result in rrc. The current mstart is always
passed on; the "rw" argument is ignored. RRETURN is a plain return. When DEBUG
is defined, both also report the source line they were used on. */

#ifdef DEBUG

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match((ra),(rb),mstart,(rc),(rd),(re),(rf),(rg),rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", (ra), __LINE__); \
  return (ra); \
  }

#else  /* DEBUG not defined */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match((ra),(rb),mstart,(rc),(rd),(re),(rf),(rg),rdepth+1)

#define RRETURN(ra) return (ra)

#endif  /* DEBUG */
272 :
273 : #else
274 :
275 :
/* These versions of the macros manage a private stack of frames on the heap
instead of recursing. Note that the "rd" argument of RMATCH isn't actually
used in this definition. It's the md argument of match(), which never changes.

RMATCH obtains a new frame from pcre_stack_malloc, records in the current
frame (Xwhere) which call site is being left, fills the new frame with the
arguments of the simulated call, makes it the current frame and jumps to
HEAP_RECURSE. The label L_<rw> marks the place where the call site is resumed
once the simulated call has finished.

If no memory can be obtained for the new frame, the simulated call is not
made: the current frame returns PCRE_ERROR_NOMEMORY through RRETURN, in the
same way as any other error result. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
301 :
/* Leave the current simulated call: release its frame and make the previous
frame current again. When there is no previous frame, this was the top level,
so really return from match(). Otherwise hand the result over in rrc and jump
to HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  heapframe *doneframe = frame;\
  frame = doneframe->Xprevframe;\
  (pcre_stack_free)(doneframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
314 :
315 :
316 : /* Structure for remembering the local variables in a private frame */
317 :
318 : typedef struct heapframe {
319 : struct heapframe *Xprevframe;
320 :
321 : /* Function arguments that may change */
322 :
323 : USPTR Xeptr;
324 : const uschar *Xecode;
325 : USPTR Xmstart;
326 : int Xoffset_top;
327 : long int Xims;
328 : eptrblock *Xeptrb;
329 : int Xflags;
330 : unsigned int Xrdepth;
331 :
332 : /* Function local variables */
333 :
334 : USPTR Xcallpat;
335 : #ifdef SUPPORT_UTF8
336 : USPTR Xcharptr;
337 : #endif
338 : USPTR Xdata;
339 : USPTR Xnext;
340 : USPTR Xpp;
341 : USPTR Xprev;
342 : USPTR Xsaved_eptr;
343 :
344 : recursion_info Xnew_recursive;
345 :
346 : BOOL Xcur_is_word;
347 : BOOL Xcondition;
348 : BOOL Xprev_is_word;
349 :
350 : unsigned long int Xoriginal_ims;
351 :
352 : #ifdef SUPPORT_UCP
353 : int Xprop_type;
354 : int Xprop_value;
355 : int Xprop_fail_result;
356 : int Xprop_category;
357 : int Xprop_chartype;
358 : int Xprop_script;
359 : int Xoclength;
360 : uschar Xocchars[8];
361 : #endif
362 :
363 : int Xcodelink;
364 : int Xctype;
365 : unsigned int Xfc;
366 : int Xfi;
367 : int Xlength;
368 : int Xmax;
369 : int Xmin;
370 : int Xnumber;
371 : int Xoffset;
372 : int Xop;
373 : int Xsave_capture_last;
374 : int Xsave_offset1, Xsave_offset2, Xsave_offset3;
375 : int Xstacksave[REC_STACK_SAVE_MAX];
376 :
377 : eptrblock Xnewptrb;
378 :
379 : /* Where to jump back to */
380 :
381 : int Xwhere;
382 :
383 : } heapframe;
384 :
385 : #endif
386 :
387 :
388 : /***************************************************************************
389 : ***************************************************************************/
390 :
391 :
392 :
393 : /*************************************************
394 : * Match from current position *
395 : *************************************************/
396 :
397 : /* This function is called recursively in many circumstances. Whenever it
398 : returns a negative (error) response, the outer incarnation must also return the
399 : same response.
400 :
401 : Performance note: It might be tempting to extract commonly used fields from the
402 : md structure (e.g. utf8, end_subject) into individual variables to improve
403 : performance. Tests using gcc on a SPARC disproved this; in the first case, it
404 : made performance worse.
405 :
406 : Arguments:
407 : eptr pointer to current character in subject
408 : ecode pointer to current position in compiled code
409 : mstart pointer to the current match start position (can be modified
410 : by encountering \K)
411 : offset_top current top pointer
412 : md pointer to "static" info for the match
413 : ims current /i, /m, and /s options
414 : eptrb pointer to chain of blocks containing eptr at start of
415 : brackets - for testing for empty matches
416 : flags can contain
417 : match_condassert - this is an assertion condition
418 : match_cbegroup - this is the start of an unlimited repeat
419 : group that can match an empty string
420 : rdepth the recursion depth
421 :
422 : Returns: MATCH_MATCH if matched ) these values are >= 0
423 : MATCH_NOMATCH if failed to match )
424 : a negative PCRE_ERROR_xxx value if aborted by an error condition
425 : (e.g. stopped by repeated call or recursion limit)
426 : */
427 :
428 : static int
429 : match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
430 : int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
431 : int flags, unsigned int rdepth)
432 1509979 : {
433 : /* These variables do not need to be preserved over recursion in this function,
434 : so they can be ordinary variables in all cases. Mark some of them with
435 : "register" because they are used a lot in loops. */
436 :
437 : register int rrc; /* Returns from recursive calls */
438 : register int i; /* Used for loops not involving calls to RMATCH() */
439 : register unsigned int c; /* Character values not kept over RMATCH() calls */
440 : register BOOL utf8; /* Local copy of UTF-8 flag for speed */
441 :
442 : BOOL minimize, possessive; /* Quantifier options */
443 : int condcode;
444 :
445 : /* When recursion is not being used, all "local" variables that have to be
446 : preserved over calls to RMATCH() are part of a "frame" which is obtained from
447 : heap storage. Set up the top-level frame here; others are obtained from the
448 : heap whenever RMATCH() does a "recursion". See the macro definitions above. */
449 :
450 : #ifdef NO_RECURSE
451 : heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
452 : frame->Xprevframe = NULL; /* Marks the top level */
453 :
454 : /* Copy in the original argument variables */
455 :
456 : frame->Xeptr = eptr;
457 : frame->Xecode = ecode;
458 : frame->Xmstart = mstart;
459 : frame->Xoffset_top = offset_top;
460 : frame->Xims = ims;
461 : frame->Xeptrb = eptrb;
462 : frame->Xflags = flags;
463 : frame->Xrdepth = rdepth;
464 :
465 : /* This is where control jumps back to to effect "recursion" */
466 :
467 : HEAP_RECURSE:
468 :
469 : /* Macros make the argument variables come from the current frame */
470 :
471 : #define eptr frame->Xeptr
472 : #define ecode frame->Xecode
473 : #define mstart frame->Xmstart
474 : #define offset_top frame->Xoffset_top
475 : #define ims frame->Xims
476 : #define eptrb frame->Xeptrb
477 : #define flags frame->Xflags
478 : #define rdepth frame->Xrdepth
479 :
480 : /* Ditto for the local variables */
481 :
482 : #ifdef SUPPORT_UTF8
483 : #define charptr frame->Xcharptr
484 : #endif
485 : #define callpat frame->Xcallpat
486 : #define codelink frame->Xcodelink
487 : #define data frame->Xdata
488 : #define next frame->Xnext
489 : #define pp frame->Xpp
490 : #define prev frame->Xprev
491 : #define saved_eptr frame->Xsaved_eptr
492 :
493 : #define new_recursive frame->Xnew_recursive
494 :
495 : #define cur_is_word frame->Xcur_is_word
496 : #define condition frame->Xcondition
497 : #define prev_is_word frame->Xprev_is_word
498 :
499 : #define original_ims frame->Xoriginal_ims
500 :
501 : #ifdef SUPPORT_UCP
502 : #define prop_type frame->Xprop_type
503 : #define prop_value frame->Xprop_value
504 : #define prop_fail_result frame->Xprop_fail_result
505 : #define prop_category frame->Xprop_category
506 : #define prop_chartype frame->Xprop_chartype
507 : #define prop_script frame->Xprop_script
508 : #define oclength frame->Xoclength
509 : #define occhars frame->Xocchars
510 : #endif
511 :
512 : #define ctype frame->Xctype
513 : #define fc frame->Xfc
514 : #define fi frame->Xfi
515 : #define length frame->Xlength
516 : #define max frame->Xmax
517 : #define min frame->Xmin
518 : #define number frame->Xnumber
519 : #define offset frame->Xoffset
520 : #define op frame->Xop
521 : #define save_capture_last frame->Xsave_capture_last
522 : #define save_offset1 frame->Xsave_offset1
523 : #define save_offset2 frame->Xsave_offset2
524 : #define save_offset3 frame->Xsave_offset3
525 : #define stacksave frame->Xstacksave
526 :
527 : #define newptrb frame->Xnewptrb
528 :
529 : /* When recursion is being used, local variables are allocated on the stack and
530 : get preserved during recursion in the normal way. In this environment, fi and
531 : i, and fc and c, can be the same variables. */
532 :
533 : #else /* NO_RECURSE not defined */
534 : #define fi i
535 : #define fc c
536 :
537 :
538 : #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
539 : const uschar *charptr; /* in small blocks of the code. My normal */
540 : #endif /* style of coding would have declared */
541 : const uschar *callpat; /* them within each of those blocks. */
542 : const uschar *data; /* However, in order to accommodate the */
543 : const uschar *next; /* version of this code that uses an */
544 : USPTR pp; /* external "stack" implemented on the */
545 : const uschar *prev; /* heap, it is easier to declare them all */
546 : USPTR saved_eptr; /* here, so the declarations can be cut */
547 : /* out in a block. The only declarations */
548 : recursion_info new_recursive; /* within blocks below are for variables */
549 : /* that do not have to be preserved over */
550 : BOOL cur_is_word; /* a recursive call to RMATCH(). */
551 : BOOL condition;
552 : BOOL prev_is_word;
553 :
554 : unsigned long int original_ims;
555 :
556 : #ifdef SUPPORT_UCP
557 : int prop_type;
558 : int prop_value;
559 : int prop_fail_result;
560 : int prop_category;
561 : int prop_chartype;
562 : int prop_script;
563 : int oclength;
564 : uschar occhars[8];
565 : #endif
566 :
567 : int codelink;
568 : int ctype;
569 : int length;
570 : int max;
571 : int min;
572 : int number;
573 : int offset;
574 : int op;
575 : int save_capture_last;
576 : int save_offset1, save_offset2, save_offset3;
577 : int stacksave[REC_STACK_SAVE_MAX];
578 :
579 : eptrblock newptrb;
580 : #endif /* NO_RECURSE */
581 :
582 : /* These statements are here to stop the compiler complaining about uninitialized
583 : variables. */
584 :
585 : #ifdef SUPPORT_UCP
586 1509979 : prop_value = 0;
587 1509979 : prop_fail_result = 0;
588 : #endif
589 :
590 :
591 : /* This label is used for tail recursion, which is used in a few cases even
592 : when NO_RECURSE is not defined, in order to reduce the amount of stack that is
593 : used. Thanks to Ian Taylor for noticing this possibility and sending the
594 : original patch. */
595 :
596 2088044 : TAIL_RECURSE:
597 :
598 : /* OK, now we can get on with the real code of the function. Recursive calls
599 : are specified by the macro RMATCH and RRETURN is used to return. When
600 : NO_RECURSE is *not* defined, these just turn into a recursive call to match()
601 : and a "return", respectively (possibly with some debugging if DEBUG is
602 : defined). However, RMATCH isn't like a function call because it's quite a
603 : complicated macro. It has to be used in one particular way. This shouldn't,
604 : however, impact performance when true recursion is being used. */
605 :
606 : #ifdef SUPPORT_UTF8
607 2088044 : utf8 = md->utf8; /* Local copy of the flag */
608 : #else
609 : utf8 = FALSE;
610 : #endif
611 :
612 : /* First check that we haven't called match() too many times, or that we
613 : haven't exceeded the recursive call limit. */
614 :
615 2088044 : if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
616 2088041 : if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
617 :
618 2088038 : original_ims = ims; /* Save for resetting on ')' */
619 :
620 : /* At the start of a group with an unlimited repeat that may match an empty
621 : string, the match_cbegroup flag is set. When this is the case, add the current
622 : subject pointer to the chain of such remembered pointers, to be checked when we
623 : hit the closing ket, in order to break infinite loops that match no characters.
624 : When match() is called in other circumstances, don't add to the chain. The
625 : match_cbegroup flag must NOT be used with tail recursion, because the memory
626 : block that is used is on the stack, so a new one may be required for each
627 : match(). */
628 :
629 2088038 : if ((flags & match_cbegroup) != 0)
630 : {
631 100167 : newptrb.epb_saved_eptr = eptr;
632 100167 : newptrb.epb_prev = eptrb;
633 100167 : eptrb = &newptrb;
634 : }
635 :
636 : /* Now start processing the opcodes. */
637 :
638 : for (;;)
639 : {
640 18231589 : minimize = possessive = FALSE;
641 18231589 : op = *ecode;
642 :
643 : /* For partial matching, remember if we ever hit the end of the subject after
644 : matching at least one subject character. */
645 :
646 18231589 : if (md->partial &&
647 : eptr >= md->end_subject &&
648 : eptr > mstart)
649 0 : md->hitend = TRUE;
650 :
651 18231589 : switch(op)
652 : {
653 : case OP_FAIL:
654 0 : RRETURN(MATCH_NOMATCH);
655 :
656 : case OP_PRUNE:
657 0 : RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
658 : ims, eptrb, flags, RM51);
659 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
660 0 : RRETURN(MATCH_PRUNE);
661 :
662 : case OP_COMMIT:
663 0 : RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
664 : ims, eptrb, flags, RM52);
665 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
666 0 : RRETURN(MATCH_COMMIT);
667 :
668 : case OP_SKIP:
669 0 : RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
670 : ims, eptrb, flags, RM53);
671 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
672 0 : md->start_match_ptr = eptr; /* Pass back current position */
673 0 : RRETURN(MATCH_SKIP);
674 :
675 : case OP_THEN:
676 0 : RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
677 : ims, eptrb, flags, RM54);
678 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
679 0 : RRETURN(MATCH_THEN);
680 :
681 : /* Handle a capturing bracket. If there is space in the offset vector, save
682 : the current subject position in the working slot at the top of the vector.
683 : We mustn't change the current values of the data slot, because they may be
684 : set from a previous iteration of this group, and be referred to by a
685 : reference inside the group.
686 :
687 : If the bracket fails to match, we need to restore this value and also the
688 : values of the final offsets, in case they were set by a previous iteration
689 : of the same bracket.
690 :
691 : If there isn't enough space in the offset vector, treat this as if it were
692 : a non-capturing bracket. Don't worry about setting the flag for the error
693 : case here; that is handled in the code for KET. */
694 :
695 : case OP_CBRA:
696 : case OP_SCBRA:
697 119435 : number = GET2(ecode, 1+LINK_SIZE);
698 119435 : offset = number << 1;
699 :
700 : #ifdef DEBUG
701 : printf("start bracket %d\n", number);
702 : printf("subject=");
703 : pchars(eptr, 16, TRUE, md);
704 : printf("\n");
705 : #endif
706 :
707 119435 : if (offset < md->offset_max)
708 : {
709 115141 : save_offset1 = md->offset_vector[offset];
710 115141 : save_offset2 = md->offset_vector[offset+1];
711 115141 : save_offset3 = md->offset_vector[md->offset_end - number];
712 115141 : save_capture_last = md->capture_last;
713 :
714 : DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
715 115141 : md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
716 :
717 115141 : flags = (op == OP_SCBRA)? match_cbegroup : 0;
718 : do
719 : {
720 165370 : RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
721 : ims, eptrb, flags, RM1);
722 165370 : if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
723 130974 : md->capture_last = save_capture_last;
724 130974 : ecode += GET(ecode, 1);
725 : }
726 130974 : while (*ecode == OP_ALT);
727 :
728 : DPRINTF(("bracket %d failed\n", number));
729 :
730 80745 : md->offset_vector[offset] = save_offset1;
731 80745 : md->offset_vector[offset+1] = save_offset2;
732 80745 : md->offset_vector[md->offset_end - number] = save_offset3;
733 :
734 80745 : RRETURN(MATCH_NOMATCH);
735 : }
736 :
737 : /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
738 : as a non-capturing bracket. */
739 :
740 : /* VVVVVVVVVVVVVVVVVVVVVVVVV */
741 : /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742 :
743 : DPRINTF(("insufficient capture room: treat as non-capturing\n"));
744 :
745 : /* VVVVVVVVVVVVVVVVVVVVVVVVV */
746 : /* VVVVVVVVVVVVVVVVVVVVVVVVV */
747 :
748 : /* Non-capturing bracket. Loop for all the alternatives. When we get to the
749 : final alternative within the brackets, we would return the result of a
750 : recursive call to match() whatever happened. We can reduce stack usage by
751 : turning this into a tail recursion, except in the case when match_cbegroup
752 : is set.*/
753 :
754 : case OP_BRA:
755 : case OP_SBRA:
756 : DPRINTF(("start non-capturing bracket\n"));
757 577030 : flags = (op >= OP_SBRA)? match_cbegroup : 0;
758 : for (;;)
759 : {
760 577211 : if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
761 : {
762 576948 : if (flags == 0) /* Not a possibly empty group */
763 : {
764 576943 : ecode += _pcre_OP_lengths[*ecode];
765 : DPRINTF(("bracket 0 tail recursion\n"));
766 576943 : goto TAIL_RECURSE;
767 : }
768 :
769 : /* Possibly empty group; can't use tail recursion. */
770 :
771 5 : RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
772 : eptrb, flags, RM48);
773 5 : RRETURN(rrc);
774 : }
775 :
776 : /* For non-final alternatives, continue the loop for a NOMATCH result;
777 : otherwise return. */
778 :
779 263 : RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
780 : eptrb, flags, RM2);
781 263 : if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
782 181 : ecode += GET(ecode, 1);
783 181 : }
784 : /* Control never reaches here. */
785 :
786 : /* Conditional group: compilation checked that there are no more than
787 : two branches. If the condition is false, skipping the first branch takes us
788 : past the end if there is only one branch, but that's OK because that is
789 : exactly what going to the ket would do. As there is only one branch to be
790 : obeyed, we can use tail recursion to avoid using another stack frame. */
791 :
792 : case OP_COND:
793 : case OP_SCOND:
794 52 : codelink= GET(ecode, 1);
795 :
796 : /* Because of the way auto-callout works during compile, a callout item is
797 : inserted between OP_COND and an assertion condition. */
798 :
799 52 : if (ecode[LINK_SIZE+1] == OP_CALLOUT)
800 : {
801 0 : if (pcre_callout != NULL)
802 : {
803 : pcre_callout_block cb;
804 0 : cb.version = 1; /* Version 1 of the callout block */
805 0 : cb.callout_number = ecode[LINK_SIZE+2];
806 0 : cb.offset_vector = md->offset_vector;
807 0 : cb.subject = (PCRE_SPTR)md->start_subject;
808 0 : cb.subject_length = md->end_subject - md->start_subject;
809 0 : cb.start_match = mstart - md->start_subject;
810 0 : cb.current_position = eptr - md->start_subject;
811 0 : cb.pattern_position = GET(ecode, LINK_SIZE + 3);
812 0 : cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
813 0 : cb.capture_top = offset_top/2;
814 0 : cb.capture_last = md->capture_last;
815 0 : cb.callout_data = md->callout_data;
816 0 : if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
817 0 : if (rrc < 0) RRETURN(rrc);
818 : }
819 0 : ecode += _pcre_OP_lengths[OP_CALLOUT];
820 : }
821 :
822 52 : condcode = ecode[LINK_SIZE+1];
823 :
824 : /* Now see what the actual condition is */
825 :
826 52 : if (condcode == OP_RREF) /* Recursion test */
827 : {
828 0 : offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
829 0 : condition = md->recursive != NULL &&
830 : (offset == RREF_ANY || offset == md->recursive->group_num);
831 0 : ecode += condition? 3 : GET(ecode, 1);
832 : }
833 :
834 52 : else if (condcode == OP_CREF) /* Group used test */
835 : {
836 52 : offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
837 52 : condition = offset < offset_top && md->offset_vector[offset] >= 0;
838 52 : ecode += condition? 3 : GET(ecode, 1);
839 : }
840 :
841 0 : else if (condcode == OP_DEF) /* DEFINE - always false */
842 : {
843 0 : condition = FALSE;
844 0 : ecode += GET(ecode, 1);
845 : }
846 :
847 : /* The condition is an assertion. Call match() to evaluate it - setting
848 : the final argument match_condassert causes it to stop at the end of an
849 : assertion. */
850 :
851 : else
852 : {
853 0 : RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
854 : match_condassert, RM3);
855 0 : if (rrc == MATCH_MATCH)
856 : {
857 0 : condition = TRUE;
858 0 : ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
859 0 : while (*ecode == OP_ALT) ecode += GET(ecode, 1);
860 : }
861 0 : else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
862 : {
863 0 : RRETURN(rrc); /* Need braces because of following else */
864 : }
865 : else
866 : {
867 0 : condition = FALSE;
868 0 : ecode += codelink;
869 : }
870 : }
871 :
872 : /* We are now at the branch that is to be obeyed. As there is only one,
873 : we can use tail recursion to avoid using another stack frame, except when
874 : match_cbegroup is required for an unlimited repeat of a possibly empty
875 : group. If the second alternative doesn't exist, we can just plough on. */
876 :
877 52 : if (condition || *ecode == OP_ALT)
878 : {
879 52 : ecode += 1 + LINK_SIZE;
880 52 : if (op == OP_SCOND) /* Possibly empty group */
881 : {
882 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
883 0 : RRETURN(rrc);
884 : }
885 : else /* Group must match something */
886 : {
887 52 : flags = 0;
888 52 : goto TAIL_RECURSE;
889 : }
890 : }
891 : else /* Condition false & no alternative */
892 : {
893 0 : ecode += 1 + LINK_SIZE;
894 : }
895 0 : break;
896 :
897 :
898 : /* End of the pattern, either real or forced. If we are in a top-level
899 : recursion, we should restore the offsets appropriately and continue from
900 : after the call. */
901 :
902 : case OP_ACCEPT:
903 : case OP_END:
904 56550 : if (md->recursive != NULL && md->recursive->group_num == 0)
905 : {
906 1 : recursion_info *rec = md->recursive;
907 : DPRINTF(("End of pattern in a (?0) recursion\n"));
908 1 : md->recursive = rec->prevrec;
909 1 : memmove(md->offset_vector, rec->offset_save,
910 : rec->saved_max * sizeof(int));
911 1 : mstart = rec->save_start;
912 1 : ims = original_ims;
913 1 : ecode = rec->after_call;
914 1 : break;
915 : }
916 :
917 : /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
918 : string - backtracking will then try other alternatives, if any. */
919 :
920 56549 : if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
921 56478 : md->end_match_ptr = eptr; /* Record where we ended */
922 56478 : md->end_offset_top = offset_top; /* and how many extracts were taken */
923 56478 : md->start_match_ptr = mstart; /* and the start (\K can modify) */
924 56478 : RRETURN(MATCH_MATCH);
925 :
926 : /* Change option settings */
927 :
928 : case OP_OPT:
929 0 : ims = ecode[1];
930 0 : ecode += 2;
931 : DPRINTF(("ims set to %02lx\n", ims));
932 0 : break;
933 :
934 : /* Assertion brackets. Check the alternative branches in turn - the
935 : matching won't pass the KET for an assertion. If any one branch matches,
936 : the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
937 : start of each branch to move the current point backwards, so the code at
938 : this level is identical to the lookahead case. */
939 :
940 : case OP_ASSERT:
941 : case OP_ASSERTBACK:
942 : do
943 : {
944 83 : RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
945 : RM4);
946 83 : if (rrc == MATCH_MATCH) break;
947 0 : if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
948 0 : ecode += GET(ecode, 1);
949 : }
950 0 : while (*ecode == OP_ALT);
951 83 : if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
952 :
953 : /* If checking an assertion for a condition, return MATCH_MATCH. */
954 :
955 83 : if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
956 :
957 : /* Continue from after the assertion, updating the offsets high water
958 : mark, since extracts may have been taken during the assertion. */
959 :
960 83 : do ecode += GET(ecode,1); while (*ecode == OP_ALT);
961 83 : ecode += 1 + LINK_SIZE;
962 83 : offset_top = md->end_offset_top;
963 83 : continue;
964 :
965 : /* Negative assertion: all branches must fail to match */
966 :
967 : case OP_ASSERT_NOT:
968 : case OP_ASSERTBACK_NOT:
969 : do
970 : {
971 26 : RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
972 : RM5);
973 26 : if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
974 13 : if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
975 13 : ecode += GET(ecode,1);
976 : }
977 13 : while (*ecode == OP_ALT);
978 :
979 13 : if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
980 :
981 13 : ecode += 1 + LINK_SIZE;
982 13 : continue;
983 :
984 : /* Move the subject pointer back. This occurs only at the start of
985 : each branch of a lookbehind assertion. If we are too close to the start to
986 : move back, this match function fails. When working with UTF-8 we move
987 : back a number of characters, not bytes. */
988 :
989 : case OP_REVERSE:
990 : #ifdef SUPPORT_UTF8
991 18 : if (utf8)
992 : {
993 0 : i = GET(ecode, 1);
994 0 : while (i-- > 0)
995 : {
996 0 : eptr--;
997 0 : if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
998 0 : BACKCHAR(eptr);
999 : }
1000 : }
1001 : else
1002 : #endif
1003 :
1004 : /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1005 :
1006 : {
1007 18 : eptr -= GET(ecode, 1);
1008 18 : if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1009 : }
1010 :
1011 : /* Skip to next op code */
1012 :
1013 17 : ecode += 1 + LINK_SIZE;
1014 17 : break;
1015 :
1016 : /* The callout item calls an external function, if one is provided, passing
1017 : details of the match so far. This is mainly for debugging, though the
1018 : function is able to force a failure. */
1019 :
1020 : case OP_CALLOUT:
1021 0 : if (pcre_callout != NULL)
1022 : {
1023 : pcre_callout_block cb;
1024 0 : cb.version = 1; /* Version 1 of the callout block */
1025 0 : cb.callout_number = ecode[1];
1026 0 : cb.offset_vector = md->offset_vector;
1027 0 : cb.subject = (PCRE_SPTR)md->start_subject;
1028 0 : cb.subject_length = md->end_subject - md->start_subject;
1029 0 : cb.start_match = mstart - md->start_subject;
1030 0 : cb.current_position = eptr - md->start_subject;
1031 0 : cb.pattern_position = GET(ecode, 2);
1032 0 : cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1033 0 : cb.capture_top = offset_top/2;
1034 0 : cb.capture_last = md->capture_last;
1035 0 : cb.callout_data = md->callout_data;
1036 0 : if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1037 0 : if (rrc < 0) RRETURN(rrc);
1038 : }
1039 0 : ecode += 2 + 2*LINK_SIZE;
1040 0 : break;
1041 :
1042 : /* Recursion either matches the current regex, or some subexpression. The
1043 : offset data is the offset to the starting bracket from the start of the
1044 : whole pattern. (This is so that it works from duplicated subpatterns.)
1045 :
1046 : If there are any capturing brackets started but not finished, we have to
1047 : save their starting points and reinstate them after the recursion. However,
1048 : we don't know how many such there are (offset_top records the completed
1049 : total) so we just have to save all the potential data. There may be up to
1050 : 65535 such values, which is too large to put on the stack, but using malloc
1051 : for small numbers seems expensive. As a compromise, the stack is used when
1052 : there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1053 : is used. If the malloc fails, the saved data cannot be kept, so the match
1054 : is abandoned at once and PCRE_ERROR_NOMEMORY is passed back up to the
1055 : caller; no attempt is made to carry on with a partial save.
1056 :
1057 : There are also other values that have to be saved. We use a chained
1058 : sequence of blocks that actually live on the stack. Thanks to Robin Houston
1059 : for the original version of this logic. */
1060 :
1061 : case OP_RECURSE:
1062 : {
1063 25006 : callpat = md->start_code + GET(ecode, 1);
1064 25006 : new_recursive.group_num = (callpat == md->start_code)? 0 :
1065 : GET2(callpat, 1 + LINK_SIZE);
1066 :
1067 : /* Add to "recursing stack" */
1068 :
1069 25006 : new_recursive.prevrec = md->recursive;
1070 25006 : md->recursive = &new_recursive;
1071 :
1072 : /* Find where to continue from afterwards */
1073 :
1074 25006 : ecode += 1 + LINK_SIZE;
1075 25006 : new_recursive.after_call = ecode;
1076 :
1077 : /* Now save the offset data. */
1078 :
1079 25006 : new_recursive.saved_max = md->offset_end;
1080 25006 : if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1081 25006 : new_recursive.offset_save = stacksave;
1082 : else
1083 : {
1084 0 : new_recursive.offset_save =
1085 : (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1086 0 : if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1087 : }
1088 :
1089 25006 : memcpy(new_recursive.offset_save, md->offset_vector,
1090 : new_recursive.saved_max * sizeof(int));
1091 25006 : new_recursive.save_start = mstart;
1092 25006 : mstart = eptr;
1093 :
1094 : /* OK, now we can do the recursion. For each top-level alternative we
1095 : restore the offset and recursion data. */
1096 :
1097 : DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1098 25006 : flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1099 : do
1100 : {
1101 25006 : RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1102 : md, ims, eptrb, flags, RM6);
1103 25006 : if (rrc == MATCH_MATCH)
1104 : {
1105 : DPRINTF(("Recursion matched\n"));
1106 1 : md->recursive = new_recursive.prevrec;
1107 1 : if (new_recursive.offset_save != stacksave)
1108 0 : (pcre_free)(new_recursive.offset_save);
1109 1 : RRETURN(MATCH_MATCH);
1110 : }
1111 25005 : else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1112 : {
1113 : DPRINTF(("Recursion gave error %d\n", rrc));
1114 0 : if (new_recursive.offset_save != stacksave)
1115 0 : (pcre_free)(new_recursive.offset_save);
1116 0 : RRETURN(rrc);
1117 : }
1118 :
1119 25005 : md->recursive = &new_recursive;
1120 25005 : memcpy(md->offset_vector, new_recursive.offset_save,
1121 : new_recursive.saved_max * sizeof(int));
1122 25005 : callpat += GET(callpat, 1);
1123 : }
1124 25005 : while (*callpat == OP_ALT);
1125 :
1126 : DPRINTF(("Recursion didn't match\n"));
1127 25005 : md->recursive = new_recursive.prevrec;
1128 25005 : if (new_recursive.offset_save != stacksave)
1129 0 : (pcre_free)(new_recursive.offset_save);
1130 25005 : RRETURN(MATCH_NOMATCH);
1131 : }
1132 : /* Control never reaches here */
1133 :
1134 : /* "Once" brackets are like assertion brackets except that after a match,
1135 : the point in the subject string is not moved back. Thus there can never be
1136 : a move back into the brackets. Friedl calls these "atomic" subpatterns.
1137 : Check the alternative branches in turn - the matching won't pass the KET
1138 : for this kind of subpattern. If any one branch matches, we carry on as at
1139 : the end of a normal bracket, leaving the subject pointer. */
1140 :
1141 : case OP_ONCE:
1142 25008 : prev = ecode;
1143 25008 : saved_eptr = eptr;
1144 :
1145 : do
1146 : {
1147 25008 : RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1148 25008 : if (rrc == MATCH_MATCH) break;
1149 25005 : if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1150 25005 : ecode += GET(ecode,1);
1151 : }
1152 25005 : while (*ecode == OP_ALT);
1153 :
1154 : /* If no branch matched, we are at the group's ket (which could be a repeating one): fail */
1155 :
1156 25008 : if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1157 :
1158 : /* Continue from after the atomic group, updating the offsets high water
1159 : mark, since extracts may have been taken. */
1160 :
1161 3 : do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1162 :
1163 3 : offset_top = md->end_offset_top;
1164 3 : eptr = md->end_match_ptr;
1165 :
1166 : /* For a non-repeating ket, just continue at this level. This also
1167 : happens for a repeating ket if no characters were matched in the group.
1168 : This is the forcible breaking of infinite loops as implemented in Perl
1169 : 5.005. If there is an options reset, it will get obeyed in the normal
1170 : course of events. */
1171 :
1172 3 : if (*ecode == OP_KET || eptr == saved_eptr)
1173 : {
1174 3 : ecode += 1+LINK_SIZE;
1175 3 : break;
1176 : }
1177 :
1178 : /* The repeating kets try the rest of the pattern or restart from the
1179 : preceding bracket, in the appropriate order. The second "call" of match()
1180 : uses tail recursion, to avoid using another stack frame. We need to reset
1181 : any options that changed within the bracket before re-running it, so
1182 : check the next opcode. */
1183 :
1184 0 : if (ecode[1+LINK_SIZE] == OP_OPT)
1185 : {
1186 0 : ims = (ims & ~PCRE_IMS) | ecode[4];
1187 : DPRINTF(("ims set to %02lx at group repeat\n", ims));
1188 : }
1189 :
1190 0 : if (*ecode == OP_KETRMIN)
1191 : {
1192 0 : RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1193 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1194 0 : ecode = prev;
1195 0 : flags = 0;
1196 0 : goto TAIL_RECURSE;
1197 : }
1198 : else /* OP_KETRMAX */
1199 : {
1200 0 : RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1201 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1202 0 : ecode += 1 + LINK_SIZE;
1203 0 : flags = 0;
1204 0 : goto TAIL_RECURSE;
1205 : }
1206 : /* Control never gets here */
1207 :
1208 : /* An alternation is the end of a branch; scan along to find the end of the
1209 : bracketed group and go to there. */
1210 :
1211 : case OP_ALT:
1212 24908 : do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1213 24853 : break;
1214 :
1215 : /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1216 : indicating that it may occur zero times. It may repeat infinitely, or not
1217 : at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1218 : with fixed upper repeat limits are compiled as a number of copies, with the
1219 : optional ones preceded by BRAZERO or BRAMINZERO. */
1220 :
1221 : case OP_BRAZERO:
1222 : {
1223 5491 : next = ecode+1;
1224 5491 : RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1225 5491 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1226 5348 : do next += GET(next,1); while (*next == OP_ALT);
1227 5347 : ecode = next + 1 + LINK_SIZE;
1228 : }
1229 5347 : break;
1230 :
1231 : case OP_BRAMINZERO:
1232 : {
1233 14 : next = ecode+1;
1234 19 : do next += GET(next, 1); while (*next == OP_ALT);
1235 14 : RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1236 14 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1237 9 : ecode++;
1238 : }
1239 9 : break;
1240 :
1241 : case OP_SKIPZERO:
1242 : {
1243 0 : next = ecode+1;
1244 0 : do next += GET(next,1); while (*next == OP_ALT);
1245 0 : ecode = next + 1 + LINK_SIZE;
1246 : }
1247 0 : break;
1248 :
1249 : /* End of a group, repeated or non-repeating. */
1250 :
1251 : case OP_KET:
1252 : case OP_KETRMIN:
1253 : case OP_KETRMAX:
1254 150635 : prev = ecode - GET(ecode, 1);
1255 :
1256 : /* If this was a group that remembered the subject start, in order to break
1257 : infinite repeats of empty string matches, retrieve the subject start from
1258 : the chain. Otherwise, set it NULL. */
1259 :
1260 150635 : if (*prev >= OP_SBRA)
1261 : {
1262 25090 : saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1263 25090 : eptrb = eptrb->epb_prev; /* Backup to previous group */
1264 : }
1265 125545 : else saved_eptr = NULL;
1266 :
1267 : /* If we are at the end of an assertion group, stop matching and return
1268 : MATCH_MATCH, but record the current high water mark for use by positive
1269 : assertions. Do this also for the "once" (atomic) groups. */
1270 :
1271 150635 : if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1272 : *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1273 : *prev == OP_ONCE)
1274 : {
1275 99 : md->end_match_ptr = eptr; /* For ONCE */
1276 99 : md->end_offset_top = offset_top;
1277 99 : RRETURN(MATCH_MATCH);
1278 : }
1279 :
1280 : /* For capturing groups we have to check the group number back at the start
1281 : and if necessary complete handling an extraction by setting the offsets and
1282 : bumping the high water mark. Note that whole-pattern recursion is coded as
1283 : a recurse into group 0, so it won't be picked up here. Instead, we catch it
1284 : when the OP_END is reached. Other recursion is handled here. */
1285 :
1286 150536 : if (*prev == OP_CBRA || *prev == OP_SCBRA)
1287 : {
1288 93710 : number = GET2(prev, 1+LINK_SIZE);
1289 93710 : offset = number << 1;
1290 :
1291 : #ifdef DEBUG
1292 : printf("end bracket %d", number);
1293 : printf("\n");
1294 : #endif
1295 :
1296 93710 : md->capture_last = number;
1297 93710 : if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1298 : {
1299 90235 : md->offset_vector[offset] =
1300 : md->offset_vector[md->offset_end - number];
1301 90235 : md->offset_vector[offset+1] = eptr - md->start_subject;
1302 90235 : if (offset_top <= offset) offset_top = offset + 2;
1303 : }
1304 :
1305 : /* Handle a recursively called group. Restore the offsets
1306 : appropriately and continue from after the call. */
1307 :
1308 93710 : if (md->recursive != NULL && md->recursive->group_num == number)
1309 : {
1310 0 : recursion_info *rec = md->recursive;
1311 : DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1312 0 : md->recursive = rec->prevrec;
1313 0 : mstart = rec->save_start;
1314 0 : memcpy(md->offset_vector, rec->offset_save,
1315 : rec->saved_max * sizeof(int));
1316 0 : ecode = rec->after_call;
1317 0 : ims = original_ims;
1318 0 : break;
1319 : }
1320 : }
1321 :
1322 : /* For both capturing and non-capturing groups, reset the value of the ims
1323 : flags, in case they got changed during the group. */
1324 :
1325 150536 : ims = original_ims;
1326 : DPRINTF(("ims reset to %02lx\n", ims));
1327 :
1328 : /* For a non-repeating ket, just continue at this level. This also
1329 : happens for a repeating ket if no characters were matched in the group.
1330 : This is the forcible breaking of infinite loops as implemented in Perl
1331 : 5.005. If there is an options reset, it will get obeyed in the normal
1332 : course of events. */
1333 :
1334 150536 : if (*ecode == OP_KET || eptr == saved_eptr)
1335 : {
1336 124308 : ecode += 1 + LINK_SIZE;
1337 124308 : break;
1338 : }
1339 :
1340 : /* The repeating kets try the rest of the pattern or restart from the
1341 : preceding bracket, in the appropriate order. In the second case, we can use
1342 : tail recursion to avoid using another stack frame, unless we have an
1343 : unlimited repeat of a group that can match an empty string. */
1344 :
1345 26228 : flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1346 :
1347 26228 : if (*ecode == OP_KETRMIN)
1348 : {
1349 25010 : RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1350 25010 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1351 25008 : if (flags != 0) /* Could match an empty string */
1352 : {
1353 25004 : RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1354 25004 : RRETURN(rrc);
1355 : }
1356 4 : ecode = prev;
1357 4 : goto TAIL_RECURSE;
1358 : }
1359 : else /* OP_KETRMAX */
1360 : {
1361 1218 : RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1362 1218 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1363 1066 : ecode += 1 + LINK_SIZE;
1364 1066 : flags = 0;
1365 1066 : goto TAIL_RECURSE;
1366 : }
1367 : /* Control never gets here */
1368 :
1369 : /* Start of subject unless notbol, or after internal newline if multiline */
1370 :
1371 : case OP_CIRC:
1372 520499 : if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1373 520499 : if ((ims & PCRE_MULTILINE) != 0)
1374 : {
1375 61 : if (eptr != md->start_subject &&
1376 : (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1377 28 : RRETURN(MATCH_NOMATCH);
1378 33 : ecode++;
1379 33 : break;
1380 : }
1381 : /* ... else fall through */
1382 :
1383 : /* Start of subject assertion */
1384 :
1385 : case OP_SOD:
1386 520442 : if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1387 520413 : ecode++;
1388 520413 : break;
1389 :
1390 : /* Start of match assertion */
1391 :
1392 : case OP_SOM:
1393 0 : if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1394 0 : ecode++;
1395 0 : break;
1396 :
1397 : /* Reset the start of match point */
1398 :
1399 : case OP_SET_SOM:
1400 0 : mstart = eptr;
1401 0 : ecode++;
1402 0 : break;
1403 :
1404 : /* Assert before internal newline if multiline, or before a terminating
1405 : newline unless endonly is set, else end of subject unless noteol is set. */
1406 :
1407 : case OP_DOLL:
1408 29536 : if ((ims & PCRE_MULTILINE) != 0)
1409 : {
1410 4 : if (eptr < md->end_subject)
1411 3 : { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1412 : else
1413 1 : { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1414 4 : ecode++;
1415 4 : break;
1416 : }
1417 : else
1418 : {
1419 29532 : if (md->noteol) RRETURN(MATCH_NOMATCH);
1420 29532 : if (!md->endonly)
1421 : {
1422 28930 : if (eptr != md->end_subject &&
1423 : (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1424 8529 : RRETURN(MATCH_NOMATCH);
1425 20401 : ecode++;
1426 20401 : break;
1427 : }
1428 : }
1429 : /* ... else fall through for endonly */
1430 :
1431 : /* End of subject assertion (\z) */
1432 :
1433 : case OP_EOD:
1434 602 : if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1435 12 : ecode++;
1436 12 : break;
1437 :
1438 : /* End of subject or ending \n assertion (\Z) */
1439 :
1440 : case OP_EODN:
1441 0 : if (eptr != md->end_subject &&
1442 : (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1443 0 : RRETURN(MATCH_NOMATCH);
1444 0 : ecode++;
1445 0 : break;
1446 :
1447 : /* Word boundary assertions */
1448 :
1449 : case OP_NOT_WORD_BOUNDARY:
1450 : case OP_WORD_BOUNDARY:
1451 : {
1452 :
1453 : /* Find out if the previous and current characters are "word" characters.
1454 : It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1455 : be "non-word" characters. */
1456 :
1457 : #ifdef SUPPORT_UTF8
1458 209 : if (utf8)
1459 : {
1460 0 : if (eptr == md->start_subject) prev_is_word = FALSE; else
1461 : {
1462 0 : USPTR lastptr = eptr - 1;
1463 0 : while((*lastptr & 0xc0) == 0x80) lastptr--;
1464 0 : GETCHAR(c, lastptr);
1465 0 : prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1466 : }
1467 0 : if (eptr >= md->end_subject) cur_is_word = FALSE; else
1468 : {
1469 0 : GETCHAR(c, eptr);
1470 0 : cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1471 : }
1472 : }
1473 : else
1474 : #endif
1475 :
1476 : /* More streamlined when not in UTF-8 mode */
1477 :
1478 : {
1479 209 : prev_is_word = (eptr != md->start_subject) &&
1480 : ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1481 209 : cur_is_word = (eptr < md->end_subject) &&
1482 : ((md->ctypes[*eptr] & ctype_word) != 0);
1483 : }
1484 :
1485 : /* Now see if the situation is what we want */
1486 :
1487 209 : if ((*ecode++ == OP_WORD_BOUNDARY)?
1488 : cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1489 114 : RRETURN(MATCH_NOMATCH);
1490 : }
1491 95 : break;
1492 :
1493 : /* Match a single character type; inline for speed */
1494 :
1495 : case OP_ANY:
1496 8398926 : if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1497 : /* Fall through */
1498 :
1499 : case OP_ALLANY:
1500 8399051 : if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1501 8399038 : if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1502 8399038 : ecode++;
1503 8399038 : break;
1504 :
1505 : /* Match a single byte, even in UTF-8 mode. This opcode really does match
1506 : any byte, even newline, independent of the setting of PCRE_DOTALL. */
1507 :
1508 : case OP_ANYBYTE:
1509 0 : if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1510 0 : ecode++;
1511 0 : break;
1512 :
1513 : case OP_NOT_DIGIT:
1514 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1515 0 : GETCHARINCTEST(c, eptr);
1516 0 : if (
1517 : #ifdef SUPPORT_UTF8
1518 : c < 256 &&
1519 : #endif
1520 : (md->ctypes[c] & ctype_digit) != 0
1521 : )
1522 0 : RRETURN(MATCH_NOMATCH);
1523 0 : ecode++;
1524 0 : break;
1525 :
1526 : case OP_DIGIT:
1527 395 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1528 346 : GETCHARINCTEST(c, eptr);
1529 346 : if (
1530 : #ifdef SUPPORT_UTF8
1531 : c >= 256 ||
1532 : #endif
1533 : (md->ctypes[c] & ctype_digit) == 0
1534 : )
1535 241 : RRETURN(MATCH_NOMATCH);
1536 105 : ecode++;
1537 105 : break;
1538 :
1539 : case OP_NOT_WHITESPACE:
1540 5 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1541 4 : GETCHARINCTEST(c, eptr);
1542 4 : if (
1543 : #ifdef SUPPORT_UTF8
1544 : c < 256 &&
1545 : #endif
1546 : (md->ctypes[c] & ctype_space) != 0
1547 : )
1548 0 : RRETURN(MATCH_NOMATCH);
1549 4 : ecode++;
1550 4 : break;
1551 :
1552 : case OP_WHITESPACE:
1553 31 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1554 30 : GETCHARINCTEST(c, eptr);
1555 30 : if (
1556 : #ifdef SUPPORT_UTF8
1557 : c >= 256 ||
1558 : #endif
1559 : (md->ctypes[c] & ctype_space) == 0
1560 : )
1561 11 : RRETURN(MATCH_NOMATCH);
1562 19 : ecode++;
1563 19 : break;
1564 :
1565 : case OP_NOT_WORDCHAR:
1566 4 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1567 3 : GETCHARINCTEST(c, eptr);
1568 3 : if (
1569 : #ifdef SUPPORT_UTF8
1570 : c < 256 &&
1571 : #endif
1572 : (md->ctypes[c] & ctype_word) != 0
1573 : )
1574 1 : RRETURN(MATCH_NOMATCH);
1575 2 : ecode++;
1576 2 : break;
1577 :
1578 : case OP_WORDCHAR:
1579 148 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1580 143 : GETCHARINCTEST(c, eptr);
1581 143 : if (
1582 : #ifdef SUPPORT_UTF8
1583 : c >= 256 ||
1584 : #endif
1585 : (md->ctypes[c] & ctype_word) == 0
1586 : )
1587 38 : RRETURN(MATCH_NOMATCH);
1588 105 : ecode++;
1589 105 : break;
1590 :
1591 : case OP_ANYNL:
1592 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1593 0 : GETCHARINCTEST(c, eptr);
1594 0 : switch(c)
1595 : {
1596 0 : default: RRETURN(MATCH_NOMATCH);
1597 : case 0x000d:
1598 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1599 0 : break;
1600 :
1601 : case 0x000a:
1602 0 : break;
1603 :
1604 : case 0x000b:
1605 : case 0x000c:
1606 : case 0x0085:
1607 : case 0x2028:
1608 : case 0x2029:
1609 0 : if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1610 : break;
1611 : }
1612 0 : ecode++;
1613 0 : break;
1614 :
1615 : case OP_NOT_HSPACE:
1616 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1617 0 : GETCHARINCTEST(c, eptr);
1618 0 : switch(c)
1619 : {
1620 : default: break;
1621 : case 0x09: /* HT */
1622 : case 0x20: /* SPACE */
1623 : case 0xa0: /* NBSP */
1624 : case 0x1680: /* OGHAM SPACE MARK */
1625 : case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1626 : case 0x2000: /* EN QUAD */
1627 : case 0x2001: /* EM QUAD */
1628 : case 0x2002: /* EN SPACE */
1629 : case 0x2003: /* EM SPACE */
1630 : case 0x2004: /* THREE-PER-EM SPACE */
1631 : case 0x2005: /* FOUR-PER-EM SPACE */
1632 : case 0x2006: /* SIX-PER-EM SPACE */
1633 : case 0x2007: /* FIGURE SPACE */
1634 : case 0x2008: /* PUNCTUATION SPACE */
1635 : case 0x2009: /* THIN SPACE */
1636 : case 0x200A: /* HAIR SPACE */
1637 : case 0x202f: /* NARROW NO-BREAK SPACE */
1638 : case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1639 : case 0x3000: /* IDEOGRAPHIC SPACE */
1640 0 : RRETURN(MATCH_NOMATCH);
1641 : }
1642 0 : ecode++;
1643 0 : break;
1644 :
1645 : case OP_HSPACE:
1646 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1647 0 : GETCHARINCTEST(c, eptr);
1648 0 : switch(c)
1649 : {
1650 0 : default: RRETURN(MATCH_NOMATCH);
1651 : case 0x09: /* HT */
1652 : case 0x20: /* SPACE */
1653 : case 0xa0: /* NBSP */
1654 : case 0x1680: /* OGHAM SPACE MARK */
1655 : case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1656 : case 0x2000: /* EN QUAD */
1657 : case 0x2001: /* EM QUAD */
1658 : case 0x2002: /* EN SPACE */
1659 : case 0x2003: /* EM SPACE */
1660 : case 0x2004: /* THREE-PER-EM SPACE */
1661 : case 0x2005: /* FOUR-PER-EM SPACE */
1662 : case 0x2006: /* SIX-PER-EM SPACE */
1663 : case 0x2007: /* FIGURE SPACE */
1664 : case 0x2008: /* PUNCTUATION SPACE */
1665 : case 0x2009: /* THIN SPACE */
1666 : case 0x200A: /* HAIR SPACE */
1667 : case 0x202f: /* NARROW NO-BREAK SPACE */
1668 : case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1669 : case 0x3000: /* IDEOGRAPHIC SPACE */
1670 : break;
1671 : }
1672 0 : ecode++;
1673 0 : break;
1674 :
1675 : case OP_NOT_VSPACE:
1676 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1677 0 : GETCHARINCTEST(c, eptr);
1678 0 : switch(c)
1679 : {
1680 : default: break;
1681 : case 0x0a: /* LF */
1682 : case 0x0b: /* VT */
1683 : case 0x0c: /* FF */
1684 : case 0x0d: /* CR */
1685 : case 0x85: /* NEL */
1686 : case 0x2028: /* LINE SEPARATOR */
1687 : case 0x2029: /* PARAGRAPH SEPARATOR */
1688 0 : RRETURN(MATCH_NOMATCH);
1689 : }
1690 0 : ecode++;
1691 0 : break;
1692 :
1693 : case OP_VSPACE:
1694 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1695 0 : GETCHARINCTEST(c, eptr);
1696 0 : switch(c)
1697 : {
1698 0 : default: RRETURN(MATCH_NOMATCH);
1699 : case 0x0a: /* LF */
1700 : case 0x0b: /* VT */
1701 : case 0x0c: /* FF */
1702 : case 0x0d: /* CR */
1703 : case 0x85: /* NEL */
1704 : case 0x2028: /* LINE SEPARATOR */
1705 : case 0x2029: /* PARAGRAPH SEPARATOR */
1706 : break;
1707 : }
1708 0 : ecode++;
1709 0 : break;
1710 :
1711 : #ifdef SUPPORT_UCP
1712 : /* Check the next character by Unicode property. We will get here only
1713 : if the support is in the binary; otherwise a compile-time error occurs. */
1714 :
1715 : case OP_PROP:
1716 : case OP_NOTPROP:
1717 53 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1718 47 : GETCHARINCTEST(c, eptr);
1719 : {
1720 47 : const ucd_record *prop = GET_UCD(c);
1721 :
1722 47 : switch(ecode[1])
1723 : {
1724 : case PT_ANY:
1725 0 : if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1726 0 : break;
1727 :
1728 : case PT_LAMP:
1729 0 : if ((prop->chartype == ucp_Lu ||
1730 : prop->chartype == ucp_Ll ||
1731 : prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1732 0 : RRETURN(MATCH_NOMATCH);
1733 0 : break;
1734 :
1735 : case PT_GC:
1736 32 : if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1737 0 : RRETURN(MATCH_NOMATCH);
1738 32 : break;
1739 :
1740 : case PT_PC:
1741 15 : if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1742 0 : RRETURN(MATCH_NOMATCH);
1743 15 : break;
1744 :
1745 : case PT_SC:
1746 0 : if ((ecode[2] != prop->script) == (op == OP_PROP))
1747 0 : RRETURN(MATCH_NOMATCH);
1748 0 : break;
1749 :
1750 : default:
1751 0 : RRETURN(PCRE_ERROR_INTERNAL);
1752 : }
1753 :
1754 47 : ecode += 3;
1755 : }
1756 47 : break;
1757 :
1758 : /* Match an extended Unicode sequence. We will get here only if the support
1759 : is in the binary; otherwise a compile-time error occurs. */
1760 :
1761 : case OP_EXTUNI:
1762 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1763 0 : GETCHARINCTEST(c, eptr);
1764 : {
1765 0 : int category = UCD_CATEGORY(c);
1766 0 : if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1767 0 : while (eptr < md->end_subject)
1768 : {
1769 0 : int len = 1;
1770 0 : if (!utf8) c = *eptr; else
1771 : {
1772 0 : GETCHARLEN(c, eptr, len);
1773 : }
1774 0 : category = UCD_CATEGORY(c);
1775 0 : if (category != ucp_M) break;
1776 0 : eptr += len;
1777 : }
1778 : }
1779 0 : ecode++;
1780 0 : break;
1781 : #endif
1782 :
1783 :
1784 : /* Match a back reference, possibly repeatedly. Look past the end of the
1785 : item to see if there is repeat information following. The code is similar
1786 : to that for character classes, but repeated for efficiency. Then obey
1787 : similar code to character type repeats - written out again for speed.
1788 : However, if the referenced string is the empty string, always treat
1789 : it as matched, any number of times (otherwise there could be infinite
1790 : loops). */
1791 :
1792 : case OP_REF:
1793 : {
1794 2796 : offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1795 2796 : ecode += 3;
1796 :
1797 : /* If the reference is unset, there are two possibilities:
1798 :
1799 : (a) In the default, Perl-compatible state, set the length to be longer
1800 : than the amount of subject left; this ensures that every attempt at a
1801 : match fails. We can't just fail here, because of the possibility of
1802 : quantifiers with zero minima.
1803 :
1804 : (b) If the JavaScript compatibility flag is set, set the length to zero
1805 : so that the back reference matches an empty string.
1806 :
1807 : Otherwise, set the length to the length of what was matched by the
1808 : referenced subpattern. */
1809 :
1810 2796 : if (offset >= offset_top || md->offset_vector[offset] < 0)
1811 0 : length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1812 : else
1813 2796 : length = md->offset_vector[offset+1] - md->offset_vector[offset];
1814 :
1815 : /* Set up for repetition, or handle the non-repeated case */
1816 :
1817 2796 : switch (*ecode)
1818 : {
1819 : case OP_CRSTAR:
1820 : case OP_CRMINSTAR:
1821 : case OP_CRPLUS:
1822 : case OP_CRMINPLUS:
1823 : case OP_CRQUERY:
1824 : case OP_CRMINQUERY:
1825 0 : c = *ecode++ - OP_CRSTAR;
1826 0 : minimize = (c & 1) != 0;
1827 0 : min = rep_min[c]; /* Pick up values from tables; */
1828 0 : max = rep_max[c]; /* zero for max => infinity */
1829 0 : if (max == 0) max = INT_MAX;
1830 0 : break;
1831 :
1832 : case OP_CRRANGE:
1833 : case OP_CRMINRANGE:
1834 0 : minimize = (*ecode == OP_CRMINRANGE);
1835 0 : min = GET2(ecode, 1);
1836 0 : max = GET2(ecode, 3);
1837 0 : if (max == 0) max = INT_MAX;
1838 0 : ecode += 5;
1839 0 : break;
1840 :
1841 : default: /* No repeat follows */
1842 2796 : if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1843 424 : eptr += length;
1844 424 : continue; /* With the main loop */
1845 : }
1846 :
1847 : /* If the length of the reference is zero, just continue with the
1848 : main loop. */
1849 :
1850 0 : if (length == 0) continue;
1851 :
1852 : /* First, ensure the minimum number of matches are present. We get back
1853 : the length of the reference string explicitly rather than passing the
1854 : address of eptr, so that eptr can be a register variable. */
1855 :
1856 0 : for (i = 1; i <= min; i++)
1857 : {
1858 0 : if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1859 0 : eptr += length;
1860 : }
1861 :
1862 : /* If min = max, continue at the same level without recursion.
1863 : They are not both allowed to be zero. */
1864 :
1865 0 : if (min == max) continue;
1866 :
1867 : /* If minimizing, keep trying and advancing the pointer */
1868 :
1869 0 : if (minimize)
1870 : {
1871 0 : for (fi = min;; fi++)
1872 : {
1873 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1874 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1875 0 : if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1876 0 : RRETURN(MATCH_NOMATCH);
1877 0 : eptr += length;
1878 0 : }
1879 : /* Control never gets here */
1880 : }
1881 :
1882 : /* If maximizing, find the longest string and work backwards */
1883 :
1884 : else
1885 : {
1886 0 : pp = eptr;
1887 0 : for (i = min; i < max; i++)
1888 : {
1889 0 : if (!match_ref(offset, eptr, length, md, ims)) break;
1890 0 : eptr += length;
1891 : }
1892 0 : while (eptr >= pp)
1893 : {
1894 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1895 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1896 0 : eptr -= length;
1897 : }
1898 0 : RRETURN(MATCH_NOMATCH);
1899 : }
1900 : }
1901 : /* Control never gets here */
1902 :
1903 :
1904 :
1905 : /* Match a bit-mapped character class, possibly repeatedly. This op code is
1906 : used when all the characters in the class have values in the range 0-255,
1907 : and either the matching is caseful, or the characters are in the range
1908 : 0-127 when UTF-8 processing is enabled. The only difference between
1909 : OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1910 : encountered.
1911 :
1912 : First, look past the end of the item to see if there is repeat information
1913 : following. Then obey similar code to character type repeats - written out
1914 : again for speed. */
1915 :
1916 : case OP_NCLASS:
1917 : case OP_CLASS:
1918 : {
1919 143972 : data = ecode + 1; /* Save for matching */
1920 143972 : ecode += 33; /* Advance past the item */
1921 :
1922 143972 : switch (*ecode)
1923 : {
1924 : case OP_CRSTAR:
1925 : case OP_CRMINSTAR:
1926 : case OP_CRPLUS:
1927 : case OP_CRMINPLUS:
1928 : case OP_CRQUERY:
1929 : case OP_CRMINQUERY:
1930 140189 : c = *ecode++ - OP_CRSTAR;
1931 140189 : minimize = (c & 1) != 0;
1932 140189 : min = rep_min[c]; /* Pick up values from tables; */
1933 140189 : max = rep_max[c]; /* zero for max => infinity */
1934 140189 : if (max == 0) max = INT_MAX;
1935 140189 : break;
1936 :
1937 : case OP_CRRANGE:
1938 : case OP_CRMINRANGE:
1939 4 : minimize = (*ecode == OP_CRMINRANGE);
1940 4 : min = GET2(ecode, 1);
1941 4 : max = GET2(ecode, 3);
1942 4 : if (max == 0) max = INT_MAX;
1943 4 : ecode += 5;
1944 4 : break;
1945 :
1946 : default: /* No repeat follows */
1947 3779 : min = max = 1;
1948 : break;
1949 : }
1950 :
1951 : /* First, ensure the minimum number of matches are present. */
1952 :
1953 : #ifdef SUPPORT_UTF8
1954 : /* UTF-8 mode */
1955 143972 : if (utf8)
1956 : {
1957 0 : for (i = 1; i <= min; i++)
1958 : {
1959 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1960 0 : GETCHARINC(c, eptr);
1961 0 : if (c > 255)
1962 : {
1963 0 : if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1964 : }
1965 : else
1966 : {
1967 0 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1968 : }
1969 : }
1970 : }
1971 : else
1972 : #endif
1973 : /* Not UTF-8 mode */
1974 : {
1975 195710 : for (i = 1; i <= min; i++)
1976 : {
1977 143006 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1978 141712 : c = *eptr++;
1979 141712 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1980 : }
1981 : }
1982 :
1983 : /* If max == min we can continue with the main loop without the
1984 : need to recurse. */
1985 :
1986 52704 : if (min == max) continue;
1987 :
1988 : /* If minimizing, keep testing the rest of the expression and advancing
1989 : the pointer while it matches the class. */
1990 :
1991 50999 : if (minimize)
1992 : {
1993 : #ifdef SUPPORT_UTF8
1994 : /* UTF-8 mode */
1995 0 : if (utf8)
1996 : {
1997 0 : for (fi = min;; fi++)
1998 : {
1999 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2000 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2001 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2002 0 : GETCHARINC(c, eptr);
2003 0 : if (c > 255)
2004 : {
2005 0 : if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2006 : }
2007 : else
2008 : {
2009 0 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2010 : }
2011 0 : }
2012 : }
2013 : else
2014 : #endif
2015 : /* Not UTF-8 mode */
2016 : {
2017 0 : for (fi = min;; fi++)
2018 : {
2019 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2020 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2021 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2022 0 : c = *eptr++;
2023 0 : if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2024 0 : }
2025 : }
2026 : /* Control never gets here */
2027 : }
2028 :
2029 : /* If maximizing, find the longest possible run, then work backwards. */
2030 :
2031 : else
2032 : {
2033 50999 : pp = eptr;
2034 :
2035 : #ifdef SUPPORT_UTF8
2036 : /* UTF-8 mode */
2037 50999 : if (utf8)
2038 : {
2039 0 : for (i = min; i < max; i++)
2040 : {
2041 0 : int len = 1;
2042 0 : if (eptr >= md->end_subject) break;
2043 0 : GETCHARLEN(c, eptr, len);
2044 0 : if (c > 255)
2045 : {
2046 0 : if (op == OP_CLASS) break;
2047 : }
2048 : else
2049 : {
2050 0 : if ((data[c/8] & (1 << (c&7))) == 0) break;
2051 : }
2052 0 : eptr += len;
2053 : }
2054 : for (;;)
2055 : {
2056 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2057 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2058 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
2059 0 : BACKCHAR(eptr);
2060 0 : }
2061 : }
2062 : else
2063 : #endif
2064 : /* Not UTF-8 mode */
2065 : {
2066 1822273 : for (i = min; i < max; i++)
2067 : {
2068 1822204 : if (eptr >= md->end_subject) break;
2069 1820409 : c = *eptr;
2070 1820409 : if ((data[c/8] & (1 << (c&7))) == 0) break;
2071 1771274 : eptr++;
2072 : }
2073 427125 : while (eptr >= pp)
2074 : {
2075 374602 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2076 374602 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2077 325127 : eptr--;
2078 : }
2079 : }
2080 :
2081 1524 : RRETURN(MATCH_NOMATCH);
2082 : }
2083 : }
2084 : /* Control never gets here */
2085 :
2086 :
2087 : /* Match an extended character class. This opcode is encountered only
2088 : when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2089 : mode, because Unicode properties are supported in non-UTF-8 mode. */
2090 :
2091 : #ifdef SUPPORT_UTF8
2092 : case OP_XCLASS:
2093 : {
2094 3 : data = ecode + 1 + LINK_SIZE; /* Save for matching */
2095 3 : ecode += GET(ecode, 1); /* Advance past the item */
2096 :
2097 3 : switch (*ecode)
2098 : {
2099 : case OP_CRSTAR:
2100 : case OP_CRMINSTAR:
2101 : case OP_CRPLUS:
2102 : case OP_CRMINPLUS:
2103 : case OP_CRQUERY:
2104 : case OP_CRMINQUERY:
2105 3 : c = *ecode++ - OP_CRSTAR;
2106 3 : minimize = (c & 1) != 0;
2107 3 : min = rep_min[c]; /* Pick up values from tables; */
2108 3 : max = rep_max[c]; /* zero for max => infinity */
2109 3 : if (max == 0) max = INT_MAX;
2110 3 : break;
2111 :
2112 : case OP_CRRANGE:
2113 : case OP_CRMINRANGE:
2114 0 : minimize = (*ecode == OP_CRMINRANGE);
2115 0 : min = GET2(ecode, 1);
2116 0 : max = GET2(ecode, 3);
2117 0 : if (max == 0) max = INT_MAX;
2118 0 : ecode += 5;
2119 0 : break;
2120 :
2121 : default: /* No repeat follows */
2122 0 : min = max = 1;
2123 : break;
2124 : }
2125 :
2126 : /* First, ensure the minimum number of matches are present. */
2127 :
2128 3 : for (i = 1; i <= min; i++)
2129 : {
2130 3 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2131 3 : GETCHARINCTEST(c, eptr);
2132 3 : if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2133 : }
2134 :
2135 : /* If max == min we can continue with the main loop without the
2136 : need to recurse. */
2137 :
2138 0 : if (min == max) continue;
2139 :
2140 : /* If minimizing, keep testing the rest of the expression and advancing
2141 : the pointer while it matches the class. */
2142 :
2143 0 : if (minimize)
2144 : {
2145 0 : for (fi = min;; fi++)
2146 : {
2147 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2148 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2149 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2150 0 : GETCHARINCTEST(c, eptr);
2151 0 : if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2152 0 : }
2153 : /* Control never gets here */
2154 : }
2155 :
2156 : /* If maximizing, find the longest possible run, then work backwards. */
2157 :
2158 : else
2159 : {
2160 0 : pp = eptr;
2161 0 : for (i = min; i < max; i++)
2162 : {
2163 0 : int len = 1;
2164 0 : if (eptr >= md->end_subject) break;
2165 0 : GETCHARLENTEST(c, eptr, len);
2166 0 : if (!_pcre_xclass(c, data)) break;
2167 0 : eptr += len;
2168 : }
2169 : for(;;)
2170 : {
2171 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2172 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2173 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
2174 0 : if (utf8) BACKCHAR(eptr);
2175 0 : }
2176 0 : RRETURN(MATCH_NOMATCH);
2177 : }
2178 :
2179 : /* Control never gets here */
2180 : }
2181 : #endif /* End of XCLASS */
2182 :
2183 : /* Match a single character, casefully */
2184 :
2185 : case OP_CHAR:
2186 : #ifdef SUPPORT_UTF8
2187 8012400 : if (utf8)
2188 : {
2189 0 : length = 1;
2190 0 : ecode++;
2191 0 : GETCHARLEN(fc, ecode, length);
2192 0 : if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2193 0 : while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2194 : }
2195 : else
2196 : #endif
2197 :
2198 : /* Non-UTF-8 mode */
2199 : {
2200 8012400 : if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2201 8005945 : if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2202 7015741 : ecode += 2;
2203 : }
2204 7015741 : break;
2205 :
2206 : /* Match a single character, caselessly */
2207 :
2208 : case OP_CHARNC:
2209 : #ifdef SUPPORT_UTF8
2210 77923 : if (utf8)
2211 : {
2212 5 : length = 1;
2213 5 : ecode++;
2214 5 : GETCHARLEN(fc, ecode, length);
2215 :
2216 5 : if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2217 :
2218 : /* If the pattern character's value is < 128, we have only one byte, and
2219 : can use the fast lookup table. */
2220 :
2221 5 : if (fc < 128)
2222 : {
2223 5 : if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2224 : }
2225 :
2226 : /* Otherwise we must pick up the subject character */
2227 :
2228 : else
2229 : {
2230 : unsigned int dc;
2231 0 : GETCHARINC(dc, eptr);
2232 0 : ecode += length;
2233 :
2234 : /* If we have Unicode property support, we can use it to test the other
2235 : case of the character, if there is one. */
2236 :
2237 0 : if (fc != dc)
2238 : {
2239 : #ifdef SUPPORT_UCP
2240 0 : if (dc != UCD_OTHERCASE(fc))
2241 : #endif
2242 0 : RRETURN(MATCH_NOMATCH);
2243 : }
2244 : }
2245 : }
2246 : else
2247 : #endif /* SUPPORT_UTF8 */
2248 :
2249 : /* Non-UTF-8 mode */
2250 : {
2251 77918 : if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2252 41550 : if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2253 3497 : ecode += 2;
2254 : }
2255 3501 : break;
2256 :
2257 : /* Match a single character repeatedly. */
2258 :
2259 : case OP_EXACT:
2260 0 : min = max = GET2(ecode, 1);
2261 0 : ecode += 3;
2262 0 : goto REPEATCHAR;
2263 :
2264 : case OP_POSUPTO:
2265 0 : possessive = TRUE;
2266 : /* Fall through */
2267 :
2268 : case OP_UPTO:
2269 : case OP_MINUPTO:
2270 0 : min = 0;
2271 0 : max = GET2(ecode, 1);
2272 0 : minimize = *ecode == OP_MINUPTO;
2273 0 : ecode += 3;
2274 0 : goto REPEATCHAR;
2275 :
2276 : case OP_POSSTAR:
2277 28 : possessive = TRUE;
2278 28 : min = 0;
2279 28 : max = INT_MAX;
2280 28 : ecode++;
2281 28 : goto REPEATCHAR;
2282 :
2283 : case OP_POSPLUS:
2284 1 : possessive = TRUE;
2285 1 : min = 1;
2286 1 : max = INT_MAX;
2287 1 : ecode++;
2288 1 : goto REPEATCHAR;
2289 :
2290 : case OP_POSQUERY:
2291 0 : possessive = TRUE;
2292 0 : min = 0;
2293 0 : max = 1;
2294 0 : ecode++;
2295 0 : goto REPEATCHAR;
2296 :
2297 : case OP_STAR:
2298 : case OP_MINSTAR:
2299 : case OP_PLUS:
2300 : case OP_MINPLUS:
2301 : case OP_QUERY:
2302 : case OP_MINQUERY:
2303 3788 : c = *ecode++ - OP_STAR;
2304 3788 : minimize = (c & 1) != 0;
2305 3788 : min = rep_min[c]; /* Pick up values from tables; */
2306 3788 : max = rep_max[c]; /* zero for max => infinity */
2307 3788 : if (max == 0) max = INT_MAX;
2308 :
2309 : /* Common code for all repeated single-character matches. We can give
2310 : up quickly if there are fewer than the minimum number of characters left in
2311 : the subject. */
2312 :
2313 3817 : REPEATCHAR:
2314 : #ifdef SUPPORT_UTF8
2315 3817 : if (utf8)
2316 : {
2317 0 : length = 1;
2318 0 : charptr = ecode;
2319 0 : GETCHARLEN(fc, ecode, length);
2320 0 : if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2321 0 : ecode += length;
2322 :
2323 : /* Handle multibyte character matching specially here. There is
2324 : support for caseless matching if UCP support is present. */
2325 :
2326 0 : if (length > 1)
2327 : {
2328 : #ifdef SUPPORT_UCP
2329 : unsigned int othercase;
2330 0 : if ((ims & PCRE_CASELESS) != 0 &&
2331 : (othercase = UCD_OTHERCASE(fc)) != fc)
2332 0 : oclength = _pcre_ord2utf8(othercase, occhars);
2333 0 : else oclength = 0;
2334 : #endif /* SUPPORT_UCP */
2335 :
2336 0 : for (i = 1; i <= min; i++)
2337 : {
2338 0 : if (memcmp(eptr, charptr, length) == 0) eptr += length;
2339 : #ifdef SUPPORT_UCP
2340 : /* Need braces because of following else */
2341 0 : else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2342 : else
2343 : {
2344 0 : if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2345 0 : eptr += oclength;
2346 : }
2347 : #else /* without SUPPORT_UCP */
2348 : else { RRETURN(MATCH_NOMATCH); }
2349 : #endif /* SUPPORT_UCP */
2350 : }
2351 :
2352 0 : if (min == max) continue;
2353 :
2354 0 : if (minimize)
2355 : {
2356 0 : for (fi = min;; fi++)
2357 : {
2358 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2359 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2361 0 : if (memcmp(eptr, charptr, length) == 0) eptr += length;
2362 : #ifdef SUPPORT_UCP
2363 : /* Need braces because of following else */
2364 0 : else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2365 : else
2366 : {
2367 0 : if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2368 0 : eptr += oclength;
2369 : }
2370 : #else /* without SUPPORT_UCP */
2371 : else { RRETURN (MATCH_NOMATCH); }
2372 : #endif /* SUPPORT_UCP */
2373 0 : }
2374 : /* Control never gets here */
2375 : }
2376 :
2377 : else /* Maximize */
2378 : {
2379 0 : pp = eptr;
2380 0 : for (i = min; i < max; i++)
2381 : {
2382 0 : if (eptr > md->end_subject - length) break;
2383 0 : if (memcmp(eptr, charptr, length) == 0) eptr += length;
2384 : #ifdef SUPPORT_UCP
2385 0 : else if (oclength == 0) break;
2386 : else
2387 : {
2388 0 : if (memcmp(eptr, occhars, oclength) != 0) break;
2389 0 : eptr += oclength;
2390 : }
2391 : #else /* without SUPPORT_UCP */
2392 : else break;
2393 : #endif /* SUPPORT_UCP */
2394 : }
2395 :
2396 0 : if (possessive) continue;
2397 : for(;;)
2398 : {
2399 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2400 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2401 0 : if (eptr == pp) RRETURN(MATCH_NOMATCH);
2402 : #ifdef SUPPORT_UCP
2403 0 : eptr--;
2404 0 : BACKCHAR(eptr);
2405 : #else /* without SUPPORT_UCP */
2406 : eptr -= length;
2407 : #endif /* SUPPORT_UCP */
2408 0 : }
2409 : }
2410 : /* Control never gets here */
2411 : }
2412 :
2413 : /* If the length of a UTF-8 character is 1, we fall through here, and
2414 : obey the code as for non-UTF-8 characters below, though in this case the
2415 : value of fc will always be < 128. */
2416 : }
2417 : else
2418 : #endif /* SUPPORT_UTF8 */
2419 :
2420 : /* When not in UTF-8 mode, load a single-byte character. */
2421 : {
2422 3817 : if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2423 3817 : fc = *ecode++;
2424 : }
2425 :
2426 : /* The value of fc at this point is always less than 256, though we may or
2427 : may not be in UTF-8 mode. The code is duplicated for the caseless and
2428 : caseful cases, for speed, since matching characters is likely to be quite
2429 : common. First, ensure the minimum number of matches are present. If min =
2430 : max, continue at the same level without recursing. Otherwise, if
2431 : minimizing, keep trying the rest of the expression and advancing one
2432 : matching character if failing, up to the maximum. Alternatively, if
2433 : maximizing, find the maximum number of characters and work backwards. */
2434 :
2435 : DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2436 : max, eptr));
2437 :
2438 3817 : if ((ims & PCRE_CASELESS) != 0)
2439 : {
2440 0 : fc = md->lcc[fc];
2441 0 : for (i = 1; i <= min; i++)
2442 0 : if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2443 0 : if (min == max) continue;
2444 0 : if (minimize)
2445 : {
2446 0 : for (fi = min;; fi++)
2447 : {
2448 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2449 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2450 0 : if (fi >= max || eptr >= md->end_subject ||
2451 : fc != md->lcc[*eptr++])
2452 0 : RRETURN(MATCH_NOMATCH);
2453 0 : }
2454 : /* Control never gets here */
2455 : }
2456 : else /* Maximize */
2457 : {
2458 0 : pp = eptr;
2459 0 : for (i = min; i < max; i++)
2460 : {
2461 0 : if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2462 0 : eptr++;
2463 : }
2464 0 : if (possessive) continue;
2465 0 : while (eptr >= pp)
2466 : {
2467 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2468 0 : eptr--;
2469 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2470 : }
2471 0 : RRETURN(MATCH_NOMATCH);
2472 : }
2473 : /* Control never gets here */
2474 : }
2475 :
2476 : /* Caseful comparisons (includes all multi-byte characters) */
2477 :
2478 : else
2479 : {
2480 3817 : for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2481 3813 : if (min == max) continue;
2482 3813 : if (minimize)
2483 : {
2484 0 : for (fi = min;; fi++)
2485 : {
2486 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2487 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2488 0 : if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2489 0 : RRETURN(MATCH_NOMATCH);
2490 0 : }
2491 : /* Control never gets here */
2492 : }
2493 : else /* Maximize */
2494 : {
2495 3813 : pp = eptr;
2496 4114 : for (i = min; i < max; i++)
2497 : {
2498 3843 : if (eptr >= md->end_subject || fc != *eptr) break;
2499 301 : eptr++;
2500 : }
2501 3813 : if (possessive) continue;
2502 10964 : while (eptr >= pp)
2503 : {
2504 3845 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2505 3845 : eptr--;
2506 3845 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2507 : }
2508 3335 : RRETURN(MATCH_NOMATCH);
2509 : }
2510 : }
2511 : /* Control never gets here */
2512 :
2513 : /* Match a negated single one-byte character. The character we are
2514 : checking can be multibyte. */
2515 :
2516 : case OP_NOT:
2517 58 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2518 58 : ecode++;
2519 58 : GETCHARINCTEST(c, eptr);
2520 58 : if ((ims & PCRE_CASELESS) != 0)
2521 : {
2522 : #ifdef SUPPORT_UTF8
2523 0 : if (c < 256)
2524 : #endif
2525 0 : c = md->lcc[c];
2526 0 : if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2527 : }
2528 : else
2529 : {
2530 58 : if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2531 : }
2532 50 : break;
2533 :
2534 : /* Match a negated single one-byte character repeatedly. This is almost a
2535 : repeat of the code for a repeated single character, but I haven't found a
2536 : nice way of commoning these up that doesn't require a test of the
2537 : positive/negative option for each character match. Maybe that wouldn't add
2538 : very much to the time taken, but character matching *is* what this is all
2539 : about... */
2540 :
2541 : case OP_NOTEXACT:
2542 2 : min = max = GET2(ecode, 1);
2543 2 : ecode += 3;
2544 2 : goto REPEATNOTCHAR;
2545 :
2546 : case OP_NOTUPTO:
2547 : case OP_NOTMINUPTO:
2548 0 : min = 0;
2549 0 : max = GET2(ecode, 1);
2550 0 : minimize = *ecode == OP_NOTMINUPTO;
2551 0 : ecode += 3;
2552 0 : goto REPEATNOTCHAR;
2553 :
2554 : case OP_NOTPOSSTAR:
2555 0 : possessive = TRUE;
2556 0 : min = 0;
2557 0 : max = INT_MAX;
2558 0 : ecode++;
2559 0 : goto REPEATNOTCHAR;
2560 :
2561 : case OP_NOTPOSPLUS:
2562 8 : possessive = TRUE;
2563 8 : min = 1;
2564 8 : max = INT_MAX;
2565 8 : ecode++;
2566 8 : goto REPEATNOTCHAR;
2567 :
2568 : case OP_NOTPOSQUERY:
2569 0 : possessive = TRUE;
2570 0 : min = 0;
2571 0 : max = 1;
2572 0 : ecode++;
2573 0 : goto REPEATNOTCHAR;
2574 :
2575 : case OP_NOTPOSUPTO:
2576 0 : possessive = TRUE;
2577 0 : min = 0;
2578 0 : max = GET2(ecode, 1);
2579 0 : ecode += 3;
2580 0 : goto REPEATNOTCHAR;
2581 :
2582 : case OP_NOTSTAR:
2583 : case OP_NOTMINSTAR:
2584 : case OP_NOTPLUS:
2585 : case OP_NOTMINPLUS:
2586 : case OP_NOTQUERY:
2587 : case OP_NOTMINQUERY:
2588 25004 : c = *ecode++ - OP_NOTSTAR;
2589 25004 : minimize = (c & 1) != 0;
2590 25004 : min = rep_min[c]; /* Pick up values from tables; */
2591 25004 : max = rep_max[c]; /* zero for max => infinity */
2592 25004 : if (max == 0) max = INT_MAX;
2593 :
2594 : /* Common code for all repeated single-byte matches. We can give up quickly
2595 : if there are fewer than the minimum number of bytes left in the
2596 : subject. */
2597 :
2598 25014 : REPEATNOTCHAR:
2599 25014 : if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2600 12894 : fc = *ecode++;
2601 :
2602 : /* The code is duplicated for the caseless and caseful cases, for speed,
2603 : since matching characters is likely to be quite common. First, ensure the
2604 : minimum number of matches are present. If min = max, continue at the same
2605 : level without recursing. Otherwise, if minimizing, keep trying the rest of
2606 : the expression and advancing one matching character if failing, up to the
2607 : maximum. Alternatively, if maximizing, find the maximum number of
2608 : characters and work backwards. */
2609 :
2610 : DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2611 : max, eptr));
2612 :
2613 12894 : if ((ims & PCRE_CASELESS) != 0)
2614 : {
2615 12884 : fc = md->lcc[fc];
2616 :
2617 : #ifdef SUPPORT_UTF8
2618 : /* UTF-8 mode */
2619 12884 : if (utf8)
2620 : {
2621 : register unsigned int d;
2622 0 : for (i = 1; i <= min; i++)
2623 : {
2624 0 : GETCHARINC(d, eptr);
2625 0 : if (d < 256) d = md->lcc[d];
2626 0 : if (fc == d) RRETURN(MATCH_NOMATCH);
2627 : }
2628 : }
2629 : else
2630 : #endif
2631 :
2632 : /* Not UTF-8 mode */
2633 : {
2634 25389 : for (i = 1; i <= min; i++)
2635 12886 : if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2636 : }
2637 :
2638 12503 : if (min == max) continue;
2639 :
2640 12502 : if (minimize)
2641 : {
2642 : #ifdef SUPPORT_UTF8
2643 : /* UTF-8 mode */
2644 0 : if (utf8)
2645 : {
2646 : register unsigned int d;
2647 0 : for (fi = min;; fi++)
2648 : {
2649 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2650 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2652 0 : GETCHARINC(d, eptr);
2653 0 : if (d < 256) d = md->lcc[d];
2654 0 : if (fc == d) RRETURN(MATCH_NOMATCH);
2655 :
2656 0 : }
2657 : }
2658 : else
2659 : #endif
2660 : /* Not UTF-8 mode */
2661 : {
2662 0 : for (fi = min;; fi++)
2663 : {
2664 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2665 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2666 0 : if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2667 0 : RRETURN(MATCH_NOMATCH);
2668 0 : }
2669 : }
2670 : /* Control never gets here */
2671 : }
2672 :
2673 : /* Maximize case */
2674 :
2675 : else
2676 : {
2677 12502 : pp = eptr;
2678 :
2679 : #ifdef SUPPORT_UTF8
2680 : /* UTF-8 mode */
2681 12502 : if (utf8)
2682 : {
2683 : register unsigned int d;
2684 0 : for (i = min; i < max; i++)
2685 : {
2686 0 : int len = 1;
2687 0 : if (eptr >= md->end_subject) break;
2688 0 : GETCHARLEN(d, eptr, len);
2689 0 : if (d < 256) d = md->lcc[d];
2690 0 : if (fc == d) break;
2691 0 : eptr += len;
2692 : }
2693 0 : if (possessive) continue;
2694 : for(;;)
2695 : {
2696 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2697 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2698 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
2699 0 : BACKCHAR(eptr);
2700 0 : }
2701 : }
2702 : else
2703 : #endif
2704 : /* Not UTF-8 mode */
2705 : {
2706 24634 : for (i = min; i < max; i++)
2707 : {
2708 24634 : if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2709 12132 : eptr++;
2710 : }
2711 12502 : if (possessive) continue;
2712 49606 : while (eptr >= pp)
2713 : {
2714 24622 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2715 24622 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2716 24602 : eptr--;
2717 : }
2718 : }
2719 :
2720 12482 : RRETURN(MATCH_NOMATCH);
2721 : }
2722 : /* Control never gets here */
2723 : }
2724 :
2725 : /* Caseful comparisons */
2726 :
2727 : else
2728 : {
2729 : #ifdef SUPPORT_UTF8
2730 : /* UTF-8 mode */
2731 10 : if (utf8)
2732 : {
2733 : register unsigned int d;
2734 0 : for (i = 1; i <= min; i++)
2735 : {
2736 0 : GETCHARINC(d, eptr);
2737 0 : if (fc == d) RRETURN(MATCH_NOMATCH);
2738 : }
2739 : }
2740 : else
2741 : #endif
2742 : /* Not UTF-8 mode */
2743 : {
2744 18 : for (i = 1; i <= min; i++)
2745 9 : if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2746 : }
2747 :
2748 9 : if (min == max) continue;
2749 :
2750 9 : if (minimize)
2751 : {
2752 : #ifdef SUPPORT_UTF8
2753 : /* UTF-8 mode */
2754 0 : if (utf8)
2755 : {
2756 : register unsigned int d;
2757 0 : for (fi = min;; fi++)
2758 : {
2759 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2760 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2761 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2762 0 : GETCHARINC(d, eptr);
2763 0 : if (fc == d) RRETURN(MATCH_NOMATCH);
2764 0 : }
2765 : }
2766 : else
2767 : #endif
2768 : /* Not UTF-8 mode */
2769 : {
2770 0 : for (fi = min;; fi++)
2771 : {
2772 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2773 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2774 0 : if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2775 0 : RRETURN(MATCH_NOMATCH);
2776 0 : }
2777 : }
2778 : /* Control never gets here */
2779 : }
2780 :
2781 : /* Maximize case */
2782 :
2783 : else
2784 : {
2785 9 : pp = eptr;
2786 :
2787 : #ifdef SUPPORT_UTF8
2788 : /* UTF-8 mode */
2789 9 : if (utf8)
2790 : {
2791 : register unsigned int d;
2792 0 : for (i = min; i < max; i++)
2793 : {
2794 0 : int len = 1;
2795 0 : if (eptr >= md->end_subject) break;
2796 0 : GETCHARLEN(d, eptr, len);
2797 0 : if (fc == d) break;
2798 0 : eptr += len;
2799 : }
2800 0 : if (possessive) continue;
2801 : for(;;)
2802 : {
2803 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2804 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2805 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
2806 0 : BACKCHAR(eptr);
2807 0 : }
2808 : }
2809 : else
2810 : #endif
2811 : /* Not UTF-8 mode */
2812 : {
2813 33 : for (i = min; i < max; i++)
2814 : {
2815 33 : if (eptr >= md->end_subject || fc == *eptr) break;
2816 24 : eptr++;
2817 : }
2818 9 : if (possessive) continue;
2819 4 : while (eptr >= pp)
2820 : {
2821 2 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2822 2 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2823 0 : eptr--;
2824 : }
2825 : }
2826 :
2827 0 : RRETURN(MATCH_NOMATCH);
2828 : }
2829 : }
2830 : /* Control never gets here */
2831 :
2832 : /* Match a single character type repeatedly; several different opcodes
2833 : share code. This is very similar to the code for single characters, but we
2834 : repeat it in the interests of efficiency. */
2835 :
2836 : case OP_TYPEEXACT:
2837 218 : min = max = GET2(ecode, 1);
2838 218 : minimize = TRUE;
2839 218 : ecode += 3;
2840 218 : goto REPEATTYPE;
2841 :
2842 : case OP_TYPEUPTO:
2843 : case OP_TYPEMINUPTO:
2844 7 : min = 0;
2845 7 : max = GET2(ecode, 1);
2846 7 : minimize = *ecode == OP_TYPEMINUPTO;
2847 7 : ecode += 3;
2848 7 : goto REPEATTYPE;
2849 :
2850 : case OP_TYPEPOSSTAR:
2851 507 : possessive = TRUE;
2852 507 : min = 0;
2853 507 : max = INT_MAX;
2854 507 : ecode++;
2855 507 : goto REPEATTYPE;
2856 :
2857 : case OP_TYPEPOSPLUS:
2858 26731 : possessive = TRUE;
2859 26731 : min = 1;
2860 26731 : max = INT_MAX;
2861 26731 : ecode++;
2862 26731 : goto REPEATTYPE;
2863 :
2864 : case OP_TYPEPOSQUERY:
2865 0 : possessive = TRUE;
2866 0 : min = 0;
2867 0 : max = 1;
2868 0 : ecode++;
2869 0 : goto REPEATTYPE;
2870 :
2871 : case OP_TYPEPOSUPTO:
2872 0 : possessive = TRUE;
2873 0 : min = 0;
2874 0 : max = GET2(ecode, 1);
2875 0 : ecode += 3;
2876 0 : goto REPEATTYPE;
2877 :
2878 : case OP_TYPESTAR:
2879 : case OP_TYPEMINSTAR:
2880 : case OP_TYPEPLUS:
2881 : case OP_TYPEMINPLUS:
2882 : case OP_TYPEQUERY:
2883 : case OP_TYPEMINQUERY:
2884 8301 : c = *ecode++ - OP_TYPESTAR;
2885 8301 : minimize = (c & 1) != 0;
2886 8301 : min = rep_min[c]; /* Pick up values from tables; */
2887 8301 : max = rep_max[c]; /* zero for max => infinity */
2888 8301 : if (max == 0) max = INT_MAX;
2889 :
2890 : /* Common code for all repeated single character type matches. Note that
2891 : in UTF-8 mode, '.' matches a character of any length, but for the other
2892 : character types, the valid characters are all one-byte long. */
2893 :
2894 35764 : REPEATTYPE:
2895 35764 : ctype = *ecode++; /* Code for the character type */
2896 :
2897 : #ifdef SUPPORT_UCP
2898 35771 : if (ctype == OP_PROP || ctype == OP_NOTPROP)
2899 : {
2900 7 : prop_fail_result = ctype == OP_NOTPROP;
2901 7 : prop_type = *ecode++;
2902 7 : prop_value = *ecode++;
2903 : }
2904 35757 : else prop_type = -1;
2905 : #endif
2906 :
2907 : /* First, ensure the minimum number of matches are present. Use inline
2908 : code for maximizing the speed, and do the type test once at the start
2909 : (i.e. keep it out of the loop). Also we can test that there are at least
2910 : the minimum number of bytes before we start. This isn't as effective in
2911 : UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2912 : is tidier. Also separate the UCP code, which can be the same for both UTF-8
2913 : and single-bytes. */
2914 :
2915 35764 : if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2916 35753 : if (min > 0)
2917 : {
2918 : #ifdef SUPPORT_UCP
2919 31920 : if (prop_type >= 0)
2920 : {
2921 7 : switch(prop_type)
2922 : {
2923 : case PT_ANY:
2924 0 : if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2925 0 : for (i = 1; i <= min; i++)
2926 : {
2927 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2928 0 : GETCHARINCTEST(c, eptr);
2929 : }
2930 0 : break;
2931 :
2932 : case PT_LAMP:
2933 0 : for (i = 1; i <= min; i++)
2934 : {
2935 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2936 0 : GETCHARINCTEST(c, eptr);
2937 0 : prop_chartype = UCD_CHARTYPE(c);
2938 0 : if ((prop_chartype == ucp_Lu ||
2939 : prop_chartype == ucp_Ll ||
2940 : prop_chartype == ucp_Lt) == prop_fail_result)
2941 0 : RRETURN(MATCH_NOMATCH);
2942 : }
2943 0 : break;
2944 :
2945 : case PT_GC:
2946 8 : for (i = 1; i <= min; i++)
2947 : {
2948 7 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949 7 : GETCHARINCTEST(c, eptr);
2950 7 : prop_category = UCD_CATEGORY(c);
2951 7 : if ((prop_category == prop_value) == prop_fail_result)
2952 6 : RRETURN(MATCH_NOMATCH);
2953 : }
2954 1 : break;
2955 :
2956 : case PT_PC:
2957 0 : for (i = 1; i <= min; i++)
2958 : {
2959 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2960 0 : GETCHARINCTEST(c, eptr);
2961 0 : prop_chartype = UCD_CHARTYPE(c);
2962 0 : if ((prop_chartype == prop_value) == prop_fail_result)
2963 0 : RRETURN(MATCH_NOMATCH);
2964 : }
2965 0 : break;
2966 :
2967 : case PT_SC:
2968 0 : for (i = 1; i <= min; i++)
2969 : {
2970 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2971 0 : GETCHARINCTEST(c, eptr);
2972 0 : prop_script = UCD_SCRIPT(c);
2973 0 : if ((prop_script == prop_value) == prop_fail_result)
2974 0 : RRETURN(MATCH_NOMATCH);
2975 : }
2976 0 : break;
2977 :
2978 : default:
2979 0 : RRETURN(PCRE_ERROR_INTERNAL);
2980 : }
2981 : }
2982 :
2983 : /* Match extended Unicode sequences. We will get here only if the
2984 : support is in the binary; otherwise a compile-time error occurs. */
2985 :
2986 31913 : else if (ctype == OP_EXTUNI)
2987 : {
2988 0 : for (i = 1; i <= min; i++)
2989 : {
2990 0 : GETCHARINCTEST(c, eptr);
2991 0 : prop_category = UCD_CATEGORY(c);
2992 0 : if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2993 0 : while (eptr < md->end_subject)
2994 : {
2995 0 : int len = 1;
2996 0 : if (!utf8) c = *eptr; else
2997 : {
2998 0 : GETCHARLEN(c, eptr, len);
2999 : }
3000 0 : prop_category = UCD_CATEGORY(c);
3001 0 : if (prop_category != ucp_M) break;
3002 0 : eptr += len;
3003 : }
3004 : }
3005 : }
3006 :
3007 : else
3008 : #endif /* SUPPORT_UCP */
3009 :
3010 : /* Handle all other cases when the coding is UTF-8 */
3011 :
3012 : #ifdef SUPPORT_UTF8
3013 31913 : if (utf8) switch(ctype)
3014 : {
3015 : case OP_ANY:
3016 0 : for (i = 1; i <= min; i++)
3017 : {
3018 0 : if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3019 0 : RRETURN(MATCH_NOMATCH);
3020 0 : eptr++;
3021 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3022 : }
3023 0 : break;
3024 :
3025 : case OP_ALLANY:
3026 0 : for (i = 1; i <= min; i++)
3027 : {
3028 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3029 0 : eptr++;
3030 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3031 : }
3032 0 : break;
3033 :
3034 : case OP_ANYBYTE:
3035 0 : eptr += min;
3036 0 : break;
3037 :
3038 : case OP_ANYNL:
3039 0 : for (i = 1; i <= min; i++)
3040 : {
3041 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3042 0 : GETCHARINC(c, eptr);
3043 0 : switch(c)
3044 : {
3045 0 : default: RRETURN(MATCH_NOMATCH);
3046 : case 0x000d:
3047 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3048 0 : break;
3049 :
3050 : case 0x000a:
3051 0 : break;
3052 :
3053 : case 0x000b:
3054 : case 0x000c:
3055 : case 0x0085:
3056 : case 0x2028:
3057 : case 0x2029:
3058 0 : if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3059 : break;
3060 : }
3061 : }
3062 0 : break;
3063 :
3064 : case OP_NOT_HSPACE:
3065 0 : for (i = 1; i <= min; i++)
3066 : {
3067 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3068 0 : GETCHARINC(c, eptr);
3069 0 : switch(c)
3070 : {
3071 : default: break;
3072 : case 0x09: /* HT */
3073 : case 0x20: /* SPACE */
3074 : case 0xa0: /* NBSP */
3075 : case 0x1680: /* OGHAM SPACE MARK */
3076 : case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3077 : case 0x2000: /* EN QUAD */
3078 : case 0x2001: /* EM QUAD */
3079 : case 0x2002: /* EN SPACE */
3080 : case 0x2003: /* EM SPACE */
3081 : case 0x2004: /* THREE-PER-EM SPACE */
3082 : case 0x2005: /* FOUR-PER-EM SPACE */
3083 : case 0x2006: /* SIX-PER-EM SPACE */
3084 : case 0x2007: /* FIGURE SPACE */
3085 : case 0x2008: /* PUNCTUATION SPACE */
3086 : case 0x2009: /* THIN SPACE */
3087 : case 0x200A: /* HAIR SPACE */
3088 : case 0x202f: /* NARROW NO-BREAK SPACE */
3089 : case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3090 : case 0x3000: /* IDEOGRAPHIC SPACE */
3091 0 : RRETURN(MATCH_NOMATCH);
3092 : }
3093 : }
3094 0 : break;
3095 :
3096 : case OP_HSPACE:
3097 0 : for (i = 1; i <= min; i++)
3098 : {
3099 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3100 0 : GETCHARINC(c, eptr);
3101 0 : switch(c)
3102 : {
3103 0 : default: RRETURN(MATCH_NOMATCH);
3104 : case 0x09: /* HT */
3105 : case 0x20: /* SPACE */
3106 : case 0xa0: /* NBSP */
3107 : case 0x1680: /* OGHAM SPACE MARK */
3108 : case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3109 : case 0x2000: /* EN QUAD */
3110 : case 0x2001: /* EM QUAD */
3111 : case 0x2002: /* EN SPACE */
3112 : case 0x2003: /* EM SPACE */
3113 : case 0x2004: /* THREE-PER-EM SPACE */
3114 : case 0x2005: /* FOUR-PER-EM SPACE */
3115 : case 0x2006: /* SIX-PER-EM SPACE */
3116 : case 0x2007: /* FIGURE SPACE */
3117 : case 0x2008: /* PUNCTUATION SPACE */
3118 : case 0x2009: /* THIN SPACE */
3119 : case 0x200A: /* HAIR SPACE */
3120 : case 0x202f: /* NARROW NO-BREAK SPACE */
3121 : case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3122 : case 0x3000: /* IDEOGRAPHIC SPACE */
3123 : break;
3124 : }
3125 : }
3126 0 : break;
3127 :
3128 : case OP_NOT_VSPACE:
3129 0 : for (i = 1; i <= min; i++)
3130 : {
3131 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3132 0 : GETCHARINC(c, eptr);
3133 0 : switch(c)
3134 : {
3135 : default: break;
3136 : case 0x0a: /* LF */
3137 : case 0x0b: /* VT */
3138 : case 0x0c: /* FF */
3139 : case 0x0d: /* CR */
3140 : case 0x85: /* NEL */
3141 : case 0x2028: /* LINE SEPARATOR */
3142 : case 0x2029: /* PARAGRAPH SEPARATOR */
3143 0 : RRETURN(MATCH_NOMATCH);
3144 : }
3145 : }
3146 0 : break;
3147 :
3148 : case OP_VSPACE:
3149 0 : for (i = 1; i <= min; i++)
3150 : {
3151 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3152 0 : GETCHARINC(c, eptr);
3153 0 : switch(c)
3154 : {
3155 0 : default: RRETURN(MATCH_NOMATCH);
3156 : case 0x0a: /* LF */
3157 : case 0x0b: /* VT */
3158 : case 0x0c: /* FF */
3159 : case 0x0d: /* CR */
3160 : case 0x85: /* NEL */
3161 : case 0x2028: /* LINE SEPARATOR */
3162 : case 0x2029: /* PARAGRAPH SEPARATOR */
3163 : break;
3164 : }
3165 : }
3166 0 : break;
3167 :
3168 : case OP_NOT_DIGIT:
3169 0 : for (i = 1; i <= min; i++)
3170 : {
3171 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3172 0 : GETCHARINC(c, eptr);
3173 0 : if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3174 0 : RRETURN(MATCH_NOMATCH);
3175 : }
3176 0 : break;
3177 :
3178 : case OP_DIGIT:
3179 0 : for (i = 1; i <= min; i++)
3180 : {
3181 0 : if (eptr >= md->end_subject ||
3182 : *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3183 0 : RRETURN(MATCH_NOMATCH);
3184 : /* No need to skip more bytes - we know it's a 1-byte character */
3185 : }
3186 0 : break;
3187 :
3188 : case OP_NOT_WHITESPACE:
3189 6 : for (i = 1; i <= min; i++)
3190 : {
3191 4 : if (eptr >= md->end_subject ||
3192 : (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3193 0 : RRETURN(MATCH_NOMATCH);
3194 5 : while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3195 : }
3196 2 : break;
3197 :
3198 : case OP_WHITESPACE:
3199 0 : for (i = 1; i <= min; i++)
3200 : {
3201 0 : if (eptr >= md->end_subject ||
3202 : *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3203 0 : RRETURN(MATCH_NOMATCH);
3204 : /* No need to skip more bytes - we know it's a 1-byte character */
3205 : }
3206 0 : break;
3207 :
3208 : case OP_NOT_WORDCHAR:
3209 4 : for (i = 1; i <= min; i++)
3210 : {
3211 3 : if (eptr >= md->end_subject ||
3212 : (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3213 1 : RRETURN(MATCH_NOMATCH);
3214 4 : while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3215 : }
3216 1 : break;
3217 :
3218 : case OP_WORDCHAR:
3219 0 : for (i = 1; i <= min; i++)
3220 : {
3221 0 : if (eptr >= md->end_subject ||
3222 : *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3223 0 : RRETURN(MATCH_NOMATCH);
3224 : /* No need to skip more bytes - we know it's a 1-byte character */
3225 : }
3226 0 : break;
3227 :
3228 : default:
3229 0 : RRETURN(PCRE_ERROR_INTERNAL);
3230 : } /* End switch(ctype) */
3231 :
3232 : else
3233 : #endif /* SUPPORT_UTF8 */
3234 :
3235 : /* Code for the non-UTF-8 case for minimum matching of operators other
3236 : than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3237 : number of bytes present, as this was tested above. */
3238 :
3239 31909 : switch(ctype)
3240 : {
3241 : case OP_ANY:
3242 1119 : for (i = 1; i <= min; i++)
3243 : {
3244 564 : if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3245 561 : eptr++;
3246 : }
3247 555 : break;
3248 :
3249 : case OP_ALLANY:
3250 629 : eptr += min;
3251 629 : break;
3252 :
3253 : case OP_ANYBYTE:
3254 0 : eptr += min;
3255 0 : break;
3256 :
3257 : /* Because of the CRLF case, we can't assume the minimum number of
3258 : bytes are present in this case. */
3259 :
3260 : case OP_ANYNL:
3261 0 : for (i = 1; i <= min; i++)
3262 : {
3263 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3264 0 : switch(*eptr++)
3265 : {
3266 0 : default: RRETURN(MATCH_NOMATCH);
3267 : case 0x000d:
3268 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3269 0 : break;
3270 : case 0x000a:
3271 0 : break;
3272 :
3273 : case 0x000b:
3274 : case 0x000c:
3275 : case 0x0085:
3276 0 : if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3277 : break;
3278 : }
3279 : }
3280 0 : break;
3281 :
3282 : case OP_NOT_HSPACE:
3283 0 : for (i = 1; i <= min; i++)
3284 : {
3285 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3286 0 : switch(*eptr++)
3287 : {
3288 : default: break;
3289 : case 0x09: /* HT */
3290 : case 0x20: /* SPACE */
3291 : case 0xa0: /* NBSP */
3292 0 : RRETURN(MATCH_NOMATCH);
3293 : }
3294 : }
3295 0 : break;
3296 :
3297 : case OP_HSPACE:
3298 0 : for (i = 1; i <= min; i++)
3299 : {
3300 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3301 0 : switch(*eptr++)
3302 : {
3303 0 : default: RRETURN(MATCH_NOMATCH);
3304 : case 0x09: /* HT */
3305 : case 0x20: /* SPACE */
3306 : case 0xa0: /* NBSP */
3307 : break;
3308 : }
3309 : }
3310 0 : break;
3311 :
3312 : case OP_NOT_VSPACE:
3313 0 : for (i = 1; i <= min; i++)
3314 : {
3315 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3316 0 : switch(*eptr++)
3317 : {
3318 : default: break;
3319 : case 0x0a: /* LF */
3320 : case 0x0b: /* VT */
3321 : case 0x0c: /* FF */
3322 : case 0x0d: /* CR */
3323 : case 0x85: /* NEL */
3324 0 : RRETURN(MATCH_NOMATCH);
3325 : }
3326 : }
3327 0 : break;
3328 :
3329 : case OP_VSPACE:
3330 0 : for (i = 1; i <= min; i++)
3331 : {
3332 0 : if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3333 0 : switch(*eptr++)
3334 : {
3335 0 : default: RRETURN(MATCH_NOMATCH);
3336 : case 0x0a: /* LF */
3337 : case 0x0b: /* VT */
3338 : case 0x0c: /* FF */
3339 : case 0x0d: /* CR */
3340 : case 0x85: /* NEL */
3341 : break;
3342 : }
3343 : }
3344 0 : break;
3345 :
3346 : case OP_NOT_DIGIT:
3347 2 : for (i = 1; i <= min; i++)
3348 1 : if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3349 1 : break;
3350 :
3351 : case OP_DIGIT:
3352 55531 : for (i = 1; i <= min; i++)
3353 28101 : if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3354 27430 : break;
3355 :
3356 : case OP_NOT_WHITESPACE:
3357 8 : for (i = 1; i <= min; i++)
3358 4 : if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3359 4 : break;
3360 :
3361 : case OP_WHITESPACE:
3362 4887 : for (i = 1; i <= min; i++)
3363 2447 : if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3364 2440 : break;
3365 :
3366 : case OP_NOT_WORDCHAR:
3367 0 : for (i = 1; i <= min; i++)
3368 0 : if ((md->ctypes[*eptr++] & ctype_word) != 0)
3369 0 : RRETURN(MATCH_NOMATCH);
3370 0 : break;
3371 :
3372 : case OP_WORDCHAR:
3373 388 : for (i = 1; i <= min; i++)
3374 251 : if ((md->ctypes[*eptr++] & ctype_word) == 0)
3375 45 : RRETURN(MATCH_NOMATCH);
3376 137 : break;
3377 :
3378 : default:
3379 0 : RRETURN(PCRE_ERROR_INTERNAL);
3380 : }
3381 : }
3382 :
3383 : /* If min = max, continue at the same level without recursing */
3384 :
3385 35033 : if (min == max) continue;
3386 :
3387 : /* If minimizing, we have to test the rest of the pattern before each
3388 : subsequent match. Again, separate the UTF-8 case for speed, and also
3389 : separate the UCP cases. */
3390 :
3391 34968 : if (minimize)
3392 : {
3393 : #ifdef SUPPORT_UCP
3394 94 : if (prop_type >= 0)
3395 : {
3396 0 : switch(prop_type)
3397 : {
3398 : case PT_ANY:
3399 0 : for (fi = min;; fi++)
3400 : {
3401 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3402 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404 0 : GETCHARINC(c, eptr);
3405 0 : if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3406 0 : }
3407 : /* Control never gets here */
3408 :
3409 : case PT_LAMP:
3410 0 : for (fi = min;; fi++)
3411 : {
3412 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3413 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3414 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3415 0 : GETCHARINC(c, eptr);
3416 0 : prop_chartype = UCD_CHARTYPE(c);
3417 0 : if ((prop_chartype == ucp_Lu ||
3418 : prop_chartype == ucp_Ll ||
3419 : prop_chartype == ucp_Lt) == prop_fail_result)
3420 0 : RRETURN(MATCH_NOMATCH);
3421 0 : }
3422 : /* Control never gets here */
3423 :
3424 : case PT_GC:
3425 0 : for (fi = min;; fi++)
3426 : {
3427 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3428 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3430 0 : GETCHARINC(c, eptr);
3431 0 : prop_category = UCD_CATEGORY(c);
3432 0 : if ((prop_category == prop_value) == prop_fail_result)
3433 0 : RRETURN(MATCH_NOMATCH);
3434 0 : }
3435 : /* Control never gets here */
3436 :
3437 : case PT_PC:
3438 0 : for (fi = min;; fi++)
3439 : {
3440 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3441 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3443 0 : GETCHARINC(c, eptr);
3444 0 : prop_chartype = UCD_CHARTYPE(c);
3445 0 : if ((prop_chartype == prop_value) == prop_fail_result)
3446 0 : RRETURN(MATCH_NOMATCH);
3447 0 : }
3448 : /* Control never gets here */
3449 :
3450 : case PT_SC:
3451 0 : for (fi = min;; fi++)
3452 : {
3453 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3454 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3455 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3456 0 : GETCHARINC(c, eptr);
3457 0 : prop_script = UCD_SCRIPT(c);
3458 0 : if ((prop_script == prop_value) == prop_fail_result)
3459 0 : RRETURN(MATCH_NOMATCH);
3460 0 : }
3461 : /* Control never gets here */
3462 :
3463 : default:
3464 0 : RRETURN(PCRE_ERROR_INTERNAL);
3465 : }
3466 : }
3467 :
3468 : /* Match extended Unicode sequences. We will get here only if the
3469 : support is in the binary; otherwise a compile-time error occurs. */
3470 :
3471 94 : else if (ctype == OP_EXTUNI)
3472 : {
3473 0 : for (fi = min;; fi++)
3474 : {
3475 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3476 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3477 0 : if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3478 0 : GETCHARINCTEST(c, eptr);
3479 0 : prop_category = UCD_CATEGORY(c);
3480 0 : if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3481 0 : while (eptr < md->end_subject)
3482 : {
3483 0 : int len = 1;
3484 0 : if (!utf8) c = *eptr; else
3485 : {
3486 0 : GETCHARLEN(c, eptr, len);
3487 : }
3488 0 : prop_category = UCD_CATEGORY(c);
3489 0 : if (prop_category != ucp_M) break;
3490 0 : eptr += len;
3491 : }
3492 0 : }
3493 : }
3494 :
3495 : else
3496 : #endif /* SUPPORT_UCP */
3497 :
3498 : #ifdef SUPPORT_UTF8
3499 : /* UTF-8 mode */
3500 94 : if (utf8)
3501 : {
3502 0 : for (fi = min;; fi++)
3503 : {
3504 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3505 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3506 0 : if (fi >= max || eptr >= md->end_subject ||
3507 : (ctype == OP_ANY && IS_NEWLINE(eptr)))
3508 0 : RRETURN(MATCH_NOMATCH);
3509 :
3510 0 : GETCHARINC(c, eptr);
3511 0 : switch(ctype)
3512 : {
3513 : case OP_ANY: /* This is the non-NL case */
3514 : case OP_ALLANY:
3515 : case OP_ANYBYTE:
3516 0 : break;
3517 :
3518 : case OP_ANYNL:
3519 0 : switch(c)
3520 : {
3521 0 : default: RRETURN(MATCH_NOMATCH);
3522 : case 0x000d:
3523 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3524 0 : break;
3525 : case 0x000a:
3526 0 : break;
3527 :
3528 : case 0x000b:
3529 : case 0x000c:
3530 : case 0x0085:
3531 : case 0x2028:
3532 : case 0x2029:
3533 0 : if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3534 : break;
3535 : }
3536 0 : break;
3537 :
3538 : case OP_NOT_HSPACE:
3539 0 : switch(c)
3540 : {
3541 : default: break;
3542 : case 0x09: /* HT */
3543 : case 0x20: /* SPACE */
3544 : case 0xa0: /* NBSP */
3545 : case 0x1680: /* OGHAM SPACE MARK */
3546 : case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3547 : case 0x2000: /* EN QUAD */
3548 : case 0x2001: /* EM QUAD */
3549 : case 0x2002: /* EN SPACE */
3550 : case 0x2003: /* EM SPACE */
3551 : case 0x2004: /* THREE-PER-EM SPACE */
3552 : case 0x2005: /* FOUR-PER-EM SPACE */
3553 : case 0x2006: /* SIX-PER-EM SPACE */
3554 : case 0x2007: /* FIGURE SPACE */
3555 : case 0x2008: /* PUNCTUATION SPACE */
3556 : case 0x2009: /* THIN SPACE */
3557 : case 0x200A: /* HAIR SPACE */
3558 : case 0x202f: /* NARROW NO-BREAK SPACE */
3559 : case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3560 : case 0x3000: /* IDEOGRAPHIC SPACE */
3561 0 : RRETURN(MATCH_NOMATCH);
3562 : }
3563 0 : break;
3564 :
3565 : case OP_HSPACE:
3566 0 : switch(c)
3567 : {
3568 0 : default: RRETURN(MATCH_NOMATCH);
3569 : case 0x09: /* HT */
3570 : case 0x20: /* SPACE */
3571 : case 0xa0: /* NBSP */
3572 : case 0x1680: /* OGHAM SPACE MARK */
3573 : case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3574 : case 0x2000: /* EN QUAD */
3575 : case 0x2001: /* EM QUAD */
3576 : case 0x2002: /* EN SPACE */
3577 : case 0x2003: /* EM SPACE */
3578 : case 0x2004: /* THREE-PER-EM SPACE */
3579 : case 0x2005: /* FOUR-PER-EM SPACE */
3580 : case 0x2006: /* SIX-PER-EM SPACE */
3581 : case 0x2007: /* FIGURE SPACE */
3582 : case 0x2008: /* PUNCTUATION SPACE */
3583 : case 0x2009: /* THIN SPACE */
3584 : case 0x200A: /* HAIR SPACE */
3585 : case 0x202f: /* NARROW NO-BREAK SPACE */
3586 : case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3587 : case 0x3000: /* IDEOGRAPHIC SPACE */
3588 : break;
3589 : }
3590 0 : break;
3591 :
3592 : case OP_NOT_VSPACE:
3593 0 : switch(c)
3594 : {
3595 : default: break;
3596 : case 0x0a: /* LF */
3597 : case 0x0b: /* VT */
3598 : case 0x0c: /* FF */
3599 : case 0x0d: /* CR */
3600 : case 0x85: /* NEL */
3601 : case 0x2028: /* LINE SEPARATOR */
3602 : case 0x2029: /* PARAGRAPH SEPARATOR */
3603 0 : RRETURN(MATCH_NOMATCH);
3604 : }
3605 0 : break;
3606 :
3607 : case OP_VSPACE:
3608 0 : switch(c)
3609 : {
3610 0 : default: RRETURN(MATCH_NOMATCH);
3611 : case 0x0a: /* LF */
3612 : case 0x0b: /* VT */
3613 : case 0x0c: /* FF */
3614 : case 0x0d: /* CR */
3615 : case 0x85: /* NEL */
3616 : case 0x2028: /* LINE SEPARATOR */
3617 : case 0x2029: /* PARAGRAPH SEPARATOR */
3618 : break;
3619 : }
3620 0 : break;
3621 :
3622 : case OP_NOT_DIGIT:
3623 0 : if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3624 0 : RRETURN(MATCH_NOMATCH);
3625 0 : break;
3626 :
3627 : case OP_DIGIT:
3628 0 : if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3629 0 : RRETURN(MATCH_NOMATCH);
3630 0 : break;
3631 :
3632 : case OP_NOT_WHITESPACE:
3633 0 : if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3634 0 : RRETURN(MATCH_NOMATCH);
3635 0 : break;
3636 :
3637 : case OP_WHITESPACE:
3638 0 : if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3639 0 : RRETURN(MATCH_NOMATCH);
3640 0 : break;
3641 :
3642 : case OP_NOT_WORDCHAR:
3643 0 : if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3644 0 : RRETURN(MATCH_NOMATCH);
3645 0 : break;
3646 :
3647 : case OP_WORDCHAR:
3648 0 : if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3649 0 : RRETURN(MATCH_NOMATCH);
3650 0 : break;
3651 :
3652 : default:
3653 0 : RRETURN(PCRE_ERROR_INTERNAL);
3654 : }
3655 0 : }
3656 : }
3657 : else
3658 : #endif
3659 : /* Not UTF-8 mode */
3660 : {
3661 3775 : for (fi = min;; fi++)
3662 : {
3663 3775 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3664 3775 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3665 3682 : if (fi >= max || eptr >= md->end_subject ||
3666 : (ctype == OP_ANY && IS_NEWLINE(eptr)))
3667 1 : RRETURN(MATCH_NOMATCH);
3668 :
3669 3681 : c = *eptr++;
3670 3681 : switch(ctype)
3671 : {
3672 : case OP_ANY: /* This is the non-NL case */
3673 : case OP_ALLANY:
3674 : case OP_ANYBYTE:
3675 3681 : break;
3676 :
3677 : case OP_ANYNL:
3678 0 : switch(c)
3679 : {
3680 0 : default: RRETURN(MATCH_NOMATCH);
3681 : case 0x000d:
3682 0 : if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3683 0 : break;
3684 :
3685 : case 0x000a:
3686 0 : break;
3687 :
3688 : case 0x000b:
3689 : case 0x000c:
3690 : case 0x0085:
3691 0 : if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3692 : break;
3693 : }
3694 0 : break;
3695 :
3696 : case OP_NOT_HSPACE:
3697 0 : switch(c)
3698 : {
3699 : default: break;
3700 : case 0x09: /* HT */
3701 : case 0x20: /* SPACE */
3702 : case 0xa0: /* NBSP */
3703 0 : RRETURN(MATCH_NOMATCH);
3704 : }
3705 0 : break;
3706 :
3707 : case OP_HSPACE:
3708 0 : switch(c)
3709 : {
3710 0 : default: RRETURN(MATCH_NOMATCH);
3711 : case 0x09: /* HT */
3712 : case 0x20: /* SPACE */
3713 : case 0xa0: /* NBSP */
3714 : break;
3715 : }
3716 0 : break;
3717 :
3718 : case OP_NOT_VSPACE:
3719 0 : switch(c)
3720 : {
3721 : default: break;
3722 : case 0x0a: /* LF */
3723 : case 0x0b: /* VT */
3724 : case 0x0c: /* FF */
3725 : case 0x0d: /* CR */
3726 : case 0x85: /* NEL */
3727 0 : RRETURN(MATCH_NOMATCH);
3728 : }
3729 0 : break;
3730 :
3731 : case OP_VSPACE:
3732 0 : switch(c)
3733 : {
3734 0 : default: RRETURN(MATCH_NOMATCH);
3735 : case 0x0a: /* LF */
3736 : case 0x0b: /* VT */
3737 : case 0x0c: /* FF */
3738 : case 0x0d: /* CR */
3739 : case 0x85: /* NEL */
3740 : break;
3741 : }
3742 0 : break;
3743 :
3744 : case OP_NOT_DIGIT:
3745 0 : if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3746 0 : break;
3747 :
3748 : case OP_DIGIT:
3749 0 : if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3750 0 : break;
3751 :
3752 : case OP_NOT_WHITESPACE:
3753 0 : if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3754 0 : break;
3755 :
3756 : case OP_WHITESPACE:
3757 0 : if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3758 0 : break;
3759 :
3760 : case OP_NOT_WORDCHAR:
3761 0 : if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3762 0 : break;
3763 :
3764 : case OP_WORDCHAR:
3765 0 : if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3766 0 : break;
3767 :
3768 : default:
3769 0 : RRETURN(PCRE_ERROR_INTERNAL);
3770 : }
3771 3681 : }
3772 : }
3773 : /* Control never gets here */
3774 : }
3775 :
3776 : /* If maximizing, it is worth using inline code for speed, doing the type
3777 : test once at the start (i.e. keep it out of the loop). Again, keep the
3778 : UTF-8 and UCP stuff separate. */
3779 :
3780 : else
3781 : {
3782 34874 : pp = eptr; /* Remember where we started */
3783 :
3784 : #ifdef SUPPORT_UCP
3785 34874 : if (prop_type >= 0)
3786 : {
3787 1 : switch(prop_type)
3788 : {
3789 : case PT_ANY:
3790 0 : for (i = min; i < max; i++)
3791 : {
3792 0 : int len = 1;
3793 0 : if (eptr >= md->end_subject) break;
3794 0 : GETCHARLEN(c, eptr, len);
3795 0 : if (prop_fail_result) break;
3796 0 : eptr+= len;
3797 : }
3798 0 : break;
3799 :
3800 : case PT_LAMP:
3801 0 : for (i = min; i < max; i++)
3802 : {
3803 0 : int len = 1;
3804 0 : if (eptr >= md->end_subject) break;
3805 0 : GETCHARLEN(c, eptr, len);
3806 0 : prop_chartype = UCD_CHARTYPE(c);
3807 0 : if ((prop_chartype == ucp_Lu ||
3808 : prop_chartype == ucp_Ll ||
3809 : prop_chartype == ucp_Lt) == prop_fail_result)
3810 0 : break;
3811 0 : eptr+= len;
3812 : }
3813 0 : break;
3814 :
3815 : case PT_GC:
3816 3 : for (i = min; i < max; i++)
3817 : {
3818 3 : int len = 1;
3819 3 : if (eptr >= md->end_subject) break;
3820 2 : GETCHARLEN(c, eptr, len);
3821 2 : prop_category = UCD_CATEGORY(c);
3822 2 : if ((prop_category == prop_value) == prop_fail_result)
3823 0 : break;
3824 2 : eptr+= len;
3825 : }
3826 1 : break;
3827 :
3828 : case PT_PC:
3829 0 : for (i = min; i < max; i++)
3830 : {
3831 0 : int len = 1;
3832 0 : if (eptr >= md->end_subject) break;
3833 0 : GETCHARLEN(c, eptr, len);
3834 0 : prop_chartype = UCD_CHARTYPE(c);
3835 0 : if ((prop_chartype == prop_value) == prop_fail_result)
3836 0 : break;
3837 0 : eptr+= len;
3838 : }
3839 0 : break;
3840 :
3841 : case PT_SC:
3842 0 : for (i = min; i < max; i++)
3843 : {
3844 0 : int len = 1;
3845 0 : if (eptr >= md->end_subject) break;
3846 0 : GETCHARLEN(c, eptr, len);
3847 0 : prop_script = UCD_SCRIPT(c);
3848 0 : if ((prop_script == prop_value) == prop_fail_result)
3849 0 : break;
3850 0 : eptr+= len;
3851 : }
3852 : break;
3853 : }
3854 :
3855 : /* eptr is now past the end of the maximum run */
3856 :
3857 1 : if (possessive) continue;
3858 : for(;;)
3859 : {
3860 1 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3861 1 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3862 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
3863 0 : if (utf8) BACKCHAR(eptr);
3864 0 : }
3865 : }
3866 :
3867 : /* Match extended Unicode sequences. We will get here only if the
3868 : support is in the binary; otherwise a compile-time error occurs. */
3869 :
3870 34873 : else if (ctype == OP_EXTUNI)
3871 : {
3872 0 : for (i = min; i < max; i++)
3873 : {
3874 0 : if (eptr >= md->end_subject) break;
3875 0 : GETCHARINCTEST(c, eptr);
3876 0 : prop_category = UCD_CATEGORY(c);
3877 0 : if (prop_category == ucp_M) break;
3878 0 : while (eptr < md->end_subject)
3879 : {
3880 0 : int len = 1;
3881 0 : if (!utf8) c = *eptr; else
3882 : {
3883 0 : GETCHARLEN(c, eptr, len);
3884 : }
3885 0 : prop_category = UCD_CATEGORY(c);
3886 0 : if (prop_category != ucp_M) break;
3887 0 : eptr += len;
3888 : }
3889 : }
3890 :
3891 : /* eptr is now past the end of the maximum run */
3892 :
3893 0 : if (possessive) continue;
3894 : for(;;)
3895 : {
3896 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3897 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3898 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
3899 : for (;;) /* Move back over one extended */
3900 : {
3901 0 : int len = 1;
3902 0 : if (!utf8) c = *eptr; else
3903 : {
3904 0 : BACKCHAR(eptr);
3905 0 : GETCHARLEN(c, eptr, len);
3906 : }
3907 0 : prop_category = UCD_CATEGORY(c);
3908 0 : if (prop_category != ucp_M) break;
3909 0 : eptr--;
3910 0 : }
3911 0 : }
3912 : }
3913 :
3914 : else
3915 : #endif /* SUPPORT_UCP */
3916 :
3917 : #ifdef SUPPORT_UTF8
3918 : /* UTF-8 mode */
3919 :
3920 34873 : if (utf8)
3921 : {
3922 0 : switch(ctype)
3923 : {
3924 : case OP_ANY:
3925 0 : if (max < INT_MAX)
3926 : {
3927 0 : for (i = min; i < max; i++)
3928 : {
3929 0 : if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3930 0 : eptr++;
3931 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3932 : }
3933 : }
3934 :
3935 : /* Handle unlimited UTF-8 repeat */
3936 :
3937 : else
3938 : {
3939 0 : for (i = min; i < max; i++)
3940 : {
3941 0 : if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3942 0 : eptr++;
3943 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3944 : }
3945 : }
3946 0 : break;
3947 :
3948 : case OP_ALLANY:
3949 0 : if (max < INT_MAX)
3950 : {
3951 0 : for (i = min; i < max; i++)
3952 : {
3953 0 : if (eptr >= md->end_subject) break;
3954 0 : eptr++;
3955 0 : while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3956 : }
3957 : }
3958 0 : else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3959 0 : break;
3960 :
3961 : /* The byte case is the same as non-UTF8 */
3962 :
3963 : case OP_ANYBYTE:
3964 0 : c = max - min;
3965 0 : if (c > (unsigned int)(md->end_subject - eptr))
3966 0 : c = md->end_subject - eptr;
3967 0 : eptr += c;
3968 0 : break;
3969 :
3970 : case OP_ANYNL:
3971 0 : for (i = min; i < max; i++)
3972 : {
3973 0 : int len = 1;
3974 0 : if (eptr >= md->end_subject) break;
3975 0 : GETCHARLEN(c, eptr, len);
3976 0 : if (c == 0x000d)
3977 : {
3978 0 : if (++eptr >= md->end_subject) break;
3979 0 : if (*eptr == 0x000a) eptr++;
3980 : }
3981 : else
3982 : {
3983 0 : if (c != 0x000a &&
3984 : (md->bsr_anycrlf ||
3985 : (c != 0x000b && c != 0x000c &&
3986 : c != 0x0085 && c != 0x2028 && c != 0x2029)))
3987 : break;
3988 0 : eptr += len;
3989 : }
3990 : }
3991 0 : break;
3992 :
3993 : case OP_NOT_HSPACE:
3994 : case OP_HSPACE:
3995 0 : for (i = min; i < max; i++)
3996 : {
3997 : BOOL gotspace;
3998 0 : int len = 1;
3999 0 : if (eptr >= md->end_subject) break;
4000 0 : GETCHARLEN(c, eptr, len);
4001 0 : switch(c)
4002 : {
4003 0 : default: gotspace = FALSE; break;
4004 : case 0x09: /* HT */
4005 : case 0x20: /* SPACE */
4006 : case 0xa0: /* NBSP */
4007 : case 0x1680: /* OGHAM SPACE MARK */
4008 : case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4009 : case 0x2000: /* EN QUAD */
4010 : case 0x2001: /* EM QUAD */
4011 : case 0x2002: /* EN SPACE */
4012 : case 0x2003: /* EM SPACE */
4013 : case 0x2004: /* THREE-PER-EM SPACE */
4014 : case 0x2005: /* FOUR-PER-EM SPACE */
4015 : case 0x2006: /* SIX-PER-EM SPACE */
4016 : case 0x2007: /* FIGURE SPACE */
4017 : case 0x2008: /* PUNCTUATION SPACE */
4018 : case 0x2009: /* THIN SPACE */
4019 : case 0x200A: /* HAIR SPACE */
4020 : case 0x202f: /* NARROW NO-BREAK SPACE */
4021 : case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4022 : case 0x3000: /* IDEOGRAPHIC SPACE */
4023 0 : gotspace = TRUE;
4024 : break;
4025 : }
4026 0 : if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4027 0 : eptr += len;
4028 : }
4029 0 : break;
4030 :
4031 : case OP_NOT_VSPACE:
4032 : case OP_VSPACE:
4033 0 : for (i = min; i < max; i++)
4034 : {
4035 : BOOL gotspace;
4036 0 : int len = 1;
4037 0 : if (eptr >= md->end_subject) break;
4038 0 : GETCHARLEN(c, eptr, len);
4039 0 : switch(c)
4040 : {
4041 0 : default: gotspace = FALSE; break;
4042 : case 0x0a: /* LF */
4043 : case 0x0b: /* VT */
4044 : case 0x0c: /* FF */
4045 : case 0x0d: /* CR */
4046 : case 0x85: /* NEL */
4047 : case 0x2028: /* LINE SEPARATOR */
4048 : case 0x2029: /* PARAGRAPH SEPARATOR */
4049 0 : gotspace = TRUE;
4050 : break;
4051 : }
4052 0 : if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4053 0 : eptr += len;
4054 : }
4055 0 : break;
4056 :
4057 : case OP_NOT_DIGIT:
4058 0 : for (i = min; i < max; i++)
4059 : {
4060 0 : int len = 1;
4061 0 : if (eptr >= md->end_subject) break;
4062 0 : GETCHARLEN(c, eptr, len);
4063 0 : if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4064 0 : eptr+= len;
4065 : }
4066 0 : break;
4067 :
4068 : case OP_DIGIT:
4069 0 : for (i = min; i < max; i++)
4070 : {
4071 0 : int len = 1;
4072 0 : if (eptr >= md->end_subject) break;
4073 0 : GETCHARLEN(c, eptr, len);
4074 0 : if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4075 0 : eptr+= len;
4076 : }
4077 0 : break;
4078 :
4079 : case OP_NOT_WHITESPACE:
4080 0 : for (i = min; i < max; i++)
4081 : {
4082 0 : int len = 1;
4083 0 : if (eptr >= md->end_subject) break;
4084 0 : GETCHARLEN(c, eptr, len);
4085 0 : if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4086 0 : eptr+= len;
4087 : }
4088 0 : break;
4089 :
4090 : case OP_WHITESPACE:
4091 0 : for (i = min; i < max; i++)
4092 : {
4093 0 : int len = 1;
4094 0 : if (eptr >= md->end_subject) break;
4095 0 : GETCHARLEN(c, eptr, len);
4096 0 : if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4097 0 : eptr+= len;
4098 : }
4099 0 : break;
4100 :
4101 : case OP_NOT_WORDCHAR:
4102 0 : for (i = min; i < max; i++)
4103 : {
4104 0 : int len = 1;
4105 0 : if (eptr >= md->end_subject) break;
4106 0 : GETCHARLEN(c, eptr, len);
4107 0 : if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4108 0 : eptr+= len;
4109 : }
4110 0 : break;
4111 :
4112 : case OP_WORDCHAR:
4113 0 : for (i = min; i < max; i++)
4114 : {
4115 0 : int len = 1;
4116 0 : if (eptr >= md->end_subject) break;
4117 0 : GETCHARLEN(c, eptr, len);
4118 0 : if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4119 0 : eptr+= len;
4120 : }
4121 0 : break;
4122 :
4123 : default:
4124 0 : RRETURN(PCRE_ERROR_INTERNAL);
4125 : }
4126 :
4127 : /* eptr is now past the end of the maximum run */
4128 :
4129 0 : if (possessive) continue;
4130 : for(;;)
4131 : {
4132 0 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4133 0 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4134 0 : if (eptr-- == pp) break; /* Stop if tried at original pos */
4135 0 : BACKCHAR(eptr);
4136 0 : }
4137 : }
4138 : else
4139 : #endif /* SUPPORT_UTF8 */
4140 :
4141 : /* Not UTF-8 mode */
4142 : {
4143 34873 : switch(ctype)
4144 : {
4145 : case OP_ANY:
4146 15797 : for (i = min; i < max; i++)
4147 : {
4148 15793 : if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4149 15222 : eptr++;
4150 : }
4151 575 : break;
4152 :
4153 : case OP_ALLANY:
4154 : case OP_ANYBYTE:
4155 728 : c = max - min;
4156 728 : if (c > (unsigned int)(md->end_subject - eptr))
4157 728 : c = md->end_subject - eptr;
4158 728 : eptr += c;
4159 728 : break;
4160 :
4161 : case OP_ANYNL:
4162 0 : for (i = min; i < max; i++)
4163 : {
4164 0 : if (eptr >= md->end_subject) break;
4165 0 : c = *eptr;
4166 0 : if (c == 0x000d)
4167 : {
4168 0 : if (++eptr >= md->end_subject) break;
4169 0 : if (*eptr == 0x000a) eptr++;
4170 : }
4171 : else
4172 : {
4173 0 : if (c != 0x000a &&
4174 : (md->bsr_anycrlf ||
4175 : (c != 0x000b && c != 0x000c && c != 0x0085)))
4176 : break;
4177 0 : eptr++;
4178 : }
4179 : }
4180 0 : break;
4181 :
4182 : case OP_NOT_HSPACE:
4183 0 : for (i = min; i < max; i++)
4184 : {
4185 0 : if (eptr >= md->end_subject) break;
4186 0 : c = *eptr;
4187 0 : if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4188 0 : eptr++;
4189 : }
4190 0 : break;
4191 :
4192 : case OP_HSPACE:
4193 0 : for (i = min; i < max; i++)
4194 : {
4195 0 : if (eptr >= md->end_subject) break;
4196 0 : c = *eptr;
4197 0 : if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4198 0 : eptr++;
4199 : }
4200 0 : break;
4201 :
4202 : case OP_NOT_VSPACE:
4203 0 : for (i = min; i < max; i++)
4204 : {
4205 0 : if (eptr >= md->end_subject) break;
4206 0 : c = *eptr;
4207 0 : if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4208 : break;
4209 0 : eptr++;
4210 : }
4211 0 : break;
4212 :
4213 : case OP_VSPACE:
4214 0 : for (i = min; i < max; i++)
4215 : {
4216 0 : if (eptr >= md->end_subject) break;
4217 0 : c = *eptr;
4218 0 : if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4219 0 : break;
4220 0 : eptr++;
4221 : }
4222 0 : break;
4223 :
4224 : case OP_NOT_DIGIT:
4225 5 : for (i = min; i < max; i++)
4226 : {
4227 5 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4228 : break;
4229 4 : eptr++;
4230 : }
4231 1 : break;
4232 :
4233 : case OP_DIGIT:
4234 60257 : for (i = min; i < max; i++)
4235 : {
4236 60257 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4237 : break;
4238 32606 : eptr++;
4239 : }
4240 27651 : break;
4241 :
4242 : case OP_NOT_WHITESPACE:
4243 72 : for (i = min; i < max; i++)
4244 : {
4245 72 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4246 : break;
4247 67 : eptr++;
4248 : }
4249 5 : break;
4250 :
4251 : case OP_WHITESPACE:
4252 8607 : for (i = min; i < max; i++)
4253 : {
4254 8607 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4255 : break;
4256 2837 : eptr++;
4257 : }
4258 5770 : break;
4259 :
4260 : case OP_NOT_WORDCHAR:
4261 0 : for (i = min; i < max; i++)
4262 : {
4263 0 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4264 : break;
4265 0 : eptr++;
4266 : }
4267 0 : break;
4268 :
4269 : case OP_WORDCHAR:
4270 788 : for (i = min; i < max; i++)
4271 : {
4272 782 : if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4273 : break;
4274 645 : eptr++;
4275 : }
4276 143 : break;
4277 :
4278 : default:
4279 0 : RRETURN(PCRE_ERROR_INTERNAL);
4280 : }
4281 :
4282 : /* eptr is now past the end of the maximum run */
4283 :
4284 34873 : if (possessive) continue;
4285 271358 : while (eptr >= pp)
4286 : {
4287 260585 : RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4288 260585 : eptr--;
4289 260585 : if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4290 : }
4291 : }
4292 :
4293 : /* Get here if we can't make it match with any permitted repetitions */
4294 :
4295 3016 : RRETURN(MATCH_NOMATCH);
4296 : }
4297 : /* Control never gets here */
4298 :
4299 : /* There's been some horrible disaster. Arrival here can only mean there is
4300 : something seriously wrong in the code above or the OP_xxx definitions. */
4301 :
4302 : default:
4303 : DPRINTF(("Unknown opcode %d\n", *ecode));
4304 0 : RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4305 : }
4306 :
4307 : /* Do not stick any code in here without much thought; it is assumed
4308 : that "continue" in the code above comes out to here to repeat the main
4309 : loop. */
4310 :
4311 16143551 : } /* End of main loop */
4312 : /* Control never reaches here */
4313 :
4314 :
4315 : /* When compiling to use the heap rather than the stack for recursive calls to
4316 : match(), the RRETURN() macro jumps here. The number that is saved in
4317 : frame->Xwhere indicates which label we actually want to return to. */
4318 :
4319 : #ifdef NO_RECURSE
4320 : #define LBL(val) case val: goto L_RM##val;
4321 : HEAP_RETURN:
4322 : switch (frame->Xwhere)
4323 : {
4324 : LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4325 : LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4326 : LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4327 : LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4328 : LBL(53) LBL(54)
4329 : #ifdef SUPPORT_UTF8
4330 : LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4331 : LBL(32) LBL(34) LBL(42) LBL(46)
4332 : #ifdef SUPPORT_UCP
4333 : LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4334 : #endif /* SUPPORT_UCP */
4335 : #endif /* SUPPORT_UTF8 */
4336 : default:
4337 : DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4338 : return PCRE_ERROR_INTERNAL;
4339 : }
4340 : #undef LBL
4341 : #endif /* NO_RECURSE */
4342 : }
4343 :
4344 :
4345 : /***************************************************************************
4346 : ****************************************************************************
4347 : RECURSION IN THE match() FUNCTION
4348 :
4349 : Undefine all the macros that were defined above to handle this. The names in the first group are undefined only when NO_RECURSE is defined. */
4350 :
4351 : #ifdef NO_RECURSE
4352 : #undef eptr
4353 : #undef ecode
4354 : #undef mstart
4355 : #undef offset_top
4356 : #undef ims
4357 : #undef eptrb
4358 : #undef flags
4359 :
4360 : #undef callpat
4361 : #undef charptr
4362 : #undef data
4363 : #undef next
4364 : #undef pp
4365 : #undef prev
4366 : #undef saved_eptr
4367 :
4368 : #undef new_recursive
4369 :
4370 : #undef cur_is_word
4371 : #undef condition
4372 : #undef prev_is_word
4373 :
4374 : #undef original_ims
4375 :
4376 : #undef ctype
4377 : #undef length
4378 : #undef max
4379 : #undef min
4380 : #undef number
4381 : #undef offset
4382 : #undef op
4383 : #undef save_capture_last
4384 : #undef save_offset1
4385 : #undef save_offset2
4386 : #undef save_offset3
4387 : #undef stacksave
4388 :
4389 : #undef newptrb
4390 :
4391 : #endif
4392 :
4393 : /* These two are defined as macros in both cases, so they are undefined outside the NO_RECURSE conditional. */
4394 :
4395 : #undef fc
4396 : #undef fi
4397 :
4398 : /***************************************************************************
4399 : ***************************************************************************/
4400 :
4401 :
4402 :
4403 : /*************************************************
4404 : * Execute a Regular Expression *
4405 : *************************************************/
4406 :
4407 : /* This function applies a compiled re to a subject string and picks out
4408 : portions of the string if it matches. Two elements in the vector are set for
4409 : each substring: the offsets to the start and end of the substring.
4410 :
4411 : Arguments:
4412 : argument_re points to the compiled expression
4413 : extra_data points to extra data or is NULL
4414 : subject points to the subject string
4415 : length length of subject string (may contain binary zeros)
4416 : start_offset where to start in the subject string (NOTE(review): not range-checked against length in this function)
4417 : options option bits
4418 : offsets points to a vector of ints to be filled in with offsets
4419 : offsetcount the number of elements in the vector
4420 :
4421 : Returns: > 0 => success; value is the number of elements filled in
4422 : = 0 => success, but offsets is not big enough
4423 : -1 => failed to match
4424 : < -1 => some kind of unexpected problem
4425 : */
4426 : /* Error codes returned directly by this function: PCRE_ERROR_BADOPTION, PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADNEWLINE, PCRE_ERROR_BADPARTIAL, PCRE_ERROR_BADUTF8, PCRE_ERROR_BADUTF8_OFFSET, PCRE_ERROR_NOMEMORY, PCRE_ERROR_PARTIAL and PCRE_ERROR_NOMATCH; any other error value is passed back unchanged from match(). */
4427 : PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4428 : pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4429 : PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4430 : int offsetcount)
4431 1290903 : {
4432 : int rc, resetcount, ocount;
4433 1290903 : int first_byte = -1;
4434 1290903 : int req_byte = -1;
4435 1290903 : int req_byte2 = -1;
4436 : int newline;
4437 : unsigned long int ims;
4438 1290903 : BOOL using_temporary_offsets = FALSE;
4439 : BOOL anchored;
4440 : BOOL startline;
4441 : BOOL firstline;
4442 1290903 : BOOL first_byte_caseless = FALSE;
4443 1290903 : BOOL req_byte_caseless = FALSE;
4444 : BOOL utf8;
4445 : match_data match_block;
4446 1290903 : match_data *md = &match_block;
4447 : const uschar *tables;
4448 1290903 : const uschar *start_bits = NULL;
4449 1290903 : USPTR start_match = (USPTR)subject + start_offset;
4450 : USPTR end_subject;
4451 1290903 : USPTR req_byte_ptr = start_match - 1;
4452 : /* req_byte_ptr starts just before start_match so that the first required-byte search in the main loop is never skipped. */
4453 : pcre_study_data internal_study;
4454 : const pcre_study_data *study;
4455 :
4456 : real_pcre internal_re;
4457 1290903 : const real_pcre *external_re = (const real_pcre *)argument_re;
4458 1290903 : const real_pcre *re = external_re;
4459 :
4460 : /* Plausibility checks: unknown option bits, a NULL pattern or subject, a NULL offsets vector with a positive count, and a negative count are all rejected. */
4461 :
4462 1290903 : if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4463 1290903 : if (re == NULL || subject == NULL ||
4464 0 : (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4465 1290903 : if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4466 :
4467 : /* Fish out the optional data from the extra_data structure, first setting
4468 : the default values. */
4469 :
4470 1290903 : study = NULL;
4471 1290903 : md->match_limit = MATCH_LIMIT;
4472 1290903 : md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4473 1290903 : md->callout_data = NULL;
4474 :
4475 : /* The table pointer is always in native byte order. */
4476 :
4477 1290903 : tables = external_re->tables;
4478 :
4479 1290903 : if (extra_data != NULL)
4480 : {
4481 1290825 : register unsigned int flags = extra_data->flags;
4482 1290825 : if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4483 12 : study = (const pcre_study_data *)extra_data->study_data;
4484 1290825 : if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4485 1290825 : md->match_limit = extra_data->match_limit;
4486 1290825 : if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4487 1290825 : md->match_limit_recursion = extra_data->match_limit_recursion;
4488 1290825 : if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4489 0 : md->callout_data = extra_data->callout_data;
4490 1290825 : if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4491 : }
4492 :
4493 : /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4494 : is a feature that makes it possible to save compiled regexes and re-use them
4495 : in other programs later. */
4496 :
4497 1290903 : if (tables == NULL) tables = _pcre_default_tables;
4498 :
4499 : /* Check that the first field in the block is the magic number. If it is not,
4500 : test for a regex that was compiled on a host of opposite endianness. If this is
4501 : the case, flipped values are put in internal_re and internal_study if there was
4502 : study data too. */
4503 :
4504 1290903 : if (re->magic_number != MAGIC_NUMBER)
4505 : {
4506 0 : re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4507 0 : if (re == NULL) return PCRE_ERROR_BADMAGIC;
4508 0 : if (study != NULL) study = &internal_study;
4509 : }
4510 :
4511 : /* Set up other data */
4512 :
4513 1290903 : anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4514 1290903 : startline = (re->flags & PCRE_STARTLINE) != 0;
4515 1290903 : firstline = (re->options & PCRE_FIRSTLINE) != 0;
4516 :
4517 : /* The code starts after the real_pcre block and the capture name table. */
4518 :
4519 1290903 : md->start_code = (const uschar *)external_re + re->name_table_offset +
4520 : re->name_count * re->name_entry_size;
4521 :
4522 1290903 : md->start_subject = (USPTR)subject;
4523 1290903 : md->start_offset = start_offset;
4524 1290903 : md->end_subject = md->start_subject + length;
4525 1290903 : end_subject = md->end_subject;
4526 :
4527 1290903 : md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4528 1290903 : utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4529 1290903 : md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4530 :
4531 1290903 : md->notbol = (options & PCRE_NOTBOL) != 0;
4532 1290903 : md->noteol = (options & PCRE_NOTEOL) != 0;
4533 1290903 : md->notempty = (options & PCRE_NOTEMPTY) != 0;
4534 1290903 : md->partial = (options & PCRE_PARTIAL) != 0;
4535 1290903 : md->hitend = FALSE;
4536 :
4537 1290903 : md->recursive = NULL; /* No recursion at top level */
4538 :
4539 1290903 : md->lcc = tables + lcc_offset;
4540 1290903 : md->ctypes = tables + ctypes_offset;
4541 :
4542 : /* Handle different \R options. Run-time bits take precedence; with none set, the compile-time bits apply, then the build default. Both bits set at run time is an error. */
4543 :
4544 1290903 : switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4545 : {
4546 : case 0:
4547 1290903 : if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4548 0 : md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4549 : else
4550 : #ifdef BSR_ANYCRLF
4551 : md->bsr_anycrlf = TRUE;
4552 : #else
4553 1290903 : md->bsr_anycrlf = FALSE;
4554 : #endif
4555 1290903 : break;
4556 :
4557 : case PCRE_BSR_ANYCRLF:
4558 0 : md->bsr_anycrlf = TRUE;
4559 0 : break;
4560 :
4561 : case PCRE_BSR_UNICODE:
4562 0 : md->bsr_anycrlf = FALSE;
4563 0 : break;
4564 :
4565 0 : default: return PCRE_ERROR_BADNEWLINE;
4566 : }
4567 :
4568 : /* Handle different types of newline. The three bits give eight cases. If
4569 : nothing is set at run time, whatever was used at compile time applies. */
4570 :
4571 : switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4572 1290903 : (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4573 : {
4574 1290903 : case 0: newline = NEWLINE; break; /* Compile-time default */
4575 0 : case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4576 0 : case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4577 : case PCRE_NEWLINE_CR+
4578 0 : PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4579 0 : case PCRE_NEWLINE_ANY: newline = -1; break;
4580 0 : case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4581 0 : default: return PCRE_ERROR_BADNEWLINE;
4582 : }
4583 : /* Decode the value chosen above: -2 is ANYCRLF, any other negative value is ANY, a value above 255 is a two-byte sequence, otherwise a single byte. */
4584 1290903 : if (newline == -2)
4585 : {
4586 0 : md->nltype = NLTYPE_ANYCRLF;
4587 : }
4588 1290903 : else if (newline < 0)
4589 : {
4590 0 : md->nltype = NLTYPE_ANY;
4591 : }
4592 : else
4593 : {
4594 1290903 : md->nltype = NLTYPE_FIXED;
4595 1290903 : if (newline > 255)
4596 : {
4597 0 : md->nllen = 2;
4598 0 : md->nl[0] = (newline >> 8) & 255;
4599 0 : md->nl[1] = newline & 255;
4600 : }
4601 : else
4602 : {
4603 1290903 : md->nllen = 1;
4604 1290903 : md->nl[0] = newline;
4605 : }
4606 : }
4607 :
4608 : /* Partial matching is supported only for a restricted set of regexes at the
4609 : moment. */
4610 :
4611 1290903 : if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4612 0 : return PCRE_ERROR_BADPARTIAL;
4613 :
4614 : /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4615 : back the character offset. If start_offset lies strictly inside the subject, the byte it points at must not be a continuation byte (10xxxxxx); otherwise PCRE_ERROR_BADUTF8_OFFSET is returned. */
4616 :
4617 : #ifdef SUPPORT_UTF8
4618 1290903 : if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4619 : {
4620 15 : if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
4621 1 : return PCRE_ERROR_BADUTF8;
4622 14 : if (start_offset > 0 && start_offset < length)
4623 : {
4624 0 : int tb = ((USPTR)subject)[start_offset];
4625 0 : if (tb > 127)
4626 : {
4627 0 : tb &= 0xc0;
4628 0 : if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4629 : }
4630 : }
4631 : }
4632 : #endif
4633 :
4634 : /* The ims options can vary during the matching as a result of the presence
4635 : of (?ims) items in the pattern. They are kept in a local variable so that
4636 : restoring at the exit of a group is easy. */
4637 :
4638 1290902 : ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4639 :
4640 : /* If the expression has got more back references than the offsets supplied can
4641 : hold, we get a temporary chunk of working store to use during the matching.
4642 : Otherwise, we can use the vector supplied, rounding down its size to a multiple
4643 : of 3. */
4644 :
4645 1290902 : ocount = offsetcount - (offsetcount % 3);
4646 : /* The temporary vector holds top_backref + 1 triples of ints; it is released on both exit paths after ENDLOOP. */
4647 1290902 : if (re->top_backref > 0 && re->top_backref >= ocount/3)
4648 : {
4649 0 : ocount = re->top_backref * 3 + 3;
4650 0 : md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4651 0 : if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4652 0 : using_temporary_offsets = TRUE;
4653 : DPRINTF(("Got memory to hold back references\n"));
4654 : }
4655 1290902 : else md->offset_vector = offsets;
4656 :
4657 1290902 : md->offset_end = ocount;
4658 1290902 : md->offset_max = (2*ocount)/3;
4659 1290902 : md->offset_overflow = FALSE;
4660 1290902 : md->capture_last = -1;
4661 :
4662 : /* Compute the minimum number of offsets that we need to reset each time. Doing
4663 : this makes a huge difference to execution time when there aren't many brackets
4664 : in the pattern. If the computed count exceeds offsetcount, ocount is used instead. */
4665 :
4666 1290902 : resetcount = 2 + re->top_bracket * 2;
4667 1290902 : if (resetcount > offsetcount) resetcount = ocount;
4668 :
4669 : /* Reset the working variable associated with each extraction. These should
4670 : never be used unless previously set, but they get saved and restored, and so we
4671 : initialize them to avoid reading uninitialized locations. */
4672 :
4673 1290902 : if (md->offset_vector != NULL)
4674 : {
4675 1290881 : register int *iptr = md->offset_vector + ocount;
4676 1290881 : register int *iend = iptr - resetcount/2 + 1;
4677 1290881 : while (--iptr >= iend) *iptr = -1;
4678 : }
4679 :
4680 : /* Set up the first character to match, if available. The first_byte value is
4681 : never set for an anchored regular expression, but the anchoring may be forced
4682 : at run time, so we have to test for anchoring. The first char may be unset for
4683 : an unanchored pattern, of course. If there's no first char and the pattern was
4684 : studied, there may be a bitmap of possible first characters. */
4685 :
4686 1290902 : if (!anchored)
4687 : {
4688 28956 : if ((re->flags & PCRE_FIRSTSET) != 0)
4689 : {
4690 21377 : first_byte = re->first_byte & 255;
4691 21377 : if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4692 14 : first_byte = md->lcc[first_byte];
4693 : }
4694 : else
4695 7579 : if (!startline && study != NULL &&
4696 : (study->options & PCRE_STUDY_MAPPED) != 0)
4697 12 : start_bits = study->start_bits;
4698 : }
4699 :
4700 : /* For anchored or unanchored matches, there may be a "last known required
4701 : character" set. */
4702 :
4703 1290902 : if ((re->flags & PCRE_REQCHSET) != 0)
4704 : {
4705 901209 : req_byte = re->req_byte & 255;
4706 901209 : req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4707 901209 : req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4708 : }
4709 :
4710 :
4711 : /* ==========================================================================*/
4712 :
4713 : /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4714 : the loop runs just once. */
4715 :
4716 : for(;;)
4717 : {
4718 1328590 : USPTR save_end_subject = end_subject;
4719 : USPTR new_start_match;
4720 :
4721 : /* Reset the maximum number of extractions we might see. */
4722 :
4723 1328590 : if (md->offset_vector != NULL)
4724 : {
4725 1328569 : register int *iptr = md->offset_vector;
4726 1328569 : register int *iend = iptr + resetcount;
4727 1328569 : while (iptr < iend) *iptr++ = -1;
4728 : }
4729 :
4730 : /* If firstline is TRUE, the start of the match is constrained to the first
4731 : line of a multiline string. That is, the match must be before or at the first
4732 : newline. Implement this by temporarily adjusting end_subject so that we stop
4733 : scanning at a newline. If the match fails at the newline, later code breaks
4734 : this loop. */
4735 :
4736 1328590 : if (firstline)
4737 : {
4738 0 : USPTR t = start_match;
4739 : #ifdef SUPPORT_UTF8
4740 0 : if (utf8)
4741 : {
4742 0 : while (t < md->end_subject && !IS_NEWLINE(t))
4743 : {
4744 0 : t++;
4745 0 : while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4746 : }
4747 : }
4748 : else
4749 : #endif
4750 0 : while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4751 0 : end_subject = t;
4752 : }
4753 :
4754 : /* There are some optimizations that avoid running the match if a known
4755 : starting point is not found, or if a known later character is not present.
4756 : However, there is an option that disables these, for testing and for ensuring
4757 : that all callouts do actually occur. */
4758 :
4759 1328590 : if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4760 : {
4761 : /* Advance to a unique first byte if there is one. */
4762 :
4763 1328590 : if (first_byte >= 0)
4764 : {
4765 21973 : if (first_byte_caseless)
4766 143 : while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4767 107 : start_match++;
4768 : else
4769 18283954 : while (start_match < end_subject && *start_match != first_byte)
4770 18240044 : start_match++;
4771 : }
4772 :
4773 : /* Or to just after a linebreak for a multiline match */
4774 :
4775 1306617 : else if (startline)
4776 : {
4777 73 : if (start_match > md->start_subject + start_offset)
4778 : {
4779 : #ifdef SUPPORT_UTF8
4780 30 : if (utf8)
4781 : {
4782 0 : while (start_match < end_subject && !WAS_NEWLINE(start_match))
4783 : {
4784 0 : start_match++;
4785 0 : while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4786 0 : start_match++;
4787 : }
4788 : }
4789 : else
4790 : #endif
4791 177 : while (start_match < end_subject && !WAS_NEWLINE(start_match))
4792 117 : start_match++;
4793 :
4794 : /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4795 : and we are now at a LF, advance the match position by one more character.
4796 : */
4797 :
4798 30 : if (start_match[-1] == CHAR_CR &&
4799 : (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4800 : start_match < end_subject &&
4801 : *start_match == CHAR_NL)
4802 0 : start_match++;
4803 : }
4804 : }
4805 :
4806 : /* Or to a non-unique first byte after study */
4807 :
4808 1306544 : else if (start_bits != NULL)
4809 : {
4810 37 : while (start_match < end_subject)
4811 : {
4812 21 : register unsigned int c = *start_match;
4813 34 : if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4814 8 : else break;
4815 : }
4816 : }
4817 : } /* Starting optimizations */
4818 :
4819 : /* Restore fudged end_subject */
4820 :
4821 1328590 : end_subject = save_end_subject;
4822 :
4823 : #ifdef DEBUG /* Sigh. Some compilers never learn. */
4824 : printf(">>>> Match against: ");
4825 : pchars(start_match, end_subject - start_match, TRUE, md);
4826 : printf("\n");
4827 : #endif
4828 :
4829 : /* If req_byte is set, we know that that character must appear in the
4830 : subject for the match to succeed. If the first character is set, req_byte
4831 : must be later in the subject; otherwise the test starts at the match point.
4832 : This optimization can save a huge amount of backtracking in patterns with
4833 : nested unlimited repeats that aren't going to match. Writing separate code
4834 : for cased/caseless versions makes it go faster, as does using an
4835 : autoincrement and backing off on a match.
4836 :
4837 : HOWEVER: when the subject string is very, very long, searching to its end
4838 : can take a long time, and give bad performance on quite ordinary patterns.
4839 : This showed up when somebody was matching something like /^\d+C/ on a
4840 : 32-megabyte string... so we don't do this when the string is sufficiently
4841 : long.
4842 :
4843 : ALSO: this processing is disabled when partial matching is requested, or if
4844 : disabling is explicitly requested. */
4845 :
4846 1328590 : if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4847 : req_byte >= 0 &&
4848 : end_subject - start_match < REQ_BYTE_MAX &&
4849 : !md->partial)
4850 : {
4851 897759 : register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4852 :
4853 : /* We don't need to repeat the search if we haven't yet reached the
4854 : place we found it at last time. */
4855 :
4856 897759 : if (p > req_byte_ptr)
4857 : {
4858 897419 : if (req_byte_caseless)
4859 : {
4860 2425 : while (p < end_subject)
4861 : {
4862 1928 : register int pp = *p++;
4863 1928 : if (pp == req_byte || pp == req_byte2) { p--; break; }
4864 : }
4865 : }
4866 : else
4867 : {
4868 16529982 : while (p < end_subject)
4869 : {
4870 14874519 : if (*p++ == req_byte) { p--; break; }
4871 : }
4872 : }
4873 :
4874 : /* If we can't find the required character, break the matching loop,
4875 : forcing a match failure. */
4876 :
4877 897419 : if (p >= end_subject)
4878 : {
4879 758541 : rc = MATCH_NOMATCH;
4880 758541 : break;
4881 : }
4882 :
4883 : /* If we have found the required character, save the point where we
4884 : found it, so that we don't search again next time round the loop if
4885 : the start hasn't passed this character yet. */
4886 :
4887 138878 : req_byte_ptr = p;
4888 : }
4889 : }
4890 :
4891 : /* OK, we can now run the match. */
4892 :
4893 570049 : md->start_match_ptr = start_match;
4894 570049 : md->match_call_count = 0;
4895 570049 : rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4896 :
4897 570049 : switch(rc)
4898 : {
4899 : /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4900 : exactly like PRUNE. */
4901 :
4902 : case MATCH_NOMATCH:
4903 : case MATCH_PRUNE:
4904 : case MATCH_THEN:
4905 513565 : new_start_match = start_match + 1;
4906 : #ifdef SUPPORT_UTF8
4907 513565 : if (utf8)
4908 54 : while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4909 2 : new_start_match++;
4910 : #endif
4911 513565 : break;
4912 :
4913 : /* SKIP passes back the next starting point explicitly. */
4914 :
4915 : case MATCH_SKIP:
4916 0 : new_start_match = md->start_match_ptr;
4917 0 : break;
4918 :
4919 : /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4920 :
4921 : case MATCH_COMMIT:
4922 0 : rc = MATCH_NOMATCH;
4923 0 : goto ENDLOOP;
4924 :
4925 : /* Any other return is either a successful match (MATCH_MATCH) or some kind of error; both leave the loop and are told apart after ENDLOOP. */
4926 :
4927 : default:
4928 56484 : goto ENDLOOP;
4929 : }
4930 :
4931 : /* Control reaches here for the various types of "no match at this point"
4932 : result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4933 :
4934 513565 : rc = MATCH_NOMATCH;
4935 :
4936 : /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4937 : newline in the subject (though it may continue over the newline). Therefore,
4938 : if we have just failed to match, starting at a newline, do not continue. */
4939 :
4940 513565 : if (firstline && IS_NEWLINE(start_match)) break;
4941 :
4942 : /* Advance to new matching position */
4943 :
4944 513565 : start_match = new_start_match;
4945 :
4946 : /* Break the loop if the pattern is anchored or if we have passed the end of
4947 : the subject. */
4948 :
4949 513565 : if (anchored || start_match > end_subject) break;
4950 :
4951 : /* If we have just passed a CR and we are now at a LF, and the pattern does
4952 : not contain any explicit matches for \r or \n, and the newline option is CRLF
4953 : or ANY or ANYCRLF, advance the match position by one more character. */
4954 :
4955 37688 : if (start_match[-1] == CHAR_CR &&
4956 : start_match < end_subject &&
4957 : *start_match == CHAR_NL &&
4958 : (re->flags & PCRE_HASCRORLF) == 0 &&
4959 : (md->nltype == NLTYPE_ANY ||
4960 : md->nltype == NLTYPE_ANYCRLF ||
4961 : md->nllen == 2))
4962 0 : start_match++;
4963 :
4964 37688 : } /* End of for(;;) "bumpalong" loop */
4965 :
4966 : /* ==========================================================================*/
4967 :
4968 : /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4969 : conditions is true:
4970 :
4971 : (1) The pattern is anchored or the match was failed by (*COMMIT);
4972 :
4973 : (2) We are past the end of the subject;
4974 :
4975 : (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4976 : this option requests that a match occur at or before the first newline in
4977 : the subject.
4978 :
4979 : When we have a match and the offset vector is big enough to deal with any
4980 : backreferences, captured substring offsets will already be set up. In the case
4981 : where we had to get some local store to hold offsets for backreference
4982 : processing, copy those that we can. In this case there need not be overflow if
4983 : certain parts of the pattern were not used, even though there are more
4984 : capturing parentheses than vector slots. */
4985 :
4986 1290902 : ENDLOOP:
4987 :
4988 1290902 : if (rc == MATCH_MATCH)
4989 : {
4990 56478 : if (using_temporary_offsets)
4991 : {
4992 0 : if (offsetcount >= 4)
4993 : {
4994 0 : memcpy(offsets + 2, md->offset_vector + 2,
4995 : (offsetcount - 2) * sizeof(int));
4996 : DPRINTF(("Copied offsets from temporary memory\n"));
4997 : }
4998 0 : if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4999 : DPRINTF(("Freeing temporary memory\n"));
5000 0 : (pcre_free)(md->offset_vector);
5001 : }
5002 :
5003 : /* Set the return code to the number of captured strings, or 0 if there are
5004 : too many to fit into the vector. */
5005 :
5006 56478 : rc = md->offset_overflow? 0 : md->end_offset_top/2;
5007 :
5008 : /* If there is space, set up the whole thing as substring 0. The value of
5009 : md->start_match_ptr might be modified if \K was encountered on the successful
5010 : matching path. */
5011 :
5012 56478 : if (offsetcount < 2) rc = 0; else
5013 : {
5014 56465 : offsets[0] = md->start_match_ptr - md->start_subject;
5015 56465 : offsets[1] = md->end_match_ptr - md->start_subject;
5016 : }
5017 :
5018 : DPRINTF((">>>> returning %d\n", rc));
5019 56478 : return rc;
5020 : }
5021 :
5022 : /* Control gets here if there has been an error, or if the overall match
5023 : attempt has failed at all permitted starting positions. */
5024 :
5025 1234424 : if (using_temporary_offsets)
5026 : {
5027 : DPRINTF(("Freeing temporary memory\n"));
5028 0 : (pcre_free)(md->offset_vector);
5029 : }
5030 :
5031 1234424 : if (rc != MATCH_NOMATCH)
5032 : {
5033 : DPRINTF((">>>> error: returning %d\n", rc));
5034 6 : return rc;
5035 : }
5036 1234418 : else if (md->partial && md->hitend)
5037 : {
5038 : DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5039 0 : return PCRE_ERROR_PARTIAL;
5040 : }
5041 : else
5042 : {
5043 : DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5044 1234418 : return PCRE_ERROR_NOMATCH;
5045 : }
5046 : }
5047 :
5048 : /* End of pcre_exec.c */
|