PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LTP GCOV extension - code coverage report
Current view: directory - pcre/pcrelib - pcre_compile.c
Test: PHP Code Coverage
Date: 2009-11-19 Instrumented lines: 1985
Code covered: 56.8 % Executed lines: 1128
Legend: not executed executed

       1                 : /*************************************************
       2                 : *      Perl-Compatible Regular Expressions       *
       3                 : *************************************************/
       4                 : 
       5                 : /* PCRE is a library of functions to support regular expressions whose syntax
       6                 : and semantics are as close as possible to those of the Perl 5 language.
       7                 : 
       8                 :                        Written by Philip Hazel
       9                 :            Copyright (c) 1997-2009 University of Cambridge
      10                 : 
      11                 : -----------------------------------------------------------------------------
      12                 : Redistribution and use in source and binary forms, with or without
      13                 : modification, are permitted provided that the following conditions are met:
      14                 : 
      15                 :     * Redistributions of source code must retain the above copyright notice,
      16                 :       this list of conditions and the following disclaimer.
      17                 : 
      18                 :     * Redistributions in binary form must reproduce the above copyright
      19                 :       notice, this list of conditions and the following disclaimer in the
      20                 :       documentation and/or other materials provided with the distribution.
      21                 : 
      22                 :     * Neither the name of the University of Cambridge nor the names of its
      23                 :       contributors may be used to endorse or promote products derived from
      24                 :       this software without specific prior written permission.
      25                 : 
      26                 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
      27                 : AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      28                 : IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
      29                 : ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
      30                 : LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
      31                 : CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
      32                 : SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
      33                 : INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
      34                 : CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
      35                 : ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
      36                 : POSSIBILITY OF SUCH DAMAGE.
      37                 : -----------------------------------------------------------------------------
      38                 : */
      39                 : 
      40                 : 
      41                 : /* This module contains the external function pcre_compile(), along with
      42                 : supporting internal functions that are not used by other modules. */
      43                 : 
      44                 : 
      45                 : #include "config.h"
      46                 : 
      47                 : #define NLBLOCK cd             /* Block containing newline information */
      48                 : #define PSSTART start_pattern  /* Field containing processed string start */
      49                 : #define PSEND   end_pattern    /* Field containing processed string end */
      50                 : 
      51                 : #include "pcre_internal.h"
      52                 : 
      53                 : 
      54                 : /* When DEBUG is defined, we need the pcre_printint() function, which is also
      55                 : used by pcretest. DEBUG is not defined when building a production library. */
      56                 : 
      57                 : #ifdef DEBUG
      58                 : #include "pcre_printint.src"
      59                 : #endif
      60                 : 
      61                 : 
      /* Macro for setting individual bits in class bitmaps: it sets bit (b % 8) of
         byte (b / 8) in the byte array a. Both arguments are parenthesized so that
         compound expressions such as "c + 1" or "map + 4" expand as intended, and
         the whole expansion is parenthesized so it behaves as a single expression.
         Note that b is still evaluated twice, so it must not have side effects. */

      #define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
      65                 : 
      66                 : /* Maximum length value to check against when making sure that the integer that
      67                 : holds the compiled pattern length does not overflow. We make it a bit less than
      68                 : INT_MAX to allow for adding in group terminating bytes, so that we don't have
      69                 : to check them every time. */
      70                 : 
      71                 : #define OFLOW_MAX (INT_MAX - 20)  /* 20 below INT_MAX: headroom for group terminating bytes */
      72                 : 
      73                 : 
      74                 : /*************************************************
      75                 : *      Code parameters and static tables         *
      76                 : *************************************************/
      77                 : 
      78                 : /* This value specifies the size of stack workspace that is used during the
      79                 : first pre-compile phase that determines how much memory is required. The regex
      80                 : is partly compiled into this space, but the compiled parts are discarded as
      81                 : soon as they can be, so that hopefully there will never be an overrun. The code
      82                 : does, however, check for an overrun. The largest amount I've seen used is 218,
      83                 : so this number is very generous.
      84                 : 
      85                 : The same workspace is used during the second, actual compile phase for
      86                 : remembering forward references to groups so that they can be filled in at the
      87                 : end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
      88                 : is 4 there is plenty of room. */
      89                 : 
      90                 : #define COMPILE_WORK_SIZE (4096)  /* size of the stack workspace used by both compile phases */
      91                 : 
      92                 : 
      93                 : /* Table for handling escaped characters in the range '0'-'z'. Positive returns
      94                 : are simple data values; negative values are for special things like \d and so
      95                 : on. Zero means further processing is needed (for things like \x), or the escape
      96                 : is invalid. */
      97                 : 
      98                 : #ifndef EBCDIC
      99                 : 
     100                 : /* This is the "normal" table for ASCII systems or for EBCDIC systems running
     101                 : in UTF-8 mode. */
     102                 : 
     103                 : static const short int escapes[] = {  /* 75 entries, two per row: first is for '0', last for 'z'; row comments name the characters */
     104                 :      0,                       0,  /* 0 1 */
     105                 :      0,                       0,  /* 2 3 */
     106                 :      0,                       0,  /* 4 5 */
     107                 :      0,                       0,  /* 6 7 */
     108                 :      0,                       0,  /* 8 9 */
     109                 :      CHAR_COLON,              CHAR_SEMICOLON,  /* : ; */
     110                 :      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,  /* < = */
     111                 :      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,  /* > ? */
     112                 :      CHAR_COMMERCIAL_AT,      -ESC_A,  /* @ A */
     113                 :      -ESC_B,                  -ESC_C,  /* B C */
     114                 :      -ESC_D,                  -ESC_E,  /* D E */
     115                 :      0,                       -ESC_G,  /* F G */
     116                 :      -ESC_H,                  0,  /* H I */
     117                 :      0,                       -ESC_K,  /* J K */
     118                 :      0,                       0,  /* L M */
     119                 :      0,                       0,  /* N O */
     120                 :      -ESC_P,                  -ESC_Q,  /* P Q */
     121                 :      -ESC_R,                  -ESC_S,  /* R S */
     122                 :      0,                       0,  /* T U */
     123                 :      -ESC_V,                  -ESC_W,  /* V W */
     124                 :      -ESC_X,                  0,  /* X Y */
     125                 :      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,  /* Z [ */
     126                 :      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash ] */
     127                 :      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,  /* ^ _ */
     128                 :      CHAR_GRAVE_ACCENT,       7,  /* ` a  (7 is the data value for the a escape) */
     129                 :      -ESC_b,                  0,  /* b c */
     130                 :      -ESC_d,                  ESC_e,  /* d e */
     131                 :      ESC_f,                   0,  /* f g */
     132                 :      -ESC_h,                  0,  /* h i */
     133                 :      0,                       -ESC_k,  /* j k */
     134                 :      0,                       0,  /* l m */
     135                 :      ESC_n,                   0,  /* n o */
     136                 :      -ESC_p,                  0,  /* p q */
     137                 :      ESC_r,                   -ESC_s,  /* r s */
     138                 :      ESC_tee,                 0,  /* t u */
     139                 :      -ESC_v,                  -ESC_w,  /* v w */
     140                 :      0,                       0,  /* x y */
     141                 :      -ESC_z  /* z */
     142                 : };
     143                 : 
     144                 : #else
     145                 : 
     146                 : /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
     147                 : 
     148                 : static const short int escapes[] = {  /* row labels are hex character codes; 8 entries per row, covering 0x48-0xFF */
     149                 : /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
     150                 : /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
     151                 : /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
     152                 : /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
     153                 : /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
     154                 : /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
     155                 : /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
     156                 : /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
     157                 : /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
     158                 : /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
     159                 : /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
     160                 : /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
     161                 : /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
     162                 : /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
     163                 : /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
     164                 : /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
     165                 : /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
     166                 : /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
     167                 : /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
     168                 : /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
     169                 : /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
     170                 : /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
     171                 : /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
     172                 : };
     173                 : #endif
     174                 : 
     175                 : 
     176                 : /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
     177                 : searched linearly. Put all the names into a single string, in order to reduce
     178                 : the number of relocations when a shared library is dynamically linked. The
     179                 : string is built from string macros so that it works in UTF-8 mode on EBCDIC
     180                 : platforms. */
     181                 : 
     182                 : typedef struct verbitem {  /* element type of the verbs[] table */
     183                 :   int   len;  /* in verbs[] this matches the character count of the verb's name (6 for ACCEPT, 1 for F, ...) */
     184                 :   int   op;   /* OP_* opcode paired with that name in verbs[] */
     185                 : } verbitem;
     186                 : 
     187                 : static const char verbnames[] =  /* verb names concatenated into one string, in the same order as verbs[] */
     188                 :   STRING_ACCEPT0
     189                 :   STRING_COMMIT0
     190                 :   STRING_F0
     191                 :   STRING_FAIL0
     192                 :   STRING_PRUNE0
     193                 :   STRING_SKIP0
     194                 :   STRING_THEN;  /* NOTE(review): no "0" suffix on the last item - presumably the literal's own terminator ends it; the STRING_* macros are defined elsewhere */
     195                 : 
     196                 : static const verbitem verbs[] = {  /* { len, op } pairs; row comments give the matching verbnames item */
     197                 :   { 6, OP_ACCEPT },  /* STRING_ACCEPT0 */
     198                 :   { 6, OP_COMMIT },  /* STRING_COMMIT0 */
     199                 :   { 1, OP_FAIL },    /* STRING_F0 - the one-letter name shares OP_FAIL */
     200                 :   { 4, OP_FAIL },    /* STRING_FAIL0 */
     201                 :   { 5, OP_PRUNE },   /* STRING_PRUNE0 */
     202                 :   { 4, OP_SKIP  },   /* STRING_SKIP0 */
     203                 :   { 4, OP_THEN  }    /* STRING_THEN */
     204                 : };
     205                 : 
     206                 : static const int verbcount = sizeof(verbs)/sizeof(verbitem);  /* number of entries in verbs[] */
     207                 : 
     208                 : 
     209                 : /* Tables of names of POSIX character classes and their lengths. The names are
     210                 : now all in a single string, to reduce the number of relocations when a shared
     211                 : library is dynamically loaded. The list of lengths is terminated by a zero
     212                 : length entry. The first three must be alpha, lower, upper, as this is assumed
     213                 : for handling case independence. */
     214                 : 
     215                 : static const char posix_names[] =  /* 14 class names in one string; order must match posix_name_lengths and posix_class_maps */
     216                 :   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
     217                 :   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
     218                 :   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
     219                 :   STRING_word0  STRING_xdigit;  /* last name has no "0" suffix; the STRING_* macros are defined elsewhere */
     220                 : 
     221                 : static const uschar posix_name_lengths[] = {  /* one length per name in posix_names, in the same order */
     222                 :   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };  /* twelve 5-letter names, then word (4), xdigit (6), and the 0 terminator */
     223                 : 
     224                 : /* Table of class bit maps for each POSIX class. Each class is formed from a
     225                 : base map, with an optional addition or removal of another map. Then, for some
     226                 : classes, there is some additional tweaking: for [:blank:] the vertical space
     227                 : characters are removed, and for [:alpha:] and [:alnum:] the underscore
     228                 : character is removed. The triples in the table consist of the base map offset,
     229                 : second map offset or -1 if no second map, and a non-negative value for map
     230                 : addition or a negative value for map subtraction (if there are two maps). The
     231                 : absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
     232                 : remove vertical space characters, 2 => remove underscore. */
     233                 : 
     234                 : static const int posix_class_maps[] = {  /* 14 triples { base map, second map or -1, tweak }, one per class named in the row comment */
     235                 :   cbit_word,  cbit_digit, -2,             /* alpha */
     236                 :   cbit_lower, -1,          0,             /* lower */
     237                 :   cbit_upper, -1,          0,             /* upper */
     238                 :   cbit_word,  -1,          2,             /* alnum - word without underscore */
     239                 :   cbit_print, cbit_cntrl,  0,             /* ascii */
     240                 :   cbit_space, -1,          1,             /* blank - a GNU extension */
     241                 :   cbit_cntrl, -1,          0,             /* cntrl */
     242                 :   cbit_digit, -1,          0,             /* digit */
     243                 :   cbit_graph, -1,          0,             /* graph */
     244                 :   cbit_print, -1,          0,             /* print */
     245                 :   cbit_punct, -1,          0,             /* punct */
     246                 :   cbit_space, -1,          0,             /* space */
     247                 :   cbit_word,  -1,          0,             /* word - a Perl extension */
     248                 :   cbit_xdigit,-1,          0              /* xdigit */
     249                 : };
     250                 : 
     251                 : 
     252                 : #define STRING(a)  # a  /* turns the argument, exactly as written, into a string literal */
     253                 : #define XSTRING(s) STRING(s)  /* macro-expands s first, then stringizes it, e.g. XSTRING(MAX_NAME_SIZE) yields the number's digits */
     254                 : 
     255                 : /* The texts of compile-time error messages. These are "char *" because they
     256                 : are passed to the outside world. Do not ever re-use any error number, because
     257                 : they are documented. Always add a new error instead. Messages marked DEAD below
     258                 : are no longer used. This used to be a table of strings, but in order to reduce
     259                 : the number of relocations needed when a shared library is loaded dynamically,
     260                 : it is now one long string. We cannot use a table of offsets, because the
     261                 : lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
     262                 : simply count through to the one we want - this isn't a performance issue
     263                 : because these strings are used only when there is a compilation error. */
     264                 : 
     265                 : static const char error_texts[] =  /* messages separated by explicit \0; each numbered marker gives the index of the message that follows it */
     266                 :   "no error\0"
     267                 :   "\\ at end of pattern\0"
     268                 :   "\\c at end of pattern\0"
     269                 :   "unrecognized character follows \\\0"
     270                 :   "numbers out of order in {} quantifier\0"
     271                 :   /* 5 */
     272                 :   "number too big in {} quantifier\0"
     273                 :   "missing terminating ] for character class\0"
     274                 :   "invalid escape sequence in character class\0"
     275                 :   "range out of order in character class\0"
     276                 :   "nothing to repeat\0"
     277                 :   /* 10 */
     278                 :   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
     279                 :   "internal error: unexpected repeat\0"
     280                 :   "unrecognized character after (? or (?-\0"
     281                 :   "POSIX named classes are supported only within a class\0"
     282                 :   "missing )\0"
     283                 :   /* 15 */
     284                 :   "reference to non-existent subpattern\0"
     285                 :   "erroffset passed as NULL\0"
     286                 :   "unknown option bit(s) set\0"
     287                 :   "missing ) after comment\0"
     288                 :   "parentheses nested too deeply\0"  /** DEAD **/
     289                 :   /* 20 */
     290                 :   "regular expression is too large\0"
     291                 :   "failed to get memory\0"
     292                 :   "unmatched parentheses\0"
     293                 :   "internal error: code overflow\0"
     294                 :   "unrecognized character after (?<\0"
     295                 :   /* 25 */
     296                 :   "lookbehind assertion is not fixed length\0"
     297                 :   "malformed number or name after (?(\0"
     298                 :   "conditional group contains more than two branches\0"
     299                 :   "assertion expected after (?(\0"
     300                 :   "(?R or (?[+-]digits must be followed by )\0"
     301                 :   /* 30 */
     302                 :   "unknown POSIX class name\0"
     303                 :   "POSIX collating elements are not supported\0"
     304                 :   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
     305                 :   "spare error\0"  /** DEAD **/
     306                 :   "character value in \\x{...} sequence is too large\0"
     307                 :   /* 35 */
     308                 :   "invalid condition (?(0)\0"
     309                 :   "\\C not allowed in lookbehind assertion\0"
     310                 :   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
     311                 :   "number after (?C is > 255\0"
     312                 :   "closing ) for (?C expected\0"
     313                 :   /* 40 */
     314                 :   "recursive call could loop indefinitely\0"
     315                 :   "unrecognized character after (?P\0"
     316                 :   "syntax error in subpattern name (missing terminator)\0"
     317                 :   "two named subpatterns have the same name\0"
     318                 :   "invalid UTF-8 string\0"
     319                 :   /* 45 */
     320                 :   "support for \\P, \\p, and \\X has not been compiled\0"
     321                 :   "malformed \\P or \\p sequence\0"
     322                 :   "unknown property name after \\P or \\p\0"
     323                 :   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
     324                 :   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
     325                 :   /* 50 */
     326                 :   "repeated subpattern is too long\0"    /** DEAD **/
     327                 :   "octal value is greater than \\377 (not in UTF-8 mode)\0"
     328                 :   "internal error: overran compiling workspace\0"
     329                 :   "internal error: previously-checked referenced subpattern not found\0"
     330                 :   "DEFINE group contains more than one branch\0"
     331                 :   /* 55 */
     332                 :   "repeating a DEFINE group is not allowed\0"
     333                 :   "inconsistent NEWLINE options\0"
     334                 :   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
     335                 :   "a numbered reference must not be zero\0"
     336                 :   "(*VERB) with an argument is not supported\0"
     337                 :   /* 60 */
     338                 :   "(*VERB) not recognized\0"
     339                 :   "number is too big\0"
     340                 :   "subpattern name expected\0"
     341                 :   "digit expected after (?+\0"
     342                 :   "] is an invalid data character in JavaScript compatibility mode";  /* message 64; no explicit \0 - the literal's implicit terminator ends it */
     343                 : 
     344                 : 
     345                 : /* Table to identify digits and hex digits. This is used when compiling
     346                 : patterns. Note that the tables in chartables are dependent on the locale, and
     347                 : may mark arbitrary characters as digits - but the PCRE compiling code expects
     348                 : to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
     349                 : a private table here. It costs 256 bytes, but it is a lot faster than doing
     350                 : character value tests (at least in some simple cases I timed), and in some
     351                 : applications one wants PCRE to compile efficiently as well as match
     352                 : efficiently.
     353                 : 
     354                 : For convenience, we use the same bit definitions as in chartables:
     355                 : 
     356                 :   0x04   decimal digit
     357                 :   0x08   hexadecimal digit
     358                 : 
     359                 : Then we can use ctype_digit and ctype_xdigit in the code. */
     360                 : 
     361                 : #ifndef EBCDIC
     362                 : 
     363                 : /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
     364                 : UTF-8 mode. */
     365                 : 
     366                 : static const unsigned char digitab[] =  /* 256 entries, one per character code; row comments give the codes covered */
     367                 :   {  /* 0x0c = decimal digit and hex digit (0x04 | 0x08); 0x08 = hex digit only; 0x00 = neither */
     368                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
     369                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
     370                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
     371                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
     372                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
     373                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
     374                 :   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
     375                 :   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
     376                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
     377                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
     378                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
     379                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
     380                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
     381                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
     382                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
     383                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
     384                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
     385                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
     386                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
     387                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
     388                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
     389                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
     390                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
     391                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
     392                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
     393                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
     394                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
     395                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
     396                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
     397                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
     398                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
     399                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
     400                 : 
     401                 : #else
     402                 : 
     403                 : /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
     404                 : 
     405                 : static const unsigned char digitab[] =  /* 256 entries, one per EBCDIC character code; the right-hand column of the row comments is the hex code */
     406                 :   {  /* 0x0c = decimal digit and hex digit (0x04 | 0x08); 0x08 = hex digit only; 0x00 = neither */
     407                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
     408                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
     409                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
     410                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
     411                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
     412                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
     413                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
     414                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
     415                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
     416                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
     417                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
     418                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
     419                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
     420                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
     421                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
     422                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
     423                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
     424                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
     425                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
     426                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
     427                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
     428                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
     429                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
     430                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
     431                 :   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
     432                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
     433                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
     434                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
     435                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
     436                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
     437                 :   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
     438                 :   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
     439                 : 
     440                 : static const unsigned char ebcdic_chartab[] = { /* chartable partial dup:
                         :   256 flag bytes, indexed by EBCDIC character code. check_escape() treats
                         :   a character as alphanumeric when (entry & 0x0E) is non-zero. */
     441                 :   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
     442                 :   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
     443                 :   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
     444                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
     445                 :   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
     446                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
     447                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
     448                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
     449                 :   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
     450                 :   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
     451                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
     452                 :   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
     453                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
     454                 :   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
     455                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
     456                 :   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
     457                 :   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
     458                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
     459                 :   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
     460                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
     461                 :   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
     462                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
     463                 :   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
     464                 :   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
     465                 :   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
     466                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
     467                 :   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
     468                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
     469                 :   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
     470                 :   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
     471                 :   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
     472                 :   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
     473                 : #endif
     474                 : 
     475                 : 
     476                 : /* Forward declaration (a prototype, not a definition) of compile_regex(),
                         : needed to allow mutual recursion. */
     477                 : 
     478                 : static BOOL
     479                 :   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
     480                 :     int *, int *, branch_chain *, compile_data *, int *);
     481                 : 
     482                 : 
     483                 : 
     484                 : /*************************************************
     485                 : *            Find an error text                  *
     486                 : *************************************************/
     487                 : 
     488                 : /* The error texts are now all in one long string, to save on relocations. As
     489                 : some of the text is of unknown length, we can't use a table of offsets.
     490                 : Instead, just count through the strings. This is not a performance issue
     491                 : because it happens only when there has been a compilation error.
     492                 : 
                         : The search steps over n zero-terminated strings, starting at the beginning
                         : of error_texts, and returns the address of the string that follows them. A
                         : value of n that is zero or negative therefore yields the first text.
                         : 
                         : NOTE(review): n is not checked against the number of texts in the list, so
                         : a value that is too large makes the scan read beyond the end of error_texts.
                         : 
     493                 : Argument:   the error number
     494                 : Returns:    pointer to the error string
     495                 : */
     496                 : 
     497                 : static const char *
     498                 : find_error_text(int n)
     499               4 : {
     500               4 : const char *s = error_texts;
     501               4 : for (; n > 0; n--) while (*s++ != 0) {};   /* Step over one text per pass */
     502               4 : return s;
     503                 : }
     504                 : 
     505                 : 
     506                 : /*************************************************
     507                 : *            Handle escapes                      *
     508                 : *************************************************/
     509                 : 
     510                 : /* This function is called when a \ has been encountered. It either returns a
     511                 : positive value for a simple escape such as \n, or a negative value which
     512                 : encodes one of the more complicated things such as \d. A backreference to group
     513                 : n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
     514                 : UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
     515                 : ptr is pointing at the \. On exit, it is on the final character of the escape
     516                 : sequence.
     517                 : 
     518                 : Arguments:
     519                 :   ptrptr         points to the pattern position pointer
     520                 :   errorcodeptr   points to the errorcode variable
     521                 :   bracount       number of previous extracting brackets
     522                 :   options        the options bits
     523                 :   isclass        TRUE if inside a character class
     524                 : 
     525                 : Returns:         zero or positive => a data character
     526                 :                  negative => a special escape sequence
     527                 :                  on error, errorcodeptr is set
     528                 : */
     529                 : 
     530                 : static int
     531                 : check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
     532                 :   int options, BOOL isclass)
     533         1752830 : {
     534         1752830 : BOOL utf8 = (options & PCRE_UTF8) != 0;
     535         1752830 : const uschar *ptr = *ptrptr + 1;
     536                 : int c, i;
     537                 : 
     538         1752830 : GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
     539         1752830 : ptr--;                            /* Set pointer back to the last byte */
     540                 : 
     541                 : /* If backslash is at the end of the pattern, it's an error. */
     542                 : 
     543         1752830 : if (c == 0) *errorcodeptr = ERR1;
     544                 : 
     545                 : /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
     546                 : in a table. A non-zero result is something that can be returned immediately.
     547                 : Otherwise further processing may be required. */
     548                 : 
     549                 : #ifndef EBCDIC  /* ASCII/UTF-8 coding */
     550         1752830 : else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
     551          749150 : else if ((i = escapes[c - CHAR_0]) != 0) c = i;
     552                 : 
     553                 : #else           /* EBCDIC coding */
     554                 : else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
     555                 : else if ((i = escapes[c - 0x48]) != 0)  c = i;
     556                 : #endif
     557                 : 
     558                 : /* Escapes that need further processing, or are illegal. */
     559                 : 
     560                 : else
     561                 :   {
     562                 :   const uschar *oldptr;
     563                 :   BOOL braced, negated;
     564                 : 
     565            2671 :   switch (c)
     566                 :     {
     567                 :     /* A number of Perl escapes are not handled by PCRE. We give an explicit
     568                 :     error. */
     569                 : 
     570                 :     case CHAR_l:
     571                 :     case CHAR_L:
     572                 :     case CHAR_N:
     573                 :     case CHAR_u:
     574                 :     case CHAR_U:
     575               0 :     *errorcodeptr = ERR37;
     576               0 :     break;
     577                 : 
     578                 :     /* \g must be followed by one of a number of specific things:
     579                 : 
     580                 :     (1) A number, either plain or braced. If positive, it is an absolute
     581                 :     backreference. If negative, it is a relative backreference. This is a Perl
     582                 :     5.10 feature.
     583                 : 
     584                 :     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
     585                 :     is part of Perl's movement towards a unified syntax for back references. As
     586                 :     this is synonymous with \k{name}, we fudge it up by pretending it really
     587                 :     was \k.
     588                 : 
     589                 :     (3) For Oniguruma compatibility we also support \g followed by a name or a
     590                 :     number either in angle brackets or in single quotes. However, these are
     591                 :     (possibly recursive) subroutine calls, _not_ backreferences. Just return
     592                 :     the -ESC_g code (cf \k). */
     593                 : 
     594                 :     case CHAR_g:
     595               0 :     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
     596                 :       {
     597               0 :       c = -ESC_g;
     598               0 :       break;
     599                 :       }
     600                 : 
     601                 :     /* Handle the Perl-compatible cases */
     602                 : 
     603               0 :     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
     604                 :       {
     605                 :       const uschar *p;
     606               0 :       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
     607               0 :         if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
     608               0 :       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
     609                 :         {
     610               0 :         c = -ESC_k;
     611               0 :         break;
     612                 :         }
     613               0 :       braced = TRUE;
     614               0 :       ptr++;
     615                 :       }
     616               0 :     else braced = FALSE;
     617                 : 
     618               0 :     if (ptr[1] == CHAR_MINUS)
     619                 :       {
     620               0 :       negated = TRUE;
     621               0 :       ptr++;
     622                 :       }
     623               0 :     else negated = FALSE;
     624                 : 
     625               0 :     c = 0;
     626               0 :     while ((digitab[ptr[1]] & ctype_digit) != 0)
     627               0 :       c = c * 10 + *(++ptr) - CHAR_0;
     628                 : 
     629               0 :     if (c < 0)   /* Integer overflow */
     630                 :       {
     631               0 :       *errorcodeptr = ERR61;
     632               0 :       break;
     633                 :       }
     634                 : 
     635               0 :     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
     636                 :       {
     637               0 :       *errorcodeptr = ERR57;
     638               0 :       break;
     639                 :       }
     640                 : 
     641               0 :     if (c == 0)
     642                 :       {
     643               0 :       *errorcodeptr = ERR58;
     644               0 :       break;
     645                 :       }
     646                 : 
     647               0 :     if (negated)
     648                 :       {
     649               0 :       if (c > bracount)
     650                 :         {
     651               0 :         *errorcodeptr = ERR15;
     652               0 :         break;
     653                 :         }
     654               0 :       c = bracount - (c - 1);
     655                 :       }
     656                 : 
     657               0 :     c = -(ESC_REF + c);
     658               0 :     break;
     659                 : 
     660                 :     /* The handling of escape sequences consisting of a string of digits
     661                 :     starting with one that is not zero is not straightforward. By experiment,
     662                 :     the way Perl works seems to be as follows:
     663                 : 
     664                 :     Outside a character class, the digits are read as a decimal number. If the
     665                 :     number is less than 10, or if there are that many previous extracting
     666                 :     left brackets, then it is a back reference. Otherwise, up to three octal
     667                 :     digits are read to form an escaped byte. Thus \123 is likely to be octal
     668                 :     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
     669                 :     value is greater than 377, the least significant 8 bits are taken. Inside a
     670                 :     character class, \ followed by a digit is always an octal number. */
     671                 : 
     672                 :     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
     673                 :     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
     674                 : 
     675             410 :     if (!isclass)
     676                 :       {
     677             410 :       oldptr = ptr;
     678             410 :       c -= CHAR_0;
     679             820 :       while ((digitab[ptr[1]] & ctype_digit) != 0)
     680               0 :         c = c * 10 + *(++ptr) - CHAR_0;
     681             410 :       if (c < 0)    /* Integer overflow */
     682                 :         {
     683               0 :         *errorcodeptr = ERR61;
     684               0 :         break;
     685                 :         }
     686             410 :       if (c < 10 || c <= bracount)
     687                 :         {
     688             410 :         c = -(ESC_REF + c);
     689             410 :         break;
     690                 :         }
     691               0 :       ptr = oldptr;      /* Put the pointer back and fall through */
     692                 :       }
     693                 : 
     694                 :     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
     695                 :     generates a binary zero byte and treats the digit as a following literal.
     696                 :     Thus we have to pull back the pointer by one. */
     697                 : 
     698               0 :     if ((c = *ptr) >= CHAR_8)
     699                 :       {
     700               0 :       ptr--;
     701               0 :       c = 0;
     702               0 :       break;
     703                 :       }
     704                 : 
     705                 :     /* \0 always starts an octal number, but we may drop through to here with a
     706                 :     larger first octal digit. The original code used just to take the least
     707                 :     significant 8 bits of octal numbers (I think this is what early Perls used
     708                 :     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
     709                 :     than 3 octal digits. */
     710                 : 
     711                 :     case CHAR_0:
     712            2254 :     c -= CHAR_0;
     713            8932 :     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
     714            4424 :         c = c * 8 + *(++ptr) - CHAR_0;
     715            2254 :     if (!utf8 && c > 255) *errorcodeptr = ERR51;
     716            2254 :     break;
     717                 : 
     718                 :     /* \x is complicated. \x{ddd} is a character number which can be greater
     719                 :     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
     720                 :     treated as a data character. */
     721                 : 
     722                 :     case CHAR_x:
     723               4 :     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
     724                 :       {
     725               0 :       const uschar *pt = ptr + 2;
     726               0 :       int count = 0;
     727                 : 
     728               0 :       c = 0;
     729               0 :       while ((digitab[*pt] & ctype_xdigit) != 0)
     730                 :         {
     731               0 :         register int cc = *pt++;
     732               0 :         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
     733               0 :         count++;
     734                 : 
     735                 : #ifndef EBCDIC  /* ASCII/UTF-8 coding */
     736               0 :         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
     737               0 :         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
     738                 : #else           /* EBCDIC coding */
     739                 :         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
     740                 :         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
     741                 : #endif
     742                 :         }
     743                 : 
     744               0 :       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
     745                 :         {
     746               0 :         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
     747               0 :         ptr = pt;
     748               0 :         break;
     749                 :         }
     750                 : 
     751                 :       /* If the sequence of hex digits does not end with '}', then we don't
     752                 :       recognize this construct; fall through to the normal \x handling. */
     753                 :       }
     754                 : 
     755                 :     /* Read just a single-byte hex-defined char */
     756                 : 
     757               4 :     c = 0;
     758              16 :     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
     759                 :       {
     760                 :       int cc;                                  /* Some compilers don't like */
     761               8 :       cc = *(++ptr);                           /* ++ in initializers */
     762                 : #ifndef EBCDIC  /* ASCII/UTF-8 coding */
     763               8 :       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
     764               8 :       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
     765                 : #else           /* EBCDIC coding */
     766                 :       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
     767                 :       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
     768                 : #endif
     769                 :       }
     770               4 :     break;
     771                 : 
     772                 :     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
     773                 :     This coding is ASCII-specific, but then the whole concept of \cx is
     774                 :     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
     775                 : 
     776                 :     case CHAR_c:
     777               0 :     c = *(++ptr);
     778               0 :     if (c == 0)
     779                 :       {
     780               0 :       *errorcodeptr = ERR2;
     781               0 :       break;
     782                 :       }
     783                 : 
     784                 : #ifndef EBCDIC  /* ASCII/UTF-8 coding */
     785               0 :     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
     786               0 :     c ^= 0x40;
     787                 : #else           /* EBCDIC coding */
     788                 :     if (c >= CHAR_a && c <= CHAR_z) c += 64;
     789                 :     c ^= 0xC0;
     790                 : #endif
     791               0 :     break;
     792                 : 
     793                 :     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
     794                 :     other alphanumeric following \ is an error if PCRE_EXTRA was set;
     795                 :     otherwise, for Perl compatibility, it is a literal. This code looks a bit
     796                 :     odd, but there used to be some cases other than the default, and there may
     797                 :     be again in future, so I haven't "optimized" it. */
     798                 : 
     799                 :     default:
     800               3 :     if ((options & PCRE_EXTRA) != 0) switch(c)
     801                 :       {
     802                 :       default:
     803               1 :       *errorcodeptr = ERR3;
     804                 :       break;
     805                 :       }
     806                 :     break;
     807                 :     }
     808                 :   }
     809                 : 
     810         1752830 : *ptrptr = ptr;
     811         1752830 : return c;
     812                 : }
     813                 : 
     814                 : 
     815                 : 
     816                 : #ifdef SUPPORT_UCP
     817                 : /*************************************************
     818                 : *               Handle \P and \p                 *
     819                 : *************************************************/
     820                 : 
     821                 : /* This function is called after \P or \p has been encountered, provided that
     822                 : PCRE is compiled with support for Unicode properties. On entry, ptrptr is
     823                 : pointing at the P or p. On exit, it is pointing at the final character of the
     824                 : escape sequence.
     825                 : 
                         : The property name is copied into a 32-byte local buffer and looked up in
                         : the _pcre_utt table by binary chop. A name in braces may be at most 30
                         : characters long: a longer one is treated in the same way as a missing '}'.
                         : Two different errors can be given:
                         : 
                         :   ERR46   the pattern ends before the name is complete, or the closing '}'
                         :             is not found within the permitted length
                         :   ERR47   the name is well formed, but is not in the table
                         : 
                         : If the pattern ends immediately after the P or p, the function returns
                         : before anything has been stored via negptr.
                         : 
                         : NOTE(review): the binary chop presumably depends on the entries of
                         : _pcre_utt being in strcmp() order of their names; confirm against the place
                         : where the table is defined.
                         : 
     826                 : Arguments:
     827                 :   ptrptr         points to the pattern position pointer
     828                 :   negptr         points to a boolean that is set TRUE for negation else FALSE
     829                 :   dptr           points to an int that is set to the detailed property value
     830                 :   errorcodeptr   points to the error code variable
     831                 : 
     832                 : Returns:         type value from ucp_type_table, or -1 for an invalid type
     833                 : */
     834                 : 
     835                 : static int
     836                 : get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
     837              40 : {
     838                 : int c, i, bot, top;
     839              40 : const uschar *ptr = *ptrptr;
     840                 : char name[32];                    /* Zero-terminated copy of the name */
     841                 : 
     842              40 : c = *(++ptr);
     843              40 : if (c == 0) goto ERROR_RETURN;    /* Pattern ends after the P or p */
     844                 : 
     845              40 : *negptr = FALSE;
     846                 : 
     847                 : /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
     848                 : negation. */
     849                 : 
     850              40 : if (c == CHAR_LEFT_CURLY_BRACKET)
     851                 :   {
     852              38 :   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
     853                 :     {
     854               0 :     *negptr = TRUE;
     855               0 :     ptr++;
     856                 :     }
     857              94 :   for (i = 0; i < (int)sizeof(name) - 1; i++)   /* The '}' uses up one pass */
     858                 :     {
     859              94 :     c = *(++ptr);
     860              94 :     if (c == 0) goto ERROR_RETURN;
     861              94 :     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
     862              56 :     name[i] = c;
     863                 :     }
     864              38 :   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;   /* Name too long */
     865              38 :   name[i] = 0;
     866                 :   }
     867                 : 
     868                 : /* Otherwise there is just one following character */
     869                 : 
     870                 : else
     871                 :   {
     872               2 :   name[0] = c;
     873               2 :   name[1] = 0;
     874                 :   }
     875                 : 
     876              40 : *ptrptr = ptr;
     877                 : 
     878                 : /* Search for a recognized property name using binary chop */
     879                 : 
     880              40 : bot = 0;
     881              40 : top = _pcre_utt_size;
     882                 : 
     883             250 : while (bot < top)
     884                 :   {
     885             210 :   i = (bot + top) >> 1;             /* Middle of the remaining range */
     886             210 :   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
     887             210 :   if (c == 0)
     888                 :     {
     889              40 :     *dptr = _pcre_utt[i].value;
     890              40 :     return _pcre_utt[i].type;
     891                 :     }
     892             170 :   if (c > 0) bot = i + 1; else top = i;
     893                 :   }
     894                 : 
     895               0 : *errorcodeptr = ERR47;            /* Name is not in the table */
     896               0 : *ptrptr = ptr;
     897               0 : return -1;
     898                 : 
     899               0 : ERROR_RETURN:
     900               0 : *errorcodeptr = ERR46;            /* Malformed \P or \p sequence */
     901               0 : *ptrptr = ptr;
     902               0 : return -1;
     903                 : }
     904                 : #endif
     905                 : 
     906                 : 
     907                 : 
     908                 : 
     909                 : /*************************************************
     910                 : *            Check for counted repeat            *
     911                 : *************************************************/
     912                 : 
     913                 : /* This function is called when a '{' is encountered in a place where it might
     914                 : start a quantifier. It looks ahead to see if it really is a quantifier or not.
     915                 : It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
     916                 : where the ddds are digits.
     917                 : 
                         : The first character after '{' has to be a digit, so a form without a minimum
                         : such as {,ddd} is not a quantifier. Only the syntax is checked here; the
                         : pattern is not changed and the sizes of the numbers are not looked at.
                         : 
     918                 : Arguments:
     919                 :   p         pointer to the first char after '{'
     920                 : 
     921                 : Returns:    TRUE or FALSE
     922                 : */
     923                 : 
     924                 : static BOOL
     925                 : is_counted_repeat(const uschar *p)
     926              88 : {
     927              88 : if ((digitab[*p++] & ctype_digit) == 0) return FALSE;   /* Need a minimum */
     928              54 : while ((digitab[*p] & ctype_digit) != 0) p++;
     929              54 : if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;        /* {ddd} */
     930                 : 
     931              20 : if (*p++ != CHAR_COMMA) return FALSE;
     932              20 : if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;        /* {ddd,} */
     933                 : 
     934              12 : if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
     935              12 : while ((digitab[*p] & ctype_digit) != 0) p++;
     936                 : 
     937              12 : return (*p == CHAR_RIGHT_CURLY_BRACKET);                /* {ddd,ddd} */
     938                 : }
     939                 : 
     940                 : 
     941                 : 
     942                 : /*************************************************
     943                 : *         Read repeat counts                     *
     944                 : *************************************************/
     945                 : 
     946                 : /* Read an item of the form {n,m} and return the values. This is called only
     947                 : after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
     948                 : so the syntax is guaranteed to be correct, but we need to check the values.
     949                 : 
     950                 : Arguments:
     951                 :   p              pointer to first char after '{'
     952                 :   minp           pointer to int for min
     953                 :   maxp           pointer to int for max
     954                 :                  returned as -1 if no max
     955                 :   errorcodeptr   points to error code variable
     956                 : 
     957                 : Returns:         pointer to '}' on success;
     958                 :                  current ptr on error, with errorcodeptr set non-zero
     959                 : */
     960                 : 
     961                 : static const uschar *
     962                 : read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
     963              54 : {
     964              54 : int min = 0;
     965              54 : int max = -1;
     966                 : 
     967                 : /* Read the minimum value and do a paranoid check: a negative value indicates
     968                 : an integer overflow. */
     969                 : 
     970              54 : while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
     971              54 : if (min < 0 || min > 65535)
     972                 :   {
     973               0 :   *errorcodeptr = ERR5;
     974               0 :   return p;
     975                 :   }
     976                 : 
     977                 : /* Read the maximum value if there is one, and again do a paranoid on its size.
     978                 : Also, max must not be less than min. */
     979                 : 
     980              54 : if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
     981                 :   {
     982              20 :   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
     983                 :     {
     984              12 :     max = 0;
     985              12 :     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
     986              12 :     if (max < 0 || max > 65535)
     987                 :       {
     988               0 :       *errorcodeptr = ERR5;
     989               0 :       return p;
     990                 :       }
     991              12 :     if (max < min)
     992                 :       {
     993               0 :       *errorcodeptr = ERR4;
     994               0 :       return p;
     995                 :       }
     996                 :     }
     997                 :   }
     998                 : 
     999                 : /* Fill in the required variables, and pass back the pointer to the terminating
    1000                 : '}'. */
    1001                 : 
    1002              54 : *minp = min;
    1003              54 : *maxp = max;
    1004              54 : return p;
    1005                 : }
    1006                 : 
    1007                 : 
    1008                 : 
    1009                 : /*************************************************
    1010                 : *  Subroutine for finding forward reference      *
    1011                 : *************************************************/
    1012                 : 
    1013                 : /* This recursive function is called only from find_parens() below. The
    1014                 : top-level call starts at the beginning of the pattern. All other calls must
    1015                 : start at a parenthesis. It scans along a pattern's text looking for capturing
    1016                 : subpatterns, and counting them. If it finds a named pattern that matches the
    1017                 : name it is given, it returns its number. Alternatively, if the name is NULL, it
    1018                 : returns when it reaches a given numbered subpattern. We know that if (?P< is
    1019                 : encountered, the name will be terminated by '>' because that is checked in the
    1020                 : first pass. Recursion is used to keep track of subpatterns that reset the
    1021                 : capturing group numbers - the (?| feature.
    1022                 : 
                         : While scanning, backslash escapes, \Q...\E sequences, character classes and
                         : (in /x mode) # comments are skipped, so parentheses inside them are not
                         : counted. Each nested '(' is handed to a recursive call, which returns when it
                         : reaches the matching ')'.
                         : 
    1023                 : Arguments:
    1024                 :   ptrptr       address of the current character pointer (updated)
    1025                 :   cd           compile background data (only external_options is read here)
    1026                 :   name         name to seek, or NULL if seeking a numbered subpattern
    1027                 :   lorn         name length, or subpattern number if name is NULL
    1028                 :   xmode        TRUE if we are in /x mode
    1029                 :   count        pointer to the current capturing subpattern number (updated)
    1030                 : 
    1031                 : Returns:       the number of the named or numbered subpattern, or -1 if not found.
                         :                On the -1 exits taken at a closing parenthesis or via FAIL_EXIT,
                         :                *ptrptr is set to the character at which the scan stopped. The
                         :                positive returns, and the -1 return for an unterminated character
                         :                class, leave *ptrptr unchanged.
    1032                 : */
    1033                 : 
    1034                 : static int
    1035                 : find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
    1036                 :   BOOL xmode, int *count)
    1037            1014 : {
    1038            1014 : uschar *ptr = *ptrptr;
    1039            1014 : int start_count = *count;     /* count on entry; restored at each '|' of a (?| group */
    1040            1014 : int hwm_count = start_count;  /* highest count reached by any alternative so far */
    1041            1014 : BOOL dup_parens = FALSE;      /* set TRUE when this call starts at a (?| group */
    1042                 : 
    1043                 : /* If the first character is a parenthesis, check on the type of group we are
    1044                 : dealing with. The very first call may not start with a parenthesis. */
    1045                 : 
    1046            1014 : if (ptr[0] == CHAR_LEFT_PARENTHESIS)
    1047                 :   {
    1048            1012 :   if (ptr[1] == CHAR_QUESTION_MARK &&
    1049                 :       ptr[2] == CHAR_VERTICAL_LINE)
    1050                 :     {
    1051               0 :     ptr += 3;
    1052               0 :     dup_parens = TRUE;
    1053                 :     }
    1054                 : 
    1055                 :   /* Handle a normal, unnamed capturing parenthesis */
    1056                 : 
    1057            1622 :   else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
    1058                 :     {
    1059             610 :     *count += 1;
    1060             610 :     if (name == NULL && *count == lorn) return *count;
    1061             610 :     ptr++;
    1062                 :     }
    1063                 : 
    1064                 :   /* Handle a condition. If it is an assertion, just carry on so that it
    1065                 :   is processed as normal. If not, skip to the closing parenthesis of the
    1066                 :   condition (there can't be any nested parens). */
    1067                 : 
    1068             402 :   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
    1069                 :     {
    1070               4 :     ptr += 2;
    1071               4 :     if (ptr[1] != CHAR_QUESTION_MARK)
    1072                 :       {
    1073               4 :       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
    1074               4 :       if (*ptr != 0) ptr++;
    1075                 :       }
    1076                 :     }
    1077                 : 
    1078                 :   /* We have either (? or (* and not a condition */
    1079                 : 
    1080                 :   else
    1081                 :     {
    1082             398 :     ptr += 2;
    1083             398 :     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
    1084                 : 
    1085                 :     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
    1086                 : 
    1087             398 :     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
    1088                 :         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
    1089                 :       {
    1090                 :       int term;                 /* character that ends the name: '>' or an apostrophe */
    1091                 :       const uschar *thisname;   /* start of the group's name within the pattern */
    1092               0 :       *count += 1;
    1093               0 :       if (name == NULL && *count == lorn) return *count;
    1094               0 :       term = *ptr++;
    1095               0 :       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
    1096               0 :       thisname = ptr;
    1097               0 :       while (*ptr != term) ptr++;   /* no end-of-string test: relies on the first-pass check described in the header comment */
    1098               0 :       if (name != NULL && lorn == ptr - thisname &&
    1099                 :           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
    1100               0 :         return *count;
    1101                 :       }
    1102                 :     }
    1103                 :   }
    1104                 : 
    1105                 : /* Past any initial parenthesis handling, scan for parentheses or vertical
    1106                 : bars. */
    1107                 : 
    1108            8108 : for (; *ptr != 0; ptr++)
    1109                 :   {
    1110                 :   /* Skip over backslashed characters and also entire \Q...\E. Inside \Q, a
                         :   backslash that is not followed by E does not end the quoted section. A
                         :   backslash at the very end of the pattern, or a \Q with no later backslash,
                         :   ends the scan via FAIL_EXIT.
                         :   NOTE(review): inside \Q, if the byte after a backslash is the terminating
                         :   zero, the loop goes round again and its pre-increment steps past that zero -
                         :   confirm that callers cannot pass such a pattern. */
    1111                 : 
    1112            8106 :   if (*ptr == CHAR_BACKSLASH)
    1113                 :     {
    1114             810 :     if (*(++ptr) == 0) goto FAIL_EXIT;
    1115             810 :     if (*ptr == CHAR_Q) for (;;)
    1116                 :       {
    1117               0 :       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
    1118               0 :       if (*ptr == 0) goto FAIL_EXIT;
    1119               0 :       if (*(++ptr) == CHAR_E) break;
    1120               0 :       }
    1121             810 :     continue;
    1122                 :     }
    1123                 : 
    1124                 :   /* Skip over character classes; this logic must be similar to the way they
    1125                 :   are handled for real. If the first character is '^', skip it. Also, if the
    1126                 :   first few characters (either before or after ^) are \Q\E or \E we skip them
    1127                 :   too. This makes for compatibility with Perl. Note the use of STR macros to
    1128                 :   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
    1129                 : 
    1130            7296 :   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
    1131                 :     {
    1132               8 :     BOOL negate_class = FALSE;   /* becomes TRUE once a leading '^' has been skipped */
    1133                 :     for (;;)
    1134                 :       {
    1135              12 :       int c = *(++ptr);
    1136              12 :       if (c == CHAR_BACKSLASH)
    1137                 :         {
    1138               0 :         if (ptr[1] == CHAR_E)
    1139               0 :           ptr++;
    1140               0 :         else if (strncmp((const char *)ptr+1,
    1141                 :                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
    1142               0 :           ptr += 3;
    1143                 :         else
    1144               0 :           break;
    1145                 :         }
    1146              12 :       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
    1147               4 :         negate_class = TRUE;
    1148                 :       else break;
    1149               4 :       }
    1150                 : 
    1151                 :     /* If the next character is ']', it is a data character that must be
    1152                 :     skipped, except in JavaScript compatibility mode. */
    1153                 : 
    1154               8 :     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
    1155                 :         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
    1156               0 :       ptr++;
    1157                 : 
    1158              38 :     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
    1159                 :       {
    1160              22 :       if (*ptr == 0) return -1;   /* NOTE(review): unlike FAIL_EXIT, this exit does not store ptr in *ptrptr - confirm intended */
    1161              22 :       if (*ptr == CHAR_BACKSLASH)
    1162                 :         {
    1163               4 :         if (*(++ptr) == 0) goto FAIL_EXIT;
    1164               4 :         if (*ptr == CHAR_Q) for (;;)
    1165                 :           {
    1166               0 :           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
    1167               0 :           if (*ptr == 0) goto FAIL_EXIT;
    1168               0 :           if (*(++ptr) == CHAR_E) break;
    1169               0 :           }
    1170               4 :         continue;
    1171                 :         }
    1172                 :       }
    1173               8 :     continue;
    1174                 :     }
    1175                 : 
    1176                 :   /* Skip comments in /x mode. A comment runs up to the next newline; if the
                         :   pattern ends before a newline is seen, the scan ends via FAIL_EXIT. */
    1177                 : 
    1178            7288 :   if (xmode && *ptr == CHAR_NUMBER_SIGN)
    1179                 :     {
    1180               0 :     while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
    1181               0 :     if (*ptr == 0) goto FAIL_EXIT;
    1182               0 :     continue;
    1183                 :     }
    1184                 : 
    1185                 :   /* Check for the special metacharacters: '(' starts a nested group that is
                         :   scanned by a recursive call, ')' ends the group this call is scanning, and
                         :   '|' matters only inside a (?| group, where it restarts the numbering. */
    1186                 : 
    1187            7288 :   if (*ptr == CHAR_LEFT_PARENTHESIS)
    1188                 :     {
    1189            1012 :     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
    1190            1012 :     if (rc > 0) return rc;                /* found in the nested group: pass the number up */
    1191            1012 :     if (*ptr == 0) goto FAIL_EXIT;        /* nested scan reached the end of the pattern */
    1192                 :     }
    1193                 : 
    1194            6276 :   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
    1195                 :     {
    1196            1012 :     if (dup_parens && *count < hwm_count) *count = hwm_count;   /* a (?| group ends with the largest count any alternative reached */
    1197            1012 :     *ptrptr = ptr;
    1198            1012 :     return -1;
    1199                 :     }
    1200                 : 
    1201            5264 :   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
    1202                 :     {
    1203               0 :     if (*count > hwm_count) hwm_count = *count;   /* remember the largest count seen so far */
    1204               0 :     *count = start_count;                         /* the next alternative numbers its groups from the start again */
    1205                 :     }
    1206                 :   }
    1207                 : 
    1208               2 : FAIL_EXIT:
    1209               2 : *ptrptr = ptr;
    1210               2 : return -1;
    1211                 : }
    1212                 : 
    1213                 : 
    1214                 : 
    1215                 : 
    1216                 : /*************************************************
    1217                 : *       Find forward referenced subpattern       *
    1218                 : *************************************************/
    1219                 : 
    1220                 : /* This function scans along a pattern's text looking for capturing
    1221                 : subpatterns, and counting them. If it finds a named pattern that matches the
    1222                 : name it is given, it returns its number. Alternatively, if the name is NULL, it
    1223                 : returns when it reaches a given numbered subpattern. This is used for forward
    1224                 : references to subpatterns. We used to be able to start this scan from the
    1225                 : current compiling point, using the current count value from cd->bracount, and
    1226                 : do it all in a single loop, but the addition of the possibility of duplicate
    1227                 : subpattern numbers means that we have to scan from the very start, in order to
    1228                 : take account of such duplicates, and to use a recursive function to keep track
    1229                 : of the different types of group.
    1230                 : 
    1231                 : Arguments:
    1232                 :   cd           compile background data (the scan starts at cd->start_pattern)
    1233                 :   name         name to seek, or NULL if seeking a numbered subpattern
    1234                 :   lorn         name length, or subpattern number if name is NULL
    1235                 :   xmode        TRUE if we are in /x mode
    1236                 : 
    1237                 : Returns:       the number of the found subpattern, or -1 if not found
    1238                 : */
    1239                 : 
    1240                 : static int
    1241                 : find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
    1242               2 : {
    1243               2 : uschar *ptr = (uschar *)cd->start_pattern;
    1244               2 : int count = 0;   /* running capturing-group count, shared by all the find_parens_sub() calls */
    1245                 : int rc;
    1246                 : 
    1247                 : /* If the pattern does not start with an opening parenthesis, the first call
    1248                 : to find_parens_sub() will scan right to the end (if necessary). However, if it
    1249                 : does start with a parenthesis, find_parens_sub() will return when it hits the
    1250                 : matching closing parens. That is why we have to have a loop. */
    1251                 : 
    1252                 : for (;;)
    1253                 :   {
    1254               2 :   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
    1255               2 :   if (rc > 0 || *ptr++ == 0) break;   /* stop when found or at the end of the pattern; otherwise step past the character where the sub-scan stopped */
    1256               0 :   }
    1257                 : 
    1258               2 : return rc;
    1259                 : }
    1260                 : 
    1261                 : 
    1262                 : 
    1263                 : 
    1264                 : /*************************************************
    1265                 : *      Find first significant op code            *
    1266                 : *************************************************/
    1267                 : 
    1268                 : /* This is called by several functions that scan a compiled expression looking
    1269                 : for a fixed first character, or an anchoring op code etc. It skips over things
    1270                 : that do not influence this. For some calls, a change of option is important.
    1271                 : For some calls, it makes sense to skip negative forward and all backward
    1272                 : assertions, and also the \b assertion; for others it does not.
    1273                 : 
    1274                 : Arguments:
    1275                 :   code         pointer to the start of the group
    1276                 :   options      pointer to external options (overwritten with the byte that
                         :                  follows an OP_OPT when that byte differs in optbit)
    1277                 :   optbit       the option bit whose changing is significant, or
    1278                 :                  zero if none are
    1279                 :   skipassert   TRUE if certain assertions are to be skipped
    1280                 : 
    1281                 : Returns:       pointer to the first significant opcode
    1282                 : */
    1283                 : 
    1284                 : static const uschar*
    1285                 : first_significant_code(const uschar *code, int *options, int optbit,
    1286                 :   BOOL skipassert)
    1287           18721 : {
    1288                 : for (;;)
    1289                 :   {
    1290           18721 :   switch ((int)*code)
    1291                 :     {
    1292                 :     case OP_OPT:   /* option setting: note a change in the watched bit, then skip the two-byte item */
    1293               0 :     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
    1294               0 :       *options = (int)code[1];
    1295               0 :     code += 2;
    1296               0 :     break;
    1297                 : 
    1298                 :     case OP_ASSERT_NOT:
    1299                 :     case OP_ASSERTBACK:
    1300                 :     case OP_ASSERTBACK_NOT:
    1301               3 :     if (!skipassert) return code;
    1302               1 :     do code += GET(code, 1); while (*code == OP_ALT);   /* follow the stored offsets past every alternative of the assertion */
    1303               1 :     code += _pcre_OP_lengths[*code];                    /* then skip the item that follows the last alternative (presumably the closing KET) */
    1304               1 :     break;
    1305                 : 
    1306                 :     case OP_WORD_BOUNDARY:
    1307                 :     case OP_NOT_WORD_BOUNDARY:
    1308              15 :     if (!skipassert) return code;
    1309                 :     /* Fall through */
    1310                 : 
    1311                 :     case OP_CALLOUT:
    1312                 :     case OP_CREF:
    1313                 :     case OP_RREF:
    1314                 :     case OP_DEF:
    1315               5 :     code += _pcre_OP_lengths[*code];   /* these items are always skipped; the length comes from the opcode table */
    1316               5 :     break;
    1317                 : 
    1318                 :     default:   /* anything else is significant */
    1319           18703 :     return code;
    1320                 :     }
    1321               6 :   }
    1322                 : /* Control never reaches here */
    1323                 : }
    1324                 : 
    1325                 : 
    1326                 : 
    1327                 : 
    1328                 : /*************************************************
    1329                 : *        Find the fixed length of a pattern      *
    1330                 : *************************************************/
    1331                 : 
    1332                 : /* Scan a pattern and compute the fixed length of subject that will match it,
    1333                 : if the length is fixed. This is needed for dealing with backward assertions.
    1334                 : In UTF8 mode, the result is in characters rather than bytes.
    1335                 : 
                         : Each branch of the group is measured in turn and every branch must have the
                         : same length. Nested groups are measured by a recursive call.
                         : 
    1336                 : Arguments:
    1337                 :   code     points to the start of the pattern (the bracket)
    1338                 :   options  the compiling options
    1339                 : 
    1340                 : Returns:   the fixed length, or -1 if there is no fixed length,
    1341                 :              or -2 if \C was encountered
    1342                 : */
    1343                 : 
    1344                 : static int
    1345                 : find_fixedlength(uschar *code, int options)
    1346               2 : {
    1347               2 : int length = -1;   /* agreed length of the branches seen so far; -1 until the first branch ends */
    1348                 : 
    1349               2 : register int branchlength = 0;                /* length of the branch currently being scanned */
    1350               2 : register uschar *cc = code + 1 + LINK_SIZE;   /* step over the bracket opcode and the LINK_SIZE bytes after it */
    1351                 : 
    1352                 : /* Scan along the opcodes for this branch. If we get to the end of the
    1353                 : branch, check the length against that of the other branches. */
    1354                 : 
    1355                 : for (;;)
    1356                 :   {
    1357                 :   int d;
    1358               6 :   register int op = *cc;
    1359               6 :   switch (op)
    1360                 :     {
                         :     /* Nested group: measure it recursively and add its length. For OP_CBRA the
                         :     pointer passed down is advanced by 2 (presumably over the group number -
                         :     confirm against the opcode layout). Afterwards step over every alternative
                         :     of the group and then over the 1 + LINK_SIZE bytes of the item that ends
                         :     it. */
                         : 
    1361                 :     case OP_CBRA:
    1362                 :     case OP_BRA:
    1363                 :     case OP_ONCE:
    1364                 :     case OP_COND:
    1365               0 :     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
    1366               0 :     if (d < 0) return d;
    1367               0 :     branchlength += d;
    1368               0 :     do cc += GET(cc, 1); while (*cc == OP_ALT);
    1369               0 :     cc += 1 + LINK_SIZE;
    1370               0 :     break;
    1371                 : 
    1372                 :     /* Reached end of a branch; if it's a ket it is the end of a nested
    1373                 :     call. If it's ALT it is an alternation in a nested call. If it is
    1374                 :     END it's the end of the outer call. All can be handled by the same code.
                         :     The first branch sets the length; any later branch of a different length
                         :     means there is no fixed length. */
    1375                 : 
    1376                 :     case OP_ALT:
    1377                 :     case OP_KET:
    1378                 :     case OP_KETRMAX:
    1379                 :     case OP_KETRMIN:
    1380                 :     case OP_END:
    1381               2 :     if (length < 0) length = branchlength;
    1382               0 :       else if (length != branchlength) return -1;
    1383               2 :     if (*cc != OP_ALT) return length;
    1384               0 :     cc += 1 + LINK_SIZE;
    1385               0 :     branchlength = 0;
    1386               0 :     break;
    1387                 : 
    1388                 :     /* Skip over assertive subpatterns */
    1389                 : 
    1390                 :     case OP_ASSERT:
    1391                 :     case OP_ASSERT_NOT:
    1392                 :     case OP_ASSERTBACK:
    1393                 :     case OP_ASSERTBACK_NOT:
    1394               0 :     do cc += GET(cc, 1); while (*cc == OP_ALT);
    1395                 :     /* Fall through */
    1396                 : 
    1397                 :     /* Skip over things that don't match chars */
    1398                 : 
    1399                 :     case OP_REVERSE:
    1400                 :     case OP_CREF:
    1401                 :     case OP_RREF:
    1402                 :     case OP_DEF:
    1403                 :     case OP_OPT:
    1404                 :     case OP_CALLOUT:
    1405                 :     case OP_SOD:
    1406                 :     case OP_SOM:
    1407                 :     case OP_EOD:
    1408                 :     case OP_EODN:
    1409                 :     case OP_CIRC:
    1410                 :     case OP_DOLL:
    1411                 :     case OP_NOT_WORD_BOUNDARY:
    1412                 :     case OP_WORD_BOUNDARY:
    1413               2 :     cc += _pcre_OP_lengths[*cc];
    1414               2 :     break;
    1415                 : 
    1416                 :     /* Handle literal characters. The opcode and one byte are skipped; in UTF8
                         :     mode any following bytes of the form 10xxxxxx are skipped as well. */
    1417                 : 
    1418                 :     case OP_CHAR:
    1419                 :     case OP_CHARNC:
    1420                 :     case OP_NOT:
    1421               1 :     branchlength++;
    1422               1 :     cc += 2;
    1423                 : #ifdef SUPPORT_UTF8
    1424               1 :     if ((options & PCRE_UTF8) != 0)
    1425                 :       {
    1426               0 :       while ((*cc & 0xc0) == 0x80) cc++;
    1427                 :       }
    1428                 : #endif
    1429               1 :     break;
    1430                 : 
    1431                 :     /* Handle exact repetitions. The count is already in characters, but we
    1432                 :     need to skip over a multibyte character in UTF8 mode.
                         :     NOTE(review): the skip loop here tests only the 0x80 bit, whereas the
                         :     literal-character case above tests (byte & 0xc0) == 0x80. This loop
                         :     therefore also steps over any following byte that is 0xc0 or above -
                         :     confirm whether the difference is intentional. */
    1433                 : 
    1434                 :     case OP_EXACT:
    1435               0 :     branchlength += GET2(cc,1);
    1436               0 :     cc += 4;
    1437                 : #ifdef SUPPORT_UTF8
    1438               0 :     if ((options & PCRE_UTF8) != 0)
    1439                 :       {
    1440               0 :       while((*cc & 0x80) == 0x80) cc++;
    1441                 :       }
    1442                 : #endif
    1443               0 :     break;
    1444                 : 
    1445                 :     case OP_TYPEEXACT:
    1446               0 :     branchlength += GET2(cc,1);
    1447               0 :     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;   /* a property type is followed by two more bytes */
    1448               0 :     cc += 4;
    1449               0 :     break;
    1450                 : 
    1451                 :     /* Handle single-char matchers. A property item is two bytes longer than
                         :     the others, which are a single byte. */
    1452                 : 
    1453                 :     case OP_PROP:
    1454                 :     case OP_NOTPROP:
    1455               0 :     cc += 2;
    1456                 :     /* Fall through */
    1457                 : 
    1458                 :     case OP_NOT_DIGIT:
    1459                 :     case OP_DIGIT:
    1460                 :     case OP_NOT_WHITESPACE:
    1461                 :     case OP_WHITESPACE:
    1462                 :     case OP_NOT_WORDCHAR:
    1463                 :     case OP_WORDCHAR:
    1464                 :     case OP_ANY:
    1465                 :     case OP_ALLANY:
    1466               1 :     branchlength++;
    1467               1 :     cc++;
    1468               1 :     break;
    1469                 : 
    1470                 :     /* The single-byte matcher isn't allowed */
    1471                 : 
    1472                 :     case OP_ANYBYTE:
    1473               0 :     return -2;
    1474                 : 
    1475                 :     /* Check a class for variable quantification. An OP_CLASS or OP_NCLASS item
                         :     is skipped as 33 bytes. For OP_XCLASS the "- 33" cancels the "+ 33" of the
                         :     fall-through, so the net advance is the length stored in the item itself.
                         :     A following quantifier that allows a varying count means there is no fixed
                         :     length; a range with equal minimum and maximum adds that count; any other
                         :     following opcode is left in place for the outer switch and the class
                         :     counts as one character. */
    1476                 : 
    1477                 : #ifdef SUPPORT_UTF8
    1478                 :     case OP_XCLASS:
    1479               0 :     cc += GET(cc, 1) - 33;
    1480                 :     /* Fall through */
    1481                 : #endif
    1482                 : 
    1483                 :     case OP_CLASS:
    1484                 :     case OP_NCLASS:
    1485               0 :     cc += 33;
    1486                 : 
    1487               0 :     switch (*cc)
    1488                 :       {
    1489                 :       case OP_CRSTAR:
    1490                 :       case OP_CRMINSTAR:
    1491                 :       case OP_CRQUERY:
    1492                 :       case OP_CRMINQUERY:
    1493               0 :       return -1;
    1494                 : 
    1495                 :       case OP_CRRANGE:
    1496                 :       case OP_CRMINRANGE:
    1497               0 :       if (GET2(cc,1) != GET2(cc,3)) return -1;
    1498               0 :       branchlength += GET2(cc,1);
    1499               0 :       cc += 5;
    1500               0 :       break;
    1501                 : 
    1502                 :       default:
    1503               0 :       branchlength++;
    1504                 :       }
    1505               0 :     break;
    1506                 : 
    1507                 :     /* Anything else is variable length */
    1508                 : 
    1509                 :     default:
    1510               0 :     return -1;
    1511                 :     }
    1512               4 :   }
    1513                 : /* Control never gets here */
    1514                 : }
    1515                 : 
    1516                 : 
    1517                 : 
    1518                 : 
    1519                 : /*************************************************
    1520                 : *    Scan compiled regex for numbered bracket    *
    1521                 : *************************************************/
    1522                 : 
    1523                 : /* This little function scans through a compiled pattern until it finds a
    1524                 : capturing bracket with the given number. It walks the compiled code one item
                         : at a time, working out the length of each item in order to reach the next.
    1525                 : 
    1526                 : Arguments:
    1527                 :   code        points to start of expression
    1528                 :   utf8        TRUE in UTF-8 mode
    1529                 :   number      the required bracket number
    1530                 : 
    1531                 : Returns:      pointer to the opcode for the bracket, or NULL if not found
    1532                 : */
    1533                 : 
    1534                 : static const uschar *
    1535                 : find_bracket(const uschar *code, BOOL utf8, int number)
    1536               0 : {
    1537                 : for (;;)
    1538                 :   {
    1539               0 :   register int c = *code;
    1540               0 :   if (c == OP_END) return NULL;
    1541                 : 
    1542                 :   /* XCLASS is used for classes that cannot be represented just by a bit
    1543                 :   map. This includes negated single high-valued characters. The length in
    1544                 :   the table is zero; the actual length is stored in the compiled code. */
    1545                 : 
    1546               0 :   if (c == OP_XCLASS) code += GET(code, 1);
    1547                 : 
    1548                 :   /* Handle capturing bracket. The bracket number is read from offset
                         :   1+LINK_SIZE; if it is not the one wanted, move on to the next item. */
    1549                 : 
    1550               0 :   else if (c == OP_CBRA)
    1551                 :     {
    1552               0 :     int n = GET2(code, 1+LINK_SIZE);
    1553               0 :     if (n == number) return (uschar *)code;   /* the cast drops const, but the declared return type is const uschar * */
    1554               0 :     code += _pcre_OP_lengths[c];
    1555                 :     }
    1556                 : 
    1557                 :   /* Otherwise, we can get the item's length from the table, except that for
    1558                 :   repeated character types, we have to test for \p and \P, which have an extra
    1559                 :   two bytes of parameters. */
    1560                 : 
    1561                 :   else
    1562                 :     {
    1563               0 :     switch(c)
    1564                 :       {
    1565                 :       case OP_TYPESTAR:
    1566                 :       case OP_TYPEMINSTAR:
    1567                 :       case OP_TYPEPLUS:
    1568                 :       case OP_TYPEMINPLUS:
    1569                 :       case OP_TYPEQUERY:
    1570                 :       case OP_TYPEMINQUERY:
    1571                 :       case OP_TYPEPOSSTAR:
    1572                 :       case OP_TYPEPOSPLUS:
    1573                 :       case OP_TYPEPOSQUERY:
    1574               0 :       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;   /* the type opcode directly follows the repeat opcode */
    1575               0 :       break;
    1576                 : 
    1577                 :       case OP_TYPEUPTO:
    1578                 :       case OP_TYPEMINUPTO:
    1579                 :       case OP_TYPEEXACT:
    1580                 :       case OP_TYPEPOSUPTO:
    1581               0 :       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;   /* here the type opcode is at offset 3 (presumably after a two-byte count) */
    1582                 :       break;
    1583                 :       }
    1584                 : 
    1585                 :     /* Add in the fixed length from the table */
    1586                 : 
    1587               0 :     code += _pcre_OP_lengths[c];
    1588                 : 
    1589                 :   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
    1590                 :   a multi-byte character. The length in the table is a minimum, so we have to
    1591                 :   arrange to skip the extra bytes. After the advance above, code[-1] is the
                         :   last byte of the minimum-length item; if it is 0xc0 or above, the number of
                         :   extra bytes to skip is looked up in _pcre_utf8_table4. */
    1592                 : 
    1593                 : #ifdef SUPPORT_UTF8
    1594               0 :     if (utf8) switch(c)
    1595                 :       {
    1596                 :       case OP_CHAR:
    1597                 :       case OP_CHARNC:
    1598                 :       case OP_EXACT:
    1599                 :       case OP_UPTO:
    1600                 :       case OP_MINUPTO:
    1601                 :       case OP_POSUPTO:
    1602                 :       case OP_STAR:
    1603                 :       case OP_MINSTAR:
    1604                 :       case OP_POSSTAR:
    1605                 :       case OP_PLUS:
    1606                 :       case OP_MINPLUS:
    1607                 :       case OP_POSPLUS:
    1608                 :       case OP_QUERY:
    1609                 :       case OP_MINQUERY:
    1610                 :       case OP_POSQUERY:
    1611               0 :       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
    1612                 :       break;
    1613                 :       }
    1614                 : #else
    1615                 :     (void)(utf8);  /* Keep compiler happy by referencing function argument */
    1616                 : #endif
    1617                 :     }
    1618               0 :   }
    1619                 : }
    1620                 : 
    1621                 : 
    1622                 : 
    1623                 : /*************************************************
    1624                 : *   Scan compiled regex for recursion reference  *
    1625                 : *************************************************/
    1626                 : 
    1627                 : /* Walk the compiled pattern item by item, starting at "code", and stop at the
    1628                 : first OP_RECURSE opcode, or at OP_END if that is met first.
    1629                 : 
    1630                 : Arguments:
    1631                 :   code        points to start of expression
    1632                 :   utf8        TRUE in UTF-8 mode (only consulted if SUPPORT_UTF8 is defined)
    1633                 : 
    1634                 : Returns:      pointer to the opcode for OP_RECURSE, or NULL if OP_END is reached
    1635                 : */
    1636                 : 
    1637                 : static const uschar *
    1638                 : find_recurse(const uschar *code, BOOL utf8)
    1639            8167 : {
    1640                 : for (;;)
    1641                 :   {
    1642            8167 :   register int c = *code;  /* opcode of the current item */
    1643            8167 :   if (c == OP_END) return NULL;  /* ran off the end: nothing found */
    1644            7357 :   if (c == OP_RECURSE) return code;
    1645                 : 
    1646                 :   /* XCLASS is used for classes that cannot be represented just by a bit
    1647                 :   map. This includes negated single high-valued characters. The length in
    1648                 :   the table is zero; the actual length is stored in the compiled code. */
    1649                 : 
    1650            7356 :   if (c == OP_XCLASS) code += GET(code, 1);  /* length is read from the item */
    1651                 : 
    1652                 :   /* Otherwise, we can get the item's length from the table, except that for
    1653                 :   repeated character types, we have to test for \p and \P, which have an extra
    1654                 :   two bytes of parameters. */
    1655                 : 
    1656                 :   else
    1657                 :     {
    1658            7355 :     switch(c)
    1659                 :       {
    1660                 :       case OP_TYPESTAR:
    1661                 :       case OP_TYPEMINSTAR:
    1662                 :       case OP_TYPEPLUS:
    1663                 :       case OP_TYPEMINPLUS:
    1664                 :       case OP_TYPEQUERY:
    1665                 :       case OP_TYPEMINQUERY:
    1666                 :       case OP_TYPEPOSSTAR:
    1667                 :       case OP_TYPEPOSPLUS:
    1668                 :       case OP_TYPEPOSQUERY:
    1669             654 :       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;  /* type byte is at code[1] */
    1670             654 :       break;
    1671                 : 
    1672                 :       case OP_TYPEPOSUPTO:
    1673                 :       case OP_TYPEUPTO:
    1674                 :       case OP_TYPEMINUPTO:
    1675                 :       case OP_TYPEEXACT:
    1676               2 :       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;  /* type byte is at code[3] here */
    1677                 :       break;
    1678                 :       }
    1679                 : 
    1680                 :     /* Add in the fixed length from the table */
    1681                 : 
    1682            7355 :     code += _pcre_OP_lengths[c];
    1683                 : 
    1684                 :     /* In UTF-8 mode, opcodes that are followed by a character may be followed
    1685                 :     by a multi-byte character. The length in the table is a minimum, so we have
    1686                 :     to arrange to skip the extra bytes. */
    1687                 : 
    1688                 : #ifdef SUPPORT_UTF8
    1689            7355 :     if (utf8) switch(c)
    1690                 :       {
    1691                 :       case OP_CHAR:
    1692                 :       case OP_CHARNC:
    1693                 :       case OP_EXACT:
    1694                 :       case OP_UPTO:
    1695                 :       case OP_MINUPTO:
    1696                 :       case OP_POSUPTO:
    1697                 :       case OP_STAR:
    1698                 :       case OP_MINSTAR:
    1699                 :       case OP_POSSTAR:
    1700                 :       case OP_PLUS:
    1701                 :       case OP_MINPLUS:
    1702                 :       case OP_POSPLUS:
    1703                 :       case OP_QUERY:
    1704                 :       case OP_MINQUERY:
    1705                 :       case OP_POSQUERY:
    1706               0 :       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];  /* code[-1]: byte just before the new position */
    1707                 :       break;
    1708                 :       }
    1709                 : #else
    1710                 :     (void)(utf8);  /* Keep compiler happy by referencing function argument */
    1711                 : #endif
    1712                 :     }
    1713            7356 :   }
    1714                 : }
    1715                 : 
    1716                 : 
    1717                 : 
    1718                 : /*************************************************
    1719                 : *    Scan compiled branch for non-emptiness      *
    1720                 : *************************************************/
    1721                 : 
    1722                 : /* This function scans through a branch of a compiled pattern to see whether it
    1723                 : can match the empty string or not. It is called from could_be_empty()
    1724                 : below and from compile_branch() when checking for an unlimited repeat of a
    1725                 : group that can match nothing. Note that first_significant_code() skips over
    1726                 : backward and negative forward assertions when its final argument is TRUE. If we
    1727                 : hit an unclosed bracket, we return "empty" - this means we've struck an inner
    1728                 : bracket whose current branch will already have been scanned.
    1729                 : 
    1730                 : Arguments:
    1731                 :   code        points to start of search (this first opcode itself is skipped)
    1732                 :   endcode     points to where to stop (the scan ends once code >= endcode)
    1733                 :   utf8        TRUE if in UTF8 mode
    1734                 : 
    1735                 : Returns:      TRUE if what is matched could be empty; FALSE if an item must match
    1736                 : */
    1737                 : 
    1738                 : static BOOL
    1739                 : could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
    1740              52 : {
    1741                 : register int c;
    1742              52 : for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
    1743             122 :      code < endcode;
    1744              18 :      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
    1745                 :   {
    1746                 :   const uschar *ccode;
    1747                 : 
    1748              62 :   c = *code;
    1749                 : 
    1750                 :   /* Skip over forward assertions; the other assertions are skipped by
    1751                 :   first_significant_code() with a TRUE final argument. */
    1752                 : 
    1753              62 :   if (c == OP_ASSERT)
    1754                 :     {
    1755               0 :     do code += GET(code, 1); while (*code == OP_ALT);  /* hop over all alternatives */
    1756               0 :     c = *code;  /* refreshed because the loop step adds _pcre_OP_lengths[c] */
    1757               0 :     continue;
    1758                 :     }
    1759                 : 
    1760                 :   /* Groups with zero repeats can of course be empty; skip them. */
    1761                 : 
    1762              62 :   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
    1763                 :     {
    1764               4 :     code += _pcre_OP_lengths[c];  /* step onto the group that follows */
    1765               5 :     do code += GET(code, 1); while (*code == OP_ALT);  /* hop over all alternatives */
    1766               4 :     c = *code;
    1767               4 :     continue;
    1768                 :     }
    1769                 : 
    1770                 :   /* For other groups, scan the branches. */
    1771                 : 
    1772              58 :   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
    1773                 :     {
    1774                 :     BOOL empty_branch;
    1775              10 :     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
    1776                 : 
    1777                 :     /* If a conditional group has only one branch, there is a second, implied,
    1778                 :     empty branch, so just skip over the conditional, because it could be empty.
    1779                 :     Otherwise, scan the individual branches of the group. */
    1780                 : 
    1781               8 :     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
    1782               0 :       code += GET(code, 1);  /* jump over the sole branch */
    1783                 :     else
    1784                 :       {
    1785               8 :       empty_branch = FALSE;
    1786                 :       do
    1787                 :         {
    1788               8 :         if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
    1789               3 :           empty_branch = TRUE;  /* one empty alternative suffices; later ones are not scanned */
    1790               8 :         code += GET(code, 1);
    1791                 :         }
    1792               8 :       while (*code == OP_ALT);
    1793               8 :       if (!empty_branch) return FALSE;   /* All branches are non-empty */
    1794                 :       }
    1795                 : 
    1796               3 :     c = *code;
    1797               3 :     continue;
    1798                 :     }
    1799                 : 
    1800                 :   /* Handle the other opcodes */
    1801                 : 
    1802              48 :   switch (c)
    1803                 :     {
    1804                 :     /* Check for quantifiers after a class. XCLASS is used for classes that
    1805                 :     cannot be represented just by a bit map. This includes negated single
    1806                 :     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
    1807                 :     actual length is stored in the compiled code, so we must update "code"
    1808                 :     here. */
    1809                 : 
    1810                 : #ifdef SUPPORT_UTF8
    1811                 :     case OP_XCLASS:
    1812               0 :     ccode = code += GET(code, 1);  /* "code" itself moves past the item */
    1813               0 :     goto CHECK_CLASS_REPEAT;
    1814                 : #endif
    1815                 : 
    1816                 :     case OP_CLASS:
    1817                 :     case OP_NCLASS:
    1818              12 :     ccode = code + 33;  /* presumably opcode + 32-byte bit map - confirm */
    1819                 : 
    1820                 : #ifdef SUPPORT_UTF8
    1821              12 :     CHECK_CLASS_REPEAT:
    1822                 : #endif
    1823                 : 
    1824              12 :     switch (*ccode)
    1825                 :       {
    1826                 :       case OP_CRSTAR:            /* These could be empty; continue */
    1827                 :       case OP_CRMINSTAR:
    1828                 :       case OP_CRQUERY:
    1829                 :       case OP_CRMINQUERY:
    1830               1 :       break;
    1831                 : 
    1832                 :       default:                   /* Non-repeat => class must match */
    1833                 :       case OP_CRPLUS:            /* These repeats aren't empty */
    1834                 :       case OP_CRMINPLUS:
    1835              11 :       return FALSE;
    1836                 : 
    1837                 :       case OP_CRRANGE:
    1838                 :       case OP_CRMINRANGE:
    1839               0 :       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
    1840                 :       break;
    1841                 :       }
    1842               1 :     break;
    1843                 : 
    1844                 :     /* Opcodes that must match a character */
    1845                 : 
    1846                 :     case OP_PROP:
    1847                 :     case OP_NOTPROP:
    1848                 :     case OP_EXTUNI:
    1849                 :     case OP_NOT_DIGIT:
    1850                 :     case OP_DIGIT:
    1851                 :     case OP_NOT_WHITESPACE:
    1852                 :     case OP_WHITESPACE:
    1853                 :     case OP_NOT_WORDCHAR:
    1854                 :     case OP_WORDCHAR:
    1855                 :     case OP_ANY:
    1856                 :     case OP_ALLANY:
    1857                 :     case OP_ANYBYTE:
    1858                 :     case OP_CHAR:
    1859                 :     case OP_CHARNC:
    1860                 :     case OP_NOT:
    1861                 :     case OP_PLUS:
    1862                 :     case OP_MINPLUS:
    1863                 :     case OP_POSPLUS:
    1864                 :     case OP_EXACT:
    1865                 :     case OP_NOTPLUS:
    1866                 :     case OP_NOTMINPLUS:
    1867                 :     case OP_NOTPOSPLUS:
    1868                 :     case OP_NOTEXACT:
    1869                 :     case OP_TYPEPLUS:
    1870                 :     case OP_TYPEMINPLUS:
    1871                 :     case OP_TYPEPOSPLUS:
    1872                 :     case OP_TYPEEXACT:
    1873              22 :     return FALSE;
    1874                 : 
    1875                 :     /* These are going to continue, as they may be empty, but we have to
    1876                 :     fudge the length for the \p and \P cases. */
    1877                 : 
    1878                 :     case OP_TYPESTAR:
    1879                 :     case OP_TYPEMINSTAR:
    1880                 :     case OP_TYPEPOSSTAR:
    1881                 :     case OP_TYPEQUERY:
    1882                 :     case OP_TYPEMINQUERY:
    1883                 :     case OP_TYPEPOSQUERY:
    1884               4 :     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;  /* type byte is at code[1] */
    1885               4 :     break;
    1886                 : 
    1887                 :     /* Same for these */
    1888                 : 
    1889                 :     case OP_TYPEUPTO:
    1890                 :     case OP_TYPEMINUPTO:
    1891                 :     case OP_TYPEPOSUPTO:
    1892               0 :     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;  /* type byte is at code[3] here */
    1893               0 :     break;
    1894                 : 
    1895                 :     /* End of branch */
    1896                 : 
    1897                 :     case OP_KET:
    1898                 :     case OP_KETRMAX:
    1899                 :     case OP_KETRMIN:
    1900                 :     case OP_ALT:
    1901               4 :     return TRUE;
    1902                 : 
    1903                 :     /* In UTF-8 mode the character after these opcodes may be multibyte; skip any
    1904                 :     10xxxxxx bytes from code[2]. NOTE(review): for UPTO, is code[2] a count byte? */
    1905                 : 
    1906                 : #ifdef SUPPORT_UTF8
    1907                 :     case OP_STAR:
    1908                 :     case OP_MINSTAR:
    1909                 :     case OP_POSSTAR:
    1910                 :     case OP_QUERY:
    1911                 :     case OP_MINQUERY:
    1912                 :     case OP_POSQUERY:
    1913                 :     case OP_UPTO:
    1914                 :     case OP_MINUPTO:
    1915                 :     case OP_POSUPTO:
    1916               3 :     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
    1917                 :     break;
    1918                 : #endif
    1919                 :     }
    1920                 :   }
    1921                 : 
    1922               8 : return TRUE;  /* reached endcode without meeting a mandatory item */
    1923                 : }
    1924                 : 
    1925                 : 
    1926                 : 
    1927                 : /*************************************************
    1928                 : *    Scan compiled regex for non-emptiness       *
    1929                 : *************************************************/
    1930                 : 
    1931                 : /* This function is called to check for left recursive calls. We want to check
    1932                 : the current branch of the current pattern to see if it could match the empty
    1933                 : string. If it could, we must look outwards for branches at other levels,
    1934                 : stopping when we pass beyond the bracket which is the subject of the recursion.
    1935                 : 
    1936                 : Arguments:
    1937                 :   code        points to start of the recursion
    1938                 :   endcode     points to where to stop (current RECURSE item)
    1939                 :   bcptr       points to the chain of current (unclosed) branch starts
    1940                 :   utf8        TRUE if in UTF-8 mode
    1941                 : 
    1942                 : Returns:      TRUE if every branch examined could be empty; FALSE otherwise
    1943                 : */
    1944                 : 
    1945                 : static BOOL
    1946                 : could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
    1947                 :   BOOL utf8)
    1948               2 : {
    1949               8 : while (bcptr != NULL && bcptr->current >= code)  /* only branches at or after "code" */
    1950                 :   {
    1951               6 :   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
    1952               4 :   bcptr = bcptr->outer;  /* move outwards one level */
    1953                 :   }
    1954               0 : return TRUE;  /* chain exhausted, or passed beyond "code" */
    1955                 : }
    1956                 : 
    1957                 : 
    1958                 : 
    1959                 : /*************************************************
    1960                 : *           Check for POSIX class syntax         *
    1961                 : *************************************************/
    1962                 : 
    1963                 : /* This function is called when the sequence "[:" or "[." or "[=" is
    1964                 : encountered in a character class. It checks whether this is followed by a
    1965                 : sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
    1966                 : reach an unescaped ']' without the special preceding character, return FALSE.
    1967                 : 
    1968                 : Originally, this function only recognized a sequence of letters between the
    1969                 : terminators, but it seems that Perl recognizes any sequence of characters,
    1970                 : though of course unknown POSIX names are subsequently rejected. Perl gives an
    1971                 : "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
    1972                 : didn't consider this to be a POSIX class. Likewise for [:1234:].
    1973                 : 
    1974                 : The problem in trying to be exactly like Perl is in the handling of escapes. We
    1975                 : have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
    1976                 : class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
    1977                 : below handles the special case of \], but does not try to do any other escape
    1978                 : processing. This makes it different from Perl for cases such as [:l\ower:]
    1979                 : where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
    1980                 : "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
    1981                 : I think.
    1982                 : 
    1983                 : Arguments:
    1984                 :   ptr      pointer to the initial [
    1985                 :   endptr   on TRUE, set to point at the terminator character before the final ]
    1986                 : 
    1987                 : Returns:   TRUE if a terminator immediately followed by ] was found, else FALSE
    1988                 : */
    1989                 : 
    1990                 : static BOOL
    1991                 : check_posix_syntax(const uschar *ptr, const uschar **endptr)
    1992              12 : {
    1993                 : int terminator;          /* Don't combine these lines; the Solaris cc */
    1994              12 : terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
    1995              68 : for (++ptr; *ptr != 0; ptr++)
    1996                 :   {
    1997              68 :   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else  /* \] is skipped as a pair */
    1998                 :     {
    1999              68 :     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  /* bare ] reached first */
    2000              64 :     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
    2001                 :       {
    2002               8 :       *endptr = ptr;  /* the terminator itself, not the ] after it */
    2003               8 :       return TRUE;
    2004                 :       }
    2005                 :     }
    2006                 :   }
    2007               0 : return FALSE;  /* hit the zero byte without finding a terminator */
    2008                 : }
    2009                 : 
    2010                 : 
    2011                 : 
    2012                 : 
    2013                 : /*************************************************
    2014                 : *          Check POSIX class name                *
    2015                 : *************************************************/
    2016                 : 
    2017                 : /* This function is called to check the name given in a POSIX-style class entry
    2018                 : such as [:alnum:].
    2019                 : 
    2020                 : Arguments:
    2021                 :   ptr        points to the first letter
    2022                 :   len        the length of the name
    2023                 : 
    2024                 : Returns:     the index of the matching entry in posix_names, or -1 if unknown
    2025                 : */
    2026                 : 
    2027                 : static int
    2028                 : check_posix_name(const uschar *ptr, int len)
    2029               8 : {
    2030               8 : const char *pn = posix_names;  /* start of the current candidate name */
    2031               8 : register int yield = 0;  /* index of the current candidate */
    2032             108 : while (posix_name_lengths[yield] != 0)  /* a zero length ends the list */
    2033                 :   {
    2034             100 :   if (len == posix_name_lengths[yield] &&
    2035               8 :     strncmp((const char *)ptr, pn, len) == 0) return yield;
    2036              92 :   pn += posix_name_lengths[yield] + 1;  /* name plus one more byte; presumably its terminating zero - confirm */
    2037              92 :   yield++;
    2038                 :   }
    2039               0 : return -1;
    2040                 : }
    2041                 : 
    2042                 : 
    2043                 : /*************************************************
    2044                 : *    Adjust OP_RECURSE items in repeated group   *
    2045                 : *************************************************/
    2046                 : 
    2047                 : /* OP_RECURSE items contain an offset from the start of the regex to the group
    2048                 : that is referenced. This means that groups can be replicated for fixed
    2049                 : repetition simply by copying (because the recursion is allowed to refer to
    2050                 : earlier groups that are outside the current group). However, when a group is
    2051                 : optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
    2052                 : inserted before it, after it has been compiled. This means that any OP_RECURSE
    2053                 : items within it that refer to the group itself or any contained groups have to
    2054                 : have their offsets adjusted. That is one of the jobs of this function. Before
    2055                 : it is called, the partially compiled regex must be temporarily terminated with
    2056                 : OP_END.
    2057                 : 
    2058                 : This function has been extended with the possibility of forward references for
    2059                 : recursions and subroutine calls. It must also check the list of such references
    2060                 : for the group we are dealing with. If it finds that one of the recursions in
    2061                 : the current group is on this list, it adjusts the offset in the list, not the
    2062                 : value in the reference (which is a group number).
    2063                 : 
    2064                 : Arguments:
    2065                 :   group      points to the start of the group
    2066                 :   adjust     the amount by which the group is to be moved
    2067                 :   utf8       TRUE in UTF-8 mode
    2068                 :   cd         contains pointers to tables etc.
    2069                 :   save_hwm   the hwm forward reference pointer at the start of the group
    2070                 : 
    2071                 : Returns:     nothing
    2072                 : */
    2073                 : 
    2074                 : static void
    2075                 : adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
    2076                 :   uschar *save_hwm)
    2077             810 : {
    2078             810 : uschar *ptr = group;
    2079                 : 
    2080            1621 : while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
    2081                 :   {
    2082                 :   int offset;
    2083                 :   uschar *hc;
    2084                 : 
    2085                 :   /* See if this recursion is on the forward reference list. If so, adjust the
    2086                 :   reference. */
    2087                 : 
    2088               1 :   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
    2089                 :     {
    2090               0 :     offset = GET(hc, 0);
    2091               0 :     if (cd->start_code + offset == ptr + 1)  /* entry refers to this item's link field */
    2092                 :       {
    2093               0 :       PUT(hc, 0, offset + adjust);
    2094               0 :       break;  /* leaves hc < cd->hwm, which the test below relies on */
    2095                 :       }
    2096                 :     }
    2097                 : 
    2098                 :   /* Otherwise, adjust the recursion offset if it's after the start of this
    2099                 :   group. */
    2100                 : 
    2101               1 :   if (hc >= cd->hwm)  /* list scanned to the end without a match */
    2102                 :     {
    2103               1 :     offset = GET(ptr, 1);
    2104               1 :     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
    2105                 :     }
    2106                 : 
    2107               1 :   ptr += 1 + LINK_SIZE;  /* resume the search just past this OP_RECURSE item */
    2108                 :   }
    2109             810 : }
    2110                 : 
    2111                 : 
    2112                 : 
    2113                 : /*************************************************
    2114                 : *        Insert an automatic callout point       *
    2115                 : *************************************************/
    2116                 : 
    2117                 : /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
    2118                 : callout points before each pattern item.
    2119                 : 
    2120                 : Arguments:
    2121                 :   code           current code pointer (the callout item is written here)
    2122                 :   ptr            current pattern pointer
    2123                 :   cd             pointers to tables etc
    2124                 : 
    2125                 : Returns:         new code pointer (just past the item that was written)
    2126                 : */
    2127                 : 
    2128                 : static uschar *
    2129                 : auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
    2130               0 : {
    2131               0 : *code++ = OP_CALLOUT;
    2132               0 : *code++ = 255;  /* presumably the callout number used for automatic callouts - confirm */
    2133               0 : PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
    2134               0 : PUT(code, LINK_SIZE, 0);                /* Default length */
    2135               0 : return code + 2*LINK_SIZE;  /* step over the two fields just written */
    2136                 : }
    2137                 : 
    2138                 : 
    2139                 : 
    2140                 : /*************************************************
    2141                 : *         Complete a callout item                *
    2142                 : *************************************************/
    2143                 : 
    2144                 : /* A callout item contains the length of the next item in the pattern, which
    2145                 : we can't fill in till after we have reached the relevant point. This is used
    2146                 : for both automatic and manual callouts.
    2147                 : 
    2148                 : Arguments:
    2149                 :   previous_callout   points to previous callout item
    2150                 :   ptr                current pattern pointer
    2151                 :   cd                 pointers to tables etc
    2152                 : 
    2153                 : Returns:             nothing
    2154                 : */
    2155                 : 
    2156                 : static void
    2157                 : complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
    2158               0 : {
    2159               0 : int length = ptr - cd->start_pattern - GET(previous_callout, 2);  /* current offset minus the offset stored in the item */
    2160               0 : PUT(previous_callout, 2 + LINK_SIZE, length);  /* fill the length field that follows the offset */
    2161               0 : }
    2162                 : 
    2163                 : 
    2164                 : 
    2165                 : #ifdef SUPPORT_UCP
    2166                 : /*************************************************
    2167                 : *           Get othercase range                  *
    2168                 : *************************************************/
    2169                 : 
    2170                 : /* This function is passed the start and end of a class range, in UTF-8 mode
    2171                 : with UCP support. It searches up the characters, looking for internal ranges of
    2172                 : characters in the "other" case. Each call returns the next one, updating the
    2173                 : start address.
    2174                 : 
    2175                 : Arguments:
    2176                 :   cptr        points to starting character value; updated
    2177                 :   d           end value
    2178                 :   ocptr       where to put start of othercase range
    2179                 :   odptr       where to put end of othercase range
    2180                 : 
    2181                 : Yield:        TRUE when range returned; FALSE when no more (outputs untouched)
    2182                 : */
    2183                 : 
    2184                 : static BOOL
    2185                 : get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
    2186                 :   unsigned int *odptr)
    2187               0 : {
    2188                 : unsigned int c, othercase, next;
    2189                 : 
    2190               0 : for (c = *cptr; c <= d; c++)  /* find the first character whose other case differs */
    2191               0 :   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
    2192                 : 
    2193               0 : if (c > d) return FALSE;  /* nothing in the range has a different other case */
    2194                 : 
    2195               0 : *ocptr = othercase;
    2196               0 : next = othercase + 1;  /* other-case value needed to extend the run */
    2197                 : 
    2198               0 : for (++c; c <= d; c++)  /* extend while other-case values stay consecutive */
    2199                 :   {
    2200               0 :   if (UCD_OTHERCASE(c) != next) break;
    2201               0 :   next++;
    2202                 :   }
    2203                 : 
    2204               0 : *odptr = next - 1;  /* last other-case value in the run */
    2205               0 : *cptr = c;  /* where the next call resumes */
    2206                 : 
    2207               0 : return TRUE;
    2208                 : }
    2209                 : #endif  /* SUPPORT_UCP */
    2210                 : 
    2211                 : 
    2212                 : 
    2213                 : /*************************************************
    2214                 : *     Check if auto-possessifying is possible    *
    2215                 : *************************************************/
    2216                 : 
    2217                 : /* This function is called for unlimited repeats of certain items, to see
    2218                 : whether the next thing could possibly match the repeated item. If not, it makes
    2219                 : sense to automatically possessify the repeated item.
    2220                 : 
    2221                 : Arguments:
    2222                 :   op_code       the repeated op code
    2223                 :   item          data for this item, depends on the opcode
    2224                 :   utf8          TRUE in UTF-8 mode
    2225                 :   utf8_char     used for utf8 character bytes, NULL if not relevant
    2226                 :   ptr           next character in pattern
    2227                 :   options       options bits
    2228                 :   cd            contains pointers to tables etc.
    2229                 : 
    2230                 : Returns:        TRUE if possessifying is wanted
    2231                 : */
    2232                 : 
    2233                 : static BOOL
    2234                 : check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
    2235                 :   const uschar *ptr, int options, compile_data *cd)
    2236           51510 : {
    2237                 : int next;
    2238                 : /* "next" receives the item that follows the repeat: >= 0 is a character, < 0 is a negated ESC_ value. */
    2239                 : /* Skip whitespace and comments in extended mode */
    2240                 : 
    2241           51510 : if ((options & PCRE_EXTENDED) != 0)
    2242                 :   {
    2243                 :   for (;;)
    2244                 :     {
    2245               0 :     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
    2246               0 :     if (*ptr == CHAR_NUMBER_SIGN)  /* comment: runs to a newline or the end of the pattern */
    2247                 :       {
    2248               0 :       while (*(++ptr) != 0)
    2249               0 :         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
    2250                 :       }
    2251               0 :     else break;
    2252               0 :     }
    2253                 :   }
    2254                 : 
    2255                 : /* If the next item is one that we can handle, get its value. A non-negative
    2256                 : value is a character, a negative value is an escape value. */
    2257                 : 
    2258           51510 : if (*ptr == CHAR_BACKSLASH)
    2259                 :   {
    2260           12590 :   int temperrorcode = 0;
    2261           12590 :   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
    2262           12590 :   if (temperrorcode != 0) return FALSE;  /* check_escape reported an error: give up */
    2263           12590 :   ptr++;    /* Point after the escape sequence */
    2264                 :   }
    2265                 : 
    2266           38920 : else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
    2267                 :   {
    2268                 : #ifdef SUPPORT_UTF8
    2269           36354 :   if (utf8) { GETCHARINC(next, ptr); } else
    2270                 : #endif
    2271           36354 :   next = *ptr++;
    2272                 :   }
    2273                 : 
    2274            2566 : else return FALSE;  /* next item is an unescaped metacharacter: not handled here */
    2275                 : 
    2276                 : /* Skip whitespace and comments in extended mode */
    2277                 : 
    2278           48944 : if ((options & PCRE_EXTENDED) != 0)
    2279                 :   {
    2280                 :   for (;;)
    2281                 :     {
    2282               0 :     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
    2283               0 :     if (*ptr == CHAR_NUMBER_SIGN)
    2284                 :       {
    2285               0 :       while (*(++ptr) != 0)
    2286               0 :         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
    2287                 :       }
    2288               0 :     else break;
    2289               0 :     }
    2290                 :   }
    2291                 : 
    2292                 : /* If the next thing is itself optional (it is followed by an asterisk, a question mark, or a {0, quantifier), we have to give up. */
    2293                 : 
    2294           48944 : if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
    2295                 :   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
    2296             294 :     return FALSE;
    2297                 : 
    2298                 : /* Now compare the next item with the previous opcode. If the previous is a
    2299                 : positive single character match, "item" either contains the character or, if
    2300                 : "item" is greater than 127 in utf8 mode, the character's bytes are in
    2301                 : utf8_char. */
    2302                 : 
    2303                 : 
    2304                 : /* Handle cases when the next item is a character. */
    2305                 : 
    2306           48650 : if (next >= 0) switch(op_code)
    2307                 :   {
    2308                 :   case OP_CHAR:
    2309                 : #ifdef SUPPORT_UTF8
    2310              34 :   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
    2311                 : #else
    2312                 :   (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
    2313                 : #endif
    2314              34 :   return item != next;  /* possessify only when the following character differs */
    2315                 : 
    2316                 :   /* For CHARNC (caseless character) we must check the other case. If we have
    2317                 :   Unicode property support, we can use it to test the other case of
    2318                 :   high-valued characters. */
    2319                 : 
    2320                 :   case OP_CHARNC:
    2321                 : #ifdef SUPPORT_UTF8
    2322               0 :   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
    2323                 : #endif
    2324               0 :   if (item == next) return FALSE;
    2325                 : #ifdef SUPPORT_UTF8
    2326               0 :   if (utf8)
    2327                 :     {
    2328                 :     unsigned int othercase;
    2329               0 :     if (next < 128) othercase = cd->fcc[next]; else
    2330                 : #ifdef SUPPORT_UCP
    2331               0 :     othercase = UCD_OTHERCASE((unsigned int)next);
    2332                 : #else
    2333                 :     othercase = NOTACHAR;
    2334                 : #endif
    2335               0 :     return (unsigned int)item != othercase;
    2336                 :     }
    2337                 :   else
    2338                 : #endif  /* SUPPORT_UTF8 */
    2339               0 :   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
    2340                 : 
    2341                 :   /* For OP_NOT, "item" must be a single-byte character. */
    2342                 : 
    2343                 :   case OP_NOT:
    2344               8 :   if (item == next) return TRUE;
    2345               0 :   if ((options & PCRE_CASELESS) == 0) return FALSE;
    2346                 : #ifdef SUPPORT_UTF8
    2347               0 :   if (utf8)
    2348                 :     {
    2349                 :     unsigned int othercase;
    2350               0 :     if (next < 128) othercase = cd->fcc[next]; else
    2351                 : #ifdef SUPPORT_UCP
    2352               0 :     othercase = UCD_OTHERCASE(next);
    2353                 : #else
    2354                 :     othercase = NOTACHAR;
    2355                 : #endif
    2356               0 :     return (unsigned int)item == othercase;
    2357                 :     }
    2358                 :   else
    2359                 : #endif  /* SUPPORT_UTF8 */
    2360               0 :   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
    2361                 :   /* Character-type opcodes: a next character above 127 is treated as not belonging to the type. */
    2362                 :   case OP_DIGIT:
    2363           47936 :   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
    2364                 : 
    2365                 :   case OP_NOT_DIGIT:
    2366               2 :   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
    2367                 : 
    2368                 :   case OP_WHITESPACE:
    2369              64 :   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
    2370                 : 
    2371                 :   case OP_NOT_WHITESPACE:
    2372               2 :   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
    2373                 : 
    2374                 :   case OP_WORDCHAR:
    2375               2 :   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
    2376                 : 
    2377                 :   case OP_NOT_WORDCHAR:
    2378               0 :   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
    2379                 :   /* OP_HSPACE and OP_NOT_HSPACE share one list of code points; only the sense of the result differs. */
    2380                 :   case OP_HSPACE:
    2381                 :   case OP_NOT_HSPACE:
    2382               0 :   switch(next)
    2383                 :     {
    2384                 :     case 0x09:
    2385                 :     case 0x20:
    2386                 :     case 0xa0:
    2387                 :     case 0x1680:
    2388                 :     case 0x180e:
    2389                 :     case 0x2000:
    2390                 :     case 0x2001:
    2391                 :     case 0x2002:
    2392                 :     case 0x2003:
    2393                 :     case 0x2004:
    2394                 :     case 0x2005:
    2395                 :     case 0x2006:
    2396                 :     case 0x2007:
    2397                 :     case 0x2008:
    2398                 :     case 0x2009:
    2399                 :     case 0x200A:
    2400                 :     case 0x202f:
    2401                 :     case 0x205f:
    2402                 :     case 0x3000:
    2403               0 :     return op_code != OP_HSPACE;
    2404                 :     default:
    2405               0 :     return op_code == OP_HSPACE;
    2406                 :     }
    2407                 :   /* OP_VSPACE and OP_NOT_VSPACE likewise share one list of code points. */
    2408                 :   case OP_VSPACE:
    2409                 :   case OP_NOT_VSPACE:
    2410               0 :   switch(next)
    2411                 :     {
    2412                 :     case 0x0a:
    2413                 :     case 0x0b:
    2414                 :     case 0x0c:
    2415                 :     case 0x0d:
    2416                 :     case 0x85:
    2417                 :     case 0x2028:
    2418                 :     case 0x2029:
    2419               0 :     return op_code != OP_VSPACE;
    2420                 :     default:
    2421               0 :     return op_code == OP_VSPACE;
    2422                 :     }
    2423                 : 
    2424                 :   default:
    2425             358 :   return FALSE;
    2426                 :   }
    2427                 : /* Every case of the switch above returns, so this point is reached only when next < 0 (an escape). */
    2428                 : 
    2429                 : /* Handle the case when the next item is \d, \s, etc. */
    2430                 : 
    2431             244 : switch(op_code)
    2432                 :   {
    2433                 :   case OP_CHAR:
    2434                 :   case OP_CHARNC:
    2435                 : #ifdef SUPPORT_UTF8
    2436               0 :   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
    2437                 : #endif
    2438               0 :   switch(-next)  /* negate to recover the positive ESC_ value */
    2439                 :     {
    2440                 :     case ESC_d:
    2441               0 :     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
    2442                 : 
    2443                 :     case ESC_D:
    2444               0 :     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
    2445                 : 
    2446                 :     case ESC_s:
    2447               0 :     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
    2448                 : 
    2449                 :     case ESC_S:
    2450               0 :     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
    2451                 : 
    2452                 :     case ESC_w:
    2453               0 :     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
    2454                 : 
    2455                 :     case ESC_W:
    2456               0 :     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
    2457                 : 
    2458                 :     case ESC_h:
    2459                 :     case ESC_H:
    2460               0 :     switch(item)
    2461                 :       {
    2462                 :       case 0x09:
    2463                 :       case 0x20:
    2464                 :       case 0xa0:
    2465                 :       case 0x1680:
    2466                 :       case 0x180e:
    2467                 :       case 0x2000:
    2468                 :       case 0x2001:
    2469                 :       case 0x2002:
    2470                 :       case 0x2003:
    2471                 :       case 0x2004:
    2472                 :       case 0x2005:
    2473                 :       case 0x2006:
    2474                 :       case 0x2007:
    2475                 :       case 0x2008:
    2476                 :       case 0x2009:
    2477                 :       case 0x200A:
    2478                 :       case 0x202f:
    2479                 :       case 0x205f:
    2480                 :       case 0x3000:
    2481               0 :       return -next != ESC_h;
    2482                 :       default:
    2483               0 :       return -next == ESC_h;
    2484                 :       }
    2485                 : 
    2486                 :     case ESC_v:
    2487                 :     case ESC_V:
    2488               0 :     switch(item)
    2489                 :       {
    2490                 :       case 0x0a:
    2491                 :       case 0x0b:
    2492                 :       case 0x0c:
    2493                 :       case 0x0d:
    2494                 :       case 0x85:
    2495                 :       case 0x2028:
    2496                 :       case 0x2029:
    2497               0 :       return -next != ESC_v;
    2498                 :       default:
    2499               0 :       return -next == ESC_v;
    2500                 :       }
    2501                 : 
    2502                 :     default:
    2503               0 :     return FALSE;
    2504                 :     }
    2505                 :   /* Repeated item is itself a character type: TRUE only for the specific following escapes listed in each case. */
    2506                 :   case OP_DIGIT:
    2507              32 :   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
    2508                 :          next == -ESC_h || next == -ESC_v;
    2509                 : 
    2510                 :   case OP_NOT_DIGIT:
    2511               0 :   return next == -ESC_d;
    2512                 : 
    2513                 :   case OP_WHITESPACE:
    2514             206 :   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
    2515                 : 
    2516                 :   case OP_NOT_WHITESPACE:
    2517               2 :   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
    2518                 : 
    2519                 :   case OP_HSPACE:
    2520               0 :   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
    2521                 : 
    2522                 :   case OP_NOT_HSPACE:
    2523               0 :   return next == -ESC_h;
    2524                 : 
    2525                 :   /* Can't have \S in here because VT matches \S (Perl anomaly) */
    2526                 :   case OP_VSPACE:
    2527               0 :   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
    2528                 : 
    2529                 :   case OP_NOT_VSPACE:
    2530               0 :   return next == -ESC_v;
    2531                 : 
    2532                 :   case OP_WORDCHAR:
    2533               2 :   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
    2534                 : 
    2535                 :   case OP_NOT_WORDCHAR:
    2536               0 :   return next == -ESC_w || next == -ESC_d;
    2537                 : 
    2538                 :   default:
    2539               2 :   return FALSE;
    2540                 :   }
    2541                 : 
    2542                 : /* Control does not reach here */
    2543                 : }
    2544                 : 
    2545                 : 
    2546                 : 
    2547                 : /*************************************************
    2548                 : *           Compile one branch                   *
    2549                 : *************************************************/
    2550                 : 
    2551                 : /* Scan the pattern, compiling it into a vector. If the options are
    2552                 : changed during the branch, the pointer is used to change the external options
    2553                 : bits. This function is used during the pre-compile phase when we are trying
    2554                 : to find out the amount of memory needed, as well as during the real compile
    2555                 : phase. The value of lengthptr distinguishes the two phases.
    2556                 : 
    2557                 : Arguments:
    2558                 :   optionsptr     pointer to the option bits
    2559                 :   codeptr        points to the pointer to the current code point
    2560                 :   ptrptr         points to the current pattern pointer
    2561                 :   errorcodeptr   points to error code variable
    2562                 :   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
    2563                 :   reqbyteptr     set to the last literal character required, else < 0
    2564                 :   bcptr          points to current branch chain
    2565                 :   cd             contains pointers to tables etc.
    2566                 :   lengthptr      NULL during the real compile phase
    2567                 :                  points to length accumulator during pre-compile phase
    2568                 : 
    2569                 : Returns:         TRUE on success
    2570                 :                  FALSE, with *errorcodeptr set non-zero on error
    2571                 : */
    2572                 : 
    2573                 : static BOOL
    2574                 : compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
    2575                 :   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
    2576                 :   compile_data *cd, int *lengthptr)
    2577           38602 : {
    2578                 : int repeat_type, op_type;
    2579           38602 : int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
    2580           38602 : int bravalue = 0;
    2581                 : int greedy_default, greedy_non_default;
    2582                 : int firstbyte, reqbyte;
    2583                 : int zeroreqbyte, zerofirstbyte;
    2584                 : int req_caseopt, reqvary, tempreqvary;
    2585           38602 : int options = *optionsptr;
    2586           38602 : int after_manual_callout = 0;
    2587           38602 : int length_prevgroup = 0;
    2588                 : register int c;
    2589           38602 : register uschar *code = *codeptr;
    2590           38602 : uschar *last_code = code;
    2591           38602 : uschar *orig_code = code;
    2592                 : uschar *tempcode;
    2593           38602 : BOOL inescq = FALSE;
    2594           38602 : BOOL groupsetfirstbyte = FALSE;
    2595           38602 : const uschar *ptr = *ptrptr;
    2596                 : const uschar *tempptr;
    2597           38602 : uschar *previous = NULL;
    2598           38602 : uschar *previous_callout = NULL;
    2599           38602 : uschar *save_hwm = NULL;
    2600                 : uschar classbits[32];
    2601                 : 
    2602                 : #ifdef SUPPORT_UTF8
    2603                 : BOOL class_utf8;
    2604           38602 : BOOL utf8 = (options & PCRE_UTF8) != 0;
    2605                 : uschar *class_utf8data;
    2606                 : uschar *class_utf8data_base;
    2607                 : uschar utf8_char[6];
    2608                 : #else
    2609                 : BOOL utf8 = FALSE;
    2610                 : uschar *utf8_char = NULL;
    2611                 : #endif
    2612                 : 
    2613                 : #ifdef DEBUG
    2614                 : if (lengthptr != NULL) DPRINTF((">> start branch\n"));
    2615                 : #endif
    2616                 : 
    2617                 : /* Set up the default and non-default settings for greediness */
    2618                 : 
    2619           38602 : greedy_default = ((options & PCRE_UNGREEDY) != 0);
    2620           38602 : greedy_non_default = greedy_default ^ 1;
    2621                 : 
    2622                 : /* Initialize no first byte, no required byte. REQ_UNSET means "no char
    2623                 : matching encountered yet". It gets changed to REQ_NONE if we hit something that
    2624                 : matches a non-fixed char first char; reqbyte just remains unset if we never
    2625                 : find one.
    2626                 : 
    2627                 : When we hit a repeat whose minimum is zero, we may have to adjust these values
    2628                 : to take the zero repeat into account. This is implemented by setting them to
    2629                 : zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
    2630                 : item types that can be repeated set these backoff variables appropriately. */
    2631                 : 
    2632           38602 : firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
    2633                 : 
    2634                 : /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
    2635                 : according to the current setting of the caseless flag. REQ_CASELESS is a bit
    2636                 : value > 255. It is added into the firstbyte or reqbyte variables to record the
    2637                 : case status of the value. This is used only for ASCII characters. */
    2638                 : 
    2639           38602 : req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
    2640                 : 
    2641                 : /* Switch on next character until the end of the branch */
    2642                 : 
    2643        29590902 : for (;; ptr++)
    2644                 :   {
    2645                 :   BOOL negate_class;
    2646                 :   BOOL should_flip_negation;
    2647                 :   BOOL possessive_quantifier;
    2648                 :   BOOL is_quantifier;
    2649                 :   BOOL is_recurse;
    2650                 :   BOOL reset_bracount;
    2651                 :   int class_charcount;
    2652                 :   int class_lastchar;
    2653                 :   int newoptions;
    2654                 :   int recno;
    2655                 :   int refsign;
    2656                 :   int skipbytes;
    2657                 :   int subreqbyte;
    2658                 :   int subfirstbyte;
    2659                 :   int terminator;
    2660                 :   int mclength;
    2661                 :   uschar mcbuffer[8];
    2662                 : 
    2663                 :   /* Get next byte in the pattern */
    2664                 : 
    2665        29629504 :   c = *ptr;
    2666                 : 
    2667                 :   /* If we are in the pre-compile phase, accumulate the length used for the
    2668                 :   previous cycle of this loop. */
    2669                 : 
    2670        29629504 :   if (lengthptr != NULL)
    2671                 :     {
    2672                 : #ifdef DEBUG
    2673                 :     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
    2674                 : #endif
    2675        14814756 :     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
    2676                 :       {
    2677               0 :       *errorcodeptr = ERR52;
    2678               0 :       goto FAILED;
    2679                 :       }
    2680                 : 
    2681                 :     /* There is at least one situation where code goes backwards: this is the
    2682                 :     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
    2683                 :     the class is simply eliminated. However, it is created first, so we have to
    2684                 :     allow memory for it. Therefore, don't ever reduce the length at this point.
    2685                 :     */
    2686                 : 
    2687        14814756 :     if (code < last_code) code = last_code;
    2688                 : 
    2689                 :     /* Paranoid check for integer overflow */
    2690                 : 
    2691        14814756 :     if (OFLOW_MAX - *lengthptr < code - last_code)
    2692                 :       {
    2693               0 :       *errorcodeptr = ERR20;
    2694               0 :       goto FAILED;
    2695                 :       }
    2696                 : 
    2697        14814756 :     *lengthptr += code - last_code;
    2698                 :     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
    2699                 : 
    2700                 :     /* If "previous" is set and it is not at the start of the work space, move
    2701                 :     it back to there, in order to avoid filling up the work space. Otherwise,
    2702                 :     if "previous" is NULL, reset the current code pointer to the start. */
    2703                 : 
    2704        14814756 :     if (previous != NULL)
    2705                 :       {
    2706        14735115 :       if (previous > orig_code)
    2707                 :         {
    2708        14673990 :         memmove(orig_code, previous, code - previous);
    2709        14673990 :         code -= previous - orig_code;
    2710        14673990 :         previous = orig_code;
    2711                 :         }
    2712                 :       }
    2713           79641 :     else code = orig_code;
    2714                 : 
    2715                 :     /* Remember where this code item starts so we can pick up the length
    2716                 :     next time round. */
    2717                 : 
    2718        14814756 :     last_code = code;
    2719                 :     }
    2720                 : 
    2721                 :   /* In the real compile phase, just check the workspace used by the forward
    2722                 :   reference list. */
    2723                 : 
    2724        14814748 :   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
    2725                 :     {
    2726               0 :     *errorcodeptr = ERR52;
    2727               0 :     goto FAILED;
    2728                 :     }
    2729                 : 
    2730                 :   /* If in \Q...\E, check for the end; if not, we have a literal */
    2731                 : 
    2732        29629504 :   if (inescq && c != 0)
    2733                 :     {
    2734               0 :     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
    2735                 :       {
    2736               0 :       inescq = FALSE;
    2737               0 :       ptr++;
    2738               0 :       continue;
    2739                 :       }
    2740                 :     else
    2741                 :       {
    2742               0 :       if (previous_callout != NULL)
    2743                 :         {
    2744               0 :         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
    2745               0 :           complete_callout(previous_callout, ptr, cd);
    2746               0 :         previous_callout = NULL;
    2747                 :         }
    2748               0 :       if ((options & PCRE_AUTO_CALLOUT) != 0)
    2749                 :         {
    2750               0 :         previous_callout = code;
    2751               0 :         code = auto_callout(code, ptr, cd);
    2752                 :         }
    2753               0 :       goto NORMAL_CHAR;
    2754                 :       }
    2755                 :     }
    2756                 : 
    2757                 :   /* Fill in length of a previous callout, except when the next thing is
    2758                 :   a quantifier. */
    2759                 : 
    2760        29629504 :   is_quantifier =
    2761                 :     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
    2762                 :     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
    2763                 : 
    2764        29629504 :   if (!is_quantifier && previous_callout != NULL &&
    2765                 :        after_manual_callout-- <= 0)
    2766                 :     {
    2767               0 :     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
    2768               0 :       complete_callout(previous_callout, ptr, cd);
    2769               0 :     previous_callout = NULL;
    2770                 :     }
    2771                 : 
    2772                 :   /* In extended mode, skip white space and comments */
    2773                 : 
    2774        29629504 :   if ((options & PCRE_EXTENDED) != 0)
    2775                 :     {
    2776             222 :     if ((cd->ctypes[c] & ctype_space) != 0) continue;
    2777             136 :     if (c == CHAR_NUMBER_SIGN)
    2778                 :       {
    2779               0 :       while (*(++ptr) != 0)
    2780                 :         {
    2781               0 :         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
    2782                 :         }
    2783               0 :       if (*ptr != 0) continue;
    2784                 : 
    2785                 :       /* Else fall through to handle end of string */
    2786               0 :       c = 0;
    2787                 :       }
    2788                 :     }
    2789                 : 
    2790                 :   /* No auto callout for quantifiers. */
    2791                 : 
    2792        29629418 :   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
    2793                 :     {
    2794               0 :     previous_callout = code;
    2795               0 :     code = auto_callout(code, ptr, cd);
    2796                 :     }
    2797                 : 
    2798        29629418 :   switch(c)
    2799                 :     {
    2800                 :     /* ===================================================================*/
    2801                 :     case 0:                        /* The branch terminates at string end */
    2802                 :     case CHAR_VERTICAL_LINE:       /* or | or ) */
    2803                 :     case CHAR_RIGHT_PARENTHESIS:
    2804           38598 :     *firstbyteptr = firstbyte;
    2805           38598 :     *reqbyteptr = reqbyte;
    2806           38598 :     *codeptr = code;
    2807           38598 :     *ptrptr = ptr;
    2808           38598 :     if (lengthptr != NULL)
    2809                 :       {
    2810           19299 :       if (OFLOW_MAX - *lengthptr < code - last_code)
    2811                 :         {
    2812               0 :         *errorcodeptr = ERR20;
    2813               0 :         goto FAILED;
    2814                 :         }
    2815           19299 :       *lengthptr += code - last_code;   /* To include callout length */
    2816                 :       DPRINTF((">> end branch\n"));
    2817                 :       }
    2818           38598 :     return TRUE;
    2819                 : 
    2820                 : 
    2821                 :     /* ===================================================================*/
    2822                 :     /* Handle single-character metacharacters. In multiline mode, ^ disables
    2823                 :     the setting of any following char as a first character. */
    2824                 : 
    2825                 :     case CHAR_CIRCUMFLEX_ACCENT:
    2826           11502 :     if ((options & PCRE_MULTILINE) != 0)
    2827                 :       {
    2828               6 :       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    2829                 :       }
    2830           11502 :     previous = NULL;
    2831           11502 :     *code++ = OP_CIRC;
    2832           11502 :     break;
    2833                 : 
    2834                 :     case CHAR_DOLLAR_SIGN:
    2835           11200 :     previous = NULL;
    2836           11200 :     *code++ = OP_DOLL;
    2837           11200 :     break;
    2838                 : 
    2839                 :     /* There can never be a first char if '.' is first, whatever happens about
    2840                 :     repeats. The value of reqbyte doesn't change either. */
    2841                 : 
    2842                 :     case CHAR_DOT:
    2843        16790503 :     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    2844        16790503 :     zerofirstbyte = firstbyte;
    2845        16790503 :     zeroreqbyte = reqbyte;
    2846        16790503 :     previous = code;
    2847        16790503 :     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
    2848        16790503 :     break;
    2849                 : 
    2850                 : 
    2851                 :     /* ===================================================================*/
    2852                 :     /* Character classes. If the included characters are all < 256, we build a
    2853                 :     32-byte bitmap of the permitted characters, except in the special case
    2854                 :     where there is only one such character. For negated classes, we build the
    2855                 :     map as usual, then invert it at the end. However, we use a different opcode
    2856                 :     so that data characters > 255 can be handled correctly.
    2857                 : 
    2858                 :     If the class contains characters outside the 0-255 range, a different
    2859                 :     opcode is compiled. It may optionally have a bit map for characters < 256,
    2860                 :     but those above are explicitly listed afterwards. A flag byte tells
    2861                 :     whether the bitmap is present, and whether this is a negated class or not.
    2862                 : 
    2863                 :     In JavaScript compatibility mode, an isolated ']' causes an error. In
    2864                 :     default (Perl) mode, it is treated as a data character. */
    2865                 : 
    2866                 :     case CHAR_RIGHT_SQUARE_BRACKET:
    2867               6 :     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
    2868                 :       {
    2869               0 :       *errorcodeptr = ERR64;
    2870               0 :       goto FAILED;
    2871                 :       }
    2872               6 :     goto NORMAL_CHAR;
    2873                 : 
    2874                 :     case CHAR_LEFT_SQUARE_BRACKET:
    2875           46541 :     previous = code;
    2876                 : 
    2877                 :     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
    2878                 :     they are encountered at the top level, so we'll do that too. */
    2879                 : 
    2880           46541 :     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
    2881                 :          ptr[1] == CHAR_EQUALS_SIGN) &&
    2882                 :         check_posix_syntax(ptr, &tempptr))
    2883                 :       {
    2884               0 :       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
    2885               0 :       goto FAILED;
    2886                 :       }
    2887                 : 
    2888                 :     /* If the first character is '^', set the negation flag and skip it. Also,
    2889                 :     if the first few characters (either before or after ^) are \Q\E or \E we
    2890                 :     skip them too. This makes for compatibility with Perl. */
    2891                 : 
    2892           46541 :     negate_class = FALSE;
    2893                 :     for (;;)
    2894                 :       {
    2895           89061 :       c = *(++ptr);
    2896           89061 :       if (c == CHAR_BACKSLASH)
    2897                 :         {
    2898           42608 :         if (ptr[1] == CHAR_E)
    2899               0 :           ptr++;
    2900           42608 :         else if (strncmp((const char *)ptr+1,
    2901                 :                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
    2902               0 :           ptr += 3;
    2903                 :         else
    2904           42608 :           break;
    2905                 :         }
    2906           46453 :       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
    2907           42520 :         negate_class = TRUE;
    2908                 :       else break;
    2909           42520 :       }
    2910                 : 
    2911                 :     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
    2912                 :     an initial ']' is taken as a data character -- the code below handles
    2913                 :     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
    2914                 :     [^] must match any character, so generate OP_ALLANY. */
    2915                 : 
    2916           46541 :     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
    2917                 :         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
    2918                 :       {
    2919               0 :       *code++ = negate_class? OP_ALLANY : OP_FAIL;
    2920               0 :       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    2921               0 :       zerofirstbyte = firstbyte;
    2922               0 :       break;
    2923                 :       }
    2924                 : 
    2925                 :     /* If a class contains a negative special such as \S, we need to flip the
    2926                 :     negation flag at the end, so that support for characters > 255 works
    2927                 :     correctly (they are all included in the class). */
    2928                 : 
    2929           46541 :     should_flip_negation = FALSE;
    2930                 : 
    2931                 :     /* Keep a count of chars with values < 256 so that we can optimize the case
    2932                 :     of just a single character (as long as it's < 256). However, for higher
    2933                 :     valued UTF-8 characters, we don't yet do any optimization. */
    2934                 : 
    2935           46541 :     class_charcount = 0;
    2936           46541 :     class_lastchar = -1;
    2937                 : 
    2938                 :     /* Initialize the 32-char bit map to all zeros. We build the map in a
    2939                 :     temporary bit of memory, in case the class contains only 1 character (less
    2940                 :     than 256), because in that case the compiled code doesn't use the bit map.
    2941                 :     */
    2942                 : 
    2943           46541 :     memset(classbits, 0, 32 * sizeof(uschar));
    2944                 : 
    2945                 : #ifdef SUPPORT_UTF8
    2946           46541 :     class_utf8 = FALSE;                       /* No chars >= 256 */
    2947           46541 :     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
    2948           46541 :     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
    2949                 : #endif
    2950                 : 
    2951                 :     /* Process characters until ] is reached. By writing this as a "do" it
    2952                 :     means that an initial ] is taken as a data character. At the start of the
    2953                 :     loop, c contains the first byte of the character. */
    2954                 : 
    2955           46541 :     if (c != 0) do
    2956                 :       {
    2957                 :       const uschar *oldptr;
    2958                 : 
    2959                 : #ifdef SUPPORT_UTF8
    2960           92820 :       if (utf8 && c > 127)
    2961                 :         {                           /* Braces are required because the */
    2962               0 :         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
    2963                 :         }
    2964                 : 
    2965                 :       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
    2966                 :       data and reset the pointer. This is so that very large classes that
    2967                 :       contain a zillion UTF-8 characters no longer overwrite the work space
    2968                 :       (which is on the stack). */
    2969                 : 
    2970           92820 :       if (lengthptr != NULL)
    2971                 :         {
    2972           46410 :         *lengthptr += class_utf8data - class_utf8data_base;
    2973           46410 :         class_utf8data = class_utf8data_base;
    2974                 :         }
    2975                 : 
    2976                 : #endif
    2977                 : 
    2978                 :       /* Inside \Q...\E everything is literal except \E */
    2979                 : 
    2980           92820 :       if (inescq)
    2981                 :         {
    2982               0 :         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
    2983                 :           {
    2984               0 :           inescq = FALSE;                   /* Reset literal state */
    2985               0 :           ptr++;                            /* Skip the 'E' */
    2986               0 :           continue;                         /* Carry on with next */
    2987                 :           }
    2988               0 :         goto CHECK_RANGE;                   /* Could be range if \E follows */
    2989                 :         }
    2990                 : 
    2991                 :       /* Handle POSIX class names. Perl allows a negation extension of the
    2992                 :       form [:^name:]. A square bracket that doesn't match the syntax is
    2993                 :       treated as a literal. We also recognize the POSIX constructions
    2994                 :       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
    2995                 :       5.6 and 5.8 do. */
    2996                 : 
    2997           92820 :       if (c == CHAR_LEFT_SQUARE_BRACKET &&
    2998                 :           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
    2999                 :            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
    3000                 :         {
    3001               8 :         BOOL local_negate = FALSE;
    3002                 :         int posix_class, taboffset, tabopt;
    3003               8 :         register const uschar *cbits = cd->cbits;
    3004                 :         uschar pbits[32];
    3005                 : 
    3006               8 :         if (ptr[1] != CHAR_COLON)
    3007                 :           {
    3008               0 :           *errorcodeptr = ERR31;
    3009               0 :           goto FAILED;
    3010                 :           }
    3011                 : 
    3012               8 :         ptr += 2;
    3013               8 :         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
    3014                 :           {
    3015               0 :           local_negate = TRUE;
    3016               0 :           should_flip_negation = TRUE;  /* Note negative special */
    3017               0 :           ptr++;
    3018                 :           }
    3019                 : 
    3020               8 :         posix_class = check_posix_name(ptr, tempptr - ptr);
    3021               8 :         if (posix_class < 0)
    3022                 :           {
    3023               0 :           *errorcodeptr = ERR30;
    3024               0 :           goto FAILED;
    3025                 :           }
    3026                 : 
    3027                 :         /* If matching is caseless, upper and lower are converted to
    3028                 :         alpha. This relies on the fact that the class table starts with
    3029                 :         alpha, lower, upper as the first 3 entries. */
    3030                 : 
    3031               8 :         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
    3032               0 :           posix_class = 0;
    3033                 : 
    3034                 :         /* We build the bit map for the POSIX class in a chunk of local store
    3035                 :         because we may be adding and subtracting from it, and we don't want to
    3036                 :         subtract bits that may be in the main map already. At the end we or the
    3037                 :         result into the bit map that is being built. */
    3038                 : 
    3039               8 :         posix_class *= 3;
    3040                 : 
    3041                 :         /* Copy in the first table (always present) */
    3042                 : 
    3043               8 :         memcpy(pbits, cbits + posix_class_maps[posix_class],
    3044                 :           32 * sizeof(uschar));
    3045                 : 
    3046                 :         /* If there is a second table, add or remove it as required. */
    3047                 : 
    3048               8 :         taboffset = posix_class_maps[posix_class + 1];
    3049               8 :         tabopt = posix_class_maps[posix_class + 2];
    3050                 : 
    3051               8 :         if (taboffset >= 0)
    3052                 :           {
    3053               0 :           if (tabopt >= 0)
    3054               0 :             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
    3055                 :           else
    3056               0 :             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
    3057                 :           }
    3058                 : 
    3059                 :         /* Now see if we need to remove any special characters. An option
    3060                 :         value of 1 removes vertical space and 2 removes underscore. */
    3061                 : 
    3062               8 :         if (tabopt < 0) tabopt = -tabopt;
    3063               8 :         if (tabopt == 1) pbits[1] &= ~0x3c;
    3064               8 :           else if (tabopt == 2) pbits[11] &= 0x7f;
    3065                 : 
    3066                 :         /* Add the POSIX table or its complement into the main table that is
    3067                 :         being built and we are done. */
    3068                 : 
    3069               8 :         if (local_negate)
    3070               0 :           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
    3071                 :         else
    3072               8 :           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
    3073                 : 
    3074               8 :         ptr = tempptr + 1;
    3075               8 :         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
    3076               8 :         continue;    /* End of POSIX syntax handling */
    3077                 :         }
    3078                 : 
    3079                 :       /* Backslash may introduce a single character, or it may introduce one
    3080                 :       of the specials, which just set a flag. The sequence \b is a special
    3081                 :       case. Inside a class (and only there) it is treated as backspace.
    3082                 :       Elsewhere it marks a word boundary. Other escapes have preset maps ready
    3083                 :       to 'or' into the one we are building. We assume they have more than one
    3084                 :       character in them, so set class_charcount bigger than one. */
    3085                 : 
    3086           92812 :       if (c == CHAR_BACKSLASH)
    3087                 :         {
    3088           85722 :         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
    3089           85722 :         if (*errorcodeptr != 0) goto FAILED;
    3090                 : 
    3091           85722 :         if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
    3092           85712 :         else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
    3093           85712 :         else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
    3094           85712 :         else if (-c == ESC_Q)            /* Handle start of quoted string */
    3095                 :           {
    3096               0 :           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
    3097                 :             {
    3098               0 :             ptr += 2; /* avoid empty string */
    3099                 :             }
    3100               0 :           else inescq = TRUE;
    3101               0 :           continue;
    3102                 :           }
    3103           85712 :         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
    3104                 : 
    3105           85722 :         if (c < 0)
    3106                 :           {
    3107             186 :           register const uschar *cbits = cd->cbits;
    3108             186 :           class_charcount += 2;     /* Greater than 1 is what matters */
    3109                 : 
    3110                 :           /* Save time by not doing this in the pre-compile phase. */
    3111                 : 
    3112             186 :           if (lengthptr == NULL) switch (-c)
    3113                 :             {
    3114                 :             case ESC_d:
    3115               1 :             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
    3116               1 :             continue;
    3117                 : 
    3118                 :             case ESC_D:
    3119               0 :             should_flip_negation = TRUE;
    3120               0 :             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
    3121               0 :             continue;
    3122                 : 
    3123                 :             case ESC_w:
    3124              57 :             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
    3125              57 :             continue;
    3126                 : 
    3127                 :             case ESC_W:
    3128               0 :             should_flip_negation = TRUE;
    3129               0 :             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
    3130               0 :             continue;
    3131                 : 
    3132                 :             case ESC_s:
    3133               3 :             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
    3134               3 :             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
    3135               3 :             continue;
    3136                 : 
    3137                 :             case ESC_S:
    3138               0 :             should_flip_negation = TRUE;
    3139               0 :             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
    3140               0 :             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
    3141               0 :             continue;
    3142                 : 
    3143                 :             default:    /* Not recognized; fall through */
    3144                 :             break;      /* Need "default" setting to stop compiler warning. */
    3145                 :             }
    3146                 : 
    3147                 :           /* In the pre-compile phase, just do the recognition. */
    3148                 : 
    3149              93 :           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
    3150                 :                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
    3151                 : 
    3152                 :           /* We need to deal with \H, \h, \V, and \v in both phases because
    3153                 :           they use extra memory. */
    3154                 : 
    3155              64 :           if (-c == ESC_h)
    3156                 :             {
    3157              58 :             SETBIT(classbits, 0x09); /* HT */
    3158              58 :             SETBIT(classbits, 0x20); /* SPACE */
    3159              58 :             SETBIT(classbits, 0xa0); /* NBSP */
    3160                 : #ifdef SUPPORT_UTF8
    3161              58 :             if (utf8)
    3162                 :               {
    3163               0 :               class_utf8 = TRUE;
    3164               0 :               *class_utf8data++ = XCL_SINGLE;
    3165               0 :               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
    3166               0 :               *class_utf8data++ = XCL_SINGLE;
    3167               0 :               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
    3168               0 :               *class_utf8data++ = XCL_RANGE;
    3169               0 :               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
    3170               0 :               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
    3171               0 :               *class_utf8data++ = XCL_SINGLE;
    3172               0 :               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
    3173               0 :               *class_utf8data++ = XCL_SINGLE;
    3174               0 :               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
    3175               0 :               *class_utf8data++ = XCL_SINGLE;
    3176               0 :               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
    3177                 :               }
    3178                 : #endif
    3179              58 :             continue;
    3180                 :             }
    3181                 : 
    3182               6 :           if (-c == ESC_H)
    3183                 :             {
    3184               0 :             for (c = 0; c < 32; c++)
    3185                 :               {
    3186               0 :               int x = 0xff;
    3187               0 :               switch (c)
    3188                 :                 {
    3189               0 :                 case 0x09/8: x ^= 1 << (0x09%8); break;
    3190               0 :                 case 0x20/8: x ^= 1 << (0x20%8); break;
    3191               0 :                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
    3192                 :                 default: break;
    3193                 :                 }
    3194               0 :               classbits[c] |= x;
    3195                 :               }
    3196                 : 
    3197                 : #ifdef SUPPORT_UTF8
    3198               0 :             if (utf8)
    3199                 :               {
    3200               0 :               class_utf8 = TRUE;
    3201               0 :               *class_utf8data++ = XCL_RANGE;
    3202               0 :               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
    3203               0 :               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
    3204               0 :               *class_utf8data++ = XCL_RANGE;
    3205               0 :               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
    3206               0 :               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
    3207               0 :               *class_utf8data++ = XCL_RANGE;
    3208               0 :               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
    3209               0 :               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
    3210               0 :               *class_utf8data++ = XCL_RANGE;
    3211               0 :               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
    3212               0 :               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
    3213               0 :               *class_utf8data++ = XCL_RANGE;
    3214               0 :               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
    3215               0 :               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
    3216               0 :               *class_utf8data++ = XCL_RANGE;
    3217               0 :               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
    3218               0 :               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
    3219               0 :               *class_utf8data++ = XCL_RANGE;
    3220               0 :               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
    3221               0 :               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
    3222                 :               }
    3223                 : #endif
    3224               0 :             continue;
    3225                 :             }
    3226                 : 
    3227               6 :           if (-c == ESC_v)
    3228                 :             {
    3229               0 :             SETBIT(classbits, 0x0a); /* LF */
    3230               0 :             SETBIT(classbits, 0x0b); /* VT */
    3231               0 :             SETBIT(classbits, 0x0c); /* FF */
    3232               0 :             SETBIT(classbits, 0x0d); /* CR */
    3233               0 :             SETBIT(classbits, 0x85); /* NEL */
    3234                 : #ifdef SUPPORT_UTF8
    3235               0 :             if (utf8)
    3236                 :               {
    3237               0 :               class_utf8 = TRUE;
    3238               0 :               *class_utf8data++ = XCL_RANGE;
    3239               0 :               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
    3240               0 :               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
    3241                 :               }
    3242                 : #endif
    3243               0 :             continue;
    3244                 :             }
    3245                 : 
    3246               6 :           if (-c == ESC_V)
    3247                 :             {
    3248               0 :             for (c = 0; c < 32; c++)
    3249                 :               {
    3250               0 :               int x = 0xff;
    3251               0 :               switch (c)
    3252                 :                 {
    3253               0 :                 case 0x0a/8: x ^= 1 << (0x0a%8);
    3254               0 :                              x ^= 1 << (0x0b%8);
    3255               0 :                              x ^= 1 << (0x0c%8);
    3256               0 :                              x ^= 1 << (0x0d%8);
    3257               0 :                              break;
    3258               0 :                 case 0x85/8: x ^= 1 << (0x85%8); break;
    3259                 :                 default: break;
    3260                 :                 }
    3261               0 :               classbits[c] |= x;
    3262                 :               }
    3263                 : 
    3264                 : #ifdef SUPPORT_UTF8
    3265               0 :             if (utf8)
    3266                 :               {
    3267               0 :               class_utf8 = TRUE;
    3268               0 :               *class_utf8data++ = XCL_RANGE;
    3269               0 :               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
    3270               0 :               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
    3271               0 :               *class_utf8data++ = XCL_RANGE;
    3272               0 :               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
    3273               0 :               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
    3274                 :               }
    3275                 : #endif
    3276               0 :             continue;
    3277                 :             }
    3278                 : 
    3279                 :           /* We need to deal with \P and \p in both phases. */
    3280                 : 
    3281                 : #ifdef SUPPORT_UCP
    3282               6 :           if (-c == ESC_p || -c == ESC_P)
    3283                 :             {
    3284                 :             BOOL negated;
    3285                 :             int pdata;
    3286               6 :             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
    3287               6 :             if (ptype < 0) goto FAILED;
    3288               6 :             class_utf8 = TRUE;
    3289               6 :             *class_utf8data++ = ((-c == ESC_p) != negated)?
    3290                 :               XCL_PROP : XCL_NOTPROP;
    3291               6 :             *class_utf8data++ = ptype;
    3292               6 :             *class_utf8data++ = pdata;
    3293               6 :             class_charcount -= 2;   /* Not a < 256 character */
    3294               6 :             continue;
    3295                 :             }
    3296                 : #endif
    3297                 :           /* Unrecognized escapes are faulted if PCRE is running in its
    3298                 :           strict mode. By default, for compatibility with Perl, they are
    3299                 :           treated as literals. */
    3300                 : 
    3301               0 :           if ((options & PCRE_EXTRA) != 0)
    3302                 :             {
    3303               0 :             *errorcodeptr = ERR7;
    3304               0 :             goto FAILED;
    3305                 :             }
    3306                 : 
    3307               0 :           class_charcount -= 2;  /* Undo the default count from above */
    3308               0 :           c = *ptr;              /* Get the final character and fall through */
    3309                 :           }
    3310                 : 
    3311                 :         /* Fall through if we have a single character (c >= 0). This may be
    3312                 :         greater than 256 in UTF-8 mode. */
    3313                 : 
    3314                 :         }   /* End of backslash handling */
    3315                 : 
    3316                 :       /* A single character may be followed by '-' to form a range. However,
    3317                 :       Perl does not permit ']' to be the end of the range. A '-' character
    3318                 :       at the end is treated as a literal. Perl ignores orphaned \E sequences
    3319                 :       entirely. The code for handling \Q and \E is messy. */
    3320                 : 
    3321           92626 :       CHECK_RANGE:
    3322          185252 :       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
    3323                 :         {
    3324               0 :         inescq = FALSE;
    3325               0 :         ptr += 2;
    3326                 :         }
    3327                 : 
    3328           92626 :       oldptr = ptr;
    3329                 : 
    3330                 :       /* Remember \r or \n */
    3331                 : 
    3332           92626 :       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
    3333                 : 
    3334                 :       /* Check for range */
    3335                 : 
    3336           92626 :       if (!inescq && ptr[1] == CHAR_MINUS)
    3337                 :         {
    3338                 :         int d;
    3339            3734 :         ptr += 2;
    3340            3734 :         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
    3341                 : 
    3342                 :         /* If we hit \Q (not followed by \E) at this point, go into escaped
    3343                 :         mode. */
    3344                 : 
    3345            7468 :         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
    3346                 :           {
    3347               0 :           ptr += 2;
    3348               0 :           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
    3349               0 :             { ptr += 2; continue; }
    3350               0 :           inescq = TRUE;
    3351               0 :           break;
    3352                 :           }
    3353                 : 
    3354            3734 :         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
    3355                 :           {
    3356            1470 :           ptr = oldptr;
    3357            1470 :           goto LONE_SINGLE_CHARACTER;
    3358                 :           }
    3359                 : 
    3360                 : #ifdef SUPPORT_UTF8
    3361            2264 :         if (utf8)
    3362                 :           {                           /* Braces are required because the */
    3363               4 :           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
    3364                 :           }
    3365                 :         else
    3366                 : #endif
    3367            2260 :         d = *ptr;  /* Not UTF-8 mode */
    3368                 : 
    3369                 :         /* The second part of a range can be a single-character escape, but
    3370                 :         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
    3371                 :         in such circumstances. */
    3372                 : 
    3373            2264 :         if (!inescq && d == CHAR_BACKSLASH)
    3374                 :           {
    3375               0 :           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
    3376               0 :           if (*errorcodeptr != 0) goto FAILED;
    3377                 : 
    3378                 :           /* \b is backspace; \X is literal X; \R is literal R; any other
    3379                 :           special means the '-' was literal */
    3380                 : 
    3381               0 :           if (d < 0)
    3382                 :             {
    3383               0 :             if (d == -ESC_b) d = CHAR_BS;
    3384               0 :             else if (d == -ESC_X) d = CHAR_X;
    3385               0 :             else if (d == -ESC_R) d = CHAR_R; else
    3386                 :               {
    3387               0 :               ptr = oldptr;
    3388               0 :               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
    3389                 :               }
    3390                 :             }
    3391                 :           }
    3392                 : 
    3393                 :         /* Check that the two values are in the correct order. Optimize
    3394                 :         one-character ranges */
    3395                 : 
    3396            2264 :         if (d < c)
    3397                 :           {
    3398               0 :           *errorcodeptr = ERR8;
    3399               0 :           goto FAILED;
    3400                 :           }
    3401                 : 
    3402            2264 :         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
    3403                 : 
    3404                 :         /* Remember \r or \n */
    3405                 : 
    3406            2264 :         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
    3407                 : 
    3408                 :         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
    3409                 :         matching, we have to use an XCLASS with extra data items. Caseless
    3410                 :         matching for characters > 127 is available only if UCP support is
    3411                 :         available. */
    3412                 : 
    3413                 : #ifdef SUPPORT_UTF8
    3414            2264 :         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
    3415                 :           {
    3416               0 :           class_utf8 = TRUE;
    3417                 : 
    3418                 :           /* With UCP support, we can find the other case equivalents of
    3419                 :           the relevant characters. There may be several ranges. Optimize how
    3420                 :           they fit with the basic range. */
    3421                 : 
    3422                 : #ifdef SUPPORT_UCP
    3423               0 :           if ((options & PCRE_CASELESS) != 0)
    3424                 :             {
    3425                 :             unsigned int occ, ocd;
    3426               0 :             unsigned int cc = c;
    3427               0 :             unsigned int origd = d;
    3428               0 :             while (get_othercase_range(&cc, origd, &occ, &ocd))
    3429                 :               {
    3430               0 :               if (occ >= (unsigned int)c &&
    3431                 :                   ocd <= (unsigned int)d)
    3432               0 :                 continue;                          /* Skip embedded ranges */
    3433                 : 
    3434               0 :               if (occ < (unsigned int)c  &&
    3435                 :                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
    3436                 :                 {                                  /* if there is overlap,   */
    3437               0 :                 c = occ;                           /* noting that if occ < c */
    3438               0 :                 continue;                          /* we can't have ocd > d  */
    3439                 :                 }                                  /* because a subrange is  */
    3440               0 :               if (ocd > (unsigned int)d &&
    3441                 :                   occ <= (unsigned int)d + 1)      /* always shorter than    */
    3442                 :                 {                                  /* the basic range.       */
    3443               0 :                 d = ocd;
    3444               0 :                 continue;
    3445                 :                 }
    3446                 : 
    3447               0 :               if (occ == ocd)
    3448                 :                 {
    3449               0 :                 *class_utf8data++ = XCL_SINGLE;
    3450                 :                 }
    3451                 :               else
    3452                 :                 {
    3453               0 :                 *class_utf8data++ = XCL_RANGE;
    3454               0 :                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
    3455                 :                 }
    3456               0 :               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
    3457                 :               }
    3458                 :             }
    3459                 : #endif  /* SUPPORT_UCP */
    3460                 : 
    3461                 :           /* Now record the original range, possibly modified for UCP caseless
    3462                 :           overlapping ranges. */
    3463                 : 
    3464               0 :           *class_utf8data++ = XCL_RANGE;
    3465               0 :           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
    3466               0 :           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
    3467                 : 
    3468                 :           /* With UCP support, we are done. Without UCP support, there is no
    3469                 :           caseless matching for UTF-8 characters > 127; we can use the bit map
    3470                 :           for the smaller ones. */
    3471                 : 
    3472                 : #ifdef SUPPORT_UCP
    3473               0 :           continue;    /* With next character in the class */
    3474                 : #else
    3475                 :           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
    3476                 : 
    3477                 :           /* Adjust upper limit and fall through to set up the map */
    3478                 : 
    3479                 :           d = 127;
    3480                 : 
    3481                 : #endif  /* SUPPORT_UCP */
    3482                 :           }
    3483                 : #endif  /* SUPPORT_UTF8 */
    3484                 : 
    3485                 :         /* We use the bit map for all cases when not in UTF-8 mode; else
    3486                 :         ranges that lie entirely within 0-127 when there is UCP support; else
    3487                 :         for partial ranges without UCP support. */
    3488                 : 
    3489            2264 :         class_charcount += d - c + 1;
    3490            2264 :         class_lastchar = d;
    3491                 : 
    3492                 :         /* We can save a bit of time by skipping this in the pre-compile. */
    3493                 : 
    3494           16398 :         if (lengthptr == NULL) for (; c <= d; c++)
    3495                 :           {
    3496           14134 :           classbits[c/8] |= (1 << (c&7));
    3497           14134 :           if ((options & PCRE_CASELESS) != 0)
    3498                 :             {
    3499             104 :             int uc = cd->fcc[c];           /* flip case */
    3500             104 :             classbits[uc/8] |= (1 << (uc&7));
    3501                 :             }
    3502                 :           }
    3503                 : 
    3504            2264 :         continue;   /* Go get the next char in the class */
    3505                 :         }
    3506                 : 
    3507                 :       /* Handle a lone single character - we can get here for a normal
    3508                 :       non-escape char, or after \ that introduces a single character or for an
    3509                 :       apparent range that isn't. */
    3510                 : 
    3511           90362 :       LONE_SINGLE_CHARACTER:
    3512                 : 
    3513                 :       /* Handle a character that cannot go in the bit map */
    3514                 : 
    3515                 : #ifdef SUPPORT_UTF8
    3516           90362 :       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
    3517                 :         {
    3518               0 :         class_utf8 = TRUE;
    3519               0 :         *class_utf8data++ = XCL_SINGLE;
    3520               0 :         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
    3521                 : 
    3522                 : #ifdef SUPPORT_UCP
    3523               0 :         if ((options & PCRE_CASELESS) != 0)
    3524                 :           {
    3525                 :           unsigned int othercase;
    3526               0 :           if ((othercase = UCD_OTHERCASE(c)) != c)
    3527                 :             {
    3528               0 :             *class_utf8data++ = XCL_SINGLE;
    3529               0 :             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
    3530                 :             }
    3531                 :           }
    3532                 : #endif  /* SUPPORT_UCP */
    3533                 : 
    3534                 :         }
    3535                 :       else
    3536                 : #endif  /* SUPPORT_UTF8 */
    3537                 : 
    3538                 :       /* Handle a single-byte character */
    3539                 :         {
    3540           90362 :         classbits[c/8] |= (1 << (c&7));
    3541           90362 :         if ((options & PCRE_CASELESS) != 0)
    3542                 :           {
    3543              12 :           c = cd->fcc[c];   /* flip case */
    3544              12 :           classbits[c/8] |= (1 << (c&7));
    3545                 :           }
    3546           90362 :         class_charcount++;
    3547           90362 :         class_lastchar = c;
    3548                 :         }
    3549                 :       }
    3550                 : 
    3551                 :     /* Loop until ']' reached. This "while" is the end of the "do" above. */
    3552                 : 
    3553           92820 :     while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
    3554                 : 
    3555           46541 :     if (c == 0)                          /* Missing terminating ']' */
    3556                 :       {
    3557               1 :       *errorcodeptr = ERR6;
    3558               1 :       goto FAILED;
    3559                 :       }
    3560                 : 
    3561                 : 
    3562                 : /* This code has been disabled because it would mean that \s counts as
    3563                 : an explicit \r or \n reference, and that's not really what is wanted. Now
    3564                 : we set the flag only if there is a literal "\r" or "\n" in the class. */
    3565                 : 
    3566                 : #if 0
    3567                 :     /* Remember whether \r or \n are in this class */
    3568                 : 
    3569                 :     if (negate_class)
    3570                 :       {
    3571                 :       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
    3572                 :       }
    3573                 :     else
    3574                 :       {
    3575                 :       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
    3576                 :       }
    3577                 : #endif
    3578                 : 
    3579                 : 
    3580                 :     /* If class_charcount is 1, we saw precisely one character whose value is
    3581                 :     less than 256. As long as there were no characters >= 128 and there was no
    3582                 :     use of \p or \P, in other words, no use of any XCLASS features, we can
    3583                 :     optimize.
    3584                 : 
    3585                 :     In UTF-8 mode, we can optimize the negative case only if there were no
    3586                 :     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
    3587                 :     operate on single-bytes only. This is an historical hangover. Maybe one day
    3588                 :     we can tidy these opcodes to handle multi-byte characters.
    3589                 : 
    3590                 :     The optimization throws away the bit map. We turn the item into a
    3591                 :     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
    3592                 :     that OP_NOT does not support multibyte characters. In the positive case, it
    3593                 :     can cause firstbyte to be set. Otherwise, there can be no first char if
    3594                 :     this item is first, whatever repeat count may follow. In the case of
    3595                 :     reqbyte, save the previous value for reinstating. */
    3596                 : 
    3597                 : #ifdef SUPPORT_UTF8
    3598           46540 :     if (class_charcount == 1 && !class_utf8 &&
    3599                 :       (!utf8 || !negate_class || class_lastchar < 128))
    3600                 : #else
    3601                 :     if (class_charcount == 1)
    3602                 : #endif
    3603                 :       {
    3604             148 :       zeroreqbyte = reqbyte;
    3605                 : 
    3606                 :       /* The OP_NOT opcode works on one-byte characters only. */
    3607                 : 
    3608             148 :       if (negate_class)
    3609                 :         {
    3610              24 :         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    3611              24 :         zerofirstbyte = firstbyte;
    3612              24 :         *code++ = OP_NOT;
    3613              24 :         *code++ = class_lastchar;
    3614              24 :         break;
    3615                 :         }
    3616                 : 
    3617                 :       /* For a single, positive character, get the value into mcbuffer, and
    3618                 :       then we can handle this with the normal one-character code. */
    3619                 : 
    3620                 : #ifdef SUPPORT_UTF8
    3621             124 :       if (utf8 && class_lastchar > 127)
    3622               0 :         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
    3623                 :       else
    3624                 : #endif
    3625                 :         {
    3626             124 :         mcbuffer[0] = class_lastchar;
    3627             124 :         mclength = 1;
    3628                 :         }
    3629             124 :       goto ONE_CHAR;
    3630                 :       }       /* End of 1-char optimization */
    3631                 : 
    3632                 :     /* The general case - not the one-char optimization. If this is the first
    3633                 :     thing in the branch, there can be no first char setting, whatever the
    3634                 :     repeat count. Any reqbyte setting must remain unchanged after any kind of
    3635                 :     repeat. */
    3636                 : 
    3637           46392 :     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    3638           46392 :     zerofirstbyte = firstbyte;
    3639           46392 :     zeroreqbyte = reqbyte;
    3640                 : 
    3641                 :     /* If there are characters with values > 255, we have to compile an
    3642                 :     extended class, with its own opcode, unless there was a negated special
    3643                 :     such as \S in the class, because in that case all characters > 255 are in
    3644                 :     the class, so any that were explicitly given as well can be ignored. If
    3645                 :     (when there are explicit characters > 255 that must be listed) there are no
    3646                 :     characters < 256, we can omit the bitmap in the actual compiled code. */
    3647                 : 
    3648                 : #ifdef SUPPORT_UTF8
    3649           46392 :     if (class_utf8 && !should_flip_negation)
    3650                 :       {
    3651               6 :       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
    3652               6 :       *code++ = OP_XCLASS;
    3653               6 :       code += LINK_SIZE;
    3654               6 :       *code = negate_class? XCL_NOT : 0;
    3655                 : 
    3656                 :       /* If the map is required, move up the extra data to make room for it;
    3657                 :       otherwise just move the code pointer to the end of the extra data. */
    3658                 : 
    3659               6 :       if (class_charcount > 0)
    3660                 :         {
    3661               0 :         *code++ |= XCL_MAP;
    3662               0 :         memmove(code + 32, code, class_utf8data - code);
    3663               0 :         memcpy(code, classbits, 32);
    3664               0 :         code = class_utf8data + 32;
    3665                 :         }
    3666               6 :       else code = class_utf8data;
    3667                 : 
    3668                 :       /* Now fill in the complete length of the item */
    3669                 : 
    3670               6 :       PUT(previous, 1, code - previous);
    3671               6 :       break;   /* End of class handling */
    3672                 :       }
    3673                 : #endif
    3674                 : 
    3675                 :     /* If there are no characters > 255, set the opcode to OP_CLASS or
    3676                 :     OP_NCLASS, depending on whether the whole class was negated and whether
    3677                 :     there were negative specials such as \S in the class. Then copy the 32-byte
    3678                 :     map into the code vector, negating it if necessary. */
    3679                 : 
    3680           46386 :     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
    3681           46386 :     if (negate_class)
    3682                 :       {
    3683           42496 :       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
    3684           21248 :         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
    3685                 :       }
    3686                 :     else
    3687                 :       {
    3688            3890 :       memcpy(code, classbits, 32);
    3689                 :       }
    3690           46386 :     code += 32;
    3691           46386 :     break;
    3692                 : 
    3693                 : 
    3694                 :     /* ===================================================================*/
    3695                 :     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
    3696                 :     has been tested above. */
    3697                 : 
    3698                 :     case CHAR_LEFT_CURLY_BRACKET:
    3699              88 :     if (!is_quantifier) goto NORMAL_CHAR;
    3700              54 :     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
    3701              54 :     if (*errorcodeptr != 0) goto FAILED;
    3702              54 :     goto REPEAT;
    3703                 : 
    3704                 :     case CHAR_ASTERISK:
    3705            1372 :     repeat_min = 0;
    3706            1372 :     repeat_max = -1;
    3707            1372 :     goto REPEAT;
    3708                 : 
    3709                 :     case CHAR_PLUS:
    3710           93566 :     repeat_min = 1;
    3711           93566 :     repeat_max = -1;
    3712           93566 :     goto REPEAT;
    3713                 : 
    3714                 :     case CHAR_QUESTION_MARK:
    3715            2920 :     repeat_min = 0;
    3716            2920 :     repeat_max = 1;
    3717                 : 
    3718           97912 :     REPEAT:
    3719           97912 :     if (previous == NULL)
    3720                 :       {
    3721               2 :       *errorcodeptr = ERR9;
    3722               2 :       goto FAILED;
    3723                 :       }
    3724                 : 
    3725           97910 :     if (repeat_min == 0)
    3726                 :       {
    3727            4291 :       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
    3728            4291 :       reqbyte = zeroreqbyte;        /* Ditto */
    3729                 :       }
    3730                 : 
    3731                 :     /* Remember whether this is a variable length repeat */
    3732                 : 
    3733           97910 :     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
    3734                 : 
    3735           97910 :     op_type = 0;                    /* Default single-char op codes */
    3736           97910 :     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
    3737                 : 
    3738                 :     /* Save start of previous item, in case we have to move it up to make space
    3739                 :     for an inserted OP_ONCE for the additional '+' extension. */
    3740                 : 
    3741           97910 :     tempcode = previous;
    3742                 : 
    3743                 :     /* If the next character is '+', we have a possessive quantifier. This
    3744                 :     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
    3745                 :     If the next character is '?' this is a minimizing repeat, by default,
    3746                 :     but if PCRE_UNGREEDY is set, it works the other way round. We change the
    3747                 :     repeat type to the non-default. */
    3748                 : 
    3749           97910 :     if (ptr[1] == CHAR_PLUS)
    3750                 :       {
    3751               2 :       repeat_type = 0;                  /* Force greedy */
    3752               2 :       possessive_quantifier = TRUE;
    3753               2 :       ptr++;
    3754                 :       }
    3755           97908 :     else if (ptr[1] == CHAR_QUESTION_MARK)
    3756                 :       {
    3757              58 :       repeat_type = greedy_non_default;
    3758              58 :       ptr++;
    3759                 :       }
    3760           97850 :     else repeat_type = greedy_default;
    3761                 : 
    3762                 :     /* If previous was a character match, abolish the item and generate a
    3763                 :     repeat item instead. If a char item has a minimum of more than one, ensure
    3764                 :     that it is set in reqbyte - it might not be if a sequence such as x{3} is
    3765                 :     the first thing in a branch because the x will have gone into firstbyte
    3766                 :     instead.  */
    3767                 : 
    3768           97910 :     if (*previous == OP_CHAR || *previous == OP_CHARNC)
    3769                 :       {
    3770                 :       /* Deal with UTF-8 characters that take up more than one byte. It's
    3771                 :       easier to write this out separately than try to macrify it. Use c to
    3772                 :       hold the length of the character in bytes, plus 0x80 to flag that it's a
    3773                 :       length rather than a small character. */
    3774                 : 
    3775                 : #ifdef SUPPORT_UTF8
    3776             714 :       if (utf8 && (code[-1] & 0x80) != 0)
    3777                 :         {
    3778               0 :         uschar *lastchar = code - 1;
    3779               0 :         while((*lastchar & 0xc0) == 0x80) lastchar--;
    3780               0 :         c = code - lastchar;            /* Length of UTF-8 character */
    3781               0 :         memcpy(utf8_char, lastchar, c); /* Save the char */
    3782               0 :         c |= 0x80;                      /* Flag c as a length */
    3783                 :         }
    3784                 :       else
    3785                 : #endif
    3786                 : 
    3787                 :       /* Handle the case of a single byte - either with no UTF8 support, or
    3788                 :       with UTF-8 disabled, or for a UTF-8 character < 128. */
    3789                 : 
    3790                 :         {
    3791             714 :         c = code[-1];
    3792             714 :         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
    3793                 :         }
    3794                 : 
    3795                 :       /* If the repetition is unlimited, it pays to see if the next thing on
    3796                 :       the line is something that cannot possibly match this character. If so,
    3797                 :       automatically possessifying this item gains some performance in the case
    3798                 :       where the match fails. */
    3799                 : 
    3800             714 :       if (!possessive_quantifier &&
    3801                 :           repeat_max < 0 &&
    3802                 :           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
    3803                 :             options, cd))
    3804                 :         {
    3805              34 :         repeat_type = 0;    /* Force greedy */
    3806              34 :         possessive_quantifier = TRUE;
    3807                 :         }
    3808                 : 
    3809             714 :       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
    3810                 :       }
    3811                 : 
    3812                 :     /* If previous was a single negated character ([^a] or similar), we use
    3813                 :     one of the special opcodes, replacing it. The code is shared with single-
    3814                 :     character repeats by setting op_type to add a suitable offset into
    3815                 :     repeat_type. We can also test for auto-possessification. OP_NOT is
    3816                 :     currently used only for single-byte chars. */
    3817                 : 
    3818           97196 :     else if (*previous == OP_NOT)
    3819                 :       {
    3820              18 :       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
    3821              18 :       c = previous[1];
    3822              18 :       if (!possessive_quantifier &&
    3823                 :           repeat_max < 0 &&
    3824                 :           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
    3825                 :         {
    3826               8 :         repeat_type = 0;    /* Force greedy */
    3827               8 :         possessive_quantifier = TRUE;
    3828                 :         }
    3829              18 :       goto OUTPUT_SINGLE_REPEAT;
    3830                 :       }
    3831                 : 
    3832                 :     /* If previous was a character type match (\d or similar), abolish it and
    3833                 :     create a suitable repeat item. The code is shared with single-character
    3834                 :     repeats by setting op_type to add a suitable offset into repeat_type. Note
    3835                 :     that the Unicode property types will be present only when SUPPORT_UCP is
    3836                 :     defined, but we don't wrap the little bits of code here because it just
    3837                 :     makes it horribly messy. */
    3838                 : 
    3839           97178 :     else if (*previous < OP_EODN)
    3840                 :       {
    3841                 :       uschar *oldcode;
    3842                 :       int prop_type, prop_value;
    3843           51402 :       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
    3844           51402 :       c = *previous;
    3845                 : 
    3846           51402 :       if (!possessive_quantifier &&
    3847                 :           repeat_max < 0 &&
    3848                 :           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
    3849                 :         {
    3850           47988 :         repeat_type = 0;    /* Force greedy */
    3851           47988 :         possessive_quantifier = TRUE;
    3852                 :         }
    3853                 : 
    3854           52134 :       OUTPUT_SINGLE_REPEAT:
    3855           52142 :       if (*previous == OP_PROP || *previous == OP_NOTPROP)
    3856                 :         {
    3857               8 :         prop_type = previous[1];
    3858               8 :         prop_value = previous[2];
    3859                 :         }
    3860           52126 :       else prop_type = prop_value = -1;
    3861                 : 
    3862           52134 :       oldcode = code;
    3863           52134 :       code = previous;                  /* Usually overwrite previous item */
    3864                 : 
    3865                 :       /* If the maximum is zero then the minimum must also be zero; Perl allows
    3866                 :       this case, so we do too - by simply omitting the item altogether. */
    3867                 : 
    3868           52134 :       if (repeat_max == 0) goto END_REPEAT;
    3869                 : 
    3870                 :       /* All real repeats make it impossible to handle partial matching (maybe
    3871                 :       one day we will be able to remove this restriction). */
    3872                 : 
    3873           52134 :       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
    3874                 : 
    3875                 :       /* Combine the op_type with the repeat_type */
    3876                 : 
    3877           52134 :       repeat_type += op_type;
    3878                 : 
    3879                 :       /* A minimum of zero is handled either as the special case * or ?, or as
    3880                 :       an UPTO, with the maximum given. */
    3881                 : 
    3882           52134 :       if (repeat_min == 0)
    3883                 :         {
    3884            1303 :         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
    3885             582 :           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
    3886                 :         else
    3887                 :           {
    3888               0 :           *code++ = OP_UPTO + repeat_type;
    3889               0 :           PUT2INC(code, 0, repeat_max);
    3890                 :           }
    3891                 :         }
    3892                 : 
    3893                 :       /* A repeat minimum of 1 is optimized into some special cases. If the
    3894                 :       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
    3895                 :       left in place and, if the maximum is greater than 1, we use OP_UPTO with
    3896                 :       one less than the maximum. */
    3897                 : 
    3898           50831 :       else if (repeat_min == 1)
    3899                 :         {
    3900           50797 :         if (repeat_max == -1)
    3901           50789 :           *code++ = OP_PLUS + repeat_type;
    3902                 :         else
    3903                 :           {
    3904               8 :           code = oldcode;                 /* leave previous item in place */
    3905               8 :           if (repeat_max == 1) goto END_REPEAT;
    3906               2 :           *code++ = OP_UPTO + repeat_type;
    3907               2 :           PUT2INC(code, 0, repeat_max - 1);
    3908                 :           }
    3909                 :         }
    3910                 : 
    3911                 :       /* The case {n,n} is just an EXACT, while the general case {n,m} is
    3912                 :       handled as an EXACT followed by an UPTO. */
    3913                 : 
    3914                 :       else
    3915                 :         {
    3916              34 :         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
    3917              34 :         PUT2INC(code, 0, repeat_min);
    3918                 : 
    3919                 :         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
    3920                 :         we have to insert the character for the previous code. For a repeated
    3921                 :         Unicode property match, there are two extra bytes that define the
    3922                 :         required property. In UTF-8 mode, long characters have their length in
    3923                 :         c, with the 0x80 bit as a flag. */
    3924                 : 
    3925              34 :         if (repeat_max < 0)
    3926                 :           {
    3927                 : #ifdef SUPPORT_UTF8
    3928               2 :           if (utf8 && c >= 128)
    3929                 :             {
    3930               0 :             memcpy(code, utf8_char, c & 7);
    3931               0 :             code += c & 7;
    3932                 :             }
    3933                 :           else
    3934                 : #endif
    3935                 :             {
    3936               2 :             *code++ = c;
    3937               2 :             if (prop_type >= 0)
    3938                 :               {
    3939               0 :               *code++ = prop_type;
    3940               0 :               *code++ = prop_value;
    3941                 :               }
    3942                 :             }
    3943               2 :           *code++ = OP_STAR + repeat_type;
    3944                 :           }
    3945                 : 
    3946                 :         /* Else insert an UPTO if the max is greater than the min, again
    3947                 :         preceded by the character, for the previously inserted code. If the
    3948                 :         UPTO is just for 1 instance, we can use QUERY instead. */
    3949                 : 
    3950              32 :         else if (repeat_max != repeat_min)
    3951                 :           {
    3952                 : #ifdef SUPPORT_UTF8
    3953               6 :           if (utf8 && c >= 128)
    3954                 :             {
    3955               0 :             memcpy(code, utf8_char, c & 7);
    3956               0 :             code += c & 7;
    3957                 :             }
    3958                 :           else
    3959                 : #endif
    3960               6 :           *code++ = c;
    3961               6 :           if (prop_type >= 0)
    3962                 :             {
    3963               0 :             *code++ = prop_type;
    3964               0 :             *code++ = prop_value;
    3965                 :             }
    3966               6 :           repeat_max -= repeat_min;
    3967                 : 
    3968               6 :           if (repeat_max == 1)
    3969                 :             {
    3970               4 :             *code++ = OP_QUERY + repeat_type;
    3971                 :             }
    3972                 :           else
    3973                 :             {
    3974               2 :             *code++ = OP_UPTO + repeat_type;
    3975               2 :             PUT2INC(code, 0, repeat_max);
    3976                 :             }
    3977                 :           }
    3978                 :         }
    3979                 : 
    3980                 :       /* The character or character type itself comes last in all cases. */
    3981                 : 
    3982                 : #ifdef SUPPORT_UTF8
    3983           52128 :       if (utf8 && c >= 128)
    3984                 :         {
    3985               0 :         memcpy(code, utf8_char, c & 7);
    3986               0 :         code += c & 7;
    3987                 :         }
    3988                 :       else
    3989                 : #endif
    3990           52128 :       *code++ = c;
    3991                 : 
    3992                 :       /* For a repeated Unicode property match, there are two extra bytes that
    3993                 :       define the required property. */
    3994                 : 
    3995                 : #ifdef SUPPORT_UCP
    3996           52128 :       if (prop_type >= 0)
    3997                 :         {
    3998               8 :         *code++ = prop_type;
    3999               8 :         *code++ = prop_value;
    4000                 :         }
    4001                 : #endif
    4002                 :       }
    4003                 : 
    4004                 :     /* If previous was a character class or a back reference, we put the repeat
    4005                 :     stuff after it, but just skip the item if the repeat was {0,0}. */
    4006                 : 
    4007           90712 :     else if (*previous == OP_CLASS ||
    4008                 :              *previous == OP_NCLASS ||
    4009                 : #ifdef SUPPORT_UTF8
    4010                 :              *previous == OP_XCLASS ||
    4011                 : #endif
    4012                 :              *previous == OP_REF)
    4013                 :       {
    4014           44936 :       if (repeat_max == 0)
    4015                 :         {
    4016               0 :         code = previous;
    4017               0 :         goto END_REPEAT;
    4018                 :         }
    4019                 : 
    4020                 :       /* All real repeats make it impossible to handle partial matching (maybe
    4021                 :       one day we will be able to remove this restriction). */
    4022                 : 
    4023           44936 :       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
    4024                 : 
    4025           45546 :       if (repeat_min == 0 && repeat_max == -1)
    4026             610 :         *code++ = OP_CRSTAR + repeat_type;
    4027           87078 :       else if (repeat_min == 1 && repeat_max == -1)
    4028           42752 :         *code++ = OP_CRPLUS + repeat_type;
    4029            3142 :       else if (repeat_min == 0 && repeat_max == 1)
    4030            1568 :         *code++ = OP_CRQUERY + repeat_type;
    4031                 :       else
    4032                 :         {
    4033               6 :         *code++ = OP_CRRANGE + repeat_type;
    4034               6 :         PUT2INC(code, 0, repeat_min);
    4035               6 :         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
    4036               6 :         PUT2INC(code, 0, repeat_max);
    4037                 :         }
    4038                 :       }
    4039                 : 
    4040                 :     /* If previous was a bracket group, we may have to replicate it in certain
    4041                 :     cases. */
    4042                 : 
    4043            1680 :     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
    4044                 :              *previous == OP_ONCE || *previous == OP_COND)
    4045                 :       {
    4046                 :       register int i;
    4047             840 :       int ketoffset = 0;
    4048             840 :       int len = code - previous;
    4049             840 :       uschar *bralink = NULL;
    4050                 : 
    4051                 :       /* Repeating a DEFINE group is pointless */
    4052                 : 
    4053             840 :       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
    4054                 :         {
    4055               0 :         *errorcodeptr = ERR55;
    4056               0 :         goto FAILED;
    4057                 :         }
    4058                 : 
    4059                 :       /* If the maximum repeat count is unlimited, find the end of the bracket
    4060                 :       by scanning through from the start, and compute the offset back to it
    4061                 :       from the current code pointer. There may be an OP_OPT setting following
    4062                 :       the final KET, so we can't find the end just by going back from the code
    4063                 :       pointer. */
    4064                 : 
    4065             840 :       if (repeat_max == -1)
    4066                 :         {
    4067              68 :         register uschar *ket = previous;
    4068              74 :         do ket += GET(ket, 1); while (*ket != OP_KET);
    4069              68 :         ketoffset = code - ket;
    4070                 :         }
    4071                 : 
    4072                 :       /* The case of a zero minimum is special because of the need to stick
    4073                 :       OP_BRAZERO in front of it, and because the group appears once in the
    4074                 :       data, whereas in other cases it appears the minimum number of times. For
    4075                 :       this reason, it is simplest to treat this case separately, as otherwise
    4076                 :       the code gets far too messy. There are several special subcases when the
    4077                 :       minimum is zero. */
    4078                 : 
    4079             840 :       if (repeat_min == 0)
    4080                 :         {
    4081                 :         /* If the maximum is also zero, we used to just omit the group from the
    4082                 :         output altogether, like this:
    4083                 : 
    4084                 :         ** if (repeat_max == 0)
    4085                 :         **   {
    4086                 :         **   code = previous;
    4087                 :         **   goto END_REPEAT;
    4088                 :         **   }
    4089                 : 
    4090                 :         However, that fails when a group is referenced as a subroutine from
    4091                 :         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
    4092                 :         so that it is skipped on execution. As we don't have a list of which
    4093                 :         groups are referenced, we cannot do this selectively.
    4094                 : 
    4095                 :         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
    4096                 :         and do no more at this point. However, we do need to adjust any
    4097                 :         OP_RECURSE calls inside the group that refer to the group itself or any
    4098                 :         internal or forward referenced group, because the offset is from the
    4099                 :         start of the whole regex. Temporarily terminate the pattern while doing
    4100                 :         this. */
    4101                 : 
    4102             810 :         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
    4103                 :           {
    4104             810 :           *code = OP_END;
    4105             810 :           adjust_recurse(previous, 1, utf8, cd, save_hwm);
    4106             810 :           memmove(previous+1, previous, len);
    4107             810 :           code++;
    4108             810 :           if (repeat_max == 0)
    4109                 :             {
    4110               0 :             *previous++ = OP_SKIPZERO;
    4111               0 :             goto END_REPEAT;
    4112                 :             }
    4113             810 :           *previous++ = OP_BRAZERO + repeat_type;
    4114                 :           }
    4115                 : 
    4116                 :         /* If the maximum is greater than 1 and limited, we have to replicate
    4117                 :         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
    4118                 :         The first one has to be handled carefully because it's the original
    4119                 :         copy, which has to be moved up. The remainder can be handled by code
    4120                 :         that is common with the non-zero minimum case below. We have to
    4121                 :         adjust the value of repeat_max, since one less copy is required. Once
    4122                 :         again, we may have to adjust any OP_RECURSE calls inside the group. */
    4123                 : 
    4124                 :         else
    4125                 :           {
    4126                 :           int offset;
    4127               0 :           *code = OP_END;
    4128               0 :           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
    4129               0 :           memmove(previous + 2 + LINK_SIZE, previous, len);
    4130               0 :           code += 2 + LINK_SIZE;
    4131               0 :           *previous++ = OP_BRAZERO + repeat_type;
    4132               0 :           *previous++ = OP_BRA;
    4133                 : 
    4134                 :           /* We chain together the bracket offset fields that have to be
    4135                 :           filled in later when the ends of the brackets are reached. */
    4136                 : 
    4137               0 :           offset = (bralink == NULL)? 0 : previous - bralink;
    4138               0 :           bralink = previous;
    4139               0 :           PUTINC(previous, 0, offset);
    4140                 :           }
    4141                 : 
    4142             810 :         repeat_max--;
    4143                 :         }
    4144                 : 
    4145                 :       /* If the minimum is greater than zero, replicate the group as many
    4146                 :       times as necessary, and adjust the maximum to the number of subsequent
    4147                 :       copies that we need. If we set a first char from the group, and didn't
    4148                 :       set a required char, copy the latter from the former. If there are any
    4149                 :       forward reference subroutine calls in the group, there will be entries on
    4150                 :       the workspace list; replicate these with an appropriate increment. */
    4151                 : 
    4152                 :       else
    4153                 :         {
    4154              30 :         if (repeat_min > 1)
    4155                 :           {
    4156                 :           /* In the pre-compile phase, we don't actually do the replication. We
    4157                 :           just adjust the length as if we had. Do some paranoid checks for
    4158                 :           potential integer overflow. */
    4159                 : 
    4160               6 :           if (lengthptr != NULL)
    4161                 :             {
    4162               3 :             int delta = (repeat_min - 1)*length_prevgroup;
    4163               3 :             if ((double)(repeat_min - 1)*(double)length_prevgroup >
    4164                 :                                                             (double)INT_MAX ||
    4165                 :                 OFLOW_MAX - *lengthptr < delta)
    4166                 :               {
    4167               0 :               *errorcodeptr = ERR20;
    4168               0 :               goto FAILED;
    4169                 :               }
    4170               3 :             *lengthptr += delta;
    4171                 :             }
    4172                 : 
    4173                 :           /* This is compiling for real */
    4174                 : 
    4175                 :           else
    4176                 :             {
    4177               3 :             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
    4178               6 :             for (i = 1; i < repeat_min; i++)
    4179                 :               {
    4180                 :               uschar *hc;
    4181               3 :               uschar *this_hwm = cd->hwm;
    4182               3 :               memcpy(code, previous, len);
    4183               3 :               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
    4184                 :                 {
    4185               0 :                 PUT(cd->hwm, 0, GET(hc, 0) + len);
    4186               0 :                 cd->hwm += LINK_SIZE;
    4187                 :                 }
    4188               3 :               save_hwm = this_hwm;
    4189               3 :               code += len;
    4190                 :               }
    4191                 :             }
    4192                 :           }
    4193                 : 
    4194              30 :         if (repeat_max > 0) repeat_max -= repeat_min;
    4195                 :         }
    4196                 : 
    4197                 :       /* This code is common to both the zero and non-zero minimum cases. If
    4198                 :       the maximum is limited, it replicates the group in a nested fashion,
    4199                 :       remembering the bracket starts on a stack. In the case of a zero minimum,
    4200                 :       the first one was set up above. In all cases the repeat_max now specifies
    4201                 :       the number of additional copies needed. Again, we must remember to
    4202                 :       replicate entries on the forward reference list. */
    4203                 : 
    4204             840 :       if (repeat_max >= 0)
    4205                 :         {
    4206                 :         /* In the pre-compile phase, we don't actually do the replication. We
    4207                 :         just adjust the length as if we had. For each repetition we must add 1
    4208                 :         to the length for BRAZERO and for all but the last repetition we must
    4209                 :         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
    4210                 :         paranoid checks to avoid integer overflow. */
    4211                 : 
    4212             772 :         if (lengthptr != NULL && repeat_max > 0)
    4213                 :           {
    4214                 :           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
    4215               0 :                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
    4216               0 :           if ((double)repeat_max *
    4217                 :                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
    4218                 :                   > (double)INT_MAX ||
    4219                 :               OFLOW_MAX - *lengthptr < delta)
    4220                 :             {
    4221               0 :             *errorcodeptr = ERR20;
    4222               0 :             goto FAILED;
    4223                 :             }
    4224               0 :           *lengthptr += delta;
    4225                 :           }
    4226                 : 
    4227                 :         /* This is compiling for real */
    4228                 : 
    4229             772 :         else for (i = repeat_max - 1; i >= 0; i--)
    4230                 :           {
    4231                 :           uschar *hc;
    4232               0 :           uschar *this_hwm = cd->hwm;
    4233                 : 
    4234               0 :           *code++ = OP_BRAZERO + repeat_type;
    4235                 : 
    4236                 :           /* All but the final copy start a new nesting, maintaining the
    4237                 :           chain of brackets outstanding. */
    4238                 : 
    4239               0 :           if (i != 0)
    4240                 :             {
    4241                 :             int offset;
    4242               0 :             *code++ = OP_BRA;
    4243               0 :             offset = (bralink == NULL)? 0 : code - bralink;
    4244               0 :             bralink = code;
    4245               0 :             PUTINC(code, 0, offset);
    4246                 :             }
    4247                 : 
    4248               0 :           memcpy(code, previous, len);
    4249               0 :           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
    4250                 :             {
    4251               0 :             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
    4252               0 :             cd->hwm += LINK_SIZE;
    4253                 :             }
    4254               0 :           save_hwm = this_hwm;
    4255               0 :           code += len;
    4256                 :           }
    4257                 : 
    4258                 :         /* Now chain through the pending brackets, and fill in their length
    4259                 :         fields (which are holding the chain links pro tem). */
    4260                 : 
    4261            1544 :         while (bralink != NULL)
    4262                 :           {
    4263                 :           int oldlinkoffset;
    4264               0 :           int offset = code - bralink + 1;
    4265               0 :           uschar *bra = code - offset;
    4266               0 :           oldlinkoffset = GET(bra, 1);
    4267               0 :           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
    4268               0 :           *code++ = OP_KET;
    4269               0 :           PUTINC(code, 0, offset);
    4270               0 :           PUT(bra, 1, offset);
    4271                 :           }
    4272                 :         }
    4273                 : 
    4274                 :       /* If the maximum is unlimited, set a repeater in the final copy. We
    4275                 :       can't just offset backwards from the current code point, because we
    4276                 :       don't know if there's been an options resetting after the ket. The
    4277                 :       correct offset was computed above.
    4278                 : 
    4279                 :       Then, when we are doing the actual compile phase, check to see whether
    4280                 :       this group is a non-atomic one that could match an empty string. If so,
    4281                 :       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
    4282                 :       that runtime checking can be done. [This check is also applied to
    4283                 :       atomic groups at runtime, but in a different way.] */
    4284                 : 
    4285                 :       else
    4286                 :         {
    4287              68 :         uschar *ketcode = code - ketoffset;
    4288              68 :         uschar *bracode = ketcode - GET(ketcode, 1);
    4289              68 :         *ketcode = OP_KETRMAX + repeat_type;
    4290              68 :         if (lengthptr == NULL && *bracode != OP_ONCE)
    4291                 :           {
    4292              34 :           uschar *scode = bracode;
    4293                 :           do
    4294                 :             {
    4295              38 :             if (could_be_empty_branch(scode, ketcode, utf8))
    4296                 :               {
    4297               7 :               *bracode += OP_SBRA - OP_BRA;
    4298               7 :               break;
    4299                 :               }
    4300              31 :             scode += GET(scode, 1);
    4301                 :             }
    4302              31 :           while (*scode == OP_ALT);
    4303                 :           }
    4304                 :         }
    4305                 :       }
    4306                 : 
    4307                 :     /* If previous is OP_FAIL, it was generated by an empty class [] in
    4308                 :     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
    4309                 :     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
    4310                 :     error above. We can just ignore the repeat in JS case. */
    4311                 : 
    4312               0 :     else if (*previous == OP_FAIL) goto END_REPEAT;
    4313                 : 
    4314                 :     /* Else there's some kind of shambles */
    4315                 : 
    4316                 :     else
    4317                 :       {
    4318               0 :       *errorcodeptr = ERR11;
    4319               0 :       goto FAILED;
    4320                 :       }
    4321                 : 
    4322                 :     /* If the character following a repeat is '+', or if certain optimization
    4323                 :     tests above succeeded, possessive_quantifier is TRUE. For some of the
    4324                 :     simpler opcodes, there is a special alternative opcode for this. For
    4325                 :     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
    4326                 :     The '+' notation is just syntactic sugar, taken from Sun's Java package,
    4327                 :     but the special opcodes can optimize it a bit. The repeated item starts at
    4328                 :     tempcode, not at previous, which might be the first part of a string whose
    4329                 :     (former) last char we repeated.
    4330                 : 
    4331                 :     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
    4332                 :     an 'upto' may follow. We skip over an 'exact' item, and then test the
    4333                 :     length of what remains before proceeding. */
    4334                 : 
    4335           97904 :     if (possessive_quantifier)
    4336                 :       {
    4337                 :       int len;
    4338           48032 :       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
    4339                 :           *tempcode == OP_NOTEXACT)
    4340               0 :         tempcode += _pcre_OP_lengths[*tempcode] +
    4341                 :           ((*tempcode == OP_TYPEEXACT &&
    4342                 :              (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
    4343           48032 :       len = code - tempcode;
    4344           48032 :       if (len > 0) switch (*tempcode)
    4345                 :         {
    4346              32 :         case OP_STAR:  *tempcode = OP_POSSTAR; break;
    4347               2 :         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
    4348               0 :         case OP_QUERY: *tempcode = OP_POSQUERY; break;
    4349               0 :         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
    4350                 : 
    4351              67 :         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
    4352           47923 :         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
    4353               0 :         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
    4354               0 :         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
    4355                 : 
    4356               2 :         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
    4357               6 :         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
    4358               0 :         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
    4359               0 :         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
    4360                 : 
    4361                 :         default:
    4362               0 :         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
    4363               0 :         code += 1 + LINK_SIZE;
    4364               0 :         len += 1 + LINK_SIZE;
    4365               0 :         tempcode[0] = OP_ONCE;
    4366               0 :         *code++ = OP_KET;
    4367               0 :         PUTINC(code, 0, len);
    4368               0 :         PUT(tempcode, 1, len);
    4369                 :         break;
    4370                 :         }
    4371                 :       }
    4372                 : 
    4373                 :     /* In all cases we no longer have a previous item. We also set the
    4374                 :     "follows varying string" flag for subsequently encountered reqbytes if
    4375                 :     it isn't already set and we have just passed a varying length item. */
    4376                 : 
    4377           97910 :     END_REPEAT:
    4378           97910 :     previous = NULL;
    4379           97910 :     cd->req_varyopt |= reqvary;
    4380           97910 :     break;
    4381                 : 
    4382                 : 
    4383                 :     /* ===================================================================*/
    4384                 :     /* Start of nested parenthesized sub-expression, or comment or lookahead or
    4385                 :     lookbehind or option setting or condition or all the other extended
    4386                 :     parenthesis forms.  */
    4387                 : 
    4388                 :     case CHAR_LEFT_PARENTHESIS:
    4389           17630 :     newoptions = options;
    4390           17630 :     skipbytes = 0;
    4391           17630 :     bravalue = OP_CBRA;
    4392           17630 :     save_hwm = cd->hwm;
    4393           17630 :     reset_bracount = FALSE;
    4394                 : 
    4395                 :     /* First deal with various "verbs" that can be introduced by '*'. */
    4396                 : 
    4397           17630 :     if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
    4398                 :       {
    4399                 :       int i, namelen;
    4400               0 :       const char *vn = verbnames;
    4401               0 :       const uschar *name = ++ptr;
    4402               0 :       previous = NULL;
    4403               0 :       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
    4404               0 :       if (*ptr == CHAR_COLON)
    4405                 :         {
    4406               0 :         *errorcodeptr = ERR59;   /* Not supported */
    4407               0 :         goto FAILED;
    4408                 :         }
    4409               0 :       if (*ptr != CHAR_RIGHT_PARENTHESIS)
    4410                 :         {
    4411               0 :         *errorcodeptr = ERR60;
    4412               0 :         goto FAILED;
    4413                 :         }
    4414               0 :       namelen = ptr - name;
    4415               0 :       for (i = 0; i < verbcount; i++)
    4416                 :         {
    4417               0 :         if (namelen == verbs[i].len &&
    4418                 :             strncmp((char *)name, vn, namelen) == 0)
    4419                 :           {
    4420               0 :           *code = verbs[i].op;
    4421               0 :           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
    4422               0 :           break;
    4423                 :           }
    4424               0 :         vn += verbs[i].len + 1;
    4425                 :         }
    4426               0 :       if (i < verbcount) continue;
    4427               0 :       *errorcodeptr = ERR60;
    4428               0 :       goto FAILED;
    4429                 :       }
    4430                 : 
    4431                 :     /* Deal with the extended parentheses; all are introduced by '?', and the
    4432                 :     appearance of any of them means that this is not a capturing group. */
    4433                 : 
    4434           17630 :     else if (*ptr == CHAR_QUESTION_MARK)
    4435                 :       {
    4436                 :       int i, set, unset, namelen;
    4437                 :       int *optset;
    4438                 :       const uschar *name;
    4439                 :       uschar *slot;
    4440                 : 
    4441            2892 :       switch (*(++ptr))
    4442                 :         {
    4443                 :         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
    4444               0 :         ptr++;
    4445               0 :         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
    4446               0 :         if (*ptr == 0)
    4447                 :           {
    4448               0 :           *errorcodeptr = ERR18;
    4449               0 :           goto FAILED;
    4450                 :           }
    4451               0 :         continue;
    4452                 : 
    4453                 : 
    4454                 :         /* ------------------------------------------------------------ */
    4455                 :         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
    4456               0 :         reset_bracount = TRUE;
    4457                 :         /* Fall through */
    4458                 : 
    4459                 :         /* ------------------------------------------------------------ */
    4460                 :         case CHAR_COLON:          /* Non-capturing bracket */
    4461             778 :         bravalue = OP_BRA;
    4462             778 :         ptr++;
    4463             778 :         break;
    4464                 : 
    4465                 : 
    4466                 :         /* ------------------------------------------------------------ */
    4467                 :         case CHAR_LEFT_PARENTHESIS:
    4468               4 :         bravalue = OP_COND;       /* Conditional group */
    4469                 : 
    4470                 :         /* A condition can be an assertion, a number (referring to a numbered
    4471                 :         group), a name (referring to a named group), or 'R', referring to
    4472                 :         recursion. R<digits> and R&name are also permitted for recursion tests.
    4473                 : 
    4474                 :         There are several syntaxes for testing a named group: (?(name)) is used
    4475                 :         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
    4476                 : 
    4477                 :         There are two unfortunate ambiguities, caused by history. (a) 'R' can
    4478                 :         be the recursive thing or the name 'R' (and similarly for 'R' followed
    4479                 :         by digits), and (b) a number could be a name that consists of digits.
    4480                 :         In both cases, we look for a name first; if not found, we try the other
    4481                 :         cases. */
    4482                 : 
    4483                 :         /* For conditions that are assertions, check the syntax, and then exit
    4484                 :         the switch. This will take control down to where bracketed groups,
    4485                 :         including assertions, are processed. */
    4486                 : 
    4487               4 :         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
    4488                 :             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
    4489                 :           break;
    4490                 : 
    4491                 :         /* Most other conditions use OP_CREF (a couple change to OP_RREF
    4492                 :         below), and all need to skip 3 bytes at the start of the group. */
    4493                 : 
    4494               4 :         code[1+LINK_SIZE] = OP_CREF;
    4495               4 :         skipbytes = 3;
    4496               4 :         refsign = -1;
    4497                 : 
    4498                 :         /* Check for a test for recursion in a named group. */
    4499                 : 
    4500               4 :         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
    4501                 :           {
    4502               0 :           terminator = -1;
    4503               0 :           ptr += 2;
    4504               0 :           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
    4505                 :           }
    4506                 : 
    4507                 :         /* Check for a test for a named group's having been set, using the Perl
    4508                 :         syntax (?(<name>) or (?('name') */
    4509                 : 
    4510               4 :         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
    4511                 :           {
    4512               0 :           terminator = CHAR_GREATER_THAN_SIGN;
    4513               0 :           ptr++;
    4514                 :           }
    4515               4 :         else if (ptr[1] == CHAR_APOSTROPHE)
    4516                 :           {
    4517               0 :           terminator = CHAR_APOSTROPHE;
    4518               0 :           ptr++;
    4519                 :           }
    4520                 :         else
    4521                 :           {
    4522               4 :           terminator = 0;
    4523               4 :           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
    4524                 :           }
    4525                 : 
    4526                 :         /* We now expect to read a name; anything else is an error */
    4527                 : 
    4528               4 :         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
    4529                 :           {
    4530               0 :           ptr += 1;  /* To get the right offset */
    4531               0 :           *errorcodeptr = ERR28;
    4532               0 :           goto FAILED;
    4533                 :           }
    4534                 : 
    4535                 :         /* Read the name, but also get it as a number if it's all digits */
    4536                 : 
    4537               4 :         recno = 0;
    4538               4 :         name = ++ptr;
    4539              12 :         while ((cd->ctypes[*ptr] & ctype_word) != 0)
    4540                 :           {
    4541               4 :           if (recno >= 0)
    4542               4 :             recno = ((digitab[*ptr] & ctype_digit) != 0)?
    4543                 :               recno * 10 + *ptr - CHAR_0 : -1;
    4544               4 :           ptr++;
    4545                 :           }
    4546               4 :         namelen = ptr - name;
    4547                 : 
    4548               4 :         if ((terminator > 0 && *ptr++ != terminator) ||
    4549                 :             *ptr++ != CHAR_RIGHT_PARENTHESIS)
    4550                 :           {
    4551               0 :           ptr--;      /* Error offset */
    4552               0 :           *errorcodeptr = ERR26;
    4553               0 :           goto FAILED;
    4554                 :           }
    4555                 : 
    4556                 :         /* Do no further checking in the pre-compile phase. */
    4557                 : 
    4558               4 :         if (lengthptr != NULL) break;
    4559                 : 
    4560                 :         /* In the real compile we do the work of looking for the actual
    4561                 :         reference. If the string started with "+" or "-" we require the rest to
    4562                 :         be digits, in which case recno will be set. */
    4563                 : 
    4564               2 :         if (refsign > 0)
    4565                 :           {
    4566               0 :           if (recno <= 0)
    4567                 :             {
    4568               0 :             *errorcodeptr = ERR58;
    4569               0 :             goto FAILED;
    4570                 :             }
    4571               0 :           recno = (refsign == CHAR_MINUS)?
    4572                 :             cd->bracount - recno + 1 : recno +cd->bracount;
    4573               0 :           if (recno <= 0 || recno > cd->final_bracount)
    4574                 :             {
    4575               0 :             *errorcodeptr = ERR15;
    4576               0 :             goto FAILED;
    4577                 :             }
    4578               0 :           PUT2(code, 2+LINK_SIZE, recno);
    4579               0 :           break;
    4580                 :           }
    4581                 : 
    4582                 :         /* Otherwise (did not start with "+" or "-"), start by looking for the
    4583                 :         name. */
    4584                 : 
    4585               2 :         slot = cd->name_table;
    4586               2 :         for (i = 0; i < cd->names_found; i++)
    4587                 :           {
    4588               0 :           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
    4589               0 :           slot += cd->name_entry_size;
    4590                 :           }
    4591                 : 
    4592                 :         /* Found a previous named subpattern */
    4593                 : 
    4594               2 :         if (i < cd->names_found)
    4595                 :           {
    4596               0 :           recno = GET2(slot, 0);
    4597               0 :           PUT2(code, 2+LINK_SIZE, recno);
    4598                 :           }
    4599                 : 
    4600                 :         /* Search the pattern for a forward reference */
    4601                 : 
    4602               2 :         else if ((i = find_parens(cd, name, namelen,
    4603                 :                         (options & PCRE_EXTENDED) != 0)) > 0)
    4604                 :           {
    4605               0 :           PUT2(code, 2+LINK_SIZE, i);
    4606                 :           }
    4607                 : 
    4608                 :         /* If terminator == 0 it means that the name followed directly after
    4609                 :         the opening parenthesis [e.g. (?(abc)...] and in this case there are
    4610                 :         some further alternatives to try. For the cases where terminator != 0
    4611                 :         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
    4612                 :         now checked all the possibilities, so give an error. */
    4613                 : 
    4614               2 :         else if (terminator != 0)
    4615                 :           {
    4616               0 :           *errorcodeptr = ERR15;
    4617               0 :           goto FAILED;
    4618                 :           }
    4619                 : 
    4620                 :         /* Check for (?(R) for recursion. Allow digits after R to specify a
    4621                 :         specific group number. */
    4622                 : 
    4623               2 :         else if (*name == CHAR_R)
    4624                 :           {
    4625               0 :           recno = 0;
    4626               0 :           for (i = 1; i < namelen; i++)
    4627                 :             {
    4628               0 :             if ((digitab[name[i]] & ctype_digit) == 0)
    4629                 :               {
    4630               0 :               *errorcodeptr = ERR15;
    4631               0 :               goto FAILED;
    4632                 :               }
    4633               0 :             recno = recno * 10 + name[i] - CHAR_0;
    4634                 :             }
    4635               0 :           if (recno == 0) recno = RREF_ANY;
    4636               0 :           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
    4637               0 :           PUT2(code, 2+LINK_SIZE, recno);
    4638                 :           }
    4639                 : 
    4640                 :         /* Similarly, check for the (?(DEFINE) "condition", which is always
    4641                 :         false. */
    4642                 : 
    4643               2 :         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
    4644                 :           {
    4645               0 :           code[1+LINK_SIZE] = OP_DEF;
    4646               0 :           skipbytes = 1;
    4647                 :           }
    4648                 : 
    4649                 :         /* Check for the "name" actually being a subpattern number. We are
    4650                 :         in the second pass here, so final_bracount is set. */
    4651                 : 
    4652               4 :         else if (recno > 0 && recno <= cd->final_bracount)
    4653                 :           {
    4654               2 :           PUT2(code, 2+LINK_SIZE, recno);
    4655                 :           }
    4656                 : 
    4657                 :         /* Either an unidentified subpattern, or a reference to (?(0) */
    4658                 : 
    4659                 :         else
    4660                 :           {
    4661               0 :           *errorcodeptr = (recno == 0)? ERR35: ERR15;
    4662               0 :           goto FAILED;
    4663                 :           }
    4664               2 :         break;
    4665                 : 
    4666                 : 
    4667                 :         /* ------------------------------------------------------------ */
    4668                 :         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
    4669              10 :         bravalue = OP_ASSERT;
    4670              10 :         ptr++;
    4671              10 :         break;
    4672                 : 
    4673                 : 
    4674                 :         /* ------------------------------------------------------------ */
    4675                 :         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
    4676               4 :         ptr++;
    4677               4 :         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
    4678                 :           {
    4679               0 :           *code++ = OP_FAIL;
    4680               0 :           previous = NULL;
    4681               0 :           continue;
    4682                 :           }
    4683               4 :         bravalue = OP_ASSERT_NOT;
    4684               4 :         break;
    4685                 : 
    4686                 : 
    4687                 :         /* ------------------------------------------------------------ */
    4688                 :         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
    4689               4 :         switch (ptr[1])
    4690                 :           {
    4691                 :           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
    4692               0 :           bravalue = OP_ASSERTBACK;
    4693               0 :           ptr += 2;
    4694               0 :           break;
    4695                 : 
    4696                 :           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
    4697               4 :           bravalue = OP_ASSERTBACK_NOT;
    4698               4 :           ptr += 2;
    4699               4 :           break;
    4700                 : 
    4701                 :           default:                /* Could be name define, else bad */
    4702               0 :           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
    4703               0 :           ptr++;                  /* Correct offset for error */
    4704               0 :           *errorcodeptr = ERR24;
    4705               0 :           goto FAILED;
    4706                 :           }
    4707               4 :         break;
    4708                 : 
    4709                 : 
    4710                 :         /* ------------------------------------------------------------ */
    4711                 :         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
    4712               4 :         bravalue = OP_ONCE;
    4713               4 :         ptr++;
    4714               4 :         break;
    4715                 : 
    4716                 : 
    4717                 :         /* ------------------------------------------------------------ */
    4718                 :         case CHAR_C:                 /* Callout - may be followed by digits; */
    4719               0 :         previous_callout = code;  /* Save for later completion */
    4720               0 :         after_manual_callout = 1; /* Skip one item before completing */
    4721               0 :         *code++ = OP_CALLOUT;
    4722                 :           {
    4723               0 :           int n = 0;
    4724               0 :           while ((digitab[*(++ptr)] & ctype_digit) != 0)
    4725               0 :             n = n * 10 + *ptr - CHAR_0;
    4726               0 :           if (*ptr != CHAR_RIGHT_PARENTHESIS)
    4727                 :             {
    4728               0 :             *errorcodeptr = ERR39;
    4729               0 :             goto FAILED;
    4730                 :             }
    4731               0 :           if (n > 255)
    4732                 :             {
    4733               0 :             *errorcodeptr = ERR38;
    4734               0 :             goto FAILED;
    4735                 :             }
    4736               0 :           *code++ = n;
    4737               0 :           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
    4738               0 :           PUT(code, LINK_SIZE, 0);                    /* Default length */
    4739               0 :           code += 2 * LINK_SIZE;
    4740                 :           }
    4741               0 :         previous = NULL;
    4742               0 :         continue;
    4743                 : 
    4744                 : 
    4745                 :         /* ------------------------------------------------------------ */
    4746                 :         case CHAR_P:              /* Python-style named subpattern handling */
    4747            2082 :         if (*(++ptr) == CHAR_EQUALS_SIGN ||
    4748                 :             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
    4749                 :           {
    4750               0 :           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
    4751               0 :           terminator = CHAR_RIGHT_PARENTHESIS;
    4752               0 :           goto NAMED_REF_OR_RECURSE;
    4753                 :           }
    4754            2082 :         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
    4755                 :           {
    4756               0 :           *errorcodeptr = ERR41;
    4757               0 :           goto FAILED;
    4758                 :           }
    4759                 :         /* Fall through to handle (?P< as (?< is handled */
    4760                 : 
    4761                 : 
    4762                 :         /* ------------------------------------------------------------ */
    4763            2082 :         DEFINE_NAME:    /* Come here from (?< handling */
    4764                 :         case CHAR_APOSTROPHE:
    4765                 :           {
    4766            2082 :           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
    4767                 :             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
    4768            2082 :           name = ++ptr;
    4769                 : 
    4770            2082 :           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
    4771            2082 :           namelen = ptr - name;
    4772                 : 
    4773                 :           /* In the pre-compile phase, just do a syntax check. */
    4774                 : 
    4775            2082 :           if (lengthptr != NULL)
    4776                 :             {
    4777            1041 :             if (*ptr != terminator)
    4778                 :               {
    4779               0 :               *errorcodeptr = ERR42;
    4780               0 :               goto FAILED;
    4781                 :               }
    4782            1041 :             if (cd->names_found >= MAX_NAME_COUNT)
    4783                 :               {
    4784               0 :               *errorcodeptr = ERR49;
    4785               0 :               goto FAILED;
    4786                 :               }
    4787            1041 :             if (namelen + 3 > cd->name_entry_size)
    4788                 :               {
    4789              14 :               cd->name_entry_size = namelen + 3;
    4790              14 :               if (namelen > MAX_NAME_SIZE)
    4791                 :                 {
    4792               0 :                 *errorcodeptr = ERR48;
    4793               0 :                 goto FAILED;
    4794                 :                 }
    4795                 :               }
    4796                 :             }
    4797                 : 
    4798                 :           /* In the real compile, create the entry in the table */
    4799                 : 
    4800                 :           else
    4801                 :             {
    4802            1041 :             slot = cd->name_table;
    4803          457021 :             for (i = 0; i < cd->names_found; i++)
    4804                 :               {
    4805          456981 :               int crc = memcmp(name, slot+2, namelen);
    4806          456981 :               if (crc == 0)
    4807                 :                 {
    4808               0 :                 if (slot[2+namelen] == 0)
    4809                 :                   {
    4810               0 :                   if ((options & PCRE_DUPNAMES) == 0)
    4811                 :                     {
    4812               0 :                     *errorcodeptr = ERR43;
    4813               0 :                     goto FAILED;
    4814                 :                     }
    4815                 :                   }
    4816               0 :                 else crc = -1;      /* Current name is substring */
    4817                 :                 }
    4818          456981 :               if (crc < 0)
    4819                 :                 {
    4820            1001 :                 memmove(slot + cd->name_entry_size, slot,
    4821                 :                   (cd->names_found - i) * cd->name_entry_size);
    4822            1001 :                 break;
    4823                 :                 }
    4824          455980 :               slot += cd->name_entry_size;
    4825                 :               }
    4826                 : 
    4827            1041 :             PUT2(slot, 0, cd->bracount + 1);
    4828            1041 :             memcpy(slot + 2, name, namelen);
    4829            1041 :             slot[2+namelen] = 0;
    4830                 :             }
    4831                 :           }
    4832                 : 
    4833                 :         /* In both cases, count the number of names we've encountered. */
    4834                 : 
    4835            2082 :         ptr++;                    /* Move past > or ' */
    4836            2082 :         cd->names_found++;
    4837            2082 :         goto NUMBERED_GROUP;
    4838                 : 
    4839                 : 
    4840                 :         /* ------------------------------------------------------------ */
    4841                 :         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
    4842               0 :         terminator = CHAR_RIGHT_PARENTHESIS;
    4843               0 :         is_recurse = TRUE;
    4844                 :         /* Fall through */
    4845                 : 
    4846                 :         /* We come here from the Python syntax above that handles both
    4847                 :         references (?P=name) and recursion (?P>name), as well as falling
    4848                 :         through from the Perl recursion syntax (?&name). We also come here from
    4849                 :         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
    4850                 :         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
    4851                 : 
    4852               0 :         NAMED_REF_OR_RECURSE:
    4853               0 :         name = ++ptr;
    4854               0 :         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
    4855               0 :         namelen = ptr - name;
    4856                 : 
    4857                 :         /* In the pre-compile phase, do a syntax check and set a dummy
    4858                 :         reference number. */
    4859                 : 
    4860               0 :         if (lengthptr != NULL)
    4861                 :           {
    4862               0 :           if (namelen == 0)
    4863                 :             {
    4864               0 :             *errorcodeptr = ERR62;
    4865               0 :             goto FAILED;
    4866                 :             }
    4867               0 :           if (*ptr != terminator)
    4868                 :             {
    4869               0 :             *errorcodeptr = ERR42;
    4870               0 :             goto FAILED;
    4871                 :             }
    4872               0 :           if (namelen > MAX_NAME_SIZE)
    4873                 :             {
    4874               0 :             *errorcodeptr = ERR48;
    4875               0 :             goto FAILED;
    4876                 :             }
    4877               0 :           recno = 0;
    4878                 :           }
    4879                 : 
    4880                 :         /* In the real compile, seek the name in the table. We check the name
    4881                 :         first, and then check that we have reached the end of the name in the
    4882                 :         table. That way, if the name is longer than any in the table,
    4883                 :         the comparison will fail without reading beyond the table entry. */
    4884                 : 
    4885                 :         else
    4886                 :           {
    4887               0 :           slot = cd->name_table;
    4888               0 :           for (i = 0; i < cd->names_found; i++)
    4889                 :             {
    4890               0 :             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
    4891                 :                 slot[2+namelen] == 0)
    4892               0 :               break;
    4893               0 :             slot += cd->name_entry_size;
    4894                 :             }
    4895                 : 
    4896               0 :           if (i < cd->names_found)         /* Back reference */
    4897                 :             {
    4898               0 :             recno = GET2(slot, 0);
    4899                 :             }
    4900               0 :           else if ((recno =                /* Forward back reference */
    4901                 :                     find_parens(cd, name, namelen,
    4902                 :                       (options & PCRE_EXTENDED) != 0)) <= 0)
    4903                 :             {
    4904               0 :             *errorcodeptr = ERR15;
    4905               0 :             goto FAILED;
    4906                 :             }
    4907                 :           }
    4908                 : 
    4909                 :         /* In both phases, we can now go to the code that handles numerical
    4910                 :         recursion or backreferences. */
    4911                 : 
    4912               0 :         if (is_recurse) goto HANDLE_RECURSION;
    4913               0 :           else goto HANDLE_REFERENCE;
    4914                 : 
    4915                 : 
    4916                 :         /* ------------------------------------------------------------ */
    4917                 :         case CHAR_R:              /* Recursion */
    4918               4 :         ptr++;                    /* Same as (?0)      */
    4919                 :         /* Fall through */
    4920                 : 
    4921                 : 
    4922                 :         /* ------------------------------------------------------------ */
    4923                 :         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
    4924                 :         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
    4925                 :         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
    4926                 :           {
    4927                 :           const uschar *called;
    4928               4 :           terminator = CHAR_RIGHT_PARENTHESIS;
    4929                 : 
    4930                 :           /* Come here from the \g<...> and \g'...' code (Oniguruma
    4931                 :           compatibility). However, the syntax has been checked to ensure that
    4932                 :           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
    4933                 :           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
    4934                 :           ever be taken. */
    4935                 : 
    4936               4 :           HANDLE_NUMERICAL_RECURSION:
    4937                 : 
    4938               4 :           if ((refsign = *ptr) == CHAR_PLUS)
    4939                 :             {
    4940               0 :             ptr++;
    4941               0 :             if ((digitab[*ptr] & ctype_digit) == 0)
    4942                 :               {
    4943               0 :               *errorcodeptr = ERR63;
    4944               0 :               goto FAILED;
    4945                 :               }
    4946                 :             }
    4947               4 :           else if (refsign == CHAR_MINUS)
    4948                 :             {
    4949               0 :             if ((digitab[ptr[1]] & ctype_digit) == 0)
    4950               0 :               goto OTHER_CHAR_AFTER_QUERY;
    4951               0 :             ptr++;
    4952                 :             }
    4953                 : 
    4954               4 :           recno = 0;
    4955               8 :           while((digitab[*ptr] & ctype_digit) != 0)
    4956               0 :             recno = recno * 10 + *ptr++ - CHAR_0;
    4957                 : 
    4958               4 :           if (*ptr != terminator)
    4959                 :             {
    4960               0 :             *errorcodeptr = ERR29;
    4961               0 :             goto FAILED;
    4962                 :             }
    4963                 : 
    4964               4 :           if (refsign == CHAR_MINUS)
    4965                 :             {
    4966               0 :             if (recno == 0)
    4967                 :               {
    4968               0 :               *errorcodeptr = ERR58;
    4969               0 :               goto FAILED;
    4970                 :               }
    4971               0 :             recno = cd->bracount - recno + 1;
    4972               0 :             if (recno <= 0)
    4973                 :               {
    4974               0 :               *errorcodeptr = ERR15;
    4975               0 :               goto FAILED;
    4976                 :               }
    4977                 :             }
    4978               4 :           else if (refsign == CHAR_PLUS)
    4979                 :             {
    4980               0 :             if (recno == 0)
    4981                 :               {
    4982               0 :               *errorcodeptr = ERR58;
    4983               0 :               goto FAILED;
    4984                 :               }
    4985               0 :             recno += cd->bracount;
    4986                 :             }
    4987                 : 
    4988                 :           /* Come here from code above that handles a named recursion */
    4989                 : 
    4990               4 :           HANDLE_RECURSION:
    4991                 : 
    4992               4 :           previous = code;
    4993               4 :           called = cd->start_code;
    4994                 : 
    4995                 :           /* When we are actually compiling, find the bracket that is being
    4996                 :           referenced. Temporarily end the regex in case it doesn't exist before
    4997                 :           this point. If we end up with a forward reference, first check that
    4998                 :           the bracket does occur later so we can give the error (and position)
    4999                 :           now. Then remember this forward reference in the workspace so it can
    5000                 :           be filled in at the end. */
    5001                 : 
    5002               4 :           if (lengthptr == NULL)
    5003                 :             {
    5004               2 :             *code = OP_END;
    5005               2 :             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
    5006                 : 
    5007                 :             /* Forward reference */
    5008                 : 
    5009               2 :             if (called == NULL)
    5010                 :               {
    5011               0 :               if (find_parens(cd, NULL, recno,
    5012                 :                     (options & PCRE_EXTENDED) != 0) < 0)
    5013                 :                 {
    5014               0 :                 *errorcodeptr = ERR15;
    5015               0 :                 goto FAILED;
    5016                 :                 }
    5017               0 :               called = cd->start_code + recno;
    5018               0 :               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
    5019                 :               }
    5020                 : 
    5021                 :             /* If not a forward reference, and the subpattern is still open,
    5022                 :             this is a recursive call. We check to see if this is a left
    5023                 :             recursion that could loop for ever, and diagnose that case. */
    5024                 : 
    5025               2 :             else if (GET(called, 1) == 0 &&
    5026                 :                      could_be_empty(called, code, bcptr, utf8))
    5027                 :               {
    5028               0 :               *errorcodeptr = ERR40;
    5029               0 :               goto FAILED;
    5030                 :               }
    5031                 :             }
    5032                 : 
    5033                 :           /* Insert the recursion/subroutine item, automatically wrapped inside
    5034                 :           "once" brackets. Set up a "previous group" length so that a
    5035                 :           subsequent quantifier will work. */
    5036                 : 
    5037               4 :           *code = OP_ONCE;
    5038               4 :           PUT(code, 1, 2 + 2*LINK_SIZE);
    5039               4 :           code += 1 + LINK_SIZE;
    5040                 : 
    5041               4 :           *code = OP_RECURSE;
    5042               4 :           PUT(code, 1, called - cd->start_code);
    5043               4 :           code += 1 + LINK_SIZE;
    5044                 : 
    5045               4 :           *code = OP_KET;
    5046               4 :           PUT(code, 1, 2 + 2*LINK_SIZE);
    5047               4 :           code += 1 + LINK_SIZE;
    5048                 : 
    5049               4 :           length_prevgroup = 3 + 3*LINK_SIZE;
    5050                 :           }
    5051                 : 
    5052                 :         /* Can't determine a first byte now */
    5053                 : 
    5054               4 :         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
    5055               4 :         continue;
    5056                 : 
    5057                 : 
    5058                 :         /* ------------------------------------------------------------ */
    5059                 :         default:              /* Other characters: check option setting */
    5060               2 :         OTHER_CHAR_AFTER_QUERY:
    5061               2 :         set = unset = 0;
    5062               2 :         optset = &set;
    5063                 : 
    5064               6 :         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
    5065                 :           {
    5066               2 :           switch (*ptr++)
    5067                 :             {
    5068               0 :             case CHAR_MINUS: optset = &unset; break;
    5069                 : 
    5070                 :             case CHAR_J:    /* Record that it changed in the external options */
    5071               0 :             *optset |= PCRE_DUPNAMES;
    5072               0 :             cd->external_flags |= PCRE_JCHANGED;
    5073               0 :             break;
    5074                 : 
    5075               0 :             case CHAR_i: *optset |= PCRE_CASELESS; break;
    5076               0 :             case CHAR_m: *optset |= PCRE_MULTILINE; break;
    5077               0 :             case CHAR_s: *optset |= PCRE_DOTALL; break;
    5078               0 :             case CHAR_x: *optset |= PCRE_EXTENDED; break;
    5079               2 :             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
    5080               0 :             case CHAR_X: *optset |= PCRE_EXTRA; break;
    5081                 : 
    5082               0 :             default:  *errorcodeptr = ERR12;
    5083               0 :                       ptr--;    /* Correct the offset */
    5084               0 :                       goto FAILED;
    5085                 :             }
    5086                 :           }
    5087                 : 
    5088                 :         /* Set up the changed option bits, but don't change anything yet. */
    5089                 : 
    5090               2 :         newoptions = (options | set) & (~unset);
    5091                 : 
    5092                 :         /* If the options ended with ')' this is not the start of a nested
    5093                 :         group with option changes, so the options change at this level. If this
    5094                 :         item is right at the start of the pattern, the options can be
    5095                 :         abstracted and made external in the pre-compile phase, and ignored in
    5096                 :         the compile phase. This can be helpful when matching -- for instance in
    5097                 :         caseless checking of required bytes.
    5098                 : 
    5099                 :         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
    5100                 :         definitely *not* at the start of the pattern because something has been
    5101                 :         compiled. In the pre-compile phase, however, the code pointer can have
    5102                 :         that value after the start, because it gets reset as code is discarded
    5103                 :         during the pre-compile. However, this can happen only at top level - if
    5104                 :         we are within parentheses, the starting BRA will still be present. At
    5105                 :         any parenthesis level, the length value can be used to test if anything
    5106                 :         has been compiled at that level. Thus, a test for both these conditions
    5107                 :         is necessary to ensure we correctly detect the start of the pattern in
    5108                 :         both phases.
    5109                 : 
    5110                 :         If we are not at the pattern start, compile code to change the ims
    5111                 :         options if this setting actually changes any of them, and reset the
    5112                 :         greedy defaults and the case value for firstbyte and reqbyte. */
    5113                 : 
    5114               2 :         if (*ptr == CHAR_RIGHT_PARENTHESIS)
    5115                 :           {
    5116               4 :           if (code == cd->start_code + 1 + LINK_SIZE &&
    5117                 :                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
    5118                 :             {
    5119               2 :             cd->external_options = newoptions;
    5120                 :             }
    5121                 :          else
    5122                 :             {
    5123               0 :             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
    5124                 :               {
    5125               0 :               *code++ = OP_OPT;
    5126               0 :               *code++ = newoptions & PCRE_IMS;
    5127                 :               }
    5128               0 :             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
    5129               0 :             greedy_non_default = greedy_default ^ 1;
    5130               0 :             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
    5131                 :             }
    5132                 : 
    5133                 :           /* Change options at this level, and pass them back for use
    5134                 :           in subsequent branches. When not at the start of the pattern, this
    5135                 :           information is also necessary so that a resetting item can be
    5136                 :           compiled at the end of a group (if we are in a group). */
    5137                 : 
    5138               2 :           *optionsptr = options = newoptions;
    5139               2 :           previous = NULL;       /* This item can't be repeated */
    5140               2 :           continue;              /* It is complete */
    5141                 :           }
    5142                 : 
    5143                 :         /* If the options ended with ':' we are heading into a nested group
    5144                 :         with possible change of options. Such groups are non-capturing and are
    5145                 :         not assertions of any kind. All we need to do is skip over the ':';
    5146                 :         the newoptions value is handled below. */
    5147                 : 
    5148               0 :         bravalue = OP_BRA;
    5149               0 :         ptr++;
    5150                 :         }     /* End of switch for character following (? */
    5151                 :       }       /* End of (? handling */
    5152                 : 
    5153                 :     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
    5154                 :     all unadorned brackets become non-capturing and behave like (?:...)
    5155                 :     brackets. */
    5156                 : 
    5157           14738 :     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
    5158                 :       {
    5159               0 :       bravalue = OP_BRA;
    5160                 :       }
    5161                 : 
    5162                 :     /* Else we have a capturing group. */
    5163                 : 
    5164                 :     else
    5165                 :       {
    5166           16820 :       NUMBERED_GROUP:
    5167           16820 :       cd->bracount += 1;
    5168           16820 :       PUT2(code, 1+LINK_SIZE, cd->bracount);
    5169           16820 :       skipbytes = 2;
    5170                 :       }
    5171                 : 
    5172                 :     /* Process nested bracketed regex. Assertions may not be repeated, but
    5173                 :     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
    5174                 :     non-register variable in order to be able to pass its address because some
    5175                 :     compilers complain otherwise. Pass in a new setting for the ims options if
    5176                 :     they have changed. */
    5177                 : 
    5178           17624 :     previous = (bravalue >= OP_ONCE)? code : NULL;
    5179           17624 :     *code = bravalue;
    5180           17624 :     tempcode = code;
    5181           17624 :     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
    5182           17624 :     length_prevgroup = 0;              /* Initialize for pre-compile phase */
    5183                 : 
    5184           17624 :     if (!compile_regex(
    5185                 :          newoptions,                   /* The complete new option state */
    5186                 :          options & PCRE_IMS,           /* The previous ims option state */
    5187                 :          &tempcode,                    /* Where to put code (updated) */
    5188                 :          &ptr,                         /* Input pointer (updated) */
    5189                 :          errorcodeptr,                 /* Where to put an error message */
    5190                 :          (bravalue == OP_ASSERTBACK ||
    5191                 :           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
    5192                 :          reset_bracount,               /* True if (?| group */
    5193                 :          skipbytes,                    /* Skip over bracket number */
    5194                 :          &subfirstbyte,                /* For possible first char */
    5195                 :          &subreqbyte,                  /* For possible last char */
    5196                 :          bcptr,                        /* Current branch chain */
    5197                 :          cd,                           /* Tables block */
    5198                 :          (lengthptr == NULL)? NULL :   /* Actual compile phase */
    5199                 :            &length_prevgroup           /* Pre-compile phase */
    5200                 :          ))
    5201               0 :       goto FAILED;
    5202                 : 
    5203                 :     /* At the end of compiling, code is still pointing to the start of the
    5204                 :     group, while tempcode has been updated to point past the end of the group
    5205                 :     and any option resetting that may follow it. The pattern pointer (ptr)
    5206                 :     is on the bracket. */
    5207                 : 
    5208                 :     /* If this is a conditional bracket, check that there are no more than
    5209                 :     two branches in the group, or just one if it's a DEFINE group. We do this
    5210                 :     in the real compile phase, not in the pre-pass, where the whole group may
    5211                 :     not be available. */
    5212                 : 
    5213           17624 :     if (bravalue == OP_COND && lengthptr == NULL)
    5214                 :       {
    5215               2 :       uschar *tc = code;
    5216               2 :       int condcount = 0;
    5217                 : 
    5218                 :       do {
    5219               4 :          condcount++;
    5220               4 :          tc += GET(tc,1);
    5221                 :          }
    5222               4 :       while (*tc != OP_KET);
    5223                 : 
    5224                 :       /* A DEFINE group is never obeyed inline (the "condition" is always
    5225                 :       false). It must have only one branch. */
    5226                 : 
    5227               2 :       if (code[LINK_SIZE+1] == OP_DEF)
    5228                 :         {
    5229               0 :         if (condcount > 1)
    5230                 :           {
    5231               0 :           *errorcodeptr = ERR54;
    5232               0 :           goto FAILED;
    5233                 :           }
    5234               0 :         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
    5235                 :         }
    5236                 : 
    5237                 :       /* A "normal" conditional group. If there is just one branch, we must not
    5238                 :       make use of its firstbyte or reqbyte, because this is equivalent to an
    5239                 :       empty second branch. */
    5240                 : 
    5241                 :       else
    5242                 :         {
    5243               2 :         if (condcount > 2)
    5244                 :           {
    5245               0 :           *errorcodeptr = ERR27;
    5246               0 :           goto FAILED;
    5247                 :           }
    5248               2 :         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
    5249                 :         }
    5250                 :       }
    5251                 : 
    5252                 :     /* Error if hit end of pattern */
    5253                 : 
    5254           17624 :     if (*ptr != CHAR_RIGHT_PARENTHESIS)
    5255                 :       {
    5256               0 :       *errorcodeptr = ERR14;
    5257               0 :       goto FAILED;
    5258                 :       }
    5259                 : 
    5260                 :     /* In the pre-compile phase, update the length by the length of the group,
    5261                 :     less the brackets at either end. Then reduce the compiled code to just a
    5262                 :     set of non-capturing brackets so that it doesn't use much memory if it is
    5263                 :     duplicated by a quantifier. */
    5264                 : 
    5265           17624 :     if (lengthptr != NULL)
    5266                 :       {
    5267            8812 :       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
    5268                 :         {
    5269               0 :         *errorcodeptr = ERR20;
    5270               0 :         goto FAILED;
    5271                 :         }
    5272            8812 :       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
    5273            8812 :       *code++ = OP_BRA;
    5274            8812 :       PUTINC(code, 0, 1 + LINK_SIZE);
    5275            8812 :       *code++ = OP_KET;
    5276            8812 :       PUTINC(code, 0, 1 + LINK_SIZE);
    5277            8812 :       break;    /* No need to waste time with special character handling */
    5278                 :       }
    5279                 : 
    5280                 :     /* Otherwise update the main code pointer to the end of the group. */
    5281                 : 
    5282            8812 :     code = tempcode;
    5283                 : 
    5284                 :     /* For a DEFINE group, required and first character settings are not
    5285                 :     relevant. */
    5286                 : 
    5287            8812 :     if (bravalue == OP_DEF) break;
    5288                 : 
    5289                 :     /* Handle updating of the required and first characters for other types of
    5290                 :     group. Update for normal brackets of all kinds, and conditions with two
    5291                 :     branches (see code above). If the bracket is followed by a quantifier with
    5292                 :     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
    5293                 :     zerofirstbyte outside the main loop so that they can be accessed for the
    5294                 :     back off. */
    5295                 : 
    5296            8812 :     zeroreqbyte = reqbyte;
    5297            8812 :     zerofirstbyte = firstbyte;
    5298            8812 :     groupsetfirstbyte = FALSE;
    5299                 : 
    5300            8812 :     if (bravalue >= OP_ONCE)
    5301                 :       {
    5302                 :       /* If we have not yet set a firstbyte in this branch, take it from the
    5303                 :       subpattern, remembering that it was set here so that a repeat of more
    5304                 :       than one can replicate it as reqbyte if necessary. If the subpattern has
    5305                 :       no firstbyte, set "none" for the whole branch. In both cases, a zero
    5306