rubbos/app/httpd-2.0.64/srclib/pcre/pcre.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /*
   6 This is a library of functions to support regular expressions whose syntax
   7 and semantics are as close as possible to those of the Perl 5 language. See
   8 the file Tech.Notes for some information on the internals.
   9
  10 Written by: Philip Hazel <ph10@cam.ac.uk>
  11
  12            Copyright (c) 1997-2001 University of Cambridge
  13
  14 -----------------------------------------------------------------------------
  15 Permission is granted to anyone to use this software for any purpose on any
  16 computer system, and to redistribute it freely, subject to the following
  17 restrictions:
  18
  19 1. This software is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  22
  23 2. The origin of this software must not be misrepresented, either by
  24    explicit claim or by omission.
  25
  26 3. Altered versions must be plainly marked as such, and must not be
  27    misrepresented as being the original software.
  28
  29 4. If PCRE is embedded in any software that is released under the GNU
  30    General Purpose Licence (GPL), then the terms of that licence shall
  31    supersede any condition above with which it is incompatible.
  32 -----------------------------------------------------------------------------
  33 */
  34
  35
  36 /* Define DEBUG to get debugging output on stdout. */
  37
  38 /* #define DEBUG */
  39
  40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
  41 inline, and there are *still* stupid compilers about that don't like indented
  42 pre-processor statements. I suppose it's only been 10 years... */
  43
  44 #ifdef DEBUG
  45 #define DPRINTF(p) printf p
  46 #else
  47 #define DPRINTF(p) /*nothing*/
  48 #endif
  49
  50 /* Include the internals header, which itself includes Standard C headers plus
  51 the external pcre header. */
  52
  53 #include "internal.h"
  54
  55
  56 /* Allow compilation as C++ source code, should anybody want to do that. */
  57
  58 #ifdef __cplusplus
  59 #define class pcre_class
  60 #endif
  61
  62
  63 /* Maximum number of items on the nested bracket stacks at compile time. This
  64 applies to the nesting of all kinds of parentheses. It does not limit
  65 un-nested, non-capturing parentheses. This number can be made bigger if
  66 necessary - it is used to dimension one int and one unsigned char vector at
  67 compile time. */
  68
  69 #define BRASTACK_SIZE 200
  70
  71
  72 /* The number of bytes in a literal character string above which we can't add
  73 any more is different when UTF-8 characters may be encountered. */
  74
  75 #ifdef SUPPORT_UTF8
  76 #define MAXLIT 250
  77 #else
  78 #define MAXLIT 255
  79 #endif
  80
  81
  82 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
  83
  84 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
  85 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
  86
  87 /* Text forms of OP_ values and things, for debugging (not all used) */
  88
  89 #ifdef DEBUG
  90 static const char *OP_names[] = {
  91   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
  92   "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  93   "Opt", "^", "$", "Any", "chars", "not",
  94   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  95   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  96   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  97   "*", "*?", "+", "+?", "?", "??", "{", "{",
  98   "class", "Ref", "Recurse",
  99   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
 100   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
 101   "Brazero", "Braminzero", "Branumber", "Bra"
 102 };
 103 #endif
 104
 105 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 106 are simple data values; negative values are for special things like \d and so
 107 on. Zero means further processing is needed (for things like \x), or the escape
 108 is invalid. */
 109
 110 static const short int escapes[] = {
 111     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 112     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 113   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
 114     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
 115     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
 116     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 117   '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
 118     0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
 119     0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
 120     0,      0, -ESC_z                                            /* x - z */
 121 };
 122
 123 /* Tables of names of POSIX character classes and their lengths. The list is
 124 terminated by a zero length entry. The first three must be alpha, upper, lower,
 125 as this is assumed for handling case independence. */
 126
 127 static const char *posix_names[] = {
 128   "alpha", "lower", "upper",
 129   "alnum", "ascii", "cntrl", "digit", "graph",
 130   "print", "punct", "space", "word",  "xdigit" };
 131
 132 static const uschar posix_name_lengths[] = {
 133   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 134
 135 /* Table of class bit maps for each POSIX class; up to three may be combined
 136 to form the class. */
 137
 138 static const int posix_class_maps[] = {
 139   cbit_lower, cbit_upper, -1,             /* alpha */
 140   cbit_lower, -1,         -1,             /* lower */
 141   cbit_upper, -1,         -1,             /* upper */
 142   cbit_digit, cbit_lower, cbit_upper,     /* alnum */
 143   cbit_print, cbit_cntrl, -1,             /* ascii */
 144   cbit_cntrl, -1,         -1,             /* cntrl */
 145   cbit_digit, -1,         -1,             /* digit */
 146   cbit_graph, -1,         -1,             /* graph */
 147   cbit_print, -1,         -1,             /* print */
 148   cbit_punct, -1,         -1,             /* punct */
 149   cbit_space, -1,         -1,             /* space */
 150   cbit_word,  -1,         -1,             /* word */
 151   cbit_xdigit,-1,         -1              /* xdigit */
 152 };
 153
 154
 155 /* Definition to allow mutual recursion */
 156
 157 static BOOL
 158   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
 159     BOOL, int, int *, int *, compile_data *);
 160
 161 /* Structure for building a chain of data that actually lives on the
 162 stack, for holding the values of the subject pointer at the start of each
 163 subpattern, so as to detect when an empty string has been matched by a
 164 subpattern - to break infinite loops. */
 165
 166 typedef struct eptrblock {
 167   struct eptrblock *prev;
 168   const uschar *saved_eptr;
 169 } eptrblock;
 170
 171 /* Flag bits for the match() function */
 172
 173 #define match_condassert   0x01    /* Called to check a condition assertion */
 174 #define match_isgroup      0x02    /* Set if start of bracketed group */
 175
 176
 177
 178 /*************************************************
 179 *               Global variables                 *
 180 *************************************************/
 181
 182 /* PCRE is thread-clean and doesn't use any global variables in the normal
 183 sense. However, it calls memory allocation and free functions via the two
 184 indirections below, which are can be changed by the caller, but are shared
 185 between all threads. */
 186
 187 void *(*pcre_malloc)(size_t) = malloc;
 188 void  (*pcre_free)(void *) = free;
 189
 190
 191
 192 /*************************************************
 193 *    Macros and tables for character handling    *
 194 *************************************************/
 195
 196 /* When UTF-8 encoding is being used, a character is no longer just a single
 197 byte. The macros for character handling generate simple sequences when used in
 198 byte-mode, and more complicated ones for UTF-8 characters. */
 199
 200 #ifndef SUPPORT_UTF8
 201 #define GETCHARINC(c, eptr) c = *eptr++;
 202 #define GETCHARLEN(c, eptr, len) c = *eptr;
 203 #define BACKCHAR(eptr)
 204
 205 #else   /* SUPPORT_UTF8 */
 206
 207 /* Get the next UTF-8 character, advancing the pointer */
 208
 209 #define GETCHARINC(c, eptr) \
 210   c = *eptr++; \
 211   if (md->utf8 && (c & 0xc0) == 0xc0) \
 212     { \
 213     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 214     int s = 6*a; \
 215     c = (c & utf8_table3[a]) << s; \
 216     while (a-- > 0) \
 217       { \
 218       s -= 6; \
 219       c |= (*eptr++ & 0x3f) << s; \
 220       } \
 221     }
 222
 223 /* Get the next UTF-8 character, not advancing the pointer, setting length */
 224
 225 #define GETCHARLEN(c, eptr, len) \
 226   c = *eptr; \
 227   len = 1; \
 228   if (md->utf8 && (c & 0xc0) == 0xc0) \
 229     { \
 230     int i; \
 231     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 232     int s = 6*a; \
 233     c = (c & utf8_table3[a]) << s; \
 234     for (i = 1; i <= a; i++) \
 235       { \
 236       s -= 6; \
 237       c |= (eptr[i] & 0x3f) << s; \
 238       } \
 239     len += a; \
 240     }
 241
 242 /* If the pointer is not at the start of a character, move it back until
 243 it is. */
 244
 245 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
 246
 247 #endif
 248
 249
 250
 251 /*************************************************
 252 *             Default character tables           *
 253 *************************************************/
 254
 255 /* A default set of character tables is included in the PCRE binary. Its source
 256 is built by the maketables auxiliary program, which uses the default C ctypes
 257 functions, and put in the file chartables.c. These tables are used by PCRE
 258 whenever the caller of pcre_compile() does not provide an alternate set of
 259 tables. */
 260
 261 #include "chartables.c"
 262
 263
 264
 265 #ifdef SUPPORT_UTF8
 266 /*************************************************
 267 *           Tables for UTF-8 support             *
 268 *************************************************/
 269
 270 /* These are the breakpoints for different numbers of bytes in a UTF-8
 271 character. */
 272
 273 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
 274
 275 /* These are the indicator bits and the mask for the data bits to set in the
 276 first byte of a character, indexed by the number of additional bytes. */
 277
 278 static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
 279 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
 280
 281 /* Table of the number of extra characters, indexed by the first character
 282 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
 283 0x3d. */
 284
 285 static uschar utf8_table4[] = {
 286   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 287   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 288   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 289   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 290
 291
 292 /*************************************************
 293 *       Convert character value to UTF-8         *
 294 *************************************************/
 295
 296 /* This function takes an integer value in the range 0 - 0x7fffffff
 297 and encodes it as a UTF-8 character in 0 to 6 bytes.
 298
 299 Arguments:
 300   cvalue     the character value
 301   buffer     pointer to buffer for result - at least 6 bytes long
 302
 303 Returns:     number of characters placed in the buffer
 304 */
 305
 306 static int
 307 ord2utf8(int cvalue, uschar *buffer)
 308 {
 309 register int i, j;
 310 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
 311   if (cvalue <= utf8_table1[i]) break;
 312 buffer += i;
 313 for (j = i; j > 0; j--)
 314  {
 315  *buffer-- = 0x80 | (cvalue & 0x3f);
 316  cvalue >>= 6;
 317  }
 318 *buffer = utf8_table2[i] | cvalue;
 319 return i + 1;
 320 }
 321 #endif
 322
 323
 324
 325 /*************************************************
 326 *          Return version string                 *
 327 *************************************************/
 328
 329 #define STRING(a)  # a
 330 #define XSTRING(s) STRING(s)
 331
 332 const char *
 333 pcre_version(void)
 334 {
 335 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
 336 }
 337
 338
 339
 340
 341 /*************************************************
 342 * (Obsolete) Return info about compiled pattern  *
 343 *************************************************/
 344
 345 /* This is the original "info" function. It picks potentially useful data out
 346 of the private structure, but its interface was too rigid. It remains for
 347 backwards compatibility. The public options are passed back in an int - though
 348 the re->options field has been expanded to a long int, all the public options
 349 at the low end of it, and so even on 16-bit systems this will still be OK.
 350 Therefore, I haven't changed the API for pcre_info().
 351
 352 Arguments:
 353   external_re   points to compiled code
 354   optptr        where to pass back the options
 355   first_char    where to pass back the first character,
 356                 or -1 if multiline and all branches start ^,
 357                 or -2 otherwise
 358
 359 Returns:        number of capturing subpatterns
 360                 or negative values on error
 361 */
 362
 363 int
 364 pcre_info(const pcre *external_re, int *optptr, int *first_char)
 365 {
 366 const real_pcre *re = (const real_pcre *)external_re;
 367 if (re == NULL) return PCRE_ERROR_NULL;
 368 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 369 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
 370 if (first_char != NULL)
 371   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 372      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 373 return re->top_bracket;
 374 }
 375
 376
 377
 378 /*************************************************
 379 *        Return info about compiled pattern      *
 380 *************************************************/
 381
 382 /* This is a newer "info" function which has an extensible interface so
 383 that additional items can be added compatibly.
 384
 385 Arguments:
 386   external_re      points to compiled code
 387   external_study   points to study data, or NULL
 388   what             what information is required
 389   where            where to put the information
 390
 391 Returns:           0 if data returned, negative on error
 392 */
 393
 394 int
 395 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
 396   void *where)
 397 {
 398 const real_pcre *re = (const real_pcre *)external_re;
 399 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
 400
 401 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
 402 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 403
 404 switch (what)
 405   {
 406   case PCRE_INFO_OPTIONS:
 407   *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
 408   break;
 409
 410   case PCRE_INFO_SIZE:
 411   *((size_t *)where) = re->size;
 412   break;
 413
 414   case PCRE_INFO_CAPTURECOUNT:
 415   *((int *)where) = re->top_bracket;
 416   break;
 417
 418   case PCRE_INFO_BACKREFMAX:
 419   *((int *)where) = re->top_backref;
 420   break;
 421
 422   case PCRE_INFO_FIRSTCHAR:
 423   *((int *)where) =
 424     ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 425     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 426   break;
 427
 428   case PCRE_INFO_FIRSTTABLE:
 429   *((const uschar **)where) =
 430     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
 431       study->start_bits : NULL;
 432   break;
 433
 434   case PCRE_INFO_LASTLITERAL:
 435   *((int *)where) =
 436     ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
 437   break;
 438
 439   default: return PCRE_ERROR_BADOPTION;
 440   }
 441
 442 return 0;
 443 }
 444
 445
 446
 447 #ifdef DEBUG
 448 /*************************************************
 449 *        Debugging function to print chars       *
 450 *************************************************/
 451
 452 /* Print a sequence of chars in printable format, stopping at the end of the
 453 subject if the requested.
 454
 455 Arguments:
 456   p           points to characters
 457   length      number to print
 458   is_subject  TRUE if printing from within md->start_subject
 459   md          pointer to matching data block, if is_subject is TRUE
 460
 461 Returns:     nothing
 462 */
 463
 464 static void
 465 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 466 {
 467 int c;
 468 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 469 while (length-- > 0)
 470   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 471 }
 472 #endif
 473
 474
 475
 476
 477 /*************************************************
 478 *            Handle escapes                      *
 479 *************************************************/
 480
 481 /* This function is called when a \ has been encountered. It either returns a
 482 positive value for a simple escape such as \n, or a negative value which
 483 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 484 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 485 the \. On exit, it is on the final character of the escape sequence.
 486
 487 Arguments:
 488   ptrptr     points to the pattern position pointer
 489   errorptr   points to the pointer to the error message
 490   bracount   number of previous extracting brackets
 491   options    the options bits
 492   isclass    TRUE if inside a character class
 493   cd         pointer to char tables block
 494
 495 Returns:     zero or positive => a data character
 496              negative => a special escape sequence
 497              on error, errorptr is set
 498 */
 499
 500 static int
 501 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 502   int options, BOOL isclass, compile_data *cd)
 503 {
 504 const uschar *ptr = *ptrptr;
 505 int c, i;
 506
 507 /* If backslash is at the end of the pattern, it's an error. */
 508
 509 c = *(++ptr);
 510 if (c == 0) *errorptr = ERR1;
 511
 512 /* Digits or letters may have special meaning; all others are literals. */
 513
 514 else if (c < '0' || c > 'z') {}
 515
 516 /* Do an initial lookup in a table. A non-zero result is something that can be
 517 returned immediately. Otherwise further processing may be required. */
 518
 519 else if ((i = escapes[c - '0']) != 0) c = i;
 520
 521 /* Escapes that need further processing, or are illegal. */
 522
 523 else
 524   {
 525   const uschar *oldptr;
 526   switch (c)
 527     {
 528     /* The handling of escape sequences consisting of a string of digits
 529     starting with one that is not zero is not straightforward. By experiment,
 530     the way Perl works seems to be as follows:
 531
 532     Outside a character class, the digits are read as a decimal number. If the
 533     number is less than 10, or if there are that many previous extracting
 534     left brackets, then it is a back reference. Otherwise, up to three octal
 535     digits are read to form an escaped byte. Thus \123 is likely to be octal
 536     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 537     value is greater than 377, the least significant 8 bits are taken. Inside a
 538     character class, \ followed by a digit is always an octal number. */
 539
 540     case '1': case '2': case '3': case '4': case '5':
 541     case '6': case '7': case '8': case '9':
 542
 543     if (!isclass)
 544       {
 545       oldptr = ptr;
 546       c -= '0';
 547       while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
 548         c = c * 10 + *(++ptr) - '0';
 549       if (c < 10 || c <= bracount)
 550         {
 551         c = -(ESC_REF + c);
 552         break;
 553         }
 554       ptr = oldptr;      /* Put the pointer back and fall through */
 555       }
 556
 557     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 558     generates a binary zero byte and treats the digit as a following literal.
 559     Thus we have to pull back the pointer by one. */
 560
 561     if ((c = *ptr) >= '8')
 562       {
 563       ptr--;
 564       c = 0;
 565       break;
 566       }
 567
 568     /* \0 always starts an octal number, but we may drop through to here with a
 569     larger first octal digit. */
 570
 571     case '0':
 572     c -= '0';
 573     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
 574       ptr[1] != '8' && ptr[1] != '9')
 575         c = c * 8 + *(++ptr) - '0';
 576     c &= 255;     /* Take least significant 8 bits */
 577     break;
 578
 579     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
 580     which can be greater than 0xff, but only if the ddd are hex digits. */
 581
 582     case 'x':
 583 #ifdef SUPPORT_UTF8
 584     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
 585       {
 586       const uschar *pt = ptr + 2;
 587       register int count = 0;
 588       c = 0;
 589       while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
 590         {
 591         count++;
 592         c = c * 16 + cd->lcc[*pt] -
 593           (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
 594         pt++;
 595         }
 596       if (*pt == '}')
 597         {
 598         if (c < 0 || count > 8) *errorptr = ERR34;
 599         ptr = pt;
 600         break;
 601         }
 602       /* If the sequence of hex digits does not end with '}', then we don't
 603       recognize this construct; fall through to the normal \x handling. */
 604       }
 605 #endif
 606
 607     /* Read just a single hex char */
 608
 609     c = 0;
 610     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
 611       {
 612       ptr++;
 613       c = c * 16 + cd->lcc[*ptr] -
 614         (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
 615       }
 616     break;
 617
 618     /* Other special escapes not starting with a digit are straightforward */
 619
 620     case 'c':
 621     c = *(++ptr);
 622     if (c == 0)
 623       {
 624       *errorptr = ERR2;
 625       return 0;
 626       }
 627
 628     /* A letter is upper-cased; then the 0x40 bit is flipped */
 629
 630     if (c >= 'a' && c <= 'z') c = cd->fcc[c];
 631     c ^= 0x40;
 632     break;
 633
 634     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 635     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
 636     for Perl compatibility, it is a literal. This code looks a bit odd, but
 637     there used to be some cases other than the default, and there may be again
 638     in future, so I haven't "optimized" it. */
 639
 640     default:
 641     if ((options & PCRE_EXTRA) != 0) switch(c)
 642       {
 643       default:
 644       *errorptr = ERR3;
 645       break;
 646       }
 647     break;
 648     }
 649   }
 650
 651 *ptrptr = ptr;
 652 return c;
 653 }
 654
 655
 656
 657 /*************************************************
 658 *            Check for counted repeat            *
 659 *************************************************/
 660
 661 /* This function is called when a '{' is encountered in a place where it might
 662 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 663 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 664 where the ddds are digits.
 665
 666 Arguments:
 667   p         pointer to the first char after '{'
 668   cd        pointer to char tables block
 669
 670 Returns:    TRUE or FALSE
 671 */
 672
 673 static BOOL
 674 is_counted_repeat(const uschar *p, compile_data *cd)
 675 {
 676 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
 677 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
 678 if (*p == '}') return TRUE;
 679
 680 if (*p++ != ',') return FALSE;
 681 if (*p == '}') return TRUE;
 682
 683 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
 684 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
 685 return (*p == '}');
 686 }
 687
 688
 689
 690 /*************************************************
 691 *         Read repeat counts                     *
 692 *************************************************/
 693
 694 /* Read an item of the form {n,m} and return the values. This is called only
 695 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 696 so the syntax is guaranteed to be correct, but we need to check the values.
 697
 698 Arguments:
 699   p          pointer to first char after '{'
 700   minp       pointer to int for min
 701   maxp       pointer to int for max
 702              returned as -1 if no max
 703   errorptr   points to pointer to error message
 704   cd         pointer to character tables clock
 705
 706 Returns:     pointer to '}' on success;
 707              current ptr on error, with errorptr set
 708 */
 709
 710 static const uschar *
 711 read_repeat_counts(const uschar *p, int *minp, int *maxp,
 712   const char **errorptr, compile_data *cd)
 713 {
 714 int min = 0;
 715 int max = -1;
 716
 717 /* Read the minimum value and do a paranoid check: a negative value indicates
 718 an integer overflow. */
 719
 720 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 721 if (min < 0 || min > 65535)
 722   {
 723   *errorptr = ERR5;
 724   return p;
 725   }
 726
 727 /* Read the maximum value if there is one, and again do a paranoid on its size.
 728 Also, max must not be less than min. */
 729
 730 if (*p == '}') max = min; else
 731   {
 732   if (*(++p) != '}')
 733     {
 734     max = 0;
 735     while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 736     if (max < 0 || max > 65535)
 737       {
 738       *errorptr = ERR5;
 739       return p;
 740       }
 741     if (max < min)
 742       {
 743       *errorptr = ERR4;
 744       return p;
 745       }
 746     }
 747   }
 748
 749 /* Fill in the required variables, and pass back the pointer to the terminating
 750 '}'. */
 751
 752 *minp = min;
 753 *maxp = max;
 754 return p;
 755 }
 756
 757
 758
 759 /*************************************************
 760 *        Find the fixed length of a pattern      *
 761 *************************************************/
 762
 763 /* Scan a pattern and compute the fixed length of subject that will match it,
 764 if the length is fixed. This is needed for dealing with backward assertions.
 765
 766 Arguments:
 767   code     points to the start of the pattern (the bracket)
 768   options  the compiling options
 769
 770 Returns:   the fixed length, or -1 if there is no fixed length
 771 */
 772
 773 static int
 774 find_fixedlength(uschar *code, int options)
 775 {
 776 int length = -1;
 777
 778 register int branchlength = 0;
 779 register uschar *cc = code + 3;
 780
 781 /* Scan along the opcodes for this branch. If we get to the end of the
 782 branch, check the length against that of the other branches. */
 783
 784 for (;;)
 785   {
 786   int d;
 787   register int op = *cc;
 788   if (op >= OP_BRA) op = OP_BRA;
 789
 790   switch (op)
 791     {
 792     case OP_BRA:
 793     case OP_ONCE:
 794     case OP_COND:
 795     d = find_fixedlength(cc, options);
 796     if (d < 0) return -1;
 797     branchlength += d;
 798     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 799     cc += 3;
 800     break;
 801
 802     /* Reached end of a branch; if it's a ket it is the end of a nested
 803     call. If it's ALT it is an alternation in a nested call. If it is
 804     END it's the end of the outer call. All can be handled by the same code. */
 805
 806     case OP_ALT:
 807     case OP_KET:
 808     case OP_KETRMAX:
 809     case OP_KETRMIN:
 810     case OP_END:
 811     if (length < 0) length = branchlength;
 812       else if (length != branchlength) return -1;
 813     if (*cc != OP_ALT) return length;
 814     cc += 3;
 815     branchlength = 0;
 816     break;
 817
 818     /* Skip over assertive subpatterns */
 819
 820     case OP_ASSERT:
 821     case OP_ASSERT_NOT:
 822     case OP_ASSERTBACK:
 823     case OP_ASSERTBACK_NOT:
 824     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 825     cc += 3;
 826     break;
 827
 828     /* Skip over things that don't match chars */
 829
 830     case OP_REVERSE:
 831     case OP_BRANUMBER:
 832     case OP_CREF:
 833     cc++;
 834     /* Fall through */
 835
 836     case OP_OPT:
 837     cc++;
 838     /* Fall through */
 839
 840     case OP_SOD:
 841     case OP_EOD:
 842     case OP_EODN:
 843     case OP_CIRC:
 844     case OP_DOLL:
 845     case OP_NOT_WORD_BOUNDARY:
 846     case OP_WORD_BOUNDARY:
 847     cc++;
 848     break;
 849
 850     /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
 851     This requires a scan of the string, unfortunately. We assume valid UTF-8
 852     strings, so all we do is reduce the length by one for byte whose bits are
 853     10xxxxxx. */
 854
 855     case OP_CHARS:
 856     branchlength += *(++cc);
 857 #ifdef SUPPORT_UTF8
 858     for (d = 1; d <= *cc; d++)
 859       if ((cc[d] & 0xc0) == 0x80) branchlength--;
 860 #endif
 861     cc += *cc + 1;
 862     break;
 863
 864     /* Handle exact repetitions */
 865
 866     case OP_EXACT:
 867     case OP_TYPEEXACT:
 868     branchlength += (cc[1] << 8) + cc[2];
 869     cc += 4;
 870     break;
 871
 872     /* Handle single-char matchers */
 873
 874     case OP_NOT_DIGIT:
 875     case OP_DIGIT:
 876     case OP_NOT_WHITESPACE:
 877     case OP_WHITESPACE:
 878     case OP_NOT_WORDCHAR:
 879     case OP_WORDCHAR:
 880     case OP_ANY:
 881     branchlength++;
 882     cc++;
 883     break;
 884
 885
 886     /* Check a class for variable quantification */
 887
 888     case OP_CLASS:
 889     cc += 33;
 890
 891     switch (*cc)
 892       {
 893       case OP_CRSTAR:
 894       case OP_CRMINSTAR:
 895       case OP_CRQUERY:
 896       case OP_CRMINQUERY:
 897       return -1;
 898
 899       case OP_CRRANGE:
 900       case OP_CRMINRANGE:
 901       if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
 902       branchlength += (cc[1] << 8) + cc[2];
 903       cc += 5;
 904       break;
 905
 906       default:
 907       branchlength++;
 908       }
 909     break;
 910
 911     /* Anything else is variable length */
 912
 913     default:
 914     return -1;
 915     }
 916   }
 917 /* Control never gets here */
 918 }
 919
 920
 921
 922
 923 /*************************************************
 924 *           Check for POSIX class syntax         *
 925 *************************************************/
 926
 927 /* This function is called when the sequence "[:" or "[." or "[=" is
 928 encountered in a character class. It checks whether this is followed by an
 929 optional ^ and then a sequence of letters, terminated by a matching ":]" or
 930 ".]" or "=]".
 931
 932 Argument:
 933   ptr      pointer to the initial [
 934   endptr   where to return the end pointer
 935   cd       pointer to compile data
 936
 937 Returns:   TRUE or FALSE
 938 */
 939
 940 static BOOL
 941 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
 942 {
 943 int terminator;          /* Don't combine these lines; the Solaris cc */
 944 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 945 if (*(++ptr) == '^') ptr++;
 946 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
 947 if (*ptr == terminator && ptr[1] == ']')
 948   {
 949   *endptr = ptr;
 950   return TRUE;
 951   }
 952 return FALSE;
 953 }
 954
 955
 956
 957
 958 /*************************************************
 959 *          Check POSIX class name                *
 960 *************************************************/
 961
 962 /* This function is called to check the name given in a POSIX-style class entry
 963 such as [:alnum:].
 964
 965 Arguments:
 966   ptr        points to the first letter
 967   len        the length of the name
 968
 969 Returns:     a value representing the name, or -1 if unknown
 970 */
 971
 972 static int
 973 check_posix_name(const uschar *ptr, int len)
 974 {
 975 register int yield = 0;
 976 while (posix_name_lengths[yield] != 0)
 977   {
 978   if (len == posix_name_lengths[yield] &&
 979     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
 980   yield++;
 981   }
 982 return -1;
 983 }
 984
 985
 986
 987
 988 /*************************************************
 989 *           Compile one branch                   *
 990 *************************************************/
 991
 992 /* Scan the pattern, compiling it into the code vector.
 993
 994 Arguments:
 995   options      the option bits
 996   brackets     points to number of extracting brackets used
 997   code         points to the pointer to the current code point
 998   ptrptr       points to the current pattern pointer
 999   errorptr     points to pointer to error message
1000   optchanged   set to the value of the last OP_OPT item compiled
1001   reqchar      set to the last literal character required, else -1
1002   countlits    set to count of mandatory literal characters
1003   cd           contains pointers to tables
1004
1005 Returns:       TRUE on success
1006                FALSE, with *errorptr set on error
1007 */
1008
1009 static BOOL
1010 compile_branch(int options, int *brackets, uschar **codeptr,
1011   const uschar **ptrptr, const char **errorptr, int *optchanged,
1012   int *reqchar, int *countlits, compile_data *cd)
1013 {
1014 int repeat_type, op_type;
1015 int repeat_min, repeat_max;
1016 int bravalue, length;
1017 int greedy_default, greedy_non_default;
1018 int prevreqchar;
1019 int condcount = 0;
1020 int subcountlits = 0;
1021 register int c;
1022 register uschar *code = *codeptr;
1023 uschar *tempcode;
1024 const uschar *ptr = *ptrptr;
1025 const uschar *tempptr;
1026 uschar *previous = NULL;
1027 uschar class[32];
1028
1029 /* Set up the default and non-default settings for greediness */
1030
1031 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1032 greedy_non_default = greedy_default ^ 1;
1033
1034 /* Initialize no required char, and count of literals */
1035
1036 *reqchar = prevreqchar = -1;
1037 *countlits = 0;
1038
1039 /* Switch on next character until the end of the branch */
1040
1041 for (;; ptr++)
1042   {
1043   BOOL negate_class;
1044   int class_charcount;
1045   int class_lastchar;
1046   int newoptions;
1047   int skipbytes;
1048   int subreqchar;
1049
1050   c = *ptr;
1051   if ((options & PCRE_EXTENDED) != 0)
1052     {
1053     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1054     if (c == '#')
1055       {
1056       /* The space before the ; is to avoid a warning on a silly compiler
1057       on the Macintosh. */
1058       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1059       continue;
1060       }
1061     }
1062
1063   switch(c)
1064     {
1065     /* The branch terminates at end of string, |, or ). */
1066
1067     case 0:
1068     case '|':
1069     case ')':
1070     *codeptr = code;
1071     *ptrptr = ptr;
1072     return TRUE;
1073
1074     /* Handle single-character metacharacters */
1075
1076     case '^':
1077     previous = NULL;
1078     *code++ = OP_CIRC;
1079     break;
1080
1081     case '$':
1082     previous = NULL;
1083     *code++ = OP_DOLL;
1084     break;
1085
1086     case '.':
1087     previous = code;
1088     *code++ = OP_ANY;
1089     break;
1090
1091     /* Character classes. These always build a 32-byte bitmap of the permitted
1092     characters, except in the special case where there is only one character.
1093     For negated classes, we build the map as usual, then invert it at the end.
1094     */
1095
1096     case '[':
1097     previous = code;
1098     *code++ = OP_CLASS;
1099
1100     /* If the first character is '^', set the negation flag and skip it. */
1101
1102     if ((c = *(++ptr)) == '^')
1103       {
1104       negate_class = TRUE;
1105       c = *(++ptr);
1106       }
1107     else negate_class = FALSE;
1108
1109     /* Keep a count of chars so that we can optimize the case of just a single
1110     character. */
1111
1112     class_charcount = 0;
1113     class_lastchar = -1;
1114
1115     /* Initialize the 32-char bit map to all zeros. We have to build the
1116     map in a temporary bit of store, in case the class contains only 1
1117     character, because in that case the compiled code doesn't use the
1118     bit map. */
1119
1120     memset(class, 0, 32 * sizeof(uschar));
1121
1122     /* Process characters until ] is reached. By writing this as a "do" it
1123     means that an initial ] is taken as a data character. */
1124
1125     do
1126       {
1127       if (c == 0)
1128         {
1129         *errorptr = ERR6;
1130         goto FAILED;
1131         }
1132
1133       /* Handle POSIX class names. Perl allows a negation extension of the
1134       form [:^name]. A square bracket that doesn't match the syntax is
1135       treated as a literal. We also recognize the POSIX constructions
1136       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1137       5.6 does. */
1138
1139       if (c == '[' &&
1140           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1141           check_posix_syntax(ptr, &tempptr, cd))
1142         {
1143         BOOL local_negate = FALSE;
1144         int posix_class, i;
1145         register const uschar *cbits = cd->cbits;
1146
1147         if (ptr[1] != ':')
1148           {
1149           *errorptr = ERR31;
1150           goto FAILED;
1151           }
1152
1153         ptr += 2;
1154         if (*ptr == '^')
1155           {
1156           local_negate = TRUE;
1157           ptr++;
1158           }
1159
1160         posix_class = check_posix_name(ptr, tempptr - ptr);
1161         if (posix_class < 0)
1162           {
1163           *errorptr = ERR30;
1164           goto FAILED;
1165           }
1166
1167         /* If matching is caseless, upper and lower are converted to
1168         alpha. This relies on the fact that the class table starts with
1169         alpha, lower, upper as the first 3 entries. */
1170
1171         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1172           posix_class = 0;
1173
1174         /* Or into the map we are building up to 3 of the static class
1175         tables, or their negations. */
1176
1177         posix_class *= 3;
1178         for (i = 0; i < 3; i++)
1179           {
1180           int taboffset = posix_class_maps[posix_class + i];
1181           if (taboffset < 0) break;
1182           if (local_negate)
1183             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1184           else
1185             for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1186           }
1187
1188         ptr = tempptr + 1;
1189         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1190         continue;
1191         }
1192
1193       /* Backslash may introduce a single character, or it may introduce one
1194       of the specials, which just set a flag. Escaped items are checked for
1195       validity in the pre-compiling pass. The sequence \b is a special case.
1196       Inside a class (and only there) it is treated as backspace. Elsewhere
1197       it marks a word boundary. Other escapes have preset maps ready to
1198       or into the one we are building. We assume they have more than one
1199       character in them, so set class_count bigger than one. */
1200
1201       if (c == '\\')
1202         {
1203         c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1204         if (-c == ESC_b) c = '\b';
1205         else if (c < 0)
1206           {
1207           register const uschar *cbits = cd->cbits;
1208           class_charcount = 10;
1209           switch (-c)
1210             {
1211             case ESC_d:
1212             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1213             continue;
1214
1215             case ESC_D:
1216             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1217             continue;
1218
1219             case ESC_w:
1220             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1221             continue;
1222
1223             case ESC_W:
1224             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1225             continue;
1226
1227             case ESC_s:
1228             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1229             continue;
1230
1231             case ESC_S:
1232             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1233             continue;
1234
1235             default:
1236             *errorptr = ERR7;
1237             goto FAILED;
1238             }
1239           }
1240
1241         /* Fall through if single character, but don't at present allow
1242         chars > 255 in UTF-8 mode. */
1243
1244 #ifdef SUPPORT_UTF8
1245         if (c > 255)
1246           {
1247           *errorptr = ERR33;
1248           goto FAILED;
1249           }
1250 #endif
1251         }
1252
1253       /* A single character may be followed by '-' to form a range. However,
1254       Perl does not permit ']' to be the end of the range. A '-' character
1255       here is treated as a literal. */
1256
1257       if (ptr[1] == '-' && ptr[2] != ']')
1258         {
1259         int d;
1260         ptr += 2;
1261         d = *ptr;
1262
1263         if (d == 0)
1264           {
1265           *errorptr = ERR6;
1266           goto FAILED;
1267           }
1268
1269         /* The second part of a range can be a single-character escape, but
1270         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1271         in such circumstances. */
1272
1273         if (d == '\\')
1274           {
1275           const uschar *oldptr = ptr;
1276           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1277
1278 #ifdef SUPPORT_UTF8
1279           if (d > 255)
1280             {
1281             *errorptr = ERR33;
1282             goto FAILED;
1283             }
1284 #endif
1285           /* \b is backslash; any other special means the '-' was literal */
1286
1287           if (d < 0)
1288             {
1289             if (d == -ESC_b) d = '\b'; else
1290               {
1291               ptr = oldptr - 2;
1292               goto SINGLE_CHARACTER;  /* A few lines below */
1293               }
1294             }
1295           }
1296
1297         if (d < c)
1298           {
1299           *errorptr = ERR8;
1300           goto FAILED;
1301           }
1302
1303         for (; c <= d; c++)
1304           {
1305           class[c/8] |= (1 << (c&7));
1306           if ((options & PCRE_CASELESS) != 0)
1307             {
1308             int uc = cd->fcc[c];           /* flip case */
1309             class[uc/8] |= (1 << (uc&7));
1310             }
1311           class_charcount++;                /* in case a one-char range */
1312           class_lastchar = c;
1313           }
1314         continue;   /* Go get the next char in the class */
1315         }
1316
1317       /* Handle a lone single character - we can get here for a normal
1318       non-escape char, or after \ that introduces a single character. */
1319
1320       SINGLE_CHARACTER:
1321
1322       class [c/8] |= (1 << (c&7));
1323       if ((options & PCRE_CASELESS) != 0)
1324         {
1325         c = cd->fcc[c];   /* flip case */
1326         class[c/8] |= (1 << (c&7));
1327         }
1328       class_charcount++;
1329       class_lastchar = c;
1330       }
1331
1332     /* Loop until ']' reached; the check for end of string happens inside the
1333     loop. This "while" is the end of the "do" above. */
1334
1335     while ((c = *(++ptr)) != ']');
1336
1337     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1338     precisely one character. This doesn't need the whole 32-byte bit map.
1339     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1340     it's negative. */
1341
1342     if (class_charcount == 1 && class_lastchar >= 0)
1343       {
1344       if (negate_class)
1345         {
1346         code[-1] = OP_NOT;
1347         }
1348       else
1349         {
1350         code[-1] = OP_CHARS;
1351         *code++ = 1;
1352         }
1353       *code++ = class_lastchar;
1354       }
1355
1356     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1357     the code vector. */
1358
1359     else
1360       {
1361       if (negate_class)
1362         for (c = 0; c < 32; c++) code[c] = ~class[c];
1363       else
1364         memcpy(code, class, 32);
1365       code += 32;
1366       }
1367     break;
1368
1369     /* Various kinds of repeat */
1370
1371     case '{':
1372     if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1373     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1374     if (*errorptr != NULL) goto FAILED;
1375     goto REPEAT;
1376
1377     case '*':
1378     repeat_min = 0;
1379     repeat_max = -1;
1380     goto REPEAT;
1381
1382     case '+':
1383     repeat_min = 1;
1384     repeat_max = -1;
1385     goto REPEAT;
1386
1387     case '?':
1388     repeat_min = 0;
1389     repeat_max = 1;
1390
1391     REPEAT:
1392     if (previous == NULL)
1393       {
1394       *errorptr = ERR9;
1395       goto FAILED;
1396       }
1397
1398     /* If the next character is '?' this is a minimizing repeat, by default,
1399     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1400     next character. */
1401
1402     if (ptr[1] == '?')
1403       { repeat_type = greedy_non_default; ptr++; }
1404     else repeat_type = greedy_default;
1405
1406     /* If previous was a string of characters, chop off the last one and use it
1407     as the subject of the repeat. If there was only one character, we can
1408     abolish the previous item altogether. A repeat with a zero minimum wipes
1409     out any reqchar setting, backing up to the previous value. We must also
1410     adjust the countlits value. */
1411
1412     if (*previous == OP_CHARS)
1413       {
1414       int len = previous[1];
1415
1416       if (repeat_min == 0) *reqchar = prevreqchar;
1417       *countlits += repeat_min - 1;
1418
1419       if (len == 1)
1420         {
1421         c = previous[2];
1422         code = previous;
1423         }
1424       else
1425         {
1426         c = previous[len+1];
1427         previous[1]--;
1428         code--;
1429         }
1430       op_type = 0;                 /* Use single-char op codes */
1431       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1432       }
1433
1434     /* If previous was a single negated character ([^a] or similar), we use
1435     one of the special opcodes, replacing it. The code is shared with single-
1436     character repeats by adding a suitable offset into repeat_type. */
1437
1438     else if ((int)*previous == OP_NOT)
1439       {
1440       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1441       c = previous[1];
1442       code = previous;
1443       goto OUTPUT_SINGLE_REPEAT;
1444       }
1445
1446     /* If previous was a character type match (\d or similar), abolish it and
1447     create a suitable repeat item. The code is shared with single-character
1448     repeats by adding a suitable offset into repeat_type. */
1449
1450     else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1451       {
1452       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1453       c = *previous;
1454       code = previous;
1455
1456       OUTPUT_SINGLE_REPEAT:
1457
1458       /* If the maximum is zero then the minimum must also be zero; Perl allows
1459       this case, so we do too - by simply omitting the item altogether. */
1460
1461       if (repeat_max == 0) goto END_REPEAT;
1462
1463       /* Combine the op_type with the repeat_type */
1464
1465       repeat_type += op_type;
1466
1467       /* A minimum of zero is handled either as the special case * or ?, or as
1468       an UPTO, with the maximum given. */
1469
1470       if (repeat_min == 0)
1471         {
1472         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1473           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1474         else
1475           {
1476           *code++ = OP_UPTO + repeat_type;
1477           *code++ = repeat_max >> 8;
1478           *code++ = (repeat_max & 255);
1479           }
1480         }
1481
1482       /* The case {1,} is handled as the special case + */
1483
1484       else if (repeat_min == 1 && repeat_max == -1)
1485         *code++ = OP_PLUS + repeat_type;
1486
1487       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1488       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1489
1490       else
1491         {
1492         if (repeat_min != 1)
1493           {
1494           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1495           *code++ = repeat_min >> 8;
1496           *code++ = (repeat_min & 255);
1497           }
1498
1499         /* If the mininum is 1 and the previous item was a character string,
1500         we either have to put back the item that got cancelled if the string
1501         length was 1, or add the character back onto the end of a longer
1502         string. For a character type nothing need be done; it will just get
1503         put back naturally. Note that the final character is always going to
1504         get added below. */
1505
1506         else if (*previous == OP_CHARS)
1507           {
1508           if (code == previous) code += 2; else previous[1]++;
1509           }
1510
1511         /*  For a single negated character we also have to put back the
1512         item that got cancelled. */
1513
1514         else if (*previous == OP_NOT) code++;
1515
1516         /* If the maximum is unlimited, insert an OP_STAR. */
1517
1518         if (repeat_max < 0)
1519           {
1520           *code++ = c;
1521           *code++ = OP_STAR + repeat_type;
1522           }
1523
1524         /* Else insert an UPTO if the max is greater than the min. */
1525
1526         else if (repeat_max != repeat_min)
1527           {
1528           *code++ = c;
1529           repeat_max -= repeat_min;
1530           *code++ = OP_UPTO + repeat_type;
1531           *code++ = repeat_max >> 8;
1532           *code++ = (repeat_max & 255);
1533           }
1534         }
1535
1536       /* The character or character type itself comes last in all cases. */
1537
1538       *code++ = c;
1539       }
1540
1541     /* If previous was a character class or a back reference, we put the repeat
1542     stuff after it, but just skip the item if the repeat was {0,0}. */
1543
1544     else if (*previous == OP_CLASS || *previous == OP_REF)
1545       {
1546       if (repeat_max == 0)
1547         {
1548         code = previous;
1549         goto END_REPEAT;
1550         }
1551       if (repeat_min == 0 && repeat_max == -1)
1552         *code++ = OP_CRSTAR + repeat_type;
1553       else if (repeat_min == 1 && repeat_max == -1)
1554         *code++ = OP_CRPLUS + repeat_type;
1555       else if (repeat_min == 0 && repeat_max == 1)
1556         *code++ = OP_CRQUERY + repeat_type;
1557       else
1558         {
1559         *code++ = OP_CRRANGE + repeat_type;
1560         *code++ = repeat_min >> 8;
1561         *code++ = repeat_min & 255;
1562         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1563         *code++ = repeat_max >> 8;
1564         *code++ = repeat_max & 255;
1565         }
1566       }
1567
1568     /* If previous was a bracket group, we may have to replicate it in certain
1569     cases. */
1570
1571     else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1572              (int)*previous == OP_COND)
1573       {
1574       register int i;
1575       int ketoffset = 0;
1576       int len = code - previous;
1577       uschar *bralink = NULL;
1578
1579       /* If the maximum repeat count is unlimited, find the end of the bracket
1580       by scanning through from the start, and compute the offset back to it
1581       from the current code pointer. There may be an OP_OPT setting following
1582       the final KET, so we can't find the end just by going back from the code
1583       pointer. */
1584
1585       if (repeat_max == -1)
1586         {
1587         register uschar *ket = previous;
1588         do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1589         ketoffset = code - ket;
1590         }
1591
1592       /* The case of a zero minimum is special because of the need to stick
1593       OP_BRAZERO in front of it, and because the group appears once in the
1594       data, whereas in other cases it appears the minimum number of times. For
1595       this reason, it is simplest to treat this case separately, as otherwise
1596       the code gets far too messy. There are several special subcases when the
1597       minimum is zero. */
1598
1599       if (repeat_min == 0)
1600         {
1601         /* If we set up a required char from the bracket, we must back off
1602         to the previous value and reset the countlits value too. */
1603
1604         if (subcountlits > 0)
1605           {
1606           *reqchar = prevreqchar;
1607           *countlits -= subcountlits;
1608           }
1609
1610         /* If the maximum is also zero, we just omit the group from the output
1611         altogether. */
1612
1613         if (repeat_max == 0)
1614           {
1615           code = previous;
1616           goto END_REPEAT;
1617           }
1618
1619         /* If the maximum is 1 or unlimited, we just have to stick in the
1620         BRAZERO and do no more at this point. */
1621
1622         if (repeat_max <= 1)
1623           {
1624           memmove(previous+1, previous, len);
1625           code++;
1626           *previous++ = OP_BRAZERO + repeat_type;
1627           }
1628
1629         /* If the maximum is greater than 1 and limited, we have to replicate
1630         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1631         The first one has to be handled carefully because it's the original
1632         copy, which has to be moved up. The remainder can be handled by code
1633         that is common with the non-zero minimum case below. We just have to
1634         adjust the value or repeat_max, since one less copy is required. */
1635
1636         else
1637           {
1638           int offset;
1639           memmove(previous+4, previous, len);
1640           code += 4;
1641           *previous++ = OP_BRAZERO + repeat_type;
1642           *previous++ = OP_BRA;
1643
1644           /* We chain together the bracket offset fields that have to be
1645           filled in later when the ends of the brackets are reached. */
1646
1647           offset = (bralink == NULL)? 0 : previous - bralink;
1648           bralink = previous;
1649           *previous++ = offset >> 8;
1650           *previous++ = offset & 255;
1651           }
1652
1653         repeat_max--;
1654         }
1655
1656       /* If the minimum is greater than zero, replicate the group as many
1657       times as necessary, and adjust the maximum to the number of subsequent
1658       copies that we need. */
1659
1660       else
1661         {
1662         for (i = 1; i < repeat_min; i++)
1663           {
1664           memcpy(code, previous, len);
1665           code += len;
1666           }
1667         if (repeat_max > 0) repeat_max -= repeat_min;
1668         }
1669
1670       /* This code is common to both the zero and non-zero minimum cases. If
1671       the maximum is limited, it replicates the group in a nested fashion,
1672       remembering the bracket starts on a stack. In the case of a zero minimum,
1673       the first one was set up above. In all cases the repeat_max now specifies
1674       the number of additional copies needed. */
1675
1676       if (repeat_max >= 0)
1677         {
1678         for (i = repeat_max - 1; i >= 0; i--)
1679           {
1680           *code++ = OP_BRAZERO + repeat_type;
1681
1682           /* All but the final copy start a new nesting, maintaining the
1683           chain of brackets outstanding. */
1684
1685           if (i != 0)
1686             {
1687             int offset;
1688             *code++ = OP_BRA;
1689             offset = (bralink == NULL)? 0 : code - bralink;
1690             bralink = code;
1691             *code++ = offset >> 8;
1692             *code++ = offset & 255;
1693             }
1694
1695           memcpy(code, previous, len);
1696           code += len;
1697           }
1698
1699         /* Now chain through the pending brackets, and fill in their length
1700         fields (which are holding the chain links pro tem). */
1701
1702         while (bralink != NULL)
1703           {
1704           int oldlinkoffset;
1705           int offset = code - bralink + 1;
1706           uschar *bra = code - offset;
1707           oldlinkoffset = (bra[1] << 8) + bra[2];
1708           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1709           *code++ = OP_KET;
1710           *code++ = bra[1] = offset >> 8;
1711           *code++ = bra[2] = (offset & 255);
1712           }
1713         }
1714
1715       /* If the maximum is unlimited, set a repeater in the final copy. We
1716       can't just offset backwards from the current code point, because we
1717       don't know if there's been an options resetting after the ket. The
1718       correct offset was computed above. */
1719
1720       else code[-ketoffset] = OP_KETRMAX + repeat_type;
1721       }
1722
1723     /* Else there's some kind of shambles */
1724
1725     else
1726       {
1727       *errorptr = ERR11;
1728       goto FAILED;
1729       }
1730
1731     /* In all case we no longer have a previous item. */
1732
1733     END_REPEAT:
1734     previous = NULL;
1735     break;
1736
1737
1738     /* Start of nested bracket sub-expression, or comment or lookahead or
1739     lookbehind or option setting or condition. First deal with special things
1740     that can come after a bracket; all are introduced by ?, and the appearance
1741     of any of them means that this is not a referencing group. They were
1742     checked for validity in the first pass over the string, so we don't have to
1743     check for syntax errors here.  */
1744
1745     case '(':
1746     newoptions = options;
1747     skipbytes = 0;
1748
1749     if (*(++ptr) == '?')
1750       {
1751       int set, unset;
1752       int *optset;
1753
1754       switch (*(++ptr))
1755         {
1756         case '#':                 /* Comment; skip to ket */
1757         ptr++;
1758         while (*ptr != ')') ptr++;
1759         continue;
1760
1761         case ':':                 /* Non-extracting bracket */
1762         bravalue = OP_BRA;
1763         ptr++;
1764         break;
1765
1766         case '(':
1767         bravalue = OP_COND;       /* Conditional group */
1768         if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1769           {
1770           int condref = *ptr - '0';
1771           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1772           if (condref == 0)
1773             {
1774             *errorptr = ERR35;
1775             goto FAILED;
1776             }
1777           ptr++;
1778           code[3] = OP_CREF;
1779           code[4] = condref >> 8;
1780           code[5] = condref & 255;
1781           skipbytes = 3;
1782           }
1783         else ptr--;
1784         break;
1785
1786         case '=':                 /* Positive lookahead */
1787         bravalue = OP_ASSERT;
1788         ptr++;
1789         break;
1790
1791         case '!':                 /* Negative lookahead */
1792         bravalue = OP_ASSERT_NOT;
1793         ptr++;
1794         break;
1795
1796         case '<':                 /* Lookbehinds */
1797         switch (*(++ptr))
1798           {
1799           case '=':               /* Positive lookbehind */
1800           bravalue = OP_ASSERTBACK;
1801           ptr++;
1802           break;
1803
1804           case '!':               /* Negative lookbehind */
1805           bravalue = OP_ASSERTBACK_NOT;
1806           ptr++;
1807           break;
1808
1809           default:                /* Syntax error */
1810           *errorptr = ERR24;
1811           goto FAILED;
1812           }
1813         break;
1814
1815         case '>':                 /* One-time brackets */
1816         bravalue = OP_ONCE;
1817         ptr++;
1818         break;
1819
1820         case 'R':                 /* Pattern recursion */
1821         *code++ = OP_RECURSE;
1822         ptr++;
1823         continue;
1824
1825         default:                  /* Option setting */
1826         set = unset = 0;
1827         optset = &set;
1828
1829         while (*ptr != ')' && *ptr != ':')
1830           {
1831           switch (*ptr++)
1832             {
1833             case '-': optset = &unset; break;
1834
1835             case 'i': *optset |= PCRE_CASELESS; break;
1836             case 'm': *optset |= PCRE_MULTILINE; break;
1837             case 's': *optset |= PCRE_DOTALL; break;
1838             case 'x': *optset |= PCRE_EXTENDED; break;
1839             case 'U': *optset |= PCRE_UNGREEDY; break;
1840             case 'X': *optset |= PCRE_EXTRA; break;
1841
1842             default:
1843             *errorptr = ERR12;
1844             goto FAILED;
1845             }
1846           }
1847
1848         /* Set up the changed option bits, but don't change anything yet. */
1849
1850         newoptions = (options | set) & (~unset);
1851
1852         /* If the options ended with ')' this is not the start of a nested
1853         group with option changes, so the options change at this level. At top
1854         level there is nothing else to be done (the options will in fact have
1855         been set from the start of compiling as a result of the first pass) but
1856         at an inner level we must compile code to change the ims options if
1857         necessary, and pass the new setting back so that it can be put at the
1858         start of any following branches, and when this group ends, a resetting
1859         item can be compiled. */
1860
1861         if (*ptr == ')')
1862           {
1863           if ((options & PCRE_INGROUP) != 0 &&
1864               (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1865             {
1866             *code++ = OP_OPT;
1867             *code++ = *optchanged = newoptions & PCRE_IMS;
1868             }
1869           options = newoptions;  /* Change options at this level */
1870           previous = NULL;       /* This item can't be repeated */
1871           continue;              /* It is complete */
1872           }
1873
1874         /* If the options ended with ':' we are heading into a nested group
1875         with possible change of options. Such groups are non-capturing and are
1876         not assertions of any kind. All we need to do is skip over the ':';
1877         the newoptions value is handled below. */
1878
1879         bravalue = OP_BRA;
1880         ptr++;
1881         }
1882       }
1883
1884     /* Else we have a referencing group; adjust the opcode. If the bracket
1885     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1886     arrange for the true number to follow later, in an OP_BRANUMBER item. */
1887
1888     else
1889       {
1890       if (++(*brackets) > EXTRACT_BASIC_MAX)
1891         {
1892         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1893         code[3] = OP_BRANUMBER;
1894         code[4] = *brackets >> 8;
1895         code[5] = *brackets & 255;
1896         skipbytes = 3;
1897         }
1898       else bravalue = OP_BRA + *brackets;
1899       }
1900
1901     /* Process nested bracketed re. Assertions may not be repeated, but other
1902     kinds can be. We copy code into a non-register variable in order to be able
1903     to pass its address because some compilers complain otherwise. Pass in a
1904     new setting for the ims options if they have changed. */
1905
1906     previous = (bravalue >= OP_ONCE)? code : NULL;
1907     *code = bravalue;
1908     tempcode = code;
1909
1910     if (!compile_regex(
1911          options | PCRE_INGROUP,       /* Set for all nested groups */
1912          ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1913            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1914          brackets,                     /* Extracting bracket count */
1915          &tempcode,                    /* Where to put code (updated) */
1916          &ptr,                         /* Input pointer (updated) */
1917          errorptr,                     /* Where to put an error message */
1918          (bravalue == OP_ASSERTBACK ||
1919           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1920          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
1921          &subreqchar,                  /* For possible last char */
1922          &subcountlits,                /* For literal count */
1923          cd))                          /* Tables block */
1924       goto FAILED;
1925
1926     /* At the end of compiling, code is still pointing to the start of the
1927     group, while tempcode has been updated to point past the end of the group
1928     and any option resetting that may follow it. The pattern pointer (ptr)
1929     is on the bracket. */
1930
1931     /* If this is a conditional bracket, check that there are no more than
1932     two branches in the group. */
1933
1934     else if (bravalue == OP_COND)
1935       {
1936       uschar *tc = code;
1937       condcount = 0;
1938
1939       do {
1940          condcount++;
1941          tc += (tc[1] << 8) | tc[2];
1942          }
1943       while (*tc != OP_KET);
1944
1945       if (condcount > 2)
1946         {
1947         *errorptr = ERR27;
1948         goto FAILED;
1949         }
1950       }
1951
1952     /* Handle updating of the required character. If the subpattern didn't
1953     set one, leave it as it was. Otherwise, update it for normal brackets of
1954     all kinds, forward assertions, and conditions with two branches. Don't
1955     update the literal count for forward assertions, however. If the bracket
1956     is followed by a quantifier with zero repeat, we have to back off. Hence
1957     the definition of prevreqchar and subcountlits outside the main loop so
1958     that they can be accessed for the back off. */
1959
1960     if (subreqchar > 0 &&
1961          (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1962          (bravalue == OP_COND && condcount == 2)))
1963       {
1964       prevreqchar = *reqchar;
1965       *reqchar = subreqchar;
1966       if (bravalue != OP_ASSERT) *countlits += subcountlits;
1967       }
1968
1969     /* Now update the main code pointer to the end of the group. */
1970
1971     code = tempcode;
1972
1973     /* Error if hit end of pattern */
1974
1975     if (*ptr != ')')
1976       {
1977       *errorptr = ERR14;
1978       goto FAILED;
1979       }
1980     break;
1981
1982     /* Check \ for being a real metacharacter; if not, fall through and handle
1983     it as a data character at the start of a string. Escape items are checked
1984     for validity in the pre-compiling pass. */
1985
1986     case '\\':
1987     tempptr = ptr;
1988     c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1989
1990     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1991     are arranged to be the negation of the corresponding OP_values. For the
1992     back references, the values are ESC_REF plus the reference number. Only
1993     back references and those types that consume a character may be repeated.
1994     We can test for values between ESC_b and ESC_Z for the latter; this may
1995     have to change if any new ones are ever created. */
1996
1997     if (c < 0)
1998       {
1999       if (-c >= ESC_REF)
2000         {
2001         int number = -c - ESC_REF;
2002         previous = code;
2003         *code++ = OP_REF;
2004         *code++ = number >> 8;
2005         *code++ = number & 255;
2006         }
2007       else
2008         {
2009         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
2010         *code++ = -c;
2011         }
2012       continue;
2013       }
2014
2015     /* Data character: reset and fall through */
2016
2017     ptr = tempptr;
2018     c = '\\';
2019
2020     /* Handle a run of data characters until a metacharacter is encountered.
2021     The first character is guaranteed not to be whitespace or # when the
2022     extended flag is set. */
2023
2024     NORMAL_CHAR:
2025     default:
2026     previous = code;
2027     *code = OP_CHARS;
2028     code += 2;
2029     length = 0;
2030
2031     do
2032       {
2033       if ((options & PCRE_EXTENDED) != 0)
2034         {
2035         if ((cd->ctypes[c] & ctype_space) != 0) continue;
2036         if (c == '#')
2037           {
2038           /* The space before the ; is to avoid a warning on a silly compiler
2039           on the Macintosh. */
2040           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2041           if (c == 0) break;
2042           continue;
2043           }
2044         }
2045
2046       /* Backslash may introduce a data char or a metacharacter. Escaped items
2047       are checked for validity in the pre-compiling pass. Stop the string
2048       before a metaitem. */
2049
2050       if (c == '\\')
2051         {
2052         tempptr = ptr;
2053         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2054         if (c < 0) { ptr = tempptr; break; }
2055
2056         /* If a character is > 127 in UTF-8 mode, we have to turn it into
2057         two or more characters in the UTF-8 encoding. */
2058
2059 #ifdef SUPPORT_UTF8
2060         if (c > 127 && (options & PCRE_UTF8) != 0)
2061           {
2062           uschar buffer[8];
2063           int len = ord2utf8(c, buffer);
2064           for (c = 0; c < len; c++) *code++ = buffer[c];
2065           length += len;
2066           continue;
2067           }
2068 #endif
2069         }
2070
2071       /* Ordinary character or single-char escape */
2072
2073       *code++ = c;
2074       length++;
2075       }
2076
2077     /* This "while" is the end of the "do" above. */
2078
2079     while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2080
2081     /* Update the last character and the count of literals */
2082
2083     prevreqchar = (length > 1)? code[-2] : *reqchar;
2084     *reqchar = code[-1];
2085     *countlits += length;
2086
2087     /* Compute the length and set it in the data vector, and advance to
2088     the next state. */
2089
2090     previous[1] = length;
2091     if (length < MAXLIT) ptr--;
2092     break;
2093     }
2094   }                   /* end of big loop */
2095
2096 /* Control never reaches here by falling through, only by a goto for all the
2097 error states. Pass back the position in the pattern so that it can be displayed
2098 to the user for diagnosing the error. */
2099
2100 FAILED:
2101 *ptrptr = ptr;
2102 return FALSE;
2103 }
2104
2105
2106
2107
2108 /*************************************************
2109 *     Compile sequence of alternatives           *
2110 *************************************************/
2111
2112 /* On entry, ptr is pointing past the bracket character, but on return
2113 it points to the closing bracket, or vertical bar, or end of string.
2114 The code variable is pointing at the byte into which the BRA operator has been
2115 stored. If the ims options are changed at the start (for a (?ims: group) or
2116 during any branch, we need to insert an OP_OPT item at the start of every
2117 following branch to ensure they get set correctly at run time, and also pass
2118 the new options into every subsequent branch compile.
2119
2120 Argument:
2121   options     the option bits
2122   optchanged  new ims options to set as if (?ims) were at the start, or -1
2123                for no change
2124   brackets    -> int containing the number of extracting brackets used
2125   codeptr     -> the address of the current code pointer
2126   ptrptr      -> the address of the current pattern pointer
2127   errorptr    -> pointer to error message
2128   lookbehind  TRUE if this is a lookbehind assertion
2129   skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2130   reqchar     -> place to put the last required character, or a negative number
2131   countlits   -> place to put the shortest literal count of any branch
2132   cd          points to the data block with tables pointers
2133
2134 Returns:      TRUE on success
2135 */
2136
2137 static BOOL
2138 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2139   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2140   int *reqchar, int *countlits, compile_data *cd)
2141 {
2142 const uschar *ptr = *ptrptr;
2143 uschar *code = *codeptr;
2144 uschar *last_branch = code;
2145 uschar *start_bracket = code;
2146 uschar *reverse_count = NULL;
2147 int oldoptions = options & PCRE_IMS;
2148 int branchreqchar, branchcountlits;
2149
2150 *reqchar = -1;
2151 *countlits = INT_MAX;
2152 code += 3 + skipbytes;
2153
2154 /* Loop for each alternative branch */
2155
2156 for (;;)
2157   {
2158   int length;
2159
2160   /* Handle change of options */
2161
2162   if (optchanged >= 0)
2163     {
2164     *code++ = OP_OPT;
2165     *code++ = optchanged;
2166     options = (options & ~PCRE_IMS) | optchanged;
2167     }
2168
2169   /* Set up dummy OP_REVERSE if lookbehind assertion */
2170
2171   if (lookbehind)
2172     {
2173     *code++ = OP_REVERSE;
2174     reverse_count = code;
2175     *code++ = 0;
2176     *code++ = 0;
2177     }
2178
2179   /* Now compile the branch */
2180
2181   if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2182       &branchreqchar, &branchcountlits, cd))
2183     {
2184     *ptrptr = ptr;
2185     return FALSE;
2186     }
2187
2188   /* Fill in the length of the last branch */
2189
2190   length = code - last_branch;
2191   last_branch[1] = length >> 8;
2192   last_branch[2] = length & 255;
2193
2194   /* Save the last required character if all branches have the same; a current
2195   value of -1 means unset, while -2 means "previous branch had no last required
2196   char".  */
2197
2198   if (*reqchar != -2)
2199     {
2200     if (branchreqchar >= 0)
2201       {
2202       if (*reqchar == -1) *reqchar = branchreqchar;
2203       else if (*reqchar != branchreqchar) *reqchar = -2;
2204       }
2205     else *reqchar = -2;
2206     }
2207
2208   /* Keep the shortest literal count */
2209
2210   if (branchcountlits < *countlits) *countlits = branchcountlits;
2211   DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2212
2213   /* If lookbehind, check that this branch matches a fixed-length string,
2214   and put the length into the OP_REVERSE item. Temporarily mark the end of
2215   the branch with OP_END. */
2216
2217   if (lookbehind)
2218     {
2219     *code = OP_END;
2220     length = find_fixedlength(last_branch, options);
2221     DPRINTF(("fixed length = %d\n", length));
2222     if (length < 0)
2223       {
2224       *errorptr = ERR25;
2225       *ptrptr = ptr;
2226       return FALSE;
2227       }
2228     reverse_count[0] = (length >> 8);
2229     reverse_count[1] = length & 255;
2230     }
2231
2232   /* Reached end of expression, either ')' or end of pattern. Insert a
2233   terminating ket and the length of the whole bracketed item, and return,
2234   leaving the pointer at the terminating char. If any of the ims options
2235   were changed inside the group, compile a resetting op-code following. */
2236
2237   if (*ptr != '|')
2238     {
2239     length = code - start_bracket;
2240     *code++ = OP_KET;
2241     *code++ = length >> 8;
2242     *code++ = length & 255;
2243     if (optchanged >= 0)
2244       {
2245       *code++ = OP_OPT;
2246       *code++ = oldoptions;
2247       }
2248     *codeptr = code;
2249     *ptrptr = ptr;
2250     return TRUE;
2251     }
2252
2253   /* Another branch follows; insert an "or" node and advance the pointer. */
2254
2255   *code = OP_ALT;
2256   last_branch = code;
2257   code += 3;
2258   ptr++;
2259   }
2260 /* Control never reaches here */
2261 }
2262
2263
2264
2265
2266 /*************************************************
2267 *      Find first significant op code            *
2268 *************************************************/
2269
2270 /* This is called by several functions that scan a compiled expression looking
2271 for a fixed first character, or an anchoring op code etc. It skips over things
2272 that do not influence this. For one application, a change of caseless option is
2273 important.
2274
2275 Arguments:
2276   code       pointer to the start of the group
2277   options    pointer to external options
2278   optbit     the option bit whose changing is significant, or
2279              zero if none are
2280   optstop    TRUE to return on option change, otherwise change the options
2281                value and continue
2282
2283 Returns:     pointer to the first significant opcode
2284 */
2285
2286 static const uschar*
2287 first_significant_code(const uschar *code, int *options, int optbit,
2288   BOOL optstop)
2289 {
2290 for (;;)
2291   {
2292   switch ((int)*code)
2293     {
2294     case OP_OPT:
2295     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2296       {
2297       if (optstop) return code;
2298       *options = (int)code[1];
2299       }
2300     code += 2;
2301     break;
2302
2303     case OP_CREF:
2304     case OP_BRANUMBER:
2305     code += 3;
2306     break;
2307
2308     case OP_WORD_BOUNDARY:
2309     case OP_NOT_WORD_BOUNDARY:
2310     code++;
2311     break;
2312
2313     case OP_ASSERT_NOT:
2314     case OP_ASSERTBACK:
2315     case OP_ASSERTBACK_NOT:
2316     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2317     code += 3;
2318     break;
2319
2320     default:
2321     return code;
2322     }
2323   }
2324 /* Control never reaches here */
2325 }
2326
2327
2328
2329
2330 /*************************************************
2331 *          Check for anchored expression         *
2332 *************************************************/
2333
2334 /* Try to find out if this is an anchored regular expression. Consider each
2335 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2336 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2337 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2338 counts, since OP_CIRC can match in the middle.
2339
2340 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2341 because that will try the rest of the pattern at all possible matching points,
2342 so there is no point trying them again.
2343
2344 Arguments:
2345   code       points to start of expression (the bracket)
2346   options    points to the options setting
2347
2348 Returns:     TRUE or FALSE
2349 */
2350
2351 static BOOL
2352 is_anchored(register const uschar *code, int *options)
2353 {
2354 do {
2355    const uschar *scode = first_significant_code(code + 3, options,
2356      PCRE_MULTILINE, FALSE);
2357    register int op = *scode;
2358    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2359      { if (!is_anchored(scode, options)) return FALSE; }
2360    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2361             (*options & PCRE_DOTALL) != 0)
2362      { if (scode[1] != OP_ANY) return FALSE; }
2363    else if (op != OP_SOD &&
2364            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2365      return FALSE;
2366    code += (code[1] << 8) + code[2];
2367    }
2368 while (*code == OP_ALT);
2369 return TRUE;
2370 }
2371
2372
2373
2374 /*************************************************
2375 *         Check for starting with ^ or .*        *
2376 *************************************************/
2377
2378 /* This is called to find out if every branch starts with ^ or .* so that
2379 "first char" processing can be done to speed things up in multiline
2380 matching and for non-DOTALL patterns that start with .* (which must start at
2381 the beginning or after \n).
2382
2383 Argument:  points to start of expression (the bracket)
2384 Returns:   TRUE or FALSE
2385 */
2386
2387 static BOOL
2388 is_startline(const uschar *code)
2389 {
2390 do {
2391    const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2392    register int op = *scode;
2393    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2394      { if (!is_startline(scode)) return FALSE; }
2395    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2396      { if (scode[1] != OP_ANY) return FALSE; }
2397    else if (op != OP_CIRC) return FALSE;
2398    code += (code[1] << 8) + code[2];
2399    }
2400 while (*code == OP_ALT);
2401 return TRUE;
2402 }
2403
2404
2405
2406 /*************************************************
2407 *          Check for fixed first char            *
2408 *************************************************/
2409
2410 /* Try to find out if there is a fixed first character. This is called for
2411 unanchored expressions, as it speeds up their processing quite considerably.
2412 Consider each alternative branch. If they all start with the same char, or with
2413 a bracket all of whose alternatives start with the same char (recurse ad lib),
2414 then we return that char, otherwise -1.
2415
2416 Arguments:
2417   code       points to start of expression (the bracket)
2418   options    pointer to the options (used to check casing changes)
2419
2420 Returns:     -1 or the fixed first char
2421 */
2422
2423 static int
2424 find_firstchar(const uschar *code, int *options)
2425 {
2426 register int c = -1;
2427 do {
2428    int d;
2429    const uschar *scode = first_significant_code(code + 3, options,
2430      PCRE_CASELESS, TRUE);
2431    register int op = *scode;
2432
2433    if (op >= OP_BRA) op = OP_BRA;
2434
2435    switch(op)
2436      {
2437      default:
2438      return -1;
2439
2440      case OP_BRA:
2441      case OP_ASSERT:
2442      case OP_ONCE:
2443      case OP_COND:
2444      if ((d = find_firstchar(scode, options)) < 0) return -1;
2445      if (c < 0) c = d; else if (c != d) return -1;
2446      break;
2447
2448      case OP_EXACT:       /* Fall through */
2449      scode++;
2450
2451      case OP_CHARS:       /* Fall through */
2452      scode++;
2453
2454      case OP_PLUS:
2455      case OP_MINPLUS:
2456      if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2457      break;
2458      }
2459
2460    code += (code[1] << 8) + code[2];
2461    }
2462 while (*code == OP_ALT);
2463 return c;
2464 }
2465
2466
2467
2468
2469
2470 /*************************************************
2471 *        Compile a Regular Expression            *
2472 *************************************************/
2473
2474 /* This function takes a string and returns a pointer to a block of store
2475 holding a compiled version of the expression.
2476
2477 Arguments:
2478   pattern      the regular expression
2479   options      various option bits
2480   errorptr     pointer to pointer to error text
2481   erroroffset  ptr offset in pattern where error was detected
2482   tables       pointer to character tables or NULL
2483
2484 Returns:       pointer to compiled data block, or NULL on error,
2485                with errorptr and erroroffset set
2486 */
2487
2488 pcre *
2489 pcre_compile(const char *pattern, int options, const char **errorptr,
2490   int *erroroffset, const unsigned char *tables)
2491 {
2492 real_pcre *re;
2493 int length = 3;      /* For initial BRA plus length */
2494 int runlength;
2495 int c, reqchar, countlits;
2496 int bracount = 0;
2497 int top_backref = 0;
2498 int branch_extra = 0;
2499 int branch_newextra;
2500 unsigned int brastackptr = 0;
2501 size_t size;
2502 uschar *code;
2503 const uschar *ptr;
2504 compile_data compile_block;
2505 int brastack[BRASTACK_SIZE];
2506 uschar bralenstack[BRASTACK_SIZE];
2507
2508 #ifdef DEBUG
2509 uschar *code_base, *code_end;
2510 #endif
2511
2512 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2513
2514 #ifndef SUPPORT_UTF8
2515 if ((options & PCRE_UTF8) != 0)
2516   {
2517   *errorptr = ERR32;
2518   return NULL;
2519   }
2520 #endif
2521
2522 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2523 can do is just return NULL. */
2524
2525 if (errorptr == NULL) return NULL;
2526 *errorptr = NULL;
2527
2528 /* However, we can give a message for this error */
2529
2530 if (erroroffset == NULL)
2531   {
2532   *errorptr = ERR16;
2533   return NULL;
2534   }
2535 *erroroffset = 0;
2536
2537 if ((options & ~PUBLIC_OPTIONS) != 0)
2538   {
2539   *errorptr = ERR17;
2540   return NULL;
2541   }
2542
2543 /* Set up pointers to the individual character tables */
2544
2545 if (tables == NULL) tables = pcre_default_tables;
2546 compile_block.lcc = tables + lcc_offset;
2547 compile_block.fcc = tables + fcc_offset;
2548 compile_block.cbits = tables + cbits_offset;
2549 compile_block.ctypes = tables + ctypes_offset;
2550
2551 /* Reflect pattern for debugging output */
2552
2553 DPRINTF(("------------------------------------------------------------------\n"));
2554 DPRINTF(("%s\n", pattern));
2555
2556 /* The first thing to do is to make a pass over the pattern to compute the
2557 amount of store required to hold the compiled code. This does not have to be
2558 perfect as long as errors are overestimates. At the same time we can detect any
2559 internal flag settings. Make an attempt to correct for any counted white space
2560 if an "extended" flag setting appears late in the pattern. We can't be so
2561 clever for #-comments. */
2562
2563 ptr = (const uschar *)(pattern - 1);
2564 while ((c = *(++ptr)) != 0)
2565   {
2566   int min, max;
2567   int class_charcount;
2568   int bracket_length;
2569
2570   if ((options & PCRE_EXTENDED) != 0)
2571     {
2572     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2573     if (c == '#')
2574       {
2575       /* The space before the ; is to avoid a warning on a silly compiler
2576       on the Macintosh. */
2577       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2578       continue;
2579       }
2580     }
2581
2582   switch(c)
2583     {
2584     /* A backslashed item may be an escaped "normal" character or a
2585     character type. For a "normal" character, put the pointers and
2586     character back so that tests for whitespace etc. in the input
2587     are done correctly. */
2588
2589     case '\\':
2590       {
2591       const uschar *save_ptr = ptr;
2592       c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2593       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2594       if (c >= 0)
2595         {
2596         ptr = save_ptr;
2597         c = '\\';
2598         goto NORMAL_CHAR;
2599         }
2600       }
2601     length++;
2602
2603     /* A back reference needs an additional 2 bytes, plus either one or 5
2604     bytes for a repeat. We also need to keep the value of the highest
2605     back reference. */
2606
2607     if (c <= -ESC_REF)
2608       {
2609       int refnum = -c - ESC_REF;
2610       if (refnum > top_backref) top_backref = refnum;
2611       length += 2;   /* For single back reference */
2612       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2613         {
2614         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2615         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2616         if ((min == 0 && (max == 1 || max == -1)) ||
2617           (min == 1 && max == -1))
2618             length++;
2619         else length += 5;
2620         if (ptr[1] == '?') ptr++;
2621         }
2622       }
2623     continue;
2624
2625     case '^':
2626     case '.':
2627     case '$':
2628     case '*':     /* These repeats won't be after brackets; */
2629     case '+':     /* those are handled separately */
2630     case '?':
2631     length++;
2632     continue;
2633
2634     /* This covers the cases of repeats after a single char, metachar, class,
2635     or back reference. */
2636
2637     case '{':
2638     if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2639     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2640     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2641     if ((min == 0 && (max == 1 || max == -1)) ||
2642       (min == 1 && max == -1))
2643         length++;
2644     else
2645       {
2646       length--;   /* Uncount the original char or metachar */
2647       if (min == 1) length++; else if (min > 0) length += 4;
2648       if (max > 0) length += 4; else length += 2;
2649       }
2650     if (ptr[1] == '?') ptr++;
2651     continue;
2652
2653     /* An alternation contains an offset to the next branch or ket. If any ims
2654     options changed in the previous branch(es), and/or if we are in a
2655     lookbehind assertion, extra space will be needed at the start of the
2656     branch. This is handled by branch_extra. */
2657
2658     case '|':
2659     length += 3 + branch_extra;
2660     continue;
2661
2662     /* A character class uses 33 characters. Don't worry about character types
2663     that aren't allowed in classes - they'll get picked up during the compile.
2664     A character class that contains only one character uses 2 or 3 bytes,
2665     depending on whether it is negated or not. Notice this where we can. */
2666
2667     case '[':
2668     class_charcount = 0;
2669     if (*(++ptr) == '^') ptr++;
2670     do
2671       {
2672       if (*ptr == '\\')
2673         {
2674         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2675           &compile_block);
2676         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2678         }
2679       else class_charcount++;
2680       ptr++;
2681       }
2682     while (*ptr != 0 && *ptr != ']');
2683
2684     /* Repeats for negated single chars are handled by the general code */
2685
2686     if (class_charcount == 1) length += 3; else
2687       {
2688       length += 33;
2689
2690       /* A repeat needs either 1 or 5 bytes. */
2691
2692       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2693         {
2694         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2695         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2696         if ((min == 0 && (max == 1 || max == -1)) ||
2697           (min == 1 && max == -1))
2698             length++;
2699         else length += 5;
2700         if (ptr[1] == '?') ptr++;
2701         }
2702       }
2703     continue;
2704
2705     /* Brackets may be genuine groups or special things */
2706
2707     case '(':
2708     branch_newextra = 0;
2709     bracket_length = 3;
2710
2711     /* Handle special forms of bracket, which all start (? */
2712
2713     if (ptr[1] == '?')
2714       {
2715       int set, unset;
2716       int *optset;
2717
2718       switch (c = ptr[2])
2719         {
2720         /* Skip over comments entirely */
2721         case '#':
2722         ptr += 3;
2723         while (*ptr != 0 && *ptr != ')') ptr++;
2724         if (*ptr == 0)
2725           {
2726           *errorptr = ERR18;
2727           goto PCRE_ERROR_RETURN;
2728           }
2729         continue;
2730
2731         /* Non-referencing groups and lookaheads just move the pointer on, and
2732         then behave like a non-special bracket, except that they don't increment
2733         the count of extracting brackets. Ditto for the "once only" bracket,
2734         which is in Perl from version 5.005. */
2735
2736         case ':':
2737         case '=':
2738         case '!':
2739         case '>':
2740         ptr += 2;
2741         break;
2742
2743         /* A recursive call to the regex is an extension, to provide the
2744         facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2745
2746         case 'R':
2747         if (ptr[3] != ')')
2748           {
2749           *errorptr = ERR29;
2750           goto PCRE_ERROR_RETURN;
2751           }
2752         ptr += 3;
2753         length += 1;
2754         break;
2755
2756         /* Lookbehinds are in Perl from version 5.005 */
2757
2758         case '<':
2759         if (ptr[3] == '=' || ptr[3] == '!')
2760           {
2761           ptr += 3;
2762           branch_newextra = 3;
2763           length += 3;         /* For the first branch */
2764           break;
2765           }
2766         *errorptr = ERR24;
2767         goto PCRE_ERROR_RETURN;
2768
2769         /* Conditionals are in Perl from version 5.005. The bracket must either
2770         be followed by a number (for bracket reference) or by an assertion
2771         group. */
2772
2773         case '(':
2774         if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2775           {
2776           ptr += 4;
2777           length += 3;
2778           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2779           if (*ptr != ')')
2780             {
2781             *errorptr = ERR26;
2782             goto PCRE_ERROR_RETURN;
2783             }
2784           }
2785         else   /* An assertion must follow */
2786           {
2787           ptr++;   /* Can treat like ':' as far as spacing is concerned */
2788           if (ptr[2] != '?' ||
2789              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2790             {
2791             ptr += 2;    /* To get right offset in message */
2792             *errorptr = ERR28;
2793             goto PCRE_ERROR_RETURN;
2794             }
2795           }
2796         break;
2797
2798         /* Else loop checking valid options until ) is met. Anything else is an
2799         error. If we are without any brackets, i.e. at top level, the settings
2800         act as if specified in the options, so massage the options immediately.
2801         This is for backward compatibility with Perl 5.004. */
2802
2803         default:
2804         set = unset = 0;
2805         optset = &set;
2806         ptr += 2;
2807
2808         for (;; ptr++)
2809           {
2810           c = *ptr;
2811           switch (c)
2812             {
2813             case 'i':
2814             *optset |= PCRE_CASELESS;
2815             continue;
2816
2817             case 'm':
2818             *optset |= PCRE_MULTILINE;
2819             continue;
2820
2821             case 's':
2822             *optset |= PCRE_DOTALL;
2823             continue;
2824
2825             case 'x':
2826             *optset |= PCRE_EXTENDED;
2827             continue;
2828
2829             case 'X':
2830             *optset |= PCRE_EXTRA;
2831             continue;
2832
2833             case 'U':
2834             *optset |= PCRE_UNGREEDY;
2835             continue;
2836
2837             case '-':
2838             optset = &unset;
2839             continue;
2840
2841             /* A termination by ')' indicates an options-setting-only item;
2842             this is global at top level; otherwise nothing is done here and
2843             it is handled during the compiling process on a per-bracket-group
2844             basis. */
2845
2846             case ')':
2847             if (brastackptr == 0)
2848               {
2849               options = (options | set) & (~unset);
2850               set = unset = 0;     /* To save length */
2851               }
2852             /* Fall through */
2853
2854             /* A termination by ':' indicates the start of a nested group with
2855             the given options set. This is again handled at compile time, but
2856             we must allow for compiled space if any of the ims options are
2857             set. We also have to allow for resetting space at the end of
2858             the group, which is why 4 is added to the length and not just 2.
2859             If there are several changes of options within the same group, this
2860             will lead to an over-estimate on the length, but this shouldn't
2861             matter very much. We also have to allow for resetting options at
2862             the start of any alternations, which we do by setting
2863             branch_newextra to 2. Finally, we record whether the case-dependent
2864             flag ever changes within the regex. This is used by the "required
2865             character" code. */
2866
2867             case ':':
2868             if (((set|unset) & PCRE_IMS) != 0)
2869               {
2870               length += 4;
2871               branch_newextra = 2;
2872               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2873               }
2874             goto END_OPTIONS;
2875
2876             /* Unrecognized option character */
2877
2878             default:
2879             *errorptr = ERR12;
2880             goto PCRE_ERROR_RETURN;
2881             }
2882           }
2883
2884         /* If we hit a closing bracket, that's it - this is a freestanding
2885         option-setting. We need to ensure that branch_extra is updated if
2886         necessary. The only values branch_newextra can have here are 0 or 2.
2887         If the value is 2, then branch_extra must either be 2 or 5, depending
2888         on whether this is a lookbehind group or not. */
2889
2890         END_OPTIONS:
2891         if (c == ')')
2892           {
2893           if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2894             branch_extra += branch_newextra;
2895           continue;
2896           }
2897
2898         /* If options were terminated by ':' control comes here. Fall through
2899         to handle the group below. */
2900         }
2901       }
2902
2903     /* Extracting brackets must be counted so we can process escapes in a
2904     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2905     need an additional 3 bytes of store per extracting bracket. */
2906
2907     else
2908       {
2909       bracount++;
2910       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2911       }
2912
2913     /* Save length for computing whole length at end if there's a repeat that
2914     requires duplication of the group. Also save the current value of
2915     branch_extra, and start the new group with the new value. If non-zero, this
2916     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2917
2918     if (brastackptr >= sizeof(brastack)/sizeof(int))
2919       {
2920       *errorptr = ERR19;
2921       goto PCRE_ERROR_RETURN;
2922       }
2923
2924     bralenstack[brastackptr] = branch_extra;
2925     branch_extra = branch_newextra;
2926
2927     brastack[brastackptr++] = length;
2928     length += bracket_length;
2929     continue;
2930
2931     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2932     have to replicate this bracket up to that many times. If brastackptr is
2933     0 this is an unmatched bracket which will generate an error, but take care
2934     not to try to access brastack[-1] when computing the length and restoring
2935     the branch_extra value. */
2936
2937     case ')':
2938     length += 3;
2939       {
2940       int minval = 1;
2941       int maxval = 1;
2942       int duplength;
2943
2944       if (brastackptr > 0)
2945         {
2946         duplength = length - brastack[--brastackptr];
2947         branch_extra = bralenstack[brastackptr];
2948         }
2949       else duplength = 0;
2950
2951       /* Leave ptr at the final char; for read_repeat_counts this happens
2952       automatically; for the others we need an increment. */
2953
2954       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2955         {
2956         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2957           &compile_block);
2958         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2959         }
2960       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2961       else if (c == '+') { maxval = -1; ptr++; }
2962       else if (c == '?') { minval = 0; ptr++; }
2963
2964       /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2965       group, and if the maximum is greater than zero, we have to replicate
2966       maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2967       bracket set - hence the 7. */
2968
2969       if (minval == 0)
2970         {
2971         length++;
2972         if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2973         }
2974
2975       /* When the minimum is greater than zero, 1 we have to replicate up to
2976       minval-1 times, with no additions required in the copies. Then, if
2977       there is a limited maximum we have to replicate up to maxval-1 times
2978       allowing for a BRAZERO item before each optional copy and nesting
2979       brackets for all but one of the optional copies. */
2980
2981       else
2982         {
2983         length += (minval - 1) * duplength;
2984         if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2985           length += (maxval - minval) * (duplength + 7) - 6;
2986         }
2987       }
2988     continue;
2989
2990     /* Non-special character. For a run of such characters the length required
2991     is the number of characters + 2, except that the maximum run length is 255.
2992     We won't get a skipped space or a non-data escape or the start of a #
2993     comment as the first character, so the length can't be zero. */
2994
2995     NORMAL_CHAR:
2996     default:
2997     length += 2;
2998     runlength = 0;
2999     do
3000       {
3001       if ((options & PCRE_EXTENDED) != 0)
3002         {
3003         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3004         if (c == '#')
3005           {
3006           /* The space before the ; is to avoid a warning on a silly compiler
3007           on the Macintosh. */
3008           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3009           continue;
3010           }
3011         }
3012
3013       /* Backslash may introduce a data char or a metacharacter; stop the
3014       string before the latter. */
3015
3016       if (c == '\\')
3017         {
3018         const uschar *saveptr = ptr;
3019         c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3020           &compile_block);
3021         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3022         if (c < 0) { ptr = saveptr; break; }
3023
3024 #ifdef SUPPORT_UTF8
3025         if (c > 127 && (options & PCRE_UTF8) != 0)
3026           {
3027           int i;
3028           for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3029             if (c <= utf8_table1[i]) break;
3030           runlength += i;
3031           }
3032 #endif
3033         }
3034
3035       /* Ordinary character or single-char escape */
3036
3037       runlength++;
3038       }
3039
3040     /* This "while" is the end of the "do" above. */
3041
3042     while (runlength < MAXLIT &&
3043       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3044
3045     ptr--;
3046     length += runlength;
3047     continue;
3048     }
3049   }
3050
3051 length += 4;    /* For final KET and END */
3052
3053 if (length > 65539)
3054   {
3055   *errorptr = ERR20;
3056   return NULL;
3057   }
3058
3059 /* Compute the size of data block needed and get it, either from malloc or
3060 externally provided function. We specify "code[0]" in the offsetof() expression
3061 rather than just "code", because it has been reported that one broken compiler
3062 fails on "code" because it is also an independent variable. It should make no
3063 difference to the value of the offsetof(). */
3064
3065 size = length + offsetof(real_pcre, code[0]);
3066 re = (real_pcre *)(pcre_malloc)(size);
3067
3068 if (re == NULL)
3069   {
3070   *errorptr = ERR21;
3071   return NULL;
3072   }
3073
3074 /* Put in the magic number, and save the size, options, and table pointer */
3075
3076 re->magic_number = MAGIC_NUMBER;
3077 re->size = size;
3078 re->options = options;
3079 re->tables = tables;
3080
3081 /* Set up a starting, non-extracting bracket, then compile the expression. On
3082 error, *errorptr will be set non-NULL, so we don't need to look at the result
3083 of the function here. */
3084
3085 ptr = (const uschar *)pattern;
3086 code = re->code;
3087 *code = OP_BRA;
3088 bracount = 0;
3089 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3090   &reqchar, &countlits, &compile_block);
3091 re->top_bracket = bracount;
3092 re->top_backref = top_backref;
3093
3094 /* If not reached end of pattern on success, there's an excess bracket. */
3095
3096 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3097
3098 /* Fill in the terminating state and check for disastrous overflow, but
3099 if debugging, leave the test till after things are printed out. */
3100
3101 *code++ = OP_END;
3102
3103 #ifndef DEBUG
3104 if (code - re->code > length) *errorptr = ERR23;
3105 #endif
3106
3107 /* Give an error if there's back reference to a non-existent capturing
3108 subpattern. */
3109
3110 if (top_backref > re->top_bracket) *errorptr = ERR15;
3111
3112 /* Failed to compile */
3113
3114 if (*errorptr != NULL)
3115   {
3116   (pcre_free)(re);
3117   PCRE_ERROR_RETURN:
3118   *erroroffset = ptr - (const uschar *)pattern;
3119   return NULL;
3120   }
3121
3122 /* If the anchored option was not passed, set flag if we can determine that the
3123 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3124 starting with .* when DOTALL is set).
3125
3126 Otherwise, see if we can determine what the first character has to be, because
3127 that speeds up unanchored matches no end. If not, see if we can set the
3128 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3129 start with ^. and also when all branches start with .* for non-DOTALL matches.
3130 */
3131
3132 if ((options & PCRE_ANCHORED) == 0)
3133   {
3134   int temp_options = options;
3135   if (is_anchored(re->code, &temp_options))
3136     re->options |= PCRE_ANCHORED;
3137   else
3138     {
3139     int ch = find_firstchar(re->code, &temp_options);
3140     if (ch >= 0)
3141       {
3142       re->first_char = ch;
3143       re->options |= PCRE_FIRSTSET;
3144       }
3145     else if (is_startline(re->code))
3146       re->options |= PCRE_STARTLINE;
3147     }
3148   }
3149
3150 /* Save the last required character if there are at least two literal
3151 characters on all paths, or if there is no first character setting. */
3152
3153 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3154   {
3155   re->req_char = reqchar;
3156   re->options |= PCRE_REQCHSET;
3157   }
3158
3159 /* Print out the compiled data for debugging */
3160
3161 #ifdef DEBUG
3162
3163 printf("Length = %d top_bracket = %d top_backref = %d\n",
3164   length, re->top_bracket, re->top_backref);
3165
3166 if (re->options != 0)
3167   {
3168   printf("%s%s%s%s%s%s%s%s%s\n",
3169     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3170     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3171     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3172     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3173     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3174     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3175     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3176     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3177     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3178   }
3179
3180 if ((re->options & PCRE_FIRSTSET) != 0)
3181   {
3182   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3183     else printf("First char = \\x%02x\n", re->first_char);
3184   }
3185
3186 if ((re->options & PCRE_REQCHSET) != 0)
3187   {
3188   if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3189     else printf("Req char = \\x%02x\n", re->req_char);
3190   }
3191
3192 code_end = code;
3193 code_base = code = re->code;
3194
3195 while (code < code_end)
3196   {
3197   int charlength;
3198
3199   printf("%3d ", code - code_base);
3200
3201   if (*code >= OP_BRA)
3202     {
3203     if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3204       printf("%3d Bra extra", (code[1] << 8) + code[2]);
3205     else
3206       printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3207     code += 2;
3208     }
3209
3210   else switch(*code)
3211     {
3212     case OP_OPT:
3213     printf(" %.2x %s", code[1], OP_names[*code]);
3214     code++;
3215     break;
3216
3217     case OP_CHARS:
3218     charlength = *(++code);
3219     printf("%3d ", charlength);
3220     while (charlength-- > 0)
3221       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3222     break;
3223
3224     case OP_KETRMAX:
3225     case OP_KETRMIN:
3226     case OP_ALT:
3227     case OP_KET:
3228     case OP_ASSERT:
3229     case OP_ASSERT_NOT:
3230     case OP_ASSERTBACK:
3231     case OP_ASSERTBACK_NOT:
3232     case OP_ONCE:
3233     case OP_REVERSE:
3234     case OP_BRANUMBER:
3235     case OP_COND:
3236     case OP_CREF:
3237     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3238     code += 2;
3239     break;
3240
3241     case OP_STAR:
3242     case OP_MINSTAR:
3243     case OP_PLUS:
3244     case OP_MINPLUS:
3245     case OP_QUERY:
3246     case OP_MINQUERY:
3247     case OP_TYPESTAR:
3248     case OP_TYPEMINSTAR:
3249     case OP_TYPEPLUS:
3250     case OP_TYPEMINPLUS:
3251     case OP_TYPEQUERY:
3252     case OP_TYPEMINQUERY:
3253     if (*code >= OP_TYPESTAR)
3254       printf("    %s", OP_names[code[1]]);
3255     else if (isprint(c = code[1])) printf("    %c", c);
3256       else printf("    \\x%02x", c);
3257     printf("%s", OP_names[*code++]);
3258     break;
3259
3260     case OP_EXACT:
3261     case OP_UPTO:
3262     case OP_MINUPTO:
3263     if (isprint(c = code[3])) printf("    %c{", c);
3264       else printf("    \\x%02x{", c);
3265     if (*code != OP_EXACT) printf("0,");
3266     printf("%d}", (code[1] << 8) + code[2]);
3267     if (*code == OP_MINUPTO) printf("?");
3268     code += 3;
3269     break;
3270
3271     case OP_TYPEEXACT:
3272     case OP_TYPEUPTO:
3273     case OP_TYPEMINUPTO:
3274     printf("    %s{", OP_names[code[3]]);
3275     if (*code != OP_TYPEEXACT) printf(",");
3276     printf("%d}", (code[1] << 8) + code[2]);
3277     if (*code == OP_TYPEMINUPTO) printf("?");
3278     code += 3;
3279     break;
3280
3281     case OP_NOT:
3282     if (isprint(c = *(++code))) printf("    [^%c]", c);
3283       else printf("    [^\\x%02x]", c);
3284     break;
3285
3286     case OP_NOTSTAR:
3287     case OP_NOTMINSTAR:
3288     case OP_NOTPLUS:
3289     case OP_NOTMINPLUS:
3290     case OP_NOTQUERY:
3291     case OP_NOTMINQUERY:
3292     if (isprint(c = code[1])) printf("    [^%c]", c);
3293       else printf("    [^\\x%02x]", c);
3294     printf("%s", OP_names[*code++]);
3295     break;
3296
3297     case OP_NOTEXACT:
3298     case OP_NOTUPTO:
3299     case OP_NOTMINUPTO:
3300     if (isprint(c = code[3])) printf("    [^%c]{", c);
3301       else printf("    [^\\x%02x]{", c);
3302     if (*code != OP_NOTEXACT) printf(",");
3303     printf("%d}", (code[1] << 8) + code[2]);
3304     if (*code == OP_NOTMINUPTO) printf("?");
3305     code += 3;
3306     break;
3307
3308     case OP_REF:
3309     printf("    \\%d", (code[1] << 8) | code[2]);
3310     code += 3;
3311     goto CLASS_REF_REPEAT;
3312
3313     case OP_CLASS:
3314       {
3315       int i, min, max;
3316       code++;
3317       printf("    [");
3318
3319       for (i = 0; i < 256; i++)
3320         {
3321         if ((code[i/8] & (1 << (i&7))) != 0)
3322           {
3323           int j;
3324           for (j = i+1; j < 256; j++)
3325             if ((code[j/8] & (1 << (j&7))) == 0) break;
3326           if (i == '-' || i == ']') printf("\\");
3327           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3328           if (--j > i)
3329             {
3330             printf("-");
3331             if (j == '-' || j == ']') printf("\\");
3332             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3333             }
3334           i = j;
3335           }
3336         }
3337       printf("]");
3338       code += 32;
3339
3340       CLASS_REF_REPEAT:
3341
3342       switch(*code)
3343         {
3344         case OP_CRSTAR:
3345         case OP_CRMINSTAR:
3346         case OP_CRPLUS:
3347         case OP_CRMINPLUS:
3348         case OP_CRQUERY:
3349         case OP_CRMINQUERY:
3350         printf("%s", OP_names[*code]);
3351         break;
3352
3353         case OP_CRRANGE:
3354         case OP_CRMINRANGE:
3355         min = (code[1] << 8) + code[2];
3356         max = (code[3] << 8) + code[4];
3357         if (max == 0) printf("{%d,}", min);
3358         else printf("{%d,%d}", min, max);
3359         if (*code == OP_CRMINRANGE) printf("?");
3360         code += 4;
3361         break;
3362
3363         default:
3364         code--;
3365         }
3366       }
3367     break;
3368
3369     /* Anything else is just a one-node item */
3370
3371     default:
3372     printf("    %s", OP_names[*code]);
3373     break;
3374     }
3375
3376   code++;
3377   printf("\n");
3378   }
3379 printf("------------------------------------------------------------------\n");
3380
3381 /* This check is done here in the debugging case so that the code that
3382 was compiled can be seen. */
3383
3384 if (code - re->code > length)
3385   {
3386   *errorptr = ERR23;
3387   (pcre_free)(re);
3388   *erroroffset = ptr - (uschar *)pattern;
3389   return NULL;
3390   }
3391 #endif
3392
3393 return (pcre *)re;
3394 }
3395
3396
3397
3398 /*************************************************
3399 *          Match a back-reference                *
3400 *************************************************/
3401
3402 /* If a back reference hasn't been set, the length that is passed is greater
3403 than the number of characters left in the string, so the match fails.
3404
3405 Arguments:
3406   offset      index into the offset vector
3407   eptr        points into the subject
3408   length      length to be matched
3409   md          points to match data block
3410   ims         the ims flags
3411
3412 Returns:      TRUE if matched
3413 */
3414
3415 static BOOL
3416 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3417   unsigned long int ims)
3418 {
3419 const uschar *p = md->start_subject + md->offset_vector[offset];
3420
3421 #ifdef DEBUG
3422 if (eptr >= md->end_subject)
3423   printf("matching subject <null>");
3424 else
3425   {
3426   printf("matching subject ");
3427   pchars(eptr, length, TRUE, md);
3428   }
3429 printf(" against backref ");
3430 pchars(p, length, FALSE, md);
3431 printf("\n");
3432 #endif
3433
3434 /* Always fail if not enough characters left */
3435
3436 if (length > md->end_subject - eptr) return FALSE;
3437
3438 /* Separate the caselesss case for speed */
3439
3440 if ((ims & PCRE_CASELESS) != 0)
3441   {
3442   while (length-- > 0)
3443     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3444   }
3445 else
3446   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3447
3448 return TRUE;
3449 }
3450
3451
3452
3453 /*************************************************
3454 *         Match from current position            *
3455 *************************************************/
3456
3457 /* On entry ecode points to the first opcode, and eptr to the first character
3458 in the subject string, while eptrb holds the value of eptr at the start of the
3459 last bracketed group - used for breaking infinite loops matching zero-length
3460 strings.
3461
3462 Arguments:
3463    eptr        pointer in subject
3464    ecode       position in code
3465    offset_top  current top pointer
3466    md          pointer to "static" info for the match
3467    ims         current /i, /m, and /s options
3468    eptrb       pointer to chain of blocks containing eptr at start of
3469                  brackets - for testing for empty matches
3470    flags       can contain
3471                  match_condassert - this is an assertion condition
3472                  match_isgroup - this is the start of a bracketed group
3473
3474 Returns:       TRUE if matched
3475 */
3476
3477 static BOOL
3478 match(register const uschar *eptr, register const uschar *ecode,
3479   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3480   int flags)
3481 {
3482 unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3483 eptrblock newptrb;
3484
3485 /* At the start of a bracketed group, add the current subject pointer to the
3486 stack of such pointers, to be re-instated at the end of the group when we hit
3487 the closing ket. When match() is called in other circumstances, we don't add to
3488 the stack. */
3489
3490 if ((flags & match_isgroup) != 0)
3491   {
3492   newptrb.prev = eptrb;
3493   newptrb.saved_eptr = eptr;
3494   eptrb = &newptrb;
3495   }
3496
3497 /* Now start processing the operations. */
3498
3499 for (;;)
3500   {
3501   int op = (int)*ecode;
3502   int min, max, ctype;
3503   register int i;
3504   register int c;
3505   BOOL minimize = FALSE;
3506
3507   /* Opening capturing bracket. If there is space in the offset vector, save
3508   the current subject position in the working slot at the top of the vector. We
3509   mustn't change the current values of the data slot, because they may be set
3510   from a previous iteration of this group, and be referred to by a reference
3511   inside the group.
3512
3513   If the bracket fails to match, we need to restore this value and also the
3514   values of the final offsets, in case they were set by a previous iteration of
3515   the same bracket.
3516
3517   If there isn't enough space in the offset vector, treat this as if it were a
3518   non-capturing bracket. Don't worry about setting the flag for the error case
3519   here; that is handled in the code for KET. */
3520
3521   if (op > OP_BRA)
3522     {
3523     int offset;
3524     int number = op - OP_BRA;
3525
3526     /* For extended extraction brackets (large number), we have to fish out the
3527     number from a dummy opcode at the start. */
3528
3529     if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3530     offset = number << 1;
3531
3532 #ifdef DEBUG
3533     printf("start bracket %d subject=", number);
3534     pchars(eptr, 16, TRUE, md);
3535     printf("\n");
3536 #endif
3537
3538     if (offset < md->offset_max)
3539       {
3540       int save_offset1 = md->offset_vector[offset];
3541       int save_offset2 = md->offset_vector[offset+1];
3542       int save_offset3 = md->offset_vector[md->offset_end - number];
3543
3544       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3545       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3546
3547       do
3548         {
3549         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3550           return TRUE;
3551         ecode += (ecode[1] << 8) + ecode[2];
3552         }
3553       while (*ecode == OP_ALT);
3554
3555       DPRINTF(("bracket %d failed\n", number));
3556
3557       md->offset_vector[offset] = save_offset1;
3558       md->offset_vector[offset+1] = save_offset2;
3559       md->offset_vector[md->offset_end - number] = save_offset3;
3560
3561       return FALSE;
3562       }
3563
3564     /* Insufficient room for saving captured contents */
3565
3566     else op = OP_BRA;
3567     }
3568
3569   /* Other types of node can be handled by a switch */
3570
3571   switch(op)
3572     {
3573     case OP_BRA:     /* Non-capturing bracket: optimized */
3574     DPRINTF(("start bracket 0\n"));
3575     do
3576       {
3577       if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3578         return TRUE;
3579       ecode += (ecode[1] << 8) + ecode[2];
3580       }
3581     while (*ecode == OP_ALT);
3582     DPRINTF(("bracket 0 failed\n"));
3583     return FALSE;
3584
3585     /* Conditional group: compilation checked that there are no more than
3586     two branches. If the condition is false, skipping the first branch takes us
3587     past the end if there is only one branch, but that's OK because that is
3588     exactly what going to the ket would do. */
3589
3590     case OP_COND:
3591     if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3592       {
3593       int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3594       return match(eptr,
3595         ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3596           6 : 3 + (ecode[1] << 8) + ecode[2]),
3597         offset_top, md, ims, eptrb, match_isgroup);
3598       }
3599
3600     /* The condition is an assertion. Call match() to evaluate it - setting
3601     the final argument TRUE causes it to stop at the end of an assertion. */
3602
3603     else
3604       {
3605       if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3606           match_condassert | match_isgroup))
3607         {
3608         ecode += 3 + (ecode[4] << 8) + ecode[5];
3609         while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3610         }
3611       else ecode += (ecode[1] << 8) + ecode[2];
3612       return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3613       }
3614     /* Control never reaches here */
3615
3616     /* Skip over conditional reference or large extraction number data if
3617     encountered. */
3618
3619     case OP_CREF:
3620     case OP_BRANUMBER:
3621     ecode += 3;
3622     break;
3623
3624     /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3625     an empty string - recursion will then try other alternatives, if any. */
3626
3627     case OP_END:
3628     if (md->notempty && eptr == md->start_match) return FALSE;
3629     md->end_match_ptr = eptr;          /* Record where we ended */
3630     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3631     return TRUE;
3632
3633     /* Change option settings */
3634
3635     case OP_OPT:
3636     ims = ecode[1];
3637     ecode += 2;
3638     DPRINTF(("ims set to %02lx\n", ims));
3639     break;
3640
3641     /* Assertion brackets. Check the alternative branches in turn - the
3642     matching won't pass the KET for an assertion. If any one branch matches,
3643     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3644     start of each branch to move the current point backwards, so the code at
3645     this level is identical to the lookahead case. */
3646
3647     case OP_ASSERT:
3648     case OP_ASSERTBACK:
3649     do
3650       {
3651       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3652       ecode += (ecode[1] << 8) + ecode[2];
3653       }
3654     while (*ecode == OP_ALT);
3655     if (*ecode == OP_KET) return FALSE;
3656
3657     /* If checking an assertion for a condition, return TRUE. */
3658
3659     if ((flags & match_condassert) != 0) return TRUE;
3660
3661     /* Continue from after the assertion, updating the offsets high water
3662     mark, since extracts may have been taken during the assertion. */
3663
3664     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3665     ecode += 3;
3666     offset_top = md->end_offset_top;
3667     continue;
3668
3669     /* Negative assertion: all branches must fail to match */
3670
3671     case OP_ASSERT_NOT:
3672     case OP_ASSERTBACK_NOT:
3673     do
3674       {
3675       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3676         return FALSE;
3677       ecode += (ecode[1] << 8) + ecode[2];
3678       }
3679     while (*ecode == OP_ALT);
3680
3681     if ((flags & match_condassert) != 0) return TRUE;
3682
3683     ecode += 3;
3684     continue;
3685
3686     /* Move the subject pointer back. This occurs only at the start of
3687     each branch of a lookbehind assertion. If we are too close to the start to
3688     move back, this match function fails. When working with UTF-8 we move
3689     back a number of characters, not bytes. */
3690
3691     case OP_REVERSE:
3692 #ifdef SUPPORT_UTF8
3693     c = (ecode[1] << 8) + ecode[2];
3694     for (i = 0; i < c; i++)
3695       {
3696       eptr--;
3697       BACKCHAR(eptr)
3698       }
3699 #else
3700     eptr -= (ecode[1] << 8) + ecode[2];
3701 #endif
3702
3703     if (eptr < md->start_subject) return FALSE;
3704     ecode += 3;
3705     break;
3706
3707     /* Recursion matches the current regex, nested. If there are any capturing
3708     brackets started but not finished, we have to save their starting points
3709     and reinstate them after the recursion. However, we don't know how many
3710     such there are (offset_top records the completed total) so we just have
3711     to save all the potential data. There may be up to 99 such values, which
3712     is a bit large to put on the stack, but using malloc for small numbers
3713     seems expensive. As a compromise, the stack is used when there are fewer
3714     than 16 values to store; otherwise malloc is used. A problem is what to do
3715     if the malloc fails ... there is no way of returning to the top level with
3716     an error. Save the top 15 values on the stack, and accept that the rest
3717     may be wrong. */
3718
3719     case OP_RECURSE:
3720       {
3721       BOOL rc;
3722       int *save;
3723       int stacksave[15];
3724
3725       c = md->offset_max;
3726
3727       if (c < 16) save = stacksave; else
3728         {
3729         save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3730         if (save == NULL)
3731           {
3732           save = stacksave;
3733           c = 15;
3734           }
3735         }
3736
3737       for (i = 1; i <= c; i++)
3738         save[i] = md->offset_vector[md->offset_end - i];
3739       rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3740         match_isgroup);
3741       for (i = 1; i <= c; i++)
3742         md->offset_vector[md->offset_end - i] = save[i];
3743       if (save != stacksave) (pcre_free)(save);
3744       if (!rc) return FALSE;
3745
3746       /* In case the recursion has set more capturing values, save the final
3747       number, then move along the subject till after the recursive match,
3748       and advance one byte in the pattern code. */
3749
3750       offset_top = md->end_offset_top;
3751       eptr = md->end_match_ptr;
3752       ecode++;
3753       }
3754     break;
3755
3756     /* "Once" brackets are like assertion brackets except that after a match,
3757     the point in the subject string is not moved back. Thus there can never be
3758     a move back into the brackets. Check the alternative branches in turn - the
3759     matching won't pass the KET for this kind of subpattern. If any one branch
3760     matches, we carry on as at the end of a normal bracket, leaving the subject
3761     pointer. */
3762
3763     case OP_ONCE:
3764       {
3765       const uschar *prev = ecode;
3766       const uschar *saved_eptr = eptr;
3767
3768       do
3769         {
3770         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3771           break;
3772         ecode += (ecode[1] << 8) + ecode[2];
3773         }
3774       while (*ecode == OP_ALT);
3775
3776       /* If hit the end of the group (which could be repeated), fail */
3777
3778       if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3779
3780       /* Continue as from after the assertion, updating the offsets high water
3781       mark, since extracts may have been taken. */
3782
3783       do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3784
3785       offset_top = md->end_offset_top;
3786       eptr = md->end_match_ptr;
3787
3788       /* For a non-repeating ket, just continue at this level. This also
3789       happens for a repeating ket if no characters were matched in the group.
3790       This is the forcible breaking of infinite loops as implemented in Perl
3791       5.005. If there is an options reset, it will get obeyed in the normal
3792       course of events. */
3793
3794       if (*ecode == OP_KET || eptr == saved_eptr)
3795         {
3796         ecode += 3;
3797         break;
3798         }
3799
3800       /* The repeating kets try the rest of the pattern or restart from the
3801       preceding bracket, in the appropriate order. We need to reset any options
3802       that changed within the bracket before re-running it, so check the next
3803       opcode. */
3804
3805       if (ecode[3] == OP_OPT)
3806         {
3807         ims = (ims & ~PCRE_IMS) | ecode[4];
3808         DPRINTF(("ims set to %02lx at group repeat\n", ims));
3809         }
3810
3811       if (*ecode == OP_KETRMIN)
3812         {
3813         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3814             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3815               return TRUE;
3816         }
3817       else  /* OP_KETRMAX */
3818         {
3819         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3820             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3821         }
3822       }
3823     return FALSE;
3824
3825     /* An alternation is the end of a branch; scan along to find the end of the
3826     bracketed group and go to there. */
3827
3828     case OP_ALT:
3829     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3830     break;
3831
3832     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3833     that it may occur zero times. It may repeat infinitely, or not at all -
3834     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3835     repeat limits are compiled as a number of copies, with the optional ones
3836     preceded by BRAZERO or BRAMINZERO. */
3837
3838     case OP_BRAZERO:
3839       {
3840       const uschar *next = ecode+1;
3841       if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3842         return TRUE;
3843       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3844       ecode = next + 3;
3845       }
3846     break;
3847
3848     case OP_BRAMINZERO:
3849       {
3850       const uschar *next = ecode+1;
3851       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3852       if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3853         return TRUE;
3854       ecode++;
3855       }
3856     break;
3857
3858     /* End of a group, repeated or non-repeating. If we are at the end of
3859     an assertion "group", stop matching and return TRUE, but record the
3860     current high water mark for use by positive assertions. Do this also
3861     for the "once" (not-backup up) groups. */
3862
3863     case OP_KET:
3864     case OP_KETRMIN:
3865     case OP_KETRMAX:
3866       {
3867       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3868       const uschar *saved_eptr = eptrb->saved_eptr;
3869
3870       eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3871
3872       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3873           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3874           *prev == OP_ONCE)
3875         {
3876         md->end_match_ptr = eptr;      /* For ONCE */
3877         md->end_offset_top = offset_top;
3878         return TRUE;
3879         }
3880
3881       /* In all other cases except a conditional group we have to check the
3882       group number back at the start and if necessary complete handling an
3883       extraction by setting the offsets and bumping the high water mark. */
3884
3885       if (*prev != OP_COND)
3886         {
3887         int offset;
3888         int number = *prev - OP_BRA;
3889
3890         /* For extended extraction brackets (large number), we have to fish out
3891         the number from a dummy opcode at the start. */
3892
3893         if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3894         offset = number << 1;
3895
3896 #ifdef DEBUG
3897         printf("end bracket %d", number);
3898         printf("\n");
3899 #endif
3900
3901         if (number > 0)
3902           {
3903           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3904             {
3905             md->offset_vector[offset] =
3906               md->offset_vector[md->offset_end - number];
3907             md->offset_vector[offset+1] = eptr - md->start_subject;
3908             if (offset_top <= offset) offset_top = offset + 2;
3909             }
3910           }
3911         }
3912
3913       /* Reset the value of the ims flags, in case they got changed during
3914       the group. */
3915
3916       ims = original_ims;
3917       DPRINTF(("ims reset to %02lx\n", ims));
3918
3919       /* For a non-repeating ket, just continue at this level. This also
3920       happens for a repeating ket if no characters were matched in the group.
3921       This is the forcible breaking of infinite loops as implemented in Perl
3922       5.005. If there is an options reset, it will get obeyed in the normal
3923       course of events. */
3924
3925       if (*ecode == OP_KET || eptr == saved_eptr)
3926         {
3927         ecode += 3;
3928         break;
3929         }
3930
3931       /* The repeating kets try the rest of the pattern or restart from the
3932       preceding bracket, in the appropriate order. */
3933
3934       if (*ecode == OP_KETRMIN)
3935         {
3936         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3937             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3938               return TRUE;
3939         }
3940       else  /* OP_KETRMAX */
3941         {
3942         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3943             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3944         }
3945       }
3946     return FALSE;
3947
3948     /* Start of subject unless notbol, or after internal newline if multiline */
3949
3950     case OP_CIRC:
3951     if (md->notbol && eptr == md->start_subject) return FALSE;
3952     if ((ims & PCRE_MULTILINE) != 0)
3953       {
3954       if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3955       ecode++;
3956       break;
3957       }
3958     /* ... else fall through */
3959
3960     /* Start of subject assertion */
3961
3962     case OP_SOD:
3963     if (eptr != md->start_subject) return FALSE;
3964     ecode++;
3965     break;
3966
3967     /* Assert before internal newline if multiline, or before a terminating
3968     newline unless endonly is set, else end of subject unless noteol is set. */
3969
3970     case OP_DOLL:
3971     if ((ims & PCRE_MULTILINE) != 0)
3972       {
3973       if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3974         else { if (md->noteol) return FALSE; }
3975       ecode++;
3976       break;
3977       }
3978     else
3979       {
3980       if (md->noteol) return FALSE;
3981       if (!md->endonly)
3982         {
3983         if (eptr < md->end_subject - 1 ||
3984            (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3985
3986         ecode++;
3987         break;
3988         }
3989       }
3990     /* ... else fall through */
3991
3992     /* End of subject assertion (\z) */
3993
3994     case OP_EOD:
3995     if (eptr < md->end_subject) return FALSE;
3996     ecode++;
3997     break;
3998
3999     /* End of subject or ending \n assertion (\Z) */
4000
4001     case OP_EODN:
4002     if (eptr < md->end_subject - 1 ||
4003        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
4004     ecode++;
4005     break;
4006
4007     /* Word boundary assertions */
4008
4009     case OP_NOT_WORD_BOUNDARY:
4010     case OP_WORD_BOUNDARY:
4011       {
4012       BOOL prev_is_word = (eptr != md->start_subject) &&
4013         ((md->ctypes[eptr[-1]] & ctype_word) != 0);
4014       BOOL cur_is_word = (eptr < md->end_subject) &&
4015         ((md->ctypes[*eptr] & ctype_word) != 0);
4016       if ((*ecode++ == OP_WORD_BOUNDARY)?
4017            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4018         return FALSE;
4019       }
4020     break;
4021
4022     /* Match a single character type; inline for speed */
4023
4024     case OP_ANY:
4025     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4026       return FALSE;
4027     if (eptr++ >= md->end_subject) return FALSE;
4028 #ifdef SUPPORT_UTF8
4029     if (md->utf8)
4030       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4031 #endif
4032     ecode++;
4033     break;
4034
4035     case OP_NOT_DIGIT:
4036     if (eptr >= md->end_subject ||
4037        (md->ctypes[*eptr++] & ctype_digit) != 0)
4038       return FALSE;
4039     ecode++;
4040     break;
4041
4042     case OP_DIGIT:
4043     if (eptr >= md->end_subject ||
4044        (md->ctypes[*eptr++] & ctype_digit) == 0)
4045       return FALSE;
4046     ecode++;
4047     break;
4048
4049     case OP_NOT_WHITESPACE:
4050     if (eptr >= md->end_subject ||
4051        (md->ctypes[*eptr++] & ctype_space) != 0)
4052       return FALSE;
4053     ecode++;
4054     break;
4055
4056     case OP_WHITESPACE:
4057     if (eptr >= md->end_subject ||
4058        (md->ctypes[*eptr++] & ctype_space) == 0)
4059       return FALSE;
4060     ecode++;
4061     break;
4062
4063     case OP_NOT_WORDCHAR:
4064     if (eptr >= md->end_subject ||
4065        (md->ctypes[*eptr++] & ctype_word) != 0)
4066       return FALSE;
4067     ecode++;
4068     break;
4069
4070     case OP_WORDCHAR:
4071     if (eptr >= md->end_subject ||
4072        (md->ctypes[*eptr++] & ctype_word) == 0)
4073       return FALSE;
4074     ecode++;
4075     break;
4076
4077     /* Match a back reference, possibly repeatedly. Look past the end of the
4078     item to see if there is repeat information following. The code is similar
4079     to that for character classes, but repeated for efficiency. Then obey
4080     similar code to character type repeats - written out again for speed.
4081     However, if the referenced string is the empty string, always treat
4082     it as matched, any number of times (otherwise there could be infinite
4083     loops). */
4084
4085     case OP_REF:
4086       {
4087       int length;
4088       int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4089       ecode += 3;                                     /* Advance past item */
4090
4091       /* If the reference is unset, set the length to be longer than the amount
4092       of subject left; this ensures that every attempt at a match fails. We
4093       can't just fail here, because of the possibility of quantifiers with zero
4094       minima. */
4095
4096       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4097         md->end_subject - eptr + 1 :
4098         md->offset_vector[offset+1] - md->offset_vector[offset];
4099
4100       /* Set up for repetition, or handle the non-repeated case */
4101
4102       switch (*ecode)
4103         {
4104         case OP_CRSTAR:
4105         case OP_CRMINSTAR:
4106         case OP_CRPLUS:
4107         case OP_CRMINPLUS:
4108         case OP_CRQUERY:
4109         case OP_CRMINQUERY:
4110         c = *ecode++ - OP_CRSTAR;
4111         minimize = (c & 1) != 0;
4112         min = rep_min[c];                 /* Pick up values from tables; */
4113         max = rep_max[c];                 /* zero for max => infinity */
4114         if (max == 0) max = INT_MAX;
4115         break;
4116
4117         case OP_CRRANGE:
4118         case OP_CRMINRANGE:
4119         minimize = (*ecode == OP_CRMINRANGE);
4120         min = (ecode[1] << 8) + ecode[2];
4121         max = (ecode[3] << 8) + ecode[4];
4122         if (max == 0) max = INT_MAX;
4123         ecode += 5;
4124         break;
4125
4126         default:               /* No repeat follows */
4127         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4128         eptr += length;
4129         continue;              /* With the main loop */
4130         }
4131
4132       /* If the length of the reference is zero, just continue with the
4133       main loop. */
4134
4135       if (length == 0) continue;
4136
4137       /* First, ensure the minimum number of matches are present. We get back
4138       the length of the reference string explicitly rather than passing the
4139       address of eptr, so that eptr can be a register variable. */
4140
4141       for (i = 1; i <= min; i++)
4142         {
4143         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4144         eptr += length;
4145         }
4146
4147       /* If min = max, continue at the same level without recursion.
4148       They are not both allowed to be zero. */
4149
4150       if (min == max) continue;
4151
4152       /* If minimizing, keep trying and advancing the pointer */
4153
4154       if (minimize)
4155         {
4156         for (i = min;; i++)
4157           {
4158           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4159             return TRUE;
4160           if (i >= max || !match_ref(offset, eptr, length, md, ims))
4161             return FALSE;
4162           eptr += length;
4163           }
4164         /* Control never gets here */
4165         }
4166
4167       /* If maximizing, find the longest string and work backwards */
4168
4169       else
4170         {
4171         const uschar *pp = eptr;
4172         for (i = min; i < max; i++)
4173           {
4174           if (!match_ref(offset, eptr, length, md, ims)) break;
4175           eptr += length;
4176           }
4177         while (eptr >= pp)
4178           {
4179           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4180             return TRUE;
4181           eptr -= length;
4182           }
4183         return FALSE;
4184         }
4185       }
4186     /* Control never gets here */
4187
4188
4189
4190     /* Match a character class, possibly repeatedly. Look past the end of the
4191     item to see if there is repeat information following. Then obey similar
4192     code to character type repeats - written out again for speed. */
4193
4194     case OP_CLASS:
4195       {
4196       const uschar *data = ecode + 1;  /* Save for matching */
4197       ecode += 33;                     /* Advance past the item */
4198
4199       switch (*ecode)
4200         {
4201         case OP_CRSTAR:
4202         case OP_CRMINSTAR:
4203         case OP_CRPLUS:
4204         case OP_CRMINPLUS:
4205         case OP_CRQUERY:
4206         case OP_CRMINQUERY:
4207         c = *ecode++ - OP_CRSTAR;
4208         minimize = (c & 1) != 0;
4209         min = rep_min[c];                 /* Pick up values from tables; */
4210         max = rep_max[c];                 /* zero for max => infinity */
4211         if (max == 0) max = INT_MAX;
4212         break;
4213
4214         case OP_CRRANGE:
4215         case OP_CRMINRANGE:
4216         minimize = (*ecode == OP_CRMINRANGE);
4217         min = (ecode[1] << 8) + ecode[2];
4218         max = (ecode[3] << 8) + ecode[4];
4219         if (max == 0) max = INT_MAX;
4220         ecode += 5;
4221         break;
4222
4223         default:               /* No repeat follows */
4224         min = max = 1;
4225         break;
4226         }
4227
4228       /* First, ensure the minimum number of matches are present. */
4229
4230       for (i = 1; i <= min; i++)
4231         {
4232         if (eptr >= md->end_subject) return FALSE;
4233         GETCHARINC(c, eptr)         /* Get character; increment eptr */
4234
4235 #ifdef SUPPORT_UTF8
4236         /* We do not yet support class members > 255 */
4237         if (c > 255) return FALSE;
4238 #endif
4239
4240         if ((data[c/8] & (1 << (c&7))) != 0) continue;
4241         return FALSE;
4242         }
4243
4244       /* If max == min we can continue with the main loop without the
4245       need to recurse. */
4246
4247       if (min == max) continue;
4248
4249       /* If minimizing, keep testing the rest of the expression and advancing
4250       the pointer while it matches the class. */
4251
4252       if (minimize)
4253         {
4254         for (i = min;; i++)
4255           {
4256           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4257             return TRUE;
4258           if (i >= max || eptr >= md->end_subject) return FALSE;
4259           GETCHARINC(c, eptr)       /* Get character; increment eptr */
4260
4261 #ifdef SUPPORT_UTF8
4262           /* We do not yet support class members > 255 */
4263           if (c > 255) return FALSE;
4264 #endif
4265           if ((data[c/8] & (1 << (c&7))) != 0) continue;
4266           return FALSE;
4267           }
4268         /* Control never gets here */
4269         }
4270
4271       /* If maximizing, find the longest possible run, then work backwards. */
4272
4273       else
4274         {
4275         const uschar *pp = eptr;
4276         int len = 1;
4277         for (i = min; i < max; i++)
4278           {
4279           if (eptr >= md->end_subject) break;
4280           GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4281
4282 #ifdef SUPPORT_UTF8
4283           /* We do not yet support class members > 255 */
4284           if (c > 255) break;
4285 #endif
4286           if ((data[c/8] & (1 << (c&7))) == 0) break;
4287           eptr += len;
4288           }
4289
4290         while (eptr >= pp)
4291           {
4292           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4293             return TRUE;
4294
4295 #ifdef SUPPORT_UTF8
4296           BACKCHAR(eptr)
4297 #endif
4298           }
4299         return FALSE;
4300         }
4301       }
4302     /* Control never gets here */
4303
4304     /* Match a run of characters */
4305
4306     case OP_CHARS:
4307       {
4308       register int length = ecode[1];
4309       ecode += 2;
4310
4311 #ifdef DEBUG    /* Sigh. Some compilers never learn. */
4312       if (eptr >= md->end_subject)
4313         printf("matching subject <null> against pattern ");
4314       else
4315         {
4316         printf("matching subject ");
4317         pchars(eptr, length, TRUE, md);
4318         printf(" against pattern ");
4319         }
4320       pchars(ecode, length, FALSE, md);
4321       printf("\n");
4322 #endif
4323
4324       if (length > md->end_subject - eptr) return FALSE;
4325       if ((ims & PCRE_CASELESS) != 0)
4326         {
4327         while (length-- > 0)
4328           if (md->lcc[*ecode++] != md->lcc[*eptr++])
4329             return FALSE;
4330         }
4331       else
4332         {
4333         while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4334         }
4335       }
4336     break;
4337
4338     /* Match a single character repeatedly; different opcodes share code. */
4339
4340     case OP_EXACT:
4341     min = max = (ecode[1] << 8) + ecode[2];
4342     ecode += 3;
4343     goto REPEATCHAR;
4344
4345     case OP_UPTO:
4346     case OP_MINUPTO:
4347     min = 0;
4348     max = (ecode[1] << 8) + ecode[2];
4349     minimize = *ecode == OP_MINUPTO;
4350     ecode += 3;
4351     goto REPEATCHAR;
4352
4353     case OP_STAR:
4354     case OP_MINSTAR:
4355     case OP_PLUS:
4356     case OP_MINPLUS:
4357     case OP_QUERY:
4358     case OP_MINQUERY:
4359     c = *ecode++ - OP_STAR;
4360     minimize = (c & 1) != 0;
4361     min = rep_min[c];                 /* Pick up values from tables; */
4362     max = rep_max[c];                 /* zero for max => infinity */
4363     if (max == 0) max = INT_MAX;
4364
4365     /* Common code for all repeated single-character matches. We can give
4366     up quickly if there are fewer than the minimum number of characters left in
4367     the subject. */
4368
4369     REPEATCHAR:
4370     if (min > md->end_subject - eptr) return FALSE;
4371     c = *ecode++;
4372
4373     /* The code is duplicated for the caseless and caseful cases, for speed,
4374     since matching characters is likely to be quite common. First, ensure the
4375     minimum number of matches are present. If min = max, continue at the same
4376     level without recursing. Otherwise, if minimizing, keep trying the rest of
4377     the expression and advancing one matching character if failing, up to the
4378     maximum. Alternatively, if maximizing, find the maximum number of
4379     characters and work backwards. */
4380
4381     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4382       max, eptr));
4383
4384     if ((ims & PCRE_CASELESS) != 0)
4385       {
4386       c = md->lcc[c];
4387       for (i = 1; i <= min; i++)
4388         if (c != md->lcc[*eptr++]) return FALSE;
4389       if (min == max) continue;
4390       if (minimize)
4391         {
4392         for (i = min;; i++)
4393           {
4394           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4395             return TRUE;
4396           if (i >= max || eptr >= md->end_subject ||
4397               c != md->lcc[*eptr++])
4398             return FALSE;
4399           }
4400         /* Control never gets here */
4401         }
4402       else
4403         {
4404         const uschar *pp = eptr;
4405         for (i = min; i < max; i++)
4406           {
4407           if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4408           eptr++;
4409           }
4410         while (eptr >= pp)
4411           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4412             return TRUE;
4413         return FALSE;
4414         }
4415       /* Control never gets here */
4416       }
4417
4418     /* Caseful comparisons */
4419
4420     else
4421       {
4422       for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4423       if (min == max) continue;
4424       if (minimize)
4425         {
4426         for (i = min;; i++)
4427           {
4428           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4429             return TRUE;
4430           if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4431           }
4432         /* Control never gets here */
4433         }
4434       else
4435         {
4436         const uschar *pp = eptr;
4437         for (i = min; i < max; i++)
4438           {
4439           if (eptr >= md->end_subject || c != *eptr) break;
4440           eptr++;
4441           }
4442         while (eptr >= pp)
4443          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4444            return TRUE;
4445         return FALSE;
4446         }
4447       }
4448     /* Control never gets here */
4449
4450     /* Match a negated single character */
4451
4452     case OP_NOT:
4453     if (eptr >= md->end_subject) return FALSE;
4454     ecode++;
4455     if ((ims & PCRE_CASELESS) != 0)
4456       {
4457       if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4458       }
4459     else
4460       {
4461       if (*ecode++ == *eptr++) return FALSE;
4462       }
4463     break;
4464
4465     /* Match a negated single character repeatedly. This is almost a repeat of
4466     the code for a repeated single character, but I haven't found a nice way of
4467     commoning these up that doesn't require a test of the positive/negative
4468     option for each character match. Maybe that wouldn't add very much to the
4469     time taken, but character matching *is* what this is all about... */
4470
4471     case OP_NOTEXACT:
4472     min = max = (ecode[1] << 8) + ecode[2];
4473     ecode += 3;
4474     goto REPEATNOTCHAR;
4475
4476     case OP_NOTUPTO:
4477     case OP_NOTMINUPTO:
4478     min = 0;
4479     max = (ecode[1] << 8) + ecode[2];
4480     minimize = *ecode == OP_NOTMINUPTO;
4481     ecode += 3;
4482     goto REPEATNOTCHAR;
4483
4484     case OP_NOTSTAR:
4485     case OP_NOTMINSTAR:
4486     case OP_NOTPLUS:
4487     case OP_NOTMINPLUS:
4488     case OP_NOTQUERY:
4489     case OP_NOTMINQUERY:
4490     c = *ecode++ - OP_NOTSTAR;
4491     minimize = (c & 1) != 0;
4492     min = rep_min[c];                 /* Pick up values from tables; */
4493     max = rep_max[c];                 /* zero for max => infinity */
4494     if (max == 0) max = INT_MAX;
4495
4496     /* Common code for all repeated single-character matches. We can give
4497     up quickly if there are fewer than the minimum number of characters left in
4498     the subject. */
4499
4500     REPEATNOTCHAR:
4501     if (min > md->end_subject - eptr) return FALSE;
4502     c = *ecode++;
4503
4504     /* The code is duplicated for the caseless and caseful cases, for speed,
4505     since matching characters is likely to be quite common. First, ensure the
4506     minimum number of matches are present. If min = max, continue at the same
4507     level without recursing. Otherwise, if minimizing, keep trying the rest of
4508     the expression and advancing one matching character if failing, up to the
4509     maximum. Alternatively, if maximizing, find the maximum number of
4510     characters and work backwards. */
4511
4512     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4513       max, eptr));
4514
4515     if ((ims & PCRE_CASELESS) != 0)
4516       {
4517       c = md->lcc[c];
4518       for (i = 1; i <= min; i++)
4519         if (c == md->lcc[*eptr++]) return FALSE;
4520       if (min == max) continue;
4521       if (minimize)
4522         {
4523         for (i = min;; i++)
4524           {
4525           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4526             return TRUE;
4527           if (i >= max || eptr >= md->end_subject ||
4528               c == md->lcc[*eptr++])
4529             return FALSE;
4530           }
4531         /* Control never gets here */
4532         }
4533       else
4534         {
4535         const uschar *pp = eptr;
4536         for (i = min; i < max; i++)
4537           {
4538           if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4539           eptr++;
4540           }
4541         while (eptr >= pp)
4542           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4543             return TRUE;
4544         return FALSE;
4545         }
4546       /* Control never gets here */
4547       }
4548
4549     /* Caseful comparisons */
4550
4551     else
4552       {
4553       for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4554       if (min == max) continue;
4555       if (minimize)
4556         {
4557         for (i = min;; i++)
4558           {
4559           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4560             return TRUE;
4561           if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4562           }
4563         /* Control never gets here */
4564         }
4565       else
4566         {
4567         const uschar *pp = eptr;
4568         for (i = min; i < max; i++)
4569           {
4570           if (eptr >= md->end_subject || c == *eptr) break;
4571           eptr++;
4572           }
4573         while (eptr >= pp)
4574          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4575            return TRUE;
4576         return FALSE;
4577         }
4578       }
4579     /* Control never gets here */
4580
4581     /* Match a single character type repeatedly; several different opcodes
4582     share code. This is very similar to the code for single characters, but we
4583     repeat it in the interests of efficiency. */
4584
4585     case OP_TYPEEXACT:
4586     min = max = (ecode[1] << 8) + ecode[2];
4587     minimize = TRUE;
4588     ecode += 3;
4589     goto REPEATTYPE;
4590
4591     case OP_TYPEUPTO:
4592     case OP_TYPEMINUPTO:
4593     min = 0;
4594     max = (ecode[1] << 8) + ecode[2];
4595     minimize = *ecode == OP_TYPEMINUPTO;
4596     ecode += 3;
4597     goto REPEATTYPE;
4598
4599     case OP_TYPESTAR:
4600     case OP_TYPEMINSTAR:
4601     case OP_TYPEPLUS:
4602     case OP_TYPEMINPLUS:
4603     case OP_TYPEQUERY:
4604     case OP_TYPEMINQUERY:
4605     c = *ecode++ - OP_TYPESTAR;
4606     minimize = (c & 1) != 0;
4607     min = rep_min[c];                 /* Pick up values from tables; */
4608     max = rep_max[c];                 /* zero for max => infinity */
4609     if (max == 0) max = INT_MAX;
4610
4611     /* Common code for all repeated single character type matches */
4612
4613     REPEATTYPE:
4614     ctype = *ecode++;      /* Code for the character type */
4615
4616     /* First, ensure the minimum number of matches are present. Use inline
4617     code for maximizing the speed, and do the type test once at the start
4618     (i.e. keep it out of the loop). Also we can test that there are at least
4619     the minimum number of bytes before we start, except when doing '.' in
4620     UTF8 mode. Leave the test in in all cases; in the special case we have
4621     to test after each character. */
4622
4623     if (min > md->end_subject - eptr) return FALSE;
4624     if (min > 0) switch(ctype)
4625       {
4626       case OP_ANY:
4627 #ifdef SUPPORT_UTF8
4628       if (md->utf8)
4629         {
4630         for (i = 1; i <= min; i++)
4631           {
4632           if (eptr >= md->end_subject ||
4633              (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4634             return FALSE;
4635           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4636           }
4637         break;
4638         }
4639 #endif
4640       /* Non-UTF8 can be faster */
4641       if ((ims & PCRE_DOTALL) == 0)
4642         { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4643       else eptr += min;
4644       break;
4645
4646       case OP_NOT_DIGIT:
4647       for (i = 1; i <= min; i++)
4648         if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4649       break;
4650
4651       case OP_DIGIT:
4652       for (i = 1; i <= min; i++)
4653         if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4654       break;
4655
4656       case OP_NOT_WHITESPACE:
4657       for (i = 1; i <= min; i++)
4658         if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4659       break;
4660
4661       case OP_WHITESPACE:
4662       for (i = 1; i <= min; i++)
4663         if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4664       break;
4665
4666       case OP_NOT_WORDCHAR:
4667       for (i = 1; i <= min; i++)
4668         if ((md->ctypes[*eptr++] & ctype_word) != 0)
4669           return FALSE;
4670       break;
4671
4672       case OP_WORDCHAR:
4673       for (i = 1; i <= min; i++)
4674         if ((md->ctypes[*eptr++] & ctype_word) == 0)
4675           return FALSE;
4676       break;
4677       }
4678
4679     /* If min = max, continue at the same level without recursing */
4680
4681     if (min == max) continue;
4682
4683     /* If minimizing, we have to test the rest of the pattern before each
4684     subsequent match. */
4685
4686     if (minimize)
4687       {
4688       for (i = min;; i++)
4689         {
4690         if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4691         if (i >= max || eptr >= md->end_subject) return FALSE;
4692
4693         c = *eptr++;
4694         switch(ctype)
4695           {
4696           case OP_ANY:
4697           if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4698 #ifdef SUPPORT_UTF8
4699           if (md->utf8)
4700             while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4701 #endif
4702           break;
4703
4704           case OP_NOT_DIGIT:
4705           if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4706           break;
4707
4708           case OP_DIGIT:
4709           if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4710           break;
4711
4712           case OP_NOT_WHITESPACE:
4713           if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4714           break;
4715
4716           case OP_WHITESPACE:
4717           if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4718           break;
4719
4720           case OP_NOT_WORDCHAR:
4721           if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4722           break;
4723
4724           case OP_WORDCHAR:
4725           if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4726           break;
4727           }
4728         }
4729       /* Control never gets here */
4730       }
4731
4732     /* If maximizing it is worth using inline code for speed, doing the type
4733     test once at the start (i.e. keep it out of the loop). */
4734
4735     else
4736       {
4737       const uschar *pp = eptr;
4738       switch(ctype)
4739         {
4740         case OP_ANY:
4741
4742         /* Special code is required for UTF8, but when the maximum is unlimited
4743         we don't need it. */
4744
4745 #ifdef SUPPORT_UTF8
4746         if (md->utf8 && max < INT_MAX)
4747           {
4748           if ((ims & PCRE_DOTALL) == 0)
4749             {
4750             for (i = min; i < max; i++)
4751               {
4752               if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4753               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4754               }
4755             }
4756           else
4757             {
4758             for (i = min; i < max; i++)
4759               {
4760               eptr++;
4761               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4762               }
4763             }
4764           break;
4765           }
4766 #endif
4767         /* Non-UTF8 can be faster */
4768         if ((ims & PCRE_DOTALL) == 0)
4769           {
4770           for (i = min; i < max; i++)
4771             {
4772             if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4773             eptr++;
4774             }
4775           }
4776         else
4777           {
4778           c = max - min;
4779           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4780           eptr += c;
4781           }
4782         break;
4783
4784         case OP_NOT_DIGIT:
4785         for (i = min; i < max; i++)
4786           {
4787           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4788             break;
4789           eptr++;
4790           }
4791         break;
4792
4793         case OP_DIGIT:
4794         for (i = min; i < max; i++)
4795           {
4796           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4797             break;
4798           eptr++;
4799           }
4800         break;
4801
4802         case OP_NOT_WHITESPACE:
4803         for (i = min; i < max; i++)
4804           {
4805           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4806             break;
4807           eptr++;
4808           }
4809         break;
4810
4811         case OP_WHITESPACE:
4812         for (i = min; i < max; i++)
4813           {
4814           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4815             break;
4816           eptr++;
4817           }
4818         break;
4819
4820         case OP_NOT_WORDCHAR:
4821         for (i = min; i < max; i++)
4822           {
4823           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4824             break;
4825           eptr++;
4826           }
4827         break;
4828
4829         case OP_WORDCHAR:
4830         for (i = min; i < max; i++)
4831           {
4832           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4833             break;
4834           eptr++;
4835           }
4836         break;
4837         }
4838
4839       while (eptr >= pp)
4840         {
4841         if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4842           return TRUE;
4843 #ifdef SUPPORT_UTF8
4844         if (md->utf8)
4845           while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4846 #endif
4847         }
4848       return FALSE;
4849       }
4850     /* Control never gets here */
4851
4852     /* There's been some horrible disaster. */
4853
4854     default:
4855     DPRINTF(("Unknown opcode %d\n", *ecode));
4856     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4857     return FALSE;
4858     }
4859
4860   /* Do not stick any code in here without much thought; it is assumed
4861   that "continue" in the code above comes out to here to repeat the main
4862   loop. */
4863
4864   }             /* End of main loop */
4865 /* Control never reaches here */
4866 }
4867
4868
4869
4870
4871 /*************************************************
4872 *         Execute a Regular Expression           *
4873 *************************************************/
4874
4875 /* This function applies a compiled re to a subject string and picks out
4876 portions of the string if it matches. Two elements in the vector are set for
4877 each substring: the offsets to the start and end of the substring.
4878
4879 Arguments:
4880   external_re     points to the compiled expression
4881   external_extra  points to "hints" from pcre_study() or is NULL
4882   subject         points to the subject string
4883   length          length of subject string (may contain binary zeros)
4884   start_offset    where to start in the subject string
4885   options         option bits
4886   offsets         points to a vector of ints to be filled in with offsets
4887   offsetcount     the number of elements in the vector
4888
4889 Returns:          > 0 => success; value is the number of elements filled in
4890                   = 0 => success, but offsets is not big enough
4891                    -1 => failed to match
4892                  < -1 => some kind of unexpected problem
4893 */
4894
4895 int
4896 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4897   const char *subject, int length, int start_offset, int options, int *offsets,
4898   int offsetcount)
4899 {
4900 int resetcount, ocount;
4901 int first_char = -1;
4902 int req_char = -1;
4903 int req_char2 = -1;
4904 unsigned long int ims = 0;
4905 match_data match_block;
4906 const uschar *start_bits = NULL;
4907 const uschar *start_match = (const uschar *)subject + start_offset;
4908 const uschar *end_subject;
4909 const uschar *req_char_ptr = start_match - 1;
4910 const real_pcre *re = (const real_pcre *)external_re;
4911 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4912 BOOL using_temporary_offsets = FALSE;
4913 BOOL anchored;
4914 BOOL startline;
4915
4916 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4917
4918 if (re == NULL || subject == NULL ||
4919    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4920 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4921
4922 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4923 startline = (re->options & PCRE_STARTLINE) != 0;
4924
4925 match_block.start_pattern = re->code;
4926 match_block.start_subject = (const uschar *)subject;
4927 match_block.end_subject = match_block.start_subject + length;
4928 end_subject = match_block.end_subject;
4929
4930 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4931 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4932
4933 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4934 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4935 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4936
4937 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4938
4939 match_block.lcc = re->tables + lcc_offset;
4940 match_block.ctypes = re->tables + ctypes_offset;
4941
4942 /* The ims options can vary during the matching as a result of the presence
4943 of (?ims) items in the pattern. They are kept in a local variable so that
4944 restoring at the exit of a group is easy. */
4945
4946 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4947
4948 /* If the expression has got more back references than the offsets supplied can
4949 hold, we get a temporary bit of working store to use during the matching.
4950 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4951 of 3. */
4952
4953 ocount = offsetcount - (offsetcount % 3);
4954
4955 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4956   {
4957   ocount = re->top_backref * 3 + 3;
4958   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4959   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4960   using_temporary_offsets = TRUE;
4961   DPRINTF(("Got memory to hold back references\n"));
4962   }
4963 else match_block.offset_vector = offsets;
4964
4965 match_block.offset_end = ocount;
4966 match_block.offset_max = (2*ocount)/3;
4967 match_block.offset_overflow = FALSE;
4968
4969 /* Compute the minimum number of offsets that we need to reset each time. Doing
4970 this makes a huge difference to execution time when there aren't many brackets
4971 in the pattern. */
4972
4973 resetcount = 2 + re->top_bracket * 2;
4974 if (resetcount > offsetcount) resetcount = ocount;
4975
4976 /* Reset the working variable associated with each extraction. These should
4977 never be used unless previously set, but they get saved and restored, and so we
4978 initialize them to avoid reading uninitialized locations. */
4979
4980 if (match_block.offset_vector != NULL)
4981   {
4982   register int *iptr = match_block.offset_vector + ocount;
4983   register int *iend = iptr - resetcount/2 + 1;
4984   while (--iptr >= iend) *iptr = -1;
4985   }
4986
4987 /* Set up the first character to match, if available. The first_char value is
4988 never set for an anchored regular expression, but the anchoring may be forced
4989 at run time, so we have to test for anchoring. The first char may be unset for
4990 an unanchored pattern, of course. If there's no first char and the pattern was
4991 studied, there may be a bitmap of possible first characters. */
4992
4993 if (!anchored)
4994   {
4995   if ((re->options & PCRE_FIRSTSET) != 0)
4996     {
4997     first_char = re->first_char;
4998     if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4999     }
5000   else
5001     if (!startline && extra != NULL &&
5002       (extra->options & PCRE_STUDY_MAPPED) != 0)
5003         start_bits = extra->start_bits;
5004   }
5005
5006 /* For anchored or unanchored matches, there may be a "last known required
5007 character" set. If the PCRE_CASELESS is set, implying that the match starts
5008 caselessly, or if there are any changes of this flag within the regex, set up
5009 both cases of the character. Otherwise set the two values the same, which will
5010 avoid duplicate testing (which takes significant time). This covers the vast
5011 majority of cases. It will be suboptimal when the case flag changes in a regex
5012 and the required character in fact is caseful. */
5013
5014 if ((re->options & PCRE_REQCHSET) != 0)
5015   {
5016   req_char = re->req_char;
5017   req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
5018     (re->tables + fcc_offset)[req_char] : req_char;
5019   }
5020
5021 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5022 the loop runs just once. */
5023
5024 do
5025   {
5026   int rc;
5027   register int *iptr = match_block.offset_vector;
5028   register int *iend = iptr + resetcount;
5029
5030   /* Reset the maximum number of extractions we might see. */
5031
5032   while (iptr < iend) *iptr++ = -1;
5033
5034   /* Advance to a unique first char if possible */
5035
5036   if (first_char >= 0)
5037     {
5038     if ((ims & PCRE_CASELESS) != 0)
5039       while (start_match < end_subject &&
5040              match_block.lcc[*start_match] != first_char)
5041         start_match++;
5042     else
5043       while (start_match < end_subject && *start_match != first_char)
5044         start_match++;
5045     }
5046
5047   /* Or to just after \n for a multiline match if possible */
5048
5049   else if (startline)
5050     {
5051     if (start_match > match_block.start_subject + start_offset)
5052       {
5053       while (start_match < end_subject && start_match[-1] != NEWLINE)
5054         start_match++;
5055       }
5056     }
5057
5058   /* Or to a non-unique first char after study */
5059
5060   else if (start_bits != NULL)
5061     {
5062     while (start_match < end_subject)
5063       {
5064       register int c = *start_match;
5065       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5066       }
5067     }
5068
5069 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
5070   printf(">>>> Match against: ");
5071   pchars(start_match, end_subject - start_match, TRUE, &match_block);
5072   printf("\n");
5073 #endif
5074
5075   /* If req_char is set, we know that that character must appear in the subject
5076   for the match to succeed. If the first character is set, req_char must be
5077   later in the subject; otherwise the test starts at the match point. This
5078   optimization can save a huge amount of backtracking in patterns with nested
5079   unlimited repeats that aren't going to match. We don't know what the state of
5080   case matching may be when this character is hit, so test for it in both its
5081   cases if necessary. However, the different cased versions will not be set up
5082   unless PCRE_CASELESS was given or the casing state changes within the regex.
5083   Writing separate code makes it go faster, as does using an autoincrement and
5084   backing off on a match. */
5085
5086   if (req_char >= 0)
5087     {
5088     register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5089
5090     /* We don't need to repeat the search if we haven't yet reached the
5091     place we found it at last time. */
5092
5093     if (p > req_char_ptr)
5094       {
5095       /* Do a single test if no case difference is set up */
5096
5097       if (req_char == req_char2)
5098         {
5099         while (p < end_subject)
5100           {
5101           if (*p++ == req_char) { p--; break; }
5102           }
5103         }
5104
5105       /* Otherwise test for either case */
5106
5107       else
5108         {
5109         while (p < end_subject)
5110           {
5111           register int pp = *p++;
5112           if (pp == req_char || pp == req_char2) { p--; break; }
5113           }
5114         }
5115
5116       /* If we can't find the required character, break the matching loop */
5117
5118       if (p >= end_subject) break;
5119
5120       /* If we have found the required character, save the point where we
5121       found it, so that we don't search again next time round the loop if
5122       the start hasn't passed this character yet. */
5123
5124       req_char_ptr = p;
5125       }
5126     }
5127
5128   /* When a match occurs, substrings will be set for all internal extractions;
5129   we just need to set up the whole thing as substring 0 before returning. If
5130   there were too many extractions, set the return code to zero. In the case
5131   where we had to get some local store to hold offsets for backreferences, copy
5132   those back references that we can. In this case there need not be overflow
5133   if certain parts of the pattern were not used. */
5134
5135   match_block.start_match = start_match;
5136   if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5137     continue;
5138
5139   /* Copy the offset information from temporary store if necessary */
5140
5141   if (using_temporary_offsets)
5142     {
5143     if (offsetcount >= 4)
5144       {
5145       memcpy(offsets + 2, match_block.offset_vector + 2,
5146         (offsetcount - 2) * sizeof(int));
5147       DPRINTF(("Copied offsets from temporary memory\n"));
5148       }
5149     if (match_block.end_offset_top > offsetcount)
5150       match_block.offset_overflow = TRUE;
5151
5152     DPRINTF(("Freeing temporary memory\n"));
5153     (pcre_free)(match_block.offset_vector);
5154     }
5155
5156   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5157
5158   if (offsetcount < 2) rc = 0; else
5159     {
5160     offsets[0] = start_match - match_block.start_subject;
5161     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5162     }
5163
5164   DPRINTF((">>>> returning %d\n", rc));
5165   return rc;
5166   }
5167
5168 /* This "while" is the end of the "do" above */
5169
5170 while (!anchored &&
5171        match_block.errorcode == PCRE_ERROR_NOMATCH &&
5172        start_match++ < end_subject);
5173
5174 if (using_temporary_offsets)
5175   {
5176   DPRINTF(("Freeing temporary memory\n"));
5177   (pcre_free)(match_block.offset_vector);
5178   }
5179
5180 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5181
5182 return match_block.errorcode;
5183 }
5184
5185 /* End of pcre.c */