1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2001 University of Cambridge
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
36 /* Define DEBUG to get debugging output on stdout. */
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
45 #define DPRINTF(p) printf p
47 #define DPRINTF(p) /*nothing*/
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
56 /* Allow compilation as C++ source code, should anybody want to do that. */
59 #define class pcre_class
63 /* Maximum number of items on the nested bracket stacks at compile time. This
64 applies to the nesting of all kinds of parentheses. It does not limit
65 un-nested, non-capturing parentheses. This number can be made bigger if
66 necessary - it is used to dimension one int and one unsigned char vector at
69 #define BRASTACK_SIZE 200
72 /* The number of bytes in a literal character string above which we can't add
73 any more is different when UTF-8 characters may be encountered. */
82 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
84 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };   /* NOTE(review): presumably ordered *, *?, +, +?, ?, ?? like the repeat names in OP_names - confirm */
85 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };   /* a 0 here stands for "no upper limit" */
87 /* Text forms of OP_ values and things, for debugging (not all used) */
90 static const char *OP_names[] = {
91 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
92 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
93 "Opt", "^", "$", "Any", "chars", "not",
94 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
95 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
96 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
97 "*", "*?", "+", "+?", "?", "??", "{", "{",
98 "class", "Ref", "Recurse",
99 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
100 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
101 "Brazero", "Braminzero", "Branumber", "Bra"
105 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
106 are simple data values; negative values are for special things like \d and so
107 on. Zero means further processing is needed (for things like \x), or the escape
110 static const short int escapes[] = {
111 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
112 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
113 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
114 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
115 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
116 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
117 '`', 7, -ESC_b, 0, -ESC_d, ESC_E, ESC_F, 0, /* ` - g */
118 0, 0, 0, 0, 0, 0, ESC_N, 0, /* h - o */
119 0, 0, ESC_R, -ESC_s, ESC_T, 0, 0, -ESC_w, /* p - w */
120 0, 0, -ESC_z /* x - z */
123 /* Tables of names of POSIX character classes and their lengths. The list is
124 terminated by a zero length entry. The first three must be alpha, lower, upper,
125 as this is assumed for handling case independence. */
127 static const char *posix_names[] = {   /* POSIX class names compared by check_posix_name() */
128 "alpha", "lower", "upper",   /* indices 0-2: the entries covered by the caseless test (posix_class <= 2) */
129 "alnum", "ascii", "cntrl", "digit", "graph",
130 "print", "punct", "space", "word", "xdigit" };   /* 13 names, no sentinel; posix_name_lengths[] ends the search */
132 static const uschar posix_name_lengths[] = {   /* length of each posix_names[] entry, in the same order */
133 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };   /* "word" is 4, "xdigit" is 6; the final 0 stops the loop in check_posix_name() */
135 /* Table of class bit maps for each POSIX class; up to three may be combined
136 to form the class. */
138 static const int posix_class_maps[] = {
139 cbit_lower, cbit_upper, -1, /* alpha */
140 cbit_lower, -1, -1, /* lower */
141 cbit_upper, -1, -1, /* upper */
142 cbit_digit, cbit_lower, cbit_upper, /* alnum */
143 cbit_print, cbit_cntrl, -1, /* ascii */
144 cbit_cntrl, -1, -1, /* cntrl */
145 cbit_digit, -1, -1, /* digit */
146 cbit_graph, -1, -1, /* graph */
147 cbit_print, -1, -1, /* print */
148 cbit_punct, -1, -1, /* punct */
149 cbit_space, -1, -1, /* space */
150 cbit_word, -1, -1, /* word */
151 cbit_xdigit,-1, -1 /* xdigit */
155 /* Definition to allow mutual recursion */
158 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
159 BOOL, int, int *, int *, compile_data *);
161 /* Structure for building a chain of data that actually lives on the
162 stack, for holding the values of the subject pointer at the start of each
163 subpattern, so as to detect when an empty string has been matched by a
164 subpattern - to break infinite loops. */
166 typedef struct eptrblock {
167 struct eptrblock *prev;
168 const uschar *saved_eptr;
171 /* Flag bits for the match() function */
173 #define match_condassert 0x01 /* Called to check a condition assertion */
174 #define match_isgroup 0x02 /* Set if start of bracketed group */
178 /*************************************************
180 *************************************************/
182 /* PCRE is thread-clean and doesn't use any global variables in the normal
183 sense. However, it calls memory allocation and free functions via the two
184 indirections below, which can be changed by the caller, but are shared
185 between all threads. */
187 void *(*pcre_malloc)(size_t) = malloc;   /* allocation hook; initially the C library malloc */
188 void (*pcre_free)(void *) = free;   /* release hook; initially the C library free */
192 /*************************************************
193 * Macros and tables for character handling *
194 *************************************************/
196 /* When UTF-8 encoding is being used, a character is no longer just a single
197 byte. The macros for character handling generate simple sequences when used in
198 byte-mode, and more complicated ones for UTF-8 characters. */
201 #define GETCHARINC(c, eptr) c = *eptr++;   /* byte mode: fetch one byte into c and advance eptr */
202 #define GETCHARLEN(c, eptr, len) c = *eptr;   /* byte mode: fetch one byte into c; eptr and len are not modified */
203 #define BACKCHAR(eptr)   /* byte mode: expands to nothing */
205 #else /* SUPPORT_UTF8 */
207 /* Get the next UTF-8 character, advancing the pointer */
209 #define GETCHARINC(c, eptr) \
211 if (md->utf8 && (c & 0xc0) == 0xc0) \
213 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
215 c = (c & utf8_table3[a]) << s; \
219 c |= (*eptr++ & 0x3f) << s; \
223 /* Get the next UTF-8 character, not advancing the pointer, setting length */
225 #define GETCHARLEN(c, eptr, len) \
228 if (md->utf8 && (c & 0xc0) == 0xc0) \
231 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
233 c = (c & utf8_table3[a]) << s; \
234 for (i = 1; i <= a; i++) \
237 c |= (eptr[i] & 0x3f) << s; \
242 /* If the pointer is not at the start of a character, move it back until
245 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
251 /*************************************************
252 * Default character tables *
253 *************************************************/
255 /* A default set of character tables is included in the PCRE binary. Its source
256 is built by the maketables auxiliary program, which uses the default C ctypes
257 functions, and put in the file chartables.c. These tables are used by PCRE
258 whenever the caller of pcre_compile() does not provide an alternate set of
261 #include "chartables.c"
266 /*************************************************
267 * Tables for UTF-8 support *
268 *************************************************/
270 /* These are the breakpoints for different numbers of bytes in a UTF-8
273 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};   /* entry i: largest value encodable with i additional bytes; ord2utf8() stops at the first entry >= cvalue */
275 /* These are the indicator bits and the mask for the data bits to set in the
276 first byte of a character, indexed by the number of additional bytes. */
278 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};   /* indicator bits OR-ed into the first byte by ord2utf8() */
279 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};   /* mask keeping the data bits of the first byte (used by GETCHARINC/GETCHARLEN) */
281 /* Table of the number of extra characters, indexed by the first character
282 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
285 static uschar utf8_table4[] = {   /* looked up as utf8_table4[c & 0x3f] when (c & 0xc0) == 0xc0 */
286 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* first bytes 0xc0-0xcf: 1 additional byte */
287 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* first bytes 0xd0-0xdf: 1 additional byte */
288 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   /* first bytes 0xe0-0xef: 2 additional bytes */
289 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };   /* 0xf0-0xf7: 3, 0xf8-0xfb: 4, 0xfc-0xff: 5 additional bytes */
292 /*************************************************
293 * Convert character value to UTF-8 *
294 *************************************************/
296 /* This function takes an integer value in the range 0 - 0x7fffffff
297 and encodes it as a UTF-8 character in 1 to 6 bytes.
300 cvalue the character value
301 buffer pointer to buffer for result - at least 6 bytes long
303 Returns: number of bytes placed in the buffer
307 ord2utf8(int cvalue, uschar *buffer)
310 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
311 if (cvalue <= utf8_table1[i]) break;
313 for (j = i; j > 0; j--)
315 *buffer-- = 0x80 | (cvalue & 0x3f);
318 *buffer = utf8_table2[i] | cvalue;
325 /*************************************************
326 * Return version string *
327 *************************************************/
329 #define STRING(a) # a   /* turn the argument, exactly as written, into a string literal */
330 #define XSTRING(s) STRING(s)   /* extra level so that s is macro-expanded before being stringified */
335 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
341 /*************************************************
342 * (Obsolete) Return info about compiled pattern *
343 *************************************************/
345 /* This is the original "info" function. It picks potentially useful data out
346 of the private structure, but its interface was too rigid. It remains for
347 backwards compatibility. The public options are passed back in an int - though
348 the re->options field has been expanded to a long int, all the public options
349 are at the low end of it, and so even on 16-bit systems this will still be OK.
350 Therefore, I haven't changed the API for pcre_info().
353 external_re points to compiled code
354 optptr where to pass back the options
355 first_char where to pass back the first character,
356 or -1 if multiline and all branches start ^,
359 Returns: number of capturing subpatterns
360 or negative values on error
364 pcre_info(const pcre *external_re, int *optptr, int *first_char)
366 const real_pcre *re = (const real_pcre *)external_re;
367 if (re == NULL) return PCRE_ERROR_NULL;
368 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
369 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
370 if (first_char != NULL)
371 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
372 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
373 return re->top_bracket;
378 /*************************************************
379 * Return info about compiled pattern *
380 *************************************************/
382 /* This is a newer "info" function which has an extensible interface so
383 that additional items can be added compatibly.
386 external_re points to compiled code
387 study_data points to study data, or NULL
388 what what information is required
389 where where to put the information
391 Returns: 0 if data returned, negative on error
395 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
398 const real_pcre *re = (const real_pcre *)external_re;
399 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
401 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
402 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
406 case PCRE_INFO_OPTIONS:
407 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
411 *((size_t *)where) = re->size;
414 case PCRE_INFO_CAPTURECOUNT:
415 *((int *)where) = re->top_bracket;
418 case PCRE_INFO_BACKREFMAX:
419 *((int *)where) = re->top_backref;
422 case PCRE_INFO_FIRSTCHAR:
424 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
425 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
428 case PCRE_INFO_FIRSTTABLE:
429 *((const uschar **)where) =
430 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
431 study->start_bits : NULL;
434 case PCRE_INFO_LASTLITERAL:
436 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
439 default: return PCRE_ERROR_BADOPTION;
448 /*************************************************
449 * Debugging function to print chars *
450 *************************************************/
452 /* Print a sequence of chars in printable format, stopping at the end of the
453 subject if requested.
456 p points to characters
457 length number to print
458 is_subject TRUE if printing from within md->start_subject
459 md pointer to matching data block, if is_subject is TRUE
465 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
468 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
470 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
477 /*************************************************
479 *************************************************/
481 /* This function is called when a \ has been encountered. It either returns a
482 positive value for a simple escape such as \n, or a negative value which
483 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484 a positive value greater than 255 may be returned. On entry, ptr is pointing at
485 the \. On exit, it is on the final character of the escape sequence.
488 ptrptr points to the pattern position pointer
489 errorptr points to the pointer to the error message
490 bracount number of previous extracting brackets
491 options the options bits
492 isclass TRUE if inside a character class
493 cd pointer to char tables block
495 Returns: zero or positive => a data character
496 negative => a special escape sequence
497 on error, errorptr is set
501 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
502 int options, BOOL isclass, compile_data *cd)
504 const uschar *ptr = *ptrptr;
507 /* If backslash is at the end of the pattern, it's an error. */
510 if (c == 0) *errorptr = ERR1;
512 /* Digits or letters may have special meaning; all others are literals. */
514 else if (c < '0' || c > 'z') {}
516 /* Do an initial lookup in a table. A non-zero result is something that can be
517 returned immediately. Otherwise further processing may be required. */
519 else if ((i = escapes[c - '0']) != 0) c = i;
521 /* Escapes that need further processing, or are illegal. */
525 const uschar *oldptr;
528 /* The handling of escape sequences consisting of a string of digits
529 starting with one that is not zero is not straightforward. By experiment,
530 the way Perl works seems to be as follows:
532 Outside a character class, the digits are read as a decimal number. If the
533 number is less than 10, or if there are that many previous extracting
534 left brackets, then it is a back reference. Otherwise, up to three octal
535 digits are read to form an escaped byte. Thus \123 is likely to be octal
536 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
537 value is greater than 377, the least significant 8 bits are taken. Inside a
538 character class, \ followed by a digit is always an octal number. */
540 case '1': case '2': case '3': case '4': case '5':
541 case '6': case '7': case '8': case '9':
547 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
548 c = c * 10 + *(++ptr) - '0';
549 if (c < 10 || c <= bracount)
554 ptr = oldptr; /* Put the pointer back and fall through */
557 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
558 generates a binary zero byte and treats the digit as a following literal.
559 Thus we have to pull back the pointer by one. */
561 if ((c = *ptr) >= '8')
568 /* \0 always starts an octal number, but we may drop through to here with a
569 larger first octal digit. */
573 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
574 ptr[1] != '8' && ptr[1] != '9')
575 c = c * 8 + *(++ptr) - '0';
576 c &= 255; /* Take least significant 8 bits */
579 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580 which can be greater than 0xff, but only if the ddd are hex digits. */
584 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
586 const uschar *pt = ptr + 2;
587 register int count = 0;
589 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
592 c = c * 16 + cd->lcc[*pt] -
593 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
598 if (c < 0 || count > 8) *errorptr = ERR34;
602 /* If the sequence of hex digits does not end with '}', then we don't
603 recognize this construct; fall through to the normal \x handling. */
607 /* Read just a single character, written as up to two hex digits */
610 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
613 c = c * 16 + cd->lcc[*ptr] -
614 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
618 /* Other special escapes not starting with a digit are straightforward */
628 /* A letter is upper-cased; then the 0x40 bit is flipped */
630 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
634 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
635 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
636 for Perl compatibility, it is a literal. This code looks a bit odd, but
637 there used to be some cases other than the default, and there may be again
638 in future, so I haven't "optimized" it. */
641 if ((options & PCRE_EXTRA) != 0) switch(c)
657 /*************************************************
658 * Check for counted repeat *
659 *************************************************/
661 /* This function is called when a '{' is encountered in a place where it might
662 start a quantifier. It looks ahead to see if it really is a quantifier or not.
663 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
664 where the ddds are digits.
667 p pointer to the first char after '{'
668 cd pointer to char tables block
670 Returns: TRUE or FALSE
674 is_counted_repeat(const uschar *p, compile_data *cd)
676 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
677 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
678 if (*p == '}') return TRUE;
680 if (*p++ != ',') return FALSE;
681 if (*p == '}') return TRUE;
683 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
684 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
690 /*************************************************
691 * Read repeat counts *
692 *************************************************/
694 /* Read an item of the form {n,m} and return the values. This is called only
695 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
696 so the syntax is guaranteed to be correct, but we need to check the values.
699 p pointer to first char after '{'
700 minp pointer to int for min
701 maxp pointer to int for max
702 returned as -1 if no max
703 errorptr points to pointer to error message
704 cd pointer to character tables block
706 Returns: pointer to '}' on success;
707 current ptr on error, with errorptr set
710 static const uschar *
711 read_repeat_counts(const uschar *p, int *minp, int *maxp,
712 const char **errorptr, compile_data *cd)
717 /* Read the minimum value and do a paranoid check: a negative value indicates
718 an integer overflow. */
720 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
721 if (min < 0 || min > 65535)
727 /* Read the maximum value if there is one, and again do a paranoid check on its size.
728 Also, max must not be less than min. */
730 if (*p == '}') max = min; else
735 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
736 if (max < 0 || max > 65535)
749 /* Fill in the required variables, and pass back the pointer to the terminating
759 /*************************************************
760 * Find the fixed length of a pattern *
761 *************************************************/
763 /* Scan a pattern and compute the fixed length of subject that will match it,
764 if the length is fixed. This is needed for dealing with backward assertions.
767 code points to the start of the pattern (the bracket)
768 options the compiling options
770 Returns: the fixed length, or -1 if there is no fixed length
774 find_fixedlength(uschar *code, int options)
778 register int branchlength = 0;
779 register uschar *cc = code + 3;
781 /* Scan along the opcodes for this branch. If we get to the end of the
782 branch, check the length against that of the other branches. */
787 register int op = *cc;
788 if (op >= OP_BRA) op = OP_BRA;
795 d = find_fixedlength(cc, options);
796 if (d < 0) return -1;
798 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
802 /* Reached end of a branch; if it's a ket it is the end of a nested
803 call. If it's ALT it is an alternation in a nested call. If it is
804 END it's the end of the outer call. All can be handled by the same code. */
811 if (length < 0) length = branchlength;
812 else if (length != branchlength) return -1;
813 if (*cc != OP_ALT) return length;
818 /* Skip over assertive subpatterns */
823 case OP_ASSERTBACK_NOT:
824 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
828 /* Skip over things that don't match chars */
845 case OP_NOT_WORD_BOUNDARY:
846 case OP_WORD_BOUNDARY:
850 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
851 This requires a scan of the string, unfortunately. We assume valid UTF-8
852 strings, so all we do is reduce the length by one for each byte whose bits are
856 branchlength += *(++cc);
858 for (d = 1; d <= *cc; d++)
859 if ((cc[d] & 0xc0) == 0x80) branchlength--;
864 /* Handle exact repetitions */
868 branchlength += (cc[1] << 8) + cc[2];
872 /* Handle single-char matchers */
876 case OP_NOT_WHITESPACE:
878 case OP_NOT_WORDCHAR:
886 /* Check a class for variable quantification */
901 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
902 branchlength += (cc[1] << 8) + cc[2];
911 /* Anything else is variable length */
917 /* Control never gets here */
923 /*************************************************
924 * Check for POSIX class syntax *
925 *************************************************/
927 /* This function is called when the sequence "[:" or "[." or "[=" is
928 encountered in a character class. It checks whether this is followed by an
929 optional ^ and then a sequence of letters, terminated by a matching ":]" or
933 ptr pointer to the initial [
934 endptr where to return the end pointer
935 cd pointer to compile data
937 Returns: TRUE or FALSE
941 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
943 int terminator; /* Don't combine these lines; the Solaris cc */
944 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
945 if (*(++ptr) == '^') ptr++;
946 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
947 if (*ptr == terminator && ptr[1] == ']')
958 /*************************************************
959 * Check POSIX class name *
960 *************************************************/
962 /* This function is called to check the name given in a POSIX-style class entry
966 ptr points to the first letter
967 len the length of the name
969 Returns: a value representing the name, or -1 if unknown
973 check_posix_name(const uschar *ptr, int len)
975 register int yield = 0;
976 while (posix_name_lengths[yield] != 0)
978 if (len == posix_name_lengths[yield] &&
979 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
988 /*************************************************
989 * Compile one branch *
990 *************************************************/
992 /* Scan the pattern, compiling it into the code vector.
995 options the option bits
996 brackets points to number of extracting brackets used
997 codeptr points to the pointer to the current code point
998 ptrptr points to the current pattern pointer
999 errorptr points to pointer to error message
1000 optchanged set to the value of the last OP_OPT item compiled
1001 reqchar set to the last literal character required, else -1
1002 countlits set to count of mandatory literal characters
1003 cd contains pointers to tables
1005 Returns: TRUE on success
1006 FALSE, with *errorptr set on error
1010 compile_branch(int options, int *brackets, uschar **codeptr,
1011 const uschar **ptrptr, const char **errorptr, int *optchanged,
1012 int *reqchar, int *countlits, compile_data *cd)
1014 int repeat_type, op_type;
1015 int repeat_min, repeat_max;
1016 int bravalue, length;
1017 int greedy_default, greedy_non_default;
1020 int subcountlits = 0;
1022 register uschar *code = *codeptr;
1024 const uschar *ptr = *ptrptr;
1025 const uschar *tempptr;
1026 uschar *previous = NULL;
1029 /* Set up the default and non-default settings for greediness */
1031 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1032 greedy_non_default = greedy_default ^ 1;
1034 /* Initialize no required char, and count of literals */
1036 *reqchar = prevreqchar = -1;
1039 /* Switch on next character until the end of the branch */
1044 int class_charcount;
1051 if ((options & PCRE_EXTENDED) != 0)
1053 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1056 /* The space before the ; is to avoid a warning on a silly compiler
1057 on the Macintosh. */
1058 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1065 /* The branch terminates at end of string, |, or ). */
1074 /* Handle single-character metacharacters */
1091 /* Character classes. These always build a 32-byte bitmap of the permitted
1092 characters, except in the special case where there is only one character.
1093 For negated classes, we build the map as usual, then invert it at the end.
1100 /* If the first character is '^', set the negation flag and skip it. */
1102 if ((c = *(++ptr)) == '^')
1104 negate_class = TRUE;
1107 else negate_class = FALSE;
1109 /* Keep a count of chars so that we can optimize the case of just a single
1112 class_charcount = 0;
1113 class_lastchar = -1;
1115 /* Initialize the 32-char bit map to all zeros. We have to build the
1116 map in a temporary bit of store, in case the class contains only 1
1117 character, because in that case the compiled code doesn't use the
1120 memset(class, 0, 32 * sizeof(uschar));
1122 /* Process characters until ] is reached. By writing this as a "do" it
1123 means that an initial ] is taken as a data character. */
1133 /* Handle POSIX class names. Perl allows a negation extension of the
1134 form [:^name]. A square bracket that doesn't match the syntax is
1135 treated as a literal. We also recognize the POSIX constructions
1136 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1140 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1141 check_posix_syntax(ptr, &tempptr, cd))
1143 BOOL local_negate = FALSE;
1145 register const uschar *cbits = cd->cbits;
1156 local_negate = TRUE;
1160 posix_class = check_posix_name(ptr, tempptr - ptr);
1161 if (posix_class < 0)
1167 /* If matching is caseless, upper and lower are converted to
1168 alpha. This relies on the fact that the class table starts with
1169 alpha, lower, upper as the first 3 entries. */
1171 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1174 /* Or into the map we are building up to 3 of the static class
1175 tables, or their negations. */
1178 for (i = 0; i < 3; i++)
1180 int taboffset = posix_class_maps[posix_class + i];
1181 if (taboffset < 0) break;
1183 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1185 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1189 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1193 /* Backslash may introduce a single character, or it may introduce one
1194 of the specials, which just set a flag. Escaped items are checked for
1195 validity in the pre-compiling pass. The sequence \b is a special case.
1196 Inside a class (and only there) it is treated as backspace. Elsewhere
1197 it marks a word boundary. Other escapes have preset maps ready to
1198 or into the one we are building. We assume they have more than one
1199 character in them, so set class_charcount bigger than one. */
1203 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1204 if (-c == ESC_b) c = '\b';
1207 register const uschar *cbits = cd->cbits;
1208 class_charcount = 10;
1212 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1216 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1220 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1224 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1228 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1232 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1241 /* Fall through if single character, but don't at present allow
1242 chars > 255 in UTF-8 mode. */
1253 /* A single character may be followed by '-' to form a range. However,
1254 Perl does not permit ']' to be the end of the range. A '-' character
1255 here is treated as a literal. */
1257 if (ptr[1] == '-' && ptr[2] != ']')
1269 /* The second part of a range can be a single-character escape, but
1270 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1271 in such circumstances. */
1275 const uschar *oldptr = ptr;
1276 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1285 /* \b is backspace; any other special means the '-' was literal */
1289 if (d == -ESC_b) d = '\b'; else
1292 goto SINGLE_CHARACTER; /* A few lines below */
1305 class[c/8] |= (1 << (c&7));
1306 if ((options & PCRE_CASELESS) != 0)
1308 int uc = cd->fcc[c]; /* flip case */
1309 class[uc/8] |= (1 << (uc&7));
1311 class_charcount++; /* in case a one-char range */
1314 continue; /* Go get the next char in the class */
1317 /* Handle a lone single character - we can get here for a normal
1318 non-escape char, or after \ that introduces a single character. */
1322 class [c/8] |= (1 << (c&7));
1323 if ((options & PCRE_CASELESS) != 0)
1325 c = cd->fcc[c]; /* flip case */
1326 class[c/8] |= (1 << (c&7));
1332 /* Loop until ']' reached; the check for end of string happens inside the
1333 loop. This "while" is the end of the "do" above. */
1335 while ((c = *(++ptr)) != ']');
1337 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1338 precisely one character. This doesn't need the whole 32-byte bit map.
1339 We turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if
1342 if (class_charcount == 1 && class_lastchar >= 0)
1350 code[-1] = OP_CHARS;
1353 *code++ = class_lastchar;
1356 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1362 for (c = 0; c < 32; c++) code[c] = ~class[c];
1364 memcpy(code, class, 32);
1369 /* Various kinds of repeat */
1372 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1373 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1374 if (*errorptr != NULL) goto FAILED;
1392 if (previous == NULL)
1398 /* If the next character is '?' this is a minimizing repeat, by default,
1399 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1403 { repeat_type = greedy_non_default; ptr++; }
1404 else repeat_type = greedy_default;
1406 /* If previous was a string of characters, chop off the last one and use it
1407 as the subject of the repeat. If there was only one character, we can
1408 abolish the previous item altogether. A repeat with a zero minimum wipes
1409 out any reqchar setting, backing up to the previous value. We must also
1410 adjust the countlits value. */
1412 if (*previous == OP_CHARS)
1414 int len = previous[1];
1416 if (repeat_min == 0) *reqchar = prevreqchar;
1417 *countlits += repeat_min - 1;
1426 c = previous[len+1];
1430 op_type = 0; /* Use single-char op codes */
1431 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1434 /* If previous was a single negated character ([^a] or similar), we use
1435 one of the special opcodes, replacing it. The code is shared with single-
1436 character repeats by adding a suitable offset into repeat_type. */
1438 else if ((int)*previous == OP_NOT)
1440 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1443 goto OUTPUT_SINGLE_REPEAT;
1446 /* If previous was a character type match (\d or similar), abolish it and
1447 create a suitable repeat item. The code is shared with single-character
1448 repeats by adding a suitable offset into repeat_type. */
1450 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1452 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1456 OUTPUT_SINGLE_REPEAT:
1458 /* If the maximum is zero then the minimum must also be zero; Perl allows
1459 this case, so we do too - by simply omitting the item altogether. */
1461 if (repeat_max == 0) goto END_REPEAT;
1463 /* Combine the op_type with the repeat_type */
1465 repeat_type += op_type;
1467 /* A minimum of zero is handled either as the special case * or ?, or as
1468 an UPTO, with the maximum given. */
1470 if (repeat_min == 0)
1472 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1473 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1476 *code++ = OP_UPTO + repeat_type;
1477 *code++ = repeat_max >> 8;
1478 *code++ = (repeat_max & 255);
1482 /* The case {1,} is handled as the special case + */
1484 else if (repeat_min == 1 && repeat_max == -1)
1485 *code++ = OP_PLUS + repeat_type;
1487 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1488 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1492 if (repeat_min != 1)
1494 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1495 *code++ = repeat_min >> 8;
1496 *code++ = (repeat_min & 255);
1499 /* If the minimum is 1 and the previous item was a character string,
1500 we either have to put back the item that got cancelled if the string
1501 length was 1, or add the character back onto the end of a longer
1502 string. For a character type nothing need be done; it will just get
1503 put back naturally. Note that the final character is always going to
1506 else if (*previous == OP_CHARS)
1508 if (code == previous) code += 2; else previous[1]++;
1511 /* For a single negated character we also have to put back the
1512 item that got cancelled. */
1514 else if (*previous == OP_NOT) code++;
1516 /* If the maximum is unlimited, insert an OP_STAR. */
1521 *code++ = OP_STAR + repeat_type;
1524 /* Else insert an UPTO if the max is greater than the min. */
1526 else if (repeat_max != repeat_min)
1529 repeat_max -= repeat_min;
1530 *code++ = OP_UPTO + repeat_type;
1531 *code++ = repeat_max >> 8;
1532 *code++ = (repeat_max & 255);
1536 /* The character or character type itself comes last in all cases. */
1541 /* If previous was a character class or a back reference, we put the repeat
1542 stuff after it, but just skip the item if the repeat was {0,0}. */
1544 else if (*previous == OP_CLASS || *previous == OP_REF)
1546 if (repeat_max == 0)
1551 if (repeat_min == 0 && repeat_max == -1)
1552 *code++ = OP_CRSTAR + repeat_type;
1553 else if (repeat_min == 1 && repeat_max == -1)
1554 *code++ = OP_CRPLUS + repeat_type;
1555 else if (repeat_min == 0 && repeat_max == 1)
1556 *code++ = OP_CRQUERY + repeat_type;
1559 *code++ = OP_CRRANGE + repeat_type;
1560 *code++ = repeat_min >> 8;
1561 *code++ = repeat_min & 255;
1562 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1563 *code++ = repeat_max >> 8;
1564 *code++ = repeat_max & 255;
1568 /* If previous was a bracket group, we may have to replicate it in certain
1571 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1572 (int)*previous == OP_COND)
1576 int len = code - previous;
1577 uschar *bralink = NULL;
1579 /* If the maximum repeat count is unlimited, find the end of the bracket
1580 by scanning through from the start, and compute the offset back to it
1581 from the current code pointer. There may be an OP_OPT setting following
1582 the final KET, so we can't find the end just by going back from the code
1585 if (repeat_max == -1)
1587 register uschar *ket = previous;
1588 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1589 ketoffset = code - ket;
1592 /* The case of a zero minimum is special because of the need to stick
1593 OP_BRAZERO in front of it, and because the group appears once in the
1594 data, whereas in other cases it appears the minimum number of times. For
1595 this reason, it is simplest to treat this case separately, as otherwise
1596 the code gets far too messy. There are several special subcases when the
1599 if (repeat_min == 0)
1601 /* If we set up a required char from the bracket, we must back off
1602 to the previous value and reset the countlits value too. */
1604 if (subcountlits > 0)
1606 *reqchar = prevreqchar;
1607 *countlits -= subcountlits;
1610 /* If the maximum is also zero, we just omit the group from the output
1613 if (repeat_max == 0)
1619 /* If the maximum is 1 or unlimited, we just have to stick in the
1620 BRAZERO and do no more at this point. */
1622 if (repeat_max <= 1)
1624 memmove(previous+1, previous, len);
1626 *previous++ = OP_BRAZERO + repeat_type;
1629 /* If the maximum is greater than 1 and limited, we have to replicate
1630 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1631 The first one has to be handled carefully because it's the original
1632 copy, which has to be moved up. The remainder can be handled by code
1633 that is common with the non-zero minimum case below. We just have to
1634 adjust the value of repeat_max, since one less copy is required. */
1639 memmove(previous+4, previous, len);
1641 *previous++ = OP_BRAZERO + repeat_type;
1642 *previous++ = OP_BRA;
1644 /* We chain together the bracket offset fields that have to be
1645 filled in later when the ends of the brackets are reached. */
1647 offset = (bralink == NULL)? 0 : previous - bralink;
1649 *previous++ = offset >> 8;
1650 *previous++ = offset & 255;
1656 /* If the minimum is greater than zero, replicate the group as many
1657 times as necessary, and adjust the maximum to the number of subsequent
1658 copies that we need. */
1662 for (i = 1; i < repeat_min; i++)
1664 memcpy(code, previous, len);
1667 if (repeat_max > 0) repeat_max -= repeat_min;
1670 /* This code is common to both the zero and non-zero minimum cases. If
1671 the maximum is limited, it replicates the group in a nested fashion,
1672 remembering the bracket starts on a stack. In the case of a zero minimum,
1673 the first one was set up above. In all cases the repeat_max now specifies
1674 the number of additional copies needed. */
1676 if (repeat_max >= 0)
1678 for (i = repeat_max - 1; i >= 0; i--)
1680 *code++ = OP_BRAZERO + repeat_type;
1682 /* All but the final copy start a new nesting, maintaining the
1683 chain of brackets outstanding. */
1689 offset = (bralink == NULL)? 0 : code - bralink;
1691 *code++ = offset >> 8;
1692 *code++ = offset & 255;
1695 memcpy(code, previous, len);
1699 /* Now chain through the pending brackets, and fill in their length
1700 fields (which are holding the chain links pro tem). */
1702 while (bralink != NULL)
1705 int offset = code - bralink + 1;
1706 uschar *bra = code - offset;
1707 oldlinkoffset = (bra[1] << 8) + bra[2];
1708 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1710 *code++ = bra[1] = offset >> 8;
1711 *code++ = bra[2] = (offset & 255);
1715 /* If the maximum is unlimited, set a repeater in the final copy. We
1716 can't just offset backwards from the current code point, because we
1717 don't know if there's been an options resetting after the ket. The
1718 correct offset was computed above. */
1720 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1723 /* Else there's some kind of shambles */
1731 /* In all cases we no longer have a previous item. */
1738 /* Start of nested bracket sub-expression, or comment or lookahead or
1739 lookbehind or option setting or condition. First deal with special things
1740 that can come after a bracket; all are introduced by ?, and the appearance
1741 of any of them means that this is not a referencing group. They were
1742 checked for validity in the first pass over the string, so we don't have to
1743 check for syntax errors here. */
1746 newoptions = options;
1749 if (*(++ptr) == '?')
1756 case '#': /* Comment; skip to ket */
1758 while (*ptr != ')') ptr++;
1761 case ':': /* Non-extracting bracket */
1767 bravalue = OP_COND; /* Conditional group */
1768 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1770 int condref = *ptr - '0';
1771 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1779 code[4] = condref >> 8;
1780 code[5] = condref & 255;
1786 case '=': /* Positive lookahead */
1787 bravalue = OP_ASSERT;
1791 case '!': /* Negative lookahead */
1792 bravalue = OP_ASSERT_NOT;
1796 case '<': /* Lookbehinds */
1799 case '=': /* Positive lookbehind */
1800 bravalue = OP_ASSERTBACK;
1804 case '!': /* Negative lookbehind */
1805 bravalue = OP_ASSERTBACK_NOT;
1809 default: /* Syntax error */
1815 case '>': /* One-time brackets */
1820 case 'R': /* Pattern recursion */
1821 *code++ = OP_RECURSE;
1825 default: /* Option setting */
1829 while (*ptr != ')' && *ptr != ':')
1833 case '-': optset = &unset; break;
1835 case 'i': *optset |= PCRE_CASELESS; break;
1836 case 'm': *optset |= PCRE_MULTILINE; break;
1837 case 's': *optset |= PCRE_DOTALL; break;
1838 case 'x': *optset |= PCRE_EXTENDED; break;
1839 case 'U': *optset |= PCRE_UNGREEDY; break;
1840 case 'X': *optset |= PCRE_EXTRA; break;
1848 /* Set up the changed option bits, but don't change anything yet. */
1850 newoptions = (options | set) & (~unset);
1852 /* If the options ended with ')' this is not the start of a nested
1853 group with option changes, so the options change at this level. At top
1854 level there is nothing else to be done (the options will in fact have
1855 been set from the start of compiling as a result of the first pass) but
1856 at an inner level we must compile code to change the ims options if
1857 necessary, and pass the new setting back so that it can be put at the
1858 start of any following branches, and when this group ends, a resetting
1859 item can be compiled. */
1863 if ((options & PCRE_INGROUP) != 0 &&
1864 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1867 *code++ = *optchanged = newoptions & PCRE_IMS;
1869 options = newoptions; /* Change options at this level */
1870 previous = NULL; /* This item can't be repeated */
1871 continue; /* It is complete */
1874 /* If the options ended with ':' we are heading into a nested group
1875 with possible change of options. Such groups are non-capturing and are
1876 not assertions of any kind. All we need to do is skip over the ':';
1877 the newoptions value is handled below. */
1884 /* Else we have a referencing group; adjust the opcode. If the bracket
1885 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1886 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1890 if (++(*brackets) > EXTRACT_BASIC_MAX)
1892 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1893 code[3] = OP_BRANUMBER;
1894 code[4] = *brackets >> 8;
1895 code[5] = *brackets & 255;
1898 else bravalue = OP_BRA + *brackets;
1901 /* Process nested bracketed re. Assertions may not be repeated, but other
1902 kinds can be. We copy code into a non-register variable in order to be able
1903 to pass its address because some compilers complain otherwise. Pass in a
1904 new setting for the ims options if they have changed. */
1906 previous = (bravalue >= OP_ONCE)? code : NULL;
1911 options | PCRE_INGROUP, /* Set for all nested groups */
1912 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1913 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1914 brackets, /* Extracting bracket count */
1915 &tempcode, /* Where to put code (updated) */
1916 &ptr, /* Input pointer (updated) */
1917 errorptr, /* Where to put an error message */
1918 (bravalue == OP_ASSERTBACK ||
1919 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1920 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
1921 &subreqchar, /* For possible last char */
1922 &subcountlits, /* For literal count */
1923 cd)) /* Tables block */
1926 /* At the end of compiling, code is still pointing to the start of the
1927 group, while tempcode has been updated to point past the end of the group
1928 and any option resetting that may follow it. The pattern pointer (ptr)
1929 is on the bracket. */
1931 /* If this is a conditional bracket, check that there are no more than
1932 two branches in the group. */
1934 else if (bravalue == OP_COND)
1941 tc += (tc[1] << 8) | tc[2];
1943 while (*tc != OP_KET);
1952 /* Handle updating of the required character. If the subpattern didn't
1953 set one, leave it as it was. Otherwise, update it for normal brackets of
1954 all kinds, forward assertions, and conditions with two branches. Don't
1955 update the literal count for forward assertions, however. If the bracket
1956 is followed by a quantifier with zero repeat, we have to back off. Hence
1957 the definition of prevreqchar and subcountlits outside the main loop so
1958 that they can be accessed for the back off. */
1960 if (subreqchar > 0 &&
1961 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1962 (bravalue == OP_COND && condcount == 2)))
1964 prevreqchar = *reqchar;
1965 *reqchar = subreqchar;
1966 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1969 /* Now update the main code pointer to the end of the group. */
1973 /* Error if hit end of pattern */
1982 /* Check \ for being a real metacharacter; if not, fall through and handle
1983 it as a data character at the start of a string. Escape items are checked
1984 for validity in the pre-compiling pass. */
1988 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1990 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1991 are arranged to be the negation of the corresponding OP_values. For the
1992 back references, the values are ESC_REF plus the reference number. Only
1993 back references and those types that consume a character may be repeated.
1994 We can test for values between ESC_b and ESC_Z for the latter; this may
1995 have to change if any new ones are ever created. */
2001 int number = -c - ESC_REF;
2004 *code++ = number >> 8;
2005 *code++ = number & 255;
2009 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
2015 /* Data character: reset and fall through */
2020 /* Handle a run of data characters until a metacharacter is encountered.
2021 The first character is guaranteed not to be whitespace or # when the
2022 extended flag is set. */
2033 if ((options & PCRE_EXTENDED) != 0)
2035 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2038 /* The space before the ; is to avoid a warning on a silly compiler
2039 on the Macintosh. */
2040 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2046 /* Backslash may introduce a data char or a metacharacter. Escaped items
2047 are checked for validity in the pre-compiling pass. Stop the string
2048 before a metaitem. */
2053 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2054 if (c < 0) { ptr = tempptr; break; }
2056 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2057 two or more characters in the UTF-8 encoding. */
2060 if (c > 127 && (options & PCRE_UTF8) != 0)
2063 int len = ord2utf8(c, buffer);
2064 for (c = 0; c < len; c++) *code++ = buffer[c];
2071 /* Ordinary character or single-char escape */
2077 /* This "while" is the end of the "do" above. */
2079 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2081 /* Update the last character and the count of literals */
2083 prevreqchar = (length > 1)? code[-2] : *reqchar;
2084 *reqchar = code[-1];
2085 *countlits += length;
2087 /* Compute the length and set it in the data vector, and advance to
2090 previous[1] = length;
2091 if (length < MAXLIT) ptr--;
2094 } /* end of big loop */
2096 /* Control never reaches here by falling through, only by a goto for all the
2097 error states. Pass back the position in the pattern so that it can be displayed
2098 to the user for diagnosing the error. */
2108 /*************************************************
2109 * Compile sequence of alternatives *
2110 *************************************************/
2112 /* On entry, ptr is pointing past the bracket character, but on return
2113 it points to the closing bracket, or vertical bar, or end of string.
2114 The code variable is pointing at the byte into which the BRA operator has been
2115 stored. If the ims options are changed at the start (for a (?ims: group) or
2116 during any branch, we need to insert an OP_OPT item at the start of every
2117 following branch to ensure they get set correctly at run time, and also pass
2118 the new options into every subsequent branch compile.

Arguments:
2121 options the option bits
2122 optchanged new ims options to set as if (?ims) were at the start, or -1
for no change
2124 brackets -> int containing the number of extracting brackets used
2125 codeptr -> the address of the current code pointer
2126 ptrptr -> the address of the current pattern pointer
2127 errorptr -> pointer to error message
2128 lookbehind TRUE if this is a lookbehind assertion
2129 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2130 reqchar -> place to put the last required character, or a negative number
2131 countlits -> place to put the shortest literal count of any branch
2132 cd points to the data block with tables pointers

2134 Returns: TRUE on success
*/
/* compile_regex: compile one bracketed group, that is, a sequence of
alternative branches, into the code vector at *codeptr, reading the pattern
from *ptrptr. The visible statements show that, for each branch, it may emit
the changed ims option bits, may emit an OP_REVERSE item for a lookbehind,
calls compile_branch(), stores the branch length, and folds the branch's
required-character and literal-count results into *reqchar and *countlits.

NOTE(review): in this copy of the file some lines of this function (its return
type, the braces, the loop header and several short statements) are not
present; the comments added here describe only the statements that are
visible. */
2138 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2139 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2140 int *reqchar, int *countlits, compile_data *cd)
/* Local copies of the caller's pattern and code pointers. */
2142 const uschar *ptr = *ptrptr;
2143 uschar *code = *codeptr;
/* Start of the branch currently being compiled; bytes 1 and 2 at this address
receive the branch length once the branch is complete. */
2144 uschar *last_branch = code;
/* Start of the whole group; used to compute the total length at the end. */
2145 uschar *start_bracket = code;
/* Address of the two bytes that follow OP_REVERSE; set only for lookbehinds. */
2146 uschar *reverse_count = NULL;
/* The ims option bits in force on entry; emitted at the end of the group when
optchanged is non-negative. */
2147 int oldoptions = options & PCRE_IMS;
/* Per-branch results filled in by compile_branch(). */
2148 int branchreqchar, branchcountlits;
/* Start from the largest possible value so that any smaller branch count
replaces it. */
2151 *countlits = INT_MAX;
/* Step over the opcode byte and its two length bytes, plus any extra bytes the
caller asked to be skipped. */
2152 code += 3 + skipbytes;
2154 /* Loop for each alternative branch */
2160 /* Handle change of options */
2162 if (optchanged >= 0)
2165 *code++ = optchanged;
/* Replace the ims bits so that the branch is compiled with the new setting. */
2166 options = (options & ~PCRE_IMS) | optchanged;
2169 /* Set up dummy OP_REVERSE if lookbehind assertion */
2173 *code++ = OP_REVERSE;
/* Remember where the fixed length is to be stored once it is known. */
2174 reverse_count = code;
2179 /* Now compile the branch */
/* optchanged is passed by address, so compile_branch() is able to update it. */
2181 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2182 &branchreqchar, &branchcountlits, cd))
2188 /* Fill in the length of the last branch */
/* The length is stored as two bytes, high byte first. */
2190 length = code - last_branch;
2191 last_branch[1] = length >> 8;
2192 last_branch[2] = length & 255;
2194 /* Save the last required character if all branches have the same; a current
2195 value of -1 means unset, while -2 means "previous branch had no last required
2200 if (branchreqchar >= 0)
2202 if (*reqchar == -1) *reqchar = branchreqchar;
2203 else if (*reqchar != branchreqchar) *reqchar = -2;
2208 /* Keep the shortest literal count */
2210 if (branchcountlits < *countlits) *countlits = branchcountlits;
2211 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2213 /* If lookbehind, check that this branch matches a fixed-length string,
2214 and put the length into the OP_REVERSE item. Temporarily mark the end of
2215 the branch with OP_END. */
2220 length = find_fixedlength(last_branch, options);
2221 DPRINTF(("fixed length = %d\n", length));
/* Store the fixed length after OP_REVERSE, high byte first. */
2228 reverse_count[0] = (length >> 8);
2229 reverse_count[1] = length & 255;
2232 /* Reached end of expression, either ')' or end of pattern. Insert a
2233 terminating ket and the length of the whole bracketed item, and return,
2234 leaving the pointer at the terminating char. If any of the ims options
2235 were changed inside the group, compile a resetting op-code following. */
/* The group length is measured from the start of the bracket and written as
two bytes, high byte first. */
2239 length = code - start_bracket;
2241 *code++ = length >> 8;
2242 *code++ = length & 255;
2243 if (optchanged >= 0)
/* Emit the ims bits that were in force when this group was entered. */
2246 *code++ = oldoptions;
2253 /* Another branch follows; insert an "or" node and advance the pointer. */
2260 /* Control never reaches here */
2266 /*************************************************
2267 * Find first significant op code *
2268 *************************************************/
2270 /* This is called by several functions that scan a compiled expression looking
2271 for a fixed first character, or an anchoring op code etc. It skips over things
2272 that do not influence this. For one application, a change of caseless option is
significant.

Arguments:
2276 code pointer to the start of the group
2277 options pointer to external options
2278 optbit the option bit whose changing is significant, or
zero if no option change is significant
2280 optstop TRUE to return on option change, otherwise change the options
value in *options

2283 Returns: pointer to the first significant opcode
*/
/* first_significant_code: step through compiled code and return a pointer to
an item, updating *options on the way when asked to.

NOTE(review): in this copy of the file several lines of this function (the end
of the parameter list, the loop and switch headers, most case labels and the
short statements that advance the pointer) are not present; the comments added
here describe only the statements that are visible. */
2286 static const uschar*
2287 first_significant_code(const uschar *code, int *options, int optbit,
/* code[1] is treated as a set of option bits. The test is true only when
optbit is positive and the selected bit differs between code[1] and *options. */
2295 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
/* Either stop at this item, or record the new option bits and carry on. */
2297 if (optstop) return code;
2298 *options = (int)code[1];
2308 case OP_WORD_BOUNDARY:
2309 case OP_NOT_WORD_BOUNDARY:
2315 case OP_ASSERTBACK_NOT:
/* Advance by the two-byte offset (high byte first) held at code[1..2], and
repeat while the item reached is an OP_ALT, so that all alternatives of the
assertion are stepped over. */
2316 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2324 /* Control never reaches here */
2330 /*************************************************
2331 * Check for anchored expression *
2332 *************************************************/
2334 /* Try to find out if this is an anchored regular expression. Consider each
2335 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2336 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2337 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2338 counts, since OP_CIRC can match in the middle.
2340 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2341 because that will try the rest of the pattern at all possible matching points,
2342 so there is no point trying them again.

Arguments:
2345 code points to start of expression (the bracket)
2346 options points to the options setting

2348 Returns: TRUE or FALSE
*/
/* is_anchored: examine each alternative of the group at code and return FALSE
as soon as one of them is found not to be anchored.

NOTE(review): in this copy of the file the return type, the braces, the
"do" that opens the loop and the statements that follow the last "else if" and
the loop are not present; the comments added here describe only the statements
that are visible. */
2352 is_anchored(register const uschar *code, int *options)
/* Skip the 3-byte group header and find the first item of this alternative
that matters. Passing PCRE_MULTILINE with FALSE lets the helper record a
change of the multiline bit in *options instead of stopping at it. */
2355 const uschar *scode = first_significant_code(code + 3, options,
2356 PCRE_MULTILINE, FALSE);
2357 register int op = *scode;
/* A nested group, forward assertion, once-only group or condition: every one
of its own alternatives must be anchored, so recurse. */
2358 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2359 { if (!is_anchored(scode, options)) return FALSE; }
/* With DOTALL set, a leading maximizing or minimizing star repeat of a
character type is acceptable only when the type is OP_ANY (that is, ".*"). */
2360 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2361 (*options & PCRE_DOTALL) != 0)
2362 { if (scode[1] != OP_ANY) return FALSE; }
/* This condition is true when the item is not OP_SOD and either multiline is
set or the item is not OP_CIRC, i.e. when the branch does not start anchored. */
2363 else if (op != OP_SOD &&
2364 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
/* Move to the next alternative using the two-byte offset (high byte first)
held at code[1..2]; keep going while another OP_ALT follows. */
2366 code += (code[1] << 8) + code[2];
2368 while (*code == OP_ALT);
2374 /*************************************************
2375 * Check for starting with ^ or .* *
2376 *************************************************/
2378 /* This is called to find out if every branch starts with ^ or .* so that
2379 "first char" processing can be done to speed things up in multiline
2380 matching and for non-DOTALL patterns that start with .* (which must start at
2381 the beginning or after \n).

2383 Argument: points to start of expression (the bracket)
2384 Returns: TRUE or FALSE
*/
/* is_startline: examine each alternative of the group at code and return
FALSE as soon as one is found that starts with neither OP_CIRC nor a star
repeat of OP_ANY (directly or via a nested group).

NOTE(review): in this copy of the file the return type, the braces, the "do"
that opens the loop and the statement after the loop are not present; the
comments added here describe only the statements that are visible. */
2388 is_startline(const uschar *code)
/* Skip the 3-byte group header and find the first item that matters. No
options pointer is supplied and optbit is 0, so the helper's option test,
which requires optbit > 0, never applies. */
2391 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2392 register int op = *scode;
/* A nested group, forward assertion, once-only group or condition: every one
of its own alternatives must qualify, so recurse. */
2393 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2394 { if (!is_startline(scode)) return FALSE; }
/* A star repeat of a character type qualifies only when the type is OP_ANY. */
2395 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2396 { if (scode[1] != OP_ANY) return FALSE; }
/* Anything else must be OP_CIRC. */
2397 else if (op != OP_CIRC) return FALSE;
/* Move to the next alternative using the two-byte offset (high byte first)
held at code[1..2]; keep going while another OP_ALT follows. */
2398 code += (code[1] << 8) + code[2];
2400 while (*code == OP_ALT);
2406 /*************************************************
2407 * Check for fixed first char *
2408 *************************************************/
2410 /* Try to find out if there is a fixed first character. This is called for
2411 unanchored expressions, as it speeds up their processing quite considerably.
2412 Consider each alternative branch. If they all start with the same char, or with
2413 a bracket all of whose alternatives start with the same char (recurse ad lib),
2414 then we return that char, otherwise -1.

Arguments:
2417 code points to start of expression (the bracket)
2418 options pointer to the options (used to check casing changes)

2420 Returns: -1 or the fixed first char
*/
/* find_firstchar: examine each alternative of the group at code, tracking the
character with which every alternative seen so far starts, and return -1 as
soon as two alternatives disagree or a nested group has no fixed first
character.

NOTE(review): in this copy of the file the return type, the braces, the loop
and switch headers, most case labels, the declaration of d and the final
return are not present; the comments added here describe only the statements
that are visible. */
2424 find_firstchar(const uschar *code, int *options)
/* The common first character found so far; a negative value means that none
has been recorded yet. */
2426 register int c = -1;
/* Skip the 3-byte group header and find the first item that matters. Passing
PCRE_CASELESS with TRUE makes the helper stop at, and return, an item that
changes the caseless bit. */
2429 const uschar *scode = first_significant_code(code + 3, options,
2430 PCRE_CASELESS, TRUE);
2431 register int op = *scode;
/* Collapse every opcode at or above OP_BRA to OP_BRA, presumably so that a
single case can deal with all of them. */
2433 if (op >= OP_BRA) op = OP_BRA;
/* Recurse into the nested item; a negative result means it has no fixed first
character. Otherwise its character must agree with any recorded earlier. */
2444 if ((d = find_firstchar(scode, options)) < 0) return -1;
2445 if (c < 0) c = d; else if (c != d) return -1;
2448 case OP_EXACT: /* Fall through */
2451 case OP_CHARS: /* Fall through */
/* The character is read from scode[1]; it is recorded if it is the first one
seen, and otherwise it must equal the one already recorded. */
2456 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
/* Move to the next alternative using the two-byte offset (high byte first)
held at code[1..2]; keep going while another OP_ALT follows. */
2460 code += (code[1] << 8) + code[2];
2462 while (*code == OP_ALT);
2470 /*************************************************
2471 * Compile a Regular Expression *
2472 *************************************************/
2474 /* This function takes a string and returns a pointer to a block of store
2475 holding a compiled version of the expression.

Arguments:
2478 pattern the regular expression
2479 options various option bits
2480 errorptr pointer to pointer to error text
2481 erroroffset ptr offset in pattern where error was detected
2482 tables pointer to character tables or NULL

2484 Returns: pointer to compiled data block, or NULL on error,
2485 with errorptr and erroroffset set
*/
2489 pcre_compile(const char *pattern, int options, const char **errorptr,
2490 int *erroroffset, const unsigned char *tables)
2493 int length = 3; /* For initial BRA plus length */
2495 int c, reqchar, countlits;
2497 int top_backref = 0;
2498 int branch_extra = 0;
2499 int branch_newextra;
2500 unsigned int brastackptr = 0;
2504 compile_data compile_block;
2505 int brastack[BRASTACK_SIZE];
2506 uschar bralenstack[BRASTACK_SIZE];
2509 uschar *code_base, *code_end;
2512 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2514 #ifndef SUPPORT_UTF8
2515 if ((options & PCRE_UTF8) != 0)
2522 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2523 can do is just return NULL. */
2525 if (errorptr == NULL) return NULL;
2528 /* However, we can give a message for this error */
2530 if (erroroffset == NULL)
2537 if ((options & ~PUBLIC_OPTIONS) != 0)
2543 /* Set up pointers to the individual character tables */
2545 if (tables == NULL) tables = pcre_default_tables;
2546 compile_block.lcc = tables + lcc_offset;
2547 compile_block.fcc = tables + fcc_offset;
2548 compile_block.cbits = tables + cbits_offset;
2549 compile_block.ctypes = tables + ctypes_offset;
2551 /* Reflect pattern for debugging output */
2553 DPRINTF(("------------------------------------------------------------------\n"));
2554 DPRINTF(("%s\n", pattern));
2556 /* The first thing to do is to make a pass over the pattern to compute the
2557 amount of store required to hold the compiled code. This does not have to be
2558 perfect as long as errors are overestimates. At the same time we can detect any
2559 internal flag settings. Make an attempt to correct for any counted white space
2560 if an "extended" flag setting appears late in the pattern. We can't be so
2561 clever for #-comments. */
2563 ptr = (const uschar *)(pattern - 1);
2564 while ((c = *(++ptr)) != 0)
2567 int class_charcount;
2570 if ((options & PCRE_EXTENDED) != 0)
2572 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2575 /* The space before the ; is to avoid a warning on a silly compiler
2576 on the Macintosh. */
2577 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2584 /* A backslashed item may be an escaped "normal" character or a
2585 character type. For a "normal" character, put the pointers and
2586 character back so that tests for whitespace etc. in the input
2587 are done correctly. */
2591 const uschar *save_ptr = ptr;
2592 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2593 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2603 /* A back reference needs an additional 2 bytes, plus either one or 5
2604 bytes for a repeat. We also need to keep the value of the highest
2609 int refnum = -c - ESC_REF;
2610 if (refnum > top_backref) top_backref = refnum;
2611 length += 2; /* For single back reference */
2612 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2614 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2615 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2616 if ((min == 0 && (max == 1 || max == -1)) ||
2617 (min == 1 && max == -1))
2620 if (ptr[1] == '?') ptr++;
2628 case '*': /* These repeats won't be after brackets; */
2629 case '+': /* those are handled separately */
2634 /* This covers the cases of repeats after a single char, metachar, class,
2635 or back reference. */
2638 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2639 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2640 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2641 if ((min == 0 && (max == 1 || max == -1)) ||
2642 (min == 1 && max == -1))
2646 length--; /* Uncount the original char or metachar */
2647 if (min == 1) length++; else if (min > 0) length += 4;
2648 if (max > 0) length += 4; else length += 2;
2650 if (ptr[1] == '?') ptr++;
2653 /* An alternation contains an offset to the next branch or ket. If any ims
2654 options changed in the previous branch(es), and/or if we are in a
2655 lookbehind assertion, extra space will be needed at the start of the
2656 branch. This is handled by branch_extra. */
2659 length += 3 + branch_extra;
2662 /* A character class uses 33 characters. Don't worry about character types
2663 that aren't allowed in classes - they'll get picked up during the compile.
2664 A character class that contains only one character uses 2 or 3 bytes,
2665 depending on whether it is negated or not. Notice this where we can. */
2668 class_charcount = 0;
2669 if (*(++ptr) == '^') ptr++;
2674 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2676 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2679 else class_charcount++;
2682 while (*ptr != 0 && *ptr != ']');
2684 /* Repeats for negated single chars are handled by the general code */
2686 if (class_charcount == 1) length += 3; else
2690 /* A repeat needs either 1 or 5 bytes. */
2692 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2694 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2695 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2696 if ((min == 0 && (max == 1 || max == -1)) ||
2697 (min == 1 && max == -1))
2700 if (ptr[1] == '?') ptr++;
2705 /* Brackets may be genuine groups or special things */
2708 branch_newextra = 0;
2711 /* Handle special forms of bracket, which all start (? */
2720 /* Skip over comments entirely */
2723 while (*ptr != 0 && *ptr != ')') ptr++;
2727 goto PCRE_ERROR_RETURN;
2731 /* Non-referencing groups and lookaheads just move the pointer on, and
2732 then behave like a non-special bracket, except that they don't increment
2733 the count of extracting brackets. Ditto for the "once only" bracket,
2734 which is in Perl from version 5.005. */
2743 /* A recursive call to the regex is an extension, to provide the
2744 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2750 goto PCRE_ERROR_RETURN;
2756 /* Lookbehinds are in Perl from version 5.005 */
2759 if (ptr[3] == '=' || ptr[3] == '!')
2762 branch_newextra = 3;
2763 length += 3; /* For the first branch */
2767 goto PCRE_ERROR_RETURN;
2769 /* Conditionals are in Perl from version 5.005. The bracket must either
2770 be followed by a number (for bracket reference) or by an assertion
2774 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2778 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2782 goto PCRE_ERROR_RETURN;
2785 else /* An assertion must follow */
2787 ptr++; /* Can treat like ':' as far as spacing is concerned */
2788 if (ptr[2] != '?' ||
2789 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2791 ptr += 2; /* To get right offset in message */
2793 goto PCRE_ERROR_RETURN;
2798 /* Else loop checking valid options until ) is met. Anything else is an
2799 error. If we are without any brackets, i.e. at top level, the settings
2800 act as if specified in the options, so massage the options immediately.
2801 This is for backward compatibility with Perl 5.004. */
2814 *optset |= PCRE_CASELESS;
2818 *optset |= PCRE_MULTILINE;
2822 *optset |= PCRE_DOTALL;
2826 *optset |= PCRE_EXTENDED;
2830 *optset |= PCRE_EXTRA;
2834 *optset |= PCRE_UNGREEDY;
2841 /* A termination by ')' indicates an options-setting-only item;
2842 this is global at top level; otherwise nothing is done here and
2843 it is handled during the compiling process on a per-bracket-group
2847 if (brastackptr == 0)
2849 options = (options | set) & (~unset);
2850 set = unset = 0; /* To save length */
2854 /* A termination by ':' indicates the start of a nested group with
2855 the given options set. This is again handled at compile time, but
2856 we must allow for compiled space if any of the ims options are
2857 set. We also have to allow for resetting space at the end of
2858 the group, which is why 4 is added to the length and not just 2.
2859 If there are several changes of options within the same group, this
2860 will lead to an over-estimate on the length, but this shouldn't
2861 matter very much. We also have to allow for resetting options at
2862 the start of any alternations, which we do by setting
2863 branch_newextra to 2. Finally, we record whether the case-dependent
2864 flag ever changes within the regex. This is used by the "required
2868 if (((set|unset) & PCRE_IMS) != 0)
2871 branch_newextra = 2;
2872 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2876 /* Unrecognized option character */
2880 goto PCRE_ERROR_RETURN;
2884 /* If we hit a closing bracket, that's it - this is a freestanding
2885 option-setting. We need to ensure that branch_extra is updated if
2886 necessary. The only values branch_newextra can have here are 0 or 2.
2887 If the value is 2, then branch_extra must either be 2 or 5, depending
2888 on whether this is a lookbehind group or not. */
2893 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2894 branch_extra += branch_newextra;
2898 /* If options were terminated by ':' control comes here. Fall through
2899 to handle the group below. */
2903 /* Extracting brackets must be counted so we can process escapes in a
2904 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2905 need an additional 3 bytes of store per extracting bracket. */
2910 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2913 /* Save length for computing whole length at end if there's a repeat that
2914 requires duplication of the group. Also save the current value of
2915 branch_extra, and start the new group with the new value. If non-zero, this
2916 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2918 if (brastackptr >= sizeof(brastack)/sizeof(int))
2921 goto PCRE_ERROR_RETURN;
2924 bralenstack[brastackptr] = branch_extra;
2925 branch_extra = branch_newextra;
2927 brastack[brastackptr++] = length;
2928 length += bracket_length;
2931 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2932 have to replicate this bracket up to that many times. If brastackptr is
2933 0 this is an unmatched bracket which will generate an error, but take care
2934 not to try to access brastack[-1] when computing the length and restoring
2935 the branch_extra value. */
2944 if (brastackptr > 0)
2946 duplength = length - brastack[--brastackptr];
2947 branch_extra = bralenstack[brastackptr];
2951 /* Leave ptr at the final char; for read_repeat_counts this happens
2952 automatically; for the others we need an increment. */
2954 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2956 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2958 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2960 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2961 else if (c == '+') { maxval = -1; ptr++; }
2962 else if (c == '?') { minval = 0; ptr++; }
2964 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2965 group, and if the maximum is greater than zero, we have to replicate
2966 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2967 bracket set - hence the 7. */
2972 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2975 /* When the minimum is greater than zero, we have to replicate up to
2976 minval-1 times, with no additions required in the copies. Then, if
2977 there is a limited maximum we have to replicate up to maxval-1 times
2978 allowing for a BRAZERO item before each optional copy and nesting
2979 brackets for all but one of the optional copies. */
2983 length += (minval - 1) * duplength;
2984 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2985 length += (maxval - minval) * (duplength + 7) - 6;
2990 /* Non-special character. For a run of such characters the length required
2991 is the number of characters + 2, except that the maximum run length is 255.
2992 We won't get a skipped space or a non-data escape or the start of a #
2993 comment as the first character, so the length can't be zero. */
3001 if ((options & PCRE_EXTENDED) != 0)
3003 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3006 /* The space before the ; is to avoid a warning on a silly compiler
3007 on the Macintosh. */
3008 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3013 /* Backslash may introduce a data char or a metacharacter; stop the
3014 string before the latter. */
3018 const uschar *saveptr = ptr;
3019 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3021 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3022 if (c < 0) { ptr = saveptr; break; }
3025 if (c > 127 && (options & PCRE_UTF8) != 0)
3028 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3029 if (c <= utf8_table1[i]) break;
3035 /* Ordinary character or single-char escape */
3040 /* This "while" is the end of the "do" above. */
3042 while (runlength < MAXLIT &&
3043 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3046 length += runlength;
3051 length += 4; /* For final KET and END */
3059 /* Compute the size of data block needed and get it, either from malloc or
3060 externally provided function. We specify "code[0]" in the offsetof() expression
3061 rather than just "code", because it has been reported that one broken compiler
3062 fails on "code" because it is also an independent variable. It should make no
3063 difference to the value of the offsetof(). */
3065 size = length + offsetof(real_pcre, code[0]);
3066 re = (real_pcre *)(pcre_malloc)(size);
3074 /* Put in the magic number, and save the size, options, and table pointer */
3076 re->magic_number = MAGIC_NUMBER;
3078 re->options = options;
3079 re->tables = tables;
3081 /* Set up a starting, non-extracting bracket, then compile the expression. On
3082 error, *errorptr will be set non-NULL, so we don't need to look at the result
3083 of the function here. */
3085 ptr = (const uschar *)pattern;
3089 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3090 &reqchar, &countlits, &compile_block);
3091 re->top_bracket = bracount;
3092 re->top_backref = top_backref;
3094 /* If not reached end of pattern on success, there's an excess bracket. */
3096 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3098 /* Fill in the terminating state and check for disastrous overflow, but
3099 if debugging, leave the test till after things are printed out. */
3104 if (code - re->code > length) *errorptr = ERR23;
3107 /* Give an error if there's back reference to a non-existent capturing
3110 if (top_backref > re->top_bracket) *errorptr = ERR15;
3112 /* Failed to compile */
3114 if (*errorptr != NULL)
3118 *erroroffset = ptr - (const uschar *)pattern;
3122 /* If the anchored option was not passed, set flag if we can determine that the
3123 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3124 starting with .* when DOTALL is set).
3126 Otherwise, see if we can determine what the first character has to be, because
3127 that speeds up unanchored matches no end. If not, see if we can set the
3128 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3129 start with ^, and also when all branches start with .* for non-DOTALL matches.
3132 if ((options & PCRE_ANCHORED) == 0)
3134 int temp_options = options;
3135 if (is_anchored(re->code, &temp_options))
3136 re->options |= PCRE_ANCHORED;
3139 int ch = find_firstchar(re->code, &temp_options);
3142 re->first_char = ch;
3143 re->options |= PCRE_FIRSTSET;
3145 else if (is_startline(re->code))
3146 re->options |= PCRE_STARTLINE;
3150 /* Save the last required character if there are at least two literal
3151 characters on all paths, or if there is no first character setting. */
3153 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3155 re->req_char = reqchar;
3156 re->options |= PCRE_REQCHSET;
3159 /* Print out the compiled data for debugging */
3163 printf("Length = %d top_bracket = %d top_backref = %d\n",
3164 length, re->top_bracket, re->top_backref);
3166 if (re->options != 0)
3168 printf("%s%s%s%s%s%s%s%s%s\n",
3169 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3170 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3171 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3172 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3173 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3174 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3175 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3176 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3177 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3180 if ((re->options & PCRE_FIRSTSET) != 0)
3182 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3183 else printf("First char = \\x%02x\n", re->first_char);
3186 if ((re->options & PCRE_REQCHSET) != 0)
3188 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3189 else printf("Req char = \\x%02x\n", re->req_char);
3193 code_base = code = re->code;
3195 while (code < code_end)
3199 printf("%3d ", code - code_base);
3201 if (*code >= OP_BRA)
3203 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3204 printf("%3d Bra extra", (code[1] << 8) + code[2]);
3206 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3213 printf(" %.2x %s", code[1], OP_names[*code]);
3218 charlength = *(++code);
3219 printf("%3d ", charlength);
3220 while (charlength-- > 0)
3221 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3231 case OP_ASSERTBACK_NOT:
3237 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3248 case OP_TYPEMINSTAR:
3250 case OP_TYPEMINPLUS:
3252 case OP_TYPEMINQUERY:
3253 if (*code >= OP_TYPESTAR)
3254 printf(" %s", OP_names[code[1]]);
3255 else if (isprint(c = code[1])) printf(" %c", c);
3256 else printf(" \\x%02x", c);
3257 printf("%s", OP_names[*code++]);
3263 if (isprint(c = code[3])) printf(" %c{", c);
3264 else printf(" \\x%02x{", c);
3265 if (*code != OP_EXACT) printf("0,");
3266 printf("%d}", (code[1] << 8) + code[2]);
3267 if (*code == OP_MINUPTO) printf("?");
3273 case OP_TYPEMINUPTO:
3274 printf(" %s{", OP_names[code[3]]);
3275 if (*code != OP_TYPEEXACT) printf(",");
3276 printf("%d}", (code[1] << 8) + code[2]);
3277 if (*code == OP_TYPEMINUPTO) printf("?");
3282 if (isprint(c = *(++code))) printf(" [^%c]", c);
3283 else printf(" [^\\x%02x]", c);
3291 case OP_NOTMINQUERY:
3292 if (isprint(c = code[1])) printf(" [^%c]", c);
3293 else printf(" [^\\x%02x]", c);
3294 printf("%s", OP_names[*code++]);
3300 if (isprint(c = code[3])) printf(" [^%c]{", c);
3301 else printf(" [^\\x%02x]{", c);
3302 if (*code != OP_NOTEXACT) printf(",");
3303 printf("%d}", (code[1] << 8) + code[2]);
3304 if (*code == OP_NOTMINUPTO) printf("?");
3309 printf(" \\%d", (code[1] << 8) | code[2]);
3311 goto CLASS_REF_REPEAT;
3319 for (i = 0; i < 256; i++)
3321 if ((code[i/8] & (1 << (i&7))) != 0)
3324 for (j = i+1; j < 256; j++)
3325 if ((code[j/8] & (1 << (j&7))) == 0) break;
3326 if (i == '-' || i == ']') printf("\\");
3327 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3331 if (j == '-' || j == ']') printf("\\");
3332 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3350 printf("%s", OP_names[*code]);
3355 min = (code[1] << 8) + code[2];
3356 max = (code[3] << 8) + code[4];
3357 if (max == 0) printf("{%d,}", min);
3358 else printf("{%d,%d}", min, max);
3359 if (*code == OP_CRMINRANGE) printf("?");
3369 /* Anything else is just a one-node item */
3372 printf(" %s", OP_names[*code]);
3379 printf("------------------------------------------------------------------\n");
3381 /* This check is done here in the debugging case so that the code that
3382 was compiled can be seen. */
3384 if (code - re->code > length)
3388 *erroroffset = ptr - (uschar *)pattern;
3398 /*************************************************
3399 * Match a back-reference *
3400 *************************************************/
3402 /* If a back reference hasn't been set, the length that is passed is greater
3403 than the number of characters left in the string, so the match fails.
3406 offset index into the offset vector
3407 eptr points into the subject
3408 length length to be matched
3409 md points to match data block
3412 Returns: TRUE if matched
3416 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3417 unsigned long int ims)
/* Compares length bytes of the subject at eptr with length bytes starting at
the position recorded in md->offset_vector[offset]. Any mismatch, or a subject
with fewer than length bytes left, yields FALSE. */
/* p is the start of the referenced text: offset_vector[offset] is applied as
an offset from md->start_subject. */
3419 const uschar *p = md->start_subject + md->offset_vector[offset];
/* Diagnostic trace only: the printf/pchars calls below write the subject text
(or "<null>" when eptr is at or beyond end_subject) and the back-reference text.
NOTE(review): presumably compiled only when DEBUG is defined - the guarding
preprocessor lines are not visible here; confirm against the full source. */
3422 if (eptr >= md->end_subject)
3423 printf("matching subject <null>");
3426 printf("matching subject ");
3427 pchars(eptr, length, TRUE, md);
3429 printf(" against backref ");
3430 pchars(p, length, FALSE, md);
3434 /* Always fail if fewer than length characters are left in the subject */
3436 if (length > md->end_subject - eptr) return FALSE;
3438 /* Separate the caseless case for speed: when PCRE_CASELESS is set in ims,
both bytes are looked up in the md->lcc table before being compared
(presumably a lower-casing table - TODO confirm). */
3440 if ((ims & PCRE_CASELESS) != 0)
3442 while (length-- > 0)
3443 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
/* Exact byte-for-byte comparison. NOTE(review): presumably the else arm of the
caseless test above - confirm against the full source. */
3446 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3453 /*************************************************
3454 * Match from current position *
3455 *************************************************/
3457 /* On entry ecode points to the first opcode, and eptr to the first character
3458 in the subject string, while eptrb holds the value of eptr at the start of the
3459 last bracketed group - used for breaking infinite loops matching zero-length
3463 eptr pointer in subject
3464 ecode position in code
3465 offset_top current top pointer
3466 md pointer to "static" info for the match
3467 ims current /i, /m, and /s options
3468 eptrb pointer to chain of blocks containing eptr at start of
3469 brackets - for testing for empty matches
3471 match_condassert - this is an assertion condition
3472 match_isgroup - this is the start of a bracketed group
3474 Returns: TRUE if matched
3478 match(register const uschar *eptr, register const uschar *ecode,
3479 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3482 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3485 /* At the start of a bracketed group, add the current subject pointer to the
3486 stack of such pointers, to be re-instated at the end of the group when we hit
3487 the closing ket. When match() is called in other circumstances, we don't add to
3490 if ((flags & match_isgroup) != 0)
3492 newptrb.prev = eptrb;
3493 newptrb.saved_eptr = eptr;
3497 /* Now start processing the operations. */
3501 int op = (int)*ecode;
3502 int min, max, ctype;
3505 BOOL minimize = FALSE;
3507 /* Opening capturing bracket. If there is space in the offset vector, save
3508 the current subject position in the working slot at the top of the vector. We
3509 mustn't change the current values of the data slot, because they may be set
3510 from a previous iteration of this group, and be referred to by a reference
3513 If the bracket fails to match, we need to restore this value and also the
3514 values of the final offsets, in case they were set by a previous iteration of
3517 If there isn't enough space in the offset vector, treat this as if it were a
3518 non-capturing bracket. Don't worry about setting the flag for the error case
3519 here; that is handled in the code for KET. */
3524 int number = op - OP_BRA;
3526 /* For extended extraction brackets (large number), we have to fish out the
3527 number from a dummy opcode at the start. */
3529 if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3530 offset = number << 1;
3533 printf("start bracket %d subject=", number);
3534 pchars(eptr, 16, TRUE, md);
3538 if (offset < md->offset_max)
3540 int save_offset1 = md->offset_vector[offset];
3541 int save_offset2 = md->offset_vector[offset+1];
3542 int save_offset3 = md->offset_vector[md->offset_end - number];
3544 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3545 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3549 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3551 ecode += (ecode[1] << 8) + ecode[2];
3553 while (*ecode == OP_ALT);
3555 DPRINTF(("bracket %d failed\n", number));
3557 md->offset_vector[offset] = save_offset1;
3558 md->offset_vector[offset+1] = save_offset2;
3559 md->offset_vector[md->offset_end - number] = save_offset3;
3564 /* Insufficient room for saving captured contents */
3569 /* Other types of node can be handled by a switch */
3573 case OP_BRA: /* Non-capturing bracket: optimized */
3574 DPRINTF(("start bracket 0\n"));
3577 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3579 ecode += (ecode[1] << 8) + ecode[2];
3581 while (*ecode == OP_ALT);
3582 DPRINTF(("bracket 0 failed\n"));
3585 /* Conditional group: compilation checked that there are no more than
3586 two branches. If the condition is false, skipping the first branch takes us
3587 past the end if there is only one branch, but that's OK because that is
3588 exactly what going to the ket would do. */
3591 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3593 int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3595 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3596 6 : 3 + (ecode[1] << 8) + ecode[2]),
3597 offset_top, md, ims, eptrb, match_isgroup);
3600 /* The condition is an assertion. Call match() to evaluate it - passing
3601 match_condassert in the flags causes it to stop at the end of an assertion. */
3605 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3606 match_condassert | match_isgroup))
3608 ecode += 3 + (ecode[4] << 8) + ecode[5];
3609 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3611 else ecode += (ecode[1] << 8) + ecode[2];
3612 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3614 /* Control never reaches here */
3616 /* Skip over conditional reference or large extraction number data if
3624 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3625 an empty string - recursion will then try other alternatives, if any. */
3628 if (md->notempty && eptr == md->start_match) return FALSE;
3629 md->end_match_ptr = eptr; /* Record where we ended */
3630 md->end_offset_top = offset_top; /* and how many extracts were taken */
3633 /* Change option settings */
3638 DPRINTF(("ims set to %02lx\n", ims));
3641 /* Assertion brackets. Check the alternative branches in turn - the
3642 matching won't pass the KET for an assertion. If any one branch matches,
3643 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3644 start of each branch to move the current point backwards, so the code at
3645 this level is identical to the lookahead case. */
3651 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3652 ecode += (ecode[1] << 8) + ecode[2];
3654 while (*ecode == OP_ALT);
3655 if (*ecode == OP_KET) return FALSE;
3657 /* If checking an assertion for a condition, return TRUE. */
3659 if ((flags & match_condassert) != 0) return TRUE;
3661 /* Continue from after the assertion, updating the offsets high water
3662 mark, since extracts may have been taken during the assertion. */
3664 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3666 offset_top = md->end_offset_top;
3669 /* Negative assertion: all branches must fail to match */
3672 case OP_ASSERTBACK_NOT:
3675 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3677 ecode += (ecode[1] << 8) + ecode[2];
3679 while (*ecode == OP_ALT);
3681 if ((flags & match_condassert) != 0) return TRUE;
3686 /* Move the subject pointer back. This occurs only at the start of
3687 each branch of a lookbehind assertion. If we are too close to the start to
3688 move back, this match function fails. When working with UTF-8 we move
3689 back a number of characters, not bytes. */
3693 c = (ecode[1] << 8) + ecode[2];
3694 for (i = 0; i < c; i++)
3700 eptr -= (ecode[1] << 8) + ecode[2];
3703 if (eptr < md->start_subject) return FALSE;
3707 /* Recursion matches the current regex, nested. If there are any capturing
3708 brackets started but not finished, we have to save their starting points
3709 and reinstate them after the recursion. However, we don't know how many
3710 such there are (offset_top records the completed total) so we just have
3711 to save all the potential data. There may be up to 99 such values, which
3712 is a bit large to put on the stack, but using malloc for small numbers
3713 seems expensive. As a compromise, the stack is used when there are fewer
3714 than 16 values to store; otherwise malloc is used. A problem is what to do
3715 if the malloc fails ... there is no way of returning to the top level with
3716 an error. Save the top 15 values on the stack, and accept that the rest
3727 if (c < 16) save = stacksave; else
3729 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3737 for (i = 1; i <= c; i++)
3738 save[i] = md->offset_vector[md->offset_end - i];
3739 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3741 for (i = 1; i <= c; i++)
3742 md->offset_vector[md->offset_end - i] = save[i];
3743 if (save != stacksave) (pcre_free)(save);
3744 if (!rc) return FALSE;
3746 /* In case the recursion has set more capturing values, save the final
3747 number, then move along the subject till after the recursive match,
3748 and advance one byte in the pattern code. */
3750 offset_top = md->end_offset_top;
3751 eptr = md->end_match_ptr;
3756 /* "Once" brackets are like assertion brackets except that after a match,
3757 the point in the subject string is not moved back. Thus there can never be
3758 a move back into the brackets. Check the alternative branches in turn - the
3759 matching won't pass the KET for this kind of subpattern. If any one branch
3760 matches, we carry on as at the end of a normal bracket, leaving the subject
3765 const uschar *prev = ecode;
3766 const uschar *saved_eptr = eptr;
3770 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3772 ecode += (ecode[1] << 8) + ecode[2];
3774 while (*ecode == OP_ALT);
3776 /* If hit the end of the group (which could be repeated), fail */
3778 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3780 /* Continue as from after the assertion, updating the offsets high water
3781 mark, since extracts may have been taken. */
3783 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3785 offset_top = md->end_offset_top;
3786 eptr = md->end_match_ptr;
3788 /* For a non-repeating ket, just continue at this level. This also
3789 happens for a repeating ket if no characters were matched in the group.
3790 This is the forcible breaking of infinite loops as implemented in Perl
3791 5.005. If there is an options reset, it will get obeyed in the normal
3792 course of events. */
3794 if (*ecode == OP_KET || eptr == saved_eptr)
3800 /* The repeating kets try the rest of the pattern or restart from the
3801 preceding bracket, in the appropriate order. We need to reset any options
3802 that changed within the bracket before re-running it, so check the next
3805 if (ecode[3] == OP_OPT)
3807 ims = (ims & ~PCRE_IMS) | ecode[4];
3808 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3811 if (*ecode == OP_KETRMIN)
3813 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3814 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3817 else /* OP_KETRMAX */
3819 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3820 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3825 /* An alternation is the end of a branch; scan along to find the end of the
3826 bracketed group and go to there. */
3829 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3832 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3833 that it may occur zero times. It may repeat infinitely, or not at all -
3834 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3835 repeat limits are compiled as a number of copies, with the optional ones
3836 preceded by BRAZERO or BRAMINZERO. */
3840 const uschar *next = ecode+1;
3841 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3843 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3850 const uschar *next = ecode+1;
3851 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3852 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3858 /* End of a group, repeated or non-repeating. If we are at the end of
3859 an assertion "group", stop matching and return TRUE, but record the
3860 current high water mark for use by positive assertions. Do this also
3861 for the "once" (not-backing-up) groups. */
3867 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3868 const uschar *saved_eptr = eptrb->saved_eptr;
3870 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3872 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3873 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3876 md->end_match_ptr = eptr; /* For ONCE */
3877 md->end_offset_top = offset_top;
3881 /* In all other cases except a conditional group we have to check the
3882 group number back at the start and if necessary complete handling an
3883 extraction by setting the offsets and bumping the high water mark. */
3885 if (*prev != OP_COND)
3888 int number = *prev - OP_BRA;
3890 /* For extended extraction brackets (large number), we have to fish out
3891 the number from a dummy opcode at the start. */
3893 if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3894 offset = number << 1;
3897 printf("end bracket %d", number);
3903 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3905 md->offset_vector[offset] =
3906 md->offset_vector[md->offset_end - number];
3907 md->offset_vector[offset+1] = eptr - md->start_subject;
3908 if (offset_top <= offset) offset_top = offset + 2;
3913 /* Reset the value of the ims flags, in case they got changed during
3917 DPRINTF(("ims reset to %02lx\n", ims));
3919 /* For a non-repeating ket, just continue at this level. This also
3920 happens for a repeating ket if no characters were matched in the group.
3921 This is the forcible breaking of infinite loops as implemented in Perl
3922 5.005. If there is an options reset, it will get obeyed in the normal
3923 course of events. */
3925 if (*ecode == OP_KET || eptr == saved_eptr)
3931 /* The repeating kets try the rest of the pattern or restart from the
3932 preceding bracket, in the appropriate order. */
3934 if (*ecode == OP_KETRMIN)
3936 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3937 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3940 else /* OP_KETRMAX */
3942 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3943 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3948 /* Start of subject unless notbol, or after internal newline if multiline */
3951 if (md->notbol && eptr == md->start_subject) return FALSE;
3952 if ((ims & PCRE_MULTILINE) != 0)
3954 if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3958 /* ... else fall through */
3960 /* Start of subject assertion */
3963 if (eptr != md->start_subject) return FALSE;
3967 /* Assert before internal newline if multiline, or before a terminating
3968 newline unless endonly is set, else end of subject unless noteol is set. */
3971 if ((ims & PCRE_MULTILINE) != 0)
3973 if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3974 else { if (md->noteol) return FALSE; }
3980 if (md->noteol) return FALSE;
3983 if (eptr < md->end_subject - 1 ||
3984 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3990 /* ... else fall through */
3992 /* End of subject assertion (\z) */
3995 if (eptr < md->end_subject) return FALSE;
3999 /* End of subject or ending \n assertion (\Z) */
4002 if (eptr < md->end_subject - 1 ||
4003 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
4007 /* Word boundary assertions */
4009 case OP_NOT_WORD_BOUNDARY:
4010 case OP_WORD_BOUNDARY:
4012 BOOL prev_is_word = (eptr != md->start_subject) &&
4013 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
4014 BOOL cur_is_word = (eptr < md->end_subject) &&
4015 ((md->ctypes[*eptr] & ctype_word) != 0);
4016 if ((*ecode++ == OP_WORD_BOUNDARY)?
4017 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4022 /* Match a single character type; inline for speed */
4025 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4027 if (eptr++ >= md->end_subject) return FALSE;
4030 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4036 if (eptr >= md->end_subject ||
4037 (md->ctypes[*eptr++] & ctype_digit) != 0)
4043 if (eptr >= md->end_subject ||
4044 (md->ctypes[*eptr++] & ctype_digit) == 0)
4049 case OP_NOT_WHITESPACE:
4050 if (eptr >= md->end_subject ||
4051 (md->ctypes[*eptr++] & ctype_space) != 0)
4057 if (eptr >= md->end_subject ||
4058 (md->ctypes[*eptr++] & ctype_space) == 0)
4063 case OP_NOT_WORDCHAR:
4064 if (eptr >= md->end_subject ||
4065 (md->ctypes[*eptr++] & ctype_word) != 0)
4071 if (eptr >= md->end_subject ||
4072 (md->ctypes[*eptr++] & ctype_word) == 0)
4077 /* Match a back reference, possibly repeatedly. Look past the end of the
4078 item to see if there is repeat information following. The code is similar
4079 to that for character classes, but repeated for efficiency. Then obey
4080 similar code to character type repeats - written out again for speed.
4081 However, if the referenced string is the empty string, always treat
4082 it as matched, any number of times (otherwise there could be infinite
4088 int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4089 ecode += 3; /* Advance past item */
4091 /* If the reference is unset, set the length to be longer than the amount
4092 of subject left; this ensures that every attempt at a match fails. We
4093 can't just fail here, because of the possibility of quantifiers with zero minima. */
4096 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4097 md->end_subject - eptr + 1 :
4098 md->offset_vector[offset+1] - md->offset_vector[offset];
4100 /* Set up for repetition, or handle the non-repeated case */
4110 c = *ecode++ - OP_CRSTAR;
4111 minimize = (c & 1) != 0;
4112 min = rep_min[c]; /* Pick up values from tables; */
4113 max = rep_max[c]; /* zero for max => infinity */
4114 if (max == 0) max = INT_MAX;
4119 minimize = (*ecode == OP_CRMINRANGE);
4120 min = (ecode[1] << 8) + ecode[2];
4121 max = (ecode[3] << 8) + ecode[4];
4122 if (max == 0) max = INT_MAX;
4126 default: /* No repeat follows */
4127 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4129 continue; /* With the main loop */
4132 /* If the length of the reference is zero, just continue with the main loop. */
4135 if (length == 0) continue;
4137 /* First, ensure the minimum number of matches are present. We get back
4138 the length of the reference string explicitly rather than passing the
4139 address of eptr, so that eptr can be a register variable. */
4141 for (i = 1; i <= min; i++)
4143 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4147 /* If min = max, continue at the same level without recursion.
4148 They are not both allowed to be zero. */
4150 if (min == max) continue;
4152 /* If minimizing, keep trying and advancing the pointer */
4158 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4160 if (i >= max || !match_ref(offset, eptr, length, md, ims))
4164 /* Control never gets here */
4167 /* If maximizing, find the longest string and work backwards */
4171 const uschar *pp = eptr;
4172 for (i = min; i < max; i++)
4174 if (!match_ref(offset, eptr, length, md, ims)) break;
4179 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4186 /* Control never gets here */
4190 /* Match a character class, possibly repeatedly. Look past the end of the
4191 item to see if there is repeat information following. Then obey similar
4192 code to character type repeats - written out again for speed. */
4196 const uschar *data = ecode + 1; /* Save for matching */
4197 ecode += 33; /* Advance past the item */
4207 c = *ecode++ - OP_CRSTAR;
4208 minimize = (c & 1) != 0;
4209 min = rep_min[c]; /* Pick up values from tables; */
4210 max = rep_max[c]; /* zero for max => infinity */
4211 if (max == 0) max = INT_MAX;
4216 minimize = (*ecode == OP_CRMINRANGE);
4217 min = (ecode[1] << 8) + ecode[2];
4218 max = (ecode[3] << 8) + ecode[4];
4219 if (max == 0) max = INT_MAX;
4223 default: /* No repeat follows */
4228 /* First, ensure the minimum number of matches are present. */
4230 for (i = 1; i <= min; i++)
4232 if (eptr >= md->end_subject) return FALSE;
4233 GETCHARINC(c, eptr) /* Get character; increment eptr */
4236 /* We do not yet support class members > 255 */
4237 if (c > 255) return FALSE;
4240 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4244 /* If max == min we can continue with the main loop without the need to recurse. */
4247 if (min == max) continue;
4249 /* If minimizing, keep testing the rest of the expression and advancing
4250 the pointer while it matches the class. */
4256 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4258 if (i >= max || eptr >= md->end_subject) return FALSE;
4259 GETCHARINC(c, eptr) /* Get character; increment eptr */
4262 /* We do not yet support class members > 255 */
4263 if (c > 255) return FALSE;
4265 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4268 /* Control never gets here */
4271 /* If maximizing, find the longest possible run, then work backwards. */
4275 const uschar *pp = eptr;
4277 for (i = min; i < max; i++)
4279 if (eptr >= md->end_subject) break;
4280 GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4283 /* We do not yet support class members > 255 */
4286 if ((data[c/8] & (1 << (c&7))) == 0) break;
4292 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4302 /* Control never gets here */
4304 /* Match a run of characters */
4308 register int length = ecode[1];
4311 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4312 if (eptr >= md->end_subject)
4313 printf("matching subject <null> against pattern ");
4316 printf("matching subject ");
4317 pchars(eptr, length, TRUE, md);
4318 printf(" against pattern ");
4320 pchars(ecode, length, FALSE, md);
4324 if (length > md->end_subject - eptr) return FALSE;
4325 if ((ims & PCRE_CASELESS) != 0)
4327 while (length-- > 0)
4328 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4333 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4338 /* Match a single character repeatedly; different opcodes share code. */
4341 min = max = (ecode[1] << 8) + ecode[2];
4348 max = (ecode[1] << 8) + ecode[2];
4349 minimize = *ecode == OP_MINUPTO;
4359 c = *ecode++ - OP_STAR;
4360 minimize = (c & 1) != 0;
4361 min = rep_min[c]; /* Pick up values from tables; */
4362 max = rep_max[c]; /* zero for max => infinity */
4363 if (max == 0) max = INT_MAX;
4365 /* Common code for all repeated single-character matches. We can give
4366 up quickly if there are fewer than the minimum number of characters left in the subject. */
4370 if (min > md->end_subject - eptr) return FALSE;
4373 /* The code is duplicated for the caseless and caseful cases, for speed,
4374 since matching characters is likely to be quite common. First, ensure the
4375 minimum number of matches are present. If min = max, continue at the same
4376 level without recursing. Otherwise, if minimizing, keep trying the rest of
4377 the expression and advancing one matching character if failing, up to the
4378 maximum. Alternatively, if maximizing, find the maximum number of
4379 characters and work backwards. */
4381 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4384 if ((ims & PCRE_CASELESS) != 0)
4387 for (i = 1; i <= min; i++)
4388 if (c != md->lcc[*eptr++]) return FALSE;
4389 if (min == max) continue;
4394 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4396 if (i >= max || eptr >= md->end_subject ||
4397 c != md->lcc[*eptr++])
4400 /* Control never gets here */
4404 const uschar *pp = eptr;
4405 for (i = min; i < max; i++)
4407 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4411 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4415 /* Control never gets here */
4418 /* Caseful comparisons */
4422 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4423 if (min == max) continue;
4428 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4430 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4432 /* Control never gets here */
4436 const uschar *pp = eptr;
4437 for (i = min; i < max; i++)
4439 if (eptr >= md->end_subject || c != *eptr) break;
4443 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4448 /* Control never gets here */
4450 /* Match a negated single character */
4453 if (eptr >= md->end_subject) return FALSE;
4455 if ((ims & PCRE_CASELESS) != 0)
4457 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4461 if (*ecode++ == *eptr++) return FALSE;
4465 /* Match a negated single character repeatedly. This is almost a repeat of
4466 the code for a repeated single character, but I haven't found a nice way of
4467 commoning these up that doesn't require a test of the positive/negative
4468 option for each character match. Maybe that wouldn't add very much to the
4469 time taken, but character matching *is* what this is all about... */
4472 min = max = (ecode[1] << 8) + ecode[2];
4479 max = (ecode[1] << 8) + ecode[2];
4480 minimize = *ecode == OP_NOTMINUPTO;
4489 case OP_NOTMINQUERY:
4490 c = *ecode++ - OP_NOTSTAR;
4491 minimize = (c & 1) != 0;
4492 min = rep_min[c]; /* Pick up values from tables; */
4493 max = rep_max[c]; /* zero for max => infinity */
4494 if (max == 0) max = INT_MAX;
4496 /* Common code for all repeated single-character matches. We can give
4497 up quickly if there are fewer than the minimum number of characters left in the subject. */
4501 if (min > md->end_subject - eptr) return FALSE;
4504 /* The code is duplicated for the caseless and caseful cases, for speed,
4505 since matching characters is likely to be quite common. First, ensure the
4506 minimum number of matches are present. If min = max, continue at the same
4507 level without recursing. Otherwise, if minimizing, keep trying the rest of
4508 the expression and advancing one matching character if failing, up to the
4509 maximum. Alternatively, if maximizing, find the maximum number of
4510 characters and work backwards. */
4512 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4515 if ((ims & PCRE_CASELESS) != 0)
4518 for (i = 1; i <= min; i++)
4519 if (c == md->lcc[*eptr++]) return FALSE;
4520 if (min == max) continue;
4525 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4527 if (i >= max || eptr >= md->end_subject ||
4528 c == md->lcc[*eptr++])
4531 /* Control never gets here */
4535 const uschar *pp = eptr;
4536 for (i = min; i < max; i++)
4538 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4542 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4546 /* Control never gets here */
4549 /* Caseful comparisons */
4553 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4554 if (min == max) continue;
4559 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4561 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4563 /* Control never gets here */
4567 const uschar *pp = eptr;
4568 for (i = min; i < max; i++)
4570 if (eptr >= md->end_subject || c == *eptr) break;
4574 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4579 /* Control never gets here */
4581 /* Match a single character type repeatedly; several different opcodes
4582 share code. This is very similar to the code for single characters, but we
4583 repeat it in the interests of efficiency. */
4586 min = max = (ecode[1] << 8) + ecode[2];
4592 case OP_TYPEMINUPTO:
4594 max = (ecode[1] << 8) + ecode[2];
4595 minimize = *ecode == OP_TYPEMINUPTO;
4600 case OP_TYPEMINSTAR:
4602 case OP_TYPEMINPLUS:
4604 case OP_TYPEMINQUERY:
4605 c = *ecode++ - OP_TYPESTAR;
4606 minimize = (c & 1) != 0;
4607 min = rep_min[c]; /* Pick up values from tables; */
4608 max = rep_max[c]; /* zero for max => infinity */
4609 if (max == 0) max = INT_MAX;
4611 /* Common code for all repeated single character type matches */
4614 ctype = *ecode++; /* Code for the character type */
4616 /* First, ensure the minimum number of matches are present. Use inline
4617 code for maximizing the speed, and do the type test once at the start
4618 (i.e. keep it out of the loop). Also we can test that there are at least
4619 the minimum number of bytes before we start, except when doing '.' in
4620 UTF8 mode. Leave the test in place in all cases; in the special case we have
4621 to test after each character. */
4623 if (min > md->end_subject - eptr) return FALSE;
4624 if (min > 0) switch(ctype)
4630 for (i = 1; i <= min; i++)
4632 if (eptr >= md->end_subject ||
4633 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4635 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4640 /* Non-UTF8 can be faster */
4641 if ((ims & PCRE_DOTALL) == 0)
4642 { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4647 for (i = 1; i <= min; i++)
4648 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4652 for (i = 1; i <= min; i++)
4653 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4656 case OP_NOT_WHITESPACE:
4657 for (i = 1; i <= min; i++)
4658 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4662 for (i = 1; i <= min; i++)
4663 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4666 case OP_NOT_WORDCHAR:
4667 for (i = 1; i <= min; i++)
4668 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4673 for (i = 1; i <= min; i++)
4674 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4679 /* If min = max, continue at the same level without recursing */
4681 if (min == max) continue;
4683 /* If minimizing, we have to test the rest of the pattern before each
4684 subsequent match. */
4690 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4691 if (i >= max || eptr >= md->end_subject) return FALSE;
4697 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4700 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4705 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4709 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4712 case OP_NOT_WHITESPACE:
4713 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4717 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4720 case OP_NOT_WORDCHAR:
4721 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4725 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4729 /* Control never gets here */
4732 /* If maximizing it is worth using inline code for speed, doing the type
4733 test once at the start (i.e. keep it out of the loop). */
4737 const uschar *pp = eptr;
4742 /* Special code is required for UTF8, but when the maximum is unlimited
4743 we don't need it. */
4746 if (md->utf8 && max < INT_MAX)
4748 if ((ims & PCRE_DOTALL) == 0)
4750 for (i = min; i < max; i++)
4752 if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4753 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4758 for (i = min; i < max; i++)
4761 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4767 /* Non-UTF8 can be faster */
4768 if ((ims & PCRE_DOTALL) == 0)
4770 for (i = min; i < max; i++)
4772 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4779 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4785 for (i = min; i < max; i++)
4787 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4794 for (i = min; i < max; i++)
4796 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4802 case OP_NOT_WHITESPACE:
4803 for (i = min; i < max; i++)
4805 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4812 for (i = min; i < max; i++)
4814 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4820 case OP_NOT_WORDCHAR:
4821 for (i = min; i < max; i++)
4823 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4830 for (i = min; i < max; i++)
4832 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4841 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4845 while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4850 /* Control never gets here */
4852 /* There's been some horrible disaster. */
4855 DPRINTF(("Unknown opcode %d\n", *ecode));
4856 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4860 /* Do not stick any code in here without much thought; it is assumed
4861 that "continue" in the code above comes out to here to repeat the main loop. */
4864 } /* End of main loop */
4865 /* Control never reaches here */
4871 /*************************************************
4872 * Execute a Regular Expression *
4873 *************************************************/
4875 /* This function applies a compiled re to a subject string and picks out
4876 portions of the string if it matches. Two elements in the vector are set for
4877 each substring: the offsets to the start and end of the substring.
4880 external_re points to the compiled expression
4881 external_extra points to "hints" from pcre_study() or is NULL
4882 subject points to the subject string
4883 length length of subject string (may contain binary zeros)
4884 start_offset where to start in the subject string
4885 options option bits
4886 offsets points to a vector of ints to be filled in with offsets
4887 offsetcount the number of elements in the vector
4889 Returns: > 0 => success; value is the number of elements filled in
4890 = 0 => success, but offsets is not big enough
4891 -1 => failed to match
4892 < -1 => some kind of unexpected problem */
4896 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4897 const char *subject, int length, int start_offset, int options, int *offsets,
4900 int resetcount, ocount;
4901 int first_char = -1;
4904 unsigned long int ims = 0;
4905 match_data match_block;
4906 const uschar *start_bits = NULL;
4907 const uschar *start_match = (const uschar *)subject + start_offset;
4908 const uschar *end_subject;
4909 const uschar *req_char_ptr = start_match - 1;
4910 const real_pcre *re = (const real_pcre *)external_re;
4911 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4912 BOOL using_temporary_offsets = FALSE;
4916 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4918 if (re == NULL || subject == NULL ||
4919 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4920 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4922 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4923 startline = (re->options & PCRE_STARTLINE) != 0;
4925 match_block.start_pattern = re->code;
4926 match_block.start_subject = (const uschar *)subject;
4927 match_block.end_subject = match_block.start_subject + length;
4928 end_subject = match_block.end_subject;
4930 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4931 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4933 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4934 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4935 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4937 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4939 match_block.lcc = re->tables + lcc_offset;
4940 match_block.ctypes = re->tables + ctypes_offset;
4942 /* The ims options can vary during the matching as a result of the presence
4943 of (?ims) items in the pattern. They are kept in a local variable so that
4944 restoring at the exit of a group is easy. */
4946 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4948 /* If the expression has got more back references than the offsets supplied can
4949 hold, we get a temporary bit of working store to use during the matching.
4950 Otherwise, we can use the vector supplied, rounding down its size to a multiple of 3. */
4953 ocount = offsetcount - (offsetcount % 3);
4955 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4957 ocount = re->top_backref * 3 + 3;
4958 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4959 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4960 using_temporary_offsets = TRUE;
4961 DPRINTF(("Got memory to hold back references\n"));
4963 else match_block.offset_vector = offsets;
4965 match_block.offset_end = ocount;
4966 match_block.offset_max = (2*ocount)/3;
4967 match_block.offset_overflow = FALSE;
4969 /* Compute the minimum number of offsets that we need to reset each time. Doing
4970 this makes a huge difference to execution time when there aren't many brackets in the pattern. */
4973 resetcount = 2 + re->top_bracket * 2;
4974 if (resetcount > offsetcount) resetcount = ocount;
4976 /* Reset the working variable associated with each extraction. These should
4977 never be used unless previously set, but they get saved and restored, and so we
4978 initialize them to avoid reading uninitialized locations. */
4980 if (match_block.offset_vector != NULL)
4982 register int *iptr = match_block.offset_vector + ocount;
4983 register int *iend = iptr - resetcount/2 + 1;
4984 while (--iptr >= iend) *iptr = -1;
4987 /* Set up the first character to match, if available. The first_char value is
4988 never set for an anchored regular expression, but the anchoring may be forced
4989 at run time, so we have to test for anchoring. The first char may be unset for
4990 an unanchored pattern, of course. If there's no first char and the pattern was
4991 studied, there may be a bitmap of possible first characters. */
4995 if ((re->options & PCRE_FIRSTSET) != 0)
4997 first_char = re->first_char;
4998 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
5001 if (!startline && extra != NULL &&
5002 (extra->options & PCRE_STUDY_MAPPED) != 0)
5003 start_bits = extra->start_bits;
5006 /* For anchored or unanchored matches, there may be a "last known required
5007 character" set. If PCRE_CASELESS is set, implying that the match starts
5008 caselessly, or if there are any changes of this flag within the regex, set up
5009 both cases of the character. Otherwise set the two values the same, which will
5010 avoid duplicate testing (which takes significant time). This covers the vast
5011 majority of cases. It will be suboptimal when the case flag changes in a regex
5012 and the required character in fact is caseful. */
5014 if ((re->options & PCRE_REQCHSET) != 0)
5016 req_char = re->req_char;
5017 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
5018 (re->tables + fcc_offset)[req_char] : req_char;
5021 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5022 the loop runs just once. */
5027 register int *iptr = match_block.offset_vector;
5028 register int *iend = iptr + resetcount;
5030 /* Reset the maximum number of extractions we might see. */
5032 while (iptr < iend) *iptr++ = -1;
5034 /* Advance to a unique first char if possible */
5036 if (first_char >= 0)
5038 if ((ims & PCRE_CASELESS) != 0)
5039 while (start_match < end_subject &&
5040 match_block.lcc[*start_match] != first_char)
5043 while (start_match < end_subject && *start_match != first_char)
5047 /* Or to just after \n for a multiline match if possible */
5051 if (start_match > match_block.start_subject + start_offset)
5053 while (start_match < end_subject && start_match[-1] != NEWLINE)
5058 /* Or to a non-unique first char after study */
5060 else if (start_bits != NULL)
5062 while (start_match < end_subject)
5064 register int c = *start_match;
5065 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5069 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5070 printf(">>>> Match against: ");
5071 pchars(start_match, end_subject - start_match, TRUE, &match_block);
5075 /* If req_char is set, we know that that character must appear in the subject
5076 for the match to succeed. If the first character is set, req_char must be
5077 later in the subject; otherwise the test starts at the match point. This
5078 optimization can save a huge amount of backtracking in patterns with nested
5079 unlimited repeats that aren't going to match. We don't know what the state of
5080 case matching may be when this character is hit, so test for it in both its
5081 cases if necessary. However, the different cased versions will not be set up
5082 unless PCRE_CASELESS was given or the casing state changes within the regex.
5083 Writing separate code makes it go faster, as does using an autoincrement and
5084 backing off on a match. */
5088 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5090 /* We don't need to repeat the search if we haven't yet reached the
5091 place we found it at last time. */
5093 if (p > req_char_ptr)
5095 /* Do a single test if no case difference is set up */
5097 if (req_char == req_char2)
5099 while (p < end_subject)
5101 if (*p++ == req_char) { p--; break; }
5105 /* Otherwise test for either case */
5109 while (p < end_subject)
5111 register int pp = *p++;
5112 if (pp == req_char || pp == req_char2) { p--; break; }
5116 /* If we can't find the required character, break the matching loop */
5118 if (p >= end_subject) break;
5120 /* If we have found the required character, save the point where we
5121 found it, so that we don't search again next time round the loop if
5122 the start hasn't passed this character yet. */
5128 /* When a match occurs, substrings will be set for all internal extractions;
5129 we just need to set up the whole thing as substring 0 before returning. If
5130 there were too many extractions, set the return code to zero. In the case
5131 where we had to get some local store to hold offsets for backreferences, copy
5132 those back references that we can. In this case there need not be overflow
5133 if certain parts of the pattern were not used. */
5135 match_block.start_match = start_match;
5136 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5139 /* Copy the offset information from temporary store if necessary */
5141 if (using_temporary_offsets)
5143 if (offsetcount >= 4)
5145 memcpy(offsets + 2, match_block.offset_vector + 2,
5146 (offsetcount - 2) * sizeof(int));
5147 DPRINTF(("Copied offsets from temporary memory\n"));
5149 if (match_block.end_offset_top > offsetcount)
5150 match_block.offset_overflow = TRUE;
5152 DPRINTF(("Freeing temporary memory\n"));
5153 (pcre_free)(match_block.offset_vector);
5156 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5158 if (offsetcount < 2) rc = 0; else
5160 offsets[0] = start_match - match_block.start_subject;
5161 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5164 DPRINTF((">>>> returning %d\n", rc));
5168 /* This "while" is the end of the "do" above */
5171 match_block.errorcode == PCRE_ERROR_NOMATCH &&
5172 start_match++ < end_subject);
5174 if (using_temporary_offsets)
5176 DPRINTF(("Freeing temporary memory\n"));
5177 (pcre_free)(match_block.offset_vector);
5180 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5182 return match_block.errorcode;