1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2004 University of Cambridge
14 -----------------------------------------------------------------------------
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions are met:
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 POSSIBILITY OF SUCH DAMAGE.
40 -----------------------------------------------------------------------------
44 /* Define DEBUG to get debugging output on stdout. */
47 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
48 inline, and there are *still* stupid compilers about that don't like indented
49 pre-processor statements. I suppose it's only been 10 years... */
/* Two alternative definitions of DPRINTF: one expands to a printf call, the
other to nothing. NOTE(review): the conditional directives that select between
them (presumably on DEBUG) are not visible here. Because the expansion is
"printf p", the argument must itself be a parenthesised printf argument list,
e.g. DPRINTF(("count=%d\n", count)). */
52 #define DPRINTF(p) printf p
54 #define DPRINTF(p) /*nothing*/
57 /* Include the internals header, which itself includes "config.h", the Standard
58 C headers, and the external pcre header. */
62 /* If Unicode Property support is wanted, include a private copy of the
63 function that does it, and the table that translates names to numbers. */
67 #include "ucptypetable.c"
70 /* Maximum number of items on the nested bracket stacks at compile time. This
71 applies to the nesting of all kinds of parentheses. It does not limit
72 un-nested, non-capturing parentheses. This number can be made bigger if
73 necessary - it is used to dimension one int and one unsigned char vector at compile time. */
76 #define BRASTACK_SIZE 200
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
83 #define REC_STACK_SAVE_MAX 30
86 /* The maximum remaining length of subject we are prepared to search for a req_byte match. */
89 #define REQ_BYTE_MAX 1000
92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
95 static const uschar OP_lengths[] = { OP_LENGTHS };
97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
/* The two tables are parallel. Entries come in adjacent pairs with identical
bounds: {0,unlimited}, {1,unlimited}, {0,1}. NOTE(review): presumably each pair
is the greedy and the minimizing form of *, + and ? respectively - confirm
against the opcode order in the internals header. */
99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape is invalid. */
107 #if !EBCDIC /* This is the "normal" table for ASCII systems */
108 static const short int escapes[] = {
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118 0, 0, -ESC_z /* x - z */
121 #else /* This is the "abnormal" table for EBCDIC systems */
122 static const short int escapes[] = {
123 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
133 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
141 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
143 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
150 /* Tables of names of POSIX character classes and their lengths. The list is
151 terminated by a zero length entry. The first three must be alpha, lower, upper,
152 as this is assumed for handling case independence. */
154 static const char *const posix_names[] = {
155 "alpha", "lower", "upper",
156 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
157 "print", "punct", "space", "word", "xdigit" };
/* posix_name_lengths[i] is the length of posix_names[i] (twelve 5-letter names,
then "word" and "xdigit"); the extra trailing 0 is the list terminator. The two
tables must be kept in step when a class is added. */
159 static const uschar posix_name_lengths[] = {
160 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
162 /* Table of class bit maps for each POSIX class; up to three may be combined
163 to form the class. The table for [:blank:] is dynamically modified to remove
164 the vertical space characters. */
/* Three consecutive entries per class, in the same order as posix_names. Each
entry is a cbit_xxx bit map identifier; -1 marks an unused slot, so a class uses
one, two or three maps. NOTE(review): the closing line of this initializer is
not visible here. */
166 static const int posix_class_maps[] = {
167 cbit_lower, cbit_upper, -1, /* alpha */
168 cbit_lower, -1, -1, /* lower */
169 cbit_upper, -1, -1, /* upper */
170 cbit_digit, cbit_lower, cbit_upper, /* alnum */
171 cbit_print, cbit_cntrl, -1, /* ascii */
172 cbit_space, -1, -1, /* blank - a GNU extension */
173 cbit_cntrl, -1, -1, /* cntrl */
174 cbit_digit, -1, -1, /* digit */
175 cbit_graph, -1, -1, /* graph */
176 cbit_print, -1, -1, /* print */
177 cbit_punct, -1, -1, /* punct */
178 cbit_space, -1, -1, /* space */
179 cbit_word, -1, -1, /* word - a Perl extension */
180 cbit_xdigit,-1, -1 /* xdigit */
183 /* Table to identify digits and hex digits. This is used when compiling
184 patterns. Note that the tables in chartables are dependent on the locale, and
185 may mark arbitrary characters as digits - but the PCRE compiling code expects
186 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187 a private table here. It costs 256 bytes, but it is a lot faster than doing
188 character value tests (at least in some simple cases I timed), and in some
189 applications one wants PCRE to compile efficiently as well as match efficiently.
192 For convenience, we use the same bit definitions as in chartables:
195 0x04 decimal digit, 0x08 hexadecimal digit
197 Then we can use ctype_digit and ctype_xdigit in the code. */
199 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
200 static const unsigned char digitab[] =
202 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
235 #else /* This is the "abnormal" case, for EBCDIC systems */
236 static const unsigned char digitab[] =
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
254 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
262 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
268 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
269 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
271 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
272 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
273 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
274 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
276 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
280 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
281 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
283 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
285 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
288 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
289 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
290 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
291 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
292 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
293 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
294 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
295 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
296 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
297 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
298 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
299 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
300 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
301 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
302 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
303 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
307 /* Definition to allow mutual recursion */
310 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311 BOOL, int, int *, int *, branch_chain *, compile_data *);
313 /* Structure for building a chain of data that actually lives on the
314 stack, for holding the values of the subject pointer at the start of each
315 subpattern, so as to detect when an empty string has been matched by a
316 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317 are on the heap, not on the stack. */
/* One link in the chain of saved subject pointers. NOTE(review): the closing
line of this typedef is not visible here. */
319 typedef struct eptrblock {
320 struct eptrblock *epb_prev; /* Link to the previous block in the chain */
321 const uschar *epb_saved_eptr; /* Subject pointer value saved in this block */
324 /* Flag bits for the match() function */
326 #define match_condassert 0x01 /* Called to check a condition assertion */
327 #define match_isgroup 0x02 /* Set if start of bracketed group */
329 /* Non-error returns from the match() function. Error returns are externally
330 defined PCRE_ERROR_xxx codes, which are all negative. */
332 #define MATCH_MATCH 1
333 #define MATCH_NOMATCH 0
337 /*************************************************
339 *************************************************/
341 /* PCRE is thread-clean and doesn't use any global variables in the normal
342 sense. However, it calls memory allocation and free functions via the four
343 indirections below, and it can optionally do callouts. These values can be
344 changed by the caller, but are shared between all threads. However, when
345 compiling for Virtual Pascal, things are done differently (see pcre.in). */
/* Two alternative sets of definitions for the same five hooks follow.
NOTE(review): the preprocessor lines that choose between them are not visible
here; presumably the extern "C" set is used when compiling as C++ - confirm.
In both sets the two allocators default to malloc, the two deallocators default
to free, and the callout hook defaults to NULL. */
349 extern "C" void *(*pcre_malloc)(size_t) = malloc;
350 extern "C" void (*pcre_free)(void *) = free;
351 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352 extern "C" void (*pcre_stack_free)(void *) = free;
353 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
355 void *(*pcre_malloc)(size_t) = malloc;
356 void (*pcre_free)(void *) = free;
357 void *(*pcre_stack_malloc)(size_t) = malloc;
358 void (*pcre_stack_free)(void *) = free;
359 int (*pcre_callout)(pcre_callout_block *) = NULL;
364 /*************************************************
365 * Macros and tables for character handling *
366 *************************************************/
368 /* When UTF-8 encoding is being used, a character is no longer just a single
369 byte. The macros for character handling generate simple sequences when used in
370 byte-mode, and more complicated ones for UTF-8 characters. */
/* Byte-mode versions: a character is exactly one byte, so each GETCHARxxx is a
plain load (with a post-increment of eptr in the INC forms), GETCHARLEN leaves
len untouched, and BACKCHAR expands to nothing. Each expansion already ends in
a semicolon. NOTE(review): the conditional that selects these definitions
(presumably when SUPPORT_UTF8 is not defined) is not visible here. */
373 #define GETCHAR(c, eptr) c = *eptr;
374 #define GETCHARINC(c, eptr) c = *eptr++;
375 #define GETCHARINCTEST(c, eptr) c = *eptr++;
376 #define GETCHARLEN(c, eptr, len) c = *eptr;
377 #define BACKCHAR(eptr)
379 #else /* SUPPORT_UTF8 */
381 /* Get the next UTF-8 character, not advancing the pointer. This is called when
382 we know we are in UTF-8 mode. */
384 #define GETCHAR(c, eptr) \
386 if ((c & 0xc0) == 0xc0) \
389 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
391 c = (c & utf8_table3[gcaa]) << gcss; \
392 for (gcii = 1; gcii <= gcaa; gcii++) \
395 c |= (eptr[gcii] & 0x3f) << gcss; \
399 /* Get the next UTF-8 character, advancing the pointer. This is called when we
400 know we are in UTF-8 mode. */
402 #define GETCHARINC(c, eptr) \
404 if ((c & 0xc0) == 0xc0) \
406 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
408 c = (c & utf8_table3[gcaa]) << gcss; \
412 c |= (*eptr++ & 0x3f) << gcss; \
416 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
418 #define GETCHARINCTEST(c, eptr) \
420 if (md->utf8 && (c & 0xc0) == 0xc0) \
422 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
424 c = (c & utf8_table3[gcaa]) << gcss; \
428 c |= (*eptr++ & 0x3f) << gcss; \
432 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
433 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
435 #define GETCHARLEN(c, eptr, len) \
437 if ((c & 0xc0) == 0xc0) \
440 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
442 c = (c & utf8_table3[gcaa]) << gcss; \
443 for (gcii = 1; gcii <= gcaa; gcii++) \
446 c |= (eptr[gcii] & 0x3f) << gcss; \
451 /* If the pointer is not at the start of a character, move it back until
452 it is. Called only in UTF-8 mode. */
454 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
460 /*************************************************
461 * Default character tables *
462 *************************************************/
464 /* A default set of character tables is included in the PCRE binary. Its source
465 is built by the maketables auxiliary program, which uses the default C ctypes
466 functions, and put in the file chartables.c. These tables are used by PCRE
467 whenever the caller of pcre_compile() does not provide an alternate set of tables. */
470 #include "chartables.c"
475 /*************************************************
476 * Tables for UTF-8 support *
477 *************************************************/
479 /* These are the breakpoints for different numbers of bytes in a UTF-8 character. */
482 static const int utf8_table1[] =
483 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
485 /* These are the indicator bits and the mask for the data bits to set in the
486 first byte of a character, indexed by the number of additional bytes. */
488 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
491 /* Table of the number of extra characters, indexed by the first character
492 masked with 0x3f. The highest number for a valid UTF-8 character is in fact 0x3d. */
/* The index is the lead byte with its top two bits masked off (c & 0x3f), as
used by the GETCHARxxx macros after they have checked (c & 0xc0) == 0xc0. */
495 static const uschar utf8_table4[] = {
496 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0f: 1 extra byte */
497 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1f: 1 extra byte */
498 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0x20-0x2f: 2 extra bytes */
499 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; /* 0x30-0x37: 3, 0x38-0x3b: 4, 0x3c-0x3f: 5 */
502 /*************************************************
503 * Convert character value to UTF-8 *
504 *************************************************/
506 /* This function takes an integer value in the range 0 - 0x7fffffff
507 and encodes it as a UTF-8 character in 1 to 6 bytes.
510 cvalue the character value
511 buffer pointer to buffer for result - at least 6 bytes long
513 Returns: number of bytes placed in the buffer */
/* Encode cvalue as UTF-8 into buffer. NOTE(review): several lines of this
function (return type, braces, declarations and some statements) are not
visible here; the comments below describe only the visible lines. */
517 ord2utf8(int cvalue, uschar *buffer)
/* Find the first breakpoint in utf8_table1 that cvalue does not exceed. The
resulting i is the number of additional bytes, matching the way utf8_table2 is
indexed below. */
520 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521 if (cvalue <= utf8_table1[i]) break;
/* One iteration per additional byte: each is stored as 10xxxxxx carrying the
low six bits of cvalue, through a pointer that moves backwards. */
523 for (j = i; j > 0; j--)
525 *buffer-- = 0x80 | (cvalue & 0x3f);
/* Lead byte: the length indicator bits for i additional bytes, OR-ed with
cvalue as it stands at this point. */
528 *buffer = utf8_table2[i] | cvalue;
535 /*************************************************
536 * Print compiled regex *
537 *************************************************/
539 /* The code for doing this is held in a separate file that is also included in
540 pcretest.c. It defines a function called print_internals(). */
543 #include "printint.c"
548 /*************************************************
549 * Return version string *
550 *************************************************/
/* Two-level stringification: STRING applies the # operator directly, so
XSTRING goes through it in order that its argument is macro-expanded first and
the macro's value, not its name, ends up in the string. */
552 #define STRING(a) # a
553 #define XSTRING(s) STRING(s)
/* Adjacent string literals are concatenated, giving "<major>.<minor> <date>".
NOTE(review): the enclosing function header is not visible here. */
558 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
564 /*************************************************
565 * Flip bytes in an integer *
566 *************************************************/
568 /* These functions are called when the magic number in a regex doesn't match, in
569 order to flip its bytes to see if we are dealing with a pattern that was
570 compiled on a host of different endianness. If so, they are also used to
571 flip other byte values.
574 value the number to flip
575 (the width is fixed by the function: byteflip2 flips 2 bytes, byteflip4 flips 4)
577 Returns: the flipped value */
/* Swap the two bytes of a 16-bit value: the low byte moves up by 8 bits and
the high byte moves down by 8 bits. */
581 byteflip2(pcre_uint16 value)
583 return ((value & 0x00ff) << 8) |
584 ((value & 0xff00) >> 8);
/* Reverse the byte order of a 32-bit value: byte 0 is exchanged with byte 3
and byte 1 with byte 2. */
588 byteflip4(pcre_uint32 value)
590 return ((value & 0x000000ff) << 24) |
591 ((value & 0x0000ff00) << 8) |
592 ((value & 0x00ff0000) >> 8) |
593 ((value & 0xff000000) >> 24);
596 /*************************************************
597 * Test for a byte-flipped compiled regex *
598 *************************************************/
600 /* This function is called from pcre_exec(), pcre_info() and pcre_fullinfo(). Its
601 job is to test whether the regex is byte-flipped - that is, it was compiled on
602 a system of opposite endianness. The function is called only when the native
603 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
604 relevant values into a different data block, and return it.
607 re points to the regex
608 study points to study data, or NULL
609 internal_re points to a new regex block
610 internal_study points to a new study block
612 Returns: the new block if it is indeed a byte-flipped regex, NULL if it is not */
/* Build byte-flipped copies of a compiled regex block and, optionally, its
study block. NOTE(review): the return type, braces, return statements and the
test guarding the study section are not visible here. */
617 try_flipped(const real_pcre *re, real_pcre *internal_re,
618 const pcre_study_data *study, pcre_study_data *internal_study)
/* The magic number must match once its bytes are reversed; otherwise this is
not a flipped regex. */
620 if (byteflip4(re->magic_number) != MAGIC_NUMBER)
/* Copy the whole block, then overwrite each multi-byte integer field with its
flipped value: byteflip4 for size and options, byteflip2 for the rest. */
623 *internal_re = *re; /* To copy other fields */
624 internal_re->size = byteflip4(re->size);
625 internal_re->options = byteflip4(re->options);
626 internal_re->top_bracket = byteflip2(re->top_bracket);
627 internal_re->top_backref = byteflip2(re->top_backref);
628 internal_re->first_byte = byteflip2(re->first_byte);
629 internal_re->req_byte = byteflip2(re->req_byte);
630 internal_re->name_table_offset = byteflip2(re->name_table_offset);
631 internal_re->name_entry_size = byteflip2(re->name_entry_size);
632 internal_re->name_count = byteflip2(re->name_count);
/* Same treatment for the study block. NOTE(review): callers pass study == NULL
(see pcre_info), so this part is presumably guarded by a NULL test - confirm. */
636 *internal_study = *study; /* To copy other fields */
637 internal_study->size = byteflip4(study->size);
638 internal_study->options = byteflip4(study->options);
646 /*************************************************
647 * (Obsolete) Return info about compiled pattern *
648 *************************************************/
650 /* This is the original "info" function. It picks potentially useful data out
651 of the private structure, but its interface was too rigid. It remains for
652 backwards compatibility. The public options are passed back in an int - though
653 the re->options field has been expanded to a long int, all the public options
654 are at the low end of it, and so even on 16-bit systems this will still be OK.
655 Therefore, I haven't changed the API for pcre_info().
658 argument_re points to compiled code
659 optptr where to pass back the options
660 first_byte where to pass back the first character,
661 or -1 if multiline and all branches start ^, or -2 otherwise
664 Returns: number of capturing subpatterns
665 or negative values on error */
/* Legacy info function: passes back the public options and first-byte
information, and returns re->top_bracket. NOTE(review): the return type and
braces are not visible here. */
669 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
671 real_pcre internal_re;
672 const real_pcre *re = (const real_pcre *)argument_re;
673 if (re == NULL) return PCRE_ERROR_NULL;
/* Not the native magic number: see whether it is a byte-flipped regex, in
which case re is switched to the flipped copy held in internal_re. No study
data is involved, hence the NULL arguments. */
674 if (re->magic_number != MAGIC_NUMBER)
676 re = try_flipped(re, &internal_re, NULL, NULL);
677 if (re == NULL) return PCRE_ERROR_BADMAGIC;
/* Both output pointers are optional. */
679 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
/* First byte if PCRE_FIRSTSET is set; else -1 if PCRE_STARTLINE is set; else -2. */
680 if (first_byte != NULL)
681 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
682 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
683 return re->top_bracket;
688 /*************************************************
689 * Return info about compiled pattern *
690 *************************************************/
692 /* This is a newer "info" function which has an extensible interface so
693 that additional items can be added compatibly.
696 argument_re points to compiled code
697 extra_data points to extra data, or NULL
698 what what information is required
699 where where to put the information
701 Returns: 0 if data returned, negative on error */
/* Extensible info function: stores the item selected by "what" through
"where", cast to the type that item uses. NOTE(review): the rest of the
parameter list, the switch header, several case labels, the break statements
and the final return are not visible here. */
705 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
708 real_pcre internal_re;
709 pcre_study_data internal_study;
710 const real_pcre *re = (const real_pcre *)argument_re;
711 const pcre_study_data *study = NULL;
713 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
/* Study data is used only when the extra block flags it as present. */
715 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
716 study = (const pcre_study_data *)extra_data->study_data;
/* Not the native magic number: try the byte-flipped interpretation. On
success both re and (if present) study are switched to the flipped copies held
in the local internal_re / internal_study. */
718 if (re->magic_number != MAGIC_NUMBER)
720 re = try_flipped(re, &internal_re, study, &internal_study);
721 if (re == NULL) return PCRE_ERROR_BADMAGIC;
722 if (study != NULL) study = &internal_study;
727 case PCRE_INFO_OPTIONS:
728 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
732 *((size_t *)where) = re->size;
/* The study size is reported as 0 when there is no study data. */
735 case PCRE_INFO_STUDYSIZE:
736 *((size_t *)where) = (study == NULL)? 0 : study->size;
739 case PCRE_INFO_CAPTURECOUNT:
740 *((int *)where) = re->top_bracket;
743 case PCRE_INFO_BACKREFMAX:
744 *((int *)where) = re->top_backref;
/* First byte if PCRE_FIRSTSET is set; else -1 if PCRE_STARTLINE is set; else -2. */
747 case PCRE_INFO_FIRSTBYTE:
749 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
750 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
753 /* Make sure we pass back the pointer to the bit vector in the external
754 block, not the internal copy (with flipped integer fields). */
756 case PCRE_INFO_FIRSTTABLE:
757 *((const uschar **)where) =
758 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
759 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
/* Required byte if PCRE_REQCHSET is set; otherwise -1. */
762 case PCRE_INFO_LASTLITERAL:
764 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
767 case PCRE_INFO_NAMEENTRYSIZE:
768 *((int *)where) = re->name_entry_size;
771 case PCRE_INFO_NAMECOUNT:
772 *((int *)where) = re->name_count;
/* The name table address is a byte offset from the start of the regex block. */
775 case PCRE_INFO_NAMETABLE:
776 *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
779 case PCRE_INFO_DEFAULT_TABLES:
780 *((const uschar **)where) = (const uschar *)pcre_default_tables;
783 default: return PCRE_ERROR_BADOPTION;
791 /*************************************************
792 * Return info about what features are configured *
793 *************************************************/
795 /* This is a function which has an extensible interface so that additional items
796 can be added compatibly.
799 what what information is required
800 where where to put the information
802 Returns: 0 if data returned, negative on error */
/* Report a build-time configuration item selected by "what". The visible
cases store a configuration macro through "where", cast to int (or to unsigned
int for the match limit); an unrecognised "what" yields PCRE_ERROR_BADOPTION.
NOTE(review): the bodies of the UTF8, UNICODE_PROPERTIES and STACKRECURSE
cases, the switch header, the break statements and the final return are not
visible here. */
806 pcre_config(int what, void *where)
810 case PCRE_CONFIG_UTF8:
818 case PCRE_CONFIG_UNICODE_PROPERTIES:
826 case PCRE_CONFIG_NEWLINE:
827 *((int *)where) = NEWLINE;
830 case PCRE_CONFIG_LINK_SIZE:
831 *((int *)where) = LINK_SIZE;
834 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
835 *((int *)where) = POSIX_MALLOC_THRESHOLD;
838 case PCRE_CONFIG_MATCH_LIMIT:
839 *((unsigned int *)where) = MATCH_LIMIT;
842 case PCRE_CONFIG_STACKRECURSE:
850 default: return PCRE_ERROR_BADOPTION;
859 /*************************************************
860 * Debugging function to print chars *
861 *************************************************/
863 /* Print a sequence of chars in printable format, stopping at the end of the
864 subject if so requested.
867 p points to characters
868 length number to print
869 is_subject TRUE if printing from within md->start_subject
870 md pointer to matching data block, if is_subject is TRUE */
/* Debug helper that prints characters starting at p. NOTE(review): the return
type, the declaration of c and the loop around the print statement are not
visible here. */
876 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
/* When printing from the subject, clamp length so that it does not run past
md->end_subject. md is dereferenced only when is_subject is true. */
879 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
/* Printable characters are written as themselves, others as \x plus two hex
digits. */
881 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
888 /*************************************************
890 *************************************************/
892 /* This function is called when a \ has been encountered. It either returns a
893 positive value for a simple escape such as \n, or a negative value which
894 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
895 a positive value greater than 255 may be returned. On entry, ptr is pointing at
896 the \. On exit, it is on the final character of the escape sequence.
899 ptrptr points to the pattern position pointer
900 errorptr points to the pointer to the error message
901 bracount number of previous extracting brackets
902 options the options bits
903 isclass TRUE if inside a character class
905 Returns: zero or positive => a data character
906 negative => a special escape sequence
907 on error, errorptr is set */
911 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
912 int options, BOOL isclass)
914 const uschar *ptr = *ptrptr;
917 /* If backslash is at the end of the pattern, it's an error. */
920 if (c == 0) *errorptr = ERR1;
922 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
923 a table. A non-zero result is something that can be returned immediately.
924 Otherwise further processing may be required. */
926 #if !EBCDIC /* ASCII coding */
927 else if (c < '0' || c > 'z') {} /* Not alphameric */
928 else if ((i = escapes[c - '0']) != 0) c = i;
930 #else /* EBCDIC coding */
931 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
932 else if ((i = escapes[c - 0x48]) != 0) c = i;
935 /* Escapes that need further processing, or are illegal. */
939 const uschar *oldptr;
942 /* A number of Perl escapes are not handled by PCRE. We give an explicit
953 /* The handling of escape sequences consisting of a string of digits
954 starting with one that is not zero is not straightforward. By experiment,
955 the way Perl works seems to be as follows:
957 Outside a character class, the digits are read as a decimal number. If the
958 number is less than 10, or if there are that many previous extracting
959 left brackets, then it is a back reference. Otherwise, up to three octal
960 digits are read to form an escaped byte. Thus \123 is likely to be octal
961 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
962 value is greater than 377, the least significant 8 bits are taken. Inside a
963 character class, \ followed by a digit is always an octal number. */
965 case '1': case '2': case '3': case '4': case '5':
966 case '6': case '7': case '8': case '9':
972 while ((digitab[ptr[1]] & ctype_digit) != 0)
973 c = c * 10 + *(++ptr) - '0';
974 if (c < 10 || c <= bracount)
979 ptr = oldptr; /* Put the pointer back and fall through */
982 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
983 generates a binary zero byte and treats the digit as a following literal.
984 Thus we have to pull back the pointer by one. */
986 if ((c = *ptr) >= '8')
993 /* \0 always starts an octal number, but we may drop through to here with a
994 larger first octal digit. */
998 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
999 c = c * 8 + *(++ptr) - '0';
1000 c &= 255; /* Take least significant 8 bits */
1003 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1004 which can be greater than 0xff, but only if the ddd are hex digits. */
1008 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1010 const uschar *pt = ptr + 2;
1011 register int count = 0;
1013 while ((digitab[*pt] & ctype_xdigit) != 0)
1017 #if !EBCDIC /* ASCII coding */
1018 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1019 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1020 #else /* EBCDIC coding */
1021 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1022 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1027 if (c < 0 || count > 8) *errorptr = ERR34;
1031 /* If the sequence of hex digits does not end with '}', then we don't
1032 recognize this construct; fall through to the normal \x handling. */
1036 /* Read just a single-byte hex-defined char (at most two hex digits) */
1039 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1041 int cc; /* Some compilers don't like ++ */
1042 cc = *(++ptr); /* in initializers */
1043 #if !EBCDIC /* ASCII coding */
1044 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1045 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1046 #else /* EBCDIC coding */
1047 if (cc <= 'z') cc += 64; /* Convert to upper case */
1048 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1053 /* Other special escapes not starting with a digit are straightforward */
1063 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1064 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1065 (However, an EBCDIC equivalent has now been added.) */
1067 #if !EBCDIC /* ASCII coding */
1068 if (c >= 'a' && c <= 'z') c -= 32;
1070 #else /* EBCDIC coding */
1071 if (c >= 'a' && c <= 'z') c += 64;
1076 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1077 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1078 for Perl compatibility, it is a literal. This code looks a bit odd, but
1079 there used to be some cases other than the default, and there may be again
1080 in future, so I haven't "optimized" it. */
1083 if ((options & PCRE_EXTRA) != 0) switch(c)
1100 /*************************************************
1101 * Handle \P and \p *
1102 *************************************************/
1104 /* This function is called after \P or \p has been encountered, provided that
1105 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1106 pointing at the P or p. On exit, it is pointing at the final character of the
1110 ptrptr points to the pattern position pointer
1111 negptr points to a boolean that is set TRUE for negation else FALSE
1112 errorptr points to the pointer to the error message
1114 Returns: value from ucp_type_table, or -1 for an invalid type
1118 get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
1121 const uschar *ptr = *ptrptr;
1125 if (c == 0) goto ERROR_RETURN;
1129 /* \P or \p can be followed by a one- or two-character name in {}, optionally
1130 preceded by ^ for negation. */
1139 for (i = 0; i <= 2; i++)
1142 if (c == 0) goto ERROR_RETURN;
1143 if (c == '}') break;
1146 if (c !='}') /* Try to distinguish error cases */
1148 while (*(++ptr) != 0 && *ptr != '}');
1149 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
1154 /* Otherwise there is just one following character */
1164 /* Search for a recognized property name using binary chop */
1167 top = sizeof(utt)/sizeof(ucp_type_table);
1172 c = strcmp(name, utt[i].name);
1173 if (c == 0) return utt[i].value;
1174 if (c > 0) bot = i + 1; else top = i;
1192 /*************************************************
1193 * Check for counted repeat *
1194 *************************************************/
1196 /* This function is called when a '{' is encountered in a place where it might
1197 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1198 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1199 where the ddds are digits.
1202 p pointer to the first char after '{'
1204 Returns: TRUE or FALSE
1208 is_counted_repeat(const uschar *p)
1210 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1211 while ((digitab[*p] & ctype_digit) != 0) p++;
1212 if (*p == '}') return TRUE;
1214 if (*p++ != ',') return FALSE;
1215 if (*p == '}') return TRUE;
1217 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1218 while ((digitab[*p] & ctype_digit) != 0) p++;
1225 /*************************************************
1226 * Read repeat counts *
1227 *************************************************/
1229 /* Read an item of the form {n,m} and return the values. This is called only
1230 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1231 so the syntax is guaranteed to be correct, but we need to check the values.
1234 p pointer to first char after '{'
1235 minp pointer to int for min
1236 maxp pointer to int for max
1237 returned as -1 if no max
1238 errorptr points to pointer to error message
1240 Returns: pointer to '}' on success;
1241 current ptr on error, with errorptr set
1244 static const uschar *
1245 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1250 /* Read the minimum value and do a paranoid check: a negative value indicates
1251 an integer overflow. */
1253 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1254 if (min < 0 || min > 65535)
1260 /* Read the maximum value if there is one, and again do a paranoid check on its size.
1261 Also, max must not be less than min. */
1263 if (*p == '}') max = min; else
1268 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1269 if (max < 0 || max > 65535)
1282 /* Fill in the required variables, and pass back the pointer to the terminating
1292 /*************************************************
1293 * Find first significant op code *
1294 *************************************************/
1296 /* This is called by several functions that scan a compiled expression looking
1297 for a fixed first character, or an anchoring op code etc. It skips over things
1298 that do not influence this. For some calls, a change of option is important.
1299 For some calls, it makes sense to skip negative forward and all backward
1300 assertions, and also the \b assertion; for others it does not.
1303 code pointer to the start of the group
1304 options pointer to external options
1305 optbit the option bit whose changing is significant, or
1307 skipassert TRUE if certain assertions are to be skipped
1309 Returns: pointer to the first significant opcode
1312 static const uschar*
1313 first_significant_code(const uschar *code, int *options, int optbit,
1321 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1322 *options = (int)code[1];
1328 case OP_ASSERTBACK_NOT:
1329 if (!skipassert) return code;
1330 do code += GET(code, 1); while (*code == OP_ALT);
1331 code += OP_lengths[*code];
1334 case OP_WORD_BOUNDARY:
1335 case OP_NOT_WORD_BOUNDARY:
1336 if (!skipassert) return code;
1342 code += OP_lengths[*code];
1349 /* Control never reaches here */
1355 /*************************************************
1356 * Find the fixed length of a pattern *
1357 *************************************************/
1359 /* Scan a pattern and compute the fixed length of subject that will match it,
1360 if the length is fixed. This is needed for dealing with backward assertions.
1361 In UTF8 mode, the result is in characters rather than bytes.
1364 code points to the start of the pattern (the bracket)
1365 options the compiling options
1367 Returns: the fixed length, or -1 if there is no fixed length,
1368 or -2 if \C was encountered
1372 find_fixedlength(uschar *code, int options)
1376 register int branchlength = 0;
1377 register uschar *cc = code + 1 + LINK_SIZE;
1379 /* Scan along the opcodes for this branch. If we get to the end of the
1380 branch, check the length against that of the other branches. */
1385 register int op = *cc;
1386 if (op >= OP_BRA) op = OP_BRA;
1393 d = find_fixedlength(cc, options);
1394 if (d < 0) return d;
1396 do cc += GET(cc, 1); while (*cc == OP_ALT);
1397 cc += 1 + LINK_SIZE;
1400 /* Reached end of a branch; if it's a ket it is the end of a nested
1401 call. If it's ALT it is an alternation in a nested call. If it is
1402 END it's the end of the outer call. All can be handled by the same code. */
1409 if (length < 0) length = branchlength;
1410 else if (length != branchlength) return -1;
1411 if (*cc != OP_ALT) return length;
1412 cc += 1 + LINK_SIZE;
1416 /* Skip over assertive subpatterns */
1421 case OP_ASSERTBACK_NOT:
1422 do cc += GET(cc, 1); while (*cc == OP_ALT);
1425 /* Skip over things that don't match chars */
1438 case OP_NOT_WORD_BOUNDARY:
1439 case OP_WORD_BOUNDARY:
1440 cc += OP_lengths[*cc];
1443 /* Handle literal characters */
1450 if ((options & PCRE_UTF8) != 0)
1452 while ((*cc & 0xc0) == 0x80) cc++;
1457 /* Handle exact repetitions. The count is already in characters, but we
1458 need to skip over a multibyte character in UTF8 mode. */
1461 branchlength += GET2(cc,1);
1464 if ((options & PCRE_UTF8) != 0)
1466 while((*cc & 0x80) == 0x80) cc++;
1472 branchlength += GET2(cc,1);
1476 /* Handle single-char matchers */
1485 case OP_NOT_WHITESPACE:
1487 case OP_NOT_WORDCHAR:
1494 /* The single-byte matcher isn't allowed */
1499 /* Check a class for variable quantification */
1503 cc += GET(cc, 1) - 33;
1521 if (GET2(cc,1) != GET2(cc,3)) return -1;
1522 branchlength += GET2(cc,1);
1531 /* Anything else is variable length */
1537 /* Control never gets here */
1543 /*************************************************
1544 * Scan compiled regex for numbered bracket *
1545 *************************************************/
1547 /* This little function scans through a compiled pattern until it finds a
1548 capturing bracket with the given number.
1551 code points to start of expression
1552 utf8 TRUE in UTF-8 mode
1553 number the required bracket number
1555 Returns: pointer to the opcode for the bracket, or NULL if not found
1558 static const uschar *
1559 find_bracket(const uschar *code, BOOL utf8, int number)
1561 #ifndef SUPPORT_UTF8
1562 utf8 = utf8; /* Stop pedantic compilers complaining */
1567 register int c = *code;
1568 if (c == OP_END) return NULL;
1569 else if (c > OP_BRA)
1572 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1573 if (n == number) return (uschar *)code;
1574 code += OP_lengths[OP_BRA];
1578 code += OP_lengths[c];
1582 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1583 by a multi-byte character. The length in the table is a minimum, so we have
1584 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1585 can use relatively efficient code. */
1600 while ((*code & 0xc0) == 0x80) code++;
1603 /* XCLASS is used for classes that cannot be represented just by a bit
1604 map. This includes negated single high-valued characters. The length in
1605 the table is zero; the actual length is stored in the compiled code. */
1608 code += GET(code, 1) + 1;
1618 /*************************************************
1619 * Scan compiled regex for recursion reference *
1620 *************************************************/
1622 /* This little function scans through a compiled pattern until it finds an
1623 instance of OP_RECURSE.
1626 code points to start of expression
1627 utf8 TRUE in UTF-8 mode
1629 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1632 static const uschar *
1633 find_recurse(const uschar *code, BOOL utf8)
1635 #ifndef SUPPORT_UTF8
1636 utf8 = utf8; /* Stop pedantic compilers complaining */
1641 register int c = *code;
1642 if (c == OP_END) return NULL;
1643 else if (c == OP_RECURSE) return code;
1644 else if (c > OP_BRA)
1646 code += OP_lengths[OP_BRA];
1650 code += OP_lengths[c];
1654 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1655 by a multi-byte character. The length in the table is a minimum, so we have
1656 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1657 can use relatively efficient code. */
1672 while ((*code & 0xc0) == 0x80) code++;
1675 /* XCLASS is used for classes that cannot be represented just by a bit
1676 map. This includes negated single high-valued characters. The length in
1677 the table is zero; the actual length is stored in the compiled code. */
1680 code += GET(code, 1) + 1;
1690 /*************************************************
1691 * Scan compiled branch for non-emptiness *
1692 *************************************************/
1694 /* This function scans through a branch of a compiled pattern to see whether it
1695 can match the empty string or not. It is called only from could_be_empty()
1696 below. Note that first_significant_code() skips over assertions. If we hit an
1697 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1698 whose current branch will already have been scanned.
1701 code points to start of search
1702 endcode points to where to stop
1703 utf8 TRUE if in UTF8 mode
1705 Returns: TRUE if what is matched could be empty
1709 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1712 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1714 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1716 const uschar *ccode;
1723 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1725 /* Scan a closed bracket */
1727 empty_branch = FALSE;
1730 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1731 empty_branch = TRUE;
1732 code += GET(code, 1);
1734 while (*code == OP_ALT);
1735 if (!empty_branch) return FALSE; /* All branches are non-empty */
1736 code += 1 + LINK_SIZE;
1742 /* Check for quantifiers after a class */
1746 ccode = code + GET(code, 1);
1747 goto CHECK_CLASS_REPEAT;
1760 case OP_CRSTAR: /* These could be empty; continue */
1766 default: /* Non-repeat => class must match */
1767 case OP_CRPLUS: /* These repeats aren't empty */
1773 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1778 /* Opcodes that must match a character */
1785 case OP_NOT_WHITESPACE:
1787 case OP_NOT_WORDCHAR:
1801 case OP_TYPEMINPLUS:
1813 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1814 followed by a multibyte character */
1823 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1834 /*************************************************
1835 * Scan compiled regex for non-emptiness *
1836 *************************************************/
1838 /* This function is called to check for left recursive calls. We want to check
1839 the current branch of the current pattern to see if it could match the empty
1840 string. If it could, we must look outwards for branches at other levels,
1841 stopping when we pass beyond the bracket which is the subject of the recursion.
1844 code points to start of the recursion
1845 endcode points to where to stop (current RECURSE item)
1846 bcptr points to the chain of current (unclosed) branch starts
1847 utf8 TRUE if in UTF-8 mode
1849 Returns: TRUE if what is matched could be empty
1853 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1856 while (bcptr != NULL && bcptr->current >= code)
1858 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1859 bcptr = bcptr->outer;
1866 /*************************************************
1867 * Check for POSIX class syntax *
1868 *************************************************/
1870 /* This function is called when the sequence "[:" or "[." or "[=" is
1871 encountered in a character class. It checks whether this is followed by an
1872 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1876 ptr pointer to the initial [
1877 endptr where to return the end pointer
1878 cd pointer to compile data
1880 Returns: TRUE or FALSE
1884 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1886 int terminator; /* Don't combine these lines; the Solaris cc */
1887 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1888 if (*(++ptr) == '^') ptr++;
1889 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1890 if (*ptr == terminator && ptr[1] == ']')
1901 /*************************************************
1902 * Check POSIX class name *
1903 *************************************************/
1905 /* This function is called to check the name given in a POSIX-style class entry
1909 ptr points to the first letter
1910 len the length of the name
1912 Returns: a value representing the name, or -1 if unknown
1916 check_posix_name(const uschar *ptr, int len)
1918 register int yield = 0;
1919 while (posix_name_lengths[yield] != 0)
1921 if (len == posix_name_lengths[yield] &&
1922 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1929 /*************************************************
1930 * Adjust OP_RECURSE items in repeated group *
1931 *************************************************/
1933 /* OP_RECURSE items contain an offset from the start of the regex to the group
1934 that is referenced. This means that groups can be replicated for fixed
1935 repetition simply by copying (because the recursion is allowed to refer to
1936 earlier groups that are outside the current group). However, when a group is
1937 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1938 it, after it has been compiled. This means that any OP_RECURSE items within it
1939 that refer to the group itself or any contained groups have to have their
1940 offsets adjusted. That is the job of this function. Before it is called, the
1941 partially compiled regex must be temporarily terminated with OP_END.
1944 group points to the start of the group
1945 adjust the amount by which the group is to be moved
1946 utf8 TRUE in UTF-8 mode
1947 cd contains pointers to tables etc.
1953 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1955 uschar *ptr = group;
1956 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1958 int offset = GET(ptr, 1);
1959 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1960 ptr += 1 + LINK_SIZE;
1966 /*************************************************
1967 * Insert an automatic callout point *
1968 *************************************************/
1970 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1971 callout points before each pattern item.
1974 code current code pointer
1975 ptr current pattern pointer
1976 cd pointers to tables etc
1978 Returns: new code pointer
1982 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1984 *code++ = OP_CALLOUT;
1986 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1987 PUT(code, LINK_SIZE, 0); /* Default length */
1988 return code + 2*LINK_SIZE;
1993 /*************************************************
1994 * Complete a callout item *
1995 *************************************************/
1997 /* A callout item contains the length of the next item in the pattern, which
1998 we can't fill in till after we have reached the relevant point. This is used
1999 for both automatic and manual callouts.
2002 previous_callout points to previous callout item
2003 ptr current pattern pointer
2004 cd pointers to tables etc
2010 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2012 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2013 PUT(previous_callout, 2 + LINK_SIZE, length);
2019 /*************************************************
2020 * Get othercase range *
2021 *************************************************/
2023 /* This function is passed the start and end of a class range, in UTF-8 mode
2024 with UCP support. It searches up the characters, looking for internal ranges of
2025 characters in the "other" case. Each call returns the next one, updating the
2029 cptr points to starting character value; updated
2031 ocptr where to put start of othercase range
2032 odptr where to put end of othercase range
2034 Yield: TRUE when range returned; FALSE when no more
2038 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2040 int c, chartype, othercase, next;
2042 for (c = *cptr; c <= d; c++)
2044 if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2047 if (c > d) return FALSE;
2050 next = othercase + 1;
2052 for (++c; c <= d; c++)
2054 if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2064 #endif /* SUPPORT_UCP */
2067 /*************************************************
2068 * Compile one branch *
2069 *************************************************/
2071 /* Scan the pattern, compiling it into the code vector. If the options are
2072 changed during the branch, the pointer is used to change the external options
2076 optionsptr pointer to the option bits
2077 brackets points to number of extracting brackets used
2078 codeptr points to the pointer to the current code point
2079 ptrptr points to the current pattern pointer
2080 errorptr points to pointer to error message
2081 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2082 reqbyteptr set to the last literal character required, else < 0
2083 bcptr points to current branch chain
2084 cd contains pointers to tables etc.
2086 Returns: TRUE on success
2087 FALSE, with *errorptr set on error
2091 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2092 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2093 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2095 int repeat_type, op_type;
2096 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2098 int greedy_default, greedy_non_default;
2099 int firstbyte, reqbyte;
2100 int zeroreqbyte, zerofirstbyte;
2101 int req_caseopt, reqvary, tempreqvary;
2103 int options = *optionsptr;
2104 int after_manual_callout = 0;
2106 register uschar *code = *codeptr;
2108 BOOL inescq = FALSE;
2109 BOOL groupsetfirstbyte = FALSE;
2110 const uschar *ptr = *ptrptr;
2111 const uschar *tempptr;
2112 uschar *previous = NULL;
2113 uschar *previous_callout = NULL;
2114 uschar classbits[32];
2118 BOOL utf8 = (options & PCRE_UTF8) != 0;
2119 uschar *class_utf8data;
2120 uschar utf8_char[6];
2125 /* Set up the default and non-default settings for greediness */
2127 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2128 greedy_non_default = greedy_default ^ 1;
2130 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2131 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2132 matches a non-fixed first char; reqbyte just remains unset if we never
2135 When we hit a repeat whose minimum is zero, we may have to adjust these values
2136 to take the zero repeat into account. This is implemented by setting them to
2137 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2138 item types that can be repeated set these backoff variables appropriately. */
2140 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2142 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2143 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2144 value > 255. It is added into the firstbyte or reqbyte variables to record the
2145 case status of the value. This is used only for ASCII characters. */
2147 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2149 /* Switch on next character until the end of the branch */
2154 BOOL possessive_quantifier;
2156 int class_charcount;
2166 /* Next byte in the pattern */
2170 /* If in \Q...\E, check for the end; if not, we have a literal */
2172 if (inescq && c != 0)
2174 if (c == '\\' && ptr[1] == 'E')
2182 if (previous_callout != NULL)
2184 complete_callout(previous_callout, ptr, cd);
2185 previous_callout = NULL;
2187 if ((options & PCRE_AUTO_CALLOUT) != 0)
2189 previous_callout = code;
2190 code = auto_callout(code, ptr, cd);
2196 /* Fill in length of a previous callout, except when the next thing is
2199 is_quantifier = c == '*' || c == '+' || c == '?' ||
2200 (c == '{' && is_counted_repeat(ptr+1));
2202 if (!is_quantifier && previous_callout != NULL &&
2203 after_manual_callout-- <= 0)
2205 complete_callout(previous_callout, ptr, cd);
2206 previous_callout = NULL;
2209 /* In extended mode, skip white space and comments */
2211 if ((options & PCRE_EXTENDED) != 0)
2213 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2216 /* The space before the ; is to avoid a warning on a silly compiler
2217 on the Macintosh. */
2218 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2219 if (c != 0) continue; /* Else fall through to handle end of string */
2223 /* No auto callout for quantifiers. */
2225 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2227 previous_callout = code;
2228 code = auto_callout(code, ptr, cd);
2233 /* The branch terminates at end of string, |, or ). */
2238 *firstbyteptr = firstbyte;
2239 *reqbyteptr = reqbyte;
2244 /* Handle single-character metacharacters. In multiline mode, ^ disables
2245 the setting of any following char as a first character. */
2248 if ((options & PCRE_MULTILINE) != 0)
2250 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2261 /* There can never be a first char if '.' is first, whatever happens about
2262 repeats. The value of reqbyte doesn't change either. */
2265 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2266 zerofirstbyte = firstbyte;
2267 zeroreqbyte = reqbyte;
2272 /* Character classes. If the included characters are all < 256 in value, we
2273 build a 32-byte bitmap of the permitted characters, except in the special
2274 case where there is only one such character. For negated classes, we build
2275 the map as usual, then invert it at the end. However, we use a different
2276 opcode so that data characters > 255 can be handled correctly.
2278 If the class contains characters outside the 0-255 range, a different
2279 opcode is compiled. It may optionally have a bit map for characters < 256,
2280 but those above are explicitly listed afterwards. A flag byte tells
2281 whether the bitmap is present, and whether this is a negated class or not.
2287 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2288 they are encountered at the top level, so we'll do that too. */
2290 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2291 check_posix_syntax(ptr, &tempptr, cd))
2293 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2297 /* If the first character is '^', set the negation flag and skip it. */
2299 if ((c = *(++ptr)) == '^')
2301 negate_class = TRUE;
2306 negate_class = FALSE;
2309 /* Keep a count of chars with values < 256 so that we can optimize the case
2310 of just a single character (as long as it's < 256). For higher valued UTF-8
2311 characters, we don't yet do any optimization. */
2313 class_charcount = 0;
2314 class_lastchar = -1;
2317 class_utf8 = FALSE; /* No chars >= 256 */
2318 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2321 /* Initialize the 32-char bit map to all zeros. We have to build the
2322 map in a temporary bit of store, in case the class contains only 1
2323 character (< 256), because in that case the compiled code doesn't use the
2326 memset(classbits, 0, 32 * sizeof(uschar));
2328 /* Process characters until ] is reached. By writing this as a "do" it
2329 means that an initial ] is taken as a data character. The first pass
2330 through the regex checked the overall syntax, so we don't need to be very
2331 strict here. At the start of the loop, c contains the first byte of the
2337 if (utf8 && c > 127)
2338 { /* Braces are required because the */
2339 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2343 /* Inside \Q...\E everything is literal except \E */
2347 if (c == '\\' && ptr[1] == 'E')
2353 else goto LONE_SINGLE_CHARACTER;
2356 /* Handle POSIX class names. Perl allows a negation extension of the
2357 form [:^name:]. A square bracket that doesn't match the syntax is
2358 treated as a literal. We also recognize the POSIX constructions
2359 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2363 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2364 check_posix_syntax(ptr, &tempptr, cd))
2366 BOOL local_negate = FALSE;
2368 register const uschar *cbits = cd->cbits;
2379 local_negate = TRUE;
2383 posix_class = check_posix_name(ptr, tempptr - ptr);
2384 if (posix_class < 0)
2390 /* If matching is caseless, upper and lower are converted to
2391 alpha. This relies on the fact that the class table starts with
2392 alpha, lower, upper as the first 3 entries. */
2394 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2397 /* Or into the map we are building up to 3 of the static class
2398 tables, or their negations. The [:blank:] class sets up the same
2399 chars as the [:space:] class (all white space). We remove the vertical
2400 white space chars afterwards. */
2403 for (i = 0; i < 3; i++)
2405 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2406 int taboffset = posix_class_maps[posix_class + i];
2407 if (taboffset < 0) break;
2411 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2413 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2414 if (blankclass) classbits[1] |= 0x3c;
2418 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2419 if (blankclass) classbits[1] &= ~0x3c;
2424 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2425 continue; /* End of POSIX syntax handling */
2428 /* Backslash may introduce a single character, or it may introduce one
2429 of the specials, which just set a flag. Escaped items are checked for
2430 validity in the pre-compiling pass. The sequence \b is a special case.
2431 Inside a class (and only there) it is treated as backspace. Elsewhere
2432 it marks a word boundary. Other escapes have preset maps ready to
2433 or into the one we are building. We assume they have more than one
2434 character in them, so set class_charcount bigger than one. */
2438 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2440 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2441 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2442 else if (-c == ESC_Q) /* Handle start of quoted string */
2444 if (ptr[1] == '\\' && ptr[2] == 'E')
2446 ptr += 2; /* avoid empty string */
2454 register const uschar *cbits = cd->cbits;
2455 class_charcount += 2; /* Greater than 1 is what matters */
2459 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2463 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2467 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2471 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2475 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2476 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2480 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2481 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2489 int property = get_ucp(&ptr, &negated, errorptr);
2490 if (property < 0) goto FAILED;
2492 *class_utf8data++ = ((-c == ESC_p) != negated)?
2493 XCL_PROP : XCL_NOTPROP;
2494 *class_utf8data++ = property;
2495 class_charcount -= 2; /* Not a < 256 character */
2500 /* Unrecognized escapes are faulted if PCRE is running in its
2501 strict mode. By default, for compatibility with Perl, they are
2502 treated as literals. */
2505 if ((options & PCRE_EXTRA) != 0)
2510 c = *ptr; /* The final character */
2511 class_charcount -= 2; /* Undo the default count from above */
2515 /* Fall through if we have a single character (c >= 0). This may be
2516 > 255 in UTF-8 mode. */
2518 } /* End of backslash handling */
2520 /* A single character may be followed by '-' to form a range. However,
2521 Perl does not permit ']' to be the end of the range. A '-' character
2522 here is treated as a literal. */
2524 if (ptr[1] == '-' && ptr[2] != ']')
2531 { /* Braces are required because the */
2532 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2536 d = *ptr; /* Not UTF-8 mode */
2538 /* The second part of a range can be a single-character escape, but
2539 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2540 in such circumstances. */
2544 const uschar *oldptr = ptr;
2545 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2547 /* \b is backspace; \X is literal X; any other special means the '-'
2552 if (d == -ESC_b) d = '\b';
2553 else if (d == -ESC_X) d = 'X'; else
2556 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2561 /* The check that the two values are in the correct order happens in
2562 the pre-pass. Optimize one-character ranges */
2564 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2566 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2567 matching, we have to use an XCLASS with extra data items. Caseless
2568 matching for characters > 127 is available only if UCP support is
2572 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2576 /* With UCP support, we can find the other case equivalents of
2577 the relevant characters. There may be several ranges. Optimize how
2578 they fit with the basic range. */
2581 if ((options & PCRE_CASELESS) != 0)
2586 while (get_othercase_range(&cc, origd, &occ, &ocd))
2588 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2590 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2591 { /* if there is overlap, */
2592 c = occ; /* noting that if occ < c */
2593 continue; /* we can't have ocd > d */
2594 } /* because a subrange is */
2595 if (ocd > d && occ <= d + 1) /* always shorter than */
2596 { /* the basic range. */
2603 *class_utf8data++ = XCL_SINGLE;
2607 *class_utf8data++ = XCL_RANGE;
2608 class_utf8data += ord2utf8(occ, class_utf8data);
2610 class_utf8data += ord2utf8(ocd, class_utf8data);
2613 #endif /* SUPPORT_UCP */
2615 /* Now record the original range, possibly modified for UCP caseless
2616 overlapping ranges. */
2618 *class_utf8data++ = XCL_RANGE;
2619 class_utf8data += ord2utf8(c, class_utf8data);
2620 class_utf8data += ord2utf8(d, class_utf8data);
2622 /* With UCP support, we are done. Without UCP support, there is no
2623 caseless matching for UTF-8 characters > 127; we can use the bit map
2624 for the smaller ones. */
2627 continue; /* With next character in the class */
2629 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2631 /* Adjust upper limit and fall through to set up the map */
2635 #endif /* SUPPORT_UCP */
2637 #endif /* SUPPORT_UTF8 */
2639 /* We use the bit map for all cases when not in UTF-8 mode; else
2640 ranges that lie entirely within 0-127 when there is UCP support; else
2641 for partial ranges without UCP support. */
2645 classbits[c/8] |= (1 << (c&7));
2646 if ((options & PCRE_CASELESS) != 0)
2648 int uc = cd->fcc[c]; /* flip case */
2649 classbits[uc/8] |= (1 << (uc&7));
2651 class_charcount++; /* in case a one-char range */
2655 continue; /* Go get the next char in the class */
2658 /* Handle a lone single character - we can get here for a normal
2659 non-escape char, or after \ that introduces a single character or for an
2660 apparent range that isn't. */
2662 LONE_SINGLE_CHARACTER:
2664 /* Handle a character that cannot go in the bit map */
2667 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2670 *class_utf8data++ = XCL_SINGLE;
2671 class_utf8data += ord2utf8(c, class_utf8data);
2674 if ((options & PCRE_CASELESS) != 0)
2678 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2680 *class_utf8data++ = XCL_SINGLE;
2681 class_utf8data += ord2utf8(othercase, class_utf8data);
2684 #endif /* SUPPORT_UCP */
2688 #endif /* SUPPORT_UTF8 */
2690 /* Handle a single-byte character */
2692 classbits[c/8] |= (1 << (c&7));
2693 if ((options & PCRE_CASELESS) != 0)
2695 c = cd->fcc[c]; /* flip case */
2696 classbits[c/8] |= (1 << (c&7));
2703 /* Loop until ']' reached; the check for end of string happens inside the
2704 loop. This "while" is the end of the "do" above. */
2706 while ((c = *(++ptr)) != ']' || inescq);
2708 /* If class_charcount is 1, we saw precisely one character whose value is
2709 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2710 can optimize the negative case only if there were no characters >= 128
2711 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2712 single-bytes only. This is an historical hangover. Maybe one day we can
2713 tidy these opcodes to handle multi-byte characters.
2715 The optimization throws away the bit map. We turn the item into a
2716 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2717 that OP_NOT does not support multibyte characters. In the positive case, it
2718 can cause firstbyte to be set. Otherwise, there can be no first char if
2719 this item is first, whatever repeat count may follow. In the case of
2720 reqbyte, save the previous value for reinstating. */
2723 if (class_charcount == 1 &&
2725 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2728 if (class_charcount == 1)
2731 zeroreqbyte = reqbyte;
2733 /* The OP_NOT opcode works on one-byte characters only. */
2737 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2738 zerofirstbyte = firstbyte;
2740 *code++ = class_lastchar;
2744 /* For a single, positive character, get the value into mcbuffer, and
2745 then we can handle this with the normal one-character code. */
2748 if (utf8 && class_lastchar > 127)
2749 mclength = ord2utf8(class_lastchar, mcbuffer);
2753 mcbuffer[0] = class_lastchar;
2757 } /* End of 1-char optimization */
2759 /* The general case - not the one-char optimization. If this is the first
2760 thing in the branch, there can be no first char setting, whatever the
2761 repeat count. Any reqbyte setting must remain unchanged after any kind of
2764 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2765 zerofirstbyte = firstbyte;
2766 zeroreqbyte = reqbyte;
2768 /* If there are characters with values > 255, we have to compile an
2769 extended class, with its own opcode. If there are no characters < 256,
2770 we can omit the bitmap. */
2775 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2776 *code++ = OP_XCLASS;
2778 *code = negate_class? XCL_NOT : 0;
2780 /* If the map is required, install it, and move on to the end of
2783 if (class_charcount > 0)
2786 memcpy(code, classbits, 32);
2787 code = class_utf8data;
2790 /* If the map is not required, slide down the extra data. */
2794 int len = class_utf8data - (code + 33);
2795 memmove(code + 1, code + 33, len);
2799 /* Now fill in the complete length of the item */
2801 PUT(previous, 1, code - previous);
2802 break; /* End of class handling */
2806 /* If there are no characters > 255, negate the 32-byte map if necessary,
2807 and copy it into the code vector. If this is the first thing in the branch,
2808 there can be no first char setting, whatever the repeat count. Any reqbyte
2809 setting must remain unchanged after any kind of repeat. */
2813 *code++ = OP_NCLASS;
2814 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2819 memcpy(code, classbits, 32);
2824 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2825 has been tested above. */
2828 if (!is_quantifier) goto NORMAL_CHAR;
2829 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2830 if (*errorptr != NULL) goto FAILED;
2848 if (previous == NULL)
2854 if (repeat_min == 0)
2856 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2857 reqbyte = zeroreqbyte; /* Ditto */
2860 /* Remember whether this is a variable length repeat */
2862 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2864 op_type = 0; /* Default single-char op codes */
2865 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2867 /* Save start of previous item, in case we have to move it up to make space
2868 for an inserted OP_ONCE for the additional '+' extension. */
2870 tempcode = previous;
2872 /* If the next character is '+', we have a possessive quantifier. This
2873 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2874 If the next character is '?' this is a minimizing repeat, by default,
2875 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2876 repeat type to the non-default. */
2880 repeat_type = 0; /* Force greedy */
2881 possessive_quantifier = TRUE;
2884 else if (ptr[1] == '?')
2886 repeat_type = greedy_non_default;
2889 else repeat_type = greedy_default;
2891 /* If previous was a recursion, we need to wrap it inside brackets so that
2892 it can be replicated if necessary. */
2894 if (*previous == OP_RECURSE)
2896 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2897 code += 1 + LINK_SIZE;
2899 PUT(previous, 1, code - previous);
2901 PUT(code, 1, code - previous);
2902 code += 1 + LINK_SIZE;
2905 /* If previous was a character match, abolish the item and generate a
2906 repeat item instead. If a char item has a minimum of more than one, ensure
2907 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2908 the first thing in a branch because the x will have gone into firstbyte
2911 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2913 /* Deal with UTF-8 characters that take up more than one byte. It's
2914 easier to write this out separately than try to macrify it. Use c to
2915 hold the length of the character in bytes, plus 0x80 to flag that it's a
2916 length rather than a small character. */
2919 if (utf8 && (code[-1] & 0x80) != 0)
2921 uschar *lastchar = code - 1;
2922 while((*lastchar & 0xc0) == 0x80) lastchar--;
2923 c = code - lastchar; /* Length of UTF-8 character */
2924 memcpy(utf8_char, lastchar, c); /* Save the char */
2925 c |= 0x80; /* Flag c as a length */
2930 /* Handle the case of a single byte - either with no UTF8 support, or
2931 with UTF-8 disabled, or for a UTF-8 character < 128. */
2935 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2938 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2941 /* If previous was a single negated character ([^a] or similar), we use
2942 one of the special opcodes, replacing it. The code is shared with single-
2943 character repeats by setting op_type to add a suitable offset into
2944 repeat_type. OP_NOT is currently used only for single-byte chars. */
2946 else if (*previous == OP_NOT)
2948 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2950 goto OUTPUT_SINGLE_REPEAT;
2953 /* If previous was a character type match (\d or similar), abolish it and
2954 create a suitable repeat item. The code is shared with single-character
2955 repeats by setting op_type to add a suitable offset into repeat_type. Note
2956 that the Unicode property types will be present only when SUPPORT_UCP is
2957 defined, but we don't wrap the little bits of code here because it just
2958 makes it horribly messy. */
2960 else if (*previous < OP_EODN)
2964 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2967 OUTPUT_SINGLE_REPEAT:
2968 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2972 code = previous; /* Usually overwrite previous item */
2974 /* If the maximum is zero then the minimum must also be zero; Perl allows
2975 this case, so we do too - by simply omitting the item altogether. */
2977 if (repeat_max == 0) goto END_REPEAT;
2979 /* All real repeats make it impossible to handle partial matching (maybe
2980 one day we will be able to remove this restriction). */
2982 if (repeat_max != 1) cd->nopartial = TRUE;
2984 /* Combine the op_type with the repeat_type */
2986 repeat_type += op_type;
2988 /* A minimum of zero is handled either as the special case * or ?, or as
2989 an UPTO, with the maximum given. */
2991 if (repeat_min == 0)
2993 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2994 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2997 *code++ = OP_UPTO + repeat_type;
2998 PUT2INC(code, 0, repeat_max);
3002 /* A repeat minimum of 1 is optimized into some special cases. If the
3003 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3004 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3005 one less than the maximum. */
3007 else if (repeat_min == 1)
3009 if (repeat_max == -1)
3010 *code++ = OP_PLUS + repeat_type;
3013 code = oldcode; /* leave previous item in place */
3014 if (repeat_max == 1) goto END_REPEAT;
3015 *code++ = OP_UPTO + repeat_type;
3016 PUT2INC(code, 0, repeat_max - 1);
3020 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3021 handled as an EXACT followed by an UPTO. */
3025 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3026 PUT2INC(code, 0, repeat_min);
3028 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3029 we have to insert the character for the previous code. For a repeated
3030 Unicode property match, there is an extra byte that defines the
3031 required property. In UTF-8 mode, long characters have their length in
3032 c, with the 0x80 bit as a flag. */
3037 if (utf8 && c >= 128)
3039 memcpy(code, utf8_char, c & 7);
3046 if (prop_type >= 0) *code++ = prop_type;
3048 *code++ = OP_STAR + repeat_type;
3051 /* Else insert an UPTO if the max is greater than the min, again
3052 preceded by the character, for the previously inserted code. */
3054 else if (repeat_max != repeat_min)
3057 if (utf8 && c >= 128)
3059 memcpy(code, utf8_char, c & 7);
3065 if (prop_type >= 0) *code++ = prop_type;
3066 repeat_max -= repeat_min;
3067 *code++ = OP_UPTO + repeat_type;
3068 PUT2INC(code, 0, repeat_max);
3072 /* The character or character type itself comes last in all cases. */
3075 if (utf8 && c >= 128)
3077 memcpy(code, utf8_char, c & 7);
3084 /* For a repeated Unicode property match, there is an extra byte that
3085 defines the required property. */
3088 if (prop_type >= 0) *code++ = prop_type;
3092 /* If previous was a character class or a back reference, we put the repeat
3093 stuff after it, but just skip the item if the repeat was {0,0}. */
3095 else if (*previous == OP_CLASS ||
3096 *previous == OP_NCLASS ||
3098 *previous == OP_XCLASS ||
3100 *previous == OP_REF)
3102 if (repeat_max == 0)
3108 /* All real repeats make it impossible to handle partial matching (maybe
3109 one day we will be able to remove this restriction). */
3111 if (repeat_max != 1) cd->nopartial = TRUE;
3113 if (repeat_min == 0 && repeat_max == -1)
3114 *code++ = OP_CRSTAR + repeat_type;
3115 else if (repeat_min == 1 && repeat_max == -1)
3116 *code++ = OP_CRPLUS + repeat_type;
3117 else if (repeat_min == 0 && repeat_max == 1)
3118 *code++ = OP_CRQUERY + repeat_type;
3121 *code++ = OP_CRRANGE + repeat_type;
3122 PUT2INC(code, 0, repeat_min);
3123 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3124 PUT2INC(code, 0, repeat_max);
3128 /* If previous was a bracket group, we may have to replicate it in certain
3131 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3132 *previous == OP_COND)
3136 int len = code - previous;
3137 uschar *bralink = NULL;
3139 /* If the maximum repeat count is unlimited, find the end of the bracket
3140 by scanning through from the start, and compute the offset back to it
3141 from the current code pointer. There may be an OP_OPT setting following
3142 the final KET, so we can't find the end just by going back from the code
3145 if (repeat_max == -1)
3147 register uschar *ket = previous;
3148 do ket += GET(ket, 1); while (*ket != OP_KET);
3149 ketoffset = code - ket;
3152 /* The case of a zero minimum is special because of the need to stick
3153 OP_BRAZERO in front of it, and because the group appears once in the
3154 data, whereas in other cases it appears the minimum number of times. For
3155 this reason, it is simplest to treat this case separately, as otherwise
3156 the code gets far too messy. There are several special subcases when the
3159 if (repeat_min == 0)
3161 /* If the maximum is also zero, we just omit the group from the output
3164 if (repeat_max == 0)
3170 /* If the maximum is 1 or unlimited, we just have to stick in the
3171 BRAZERO and do no more at this point. However, we do need to adjust
3172 any OP_RECURSE calls inside the group that refer to the group itself or
3173 any internal group, because the offset is from the start of the whole
3174 regex. Temporarily terminate the pattern while doing this. */
3176 if (repeat_max <= 1)
3179 adjust_recurse(previous, 1, utf8, cd);
3180 memmove(previous+1, previous, len);
3182 *previous++ = OP_BRAZERO + repeat_type;
3185 /* If the maximum is greater than 1 and limited, we have to replicate
3186 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3187 The first one has to be handled carefully because it's the original
3188 copy, which has to be moved up. The remainder can be handled by code
3189 that is common with the non-zero minimum case below. We have to
3190 adjust the value of repeat_max, since one less copy is required. Once
3191 again, we may have to adjust any OP_RECURSE calls inside the group. */
3197 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3198 memmove(previous + 2 + LINK_SIZE, previous, len);
3199 code += 2 + LINK_SIZE;
3200 *previous++ = OP_BRAZERO + repeat_type;
3201 *previous++ = OP_BRA;
3203 /* We chain together the bracket offset fields that have to be
3204 filled in later when the ends of the brackets are reached. */
3206 offset = (bralink == NULL)? 0 : previous - bralink;
3208 PUTINC(previous, 0, offset);
3214 /* If the minimum is greater than zero, replicate the group as many
3215 times as necessary, and adjust the maximum to the number of subsequent
3216 copies that we need. If we set a first char from the group, and didn't
3217 set a required char, copy the latter from the former. */
3223 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3224 for (i = 1; i < repeat_min; i++)
3226 memcpy(code, previous, len);
3230 if (repeat_max > 0) repeat_max -= repeat_min;
3233 /* This code is common to both the zero and non-zero minimum cases. If
3234 the maximum is limited, it replicates the group in a nested fashion,
3235 remembering the bracket starts on a stack. In the case of a zero minimum,
3236 the first one was set up above. In all cases the repeat_max now specifies
3237 the number of additional copies needed. */
3239 if (repeat_max >= 0)
3241 for (i = repeat_max - 1; i >= 0; i--)
3243 *code++ = OP_BRAZERO + repeat_type;
3245 /* All but the final copy start a new nesting, maintaining the
3246 chain of brackets outstanding. */
3252 offset = (bralink == NULL)? 0 : code - bralink;
3254 PUTINC(code, 0, offset);
3257 memcpy(code, previous, len);
3261 /* Now chain through the pending brackets, and fill in their length
3262 fields (which are holding the chain links pro tem). */
3264 while (bralink != NULL)
3267 int offset = code - bralink + 1;
3268 uschar *bra = code - offset;
3269 oldlinkoffset = GET(bra, 1);
3270 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3272 PUTINC(code, 0, offset);
3273 PUT(bra, 1, offset);
3277 /* If the maximum is unlimited, set a repeater in the final copy. We
3278 can't just offset backwards from the current code point, because we
3279 don't know if there's been an options resetting after the ket. The
3280 correct offset was computed above. */
3282 else code[-ketoffset] = OP_KETRMAX + repeat_type;
3285 /* Else there's some kind of shambles */
3293 /* If the character following a repeat is '+', we wrap the entire repeated
3294 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3295 Sun's Java package. The repeated item starts at tempcode, not at previous,
3296 which might be the first part of a string whose (former) last char we
3297 repeated. However, we don't support '+' after a greediness '?'. */
3299 if (possessive_quantifier)
3301 int len = code - tempcode;
3302 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3303 code += 1 + LINK_SIZE;
3304 len += 1 + LINK_SIZE;
3305 tempcode[0] = OP_ONCE;
3307 PUTINC(code, 0, len);
3308 PUT(tempcode, 1, len);
3311 /* In all cases we no longer have a previous item. We also set the
3312 "follows varying string" flag for subsequently encountered reqbytes if
3313 it isn't already set and we have just passed a varying length item. */
3317 cd->req_varyopt |= reqvary;
3321 /* Start of nested bracket sub-expression, or comment or lookahead or
3322 lookbehind or option setting or condition. First deal with special things
3323 that can come after a bracket; all are introduced by ?, and the appearance
3324 of any of them means that this is not a referencing group. They were
3325 checked for validity in the first pass over the string, so we don't have to
3326 check for syntax errors here. */
3329 newoptions = options;
3332 if (*(++ptr) == '?')
3339 case '#': /* Comment; skip to ket */
3341 while (*ptr != ')') ptr++;
3344 case ':': /* Non-extracting bracket */
3350 bravalue = OP_COND; /* Conditional group */
3352 /* Condition to test for recursion */
3356 code[1+LINK_SIZE] = OP_CREF;
3357 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3362 /* Condition to test for a numbered subpattern match. We know that
3363 if a digit follows ( then there will just be digits until ) because
3364 the syntax was checked in the first pass. */
3366 else if ((digitab[ptr[1]] && ctype_digit) != 0)
3368 int condref; /* Don't amalgamate; some compilers */
3369 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3370 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3377 code[1+LINK_SIZE] = OP_CREF;
3378 PUT2(code, 2+LINK_SIZE, condref);
3381 /* For conditions that are assertions, we just fall through, having
3382 set bravalue above. */
3385 case '=': /* Positive lookahead */
3386 bravalue = OP_ASSERT;
3390 case '!': /* Negative lookahead */
3391 bravalue = OP_ASSERT_NOT;
3395 case '<': /* Lookbehinds */
3398 case '=': /* Positive lookbehind */
3399 bravalue = OP_ASSERTBACK;
3403 case '!': /* Negative lookbehind */
3404 bravalue = OP_ASSERTBACK_NOT;
3410 case '>': /* One-time brackets */
3415 case 'C': /* Callout - may be followed by digits; */
3416 previous_callout = code; /* Save for later completion */
3417 after_manual_callout = 1; /* Skip one item before completing */
3418 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3419 { /* closing parenthesis is present. */
3421 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3422 n = n * 10 + *ptr - '0';
3429 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3430 PUT(code, LINK_SIZE, 0); /* Default length */
3431 code += 2 * LINK_SIZE;
3436 case 'P': /* Named subpattern handling */
3437 if (*(++ptr) == '<') /* Definition */
3440 uschar *slot = cd->name_table;
3441 const uschar *name; /* Don't amalgamate; some compilers */
3442 name = ++ptr; /* grumble at autoincrement in declaration */
3444 while (*ptr++ != '>');
3445 namelen = ptr - name - 1;
3447 for (i = 0; i < cd->names_found; i++)
3449 int crc = memcmp(name, slot+2, namelen);
3452 if (slot[2+namelen] == 0)
3457 crc = -1; /* Current name is substring */
3461 memmove(slot + cd->name_entry_size, slot,
3462 (cd->names_found - i) * cd->name_entry_size);
3465 slot += cd->name_entry_size;
3468 PUT2(slot, 0, *brackets + 1);
3469 memcpy(slot + 2, name, namelen);
3470 slot[2+namelen] = 0;
3472 goto NUMBERED_GROUP;
3475 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3479 const uschar *name = ptr;
3480 uschar *slot = cd->name_table;
3482 while (*ptr != ')') ptr++;
3483 namelen = ptr - name;
3485 for (i = 0; i < cd->names_found; i++)
3487 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3488 slot += cd->name_entry_size;
3490 if (i >= cd->names_found)
3496 recno = GET2(slot, 0);
3498 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3500 /* Back reference */
3504 PUT2INC(code, 0, recno);
3505 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3506 if (recno > cd->top_backref) cd->top_backref = recno;
3510 /* Should never happen */
3513 case 'R': /* Pattern recursion */
3514 ptr++; /* Same as (?0) */
3517 /* Recursion or "subroutine" call */
3519 case '0': case '1': case '2': case '3': case '4':
3520 case '5': case '6': case '7': case '8': case '9':
3522 const uschar *called;
3524 while((digitab[*ptr] & ctype_digit) != 0)
3525 recno = recno * 10 + *ptr++ - '0';
3527 /* Come here from code above that handles a named recursion */
3533 /* Find the bracket that is being referenced. Temporarily end the
3534 regex in case it doesn't exist. */
3537 called = (recno == 0)?
3538 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3546 /* If the subpattern is still open, this is a recursive call. We
3547 check to see if this is a left recursion that could loop for ever,
3548 and diagnose that case. */
3550 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3556 /* Insert the recursion/subroutine item */
3559 PUT(code, 1, called - cd->start_code);
3560 code += 1 + LINK_SIZE;
3564 /* Character after (? not specially recognized */
3566 default: /* Option setting */
3570 while (*ptr != ')' && *ptr != ':')
3574 case '-': optset = &unset; break;
3576 case 'i': *optset |= PCRE_CASELESS; break;
3577 case 'm': *optset |= PCRE_MULTILINE; break;
3578 case 's': *optset |= PCRE_DOTALL; break;
3579 case 'x': *optset |= PCRE_EXTENDED; break;
3580 case 'U': *optset |= PCRE_UNGREEDY; break;
3581 case 'X': *optset |= PCRE_EXTRA; break;
3585 /* Set up the changed option bits, but don't change anything yet. */
3587 newoptions = (options | set) & (~unset);
3589 /* If the options ended with ')' this is not the start of a nested
3590 group with option changes, so the options change at this level. Compile
3591 code to change the ims options if this setting actually changes any of
3592 them. We also pass the new setting back so that it can be put at the
3593 start of any following branches, and when this group ends (if we are in
3594 a group), a resetting item can be compiled.
3596 Note that if this item is right at the start of the pattern, the
3597 options will have been abstracted and made global, so there will be no
3598 change to compile. */
3602 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3605 *code++ = newoptions & PCRE_IMS;
3608 /* Change options at this level, and pass them back for use
3609 in subsequent branches. Reset the greedy defaults and the case
3610 value for firstbyte and reqbyte. */
3612 *optionsptr = options = newoptions;
3613 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3614 greedy_non_default = greedy_default ^ 1;
3615 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3617 previous = NULL; /* This item can't be repeated */
3618 continue; /* It is complete */
3621 /* If the options ended with ':' we are heading into a nested group
3622 with possible change of options. Such groups are non-capturing and are
3623 not assertions of any kind. All we need to do is skip over the ':';
3624 the newoptions value is handled below. */
3631 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3632 non-capturing and behave like (?:...) brackets */
3634 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3639 /* Else we have a referencing group; adjust the opcode. If the bracket
3640 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3641 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3646 if (++(*brackets) > EXTRACT_BASIC_MAX)
3648 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3649 code[1+LINK_SIZE] = OP_BRANUMBER;
3650 PUT2(code, 2+LINK_SIZE, *brackets);
3653 else bravalue = OP_BRA + *brackets;
3656 /* Process nested bracketed re. Assertions may not be repeated, but other
3657 kinds can be. We copy code into a non-register variable in order to be able
3658 to pass its address because some compilers complain otherwise. Pass in a
3659 new setting for the ims options if they have changed. */
3661 previous = (bravalue >= OP_ONCE)? code : NULL;
3664 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3667 newoptions, /* The complete new option state */
3668 options & PCRE_IMS, /* The previous ims option state */
3669 brackets, /* Extracting bracket count */
3670 &tempcode, /* Where to put code (updated) */
3671 &ptr, /* Input pointer (updated) */
3672 errorptr, /* Where to put an error message */
3673 (bravalue == OP_ASSERTBACK ||
3674 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3675 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3676 &subfirstbyte, /* For possible first char */
3677 &subreqbyte, /* For possible last char */
3678 bcptr, /* Current branch chain */
3679 cd)) /* Tables block */
3682 /* At the end of compiling, code is still pointing to the start of the
3683 group, while tempcode has been updated to point past the end of the group
3684 and any option resetting that may follow it. The pattern pointer (ptr)
3685 is on the bracket. */
3687 /* If this is a conditional bracket, check that there are no more than
3688 two branches in the group. */
3690 else if (bravalue == OP_COND)
3699 while (*tc != OP_KET);
3707 /* If there is just one branch, we must not make use of its firstbyte or
3708 reqbyte, because this is equivalent to an empty second branch. */
3710 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3713 /* Handle updating of the required and first characters. Update for normal
3714 brackets of all kinds, and conditions with two branches (see code above).
3715 If the bracket is followed by a quantifier with zero repeat, we have to
3716 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3717 main loop so that they can be accessed for the back off. */
3719 zeroreqbyte = reqbyte;
3720 zerofirstbyte = firstbyte;
3721 groupsetfirstbyte = FALSE;
3723 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3725 /* If we have not yet set a firstbyte in this branch, take it from the
3726 subpattern, remembering that it was set here so that a repeat of more
3727 than one can replicate it as reqbyte if necessary. If the subpattern has
3728 no firstbyte, set "none" for the whole branch. In both cases, a zero
3729 repeat forces firstbyte to "none". */
3731 if (firstbyte == REQ_UNSET)
3733 if (subfirstbyte >= 0)
3735 firstbyte = subfirstbyte;
3736 groupsetfirstbyte = TRUE;
3738 else firstbyte = REQ_NONE;
3739 zerofirstbyte = REQ_NONE;
3742 /* If firstbyte was previously set, convert the subpattern's firstbyte
3743 into reqbyte if there wasn't one, using the vary flag that was in
3744 existence beforehand. */
3746 else if (subfirstbyte >= 0 && subreqbyte < 0)
3747 subreqbyte = subfirstbyte | tempreqvary;
3749 /* If the subpattern set a required byte (or set a first byte that isn't
3750 really the first byte - see above), set it. */
3752 if (subreqbyte >= 0) reqbyte = subreqbyte;
3755 /* For a forward assertion, we take the reqbyte, if set. This can be
3756 helpful if the pattern that follows the assertion doesn't set a different
3757 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3758 for an assertion, however, because it leads to incorrect effects for patterns
3759 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3760 of a firstbyte. This is overcome by a scan at the end if there's no
3761 firstbyte, looking for an asserted first char. */
3763 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3765 /* Now update the main code pointer to the end of the group. */
3769 /* Error if hit end of pattern */
3778 /* Check \ for being a real metacharacter; if not, fall through and handle
3779 it as a data character at the start of a string. Escape items are checked
3780 for validity in the pre-compiling pass. */
3784 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3786 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3787 are arranged to be the negation of the corresponding OP_values. For the
3788 back references, the values are ESC_REF plus the reference number. Only
3789 back references and those types that consume a character may be repeated.
3790 We can test for values between ESC_b and ESC_Z for the latter; this may
3791 have to change if any new ones are ever created. */
3795 if (-c == ESC_Q) /* Handle start of quoted string */
3797 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3802 /* For metasequences that actually match a character, we disable the
3803 setting of a first character if it hasn't already been set. */
3805 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3806 firstbyte = REQ_NONE;
3808 /* Set values to reset to if this is followed by a zero repeat. */
3810 zerofirstbyte = firstbyte;
3811 zeroreqbyte = reqbyte;
3813 /* Back references are handled specially */
3817 int number = -c - ESC_REF;
3820 PUT2INC(code, 0, number);
3823 /* So are Unicode property matches, if supported. We know that get_ucp
3824 won't fail because it was tested in the pre-pass. */
3827 else if (-c == ESC_P || -c == ESC_p)
3830 int value = get_ucp(&ptr, &negated, errorptr);
3832 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3837 /* For the rest, we can obtain the OP value by negating the escape
3842 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3848 /* We have a data character whose value is in c. In UTF-8 mode it may have
3849 a value > 127. We set its representation in the length/buffer, and then
3850 handle it as a data character. */
3853 if (utf8 && c > 127)
3854 mclength = ord2utf8(c, mcbuffer);
3865 /* Handle a literal character. It is guaranteed not to be whitespace or #
3866 when the extended flag is set. If we are in UTF-8 mode, it may be a
3867 multi-byte literal character. */
3875 if (utf8 && (c & 0xc0) == 0xc0)
3877 while ((ptr[1] & 0xc0) == 0x80)
3878 mcbuffer[mclength++] = *(++ptr);
3882 /* At this point we have the character's bytes in mcbuffer, and the length
3883 in mclength. When not in UTF-8 mode, the length is always 1. */
3887 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3888 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3890 /* Set the first and required bytes appropriately. If no previous first
3891 byte, set it from this character, but revert to none on a zero repeat.
3892 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3895 if (firstbyte == REQ_UNSET)
3897 zerofirstbyte = REQ_NONE;
3898 zeroreqbyte = reqbyte;
3900 /* If the character is more than one byte long, we can set firstbyte
3901 only if it is not to be matched caselessly. */
3903 if (mclength == 1 || req_caseopt == 0)
3905 firstbyte = mcbuffer[0] | req_caseopt;
3906 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3908 else firstbyte = reqbyte = REQ_NONE;
3911 /* firstbyte was previously set; we can set reqbyte only if the length is
3912 1 or the matching is caseful. */
3916 zerofirstbyte = firstbyte;
3917 zeroreqbyte = reqbyte;
3918 if (mclength == 1 || req_caseopt == 0)
3919 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3922 break; /* End of literal character handling */
3924 } /* end of big loop */
3926 /* Control never reaches here by falling through, only by a goto for all the
3927 error states. Pass back the position in the pattern so that it can be displayed
3928 to the user for diagnosing the error. */
3938 /*************************************************
3939 * Compile sequence of alternatives *
3940 *************************************************/
3942 /* On entry, ptr is pointing past the bracket character, but on return
3943 it points to the closing bracket, or vertical bar, or end of string.
3944 The code variable is pointing at the byte into which the BRA operator has been
3945 stored. If the ims options are changed at the start (for a (?ims: group) or
3946 during any branch, we need to insert an OP_OPT item at the start of every
3947 following branch to ensure they get set correctly at run time, and also pass
3948 the new options into every subsequent branch compile.
3951 options option bits, including any changes for this subpattern
3952 oldims previous settings of ims option bits
3953 brackets -> int containing the number of extracting brackets used
3954 codeptr -> the address of the current code pointer
3955 ptrptr -> the address of the current pattern pointer
3956 errorptr -> pointer to error message
3957 lookbehind TRUE if this is a lookbehind assertion
3958 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3959 firstbyteptr place to put the first required character, or a negative number
3960 reqbyteptr place to put the last required character, or a negative number
3961 bcptr pointer to the chain of currently open branches
3962 cd points to the data block with tables pointers etc.
3964 Returns: TRUE on success
3968 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3969 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3970 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3972 const uschar *ptr = *ptrptr;
3973 uschar *code = *codeptr;
3974 uschar *last_branch = code;
3975 uschar *start_bracket = code;
3976 uschar *reverse_count = NULL;
3977 int firstbyte, reqbyte;
3978 int branchfirstbyte, branchreqbyte;
3984 firstbyte = reqbyte = REQ_UNSET;
3986 /* Offset is set zero to mark that this bracket is still open */
3989 code += 1 + LINK_SIZE + skipbytes;
3991 /* Loop for each alternative branch */
3995 /* Handle a change of ims options at the start of the branch */
3997 if ((options & PCRE_IMS) != oldims)
4000 *code++ = options & PCRE_IMS;
4003 /* Set up dummy OP_REVERSE if lookbehind assertion */
4007 *code++ = OP_REVERSE;
4008 reverse_count = code;
4012 /* Now compile the branch */
4014 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4015 &branchfirstbyte, &branchreqbyte, &bc, cd))
4021 /* If this is the first branch, the firstbyte and reqbyte values for the
4022 branch become the values for the regex. */
4024 if (*last_branch != OP_ALT)
4026 firstbyte = branchfirstbyte;
4027 reqbyte = branchreqbyte;
4030 /* If this is not the first branch, the first char and reqbyte have to
4031 match the values from all the previous branches, except that if the previous
4032 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4033 REQ_VARY for the regex. */
4037 /* If we previously had a firstbyte, but it doesn't match the new branch,
4038 we have to abandon the firstbyte for the regex, but if there was previously
4039 no reqbyte, it takes on the value of the old firstbyte. */
4041 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4043 if (reqbyte < 0) reqbyte = firstbyte;
4044 firstbyte = REQ_NONE;
4047 /* If we (now or from before) have no firstbyte, a firstbyte from the
4048 branch becomes a reqbyte if there isn't a branch reqbyte. */
4050 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4051 branchreqbyte = branchfirstbyte;
4053 /* Now ensure that the reqbytes match */
4055 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4057 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4060 /* If lookbehind, check that this branch matches a fixed-length string,
4061 and put the length into the OP_REVERSE item. Temporarily mark the end of
4062 the branch with OP_END. */
4068 length = find_fixedlength(last_branch, options);
4069 DPRINTF(("fixed length = %d\n", length));
4072 *errorptr = (length == -2)? ERR36 : ERR25;
4076 PUT(reverse_count, 0, length);
4079 /* Reached end of expression, either ')' or end of pattern. Go back through
4080 the alternative branches and reverse the chain of offsets, with the field in
4081 the BRA item now becoming an offset to the first alternative. If there are
4082 no alternatives, it points to the end of the group. The length in the
4083 terminating ket is always the length of the whole bracketed item. If any of
4084 the ims options were changed inside the group, compile a resetting op-code
4085 following, except at the very end of the pattern. Return leaving the pointer
4086 at the terminating char. */
4090 int length = code - last_branch;
4093 int prev_length = GET(last_branch, 1);
4094 PUT(last_branch, 1, length);
4095 length = prev_length;
4096 last_branch -= length;
4100 /* Fill in the ket */
4103 PUT(code, 1, code - start_bracket);
4104 code += 1 + LINK_SIZE;
4106 /* Resetting option if needed */
4108 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4114 /* Set values to pass back */
4118 *firstbyteptr = firstbyte;
4119 *reqbyteptr = reqbyte;
4123 /* Another branch follows; insert an "or" node. Its length field points back
4124 to the previous branch while the bracket remains open. At the end the chain
4125 is reversed. It's done like this so that the start of the bracket has a
4126 zero offset until it is closed, making it possible to detect recursion. */
4129 PUT(code, 1, code - last_branch);
4130 bc.current = last_branch = code;
4131 code += 1 + LINK_SIZE;
4134 /* Control never reaches here */
4140 /*************************************************
4141 * Check for anchored expression *
4142 *************************************************/
4144 /* Try to find out if this is an anchored regular expression. Consider each
4145 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4146 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4147 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4148 counts, since OP_CIRC can match in the middle.
4150 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4151 This is the code for \G, which means "match at start of match position, taking
4152 into account the match offset".
4154 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4155 because that will try the rest of the pattern at all possible matching points,
4156 so there is no point trying again.... er ....
4158 .... except when the .* appears inside capturing parentheses, and there is a
4159 subsequent back reference to those parentheses. We haven't enough information
4160 to catch that case precisely.
4162 At first, the best we could do was to detect when .* was in capturing brackets
4163 and the highest back reference was greater than or equal to that level.
4164 However, by keeping a bitmap of the first 31 back references, we can catch some
4165 of the more common cases more precisely.
4168 code points to start of expression (the bracket)
4169 options points to the options setting
4170 bracket_map a bitmap of which brackets we are inside while testing; this
4171 handles up to substring 31; after that we just have to take
4172 the less precise approach
4173 backref_map the back reference bitmap
4175 Returns: TRUE or FALSE
4179 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4180 unsigned int backref_map)
4183 const uschar *scode =
4184 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4185 register int op = *scode;
4187 /* Capturing brackets */
4193 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4194 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4195 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4198 /* Other brackets */
4200 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4202 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4205 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4206 are or may be referenced. */
4208 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4209 (*options & PCRE_DOTALL) != 0)
4211 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4214 /* Check for explicit anchoring */
4216 else if (op != OP_SOD && op != OP_SOM &&
4217 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4219 code += GET(code, 1);
4221 while (*code == OP_ALT); /* Loop for each alternative */
4227 /*************************************************
4228 * Check for starting with ^ or .* *
4229 *************************************************/
4231 /* This is called to find out if every branch starts with ^ or .* so that
4232 "first char" processing can be done to speed things up in multiline
4233 matching and for non-DOTALL patterns that start with .* (which must start at
4234 the beginning or after \n). As in the case of is_anchored() (see above), we
4235 have to take account of back references to capturing brackets that contain .*
4236 because in that case we can't make the assumption.
4239 code points to start of expression (the bracket)
4240 bracket_map a bitmap of which brackets we are inside while testing; this
4241 handles up to substring 31; after that we just have to take
4242 the less precise approach
4243 backref_map the back reference bitmap
4245 Returns: TRUE or FALSE
4249 is_startline(const uschar *code, unsigned int bracket_map,
4250 unsigned int backref_map)
4253 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4255 register int op = *scode;
4257 /* Capturing brackets */
4263 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4264 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4265 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4268 /* Other brackets */
4270 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4271 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4273 /* .* means "start at start or after \n" if it isn't in brackets that
4274 may be referenced. */
4276 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4278 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4281 /* Check for explicit circumflex */
4283 else if (op != OP_CIRC) return FALSE;
4285 /* Move on to the next alternative */
4287 code += GET(code, 1);
4289 while (*code == OP_ALT); /* Loop for each alternative */
4295 /*************************************************
4296 * Check for asserted fixed first char *
4297 *************************************************/
4299 /* During compilation, the "first char" settings from forward assertions are
4300 discarded, because they can cause conflicts with actual literals that follow.
4301 However, if we end up without a first char setting for an unanchored pattern,
4302 it is worth scanning the regex to see if there is an initial asserted first
4303 char. If all branches start with the same asserted char, or with a bracket all
4304 of whose alternatives start with the same asserted char (recurse ad lib), then
4305 we return that char, otherwise -1.
4308 code points to start of expression (the bracket)
4309 options pointer to the options (used to check casing changes)
4310 inassert TRUE if in an assertion
4312 Returns: -1 or the fixed first char
4316 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4318 register int c = -1;
4321 const uschar *scode =
4322 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4323 register int op = *scode;
4325 if (op >= OP_BRA) op = OP_BRA;
4336 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4338 if (c < 0) c = d; else if (c != d) return -1;
4341 case OP_EXACT: /* Fall through */
4348 if (!inassert) return -1;
4352 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4354 else if (c != scode[1]) return -1;
4358 code += GET(code, 1);
4360 while (*code == OP_ALT);
/* NOTE(review): guarded on the assumption that, like the call in
pcre_compile(), this function is only wanted when UTF-8 support is compiled
in - confirm against the build configuration. */

#ifdef SUPPORT_UTF8
/*************************************************
*           Validate a UTF-8 string              *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
validate that a supposed UTF-8 string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying
an invalid string are then undefined.

For each character, the lead byte determines (via utf8_table4) how many
additional bytes must follow. The checks made are: a byte of the form 10xxxxxx
where a lead byte is expected; a sequence that runs past the end of the
string; a following byte that is not of the form 10xxxxxx; an overlong
encoding for each sequence length; and the lead bytes 0xfe and 0xff.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated

Returns:       < 0    if the string is a valid UTF-8 string
               >= 0   otherwise; the value is the offset of the bad byte
*/

static int
valid_utf8(const uschar *string, int length)
{
register const uschar *p;

/* A negative length means "zero-terminated": measure the string first. */

if (length < 0)
  {
  p = string;
  while (*p != 0) p++;
  length = p - string;
  }

for (p = string; length-- > 0; p++)
  {
  register int lead = *p;
  register int extra;

  if (lead < 128) continue;                        /* One-byte character */
  if ((lead & 0xc0) != 0xc0) return p - string;    /* 10xxxxxx cannot lead */

  extra = utf8_table4[lead & 0x3f];   /* Number of additional bytes */
  if (length < extra) return p - string;
  length -= extra;

  /* The second byte must have 10 as its top bits. */

  p++;
  if ((*p & 0xc0) != 0x80) return p - string;

  /* Two-byte character: overlong if the lead byte is xx00 000x. There are
  no further bytes to look at. */

  if (extra == 1)
    {
    if ((lead & 0x3e) == 0) return p - string;
    continue;
    }

  /* Longer characters; the overlong test depends on the length:
       3 bytes: 1110 0000, xx0x xxxx
       4 bytes: 1111 0000, xx00 xxxx
       5 bytes: 1111 1000, xx00 0xxx
       6 bytes: 1111 1100, xx00 00xx, and a lead byte of 0xfe or 0xff is
                never valid */

  if ((extra == 2 && lead == 0xe0 && (*p & 0x20) == 0) ||
      (extra == 3 && lead == 0xf0 && (*p & 0x30) == 0) ||
      (extra == 4 && lead == 0xf8 && (*p & 0x38) == 0) ||
      (extra == 5 && (lead == 0xfe || lead == 0xff ||
                     (lead == 0xfc && (*p & 0x3c) == 0))))
    return p - string;

  /* Every byte after the second must also start with 10. */

  for (extra--; extra > 0; extra--)
    {
    p++;
    if ((*p & 0xc0) != 0x80) return p - string;
    }
  }

return -1;
}
#endif
4453 /*************************************************
4454 * Compile a Regular Expression *
4455 *************************************************/
4457 /* This function takes a string and returns a pointer to a block of store
4458 holding a compiled version of the expression.
4461 pattern the regular expression
4462 options various option bits
4463 errorptr pointer to pointer to error text
4464 erroroffset ptr offset in pattern where error was detected
4465 tables pointer to character tables or NULL
4467 Returns: pointer to compiled data block, or NULL on error,
4468 with errorptr and erroroffset set
4472 pcre_compile(const char *pattern, int options, const char **errorptr,
4473 int *erroroffset, const unsigned char *tables)
4476 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4477 int c, firstbyte, reqbyte;
4479 int branch_extra = 0;
4480 int branch_newextra;
4481 int item_count = -1;
4483 int max_name_size = 0;
4484 int lastitemlength = 0;
4489 BOOL inescq = FALSE;
4490 unsigned int brastackptr = 0;
4493 const uschar *codestart;
4495 compile_data compile_block;
4496 int brastack[BRASTACK_SIZE];
4497 uschar bralenstack[BRASTACK_SIZE];
4499 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4500 can do is just return NULL. */
4502 if (errorptr == NULL) return NULL;
4505 /* However, we can give a message for this error */
4507 if (erroroffset == NULL)
4514 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4517 utf8 = (options & PCRE_UTF8) != 0;
4518 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4519 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4525 if ((options & PCRE_UTF8) != 0)
4532 if ((options & ~PUBLIC_OPTIONS) != 0)
4538 /* Set up pointers to the individual character tables */
4540 if (tables == NULL) tables = pcre_default_tables;
4541 compile_block.lcc = tables + lcc_offset;
4542 compile_block.fcc = tables + fcc_offset;
4543 compile_block.cbits = tables + cbits_offset;
4544 compile_block.ctypes = tables + ctypes_offset;
4546 /* Maximum back reference and backref bitmap. This is updated for numeric
4547 references during the first pass, but for named references during the actual
4548 compile pass. The bitmap records up to 31 back references to help in deciding
4549 whether (.*) can be treated as anchored or not. */
4551 compile_block.top_backref = 0;
4552 compile_block.backref_map = 0;
4554 /* Reflect pattern for debugging output */
4556 DPRINTF(("------------------------------------------------------------------\n"));
4557 DPRINTF(("%s\n", pattern));
4559 /* The first thing to do is to make a pass over the pattern to compute the
4560 amount of store required to hold the compiled code. This does not have to be
4561 perfect as long as errors are overestimates. At the same time we can detect any
4562 flag settings right at the start, and extract them. Make an attempt to correct
4563 for any counted white space if an "extended" flag setting appears late in the
4564 pattern. We can't be so clever for #-comments. */
4566 ptr = (const uschar *)(pattern - 1);
4567 while ((c = *(++ptr)) != 0)
4574 /* If we are inside a \Q...\E sequence, all chars are literal */
4578 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4582 /* Otherwise, first check for ignored whitespace and comments */
4584 if ((options & PCRE_EXTENDED) != 0)
4586 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4589 /* The space before the ; is to avoid a warning on a silly compiler
4590 on the Macintosh. */
4591 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4597 item_count++; /* Is zero for the first non-comment item */
4599 /* Allow space for auto callout before every item except quantifiers. */
4601 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4602 c != '*' && c != '+' && c != '?' &&
4603 (c != '{' || !is_counted_repeat(ptr + 1)))
4604 length += 2 + 2*LINK_SIZE;
4608 /* A backslashed item may be an escaped data character or it may be a
4612 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4613 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4615 lastitemlength = 1; /* Default length of last item for repeats */
4617 if (c >= 0) /* Data character */
4619 length += 2; /* For a one-byte character */
4622 if (utf8 && c > 127)
4625 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4626 if (c <= utf8_table1[i]) break;
4628 lastitemlength += i;
4635 /* If \Q, enter "literal" mode */
4643 /* \X is supported only if Unicode property support is compiled */
4649 goto PCRE_ERROR_RETURN;
4653 /* \P and \p are for Unicode properties, but only when the support has
4654 been compiled. Each item needs 2 bytes. */
4656 else if (-c == ESC_P || -c == ESC_p)
4662 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4666 goto PCRE_ERROR_RETURN;
4670 /* Other escapes need one byte */
4674 /* A back reference needs an additional 2 bytes, plus either one or 5
4675 bytes for a repeat. We also need to keep the value of the highest
4680 int refnum = -c - ESC_REF;
4681 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4682 if (refnum > compile_block.top_backref)
4683 compile_block.top_backref = refnum;
4684 length += 2; /* For single back reference */
4685 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4687 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4688 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4689 if ((min == 0 && (max == 1 || max == -1)) ||
4690 (min == 1 && max == -1))
4693 if (ptr[1] == '?') ptr++;
4698 case '^': /* Single-byte metacharacters */
4705 case '*': /* These repeats won't be after brackets; */
4706 case '+': /* those are handled separately */
4709 goto POSESSIVE; /* A few lines below */
4711 /* This covers the cases of braced repeats after a single char, metachar,
4712 class, or back reference. */
4715 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4716 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4717 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4719 /* These special cases just insert one extra opcode */
4721 if ((min == 0 && (max == 1 || max == -1)) ||
4722 (min == 1 && max == -1))
4725 /* These cases might insert additional copies of a preceding character. */
4731 length -= lastitemlength; /* Uncount the original char or metachar */
4732 if (min > 0) length += 3 + lastitemlength;
4734 length += lastitemlength + ((max > 0)? 3 : 1);
4737 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4739 POSESSIVE: /* Test for possessive quantifier */
4743 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4747 /* An alternation contains an offset to the next branch or ket. If any ims
4748 options changed in the previous branch(es), and/or if we are in a
4749 lookbehind assertion, extra space will be needed at the start of the
4750 branch. This is handled by branch_extra. */
4753 length += 1 + LINK_SIZE + branch_extra;
4756 /* A character class uses 33 characters provided that all the character
4757 values are less than 256. Otherwise, it uses a bit map for low valued
4758 characters, and individual items for others. Don't worry about character
4759 types that aren't allowed in classes - they'll get picked up during the
4760 compile. A character class that contains only one single-byte character
4761 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4762 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4765 if (*(++ptr) == '^')
4767 class_optcount = 10; /* Greater than one */
4770 else class_optcount = 0;
4776 /* Written as a "do" so that an initial ']' is taken as data */
4780 /* Inside \Q...\E everything is literal except \E */
4784 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4790 /* Outside \Q...\E, check for escapes */
4794 c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4795 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4797 /* \b is backspace inside a class; \X is literal */
4799 if (-c == ESC_b) c = '\b';
4800 else if (-c == ESC_X) c = 'X';
4802 /* \Q enters quoting mode */
4804 else if (-c == ESC_Q)
4810 /* Handle escapes that turn into characters */
4812 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4814 /* Escapes that are meta-things. The normal ones just affect the
4815 bit map, but Unicode properties require an XCLASS extended item. */
4819 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4821 if (-c == ESC_p || -c == ESC_P)
4826 length += LINK_SIZE + 2;
4834 /* Check the syntax for POSIX stuff. The bits we actually handle are
4835 checked during the real compile phase. */
4837 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4840 class_optcount = 10; /* Make sure > 1 */
4843 /* Anything else increments the possible optimization count. We have to
4844 detect ranges here so that we can compute the number of extra ranges for
4845 caseless wide characters when UCP support is available. If there are wide
4846 characters, we are going to have to use an XCLASS, even for single
4859 GETCHARLEN(c, ptr, extra);
4867 /* Come here from handling \ above when it escapes to a char value */
4869 NON_SPECIAL_CHARACTER:
4875 uschar const *hyptr = ptr++;
4879 d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4880 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4881 if (-d == ESC_b) d = '\b'; /* backspace */
4882 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4884 else if (ptr[1] != 0 && ptr[1] != ']')
4891 GETCHARLEN(d, ptr, extra);
4898 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4901 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4902 127 for caseless matching, we will need to use an XCLASS. */
4906 class_optcount = 10; /* Ensure > 1 */
4910 goto PCRE_ERROR_RETURN;
4914 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4917 if (!class_utf8) /* Allow for XCLASS overhead */
4920 length += LINK_SIZE + 2;
4924 /* If we have UCP support, find out how many extra ranges are
4925 needed to map the other case of characters within this range. We
4926 have to mimic the range optimization here, because extending the
4927 range upwards might push d over a boundary that makes it use
4928 another byte in the UTF-8 representation. */
4930 if ((options & PCRE_CASELESS) != 0)
4935 while (get_othercase_range(&cc, origd, &occ, &ocd))
4937 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4939 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4940 { /* if there is overlap, */
4941 c = occ; /* noting that if occ < c */
4942 continue; /* we can't have ocd > d */
4943 } /* because a subrange is */
4944 if (ocd > d && occ <= d + 1) /* always shorter than */
4945 { /* the basic range. */
4950 /* An extra item is needed */
4952 length += 1 + ord2utf8(occ, buffer) +
4953 ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4956 #endif /* SUPPORT_UCP */
4958 /* The length of the (possibly extended) range */
4960 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4962 #endif /* SUPPORT_UTF8 */
4966 /* We have a single character. There is nothing to be done unless we
4967 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4968 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4974 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4977 class_optcount = 10; /* Ensure > 1 */
4978 if (!class_utf8) /* Allow for XCLASS overhead */
4981 length += LINK_SIZE + 2;
4984 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4985 (1 + ord2utf8(c, buffer));
4986 #else /* SUPPORT_UCP */
4987 length += 1 + ord2utf8(c, buffer);
4988 #endif /* SUPPORT_UCP */
4990 #endif /* SUPPORT_UTF8 */
4994 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4996 if (*ptr == 0) /* Missing terminating ']' */
4999 goto PCRE_ERROR_RETURN;
5002 /* We can optimize when there was only one optimizable character. Repeats
5003 for positive and negated single one-byte chars are handled by the general
5004 code. Here, we handle repeats for the class opcodes. */
5006 if (class_optcount == 1) length += 3; else
5010 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
5011 we also need extra for wrapping the whole thing in a sub-pattern. */
5013 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5015 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5016 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5017 if ((min == 0 && (max == 1 || max == -1)) ||
5018 (min == 1 && max == -1))
5024 length += 2 + 2*LINK_SIZE;
5026 else if (ptr[1] == '?') ptr++;
5031 /* Brackets may be genuine groups or special things */
5034 branch_newextra = 0;
5035 bracket_length = 1 + LINK_SIZE;
5037 /* Handle special forms of bracket, which all start (? */
5046 /* Skip over comments entirely */
5049 while (*ptr != 0 && *ptr != ')') ptr++;
5053 goto PCRE_ERROR_RETURN;
5057 /* Non-referencing groups and lookaheads just move the pointer on, and
5058 then behave like a non-special bracket, except that they don't increment
5059 the count of extracting brackets. Ditto for the "once only" bracket,
5060 which is in Perl from version 5.005. */
5069 /* (?R) specifies a recursive call to the regex, which is an extension
5070 to provide the facility which can be obtained by (?p{perl-code}) in
5071 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5073 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5074 the appropriate numbered brackets. This includes both recursive and
5075 non-recursive calls. (?R) is now synonymous with (?0). */
5080 case '0': case '1': case '2': case '3': case '4':
5081 case '5': case '6': case '7': case '8': case '9':
5084 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5088 goto PCRE_ERROR_RETURN;
5090 length += 1 + LINK_SIZE;
5092 /* If this item is quantified, it will get wrapped inside brackets so
5093 as to use the code for quantified brackets. We jump down and use the
5094 code that handles this for real brackets. */
5096 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5098 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5099 duplength = 5 + 3 * LINK_SIZE;
5100 goto HANDLE_QUANTIFIED_BRACKETS;
5104 /* (?C) is an extension which provides "callout" - to provide a bit of
5105 the functionality of the Perl (?{...}) feature. An optional number may
5106 follow (default is zero). */
5110 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5114 goto PCRE_ERROR_RETURN;
5116 length += 2 + 2*LINK_SIZE;
5119 /* Named subpatterns are an extension copied from Python */
5125 const uschar *p; /* Don't amalgamate; some compilers */
5126 p = ++ptr; /* grumble at autoincrement in declaration */
5127 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5131 goto PCRE_ERROR_RETURN;
5134 if (ptr - p > max_name_size) max_name_size = (ptr - p);
5138 if (*ptr == '=' || *ptr == '>')
5140 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5144 goto PCRE_ERROR_RETURN;
5149 /* Unknown character after (?P */
5152 goto PCRE_ERROR_RETURN;
5154 /* Lookbehinds are in Perl from version 5.005 */
5158 if (*ptr == '=' || *ptr == '!')
5160 branch_newextra = 1 + LINK_SIZE;
5161 length += 1 + LINK_SIZE; /* For the first branch */
5165 goto PCRE_ERROR_RETURN;
5167 /* Conditionals are in Perl from version 5.005. The bracket must either
5168 be followed by a number (for bracket reference) or by an assertion
5169 group, or (a PCRE extension) by 'R' for a recursion test. */
5172 if (ptr[3] == 'R' && ptr[4] == ')')
5177 else if ((digitab[ptr[3]] & ctype_digit) != 0)
5181 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5185 goto PCRE_ERROR_RETURN;
5188 else /* An assertion must follow */
5190 ptr++; /* Can treat like ':' as far as spacing is concerned */
5191 if (ptr[2] != '?' ||
5192 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5194 ptr += 2; /* To get right offset in message */
5196 goto PCRE_ERROR_RETURN;
5201 /* Else loop checking valid options until ) is met. Anything else is an
5202 error. If we are without any brackets, i.e. at top level, the settings
5203 act as if specified in the options, so massage the options immediately.
5204 This is for backward compatibility with Perl 5.004. */
5217 *optset |= PCRE_CASELESS;
5221 *optset |= PCRE_MULTILINE;
5225 *optset |= PCRE_DOTALL;
5229 *optset |= PCRE_EXTENDED;
5233 *optset |= PCRE_EXTRA;
5237 *optset |= PCRE_UNGREEDY;
5244 /* A termination by ')' indicates an options-setting-only item; if
5245 this is at the very start of the pattern (indicated by item_count
5246 being zero), we use it to set the global options. This is helpful
5247 when analyzing the pattern for first characters, etc. Otherwise
5248 nothing is done here and it is handled during the compiling
5251 [Historical note: Up to Perl 5.8, options settings at top level
5252 were always global settings, wherever they appeared in the pattern.
5253 That is, they were equivalent to an external setting. From 5.8
5254 onwards, they apply only to what follows (which is what you might
5258 if (item_count == 0)
5260 options = (options | set) & (~unset);
5261 set = unset = 0; /* To save length */
5262 item_count--; /* To allow for several */
5267 /* A termination by ':' indicates the start of a nested group with
5268 the given options set. This is again handled at compile time, but
5269 we must allow for compiled space if any of the ims options are
5270 set. We also have to allow for resetting space at the end of
5271 the group, which is why 4 is added to the length and not just 2.
5272 If there are several changes of options within the same group, this
5273 will lead to an over-estimate on the length, but this shouldn't
5274 matter very much. We also have to allow for resetting options at
5275 the start of any alternations, which we do by setting
5276 branch_newextra to 2. Finally, we record whether the case-dependent
5277 flag ever changes within the regex. This is used by the "required
5281 if (((set|unset) & PCRE_IMS) != 0)
5284 branch_newextra = 2;
5285 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5289 /* Unrecognized option character */
5293 goto PCRE_ERROR_RETURN;
5297 /* If we hit a closing bracket, that's it - this is a freestanding
5298 option-setting. We need to ensure that branch_extra is updated if
5299 necessary. The only values branch_newextra can have here are 0 or 2.
5300 If the value is 2, then branch_extra must either be 2 or 3+LINK_SIZE, depending
5301 on whether this is a lookbehind group or not. */
5306 if (branch_newextra == 2 &&
5307 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5308 branch_extra += branch_newextra;
5312 /* If options were terminated by ':' control comes here. Fall through
5313 to handle the group below. */
5317 /* Extracting brackets must be counted so we can process escapes in a
5318 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5319 need an additional 3 bytes of store per extracting bracket. However, if
5320 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5321 must leave the count alone (it will always be zero). */
5323 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5326 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5329 /* Save length for computing whole length at end if there's a repeat that
5330 requires duplication of the group. Also save the current value of
5331 branch_extra, and start the new group with the new value. If non-zero, this
5332 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
5334 if (brastackptr >= sizeof(brastack)/sizeof(int))
5337 goto PCRE_ERROR_RETURN;
5340 bralenstack[brastackptr] = branch_extra;
5341 branch_extra = branch_newextra;
5343 brastack[brastackptr++] = length;
5344 length += bracket_length;
5347 /* Handle ket. Look for subsequent max/min; for certain sets of values we
5348 have to replicate this bracket up to that many times. If brastackptr is
5349 0 this is an unmatched bracket which will generate an error, but take care
5350 not to try to access brastack[-1] when computing the length and restoring
5351 the branch_extra value. */
5354 length += 1 + LINK_SIZE;
5355 if (brastackptr > 0)
5357 duplength = length - brastack[--brastackptr];
5358 branch_extra = bralenstack[brastackptr];
5362 /* The following code is also used when a recursion such as (?3) is
5363 followed by a quantifier, because in that case, it has to be wrapped inside
5364 brackets so that the quantifier works. The value of duplength must be
5365 set before arrival. */
5367 HANDLE_QUANTIFIED_BRACKETS:
5369 /* Leave ptr at the final char; for read_repeat_counts this happens
5370 automatically; for the others we need an increment. */
5372 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5374 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5375 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5377 else if (c == '*') { min = 0; max = -1; ptr++; }
5378 else if (c == '+') { min = 1; max = -1; ptr++; }
5379 else if (c == '?') { min = 0; max = 1; ptr++; }
5380 else { min = 1; max = 1; }
5382 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5383 group, and if the maximum is greater than zero, we have to replicate
5384 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5390 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5393 /* When the minimum is greater than zero, we have to replicate up to
5394 minval-1 times, with no additions required in the copies. Then, if there
5395 is a limited maximum we have to replicate up to maxval-1 times allowing
5396 for a BRAZERO item before each optional copy and nesting brackets for all
5397 but one of the optional copies. */
5401 length += (min - 1) * duplength;
5402 if (max > min) /* Need this test as max=-1 means no limit */
5403 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5404 - (2 + 2*LINK_SIZE);
5407 /* Allow space for once brackets for "possessive quantifier" */
5412 length += 2 + 2*LINK_SIZE;
5416 /* Non-special character. It won't be space or # in extended mode, so it is
5417 always a genuine character. If we are in a \Q...\E sequence, check for the
5418 end; if not, we have a literal. */
5423 if (inescq && c == '\\' && ptr[1] == 'E')
5430 length += 2; /* For a one-byte character */
5431 lastitemlength = 1; /* Default length of last item for repeats */
5433 /* In UTF-8 mode, check for additional bytes. */
5436 if (utf8 && (c & 0xc0) == 0xc0)
5438 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5439 { /* because the end is marked */
5440 lastitemlength++; /* by a zero byte. */
5451 length += 2 + LINK_SIZE; /* For final KET and END */
5453 if ((options & PCRE_AUTO_CALLOUT) != 0)
5454 length += 2 + 2*LINK_SIZE; /* For final callout */
5456 if (length > MAX_PATTERN_SIZE)
5462 /* Compute the size of data block needed and get it, either from malloc or
5463 externally provided function. */
5465 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5466 re = (real_pcre *)(pcre_malloc)(size);
5474 /* Put in the magic number, and save the sizes, options, and character table
5475 pointer. NULL is used for the default character tables. The nullpad field is at
5476 the end; it's there to help in the case when a regex compiled on a system with
5477 4-byte pointers is run on another with 8-byte pointers. */
5479 re->magic_number = MAGIC_NUMBER;
5481 re->options = options;
5482 re->dummy1 = re->dummy2 = 0;
5483 re->name_table_offset = sizeof(real_pcre);
5484 re->name_entry_size = max_name_size + 3;
5485 re->name_count = name_count;
5486 re->tables = (tables == pcre_default_tables)? NULL : tables;
5489 /* The starting points of the name/number translation table and of the code are
5490 passed around in the compile data block. */
5492 compile_block.names_found = 0;
5493 compile_block.name_entry_size = max_name_size + 3;
5494 compile_block.name_table = (uschar *)re + re->name_table_offset;
5495 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5496 compile_block.start_code = codestart;
5497 compile_block.start_pattern = (const uschar *)pattern;
5498 compile_block.req_varyopt = 0;
5499 compile_block.nopartial = FALSE;
5501 /* Set up a starting, non-extracting bracket, then compile the expression. On
5502 error, *errorptr will be set non-NULL, so we don't need to look at the result
5503 of the function here. */
5505 ptr = (const uschar *)pattern;
5506 code = (uschar *)codestart;
5509 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5510 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5511 re->top_bracket = bracount;
5512 re->top_backref = compile_block.top_backref;
5514 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5516 /* If not reached end of pattern on success, there's an excess bracket. */
5518 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5520 /* Fill in the terminating state and check for disastrous overflow, but
5521 if debugging, leave the test till after things are printed out. */
5526 if (code - codestart > length) *errorptr = ERR23;
5529 /* Give an error if there's back reference to a non-existent capturing
5532 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5534 /* Failed to compile, or error while post-processing */
5536 if (*errorptr != NULL)
5540 *erroroffset = ptr - (const uschar *)pattern;
5544 /* If the anchored option was not passed, set the flag if we can determine that
5545 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5546 as starting with .* when DOTALL is set).
5548 Otherwise, if we know what the first character has to be, save it, because that
5549 speeds up unanchored matches no end. If not, see if we can set the
5550 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5551 start with ^, and also when all branches start with .* for non-DOTALL matches.
5554 if ((options & PCRE_ANCHORED) == 0)
5556 int temp_options = options;
5557 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5558 re->options |= PCRE_ANCHORED;
5562 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5563 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5565 int ch = firstbyte & 255;
5566 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5567 compile_block.fcc[ch] == ch)? ch : firstbyte;
5568 re->options |= PCRE_FIRSTSET;
5570 else if (is_startline(codestart, 0, compile_block.backref_map))
5571 re->options |= PCRE_STARTLINE;
5575 /* For an anchored pattern, we use the "required byte" only if it follows a
5576 variable length item in the regex. Remove the caseless flag for non-caseable
5580 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5582 int ch = reqbyte & 255;
5583 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5584 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5585 re->options |= PCRE_REQCHSET;
5588 /* Print out the compiled data for debugging */
5592 printf("Length = %d top_bracket = %d top_backref = %d\n",
5593 length, re->top_bracket, re->top_backref);
5595 if (re->options != 0)
5597 printf("%s%s%s%s%s%s%s%s%s%s\n",
5598 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5599 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5600 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5601 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5602 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5603 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5604 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5605 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5606 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5607 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5610 if ((re->options & PCRE_FIRSTSET) != 0)
5612 int ch = re->first_byte & 255;
5613 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5614 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5615 else printf("First char = \\x%02x%s\n", ch, caseless);
5618 if ((re->options & PCRE_REQCHSET) != 0)
5620 int ch = re->req_byte & 255;
5621 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5622 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5623 else printf("Req char = \\x%02x%s\n", ch, caseless);
5626 print_internals(re, stdout);
5628 /* This check is done here in the debugging case so that the code that
5629 was compiled can be seen. */
5631 if (code - codestart > length)
5635 *erroroffset = ptr - (uschar *)pattern;
5645 /*************************************************
5646 * Match a back-reference *
5647 *************************************************/
5649 /* If a back reference hasn't been set, the length that is passed is greater
5650 than the number of characters left in the string, so the match fails.
5653 offset index into the offset vector
5654 eptr points into the subject
5655 length length to be matched
5656 md points to match data block
5659 Returns: TRUE if matched
5663 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5664 unsigned long int ims)
/* p addresses the text that the back reference refers to: the start of the
subject plus the value held in slot "offset" of the offset vector. */
5666 const uschar *p = md->start_subject + md->offset_vector[offset];
/* Diagnostic output: show the subject text at eptr (or <null> when eptr is at
or beyond the end of the subject) and the referenced text. */
5669 if (eptr >= md->end_subject)
5670 printf("matching subject <null>");
5673 printf("matching subject ");
5674 pchars(eptr, length, TRUE, md);
5676 printf(" against backref ");
5677 pchars(p, length, FALSE, md);
5681 /* Always fail if not enough characters left */
5683 if (length > md->end_subject - eptr) return FALSE;
5685 /* Separate the caseless case for speed. When PCRE_CASELESS is set in ims,
both bytes are translated through the md->lcc table before being compared. */
5687 if ((ims & PCRE_CASELESS) != 0)
5689 while (length-- > 0)
5690 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
/* Otherwise the bytes are compared exactly as they stand. */
5693 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5700 /*************************************************
5701 * Match character against an XCLASS *
5702 *************************************************/
5704 /* This function is called from within the XCLASS code below, to match a
5705 character against an extended class which might match values > 255.
5709 data points to the flag byte of the XCLASS data
5711 Returns: TRUE if character matches, else FALSE
5715 match_xclass(int c, const uschar *data)
/* negated is TRUE when the XCL_NOT flag is set in the flag byte; a successful
test below then yields FALSE and a failure to match anything yields TRUE. */
5718 BOOL negated = (*data & XCL_NOT) != 0;
5720 /* Character values < 256 are matched against a bitmap, if one is present. If
5721 not, we still carry on, because there may be ranges that start below 256 in the
5726 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
5727 return !negated; /* char found */
5730 /* First skip the bit map if present. Then match against the list of Unicode
5731 properties or large chars or ranges that end with a large char. We won't ever
5732 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5734 if ((*data++ & XCL_MAP) != 0) data += 32;
/* Each item in the list begins with a type byte; XCL_END marks the end. */
5736 while ((t = *data++) != XCL_END)
/* XCL_SINGLE: one character value follows, which must equal c. */
5739 if (t == XCL_SINGLE)
5741 GETCHARINC(x, data);
5742 if (c == x) return !negated;
/* XCL_RANGE: two character values follow, giving an inclusive range. */
5744 else if (t == XCL_RANGE)
5746 GETCHARINC(x, data);
5747 GETCHARINC(y, data);
5748 if (c >= x && c <= y) return !negated;
5752 else /* XCL_PROP & XCL_NOTPROP */
5754 int chartype, othercase;
5755 int rqdtype = *data++;
5756 int category = ucp_findchar(c, &chartype, &othercase);
/* For XCL_PROP the comparison has to be true for the item to match; for
XCL_NOTPROP it has to be false. */
5759 if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
5763 if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5766 #endif /* SUPPORT_UCP */
5769 return negated; /* char did not match */
5774 /***************************************************************************
5775 ****************************************************************************
5776 RECURSION IN THE match() FUNCTION
5778 The match() function is highly recursive. Some regular expressions can cause
5779 it to recurse thousands of times. I was writing for Unix, so I just let it
5780 call itself recursively. This uses the stack for saving everything that has
5781 to be saved for a recursive call. On Unix, the stack can be large, and this
5784 It turns out that on non-Unix systems there are problems with programs that
5785 use a lot of stack. (This despite the fact that every last chip has oodles
5786 of memory these days, and techniques for extending the stack have been known
5787 for decades.) So....
5789 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5790 calls by keeping local variables that need to be preserved in blocks of memory
5791 obtained from malloc instead of on the stack. Macros are used to
5792 achieve this so that the actual code doesn't look very different to what it
5794 ****************************************************************************
5795 ***************************************************************************/
5798 /* These versions of the macros use the stack, as normal */
/* REGISTER expands to the "register" storage class in this version. */
5801 #define REGISTER register
/* RMATCH is a direct recursive call of match() whose result is assigned to rx. */
5802 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
/* RRETURN is an ordinary return of the value ra. */
5803 #define RRETURN(ra) return ra
5807 /* These versions of the macros manage a private stack on the heap. Note
5808 that the rd argument of RMATCH isn't actually used. It's the md argument of
5809 match(), which never changes. */
/* Heap versions of RMATCH and RRETURN.

RMATCH obtains a new heapframe from pcre_stack_malloc and calls setjmp() on the
Xwhere buffer of the current frame. The new frame is given the arguments ra, rb,
rc, re, rf and rg, and its Xprevframe is pointed at the current frame. On the
path reported as "longjumped back", frame is reloaded from md->thisframe and the
value left in frame->Xresult is copied into rx.
NOTE(review): no check of the pcre_stack_malloc result is visible here - confirm
how an allocation failure is meant to be handled.

RRETURN releases the current frame with pcre_stack_free and steps frame back to
its Xprevframe. The value ra is stored in the Xresult of that frame,
md->thisframe is set to it, and longjmp() resumes at the setjmp() that RMATCH
made on its Xwhere buffer. */
5813 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5815 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5816 if (setjmp(frame->Xwhere) == 0)\
5818 newframe->Xeptr = ra;\
5819 newframe->Xecode = rb;\
5820 newframe->Xoffset_top = rc;\
5821 newframe->Xims = re;\
5822 newframe->Xeptrb = rf;\
5823 newframe->Xflags = rg;\
5824 newframe->Xprevframe = frame;\
5826 DPRINTF(("restarting from line %d\n", __LINE__));\
5831 DPRINTF(("longjumped back to line %d\n", __LINE__));\
5832 frame = md->thisframe;\
5833 rx = frame->Xresult;\
5837 #define RRETURN(ra)\
5839 heapframe *newframe = frame;\
5840 frame = newframe->Xprevframe;\
5841 (pcre_stack_free)(newframe);\
5844 frame->Xresult = ra;\
5845 md->thisframe = frame;\
5846 longjmp(frame->Xwhere, 1);\
5852 /* Structure for remembering the local variables in a private frame */
5854 typedef struct heapframe {
/* Link to the frame that was current when this one was created by RMATCH;
NULL in the top-level frame that is set up at the start of match(). */
5855 struct heapframe *Xprevframe;
5857 /* Function arguments that may change */
5859 const uschar *Xeptr;
5860 const uschar *Xecode;
5866 /* Function local variables. Inside match() each of these is reached through
a macro of the form "#define name frame->Xname". */
5868 const uschar *Xcallpat;
5869 const uschar *Xcharptr;
5870 const uschar *Xdata;
5871 const uschar *Xnext;
5873 const uschar *Xprev;
5874 const uschar *Xsaved_eptr;
5876 recursion_info Xnew_recursive;
5883 unsigned long int Xoriginal_ims;
5887 int Xprop_fail_result;
5890 int Xprop_othercase;
5891 int Xprop_test_against;
5892 int *Xprop_test_variable;
5904 int Xsave_capture_last;
5905 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5906 int Xstacksave[REC_STACK_SAVE_MAX];
5910 /* Place to pass back result, and where to jump back to. The RRETURN macro
stores its value in Xresult and RMATCH reads it back; Xwhere is the buffer
used by the setjmp()/longjmp() pair in those macros. */
5920 /***************************************************************************
5921 ***************************************************************************/
5925 /*************************************************
5926 * Match from current position *
5927 *************************************************/
5929 /* On entry ecode points to the first opcode, and eptr to the first character
5930 in the subject string, while eptrb holds the value of eptr at the start of the
5931 last bracketed group - used for breaking infinite loops matching zero-length
5932 strings. This function is called recursively in many circumstances. Whenever it
5933 returns a negative (error) response, the outer incarnation must also return the
5936 Performance note: It might be tempting to extract commonly used fields from the
5937 md structure (e.g. utf8, end_subject) into individual variables to improve
5938 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5939 made performance worse.
5942 eptr pointer in subject
5943 ecode position in code
5944 offset_top current top pointer
5945 md pointer to "static" info for the match
5946 ims current /i, /m, and /s options
5947 eptrb pointer to chain of blocks containing eptr at start of
5948 brackets - for testing for empty matches
5950 match_condassert - this is an assertion condition
5951 match_isgroup - this is the start of a bracketed group
5953 Returns: MATCH_MATCH if matched ) these values are >= 0
5954 MATCH_NOMATCH if failed to match )
5955 a negative PCRE_ERROR_xxx value if aborted by an error condition
5956 (e.g. stopped by recursion limit)
5960 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5961 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5964 /* These variables do not need to be preserved over recursion in this function,
5965 so they can be ordinary variables in all cases. Mark them with "register"
5966 because they are used a lot in loops. */
5968 register int rrc; /* Returns from recursive calls */
5969 register int i; /* Used for loops not involving calls to RMATCH() */
5970 register int c; /* Character values not kept over RMATCH() calls */
5972 /* When recursion is not being used, all "local" variables that have to be
5973 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5974 heap storage. Set up the top-level frame here; others are obtained from the
5975 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5978 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5979 frame->Xprevframe = NULL; /* Marks the top level */
5981 /* Copy in the original argument variables */
5983 frame->Xeptr = eptr;
5984 frame->Xecode = ecode;
5985 frame->Xoffset_top = offset_top;
5987 frame->Xeptrb = eptrb;
5988 frame->Xflags = flags;
5990 /* This is where control jumps back to in order to effect "recursion" */
5994 /* Macros make the argument variables come from the current frame */
5996 #define eptr frame->Xeptr
5997 #define ecode frame->Xecode
5998 #define offset_top frame->Xoffset_top
5999 #define ims frame->Xims
6000 #define eptrb frame->Xeptrb
6001 #define flags frame->Xflags
6003 /* Ditto for the local variables */
6006 #define charptr frame->Xcharptr
6008 #define callpat frame->Xcallpat
6009 #define data frame->Xdata
6010 #define next frame->Xnext
6011 #define pp frame->Xpp
6012 #define prev frame->Xprev
6013 #define saved_eptr frame->Xsaved_eptr
6015 #define new_recursive frame->Xnew_recursive
6017 #define cur_is_word frame->Xcur_is_word
6018 #define condition frame->Xcondition
6019 #define minimize frame->Xminimize
6020 #define prev_is_word frame->Xprev_is_word
6022 #define original_ims frame->Xoriginal_ims
6025 #define prop_type frame->Xprop_type
6026 #define prop_fail_result frame->Xprop_fail_result
6027 #define prop_category frame->Xprop_category
6028 #define prop_chartype frame->Xprop_chartype
6029 #define prop_othercase frame->Xprop_othercase
6030 #define prop_test_against frame->Xprop_test_against
6031 #define prop_test_variable frame->Xprop_test_variable
6034 #define ctype frame->Xctype
6035 #define fc frame->Xfc
6036 #define fi frame->Xfi
6037 #define length frame->Xlength
6038 #define max frame->Xmax
6039 #define min frame->Xmin
6040 #define number frame->Xnumber
6041 #define offset frame->Xoffset
6042 #define op frame->Xop
6043 #define save_capture_last frame->Xsave_capture_last
6044 #define save_offset1 frame->Xsave_offset1
6045 #define save_offset2 frame->Xsave_offset2
6046 #define save_offset3 frame->Xsave_offset3
6047 #define stacksave frame->Xstacksave
6049 #define newptrb frame->Xnewptrb
6051 /* When recursion is being used, local variables are allocated on the stack and
6052 get preserved during recursion in the normal way. In this environment, fi and
6053 i, and fc and c, can be the same variables. */
6060 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6061 const uschar *charptr; /* small blocks of the code. My normal */
6062 #endif /* style of coding would have declared */
6063 const uschar *callpat; /* them within each of those blocks. */
6064 const uschar *data; /* However, in order to accommodate the */
6065 const uschar *next; /* version of this code that uses an */
6066 const uschar *pp; /* external "stack" implemented on the */
6067 const uschar *prev; /* heap, it is easier to declare them */
6068 const uschar *saved_eptr; /* all here, so the declarations can */
6069 /* be cut out in a block. The only */
6070 recursion_info new_recursive; /* declarations within blocks below are */
6071 /* for variables that do not have to */
6072 BOOL cur_is_word; /* be preserved over a recursive call */
6073 BOOL condition; /* to RMATCH(). */
6077 unsigned long int original_ims;
6081 int prop_fail_result;
6085 int prop_test_against;
6086 int *prop_test_variable;
6096 int save_capture_last;
6097 int save_offset1, save_offset2, save_offset3;
6098 int stacksave[REC_STACK_SAVE_MAX];
6103 /* These statements are here to stop the compiler complaining about uninitialized
6107 prop_fail_result = 0;
6108 prop_test_against = 0;
6109 prop_test_variable = NULL;
6112 /* OK, now we can get on with the real code of the function. Recursion is
6113 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6114 these just turn into a recursive call to match() and a "return", respectively.
6115 However, RMATCH isn't like a function call because it's quite a complicated
6116 macro. It has to be used in one particular way. This shouldn't, however, impact
6117 performance when true recursion is being used. */
6119 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6121 original_ims = ims; /* Save for resetting on ')' */
6123 /* At the start of a bracketed group, add the current subject pointer to the
6124 stack of such pointers, to be re-instated at the end of the group when we hit
6125 the closing ket. When match() is called in other circumstances, we don't add to
6128 if ((flags & match_isgroup) != 0)
6130 newptrb.epb_prev = eptrb;
6131 newptrb.epb_saved_eptr = eptr;
6135 /* Now start processing the operations. */
6142 /* For partial matching, remember if we ever hit the end of the subject after
6143 matching at least one subject character. */
6146 eptr >= md->end_subject &&
6147 eptr > md->start_match)
6150 /* Opening capturing bracket. If there is space in the offset vector, save
6151 the current subject position in the working slot at the top of the vector. We
6152 mustn't change the current values of the data slot, because they may be set
6153 from a previous iteration of this group, and be referred to by a reference
6156 If the bracket fails to match, we need to restore this value and also the
6157 values of the final offsets, in case they were set by a previous iteration of
6160 If there isn't enough space in the offset vector, treat this as if it were a
6161 non-capturing bracket. Don't worry about setting the flag for the error case
6162 here; that is handled in the code for KET. */
6166 number = op - OP_BRA;
6168 /* For extended extraction brackets (large number), we have to fish out the
6169 number from a dummy opcode at the start. */
6171 if (number > EXTRACT_BASIC_MAX)
6172 number = GET2(ecode, 2+LINK_SIZE);
6173 offset = number << 1;
6176 printf("start bracket %d subject=", number);
6177 pchars(eptr, 16, TRUE, md);
6181 if (offset < md->offset_max)
6183 save_offset1 = md->offset_vector[offset];
6184 save_offset2 = md->offset_vector[offset+1];
6185 save_offset3 = md->offset_vector[md->offset_end - number];
6186 save_capture_last = md->capture_last;
6188 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6189 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6193 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6195 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6196 md->capture_last = save_capture_last;
6197 ecode += GET(ecode, 1);
6199 while (*ecode == OP_ALT);
6201 DPRINTF(("bracket %d failed\n", number));
6203 md->offset_vector[offset] = save_offset1;
6204 md->offset_vector[offset+1] = save_offset2;
6205 md->offset_vector[md->offset_end - number] = save_offset3;
6207 RRETURN(MATCH_NOMATCH);
6210 /* Insufficient room for saving captured contents */
6215 /* Other types of node can be handled by a switch */
6219 case OP_BRA: /* Non-capturing bracket: optimized */
6220 DPRINTF(("start bracket 0\n"));
6223 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6226 ecode += GET(ecode, 1);
6228 while (*ecode == OP_ALT);
6229 DPRINTF(("bracket 0 failed\n"));
6230 RRETURN(MATCH_NOMATCH);
6232 /* Conditional group: compilation checked that there are no more than
6233 two branches. If the condition is false, skipping the first branch takes us
6234 past the end if there is only one branch, but that's OK because that is
6235 exactly what going to the ket would do. */
6238 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6240 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6241 condition = (offset == CREF_RECURSE * 2)?
6242 (md->recursive != NULL) :
6243 (offset < offset_top && md->offset_vector[offset] >= 0);
6244 RMATCH(rrc, eptr, ecode + (condition?
6245 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6246 offset_top, md, ims, eptrb, match_isgroup);
6250 /* The condition is an assertion. Call match() to evaluate it - setting
6251 the match_condassert flag causes it to stop at the end of an assertion. */
6255 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6256 match_condassert | match_isgroup);
6257 if (rrc == MATCH_MATCH)
6259 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6260 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6262 else if (rrc != MATCH_NOMATCH)
6264 RRETURN(rrc); /* Need braces because of following else */
6266 else ecode += GET(ecode, 1);
6267 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6271 /* Control never reaches here */
6273 /* Skip over conditional reference or large extraction number data if
6281 /* End of the pattern. If we are in a recursion, we should restore the
6282 offsets appropriately and continue from after the call. */
6285 if (md->recursive != NULL && md->recursive->group_num == 0)
6287 recursion_info *rec = md->recursive;
6288 DPRINTF(("Hit the end in a (?0) recursion\n"));
6289 md->recursive = rec->prevrec;
6290 memmove(md->offset_vector, rec->offset_save,
6291 rec->saved_max * sizeof(int));
6292 md->start_match = rec->save_start;
6294 ecode = rec->after_call;
6298 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6299 string - backtracking will then try other alternatives, if any. */
6301 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6302 md->end_match_ptr = eptr; /* Record where we ended */
6303 md->end_offset_top = offset_top; /* and how many extracts were taken */
6304 RRETURN(MATCH_MATCH);
6306 /* Change option settings */
6311 DPRINTF(("ims set to %02lx\n", ims));
6314 /* Assertion brackets. Check the alternative branches in turn - the
6315 matching won't pass the KET for an assertion. If any one branch matches,
6316 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6317 start of each branch to move the current point backwards, so the code at
6318 this level is identical to the lookahead case. */
6324 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6326 if (rrc == MATCH_MATCH) break;
6327 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6328 ecode += GET(ecode, 1);
6330 while (*ecode == OP_ALT);
6331 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6333 /* If checking an assertion for a condition, return MATCH_MATCH. */
6335 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6337 /* Continue from after the assertion, updating the offsets high water
6338 mark, since extracts may have been taken during the assertion. */
6340 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6341 ecode += 1 + LINK_SIZE;
6342 offset_top = md->end_offset_top;
6345 /* Negative assertion: all branches must fail to match */
6348 case OP_ASSERTBACK_NOT:
6351 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6353 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6354 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6355 ecode += GET(ecode,1);
6357 while (*ecode == OP_ALT);
6359 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6361 ecode += 1 + LINK_SIZE;
6364 /* Move the subject pointer back. This occurs only at the start of
6365 each branch of a lookbehind assertion. If we are too close to the start to
6366 move back, this match function fails. When working with UTF-8 we move
6367 back a number of characters, not bytes. */
6374 for (i = 0; i < c; i++)
6377 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6384 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6387 eptr -= GET(ecode,1);
6388 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6391 /* Skip to next op code */
6393 ecode += 1 + LINK_SIZE;
6396 /* The callout item calls an external function, if one is provided, passing
6397 details of the match so far. This is mainly for debugging, though the
6398 function is able to force a failure. */
6401 if (pcre_callout != NULL)
6403 pcre_callout_block cb;
6404 cb.version = 1; /* Version 1 of the callout block */
6405 cb.callout_number = ecode[1];
6406 cb.offset_vector = md->offset_vector;
6407 cb.subject = (const char *)md->start_subject;
6408 cb.subject_length = md->end_subject - md->start_subject;
6409 cb.start_match = md->start_match - md->start_subject;
6410 cb.current_position = eptr - md->start_subject;
6411 cb.pattern_position = GET(ecode, 2);
6412 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6413 cb.capture_top = offset_top/2;
6414 cb.capture_last = md->capture_last;
6415 cb.callout_data = md->callout_data;
6416 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6417 if (rrc < 0) RRETURN(rrc);
6419 ecode += 2 + 2*LINK_SIZE;
6422 /* Recursion either matches the current regex, or some subexpression. The
6423 offset data is the offset to the starting bracket from the start of the
6424 whole pattern. (This is so that it works from duplicated subpatterns.)
6426 If there are any capturing brackets started but not finished, we have to
6427 save their starting points and reinstate them after the recursion. However,
6428 we don't know how many such there are (offset_top records the completed
6429 total) so we just have to save all the potential data. There may be up to
6430 65535 such values, which is too large to put on the stack, but using malloc
6431 for small numbers seems expensive. As a compromise, the stack is used when
6432 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6433 is used. If the malloc fails, the recursion is abandoned and
6434 PCRE_ERROR_NOMEMORY is returned; any result other than MATCH_NOMATCH is
6435 passed straight back by the calling levels.
6437 There are also other values that have to be saved. We use a chained
6438 sequence of blocks that actually live on the stack. Thanks to Robin Houston
6439 for the original version of this logic. */
6443 callpat = md->start_code + GET(ecode, 1);
6444 new_recursive.group_num = *callpat - OP_BRA;
6446 /* For extended extraction brackets (large number), we have to fish out
6447 the number from a dummy opcode at the start. */
6449 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6450 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6452 /* Add to "recursing stack" */
6454 new_recursive.prevrec = md->recursive;
6455 md->recursive = &new_recursive;
6457 /* Find where to continue from afterwards */
6459 ecode += 1 + LINK_SIZE;
6460 new_recursive.after_call = ecode;
6462 /* Now save the offset data. */
6464 new_recursive.saved_max = md->offset_end;
6465 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6466 new_recursive.offset_save = stacksave;
6469 new_recursive.offset_save =
6470 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6471 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6474 memcpy(new_recursive.offset_save, md->offset_vector,
6475 new_recursive.saved_max * sizeof(int));
6476 new_recursive.save_start = md->start_match;
6477 md->start_match = eptr;
6479 /* OK, now we can do the recursion. For each top-level alternative we
6480 restore the offset and recursion data. */
6482 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6485 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6486 eptrb, match_isgroup);
6487 if (rrc == MATCH_MATCH)
6489 md->recursive = new_recursive.prevrec;
6490 if (new_recursive.offset_save != stacksave)
6491 (pcre_free)(new_recursive.offset_save);
6492 RRETURN(MATCH_MATCH);
6494 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6496 md->recursive = &new_recursive;
6497 memcpy(md->offset_vector, new_recursive.offset_save,
6498 new_recursive.saved_max * sizeof(int));
6499 callpat += GET(callpat, 1);
6501 while (*callpat == OP_ALT);
6503 DPRINTF(("Recursion didn't match\n"));
6504 md->recursive = new_recursive.prevrec;
6505 if (new_recursive.offset_save != stacksave)
6506 (pcre_free)(new_recursive.offset_save);
6507 RRETURN(MATCH_NOMATCH);
6509 /* Control never reaches here */
6511 /* "Once" brackets are like assertion brackets except that after a match,
6512 the point in the subject string is not moved back. Thus there can never be
6513 a move back into the brackets. Friedl calls these "atomic" subpatterns.
6514 Check the alternative branches in turn - the matching won't pass the KET
6515 for this kind of subpattern. If any one branch matches, we carry on as at
6516 the end of a normal bracket, leaving the subject pointer. */
6525 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6526 eptrb, match_isgroup);
6527 if (rrc == MATCH_MATCH) break;
6528 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6529 ecode += GET(ecode,1);
6531 while (*ecode == OP_ALT);
6533 /* If hit the end of the group (which could be repeated), fail */
6535 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6537 /* Continue as from after the assertion, updating the offsets high water
6538 mark, since extracts may have been taken. */
6540 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6542 offset_top = md->end_offset_top;
6543 eptr = md->end_match_ptr;
6545 /* For a non-repeating ket, just continue at this level. This also
6546 happens for a repeating ket if no characters were matched in the group.
6547 This is the forcible breaking of infinite loops as implemented in Perl
6548 5.005. If there is an options reset, it will get obeyed in the normal
6549 course of events. */
6551 if (*ecode == OP_KET || eptr == saved_eptr)
6553 ecode += 1+LINK_SIZE;
6557 /* The repeating kets try the rest of the pattern or restart from the
6558 preceding bracket, in the appropriate order. We need to reset any options
6559 that changed within the bracket before re-running it, so check the next
6562 if (ecode[1+LINK_SIZE] == OP_OPT)
6564 ims = (ims & ~PCRE_IMS) | ecode[4];
6565 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6568 if (*ecode == OP_KETRMIN)
6570 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6571 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6572 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6575 else /* OP_KETRMAX */
6577 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6579 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6580 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6583 RRETURN(MATCH_NOMATCH);
6585 /* An alternation is the end of a branch; scan along to find the end of the
6586 bracketed group and go to there. */
6589 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6592 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6593 that it may occur zero times. It may repeat infinitely, or not at all -
6594 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6595 repeat limits are compiled as a number of copies, with the optional ones
6596 preceded by BRAZERO or BRAMINZERO. */
6601 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6603 do next += GET(next,1); while (*next == OP_ALT);
6604 ecode = next + 1+LINK_SIZE;
6611 do next += GET(next,1); while (*next == OP_ALT);
6612 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6619 /* End of a group, repeated or non-repeating. If we are at the end of
6620 an assertion "group", stop matching and return MATCH_MATCH, but record the
6621 current high water mark for use by positive assertions. Do this also
6622 for the "once" (not backing up) groups. */
6628 prev = ecode - GET(ecode, 1);
6629 saved_eptr = eptrb->epb_saved_eptr;
6631 /* Back up the stack of bracket start pointers. */
6633 eptrb = eptrb->epb_prev;
6635 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6636 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6639 md->end_match_ptr = eptr; /* For ONCE */
6640 md->end_offset_top = offset_top;
6641 RRETURN(MATCH_MATCH);
6644 /* In all other cases except a conditional group we have to check the
6645 group number back at the start and if necessary complete handling an
6646 extraction by setting the offsets and bumping the high water mark. */
6648 if (*prev != OP_COND)
6650 number = *prev - OP_BRA;
6652 /* For extended extraction brackets (large number), we have to fish out
6653 the number from a dummy opcode at the start. */
6655 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6656 offset = number << 1;
6659 printf("end bracket %d", number);
6663 /* Test for a numbered group. This includes groups called as a result
6664 of recursion. Note that whole-pattern recursion is coded as a recurse
6665 into group 0, so it won't be picked up here. Instead, we catch it when
6666 the OP_END is reached. */
6670 md->capture_last = number;
6671 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6673 md->offset_vector[offset] =
6674 md->offset_vector[md->offset_end - number];
6675 md->offset_vector[offset+1] = eptr - md->start_subject;
6676 if (offset_top <= offset) offset_top = offset + 2;
6679 /* Handle a recursively called group. Restore the offsets
6680 appropriately and continue from after the call. */
6682 if (md->recursive != NULL && md->recursive->group_num == number)
6684 recursion_info *rec = md->recursive;
6685 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6686 md->recursive = rec->prevrec;
6687 md->start_match = rec->save_start;
6688 memcpy(md->offset_vector, rec->offset_save,
6689 rec->saved_max * sizeof(int));
6690 ecode = rec->after_call;
6697 /* Reset the value of the ims flags, in case they got changed during
6701 DPRINTF(("ims reset to %02lx\n", ims));
6703 /* For a non-repeating ket, just continue at this level. This also
6704 happens for a repeating ket if no characters were matched in the group.
6705 This is the forcible breaking of infinite loops as implemented in Perl
6706 5.005. If there is an options reset, it will get obeyed in the normal
6707 course of events. */
6709 if (*ecode == OP_KET || eptr == saved_eptr)
6711 ecode += 1 + LINK_SIZE;
6715 /* The repeating kets try the rest of the pattern or restart from the
6716 preceding bracket, in the appropriate order. */
6718 if (*ecode == OP_KETRMIN)
6720 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6722 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6725 else /* OP_KETRMAX */
6727 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6729 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6730 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6734 RRETURN(MATCH_NOMATCH);
6736 /* Start of subject unless notbol, or after internal newline if multiline */
6739 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6740 if ((ims & PCRE_MULTILINE) != 0)
6742 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6743 RRETURN(MATCH_NOMATCH);
6747 /* ... else fall through */
6749 /* Start of subject assertion */
6752 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6756 /* Start of match assertion */
6759 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6763 /* Assert before internal newline if multiline, or before a terminating
6764 newline unless endonly is set, else end of subject unless noteol is set. */
6767 if ((ims & PCRE_MULTILINE) != 0)
6769 if (eptr < md->end_subject)
6770 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6772 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6778 if (md->noteol) RRETURN(MATCH_NOMATCH);
6781 if (eptr < md->end_subject - 1 ||
6782 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6783 RRETURN(MATCH_NOMATCH);
6788 /* ... else fall through */
6790 /* End of subject assertion (\z) */
6793 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6797 /* End of subject or ending \n assertion (\Z) */
6800 if (eptr < md->end_subject - 1 ||
6801 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6805 /* Word boundary assertions */
6807 case OP_NOT_WORD_BOUNDARY:
6808 case OP_WORD_BOUNDARY:
6811 /* Find out if the previous and current characters are "word" characters.
6812 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6813 be "non-word" characters. */
6818 if (eptr == md->start_subject) prev_is_word = FALSE; else
6820 const uschar *lastptr = eptr - 1;
6821 while((*lastptr & 0xc0) == 0x80) lastptr--;
6822 GETCHAR(c, lastptr);
6823 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6825 if (eptr >= md->end_subject) cur_is_word = FALSE; else
6828 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6834 /* More streamlined when not in UTF-8 mode */
6837 prev_is_word = (eptr != md->start_subject) &&
6838 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6839 cur_is_word = (eptr < md->end_subject) &&
6840 ((md->ctypes[*eptr] & ctype_word) != 0);
6843 /* Now see if the situation is what we want */
6845 if ((*ecode++ == OP_WORD_BOUNDARY)?
6846 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6847 RRETURN(MATCH_NOMATCH);
6851 /* Match a single character type; inline for speed */
6854 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6855 RRETURN(MATCH_NOMATCH);
6856 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6859 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6864 /* Match a single byte, even in UTF-8 mode. This opcode really does match
6865 any byte, even newline, independent of the setting of PCRE_DOTALL. */
6868 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6873 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6874 GETCHARINCTEST(c, eptr);
6879 (md->ctypes[c] & ctype_digit) != 0
6881 RRETURN(MATCH_NOMATCH);
6886 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6887 GETCHARINCTEST(c, eptr);
6892 (md->ctypes[c] & ctype_digit) == 0
6894 RRETURN(MATCH_NOMATCH);
6898 case OP_NOT_WHITESPACE:
6899 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6900 GETCHARINCTEST(c, eptr);
6905 (md->ctypes[c] & ctype_space) != 0
6907 RRETURN(MATCH_NOMATCH);
6912 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6913 GETCHARINCTEST(c, eptr);
6918 (md->ctypes[c] & ctype_space) == 0
6920 RRETURN(MATCH_NOMATCH);
6924 case OP_NOT_WORDCHAR:
6925 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6926 GETCHARINCTEST(c, eptr);
6931 (md->ctypes[c] & ctype_word) != 0
6933 RRETURN(MATCH_NOMATCH);
6938 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6939 GETCHARINCTEST(c, eptr);
6944 (md->ctypes[c] & ctype_word) == 0
6946 RRETURN(MATCH_NOMATCH);
6951 /* Check the next character by Unicode property. We will get here only
6952 if the support is in the binary; otherwise a compile-time error occurs. */
6956 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6957 GETCHARINCTEST(c, eptr);
6959 int chartype, rqdtype;
6961 int category = ucp_findchar(c, &chartype, &othercase);
6963 rqdtype = *(++ecode);
6968 if ((rqdtype - 128 != category) == (op == OP_PROP))
6969 RRETURN(MATCH_NOMATCH);
6973 if ((rqdtype != chartype) == (op == OP_PROP))
6974 RRETURN(MATCH_NOMATCH);
6979 /* Match an extended Unicode sequence. We will get here only if the support
6980 is in the binary; otherwise a compile-time error occurs. */
6983 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6984 GETCHARINCTEST(c, eptr);
6988 int category = ucp_findchar(c, &chartype, &othercase);
6989 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6990 while (eptr < md->end_subject)
6993 if (!md->utf8) c = *eptr; else
6995 GETCHARLEN(c, eptr, len);
6997 category = ucp_findchar(c, &chartype, &othercase);
6998 if (category != ucp_M) break;
7007 /* Match a back reference, possibly repeatedly. Look past the end of the
7008 item to see if there is repeat information following. The code is similar
7009 to that for character classes, but repeated for efficiency. Then obey
7010 similar code to character type repeats - written out again for speed.
7011 However, if the referenced string is the empty string, always treat
7012 it as matched, any number of times (otherwise there could be infinite
7017 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7018 ecode += 3; /* Advance past item */
7020 /* If the reference is unset, set the length to be longer than the amount
7021 of subject left; this ensures that every attempt at a match fails. We
7022 can't just fail here, because of the possibility of quantifiers with zero
7025 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7026 md->end_subject - eptr + 1 :
7027 md->offset_vector[offset+1] - md->offset_vector[offset];
7029 /* Set up for repetition, or handle the non-repeated case */
7039 c = *ecode++ - OP_CRSTAR;
7040 minimize = (c & 1) != 0;
7041 min = rep_min[c]; /* Pick up values from tables; */
7042 max = rep_max[c]; /* zero for max => infinity */
7043 if (max == 0) max = INT_MAX;
7048 minimize = (*ecode == OP_CRMINRANGE);
7049 min = GET2(ecode, 1);
7050 max = GET2(ecode, 3);
7051 if (max == 0) max = INT_MAX;
7055 default: /* No repeat follows */
7056 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7058 continue; /* With the main loop */
7061 /* If the length of the reference is zero, just continue with the
7064 if (length == 0) continue;
7066 /* First, ensure the minimum number of matches are present. We get back
7067 the length of the reference string explicitly rather than passing the
7068 address of eptr, so that eptr can be a register variable. */
7070 for (i = 1; i <= min; i++)
7072 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7076 /* If min = max, continue at the same level without recursion.
7077 They are not both allowed to be zero. */
7079 if (min == max) continue;
7081 /* If minimizing, keep trying and advancing the pointer */
7085 for (fi = min;; fi++)
7087 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7088 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7089 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7090 RRETURN(MATCH_NOMATCH);
7093 /* Control never gets here */
7096 /* If maximizing, find the longest string and work backwards */
7101 for (i = min; i < max; i++)
7103 if (!match_ref(offset, eptr, length, md, ims)) break;
7108 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7109 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7112 RRETURN(MATCH_NOMATCH);
7115 /* Control never gets here */
7119 /* Match a bit-mapped character class, possibly repeatedly. This op code is
7120 used when all the characters in the class have values in the range 0-255,
7121 and either the matching is caseful, or the characters are in the range
7122 0-127 when UTF-8 processing is enabled. The only difference between
7123 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7126 First, look past the end of the item to see if there is repeat information
7127 following. Then obey similar code to character type repeats - written out
7133 data = ecode + 1; /* Save for matching */
7134 ecode += 33; /* Advance past the item */
7144 c = *ecode++ - OP_CRSTAR;
7145 minimize = (c & 1) != 0;
7146 min = rep_min[c]; /* Pick up values from tables; */
7147 max = rep_max[c]; /* zero for max => infinity */
7148 if (max == 0) max = INT_MAX;
7153 minimize = (*ecode == OP_CRMINRANGE);
7154 min = GET2(ecode, 1);
7155 max = GET2(ecode, 3);
7156 if (max == 0) max = INT_MAX;
7160 default: /* No repeat follows */
7165 /* First, ensure the minimum number of matches are present. */
7171 for (i = 1; i <= min; i++)
7173 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7174 GETCHARINC(c, eptr);
7177 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7181 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7187 /* Not UTF-8 mode */
7189 for (i = 1; i <= min; i++)
7191 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7193 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7197 /* If max == min we can continue with the main loop without the
7200 if (min == max) continue;
7202 /* If minimizing, keep testing the rest of the expression and advancing
7203 the pointer while it matches the class. */
7211 for (fi = min;; fi++)
7213 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7215 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7216 GETCHARINC(c, eptr);
7219 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7223 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7229 /* Not UTF-8 mode */
7231 for (fi = min;; fi++)
7233 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7234 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7235 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7237 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7240 /* Control never gets here */
7243 /* If maximizing, find the longest possible run, then work backwards. */
7253 for (i = min; i < max; i++)
7256 if (eptr >= md->end_subject) break;
7257 GETCHARLEN(c, eptr, len);
7260 if (op == OP_CLASS) break;
7264 if ((data[c/8] & (1 << (c&7))) == 0) break;
7270 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7271 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7272 if (eptr-- == pp) break; /* Stop if tried at original pos */
7278 /* Not UTF-8 mode */
7280 for (i = min; i < max; i++)
7282 if (eptr >= md->end_subject) break;
7284 if ((data[c/8] & (1 << (c&7))) == 0) break;
7289 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7291 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7295 RRETURN(MATCH_NOMATCH);
7298 /* Control never gets here */
7301 /* Match an extended character class. This opcode is encountered only
7302 in UTF-8 mode, because that's the only time it is compiled. */
7307 data = ecode + 1 + LINK_SIZE; /* Save for matching */
7308 ecode += GET(ecode, 1); /* Advance past the item */
7318 c = *ecode++ - OP_CRSTAR;
7319 minimize = (c & 1) != 0;
7320 min = rep_min[c]; /* Pick up values from tables; */
7321 max = rep_max[c]; /* zero for max => infinity */
7322 if (max == 0) max = INT_MAX;
7327 minimize = (*ecode == OP_CRMINRANGE);
7328 min = GET2(ecode, 1);
7329 max = GET2(ecode, 3);
7330 if (max == 0) max = INT_MAX;
7334 default: /* No repeat follows */
7339 /* First, ensure the minimum number of matches are present. */
7341 for (i = 1; i <= min; i++)
7343 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7344 GETCHARINC(c, eptr);
7345 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7348 /* If max == min we can continue with the main loop without the
7351 if (min == max) continue;
7353 /* If minimizing, keep testing the rest of the expression and advancing
7354 the pointer while it matches the class. */
7358 for (fi = min;; fi++)
7360 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7362 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7363 GETCHARINC(c, eptr);
7364 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7366 /* Control never gets here */
7369 /* If maximizing, find the longest possible run, then work backwards. */
7374 for (i = min; i < max; i++)
7377 if (eptr >= md->end_subject) break;
7378 GETCHARLEN(c, eptr, len);
7379 if (!match_xclass(c, data)) break;
7384 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7385 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7386 if (eptr-- == pp) break; /* Stop if tried at original pos */
7389 RRETURN(MATCH_NOMATCH);
7392 /* Control never gets here */
7394 #endif /* End of XCLASS */
7396 /* Match a single character, casefully */
7404 GETCHARLEN(fc, ecode, length);
7405 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7406 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7411 /* Non-UTF-8 mode */
7413 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7414 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7419 /* Match a single character, caselessly */
7427 GETCHARLEN(fc, ecode, length);
7429 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7431 /* If the pattern character's value is < 128, we have only one byte, and
7432 can use the fast lookup table. */
7436 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7439 /* Otherwise we must pick up the subject character */
7444 GETCHARINC(dc, eptr);
7447 /* If we have Unicode property support, we can use it to test the other
7448 case of the character, if there is one. The result of ucp_findchar() is
7449 < 0 if the char isn't found, and othercase is returned as zero if there
7457 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7459 RRETURN(MATCH_NOMATCH);
7464 #endif /* SUPPORT_UTF8 */
7466 /* Non-UTF-8 mode */
7468 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7469 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7474 /* Match a single character repeatedly; different opcodes share code. */
7477 min = max = GET2(ecode, 1);
7484 max = GET2(ecode, 1);
7485 minimize = *ecode == OP_MINUPTO;
7495 c = *ecode++ - OP_STAR;
7496 minimize = (c & 1) != 0;
7497 min = rep_min[c]; /* Pick up values from tables; */
7498 max = rep_max[c]; /* zero for max => infinity */
7499 if (max == 0) max = INT_MAX;
7501 /* Common code for all repeated single-character matches. We can give
7502 up quickly if there are fewer than the minimum number of characters left in
7511 GETCHARLEN(fc, ecode, length);
7512 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7515 /* Handle multibyte character matching specially here. There is
7516 support for caseless matching if UCP support is present. */
7526 if ((ims & PCRE_CASELESS) != 0 &&
7527 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7529 oclength = ord2utf8(othercase, occhars);
7530 #endif /* SUPPORT_UCP */
7532 for (i = 1; i <= min; i++)
7534 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7535 /* Need braces because of following else */
7536 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7539 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7544 if (min == max) continue;
7548 for (fi = min;; fi++)
7550 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7551 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7552 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7553 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7554 /* Need braces because of following else */
7555 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7558 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7562 /* Control never gets here */
7567 for (i = min; i < max; i++)
7569 if (eptr > md->end_subject - length) break;
7570 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7571 else if (oclength == 0) break;
7574 if (memcmp(eptr, occhars, oclength) != 0) break;
7580 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7581 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7584 RRETURN(MATCH_NOMATCH);
7586 /* Control never gets here */
7589 /* If the length of a UTF-8 character is 1, we fall through here, and
7590 obey the code as for non-UTF-8 characters below, though in this case the
7591 value of fc will always be < 128. */
7594 #endif /* SUPPORT_UTF8 */
7596 /* When not in UTF-8 mode, load a single-byte character. */
7598 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7602 /* The value of fc at this point is always less than 256, though we may or
7603 may not be in UTF-8 mode. The code is duplicated for the caseless and
7604 caseful cases, for speed, since matching characters is likely to be quite
7605 common. First, ensure the minimum number of matches are present. If min =
7606 max, continue at the same level without recursing. Otherwise, if
7607 minimizing, keep trying the rest of the expression and advancing one
7608 matching character if failing, up to the maximum. Alternatively, if
7609 maximizing, find the maximum number of characters and work backwards. */
7611 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7614 if ((ims & PCRE_CASELESS) != 0)
7617 for (i = 1; i <= min; i++)
7618 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7619 if (min == max) continue;
7622 for (fi = min;; fi++)
7624 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7625 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7626 if (fi >= max || eptr >= md->end_subject ||
7627 fc != md->lcc[*eptr++])
7628 RRETURN(MATCH_NOMATCH);
7630 /* Control never gets here */
7635 for (i = min; i < max; i++)
7637 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7642 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7644 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7646 RRETURN(MATCH_NOMATCH);
7648 /* Control never gets here */
7651 /* Caseful comparisons (includes all multi-byte characters) */
7655 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7656 if (min == max) continue;
7659 for (fi = min;; fi++)
7661 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7663 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7664 RRETURN(MATCH_NOMATCH);
7666 /* Control never gets here */
7671 for (i = min; i < max; i++)
7673 if (eptr >= md->end_subject || fc != *eptr) break;
7678 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7682 RRETURN(MATCH_NOMATCH);
7685 /* Control never gets here */
7687 /* Match a negated single one-byte character. The character we are
7688 checking can be multibyte. */
7691 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7693 GETCHARINCTEST(c, eptr);
7694 if ((ims & PCRE_CASELESS) != 0)
7700 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7704 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7708 /* Match a negated single one-byte character repeatedly. This is almost a
7709 repeat of the code for a repeated single character, but I haven't found a
7710 nice way of commoning these up that doesn't require a test of the
7711 positive/negative option for each character match. Maybe that wouldn't add
7712 very much to the time taken, but character matching *is* what this is all
7713 about. */
7716 min = max = GET2(ecode, 1);
7723 max = GET2(ecode, 1);
7724 minimize = *ecode == OP_NOTMINUPTO;
7733 case OP_NOTMINQUERY:
7734 c = *ecode++ - OP_NOTSTAR;
7735 minimize = (c & 1) != 0;
7736 min = rep_min[c]; /* Pick up values from tables; */
7737 max = rep_max[c]; /* zero for max => infinity */
7738 if (max == 0) max = INT_MAX;
7740 /* Common code for all repeated single-byte matches. We can give up quickly
7741 if there are fewer than the minimum number of bytes left in the
7742 subject. */
7745 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7748 /* The code is duplicated for the caseless and caseful cases, for speed,
7749 since matching characters is likely to be quite common. First, ensure the
7750 minimum number of matches are present. If min = max, continue at the same
7751 level without recursing. Otherwise, if minimizing, keep trying the rest of
7752 the expression and advancing one matching character if failing, up to the
7753 maximum. Alternatively, if maximizing, find the maximum number of
7754 characters and work backwards. */
7756 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7759 if ((ims & PCRE_CASELESS) != 0)
7768 for (i = 1; i <= min; i++)
7770 GETCHARINC(d, eptr);
7771 if (d < 256) d = md->lcc[d];
7772 if (fc == d) RRETURN(MATCH_NOMATCH);
7778 /* Not UTF-8 mode */
7780 for (i = 1; i <= min; i++)
7781 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7784 if (min == max) continue;
7793 for (fi = min;; fi++)
7795 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* Test the repeat limit and the end of the subject BEFORE fetching the next
character. Fetching first reads beyond the end of the subject, and testing eptr
after it has been advanced wrongly fails a repeat that consumes the final
character of the subject. */
7797 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7798 GETCHARINC(d, eptr);
7799 if (d < 256) d = md->lcc[d];
7800 if (fc == d) RRETURN(MATCH_NOMATCH);
7805 /* Not UTF-8 mode */
7807 for (fi = min;; fi++)
7809 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7811 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7812 RRETURN(MATCH_NOMATCH);
7815 /* Control never gets here */
7829 for (i = min; i < max; i++)
7832 if (eptr >= md->end_subject) break;
7833 GETCHARLEN(d, eptr, len);
7834 if (d < 256) d = md->lcc[d];
7840 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7842 if (eptr-- == pp) break; /* Stop if tried at original pos */
7848 /* Not UTF-8 mode */
7850 for (i = min; i < max; i++)
7852 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7857 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7863 RRETURN(MATCH_NOMATCH);
7865 /* Control never gets here */
7868 /* Caseful comparisons */
7877 for (i = 1; i <= min; i++)
7879 GETCHARINC(d, eptr);
7880 if (fc == d) RRETURN(MATCH_NOMATCH);
7885 /* Not UTF-8 mode */
7887 for (i = 1; i <= min; i++)
7888 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7891 if (min == max) continue;
7900 for (fi = min;; fi++)
7902 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* Test the repeat limit and the end of the subject BEFORE fetching the next
character. Fetching first reads beyond the end of the subject, and testing eptr
after it has been advanced wrongly fails a repeat that consumes the final
character of the subject. */
7904 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7905 GETCHARINC(d, eptr);
7906 if (fc == d) RRETURN(MATCH_NOMATCH);
7911 /* Not UTF-8 mode */
7913 for (fi = min;; fi++)
7915 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7917 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7918 RRETURN(MATCH_NOMATCH);
7921 /* Control never gets here */
7935 for (i = min; i < max; i++)
7938 if (eptr >= md->end_subject) break;
7939 GETCHARLEN(d, eptr, len);
7945 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7947 if (eptr-- == pp) break; /* Stop if tried at original pos */
7953 /* Not UTF-8 mode */
7955 for (i = min; i < max; i++)
7957 if (eptr >= md->end_subject || fc == *eptr) break;
7962 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7968 RRETURN(MATCH_NOMATCH);
7971 /* Control never gets here */
7973 /* Match a single character type repeatedly; several different opcodes
7974 share code. This is very similar to the code for single characters, but we
7975 repeat it in the interests of efficiency. */
7978 min = max = GET2(ecode, 1);
7984 case OP_TYPEMINUPTO:
7986 max = GET2(ecode, 1);
7987 minimize = *ecode == OP_TYPEMINUPTO;
7992 case OP_TYPEMINSTAR:
7994 case OP_TYPEMINPLUS:
7996 case OP_TYPEMINQUERY:
7997 c = *ecode++ - OP_TYPESTAR;
7998 minimize = (c & 1) != 0;
7999 min = rep_min[c]; /* Pick up values from tables; */
8000 max = rep_max[c]; /* zero for max => infinity */
8001 if (max == 0) max = INT_MAX;
8003 /* Common code for all repeated single character type matches. Note that
8004 in UTF-8 mode, '.' matches a character of any length, but for the other
8005 character types, the valid characters are all one-byte long. */
8008 ctype = *ecode++; /* Code for the character type */
8011 if (ctype == OP_PROP || ctype == OP_NOTPROP)
8013 prop_fail_result = ctype == OP_NOTPROP;
8014 prop_type = *ecode++;
8015 if (prop_type >= 128)
8017 prop_test_against = prop_type - 128;
8018 prop_test_variable = &prop_category;
8022 prop_test_against = prop_type;
8023 prop_test_variable = &prop_chartype;
8026 else prop_type = -1;
8029 /* First, ensure the minimum number of matches are present. Use inline
8030 code for maximizing the speed, and do the type test once at the start
8031 (i.e. keep it out of the loop). Also we can test that there are at least
8032 the minimum number of bytes before we start. This isn't as effective in
8033 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8034 is tidier. Also separate the UCP code, which can be the same for both UTF-8
8035 and single-bytes. */
8037 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8043 for (i = 1; i <= min; i++)
8045 GETCHARINC(c, eptr);
8046 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8047 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8048 RRETURN(MATCH_NOMATCH);
8052 /* Match extended Unicode sequences. We will get here only if the
8053 support is in the binary; otherwise a compile-time error occurs. */
8055 else if (ctype == OP_EXTUNI)
8057 for (i = 1; i <= min; i++)
8059 GETCHARINCTEST(c, eptr);
8060 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8061 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8062 while (eptr < md->end_subject)
8065 if (!md->utf8) c = *eptr; else
8067 GETCHARLEN(c, eptr, len);
8069 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8070 if (prop_category != ucp_M) break;
8077 #endif /* SUPPORT_UCP */
8079 /* Handle all other cases when the coding is UTF-8 */
8082 if (md->utf8) switch(ctype)
8085 for (i = 1; i <= min; i++)
8087 if (eptr >= md->end_subject ||
8088 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8089 RRETURN(MATCH_NOMATCH);
8090 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8099 for (i = 1; i <= min; i++)
8101 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8102 GETCHARINC(c, eptr);
8103 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8104 RRETURN(MATCH_NOMATCH);
8109 for (i = 1; i <= min; i++)
8111 if (eptr >= md->end_subject ||
8112 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8113 RRETURN(MATCH_NOMATCH);
8114 /* No need to skip more bytes - we know it's a 1-byte character */
8118 case OP_NOT_WHITESPACE:
8119 for (i = 1; i <= min; i++)
/* The pointer must advance for every character, not only for those below 128.
The table lookup therefore leaves eptr alone, and the loop that follows steps
over the first byte before skipping any UTF-8 continuation bytes. */
8121 if (eptr >= md->end_subject ||
8122 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
8123 RRETURN(MATCH_NOMATCH);
8124 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
8129 for (i = 1; i <= min; i++)
8131 if (eptr >= md->end_subject ||
8132 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8133 RRETURN(MATCH_NOMATCH);
8134 /* No need to skip more bytes - we know it's a 1-byte character */
8138 case OP_NOT_WORDCHAR:
8139 for (i = 1; i <= min; i++)
/* The pointer must advance for every character, not only for those below 128.
The table lookup therefore leaves eptr alone, and the loop that follows steps
over the first byte before skipping any UTF-8 continuation bytes. */
8141 if (eptr >= md->end_subject ||
8142 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
8143 RRETURN(MATCH_NOMATCH);
8144 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
8149 for (i = 1; i <= min; i++)
8151 if (eptr >= md->end_subject ||
8152 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8153 RRETURN(MATCH_NOMATCH);
8154 /* No need to skip more bytes - we know it's a 1-byte character */
8159 RRETURN(PCRE_ERROR_INTERNAL);
8160 } /* End switch(ctype) */
8163 #endif /* SUPPORT_UTF8 */
8165 /* Code for the non-UTF-8 case for minimum matching of operators other
8166 than OP_PROP and OP_NOTPROP. */
8171 if ((ims & PCRE_DOTALL) == 0)
8173 for (i = 1; i <= min; i++)
8174 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8184 for (i = 1; i <= min; i++)
8185 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8189 for (i = 1; i <= min; i++)
8190 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8193 case OP_NOT_WHITESPACE:
8194 for (i = 1; i <= min; i++)
8195 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8199 for (i = 1; i <= min; i++)
8200 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8203 case OP_NOT_WORDCHAR:
8204 for (i = 1; i <= min; i++)
8205 if ((md->ctypes[*eptr++] & ctype_word) != 0)
8206 RRETURN(MATCH_NOMATCH);
8210 for (i = 1; i <= min; i++)
8211 if ((md->ctypes[*eptr++] & ctype_word) == 0)
8212 RRETURN(MATCH_NOMATCH);
8216 RRETURN(PCRE_ERROR_INTERNAL);
8220 /* If min = max, continue at the same level without recursing */
8222 if (min == max) continue;
8224 /* If minimizing, we have to test the rest of the pattern before each
8225 subsequent match. Again, separate the UTF-8 case for speed, and also
8226 separate the UCP cases. */
8233 for (fi = min;; fi++)
8235 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8236 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8237 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8238 GETCHARINC(c, eptr);
8239 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8240 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8241 RRETURN(MATCH_NOMATCH);
8245 /* Match extended Unicode sequences. We will get here only if the
8246 support is in the binary; otherwise a compile-time error occurs. */
8248 else if (ctype == OP_EXTUNI)
8250 for (fi = min;; fi++)
8252 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8254 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8255 GETCHARINCTEST(c, eptr);
8256 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8257 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8258 while (eptr < md->end_subject)
8261 if (!md->utf8) c = *eptr; else
8263 GETCHARLEN(c, eptr, len);
8265 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8266 if (prop_category != ucp_M) break;
8273 #endif /* SUPPORT_UCP */
8279 for (fi = min;; fi++)
8281 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8283 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8285 GETCHARINC(c, eptr);
8289 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8296 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8297 RRETURN(MATCH_NOMATCH);
8301 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8302 RRETURN(MATCH_NOMATCH);
8305 case OP_NOT_WHITESPACE:
8306 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8307 RRETURN(MATCH_NOMATCH);
8311 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8312 RRETURN(MATCH_NOMATCH);
8315 case OP_NOT_WORDCHAR:
8316 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8317 RRETURN(MATCH_NOMATCH);
/* A character of 256 or more is never a word character, and the ctypes table
must be consulted only for smaller values, so the two tests are joined with
"or" exactly as in the digit and whitespace cases. */
8321 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
8322 RRETURN(MATCH_NOMATCH);
8326 RRETURN(PCRE_ERROR_INTERNAL);
8332 /* Not UTF-8 mode */
8334 for (fi = min;; fi++)
8336 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8338 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8343 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8350 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8354 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8357 case OP_NOT_WHITESPACE:
8358 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8362 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8365 case OP_NOT_WORDCHAR:
8366 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8370 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8374 RRETURN(PCRE_ERROR_INTERNAL);
8378 /* Control never gets here */
8381 /* If maximizing it is worth using inline code for speed, doing the type
8382 test once at the start (i.e. keep it out of the loop). Again, keep the
8383 UTF-8 and UCP stuff separate. */
8387 pp = eptr; /* Remember where we started */
8392 for (i = min; i < max; i++)
8395 if (eptr >= md->end_subject) break;
8396 GETCHARLEN(c, eptr, len);
8397 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8398 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8403 /* eptr is now past the end of the maximum run */
8407 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8408 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8409 if (eptr-- == pp) break; /* Stop if tried at original pos */
8414 /* Match extended Unicode sequences. We will get here only if the
8415 support is in the binary; otherwise a compile-time error occurs. */
8417 else if (ctype == OP_EXTUNI)
8419 for (i = min; i < max; i++)
8421 if (eptr >= md->end_subject) break;
8422 GETCHARINCTEST(c, eptr);
8423 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8424 if (prop_category == ucp_M) break;
8425 while (eptr < md->end_subject)
8428 if (!md->utf8) c = *eptr; else
8430 GETCHARLEN(c, eptr, len);
8432 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8433 if (prop_category != ucp_M) break;
8438 /* eptr is now past the end of the maximum run */
8442 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8444 if (eptr-- == pp) break; /* Stop if tried at original pos */
8445 for (;;) /* Move back over one extended */
8449 if (!md->utf8) c = *eptr; else
8451 GETCHARLEN(c, eptr, len);
8453 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8454 if (prop_category != ucp_M) break;
8461 #endif /* SUPPORT_UCP */
8472 /* Special code is required for UTF8, but when the maximum is unlimited
8473 we don't need it, so we repeat the non-UTF8 code. This is probably
8474 worth it, because .* is quite a common idiom. */
8478 if ((ims & PCRE_DOTALL) == 0)
8480 for (i = min; i < max; i++)
8482 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8484 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8489 for (i = min; i < max; i++)
8492 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8497 /* Handle unlimited UTF-8 repeat */
8501 if ((ims & PCRE_DOTALL) == 0)
8503 for (i = min; i < max; i++)
8505 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8513 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8519 /* The byte case is the same as non-UTF8 */
8523 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8528 for (i = min; i < max; i++)
8531 if (eptr >= md->end_subject) break;
8532 GETCHARLEN(c, eptr, len);
8533 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8539 for (i = min; i < max; i++)
8542 if (eptr >= md->end_subject) break;
8543 GETCHARLEN(c, eptr, len);
8544 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8549 case OP_NOT_WHITESPACE:
8550 for (i = min; i < max; i++)
8553 if (eptr >= md->end_subject) break;
8554 GETCHARLEN(c, eptr, len);
8555 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8561 for (i = min; i < max; i++)
8564 if (eptr >= md->end_subject) break;
8565 GETCHARLEN(c, eptr, len);
8566 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8571 case OP_NOT_WORDCHAR:
8572 for (i = min; i < max; i++)
8575 if (eptr >= md->end_subject) break;
8576 GETCHARLEN(c, eptr, len);
8577 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8583 for (i = min; i < max; i++)
8586 if (eptr >= md->end_subject) break;
8587 GETCHARLEN(c, eptr, len);
8588 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8594 RRETURN(PCRE_ERROR_INTERNAL);
8597 /* eptr is now past the end of the maximum run */
8601 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8603 if (eptr-- == pp) break; /* Stop if tried at original pos */
8610 /* Not UTF-8 mode */
8615 if ((ims & PCRE_DOTALL) == 0)
8617 for (i = min; i < max; i++)
8619 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8624 /* For DOTALL case, fall through and treat as \C */
8628 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8633 for (i = min; i < max; i++)
8635 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8642 for (i = min; i < max; i++)
8644 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8650 case OP_NOT_WHITESPACE:
8651 for (i = min; i < max; i++)
8653 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8660 for (i = min; i < max; i++)
8662 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8668 case OP_NOT_WORDCHAR:
8669 for (i = min; i < max; i++)
8671 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8678 for (i = min; i < max; i++)
8680 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8687 RRETURN(PCRE_ERROR_INTERNAL);
8690 /* eptr is now past the end of the maximum run */
8694 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8696 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8700 /* Get here if we can't make it match with any permitted repetitions */
8702 RRETURN(MATCH_NOMATCH);
8704 /* Control never gets here */
8706 /* There's been some horrible disaster. Since all codes > OP_BRA are
8707 for capturing brackets, and there shouldn't be any gaps between 0 and
8708 OP_BRA, arrival here can only mean there is something seriously wrong
8709 in the code above or the OP_xxx definitions. */
8712 DPRINTF(("Unknown opcode %d\n", *ecode));
8713 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8716 /* Do not stick any code in here without much thought; it is assumed
8717 that "continue" in the code above comes out to here to repeat the main
8718 loop. */
8721 /* Control never reaches here */
8725 /***************************************************************************
8726 ****************************************************************************
8727 RECURSION IN THE match() FUNCTION
8729 Undefine all the macros that were defined above to handle this. */
8747 #undef new_recursive
8763 #undef save_capture_last
8773 /* These two are defined as macros in both cases */
8778 /***************************************************************************
8779 ***************************************************************************/
8783 /*************************************************
8784 * Execute a Regular Expression *
8785 *************************************************/
8787 /* This function applies a compiled re to a subject string and picks out
8788 portions of the string if it matches. Two elements in the vector are set for
8789 each substring: the offsets to the start and end of the substring.
8791 Arguments:
8792 argument_re points to the compiled expression
8793 extra_data points to extra data or is NULL
8794 subject points to the subject string
8795 length length of subject string (may contain binary zeros)
8796 start_offset where to start in the subject string
8797 options option bits
8798 offsets points to a vector of ints to be filled in with offsets
8799 offsetcount the number of elements in the vector
8801 Returns: > 0 => success; value is the number of elements filled in
8802 = 0 => success, but offsets is not big enough
8803 -1 => failed to match
8804 < -1 => some kind of unexpected problem
8805 */
/* pcre_exec(): run a compiled pattern against a subject string.

Outline of the work done below:
  1. Sanity-check the arguments.
  2. Pick up the optional extra_data fields (study data, match limit, callout
     data, character tables) selected by extra_data->flags.
  3. Fill in the match_data block (match_block) that is handed to match().
  4. Decide whether the caller-supplied offsets vector can be used directly or
     whether a temporary one has to be obtained with pcre_malloc.
  5. Call match() from successive start positions; the loop repeats while the
     pattern is not anchored and start_match has not passed end_subject.

Negative return values that can be seen in this function:
  PCRE_ERROR_BADOPTION       options has bits outside PUBLIC_EXEC_OPTIONS
  PCRE_ERROR_NULL            re or subject is NULL, or offsets is NULL while
                             offsetcount is greater than zero
  PCRE_ERROR_BADCOUNT        offsetcount is negative
  PCRE_ERROR_BADMAGIC        wrong magic number and try_flipped() gave NULL
  PCRE_ERROR_BADPARTIAL      PCRE_PARTIAL given for a PCRE_NOPARTIAL pattern
  PCRE_ERROR_BADUTF8         valid_utf8() returned a value >= 0
  PCRE_ERROR_BADUTF8_OFFSET  the byte at start_offset failed its check
  PCRE_ERROR_NOMEMORY        pcre_malloc failed for the temporary vector
  PCRE_ERROR_PARTIAL         no match, partial matching on, hitend set
  PCRE_ERROR_NOMATCH         no match

On a match, rc is set to end_offset_top/2, or to 0 when offset_overflow is set
or offsetcount is less than 2; when offsetcount is at least 2, offsets[0] and
offsets[1] receive the start and end of the whole match relative to the start
of the subject. */
8808 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8809 const char *subject, int length, int start_offset, int options, int *offsets,
8812 int rc, resetcount, ocount;
8813 int first_byte = -1;
8816 unsigned long int ims = 0;
8817 BOOL using_temporary_offsets = FALSE;
8820 BOOL first_byte_caseless = FALSE;
8821 BOOL req_byte_caseless = FALSE;
8822 match_data match_block;
8823 const uschar *tables;
8824 const uschar *start_bits = NULL;
8825 const uschar *start_match = (const uschar *)subject + start_offset;
8826 const uschar *end_subject;
8827 const uschar *req_byte_ptr = start_match - 1;
8829 pcre_study_data internal_study;
8830 const pcre_study_data *study;
8832 real_pcre internal_re;
8833 const real_pcre *external_re = (const real_pcre *)argument_re;
8834 const real_pcre *re = external_re;
8836 /* Plausibility checks */
8838 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8839 if (re == NULL || subject == NULL ||
8840 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8841 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8843 /* Fish out the optional data from the extra_data structure, first setting
8844 the default values. */
8847 match_block.match_limit = MATCH_LIMIT;
8848 match_block.callout_data = NULL;
8850 /* The table pointer is always in native byte order. */
8852 tables = external_re->tables;
8854 if (extra_data != NULL)
8856 register unsigned int flags = extra_data->flags;
8857 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8858 study = (const pcre_study_data *)extra_data->study_data;
8859 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8860 match_block.match_limit = extra_data->match_limit;
8861 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8862 match_block.callout_data = extra_data->callout_data;
8863 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8866 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8867 is a feature that makes it possible to save compiled regex and re-use them
8868 in other programs later. */
8870 if (tables == NULL) tables = pcre_default_tables;
8872 /* Check that the first field in the block is the magic number. If it is not,
8873 test for a regex that was compiled on a host of opposite endianness. If this is
8874 the case, flipped values are put in internal_re and internal_study if there was
8877 if (re->magic_number != MAGIC_NUMBER)
8879 re = try_flipped(re, &internal_re, study, &internal_study);
8880 if (re == NULL) return PCRE_ERROR_BADMAGIC;
8881 if (study != NULL) study = &internal_study;
8884 /* Set up other data */
/* Anchoring can come from the compiled pattern or from the options given to
this call; startline is taken from the compiled pattern only. */
8886 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8887 startline = (re->options & PCRE_STARTLINE) != 0;
8889 /* The code starts after the real_pcre block and the capture name table. */
/* The base address used is external_re, the block passed in by the caller,
while the sizes are read through re, which may have been reassigned from the
result of try_flipped(). */
8891 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8892 re->name_count * re->name_entry_size;
8894 match_block.start_subject = (const uschar *)subject;
8895 match_block.start_offset = start_offset;
8896 match_block.end_subject = match_block.start_subject + length;
8897 end_subject = match_block.end_subject;
8899 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8900 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8902 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8903 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8904 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8905 match_block.partial = (options & PCRE_PARTIAL) != 0;
8906 match_block.hitend = FALSE;
8908 match_block.recursive = NULL; /* No recursion at top level */
8910 match_block.lcc = tables + lcc_offset;
8911 match_block.ctypes = tables + ctypes_offset;
8913 /* Partial matching is supported only for a restricted set of regexes at the
8916 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8917 return PCRE_ERROR_BADPARTIAL;
8919 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8920 back the character offset. */
8923 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8925 if (valid_utf8((uschar *)subject, length) >= 0)
8926 return PCRE_ERROR_BADUTF8;
/* NOTE(review): the byte at start_offset is inspected next; this looks like a
check that the offset does not land in the middle of a UTF-8 character -
confirm. */
8927 if (start_offset > 0 && start_offset < length)
8929 int tb = ((uschar *)subject)[start_offset];
8933 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8939 /* The ims options can vary during the matching as a result of the presence
8940 of (?ims) items in the pattern. They are kept in a local variable so that
8941 restoring at the exit of a group is easy. */
8943 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8945 /* If the expression has got more back references than the offsets supplied can
8946 hold, we get a temporary chunk of working store to use during the matching.
8947 Otherwise, we can use the vector supplied, rounding down its size to a multiple
8950 ocount = offsetcount - (offsetcount % 3);
8952 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8954 ocount = re->top_backref * 3 + 3;
8955 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8956 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8957 using_temporary_offsets = TRUE;
8958 DPRINTF(("Got memory to hold back references\n"));
8960 else match_block.offset_vector = offsets;
8962 match_block.offset_end = ocount;
8963 match_block.offset_max = (2*ocount)/3;
8964 match_block.offset_overflow = FALSE;
8965 match_block.capture_last = -1;
8967 /* Compute the minimum number of offsets that we need to reset each time. Doing
8968 this makes a huge difference to execution time when there aren't many brackets
8971 resetcount = 2 + re->top_bracket * 2;
8972 if (resetcount > offsetcount) resetcount = ocount;
8974 /* Reset the working variable associated with each extraction. These should
8975 never be used unless previously set, but they get saved and restored, and so we
8976 initialize them to avoid reading uninitialized locations. */
/* Working downwards from the end of the vector, the loop below sets the top
resetcount/2 - 1 entries to -1. */
8978 if (match_block.offset_vector != NULL)
8980 register int *iptr = match_block.offset_vector + ocount;
8981 register int *iend = iptr - resetcount/2 + 1;
8982 while (--iptr >= iend) *iptr = -1;
8985 /* Set up the first character to match, if available. The first_byte value is
8986 never set for an anchored regular expression, but the anchoring may be forced
8987 at run time, so we have to test for anchoring. The first char may be unset for
8988 an unanchored pattern, of course. If there's no first char and the pattern was
8989 studied, there may be a bitmap of possible first characters. */
8993 if ((re->options & PCRE_FIRSTSET) != 0)
8995 first_byte = re->first_byte & 255;
8996 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8997 first_byte = match_block.lcc[first_byte];
9000 if (!startline && study != NULL &&
9001 (study->options & PCRE_STUDY_MAPPED) != 0)
9002 start_bits = study->start_bits;
9005 /* For anchored or unanchored matches, there may be a "last known required
9008 if ((re->options & PCRE_REQCHSET) != 0)
9010 req_byte = re->req_byte & 255;
9011 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9012 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9015 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
9016 the loop runs just once. */
9020 /* Reset the maximum number of extractions we might see. */
9022 if (match_block.offset_vector != NULL)
9024 register int *iptr = match_block.offset_vector;
9025 register int *iend = iptr + resetcount;
9026 while (iptr < iend) *iptr++ = -1;
9029 /* Advance to a unique first char if possible */
9031 if (first_byte >= 0)
9033 if (first_byte_caseless)
9034 while (start_match < end_subject &&
9035 match_block.lcc[*start_match] != first_byte)
9038 while (start_match < end_subject && *start_match != first_byte)
9042 /* Or to just after \n for a multiline match if possible */
9046 if (start_match > match_block.start_subject + start_offset)
9048 while (start_match < end_subject && start_match[-1] != NEWLINE)
9053 /* Or to a non-unique first char after study */
9055 else if (start_bits != NULL)
9057 while (start_match < end_subject)
9059 register unsigned int c = *start_match;
9060 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9064 #ifdef DEBUG /* Sigh. Some compilers never learn. */
9065 printf(">>>> Match against: ");
9066 pchars(start_match, end_subject - start_match, TRUE, &match_block);
9070 /* If req_byte is set, we know that that character must appear in the subject
9071 for the match to succeed. If the first character is set, req_byte must be
9072 later in the subject; otherwise the test starts at the match point. This
9073 optimization can save a huge amount of backtracking in patterns with nested
9074 unlimited repeats that aren't going to match. Writing separate code for
9075 cased/caseless versions makes it go faster, as does using an autoincrement
9076 and backing off on a match.
9078 HOWEVER: when the subject string is very, very long, searching to its end can
9079 take a long time, and give bad performance on quite ordinary patterns. This
9080 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9081 don't do this when the string is sufficiently long.
9083 ALSO: this processing is disabled when partial matching is requested.
9086 if (req_byte >= 0 &&
9087 end_subject - start_match < REQ_BYTE_MAX &&
9088 !match_block.partial)
9090 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9092 /* We don't need to repeat the search if we haven't yet reached the
9093 place we found it at last time. */
9095 if (p > req_byte_ptr)
9097 if (req_byte_caseless)
9099 while (p < end_subject)
9101 register int pp = *p++;
9102 if (pp == req_byte || pp == req_byte2) { p--; break; }
9107 while (p < end_subject)
9109 if (*p++ == req_byte) { p--; break; }
9113 /* If we can't find the required character, break the matching loop */
9115 if (p >= end_subject) break;
9117 /* If we have found the required character, save the point where we
9118 found it, so that we don't search again next time round the loop if
9119 the start hasn't passed this character yet. */
9125 /* When a match occurs, substrings will be set for all internal extractions;
9126 we just need to set up the whole thing as substring 0 before returning. If
9127 there were too many extractions, set the return code to zero. In the case
9128 where we had to get some local store to hold offsets for backreferences, copy
9129 those back references that we can. In this case there need not be overflow
9130 if certain parts of the pattern were not used. */
9132 match_block.start_match = start_match;
9133 match_block.match_call_count = 0;
9135 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9138 if (rc == MATCH_NOMATCH)
/* In UTF-8 mode, loop for as long as start_match points at a continuation
byte, that is, one whose top two bits are 10. */
9142 if (match_block.utf8)
9143 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9149 if (rc != MATCH_MATCH)
9151 DPRINTF((">>>> error: returning %d\n", rc));
9155 /* We have a match! Copy the offset information from temporary store if
9158 if (using_temporary_offsets)
9160 if (offsetcount >= 4)
9162 memcpy(offsets + 2, match_block.offset_vector + 2,
9163 (offsetcount - 2) * sizeof(int));
9164 DPRINTF(("Copied offsets from temporary memory\n"));
9166 if (match_block.end_offset_top > offsetcount)
9167 match_block.offset_overflow = TRUE;
9169 DPRINTF(("Freeing temporary memory\n"));
9170 (pcre_free)(match_block.offset_vector);
9173 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9175 if (offsetcount < 2) rc = 0; else
9177 offsets[0] = start_match - match_block.start_subject;
9178 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9181 DPRINTF((">>>> returning %d\n", rc));
9185 /* This "while" is the end of the "do" above */
9187 while (!anchored && start_match <= end_subject);
/* No match was found. Free the temporary offset vector if one was obtained,
then return PCRE_ERROR_PARTIAL if partial matching was requested and hitend is
set, or PCRE_ERROR_NOMATCH otherwise. */
9189 if (using_temporary_offsets)
9191 DPRINTF(("Freeing temporary memory\n"));
9192 (pcre_free)(match_block.offset_vector);
9195 if (match_block.partial && match_block.hitend)
9197 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9198 return PCRE_ERROR_PARTIAL;
9202 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9203 return PCRE_ERROR_NOMATCH;