1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2004 University of Cambridge
14 -----------------------------------------------------------------------------
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions are met:
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 POSSIBILITY OF SUCH DAMAGE.
40 -----------------------------------------------------------------------------
44 /* This module contains a debugging function for printing out the internal form
45 of a compiled regular expression. It is kept in a separate file so that it can
46 be #included both in the pcretest program, and in the library itself when
47 compiled with the debugging switch. */
/* Printable names of the opcodes. The table is indexed by opcode value (it is
used below as OP_names[*code]); its entries are supplied by the OP_NAME_LIST
macro. */
50 static const char *OP_names[] = { OP_NAME_LIST };
53 /*************************************************
54 * Print single- or multi-byte character *
55 *************************************************/
57 /* These tables are actually copies of ones in pcre.c. If we compile the
58 library with debugging, they are included twice, but that isn't really a
59 problem - compiling with debugging is pretty rare and these are very small. */
/* utf8_t3: mask that selects the data bits of the first byte of a multi-byte
character, indexed by the number of additional bytes in the character. */
61 static const int utf8_t3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/* utf8_t4: number of additional bytes that follow a first byte, indexed by
the bottom six bits of that byte (print_char looks up utf8_t4[c & 0x3f]). */
63 static const uschar utf8_t4[] = {
64 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
65 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
66 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
67 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
/* Write one character of the compiled pattern to f.

When utf8 is false, or when the top two bits of the byte are not both set, the
single byte is written as itself if it is printable and as \xhh otherwise.
Otherwise the byte is treated as the start of a multi-byte sequence: the
additional bytes are merged six bits at a time and the resulting value is
written in hexadecimal.

NOTE(review): the declarations of c and s and the return statements are not
visible in this part of the file. Callers add the result to their code pointer
(code += 1 + print_char(...)), so the return value is presumably the number of
additional bytes consumed - confirm against the full source. */
70 print_char(FILE *f, uschar *ptr, BOOL utf8)
74 if (!utf8 || (c & 0xc0) != 0xc0)
76 if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
82 int a = utf8_t4[c & 0x3f]; /* Number of additional bytes */
/* Keep only the data bits of the first byte and shift them up by s. */
84 c = (c & utf8_t3[a]) << s;
85 for (i = 1; i <= a; i++)
87 /* This is a check for malformed UTF-8; it should only occur if the sanity
88 check has been turned off. Rather than swallow random bytes, just stop if
89 we hit a bad one. Print it with \X instead of \x as an indication. */
91 if ((ptr[i] & 0xc0) != 0x80)
93 fprintf(f, "\\X{%x}", c);
/* Merge the six data bits of this following byte into the value. */
100 c |= (ptr[i] & 0x3f) << s;
/* Values below 128 use the two-digit form, anything larger the braced form. */
102 if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
111 /*************************************************
112 * Find Unicode property name *
113 *************************************************/
/* Return the name of the Unicode property whose code is "property", found by
searching the utt table from its last entry backwards. If no entry has a
matching value, the string "??" is returned instead. */
116 get_ucpname(int property)
/* The search must start at the last valid index, which is one less than the
number of entries; starting at the entry count itself reads past the end of
the table on the first comparison. The cast keeps the arithmetic signed so
that the i >= 0 test terminates the loop. */
119 for (i = (int)(sizeof(utt)/sizeof(ucp_type_table)) - 1; i >= 0; i--)
121 if (property == utt[i].value) break;
123 return (i >= 0)? utt[i].name : "??";
125 #endif /* SUPPORT_UCP */
129 /*************************************************
130 * Print compiled regex *
131 *************************************************/
133 /* Make this function work for a regex with integers in either byte order.
134 However, we assume that what we are passed is a compiled regex. */
/* Write a listing of the compiled code of a pattern to f. Each item is shown
as its offset from the start of the code followed by a description of the
opcode and its data.

Arguments:
  external_re   the compiled pattern; it is accessed as a real_pcre block
  f             the stream on which all of the output is written */
137 print_internals(pcre *external_re, FILE *f)
139 real_pcre *re = (real_pcre *)external_re;
140 uschar *codestart, *code;
143 unsigned int options = re->options;
144 int offset = re->name_table_offset;
145 int count = re->name_count;
146 int size = re->name_entry_size;
/* If the magic number does not match, work on byte-swapped local copies of
the header fields: a 16-bit swap for the name table values and a 32-bit swap
for the options. The pattern block itself is not modified. */
148 if (re->magic_number != MAGIC_NUMBER)
150 offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
151 count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
152 size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
153 options = ((options << 24) & 0xff000000) |
154 ((options << 8) & 0x00ff0000) |
155 ((options >> 8) & 0x0000ff00) |
156 ((options >> 24) & 0x000000ff);
/* The code starts immediately after the name table, which is at "offset" and
holds "count" entries of "size" bytes each. */
159 code = codestart = (uschar *)re + offset + count * size;
160 utf8 = (options & PCRE_UTF8) != 0;
/* A pointer difference has type ptrdiff_t, which need not be the same size as
int, so convert it explicitly to match the %d conversion. */
168 fprintf(f, "%3d ", (int)(code - codestart));
172 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
173 fprintf(f, "%3d Bra extra\n", GET(code, 1));
175 fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA);
176 code += OP_lengths[OP_BRA];
183 fprintf(f, " %s\n", OP_names[*code]);
184 fprintf(f, "------------------------------------------------------------------\n");
188 fprintf(f, " %.2x %s", code[1], OP_names[*code]);
197 code += 1 + print_char(f, code, utf8);
199 while (*code == OP_CHAR);
211 code += 1 + print_char(f, code, utf8);
213 while (*code == OP_CHARNC);
226 case OP_ASSERTBACK_NOT:
230 fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
/* This item must go to f like every other item; writing it with printf sent
it to stdout, so it was lost or misplaced whenever f was a different stream. */
234 fprintf(f, "%3d %s", GET2(code, 1), OP_names[*code]);
238 if (GET2(code, 1) == CREF_RECURSE)
239 fprintf(f, " Cond recurse");
241 fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
255 case OP_TYPEMINQUERY:
257 if (*code >= OP_TYPESTAR)
259 fprintf(f, "%s", OP_names[code[1]]);
261 if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
263 fprintf(f, " %s ", get_ucpname(code[2]));
268 else extra = print_char(f, code+1, utf8);
269 fprintf(f, "%s", OP_names[*code]);
276 extra = print_char(f, code+3, utf8);
278 if (*code != OP_EXACT) fprintf(f, ",");
279 fprintf(f, "%d}", GET2(code,1));
280 if (*code == OP_MINUPTO) fprintf(f, "?");
286 fprintf(f, " %s", OP_names[code[3]]);
288 if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
290 fprintf(f, " %s ", get_ucpname(code[4]));
295 if (*code != OP_TYPEEXACT) fprintf(f, "0,");
296 fprintf(f, "%d}", GET2(code,1));
297 if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
301 if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
302 else fprintf(f, " [^\\x%02x]", c);
311 if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
312 else fprintf(f, " [^\\x%02x]", c);
313 fprintf(f, "%s", OP_names[*code]);
319 if (isprint(c = code[3])) fprintf(f, " [^%c]{", c);
320 else fprintf(f, " [^\\x%02x]{", c);
321 if (*code != OP_NOTEXACT) fprintf(f, ",");
322 fprintf(f, "%d}", GET2(code,1));
323 if (*code == OP_NOTMINUPTO) fprintf(f, "?");
327 fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
331 fprintf(f, " \\%d", GET2(code,1));
332 ccode = code + OP_lengths[*code];
333 goto CLASS_REF_REPEAT;
336 fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
337 GET(code, 2 + LINK_SIZE));
343 fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1]));
347 /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
348 having this code always here, and it makes it less messy without all those
360 if (*code == OP_XCLASS)
362 extra = GET(code, 1);
363 ccode = code + LINK_SIZE + 1;
364 printmap = (*ccode & XCL_MAP) != 0;
365 if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
373 /* Print a bit map */
377 for (i = 0; i < 256; i++)
379 if ((ccode[i/8] & (1 << (i&7))) != 0)
382 for (j = i+1; j < 256; j++)
383 if ((ccode[j/8] & (1 << (j&7))) == 0) break;
384 if (i == '-' || i == ']') fprintf(f, "\\");
385 if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i);
388 if (j != i + 1) fprintf(f, "-");
389 if (j == '-' || j == ']') fprintf(f, "\\");
390 if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j);
398 /* For an XCLASS there is always some additional data */
400 if (*code == OP_XCLASS)
403 while ((ch = *ccode++) != XCL_END)
408 fprintf(f, "\\p{%s}", get_ucpname(*ccode++));
410 else if (ch == XCL_NOTPROP)
412 fprintf(f, "\\P{%s}", get_ucpname(*ccode++));
417 ccode += 1 + print_char(f, ccode, TRUE);
421 ccode += 1 + print_char(f, ccode, TRUE);
427 /* Indicate a non-UTF8 class which was created by negation */
429 fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
431 /* Handle repeats after a class or a back reference */
442 fprintf(f, "%s", OP_names[*ccode]);
443 extra += OP_lengths[*ccode];
450 if (max == 0) fprintf(f, "{%d,}", min);
451 else fprintf(f, "{%d,%d}", min, max);
452 if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
453 extra += OP_lengths[*ccode];
459 /* Anything else is just an item with no data */
462 fprintf(f, " %s", OP_names[*code]);
/* Step over the item just printed plus any extra bytes it was found to use. */
466 code += OP_lengths[*code] + extra;
471 /* End of printint.c */