1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
5 /* This program was hacked up as a tester for PCRE. I really should have
6 written it more tidily in the first place. Will I ever learn? It has grown and
7 been extended and consequently is now rather untidy in places.
9 -----------------------------------------------------------------------------
10 Redistribution and use in source and binary forms, with or without
11 modification, are permitted provided that the following conditions are met:
13 * Redistributions of source code must retain the above copyright notice,
14 this list of conditions and the following disclaimer.
16 * Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
20 * Neither the name of the University of Cambridge nor the names of its
21 contributors may be used to endorse or promote products derived from
22 this software without specific prior written permission.
24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35 -----------------------------------------------------------------------------
47 /* We need the internal info for displaying the results of pcre_study(). Also
48 for getting the opcodes for showing compiled code. */
50 #define PCRE_SPY /* For Win32 build, import data, not export */
53 /* It is possible to compile this test program without including support for
54 testing the POSIX interface, though this is not available via the standard
58 #include "pcreposix.h"
61 #ifndef CLOCKS_PER_SEC
63 #define CLOCKS_PER_SEC CLK_TCK
65 #define CLOCKS_PER_SEC 100
69 #define LOOPREPEAT 500000
71 #define BUFFER_SIZE 30000
72 #define PBUFFER_SIZE BUFFER_SIZE
73 #define DBUFFER_SIZE BUFFER_SIZE
77 static int log_store = 0;
78 static int callout_count;
79 static int callout_extra;
80 static int callout_fail_count;
81 static int callout_fail_id;
82 static int first_callout;
83 static int show_malloc;
85 static size_t gotten_store;
87 static uschar *pbuffer = NULL;
90 static const int utf8_table1[] = {
91 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
93 static const int utf8_table2[] = {
94 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
96 static const int utf8_table3[] = {
97 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
101 /*************************************************
102 * Print compiled regex *
103 *************************************************/
105 /* The code for doing this is held in a separate file that is also included in
106 pcre.c when it is compiled with the debug switch. It defines a function called
107 print_internals(), which uses a table of opcode lengths defined by the macro
108 OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
109 Unicode property names to numbers; this is kept in a separate file. */
111 static uschar OP_lengths[] = { OP_LENGTHS };
114 #include "ucptypetable.c"
115 #include "printint.c"
119 /*************************************************
120 * Read number from string *
121 *************************************************/
123 /* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
124 around with conditional compilation, just do the job by hand. It is only used
125 for unpicking the -o argument, so just keep it simple.
128 str string to be converted
129 endptr where to put the end pointer
131 Returns: the unsigned long
135 get_value(unsigned char *str, unsigned char **endptr)
138 while(*str != 0 && isspace(*str)) str++;
139 while (isdigit(*str)) result = result * 10 + (int)(*str++ - '0');
146 /*************************************************
147 * Convert character value to UTF-8 *
148 *************************************************/
150 /* This function takes an integer value in the range 0 - 0x7fffffff
151 and encodes it as a UTF-8 character in 0 to 6 bytes.
154 cvalue the character value
155 buffer pointer to buffer for result - at least 6 bytes long
157 Returns: number of characters placed in the buffer
158 -1 if input character is negative
159 0 if input character is positive but too big (only when
160 int is longer than 32 bits)
164 ord2utf8(int cvalue, unsigned char *buffer)
167 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
168 if (cvalue <= utf8_table1[i]) break;
169 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
170 if (cvalue < 0) return -1;
173 for (j = i; j > 0; j--)
175 *buffer-- = 0x80 | (cvalue & 0x3f);
178 *buffer = utf8_table2[i] | cvalue;
183 /*************************************************
184 * Convert UTF-8 string to value *
185 *************************************************/
187 /* This function takes one or more bytes that represent a UTF-8 character,
188 and returns the value of the character.
191 buffer a pointer to the byte vector
192 vptr a pointer to an int to receive the value
194 Returns: > 0 => the number of bytes consumed
195 -6 to 0 => malformed UTF-8 character at offset = (-return)
199 utf82ord(unsigned char *buffer, int *vptr)
205 for (i = -1; i < 6; i++) /* i is number of additional bytes */
207 if ((d & 0x80) == 0) break;
211 if (i == -1) { *vptr = c; return 1; } /* ascii character */
212 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
214 /* i now has a value in the range 1-5 */
217 d = (c & utf8_table3[i]) << s;
219 for (j = 0; j < i; j++)
222 if ((c & 0xc0) != 0x80) return -(j+1);
224 d |= (c & 0x3f) << s;
227 /* Check that encoding was the correct unique one */
229 for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
230 if (d <= utf8_table1[j]) break;
231 if (j != i) return -(i+1);
241 /*************************************************
242 * Print character string *
243 *************************************************/
245 /* Character string printing function. Must handle UTF-8 strings in utf8
246 mode. Yields number of characters printed. If handed a NULL file, just counts
247 chars without printing. */
249 static int pchars(unsigned char *p, int length, FILE *f)
258 int rc = utf82ord(p, &c);
260 if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
264 if (c < 256 && isprint(c))
266 if (f != NULL) fprintf(f, "%c", c);
272 if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
279 /* Not UTF-8, or malformed UTF-8 */
281 if (isprint(c = *(p++)))
283 if (f != NULL) fprintf(f, "%c", c);
288 if (f != NULL) fprintf(f, "\\x%02x", c);
298 /*************************************************
300 *************************************************/
302 /* Called from PCRE as a result of the (?C) item. We print out where we are in
303 the match. Yield zero unless more callouts than the fail count, or the callout
306 static int callout(pcre_callout_block *cb)
308 FILE *f = (first_callout | callout_extra)? outfile : NULL;
309 int i, pre_start, post_start, subject_length;
313 fprintf(f, "Callout %d: last capture = %d\n",
314 cb->callout_number, cb->capture_last);
316 for (i = 0; i < cb->capture_top * 2; i += 2)
318 if (cb->offset_vector[i] < 0)
319 fprintf(f, "%2d: <unset>\n", i/2);
322 fprintf(f, "%2d: ", i/2);
323 (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
324 cb->offset_vector[i+1] - cb->offset_vector[i], f);
330 /* Re-print the subject in canonical form, the first time or if giving full
331 details. On subsequent calls in the same match, we use pchars just to find the
332 printed lengths of the substrings. */
334 if (f != NULL) fprintf(f, "--->");
336 pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
337 post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
338 cb->current_position - cb->start_match, f);
340 subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
342 (void)pchars((unsigned char *)(cb->subject + cb->current_position),
343 cb->subject_length - cb->current_position, f);
345 if (f != NULL) fprintf(f, "\n");
347 /* Always print appropriate indicators, with callout number if not already
348 shown. For automatic callouts, show the pattern offset. */
350 if (cb->callout_number == 255)
352 fprintf(outfile, "%+3d ", cb->pattern_position);
353 if (cb->pattern_position > 99) fprintf(outfile, "\n ");
357 if (callout_extra) fprintf(outfile, " ");
358 else fprintf(outfile, "%3d ", cb->callout_number);
361 for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
362 fprintf(outfile, "^");
366 for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
367 fprintf(outfile, "^");
370 for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
371 fprintf(outfile, " ");
373 fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
374 pbuffer + cb->pattern_position);
376 fprintf(outfile, "\n");
379 if (cb->callout_data != NULL)
381 int callout_data = *((int *)(cb->callout_data));
382 if (callout_data != 0)
384 fprintf(outfile, "Callout data = %d\n", callout_data);
389 return (cb->callout_number != callout_fail_id)? 0 :
390 (++callout_count >= callout_fail_count)? 1 : 0;
394 /*************************************************
395 * Local malloc functions *
396 *************************************************/
398 /* Alternative malloc function, to test functionality and show the size of the
401 static void *new_malloc(size_t size)
403 void *block = malloc(size);
406 fprintf(outfile, "malloc %3d %p\n", size, block);
410 static void new_free(void *block)
413 fprintf(outfile, "free %p\n", block);
418 /* For recursion malloc/free, to test stacking calls */
420 static void *stack_malloc(size_t size)
422 void *block = malloc(size);
424 fprintf(outfile, "stack_malloc %3d %p\n", size, block);
428 static void stack_free(void *block)
431 fprintf(outfile, "stack_free %p\n", block);
436 /*************************************************
437 * Call pcre_fullinfo() *
438 *************************************************/
440 /* Get one piece of information from the pcre_fullinfo() function */
442 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
445 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
446 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
451 /*************************************************
452 * Byte flipping function *
453 *************************************************/
456 byteflip(long int value, int n)
458 if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
459 return ((value & 0x000000ff) << 24) |
460 ((value & 0x0000ff00) << 8) |
461 ((value & 0x00ff0000) >> 8) |
462 ((value & 0xff000000) >> 24);
468 /*************************************************
470 *************************************************/
472 /* Read lines from named file or stdin and write to named file or stdout; lines
473 consist of a regular expression, in delimiters and optionally followed by
474 options, followed by a set of test data, terminated by an empty line. */
476 int main(int argc, char **argv)
478 FILE *infile = stdin;
480 int study_options = 0;
485 int size_offsets = 45;
486 int size_offsets_max;
494 unsigned char *buffer;
495 unsigned char *dbuffer;
497 /* Get buffers from malloc() so that Electric Fence will check their misuse
498 when I am debugging. */
500 buffer = (unsigned char *)malloc(BUFFER_SIZE);
501 dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
502 pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);
504 /* The outfile variable is static so that new_malloc can use it. The _setmode()
505 stuff is some magic that I don't understand, but which apparently does good
506 things in Windows. It's related to line terminations. */
508 #if defined(_WIN32) || defined(WIN32)
509 _setmode( _fileno( stdout ), 0x8000 );
510 #endif /* defined(_WIN32) || defined(WIN32) */
516 while (argc > 1 && argv[op][0] == '-')
518 unsigned char *endptr;
520 if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
522 else if (strcmp(argv[op], "-t") == 0) timeit = 1;
523 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
524 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
525 else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
526 ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
533 else if (strcmp(argv[op], "-p") == 0) posix = 1;
535 else if (strcmp(argv[op], "-C") == 0)
538 printf("PCRE version %s\n", pcre_version());
539 printf("Compiled with\n");
540 (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
541 printf(" %sUTF-8 support\n", rc? "" : "No ");
542 (void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
543 printf(" %sUnicode properties support\n", rc? "" : "No ");
544 (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
545 printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
546 (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
547 printf(" Internal link size = %d\n", rc);
548 (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
549 printf(" POSIX malloc threshold = %d\n", rc);
550 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &rc);
551 printf(" Default match limit = %d\n", rc);
552 (void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc);
553 printf(" Match recursion uses %s\n", rc? "stack" : "heap");
558 printf("** Unknown or malformed option %s\n", argv[op]);
559 printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
560 printf(" -C show PCRE compile-time options and exit\n");
561 printf(" -d debug: show compiled code; implies -i\n"
562 " -i show information about compiled pattern\n"
563 " -m output memory used information\n"
564 " -o <n> set size of offsets vector to <n>\n");
566 printf(" -p use POSIX interface\n");
568 printf(" -s output store (memory) used information\n"
569 " -t time compilation and execution\n");
576 /* Get the store for the offsets vector, and remember what it was */
578 size_offsets_max = size_offsets;
579 offsets = (int *)malloc(size_offsets_max * sizeof(int));
582 printf("** Failed to get %d bytes of memory for offsets vector\n",
583 size_offsets_max * sizeof(int));
587 /* Sort out the input and output files */
591 infile = fopen(argv[op], "rb");
594 printf("** Failed to open %s\n", argv[op]);
601 outfile = fopen(argv[op+1], "wb");
604 printf("** Failed to open %s\n", argv[op+1]);
609 /* Set alternative malloc function */
611 pcre_malloc = new_malloc;
612 pcre_free = new_free;
613 pcre_stack_malloc = stack_malloc;
614 pcre_stack_free = stack_free;
616 /* Heading line, then prompt for first regex if stdin */
618 fprintf(outfile, "PCRE version %s\n\n", pcre_version());
625 pcre_extra *extra = NULL;
627 #if !defined NOPOSIX /* There are still compilers that require no indent */
633 unsigned char *p, *pp, *ppp;
634 unsigned char *to_file = NULL;
635 const unsigned char *tables = NULL;
636 unsigned long int true_size, true_study_size = 0;
637 size_t size, regex_gotten_store;
639 int do_debug = debug;
642 int do_showinfo = showinfo;
645 int erroroffset, len, delimiter;
649 if (infile == stdin) printf(" re> ");
650 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL) break;
651 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
655 while (isspace(*p)) p++;
656 if (*p == 0) continue;
658 /* See if the pattern is to be loaded pre-compiled from a file. */
660 if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
662 unsigned long int magic;
667 pp = p + (int)strlen((char *)p);
668 while (isspace(pp[-1])) pp--;
671 f = fopen((char *)p, "rb");
674 fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
678 if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
681 (sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
683 (sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
685 re = (real_pcre *)new_malloc(true_size);
686 regex_gotten_store = gotten_store;
688 if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
690 magic = ((real_pcre *)re)->magic_number;
691 if (magic != MAGIC_NUMBER)
693 if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
699 fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
705 fprintf(outfile, "Compiled regex%s loaded from %s\n",
706 do_flip? " (byte-inverted)" : "", p);
708 /* Need to know if UTF-8 for printing data strings */
710 new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
711 use_utf8 = (options & PCRE_UTF8) != 0;
713 /* Now see if there is any following study data */
715 if (true_study_size != 0)
717 pcre_study_data *psd;
719 extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
720 extra->flags = PCRE_EXTRA_STUDY_DATA;
722 psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
723 extra->study_data = psd;
725 if (fread(psd, 1, true_study_size, f) != true_study_size)
728 fprintf(outfile, "Failed to read data from %s\n", p);
729 if (extra != NULL) new_free(extra);
730 if (re != NULL) new_free(re);
734 fprintf(outfile, "Study data loaded from %s\n", p);
735 do_study = 1; /* To get the data output if requested */
737 else fprintf(outfile, "No study data\n");
743 /* In-line pattern (the usual case). Get the delimiter and seek the end of
744 the pattern; if it isn't complete, read more. */
748 if (isalnum(delimiter) || delimiter == '\\')
750 fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
760 if (*pp == '\\' && pp[1] != 0) pp++;
761 else if (*pp == delimiter) break;
766 len = BUFFER_SIZE - (pp - buffer);
769 fprintf(outfile, "** Expression too long - missing delimiter?\n");
773 if (infile == stdin) printf(" > ");
774 if (fgets((char *)pp, len, infile) == NULL)
776 fprintf(outfile, "** Unexpected EOF\n");
780 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
783 /* If the first character after the delimiter is backslash, make
784 the pattern end with backslash. This is purely to provide a way
785 of testing for the error message when a pattern ends with backslash. */
787 if (pp[1] == '\\') *pp++ = '\\';
789 /* Terminate the pattern at the delimiter, and save a copy of the pattern
793 strcpy((char *)pbuffer, (char *)p);
795 /* Look for options after final delimiter */
799 log_store = showstore; /* default from command line */
805 case 'g': do_g = 1; break;
806 case 'i': options |= PCRE_CASELESS; break;
807 case 'm': options |= PCRE_MULTILINE; break;
808 case 's': options |= PCRE_DOTALL; break;
809 case 'x': options |= PCRE_EXTENDED; break;
811 case '+': do_showrest = 1; break;
812 case 'A': options |= PCRE_ANCHORED; break;
813 case 'C': options |= PCRE_AUTO_CALLOUT; break;
814 case 'D': do_debug = do_showinfo = 1; break;
815 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
816 case 'F': do_flip = 1; break;
817 case 'G': do_G = 1; break;
818 case 'I': do_showinfo = 1; break;
819 case 'M': log_store = 1; break;
820 case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
823 case 'P': do_posix = 1; break;
826 case 'S': do_study = 1; break;
827 case 'U': options |= PCRE_UNGREEDY; break;
828 case 'X': options |= PCRE_EXTRA; break;
829 case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
830 case '?': options |= PCRE_NO_UTF8_CHECK; break;
834 while (*ppp != '\n' && *ppp != ' ') ppp++;
836 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
838 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
841 tables = pcre_maketables();
847 while (*pp != 0) pp++;
848 while (isspace(pp[-1])) pp--;
852 case '\n': case ' ': break;
855 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
860 /* Handle compiling via the POSIX interface, which doesn't support the
861 timing, showing, or debugging options, nor the ability to pass over
862 local character tables. */
865 if (posix || do_posix)
870 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
871 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
872 rc = regcomp(&preg, (char *)p, cflags);
874 /* Compilation failed; go back for another re, skipping to blank line
875 if non-interactive. */
879 (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
880 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
885 /* Handle compiling via the native interface */
888 #endif /* !defined NOPOSIX */
895 clock_t start_time = clock();
896 for (i = 0; i < LOOPREPEAT; i++)
898 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
899 if (re != NULL) free(re);
901 time_taken = clock() - start_time;
902 fprintf(outfile, "Compile time %.3f milliseconds\n",
903 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
904 (double)CLOCKS_PER_SEC);
907 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
909 /* Compilation failed; go back for another re, skipping to blank line
910 if non-interactive. */
914 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
920 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
925 len = (int)strlen((char *)buffer);
926 while (len > 0 && isspace(buffer[len-1])) len--;
929 fprintf(outfile, "\n");
934 /* Compilation succeeded; print data if required. There are now two
935 info-returning functions. The old one has a limited interface and
936 returns only limited data. Check that it agrees with the newer one. */
939 fprintf(outfile, "Memory allocation (code space): %d\n",
942 ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
944 /* Extract the size for possible writing before possibly flipping it,
945 and remember the store that was got. */
947 true_size = ((real_pcre *)re)->size;
948 regex_gotten_store = gotten_store;
950 /* If /S was present, study the regexp to generate additional info to
951 help with the matching. */
959 clock_t start_time = clock();
960 for (i = 0; i < LOOPREPEAT; i++)
961 extra = pcre_study(re, study_options, &error);
962 time_taken = clock() - start_time;
963 if (extra != NULL) free(extra);
964 fprintf(outfile, " Study time %.3f milliseconds\n",
965 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
966 (double)CLOCKS_PER_SEC);
968 extra = pcre_study(re, study_options, &error);
970 fprintf(outfile, "Failed to study: %s\n", error);
971 else if (extra != NULL)
972 true_study_size = ((pcre_study_data *)(extra->study_data))->size;
975 /* If the 'F' option was present, we flip the bytes of all the integer
976 fields in the regex data block and the study block. This is to make it
977 possible to test PCRE's handling of byte-flipped patterns, e.g. those
978 compiled on a different architecture. */
982 real_pcre *rre = (real_pcre *)re;
983 rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
984 rre->size = byteflip(rre->size, sizeof(rre->size));
985 rre->options = byteflip(rre->options, sizeof(rre->options));
986 rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
987 rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
988 rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
989 rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
990 rre->name_table_offset = byteflip(rre->name_table_offset,
991 sizeof(rre->name_table_offset));
992 rre->name_entry_size = byteflip(rre->name_entry_size,
993 sizeof(rre->name_entry_size));
994 rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
998 pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
999 rsd->size = byteflip(rsd->size, sizeof(rsd->size));
1000 rsd->options = byteflip(rsd->options, sizeof(rsd->options));
1004 /* Extract information from the compiled data if required */
1010 unsigned long int get_options, all_options;
1011 int old_first_char, old_options, old_count;
1012 int count, backrefmax, first_char, need_char;
1013 int nameentrysize, namecount;
1014 const uschar *nametable;
1018 fprintf(outfile, "------------------------------------------------------------------\n");
1019 print_internals(re, outfile);
1022 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1023 new_info(re, NULL, PCRE_INFO_SIZE, &size);
1024 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
1025 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
1026 new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
1027 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
1028 new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
1029 new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
1030 new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
1032 old_count = pcre_info(re, &old_options, &old_first_char);
1033 if (count < 0) fprintf(outfile,
1034 "Error %d from pcre_info()\n", count);
1037 if (old_count != count) fprintf(outfile,
1038 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
1041 if (old_first_char != first_char) fprintf(outfile,
1042 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
1043 first_char, old_first_char);
1045 if (old_options != (int)get_options) fprintf(outfile,
1046 "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
1047 get_options, old_options);
1050 if (size != regex_gotten_store) fprintf(outfile,
1051 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
1052 size, regex_gotten_store);
1054 fprintf(outfile, "Capturing subpattern count = %d\n", count);
1056 fprintf(outfile, "Max back reference = %d\n", backrefmax);
1060 fprintf(outfile, "Named capturing subpatterns:\n");
1061 while (namecount-- > 0)
1063 fprintf(outfile, " %s %*s%3d\n", nametable + 2,
1064 nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
1065 GET2(nametable, 0));
1066 nametable += nameentrysize;
1070 /* The NOPARTIAL bit is a private bit in the options, so we have
1071 to fish it out via our back door */
1073 all_options = ((real_pcre *)re)->options;
1076 all_options = byteflip(all_options, sizeof(all_options));
1079 if ((all_options & PCRE_NOPARTIAL) != 0)
1080 fprintf(outfile, "Partial matching not supported\n");
1082 if (get_options == 0) fprintf(outfile, "No options\n");
1083 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
1084 ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
1085 ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
1086 ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
1087 ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
1088 ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
1089 ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
1090 ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
1091 ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
1092 ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
1093 ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "");
1095 if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
1096 fprintf(outfile, "Case state changes\n");
1098 if (first_char == -1)
1100 fprintf(outfile, "First char at start or follows \\n\n");
1102 else if (first_char < 0)
1104 fprintf(outfile, "No first char\n");
1108 int ch = first_char & 255;
1109 const char *caseless = ((first_char & REQ_CASELESS) == 0)?
1112 fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
1114 fprintf(outfile, "First char = %d%s\n", ch, caseless);
1119 fprintf(outfile, "No need char\n");
1123 int ch = need_char & 255;
1124 const char *caseless = ((need_char & REQ_CASELESS) == 0)?
1127 fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
1129 fprintf(outfile, "Need char = %d%s\n", ch, caseless);
1132 /* Don't output study size; at present it is in any case a fixed
1133 value, but it varies, depending on the computer architecture, and
1134 so messes up the test suite. (And with the /F option, it might be
1140 fprintf(outfile, "Study returned NULL\n");
1143 uschar *start_bits = NULL;
1144 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
1146 if (start_bits == NULL)
1147 fprintf(outfile, "No starting byte set\n");
1152 fprintf(outfile, "Starting byte set: ");
1153 for (i = 0; i < 256; i++)
1155 if ((start_bits[i/8] & (1<<(i&7))) != 0)
1159 fprintf(outfile, "\n ");
1162 if (isprint(i) && i != ' ')
1164 fprintf(outfile, "%c ", i);
1169 fprintf(outfile, "\\x%02x ", i);
1174 fprintf(outfile, "\n");
1180 /* If the '>' option was present, we write out the regex to a file, and
1181 that is all. The first 8 bytes of the file are the regex length and then
1182 the study length, in big-endian order. */
1184 if (to_file != NULL)
1186 FILE *f = fopen((char *)to_file, "wb");
1189 fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
1194 sbuf[0] = (true_size >> 24) & 255;
1195 sbuf[1] = (true_size >> 16) & 255;
1196 sbuf[2] = (true_size >> 8) & 255;
1197 sbuf[3] = (true_size) & 255;
1199 sbuf[4] = (true_study_size >> 24) & 255;
1200 sbuf[5] = (true_study_size >> 16) & 255;
1201 sbuf[6] = (true_study_size >> 8) & 255;
1202 sbuf[7] = (true_study_size) & 255;
1204 if (fwrite(sbuf, 1, 8, f) < 8 ||
1205 fwrite(re, 1, true_size, f) < true_size)
1207 fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
1211 fprintf(outfile, "Compiled regex written to %s\n", to_file);
1214 if (fwrite(extra->study_data, 1, true_study_size, f) <
1217 fprintf(outfile, "Write error on %s: %s\n", to_file,
1220 else fprintf(outfile, "Study data written to %s\n", to_file);
1225 continue; /* With next regex */
1227 } /* End of non-POSIX compile */
1229 /* Read data lines and test them */
1234 unsigned char *bptr = dbuffer;
1235 int *use_offsets = offsets;
1236 int use_size_offsets = size_offsets;
1237 int callout_data = 0;
1238 int callout_data_set = 0;
1240 int copystrings = 0;
1241 int find_match_limit = 0;
1245 int start_offset = 0;
1250 pcre_callout = callout;
1254 callout_fail_count = 999999;
1255 callout_fail_id = -1;
1258 if (infile == stdin) printf("data> ");
1259 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
1264 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
1266 len = (int)strlen((char *)buffer);
1267 while (len > 0 && isspace(buffer[len-1])) len--;
1269 if (len == 0) break;
1272 while (isspace(*p)) p++;
1275 while ((c = *p++) != 0)
1280 if (c == '\\') switch ((c = *p++))
1282 case 'a': c = 7; break;
1283 case 'b': c = '\b'; break;
1284 case 'e': c = 27; break;
1285 case 'f': c = '\f'; break;
1286 case 'n': c = '\n'; break;
1287 case 'r': c = '\r'; break;
1288 case 't': c = '\t'; break;
1289 case 'v': c = '\v'; break;
1291 case '0': case '1': case '2': case '3':
1292 case '4': case '5': case '6': case '7':
1294 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
1295 c = c * 8 + *p++ - '0';
1300 /* Handle \x{..} specially - new Perl thing for utf8 */
1304 unsigned char *pt = p;
1306 while (isxdigit(*(++pt)))
1307 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
1310 unsigned char buff8[8];
1312 utn = ord2utf8(c, buff8);
1313 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
1314 c = buff8[ii]; /* Last byte */
1318 /* Not correct form; fall through */
1324 while (i++ < 2 && isxdigit(*p))
1326 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
1331 case 0: /* \ followed by EOF allows for an empty line */
1336 while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
1339 case 'A': /* Option setting */
1340 options |= PCRE_ANCHORED;
1344 options |= PCRE_NOTBOL;
1348 if (isdigit(*p)) /* Set copy string */
1350 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1351 copystrings |= 1 << n;
1353 else if (isalnum(*p))
1357 while (isalnum(*p)) *npp++ = *p++;
1359 n = pcre_get_stringnumber(re, (char *)name);
1361 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1362 else copystrings |= 1 << n;
1371 pcre_callout = NULL;
1376 callout_fail_id = 0;
1379 callout_fail_id = callout_fail_id * 10 + *p++ - '0';
1380 callout_fail_count = 0;
1385 callout_fail_count = callout_fail_count * 10 + *p++ - '0';
1392 if (*(++p) == '-') { sign = -1; p++; }
1394 callout_data = callout_data * 10 + *p++ - '0';
1395 callout_data *= sign;
1396 callout_data_set = 1;
1403 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1404 getstrings |= 1 << n;
1406 else if (isalnum(*p))
1410 while (isalnum(*p)) *npp++ = *p++;
1412 n = pcre_get_stringnumber(re, (char *)name);
1414 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1415 else getstrings |= 1 << n;
1424 find_match_limit = 1;
1428 options |= PCRE_NOTEMPTY;
1432 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1433 if (n > size_offsets_max)
1435 size_offsets_max = n;
1437 use_offsets = offsets = (int *)malloc(size_offsets_max * sizeof(int));
1438 if (offsets == NULL)
1440 printf("** Failed to get %d bytes of memory for offsets vector\n",
1441 size_offsets_max * sizeof(int));
1445 use_size_offsets = n;
1446 if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
1450 options |= PCRE_PARTIAL;
1458 options |= PCRE_NOTEOL;
1462 options |= PCRE_NO_UTF8_CHECK;
1470 /* Handle matching via the POSIX interface, which does not
1471 support timing or playing with the match limit or callout data. */
1473 #if !defined NOPOSIX
1474 if (posix || do_posix)
1478 regmatch_t *pmatch = NULL;
1479 if (use_size_offsets > 0)
1480 pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets);
1481 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1482 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1484 rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
1488 (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
1489 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1494 for (i = 0; i < (size_t)use_size_offsets; i++)
1496 if (pmatch[i].rm_so >= 0)
1498 fprintf(outfile, "%2d: ", (int)i);
1499 (void)pchars(dbuffer + pmatch[i].rm_so,
1500 pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
1501 fprintf(outfile, "\n");
1502 if (i == 0 && do_showrest)
1504 fprintf(outfile, " 0+ ");
1505 (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
1507 fprintf(outfile, "\n");
1515 /* Handle matching via the native interface - repeats for /g and /G */
1518 #endif /* !defined NOPOSIX */
1520 for (;; gmatched++) /* Loop for /g or /G */
1526 clock_t start_time = clock();
1527 for (i = 0; i < LOOPREPEAT; i++)
1528 count = pcre_exec(re, extra, (char *)bptr, len,
1529 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1530 time_taken = clock() - start_time;
1531 fprintf(outfile, "Execute time %.3f milliseconds\n",
1532 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
1533 (double)CLOCKS_PER_SEC);
1536 /* If find_match_limit is set, we want to do repeated matches with
1537 varying limits in order to find the minimum value. */
1539 if (find_match_limit)
1547 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1550 extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
1554 extra->match_limit = mid;
1555 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1556 options | g_notempty, use_offsets, use_size_offsets);
1557 if (count == PCRE_ERROR_MATCHLIMIT)
1559 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1561 mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
1563 else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
1564 count == PCRE_ERROR_PARTIAL)
1568 fprintf(outfile, "Minimum match limit = %d\n", mid);
1571 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1573 mid = (min + mid)/2;
1575 else break; /* Some other error */
1578 extra->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
1581 /* If callout_data is set, use the interface with additional data */
1583 else if (callout_data_set)
1587 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1590 extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
1591 extra->callout_data = &callout_data;
1592 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1593 options | g_notempty, use_offsets, use_size_offsets);
1594 extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
1597 /* The normal case is just to do the match once, with the default
1598 value of match_limit. */
1602 count = pcre_exec(re, extra, (char *)bptr, len,
1603 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1608 fprintf(outfile, "Matched, but too many substrings\n");
1609 count = use_size_offsets/3;
1617 for (i = 0; i < count * 2; i += 2)
1619 if (use_offsets[i] < 0)
1620 fprintf(outfile, "%2d: <unset>\n", i/2);
1623 fprintf(outfile, "%2d: ", i/2);
1624 (void)pchars(bptr + use_offsets[i],
1625 use_offsets[i+1] - use_offsets[i], outfile);
1626 fprintf(outfile, "\n");
1631 fprintf(outfile, " 0+ ");
1632 (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
1634 fprintf(outfile, "\n");
1640 for (i = 0; i < 32; i++)
1642 if ((copystrings & (1 << i)) != 0)
1644 char copybuffer[16];
1645 int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
1646 i, copybuffer, sizeof(copybuffer));
1648 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1650 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1654 for (i = 0; i < 32; i++)
1656 if ((getstrings & (1 << i)) != 0)
1658 const char *substring;
1659 int rc = pcre_get_substring((char *)bptr, use_offsets, count,
1662 fprintf(outfile, "get substring %d failed %d\n", i, rc);
1665 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1666 /* free((void *)substring); */
1667 pcre_free_substring(substring);
1674 const char **stringlist;
1675 int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
1678 fprintf(outfile, "get substring list failed %d\n", rc);
1681 for (i = 0; i < count; i++)
1682 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1683 if (stringlist[i] != NULL)
1684 fprintf(outfile, "string list not terminated by NULL\n");
1685 /* free((void *)stringlist); */
1686 pcre_free_substring_list(stringlist);
1691 /* There was a partial match */
1693 else if (count == PCRE_ERROR_PARTIAL)
1695 fprintf(outfile, "Partial match\n");
1696 break; /* Out of the /g loop */
1699 /* Failed to match. If this is a /g or /G loop and we previously set
1700 g_notempty after a null match, this is not necessarily the end.
1701 We want to advance the start offset, and continue. In the case of UTF-8
1702 matching, the advance must be one character, not one byte. Fudge the
1703 offset values to achieve this. We won't be at the end of the string -
1704 that was checked before setting g_notempty. */
1708 if (g_notempty != 0)
1711 use_offsets[0] = start_offset;
1714 while (start_offset + onechar < len)
1716 int tb = bptr[start_offset+onechar];
1717 if (tb <= 127) break;
1719 if (tb != 0 && tb != 0xc0) onechar++;
1722 use_offsets[1] = start_offset + onechar;
1726 if (count == PCRE_ERROR_NOMATCH)
1728 if (gmatched == 0) fprintf(outfile, "No match\n");
1730 else fprintf(outfile, "Error %d\n", count);
1731 break; /* Out of the /g loop */
1735 /* If not /g or /G we are done */
1737 if (!do_g && !do_G) break;
1739 /* If we have matched an empty string, first check to see if we are at
1740 the end of the subject. If so, the /g loop is over. Otherwise, mimic
1741 what Perl's /g option does. This turns out to be rather cunning. First
1742 we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1743 same point. If this fails (picked up above) we advance to the next
1747 if (use_offsets[0] == use_offsets[1])
1749 if (use_offsets[0] == len) break;
1750 g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1753 /* For /g, update the start offset, leaving the rest alone */
1755 if (do_g) start_offset = use_offsets[1];
1757 /* For /G, update the pointer and length */
1761 bptr += use_offsets[1];
1762 len -= use_offsets[1];
1764 } /* End of loop for /g and /G */
1765 } /* End of loop for data lines */
1769 #if !defined NOPOSIX
1770 if (posix || do_posix) regfree(&preg);
1773 if (re != NULL) free(re);
1774 if (extra != NULL) free(extra);
1777 free((void *)tables);
1778 setlocale(LC_CTYPE, "C");
1782 if (infile == stdin) fprintf(outfile, "\n");