1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27#ifdef RE_ENABLE_I18N
28static void free_charset (re_charset_t *cset);
29#endif /* RE_ENABLE_I18N */
30static void free_workarea_compile (regex_t *preg);
31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32#ifdef RE_ENABLE_I18N
33static void optimize_utf8 (re_dfa_t *dfa);
34#endif
35static reg_errcode_t analyze (regex_t *preg);
36static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89#ifdef RE_ENABLE_I18N
90static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const char *class_name,
99 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const char *class_name,
106 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
111 const char *extra,
112 int non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120static void free_token (re_token_t *node);
121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123\f
124/* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129const char __re_error_msgid[] attribute_hidden =
130 {
131#define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203\f
204/* Entry points for GNU code. */
205
206
207#ifdef ZOS_USS
208
209/* For ZOS USS we must define btowc */
210
211wchar_t
212btowc (int c)
213{
214 wchar_t wtmp[2];
215 char tmp[2];
216
217 tmp[0] = c;
218 tmp[1] = 0;
219
220 mbtowc (wtmp, tmp, 1);
221 return wtmp[0];
222}
223#endif
224
225/* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
228
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
231
232const char *
233re_compile_pattern (pattern, length, bufp)
234 const char *pattern;
235 size_t length;
236 struct re_pattern_buffer *bufp;
237{
238 reg_errcode_t ret;
239
240 /* And GNU code determines whether or not to get register information
241 by passing null for the REGS argument to re_match, etc., not by
242 setting no_sub, unless RE_NO_SUB is set. */
243 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
244
245 /* Match anchors at newline. */
246 bufp->newline_anchor = 1;
247
248 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
249
250 if (!ret)
251 return NULL;
252 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
253}
254#ifdef _LIBC
255weak_alias (__re_compile_pattern, re_compile_pattern)
256#endif
257
258/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
259 also be assigned to arbitrarily: each pattern buffer stores its own
260 syntax, so it can be changed between regex compilations. */
261/* This has no initializer because initialized variables in Emacs
262 become read-only after dumping. */
263reg_syntax_t re_syntax_options;
264
265
266/* Specify the precise syntax of regexps for compilation. This provides
267 for compatibility for various utilities which historically have
268 different, incompatible syntaxes.
269
270 The argument SYNTAX is a bit mask comprised of the various bits
271 defined in regex.h. We return the old syntax. */
272
273reg_syntax_t
274re_set_syntax (syntax)
275 reg_syntax_t syntax;
276{
277 reg_syntax_t ret = re_syntax_options;
278
279 re_syntax_options = syntax;
280 return ret;
281}
282#ifdef _LIBC
283weak_alias (__re_set_syntax, re_set_syntax)
284#endif
285
286int
287re_compile_fastmap (bufp)
288 struct re_pattern_buffer *bufp;
289{
290 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
291 char *fastmap = bufp->fastmap;
292
293 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
294 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
295 if (dfa->init_state != dfa->init_state_word)
296 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
297 if (dfa->init_state != dfa->init_state_nl)
298 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
299 if (dfa->init_state != dfa->init_state_begbuf)
300 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
301 bufp->fastmap_accurate = 1;
302 return 0;
303}
304#ifdef _LIBC
305weak_alias (__re_compile_fastmap, re_compile_fastmap)
306#endif
307
308static inline void
309__attribute ((always_inline))
310re_set_fastmap (char *fastmap, int icase, int ch)
311{
312 fastmap[ch] = 1;
313 if (icase)
314 fastmap[tolower (ch)] = 1;
315}
316
317/* Helper function for re_compile_fastmap.
318 Compile fastmap for the initial_state INIT_STATE. */
319
320static void
321re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
322 char *fastmap)
323{
324 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 int node_cnt;
326 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
327 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
328 {
329 int node = init_state->nodes.elems[node_cnt];
330 re_token_type_t type = dfa->nodes[node].type;
331
332 if (type == CHARACTER)
333 {
334 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
335#ifdef RE_ENABLE_I18N
336 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
337 {
338 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
339 wchar_t wc;
340 mbstate_t state;
341
342 p = buf;
343 *p++ = dfa->nodes[node].opr.c;
344 while (++node < dfa->nodes_len
345 && dfa->nodes[node].type == CHARACTER
346 && dfa->nodes[node].mb_partial)
347 *p++ = dfa->nodes[node].opr.c;
348 memset (&state, '\0', sizeof (state));
349 if (__mbrtowc (&wc, (const char *) buf, p - buf,
350 &state) == p - buf
351 && (__wcrtomb ((char *) buf, towlower (wc), &state)
352 != (size_t) -1))
353 re_set_fastmap (fastmap, 0, buf[0]);
354 re_free (buf);
355 }
356#endif
357 }
358 else if (type == SIMPLE_BRACKET)
359 {
360 int i, ch;
361 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
362 {
363 int j;
364 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
365 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
366 if (w & ((bitset_word_t) 1 << j))
367 re_set_fastmap (fastmap, icase, ch);
368 }
369 }
370#ifdef RE_ENABLE_I18N
371 else if (type == COMPLEX_BRACKET)
372 {
373 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
374 int i;
375
376# ifdef _LIBC
377 /* See if we have to try all bytes which start multiple collation
378 elements.
379 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
380 collation element, and don't catch 'b' since 'b' is
381 the only collation element which starts from 'b' (and
382 it is caught by SIMPLE_BRACKET). */
383 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
384 && (cset->ncoll_syms || cset->nranges))
385 {
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0; i < SBC_MAX; ++i)
389 if (table[i] < 0)
390 re_set_fastmap (fastmap, icase, i);
391 }
392# endif /* _LIBC */
393
394 /* See if we have to start the match at all multibyte characters,
395 i.e. where we would not find an invalid sequence. This only
396 applies to multibyte character sets; for single byte character
397 sets, the SIMPLE_BRACKET again suffices. */
398 if (dfa->mb_cur_max > 1
399 && (cset->nchar_classes || cset->non_match || cset->nranges
400# ifdef _LIBC
401 || cset->nequiv_classes
402# endif /* _LIBC */
403 ))
404 {
405 unsigned char c = 0;
406 do
407 {
408 mbstate_t mbs;
409 memset (&mbs, 0, sizeof (mbs));
410 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
411 re_set_fastmap (fastmap, false, (int) c);
412 }
413 while (++c != 0);
414 }
415
416 else
417 {
418 /* ... Else catch all bytes which can start the mbchars. */
419 for (i = 0; i < cset->nmbchars; ++i)
420 {
421 char buf[256];
422 mbstate_t state;
423 memset (&state, '\0', sizeof (state));
424 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
425 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
426 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
427 {
428 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
429 != (size_t) -1)
430 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
431 }
432 }
433 }
434 }
435#endif /* RE_ENABLE_I18N */
436 else if (type == OP_PERIOD
437#ifdef RE_ENABLE_I18N
438 || type == OP_UTF8_PERIOD
439#endif /* RE_ENABLE_I18N */
440 || type == END_OF_RE)
441 {
442 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
443 if (type == END_OF_RE)
444 bufp->can_be_null = 1;
445 return;
446 }
447 }
448}
449\f
450/* Entry point for POSIX code. */
451/* regcomp takes a regular expression as a string and compiles it.
452
453 PREG is a regex_t *. We do not expect any fields to be initialized,
454 since POSIX says we shouldn't. Thus, we set
455
456 `buffer' to the compiled pattern;
457 `used' to the length of the compiled pattern;
458 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
459 REG_EXTENDED bit in CFLAGS is set; otherwise, to
460 RE_SYNTAX_POSIX_BASIC;
461 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
462 `fastmap' to an allocated space for the fastmap;
463 `fastmap_accurate' to zero;
464 `re_nsub' to the number of subexpressions in PATTERN.
465
466 PATTERN is the address of the pattern string.
467
468 CFLAGS is a series of bits which affect compilation.
469
470 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
471 use POSIX basic syntax.
472
473 If REG_NEWLINE is set, then . and [^...] don't match newline.
474 Also, regexec will try a match beginning after every newline.
475
476 If REG_ICASE is set, then we considers upper- and lowercase
477 versions of letters to be equivalent when matching.
478
479 If REG_NOSUB is set, then when PREG is passed to regexec, that
480 routine will report only success or failure, and nothing about the
481 registers.
482
483 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
484 the return codes and their meanings.) */
485
486int
487regcomp (preg, pattern, cflags)
488 regex_t *__restrict preg;
489 const char *__restrict pattern;
490 int cflags;
491{
492 reg_errcode_t ret;
493 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
494 : RE_SYNTAX_POSIX_BASIC);
495
496 preg->buffer = NULL;
497 preg->allocated = 0;
498 preg->used = 0;
499
500 /* Try to allocate space for the fastmap. */
501 preg->fastmap = re_malloc (char, SBC_MAX);
502 if (BE (preg->fastmap == NULL, 0))
503 return REG_ESPACE;
504
505 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
506
507 /* If REG_NEWLINE is set, newlines are treated differently. */
508 if (cflags & REG_NEWLINE)
509 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
510 syntax &= ~RE_DOT_NEWLINE;
511 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
512 /* It also changes the matching behavior. */
513 preg->newline_anchor = 1;
514 }
515 else
516 preg->newline_anchor = 0;
517 preg->no_sub = !!(cflags & REG_NOSUB);
518 preg->translate = NULL;
519
520 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
521
522 /* POSIX doesn't distinguish between an unmatched open-group and an
523 unmatched close-group: both are REG_EPAREN. */
524 if (ret == REG_ERPAREN)
525 ret = REG_EPAREN;
526
527 /* We have already checked preg->fastmap != NULL. */
528 if (BE (ret == REG_NOERROR, 1))
529 /* Compute the fastmap now, since regexec cannot modify the pattern
530 buffer. This function never fails in this implementation. */
531 (void) re_compile_fastmap (preg);
532 else
533 {
534 /* Some error occurred while compiling the expression. */
535 re_free (preg->fastmap);
536 preg->fastmap = NULL;
537 }
538
539 return (int) ret;
540}
541#ifdef _LIBC
542weak_alias (__regcomp, regcomp)
543#endif
544
545/* Returns a message corresponding to an error code, ERRCODE, returned
546 from either regcomp or regexec. We don't use PREG here. */
547
548size_t
549regerror(int errcode, const regex_t *__restrict preg,
550 char *__restrict errbuf, size_t errbuf_size)
551{
552 const char *msg;
553 size_t msg_size;
554
555 if (BE (errcode < 0
556 || errcode >= (int) (sizeof (__re_error_msgid_idx)
557 / sizeof (__re_error_msgid_idx[0])), 0))
558 /* Only error codes returned by the rest of the code should be passed
559 to this routine. If we are given anything else, or if other regex
560 code generates an invalid error code, then the program has a bug.
561 Dump core so we can fix it. */
562 abort ();
563
564 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
565
566 msg_size = strlen (msg) + 1; /* Includes the null. */
567
568 if (BE (errbuf_size != 0, 1))
569 {
570 if (BE (msg_size > errbuf_size, 0))
571 {
572 memcpy (errbuf, msg, errbuf_size - 1);
573 errbuf[errbuf_size - 1] = 0;
574 }
575 else
576 memcpy (errbuf, msg, msg_size);
577 }
578
579 return msg_size;
580}
581#ifdef _LIBC
582weak_alias (__regerror, regerror)
583#endif
584
585
586#ifdef RE_ENABLE_I18N
587/* This static array is used for the map to single-byte characters when
588 UTF-8 is used. Otherwise we would allocate memory just to initialize
589 it the same all the time. UTF-8 is the preferred encoding so this is
590 a worthwhile optimization. */
591#if __GNUC__ >= 3
592static const bitset_t utf8_sb_map = {
593 /* Set the first 128 bits. */
594 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
595};
596#else /* ! (__GNUC__ >= 3) */
597static bitset_t utf8_sb_map;
598#endif /* __GNUC__ >= 3 */
599#endif /* RE_ENABLE_I18N */
600
601
602static void
603free_dfa_content (re_dfa_t *dfa)
604{
605 int i, j;
606
607 if (dfa->nodes)
608 for (i = 0; i < dfa->nodes_len; ++i)
609 free_token (dfa->nodes + i);
610 re_free (dfa->nexts);
611 for (i = 0; i < dfa->nodes_len; ++i)
612 {
613 if (dfa->eclosures != NULL)
614 re_node_set_free (dfa->eclosures + i);
615 if (dfa->inveclosures != NULL)
616 re_node_set_free (dfa->inveclosures + i);
617 if (dfa->edests != NULL)
618 re_node_set_free (dfa->edests + i);
619 }
620 re_free (dfa->edests);
621 re_free (dfa->eclosures);
622 re_free (dfa->inveclosures);
623 re_free (dfa->nodes);
624
625 if (dfa->state_table)
626 for (i = 0; i <= dfa->state_hash_mask; ++i)
627 {
628 struct re_state_table_entry *entry = dfa->state_table + i;
629 for (j = 0; j < entry->num; ++j)
630 {
631 re_dfastate_t *state = entry->array[j];
632 free_state (state);
633 }
634 re_free (entry->array);
635 }
636 re_free (dfa->state_table);
637#ifdef RE_ENABLE_I18N
638 if (dfa->sb_char != utf8_sb_map)
639 re_free (dfa->sb_char);
640#endif
641 re_free (dfa->subexp_map);
642#ifdef DEBUG
643 re_free (dfa->re_str);
644#endif
645
646 re_free (dfa);
647}
648
649
650/* Free dynamically allocated space used by PREG. */
651
652void
653regfree (preg)
654 regex_t *preg;
655{
656 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
657 if (BE (dfa != NULL, 1))
658 free_dfa_content (dfa);
659 preg->buffer = NULL;
660 preg->allocated = 0;
661
662 re_free (preg->fastmap);
663 preg->fastmap = NULL;
664
665 re_free (preg->translate);
666 preg->translate = NULL;
667}
668#ifdef _LIBC
669weak_alias (__regfree, regfree)
670#endif
671\f
672/* Entry points compatible with 4.2 BSD regex library. We don't define
673 them unless specifically requested. */
674
675#if defined _REGEX_RE_COMP || defined _LIBC
676
677/* BSD has one and only one pattern buffer. */
678static struct re_pattern_buffer re_comp_buf;
679
680char *
681# ifdef _LIBC
682/* Make these definitions weak in libc, so POSIX programs can redefine
683 these names if they don't use our functions, and still use
684 regcomp/regexec above without link errors. */
685weak_function
686# endif
687re_comp (s)
688 const char *s;
689{
690 reg_errcode_t ret;
691 char *fastmap;
692
693 if (!s)
694 {
695 if (!re_comp_buf.buffer)
696 return gettext ("No previous regular expression");
697 return 0;
698 }
699
700 if (re_comp_buf.buffer)
701 {
702 fastmap = re_comp_buf.fastmap;
703 re_comp_buf.fastmap = NULL;
704 __regfree (&re_comp_buf);
705 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
706 re_comp_buf.fastmap = fastmap;
707 }
708
709 if (re_comp_buf.fastmap == NULL)
710 {
711 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
712 if (re_comp_buf.fastmap == NULL)
713 return (char *) gettext (__re_error_msgid
714 + __re_error_msgid_idx[(int) REG_ESPACE]);
715 }
716
717 /* Since `re_exec' always passes NULL for the `regs' argument, we
718 don't need to initialize the pattern buffer fields which affect it. */
719
720 /* Match anchors at newlines. */
721 re_comp_buf.newline_anchor = 1;
722
723 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
724
725 if (!ret)
726 return NULL;
727
728 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
729 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
730}
731
732#ifdef _LIBC
733libc_freeres_fn (free_mem)
734{
735 __regfree (&re_comp_buf);
736}
737#endif
738
739#endif /* _REGEX_RE_COMP */
740\f
741/* Internal entry point.
742 Compile the regular expression PATTERN, whose length is LENGTH.
743 SYNTAX indicate regular expression's syntax. */
744
745static reg_errcode_t
746re_compile_internal (regex_t *preg, const char * pattern, size_t length,
747 reg_syntax_t syntax)
748{
749 reg_errcode_t err = REG_NOERROR;
750 re_dfa_t *dfa;
751 re_string_t regexp;
752
753 /* Initialize the pattern buffer. */
754 preg->fastmap_accurate = 0;
755 preg->syntax = syntax;
756 preg->not_bol = preg->not_eol = 0;
757 preg->used = 0;
758 preg->re_nsub = 0;
759 preg->can_be_null = 0;
760 preg->regs_allocated = REGS_UNALLOCATED;
761
762 /* Initialize the dfa. */
763 dfa = (re_dfa_t *) preg->buffer;
764 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
765 {
766 /* If zero allocated, but buffer is non-null, try to realloc
767 enough space. This loses if buffer's address is bogus, but
768 that is the user's responsibility. If ->buffer is NULL this
769 is a simple allocation. */
770 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
771 if (dfa == NULL)
772 return REG_ESPACE;
773 preg->allocated = sizeof (re_dfa_t);
774 preg->buffer = (unsigned char *) dfa;
775 }
776 preg->used = sizeof (re_dfa_t);
777
778 err = init_dfa (dfa, length);
779 if (BE (err != REG_NOERROR, 0))
780 {
781 free_dfa_content (dfa);
782 preg->buffer = NULL;
783 preg->allocated = 0;
784 return err;
785 }
786#ifdef DEBUG
787 /* Note: length+1 will not overflow since it is checked in init_dfa. */
788 dfa->re_str = re_malloc (char, length + 1);
789 strncpy (dfa->re_str, pattern, length + 1);
790#endif
791
792 __libc_lock_init (dfa->lock);
793
794 err = re_string_construct (®exp, pattern, length, preg->translate,
795 syntax & RE_ICASE, dfa);
796 if (BE (err != REG_NOERROR, 0))
797 {
798 re_compile_internal_free_return:
799 free_workarea_compile (preg);
800 re_string_destruct (®exp);
801 free_dfa_content (dfa);
802 preg->buffer = NULL;
803 preg->allocated = 0;
804 return err;
805 }
806
807 /* Parse the regular expression, and build a structure tree. */
808 preg->re_nsub = 0;
809 dfa->str_tree = parse (®exp, preg, syntax, &err);
810 if (BE (dfa->str_tree == NULL, 0))
811 goto re_compile_internal_free_return;
812
813 /* Analyze the tree and create the nfa. */
814 err = analyze (preg);
815 if (BE (err != REG_NOERROR, 0))
816 goto re_compile_internal_free_return;
817
818#ifdef RE_ENABLE_I18N
819 /* If possible, do searching in single byte encoding to speed things up. */
820 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
821 optimize_utf8 (dfa);
822#endif
823
824 /* Then create the initial state of the dfa. */
825 err = create_initial_state (dfa);
826
827 /* Release work areas. */
828 free_workarea_compile (preg);
829 re_string_destruct (®exp);
830
831 if (BE (err != REG_NOERROR, 0))
832 {
833 free_dfa_content (dfa);
834 preg->buffer = NULL;
835 preg->allocated = 0;
836 }
837
838 return err;
839}
840
841/* Initialize DFA. We use the length of the regular expression PAT_LEN
842 as the initial length of some arrays. */
843
844static reg_errcode_t
845init_dfa (re_dfa_t *dfa, size_t pat_len)
846{
847 unsigned int table_size;
848#ifndef _LIBC
849 char *codeset_name;
850#endif
851
852 memset (dfa, '\0', sizeof (re_dfa_t));
853
854 /* Force allocation of str_tree_storage the first time. */
855 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
856
857 /* Avoid overflows. */
858 if (pat_len == SIZE_MAX)
859 return REG_ESPACE;
860
861 dfa->nodes_alloc = pat_len + 1;
862 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
863
864 /* table_size = 2 ^ ceil(log pat_len) */
865 for (table_size = 1; ; table_size <<= 1)
866 if (table_size > pat_len)
867 break;
868
869 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
870 dfa->state_hash_mask = table_size - 1;
871
872 dfa->mb_cur_max = MB_CUR_MAX;
873#ifdef _LIBC
874 if (dfa->mb_cur_max == 6
875 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
876 dfa->is_utf8 = 1;
877 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
878 != 0);
879#else
880# ifdef HAVE_LANGINFO_CODESET
881 codeset_name = nl_langinfo (CODESET);
882# else
883 codeset_name = getenv ("LC_ALL");
884 if (codeset_name == NULL || codeset_name[0] == '\0')
885 codeset_name = getenv ("LC_CTYPE");
886 if (codeset_name == NULL || codeset_name[0] == '\0')
887 codeset_name = getenv ("LANG");
888 if (codeset_name == NULL)
889 codeset_name = "";
890 else if (strchr (codeset_name, '.') != NULL)
891 codeset_name = strchr (codeset_name, '.') + 1;
892# endif
893
894 /* strcasecmp isn't a standard interface. brute force check */
895#if 0
896 if (strcasecmp (codeset_name, "UTF-8") == 0
897 || strcasecmp (codeset_name, "UTF8") == 0)
898 dfa->is_utf8 = 1;
899#else
900 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
901 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
902 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
903 && (codeset_name[3] == '-'
904 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
905 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
906 dfa->is_utf8 = 1;
907#endif
908
909 /* We check exhaustively in the loop below if this charset is a
910 superset of ASCII. */
911 dfa->map_notascii = 0;
912#endif
913
914#ifdef RE_ENABLE_I18N
915 if (dfa->mb_cur_max > 1)
916 {
917 if (dfa->is_utf8)
918 {
919#if !defined(__GNUC__) || __GNUC__ < 3
920 static short utf8_sb_map_inited = 0;
921
922 if (! utf8_sb_map_inited)
923 {
924 int i;
925
926 utf8_sb_map_inited = 0;
927 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
928 utf8_sb_map[i] = BITSET_WORD_MAX;
929 }
930#endif
931 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
932 }
933 else
934 {
935 int i, j, ch;
936
937 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
938 if (BE (dfa->sb_char == NULL, 0))
939 return REG_ESPACE;
940
941 /* Set the bits corresponding to single byte chars. */
942 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
943 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
944 {
945 wint_t wch = __btowc (ch);
946 if (wch != WEOF)
947 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
948# ifndef _LIBC
949 if (isascii (ch) && wch != ch)
950 dfa->map_notascii = 1;
951# endif
952 }
953 }
954 }
955#endif
956
957 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
958 return REG_ESPACE;
959 return REG_NOERROR;
960}
961
962/* Initialize WORD_CHAR table, which indicate which character is
963 "word". In this case "word" means that it is the word construction
964 character used by some operators like "\<", "\>", etc. */
965
966static void
967internal_function
968init_word_char (re_dfa_t *dfa)
969{
970 int i, j, ch;
971 dfa->word_ops_used = 1;
972 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
973 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
974 if (isalnum (ch) || ch == '_')
975 dfa->word_char[i] |= (bitset_word_t) 1 << j;
976}
977
978/* Free the work area which are only used while compiling. */
979
980static void
981free_workarea_compile (regex_t *preg)
982{
983 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
984 bin_tree_storage_t *storage, *next;
985 for (storage = dfa->str_tree_storage; storage; storage = next)
986 {
987 next = storage->next;
988 re_free (storage);
989 }
990 dfa->str_tree_storage = NULL;
991 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
992 dfa->str_tree = NULL;
993 re_free (dfa->org_indices);
994 dfa->org_indices = NULL;
995}
996
997/* Create initial states for all contexts. */
998
999static reg_errcode_t
1000create_initial_state (re_dfa_t *dfa)
1001{
1002 int first, i;
1003 reg_errcode_t err;
1004 re_node_set init_nodes;
1005
1006 /* Initial states have the epsilon closure of the node which is
1007 the first node of the regular expression. */
1008 first = dfa->str_tree->first->node_idx;
1009 dfa->init_node = first;
1010 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1011 if (BE (err != REG_NOERROR, 0))
1012 return err;
1013
1014 /* The back-references which are in initial states can epsilon transit,
1015 since in this case all of the subexpressions can be null.
1016 Then we add epsilon closures of the nodes which are the next nodes of
1017 the back-references. */
1018 if (dfa->nbackref > 0)
1019 for (i = 0; i < init_nodes.nelem; ++i)
1020 {
1021 int node_idx = init_nodes.elems[i];
1022 re_token_type_t type = dfa->nodes[node_idx].type;
1023
1024 int clexp_idx;
1025 if (type != OP_BACK_REF)
1026 continue;
1027 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1028 {
1029 re_token_t *clexp_node;
1030 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1031 if (clexp_node->type == OP_CLOSE_SUBEXP
1032 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1033 break;
1034 }
1035 if (clexp_idx == init_nodes.nelem)
1036 continue;
1037
1038 if (type == OP_BACK_REF)
1039 {
1040 int dest_idx = dfa->edests[node_idx].elems[0];
1041 if (!re_node_set_contains (&init_nodes, dest_idx))
1042 {
1043 reg_errcode_t err = re_node_set_merge (&init_nodes,
1044 dfa->eclosures
1045 + dest_idx);
1046 if (err != REG_NOERROR)
1047 return err;
1048 i = 0;
1049 }
1050 }
1051 }
1052
1053 /* It must be the first time to invoke acquire_state. */
1054 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1055 /* We don't check ERR here, since the initial state must not be NULL. */
1056 if (BE (dfa->init_state == NULL, 0))
1057 return err;
1058 if (dfa->init_state->has_constraint)
1059 {
1060 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1061 CONTEXT_WORD);
1062 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1063 CONTEXT_NEWLINE);
1064 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1065 &init_nodes,
1066 CONTEXT_NEWLINE
1067 | CONTEXT_BEGBUF);
1068 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1069 || dfa->init_state_begbuf == NULL, 0))
1070 return err;
1071 }
1072 else
1073 dfa->init_state_word = dfa->init_state_nl
1074 = dfa->init_state_begbuf = dfa->init_state;
1075
1076 re_node_set_free (&init_nodes);
1077 return REG_NOERROR;
1078}
1079\f
1080#ifdef RE_ENABLE_I18N
1081/* If it is possible to do searching in single byte encoding instead of UTF-8
1082 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1083 DFA nodes where needed. */
1084
1085static void
1086optimize_utf8 (re_dfa_t *dfa)
1087{
1088 int node, i, mb_chars = 0, has_period = 0;
1089
1090 for (node = 0; node < dfa->nodes_len; ++node)
1091 switch (dfa->nodes[node].type)
1092 {
1093 case CHARACTER:
1094 if (dfa->nodes[node].opr.c >= 0x80)
1095 mb_chars = 1;
1096 break;
1097 case ANCHOR:
1098 switch (dfa->nodes[node].opr.ctx_type)
1099 {
1100 case LINE_FIRST:
1101 case LINE_LAST:
1102 case BUF_FIRST:
1103 case BUF_LAST:
1104 break;
1105 default:
1106 /* Word anchors etc. cannot be handled. It's okay to test
1107 opr.ctx_type since constraints (for all DFA nodes) are
1108 created by ORing one or more opr.ctx_type values. */
1109 return;
1110 }
1111 break;
1112 case OP_PERIOD:
1113 has_period = 1;
1114 break;
1115 case OP_BACK_REF:
1116 case OP_ALT:
1117 case END_OF_RE:
1118 case OP_DUP_ASTERISK:
1119 case OP_OPEN_SUBEXP:
1120 case OP_CLOSE_SUBEXP:
1121 break;
1122 case COMPLEX_BRACKET:
1123 return;
1124 case SIMPLE_BRACKET:
1125 /* Just double check. The non-ASCII range starts at 0x80. */
1126 assert (0x80 % BITSET_WORD_BITS == 0);
1127 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1128 if (dfa->nodes[node].opr.sbcset[i])
1129 return;
1130 break;
1131 default:
1132 abort ();
1133 }
1134
1135 if (mb_chars || has_period)
1136 for (node = 0; node < dfa->nodes_len; ++node)
1137 {
1138 if (dfa->nodes[node].type == CHARACTER
1139 && dfa->nodes[node].opr.c >= 0x80)
1140 dfa->nodes[node].mb_partial = 0;
1141 else if (dfa->nodes[node].type == OP_PERIOD)
1142 dfa->nodes[node].type = OP_UTF8_PERIOD;
1143 }
1144
1145 /* The search can be in single byte locale. */
1146 dfa->mb_cur_max = 1;
1147 dfa->is_utf8 = 0;
1148 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1149}
1150#endif
1151\f
1152/* Analyze the structure tree, and calculate "first", "next", "edest",
1153 "eclosure", and "inveclosure". */
1154
1155static reg_errcode_t
1156analyze (regex_t *preg)
1157{
1158 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1159 reg_errcode_t ret;
1160
1161 /* Allocate arrays. */
1162 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1163 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1164 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1165 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1166 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1167 || dfa->eclosures == NULL, 0))
1168 return REG_ESPACE;
1169
1170 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1171 if (dfa->subexp_map != NULL)
1172 {
1173 int i;
1174 for (i = 0; i < preg->re_nsub; i++)
1175 dfa->subexp_map[i] = i;
1176 preorder (dfa->str_tree, optimize_subexps, dfa);
1177 for (i = 0; i < preg->re_nsub; i++)
1178 if (dfa->subexp_map[i] != i)
1179 break;
1180 if (i == preg->re_nsub)
1181 {
1182 free (dfa->subexp_map);
1183 dfa->subexp_map = NULL;
1184 }
1185 }
1186
1187 ret = postorder (dfa->str_tree, lower_subexps, preg);
1188 if (BE (ret != REG_NOERROR, 0))
1189 return ret;
1190 ret = postorder (dfa->str_tree, calc_first, dfa);
1191 if (BE (ret != REG_NOERROR, 0))
1192 return ret;
1193 preorder (dfa->str_tree, calc_next, dfa);
1194 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1195 if (BE (ret != REG_NOERROR, 0))
1196 return ret;
1197 ret = calc_eclosure (dfa);
1198 if (BE (ret != REG_NOERROR, 0))
1199 return ret;
1200
1201 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1202 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1203 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1204 || dfa->nbackref)
1205 {
1206 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1207 if (BE (dfa->inveclosures == NULL, 0))
1208 return REG_ESPACE;
1209 ret = calc_inveclosure (dfa);
1210 }
1211
1212 return ret;
1213}
1214
1215/* Our parse trees are very unbalanced, so we cannot use a stack to
1216 implement parse tree visits. Instead, we use parent pointers and
1217 some hairy code in these two functions. */
1218static reg_errcode_t
1219postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1220 void *extra)
1221{
1222 bin_tree_t *node, *prev;
1223
1224 for (node = root; ; )
1225 {
1226 /* Descend down the tree, preferably to the left (or to the right
1227 if that's the only child). */
1228 while (node->left || node->right)
1229 if (node->left)
1230 node = node->left;
1231 else
1232 node = node->right;
1233
1234 do
1235 {
1236 reg_errcode_t err = fn (extra, node);
1237 if (BE (err != REG_NOERROR, 0))
1238 return err;
1239 if (node->parent == NULL)
1240 return REG_NOERROR;
1241 prev = node;
1242 node = node->parent;
1243 }
1244 /* Go up while we have a node that is reached from the right. */
1245 while (node->right == prev || node->right == NULL);
1246 node = node->right;
1247 }
1248}
1249
1250static reg_errcode_t
1251preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1252 void *extra)
1253{
1254 bin_tree_t *node;
1255
1256 for (node = root; ; )
1257 {
1258 reg_errcode_t err = fn (extra, node);
1259 if (BE (err != REG_NOERROR, 0))
1260 return err;
1261
1262 /* Go to the left node, or up and to the right. */
1263 if (node->left)
1264 node = node->left;
1265 else
1266 {
1267 bin_tree_t *prev = NULL;
1268 while (node->right == prev || node->right == NULL)
1269 {
1270 prev = node;
1271 node = node->parent;
1272 if (!node)
1273 return REG_NOERROR;
1274 }
1275 node = node->right;
1276 }
1277 }
1278}
1279
1280/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1281 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1282 backreferences as well. Requires a preorder visit. */
1283static reg_errcode_t
1284optimize_subexps (void *extra, bin_tree_t *node)
1285{
1286 re_dfa_t *dfa = (re_dfa_t *) extra;
1287
1288 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1289 {
1290 int idx = node->token.opr.idx;
1291 node->token.opr.idx = dfa->subexp_map[idx];
1292 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1293 }
1294
1295 else if (node->token.type == SUBEXP
1296 && node->left && node->left->token.type == SUBEXP)
1297 {
1298 int other_idx = node->left->token.opr.idx;
1299
1300 node->left = node->left->left;
1301 if (node->left)
1302 node->left->parent = node;
1303
1304 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1305 if (other_idx < BITSET_WORD_BITS)
1306 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1307 }
1308
1309 return REG_NOERROR;
1310}
1311
1312/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1313 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1314static reg_errcode_t
1315lower_subexps (void *extra, bin_tree_t *node)
1316{
1317 regex_t *preg = (regex_t *) extra;
1318 reg_errcode_t err = REG_NOERROR;
1319
1320 if (node->left && node->left->token.type == SUBEXP)
1321 {
1322 node->left = lower_subexp (&err, preg, node->left);
1323 if (node->left)
1324 node->left->parent = node;
1325 }
1326 if (node->right && node->right->token.type == SUBEXP)
1327 {
1328 node->right = lower_subexp (&err, preg, node->right);
1329 if (node->right)
1330 node->right->parent = node;
1331 }
1332
1333 return err;
1334}
1335
1336static bin_tree_t *
1337lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1338{
1339 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1340 bin_tree_t *body = node->left;
1341 bin_tree_t *op, *cls, *tree1, *tree;
1342
1343 if (preg->no_sub
1344 /* We do not optimize empty subexpressions, because otherwise we may
1345 have bad CONCAT nodes with NULL children. This is obviously not
1346 very common, so we do not lose much. An example that triggers
1347 this case is the sed "script" /\(\)/x. */
1348 && node->left != NULL
1349 && (node->token.opr.idx >= BITSET_WORD_BITS
1350 || !(dfa->used_bkref_map
1351 & ((bitset_word_t) 1 << node->token.opr.idx))))
1352 return node->left;
1353
1354 /* Convert the SUBEXP node to the concatenation of an
1355 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1356 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1357 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1358 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1359 tree = create_tree (dfa, op, tree1, CONCAT);
1360 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1361 {
1362 *err = REG_ESPACE;
1363 return NULL;
1364 }
1365
1366 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1367 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1368 return tree;
1369}
1370
1371/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1372 nodes. Requires a postorder visit. */
1373static reg_errcode_t
1374calc_first (void *extra, bin_tree_t *node)
1375{
1376 re_dfa_t *dfa = (re_dfa_t *) extra;
1377 if (node->token.type == CONCAT)
1378 {
1379 node->first = node->left->first;
1380 node->node_idx = node->left->node_idx;
1381 }
1382 else
1383 {
1384 node->first = node;
1385 node->node_idx = re_dfa_add_node (dfa, node->token);
1386 if (BE (node->node_idx == -1, 0))
1387 return REG_ESPACE;
1388 if (node->token.type == ANCHOR)
1389 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1390 }
1391 return REG_NOERROR;
1392}
1393
1394/* Pass 2: compute NEXT on the tree. Preorder visit. */
1395static reg_errcode_t
1396calc_next (void *extra, bin_tree_t *node)
1397{
1398 switch (node->token.type)
1399 {
1400 case OP_DUP_ASTERISK:
1401 node->left->next = node;
1402 break;
1403 case CONCAT:
1404 node->left->next = node->right->first;
1405 node->right->next = node->next;
1406 break;
1407 default:
1408 if (node->left)
1409 node->left->next = node->next;
1410 if (node->right)
1411 node->right->next = node->next;
1412 break;
1413 }
1414 return REG_NOERROR;
1415}
1416
1417/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1418static reg_errcode_t
1419link_nfa_nodes (void *extra, bin_tree_t *node)
1420{
1421 re_dfa_t *dfa = (re_dfa_t *) extra;
1422 int idx = node->node_idx;
1423 reg_errcode_t err = REG_NOERROR;
1424
1425 switch (node->token.type)
1426 {
1427 case CONCAT:
1428 break;
1429
1430 case END_OF_RE:
1431 assert (node->next == NULL);
1432 break;
1433
1434 case OP_DUP_ASTERISK:
1435 case OP_ALT:
1436 {
1437 int left, right;
1438 dfa->has_plural_match = 1;
1439 if (node->left != NULL)
1440 left = node->left->first->node_idx;
1441 else
1442 left = node->next->node_idx;
1443 if (node->right != NULL)
1444 right = node->right->first->node_idx;
1445 else
1446 right = node->next->node_idx;
1447 assert (left > -1);
1448 assert (right > -1);
1449 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1450 }
1451 break;
1452
1453 case ANCHOR:
1454 case OP_OPEN_SUBEXP:
1455 case OP_CLOSE_SUBEXP:
1456 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1457 break;
1458
1459 case OP_BACK_REF:
1460 dfa->nexts[idx] = node->next->node_idx;
1461 if (node->token.type == OP_BACK_REF)
1462 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1463 break;
1464
1465 default:
1466 assert (!IS_EPSILON_NODE (node->token.type));
1467 dfa->nexts[idx] = node->next->node_idx;
1468 break;
1469 }
1470
1471 return err;
1472}
1473
1474/* Duplicate the epsilon closure of the node ROOT_NODE.
1475 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1476 to their own constraint. */
1477
1478static reg_errcode_t
1479internal_function
1480duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1481 int root_node, unsigned int init_constraint)
1482{
1483 int org_node, clone_node, ret;
1484 unsigned int constraint = init_constraint;
1485 for (org_node = top_org_node, clone_node = top_clone_node;;)
1486 {
1487 int org_dest, clone_dest;
1488 if (dfa->nodes[org_node].type == OP_BACK_REF)
1489 {
1490 /* If the back reference epsilon-transit, its destination must
1491 also have the constraint. Then duplicate the epsilon closure
1492 of the destination of the back reference, and store it in
1493 edests of the back reference. */
1494 org_dest = dfa->nexts[org_node];
1495 re_node_set_empty (dfa->edests + clone_node);
1496 clone_dest = duplicate_node (dfa, org_dest, constraint);
1497 if (BE (clone_dest == -1, 0))
1498 return REG_ESPACE;
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1501 if (BE (ret < 0, 0))
1502 return REG_ESPACE;
1503 }
1504 else if (dfa->edests[org_node].nelem == 0)
1505 {
1506 /* In case of the node can't epsilon-transit, don't duplicate the
1507 destination and store the original destination as the
1508 destination of the node. */
1509 dfa->nexts[clone_node] = dfa->nexts[org_node];
1510 break;
1511 }
1512 else if (dfa->edests[org_node].nelem == 1)
1513 {
1514 /* In case of the node can epsilon-transit, and it has only one
1515 destination. */
1516 org_dest = dfa->edests[org_node].elems[0];
1517 re_node_set_empty (dfa->edests + clone_node);
1518 /* If the node is root_node itself, it means the epsilon clsoure
1519 has a loop. Then tie it to the destination of the root_node. */
1520 if (org_node == root_node && clone_node != org_node)
1521 {
1522 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1523 if (BE (ret < 0, 0))
1524 return REG_ESPACE;
1525 break;
1526 }
1527 /* In case of the node has another constraint, add it. */
1528 constraint |= dfa->nodes[org_node].constraint;
1529 clone_dest = duplicate_node (dfa, org_dest, constraint);
1530 if (BE (clone_dest == -1, 0))
1531 return REG_ESPACE;
1532 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1533 if (BE (ret < 0, 0))
1534 return REG_ESPACE;
1535 }
1536 else /* dfa->edests[org_node].nelem == 2 */
1537 {
1538 /* In case of the node can epsilon-transit, and it has two
1539 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1540 org_dest = dfa->edests[org_node].elems[0];
1541 re_node_set_empty (dfa->edests + clone_node);
1542 /* Search for a duplicated node which satisfies the constraint. */
1543 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1544 if (clone_dest == -1)
1545 {
1546 /* There is no such duplicated node, create a new one. */
1547 reg_errcode_t err;
1548 clone_dest = duplicate_node (dfa, org_dest, constraint);
1549 if (BE (clone_dest == -1, 0))
1550 return REG_ESPACE;
1551 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552 if (BE (ret < 0, 0))
1553 return REG_ESPACE;
1554 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1555 root_node, constraint);
1556 if (BE (err != REG_NOERROR, 0))
1557 return err;
1558 }
1559 else
1560 {
1561 /* There is a duplicated node which satisfies the constraint,
1562 use it to avoid infinite loop. */
1563 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564 if (BE (ret < 0, 0))
1565 return REG_ESPACE;
1566 }
1567
1568 org_dest = dfa->edests[org_node].elems[1];
1569 clone_dest = duplicate_node (dfa, org_dest, constraint);
1570 if (BE (clone_dest == -1, 0))
1571 return REG_ESPACE;
1572 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573 if (BE (ret < 0, 0))
1574 return REG_ESPACE;
1575 }
1576 org_node = org_dest;
1577 clone_node = clone_dest;
1578 }
1579 return REG_NOERROR;
1580}
1581
1582/* Search for a node which is duplicated from the node ORG_NODE, and
1583 satisfies the constraint CONSTRAINT. */
1584
1585static int
1586search_duplicated_node (const re_dfa_t *dfa, int org_node,
1587 unsigned int constraint)
1588{
1589 int idx;
1590 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1591 {
1592 if (org_node == dfa->org_indices[idx]
1593 && constraint == dfa->nodes[idx].constraint)
1594 return idx; /* Found. */
1595 }
1596 return -1; /* Not found. */
1597}
1598
1599/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1600 Return the index of the new node, or -1 if insufficient storage is
1601 available. */
1602
1603static int
1604duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1605{
1606 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1607 if (BE (dup_idx != -1, 1))
1608 {
1609 dfa->nodes[dup_idx].constraint = constraint;
1610 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1611 dfa->nodes[dup_idx].duplicated = 1;
1612
1613 /* Store the index of the original node. */
1614 dfa->org_indices[dup_idx] = org_idx;
1615 }
1616 return dup_idx;
1617}
1618
1619static reg_errcode_t
1620calc_inveclosure (re_dfa_t *dfa)
1621{
1622 int src, idx, ret;
1623 for (idx = 0; idx < dfa->nodes_len; ++idx)
1624 re_node_set_init_empty (dfa->inveclosures + idx);
1625
1626 for (src = 0; src < dfa->nodes_len; ++src)
1627 {
1628 int *elems = dfa->eclosures[src].elems;
1629 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1630 {
1631 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632 if (BE (ret == -1, 0))
1633 return REG_ESPACE;
1634 }
1635 }
1636
1637 return REG_NOERROR;
1638}
1639
1640/* Calculate "eclosure" for all the node in DFA. */
1641
1642static reg_errcode_t
1643calc_eclosure (re_dfa_t *dfa)
1644{
1645 int node_idx, incomplete;
1646#ifdef DEBUG
1647 assert (dfa->nodes_len > 0);
1648#endif
1649 incomplete = 0;
1650 /* For each nodes, calculate epsilon closure. */
1651 for (node_idx = 0; ; ++node_idx)
1652 {
1653 reg_errcode_t err;
1654 re_node_set eclosure_elem;
1655 if (node_idx == dfa->nodes_len)
1656 {
1657 if (!incomplete)
1658 break;
1659 incomplete = 0;
1660 node_idx = 0;
1661 }
1662
1663#ifdef DEBUG
1664 assert (dfa->eclosures[node_idx].nelem != -1);
1665#endif
1666
1667 /* If we have already calculated, skip it. */
1668 if (dfa->eclosures[node_idx].nelem != 0)
1669 continue;
1670 /* Calculate epsilon closure of `node_idx'. */
1671 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672 if (BE (err != REG_NOERROR, 0))
1673 return err;
1674
1675 if (dfa->eclosures[node_idx].nelem == 0)
1676 {
1677 incomplete = 1;
1678 re_node_set_free (&eclosure_elem);
1679 }
1680 }
1681 return REG_NOERROR;
1682}
1683
1684/* Calculate epsilon closure of NODE. */
1685
1686static reg_errcode_t
1687calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1688{
1689 reg_errcode_t err;
1690 int i;
1691 re_node_set eclosure;
1692 int ret;
1693 int incomplete = 0;
1694 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1695 if (BE (err != REG_NOERROR, 0))
1696 return err;
1697
1698 /* This indicates that we are calculating this node now.
1699 We reference this value to avoid infinite loop. */
1700 dfa->eclosures[node].nelem = -1;
1701
1702 /* If the current node has constraints, duplicate all nodes
1703 since they must inherit the constraints. */
1704 if (dfa->nodes[node].constraint
1705 && dfa->edests[node].nelem
1706 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1707 {
1708 err = duplicate_node_closure (dfa, node, node, node,
1709 dfa->nodes[node].constraint);
1710 if (BE (err != REG_NOERROR, 0))
1711 return err;
1712 }
1713
1714 /* Expand each epsilon destination nodes. */
1715 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1716 for (i = 0; i < dfa->edests[node].nelem; ++i)
1717 {
1718 re_node_set eclosure_elem;
1719 int edest = dfa->edests[node].elems[i];
1720 /* If calculating the epsilon closure of `edest' is in progress,
1721 return intermediate result. */
1722 if (dfa->eclosures[edest].nelem == -1)
1723 {
1724 incomplete = 1;
1725 continue;
1726 }
1727 /* If we haven't calculated the epsilon closure of `edest' yet,
1728 calculate now. Otherwise use calculated epsilon closure. */
1729 if (dfa->eclosures[edest].nelem == 0)
1730 {
1731 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1734 }
1735 else
1736 eclosure_elem = dfa->eclosures[edest];
1737 /* Merge the epsilon closure of `edest'. */
1738 err = re_node_set_merge (&eclosure, &eclosure_elem);
1739 if (BE (err != REG_NOERROR, 0))
1740 return err;
1741 /* If the epsilon closure of `edest' is incomplete,
1742 the epsilon closure of this node is also incomplete. */
1743 if (dfa->eclosures[edest].nelem == 0)
1744 {
1745 incomplete = 1;
1746 re_node_set_free (&eclosure_elem);
1747 }
1748 }
1749
1750 /* An epsilon closure includes itself. */
1751 ret = re_node_set_insert (&eclosure, node);
1752 if (BE (ret < 0, 0))
1753 return REG_ESPACE;
1754 if (incomplete && !root)
1755 dfa->eclosures[node].nelem = 0;
1756 else
1757 dfa->eclosures[node] = eclosure;
1758 *new_set = eclosure;
1759 return REG_NOERROR;
1760}
1761\f
1762/* Functions for token which are used in the parser. */
1763
1764/* Fetch a token from INPUT.
1765 We must not use this function inside bracket expressions. */
1766
1767static void
1768internal_function
1769fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1770{
1771 re_string_skip_bytes (input, peek_token (result, input, syntax));
1772}
1773
1774/* Peek a token from INPUT, and return the length of the token.
1775 We must not use this function inside bracket expressions. */
1776
1777static int
1778internal_function
1779peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1780{
1781 unsigned char c;
1782
1783 if (re_string_eoi (input))
1784 {
1785 token->type = END_OF_RE;
1786 return 0;
1787 }
1788
1789 c = re_string_peek_byte (input, 0);
1790 token->opr.c = c;
1791
1792 token->word_char = 0;
1793#ifdef RE_ENABLE_I18N
1794 token->mb_partial = 0;
1795 if (input->mb_cur_max > 1 &&
1796 !re_string_first_byte (input, re_string_cur_idx (input)))
1797 {
1798 token->type = CHARACTER;
1799 token->mb_partial = 1;
1800 return 1;
1801 }
1802#endif
1803 if (c == '\\')
1804 {
1805 unsigned char c2;
1806 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1807 {
1808 token->type = BACK_SLASH;
1809 return 1;
1810 }
1811
1812 c2 = re_string_peek_byte_case (input, 1);
1813 token->opr.c = c2;
1814 token->type = CHARACTER;
1815#ifdef RE_ENABLE_I18N
1816 if (input->mb_cur_max > 1)
1817 {
1818 wint_t wc = re_string_wchar_at (input,
1819 re_string_cur_idx (input) + 1);
1820 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1821 }
1822 else
1823#endif
1824 token->word_char = IS_WORD_CHAR (c2) != 0;
1825
1826 switch (c2)
1827 {
1828 case '|':
1829 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1830 token->type = OP_ALT;
1831 break;
1832 case '1': case '2': case '3': case '4': case '5':
1833 case '6': case '7': case '8': case '9':
1834 if (!(syntax & RE_NO_BK_REFS))
1835 {
1836 token->type = OP_BACK_REF;
1837 token->opr.idx = c2 - '1';
1838 }
1839 break;
1840 case '<':
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 {
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_FIRST;
1845 }
1846 break;
1847 case '>':
1848 if (!(syntax & RE_NO_GNU_OPS))
1849 {
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = WORD_LAST;
1852 }
1853 break;
1854 case 'b':
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 {
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = WORD_DELIM;
1859 }
1860 break;
1861 case 'B':
1862 if (!(syntax & RE_NO_GNU_OPS))
1863 {
1864 token->type = ANCHOR;
1865 token->opr.ctx_type = NOT_WORD_DELIM;
1866 }
1867 break;
1868 case 'w':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_WORD;
1871 break;
1872 case 'W':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_NOTWORD;
1875 break;
1876 case 's':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_SPACE;
1879 break;
1880 case 'S':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_NOTSPACE;
1883 break;
1884 case '`':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 {
1887 token->type = ANCHOR;
1888 token->opr.ctx_type = BUF_FIRST;
1889 }
1890 break;
1891 case '\'':
1892 if (!(syntax & RE_NO_GNU_OPS))
1893 {
1894 token->type = ANCHOR;
1895 token->opr.ctx_type = BUF_LAST;
1896 }
1897 break;
1898 case '(':
1899 if (!(syntax & RE_NO_BK_PARENS))
1900 token->type = OP_OPEN_SUBEXP;
1901 break;
1902 case ')':
1903 if (!(syntax & RE_NO_BK_PARENS))
1904 token->type = OP_CLOSE_SUBEXP;
1905 break;
1906 case '+':
1907 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908 token->type = OP_DUP_PLUS;
1909 break;
1910 case '?':
1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_QUESTION;
1913 break;
1914 case '{':
1915 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916 token->type = OP_OPEN_DUP_NUM;
1917 break;
1918 case '}':
1919 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920 token->type = OP_CLOSE_DUP_NUM;
1921 break;
1922 default:
1923 break;
1924 }
1925 return 2;
1926 }
1927
1928 token->type = CHARACTER;
1929#ifdef RE_ENABLE_I18N
1930 if (input->mb_cur_max > 1)
1931 {
1932 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1933 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1934 }
1935 else
1936#endif
1937 token->word_char = IS_WORD_CHAR (token->opr.c);
1938
1939 switch (c)
1940 {
1941 case '\n':
1942 if (syntax & RE_NEWLINE_ALT)
1943 token->type = OP_ALT;
1944 break;
1945 case '|':
1946 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1947 token->type = OP_ALT;
1948 break;
1949 case '*':
1950 token->type = OP_DUP_ASTERISK;
1951 break;
1952 case '+':
1953 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954 token->type = OP_DUP_PLUS;
1955 break;
1956 case '?':
1957 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958 token->type = OP_DUP_QUESTION;
1959 break;
1960 case '{':
1961 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962 token->type = OP_OPEN_DUP_NUM;
1963 break;
1964 case '}':
1965 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966 token->type = OP_CLOSE_DUP_NUM;
1967 break;
1968 case '(':
1969 if (syntax & RE_NO_BK_PARENS)
1970 token->type = OP_OPEN_SUBEXP;
1971 break;
1972 case ')':
1973 if (syntax & RE_NO_BK_PARENS)
1974 token->type = OP_CLOSE_SUBEXP;
1975 break;
1976 case '[':
1977 token->type = OP_OPEN_BRACKET;
1978 break;
1979 case '.':
1980 token->type = OP_PERIOD;
1981 break;
1982 case '^':
1983 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1984 re_string_cur_idx (input) != 0)
1985 {
1986 char prev = re_string_peek_byte (input, -1);
1987 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1988 break;
1989 }
1990 token->type = ANCHOR;
1991 token->opr.ctx_type = LINE_FIRST;
1992 break;
1993 case '$':
1994 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1995 re_string_cur_idx (input) + 1 != re_string_length (input))
1996 {
1997 re_token_t next;
1998 re_string_skip_bytes (input, 1);
1999 peek_token (&next, input, syntax);
2000 re_string_skip_bytes (input, -1);
2001 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2002 break;
2003 }
2004 token->type = ANCHOR;
2005 token->opr.ctx_type = LINE_LAST;
2006 break;
2007 default:
2008 break;
2009 }
2010 return 1;
2011}
2012
2013/* Peek a token from INPUT, and return the length of the token.
2014 We must not use this function out of bracket expressions. */
2015
2016static int
2017internal_function
2018peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2019{
2020 unsigned char c;
2021 if (re_string_eoi (input))
2022 {
2023 token->type = END_OF_RE;
2024 return 0;
2025 }
2026 c = re_string_peek_byte (input, 0);
2027 token->opr.c = c;
2028
2029#ifdef RE_ENABLE_I18N
2030 if (input->mb_cur_max > 1 &&
2031 !re_string_first_byte (input, re_string_cur_idx (input)))
2032 {
2033 token->type = CHARACTER;
2034 return 1;
2035 }
2036#endif /* RE_ENABLE_I18N */
2037
2038 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2039 && re_string_cur_idx (input) + 1 < re_string_length (input))
2040 {
2041 /* In this case, '\' escape a character. */
2042 unsigned char c2;
2043 re_string_skip_bytes (input, 1);
2044 c2 = re_string_peek_byte (input, 0);
2045 token->opr.c = c2;
2046 token->type = CHARACTER;
2047 return 1;
2048 }
2049 if (c == '[') /* '[' is a special char in a bracket exps. */
2050 {
2051 unsigned char c2;
2052 int token_len;
2053 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2054 c2 = re_string_peek_byte (input, 1);
2055 else
2056 c2 = 0;
2057 token->opr.c = c2;
2058 token_len = 2;
2059 switch (c2)
2060 {
2061 case '.':
2062 token->type = OP_OPEN_COLL_ELEM;
2063 break;
2064 case '=':
2065 token->type = OP_OPEN_EQUIV_CLASS;
2066 break;
2067 case ':':
2068 if (syntax & RE_CHAR_CLASSES)
2069 {
2070 token->type = OP_OPEN_CHAR_CLASS;
2071 break;
2072 }
2073 /* else fall through. */
2074 default:
2075 token->type = CHARACTER;
2076 token->opr.c = c;
2077 token_len = 1;
2078 break;
2079 }
2080 return token_len;
2081 }
2082 switch (c)
2083 {
2084 case '-':
2085 token->type = OP_CHARSET_RANGE;
2086 break;
2087 case ']':
2088 token->type = OP_CLOSE_BRACKET;
2089 break;
2090 case '^':
2091 token->type = OP_NON_MATCH_LIST;
2092 break;
2093 default:
2094 token->type = CHARACTER;
2095 }
2096 return 1;
2097}
2098\f
2099/* Functions for parser. */
2100
2101/* Entry point of the parser.
2102 Parse the regular expression REGEXP and return the structure tree.
2103 If an error is occured, ERR is set by error code, and return NULL.
2104 This function build the following tree, from regular expression <reg_exp>:
2105 CAT
2106 / \
2107 / \
2108 <reg_exp> EOR
2109
2110 CAT means concatenation.
2111 EOR means end of regular expression. */
2112
2113static bin_tree_t *
2114parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2115 reg_errcode_t *err)
2116{
2117 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118 bin_tree_t *tree, *eor, *root;
2119 re_token_t current_token;
2120 dfa->syntax = syntax;
2121 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2122 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2123 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124 return NULL;
2125 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2126 if (tree != NULL)
2127 root = create_tree (dfa, tree, eor, CONCAT);
2128 else
2129 root = eor;
2130 if (BE (eor == NULL || root == NULL, 0))
2131 {
2132 *err = REG_ESPACE;
2133 return NULL;
2134 }
2135 return root;
2136}
2137
2138/* This function build the following tree, from regular expression
2139 <branch1>|<branch2>:
2140 ALT
2141 / \
2142 / \
2143 <branch1> <branch2>
2144
2145 ALT means alternative, which represents the operator `|'. */
2146
2147static bin_tree_t *
2148parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150{
2151 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2152 bin_tree_t *tree, *branch = NULL;
2153 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2154 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155 return NULL;
2156
2157 while (token->type == OP_ALT)
2158 {
2159 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2160 if (token->type != OP_ALT && token->type != END_OF_RE
2161 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2162 {
2163 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2164 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2165 return NULL;
2166 }
2167 else
2168 branch = NULL;
2169 tree = create_tree (dfa, tree, branch, OP_ALT);
2170 if (BE (tree == NULL, 0))
2171 {
2172 *err = REG_ESPACE;
2173 return NULL;
2174 }
2175 }
2176 return tree;
2177}
2178
2179/* This function build the following tree, from regular expression
2180 <exp1><exp2>:
2181 CAT
2182 / \
2183 / \
2184 <exp1> <exp2>
2185
2186 CAT means concatenation. */
2187
2188static bin_tree_t *
2189parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2191{
2192 bin_tree_t *tree, *exp;
2193 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2194 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196 return NULL;
2197
2198 while (token->type != OP_ALT && token->type != END_OF_RE
2199 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2200 {
2201 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2202 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2203 {
2204 return NULL;
2205 }
2206 if (tree != NULL && exp != NULL)
2207 {
2208 tree = create_tree (dfa, tree, exp, CONCAT);
2209 if (tree == NULL)
2210 {
2211 *err = REG_ESPACE;
2212 return NULL;
2213 }
2214 }
2215 else if (tree == NULL)
2216 tree = exp;
2217 /* Otherwise exp == NULL, we don't need to create new tree. */
2218 }
2219 return tree;
2220}
2221
2222/* This function build the following tree, from regular expression a*:
2223 *
2224 |
2225 a
2226*/
2227
2228static bin_tree_t *
2229parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2230 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2231{
2232 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2233 bin_tree_t *tree;
2234 switch (token->type)
2235 {
2236 case CHARACTER:
2237 tree = create_token_tree (dfa, NULL, NULL, token);
2238 if (BE (tree == NULL, 0))
2239 {
2240 *err = REG_ESPACE;
2241 return NULL;
2242 }
2243#ifdef RE_ENABLE_I18N
2244 if (dfa->mb_cur_max > 1)
2245 {
2246 while (!re_string_eoi (regexp)
2247 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2248 {
2249 bin_tree_t *mbc_remain;
2250 fetch_token (token, regexp, syntax);
2251 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2252 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2253 if (BE (mbc_remain == NULL || tree == NULL, 0))
2254 {
2255 *err = REG_ESPACE;
2256 return NULL;
2257 }
2258 }
2259 }
2260#endif
2261 break;
2262 case OP_OPEN_SUBEXP:
2263 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 return NULL;
2266 break;
2267 case OP_OPEN_BRACKET:
2268 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2269 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2270 return NULL;
2271 break;
2272 case OP_BACK_REF:
2273 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2274 {
2275 *err = REG_ESUBREG;
2276 return NULL;
2277 }
2278 dfa->used_bkref_map |= 1 << token->opr.idx;
2279 tree = create_token_tree (dfa, NULL, NULL, token);
2280 if (BE (tree == NULL, 0))
2281 {
2282 *err = REG_ESPACE;
2283 return NULL;
2284 }
2285 ++dfa->nbackref;
2286 dfa->has_mb_node = 1;
2287 break;
2288 case OP_OPEN_DUP_NUM:
2289 if (syntax & RE_CONTEXT_INVALID_DUP)
2290 {
2291 *err = REG_BADRPT;
2292 return NULL;
2293 }
2294 /* FALLTHROUGH */
2295 case OP_DUP_ASTERISK:
2296 case OP_DUP_PLUS:
2297 case OP_DUP_QUESTION:
2298 if (syntax & RE_CONTEXT_INVALID_OPS)
2299 {
2300 *err = REG_BADRPT;
2301 return NULL;
2302 }
2303 else if (syntax & RE_CONTEXT_INDEP_OPS)
2304 {
2305 fetch_token (token, regexp, syntax);
2306 return parse_expression (regexp, preg, token, syntax, nest, err);
2307 }
2308 /* else fall through */
2309 case OP_CLOSE_SUBEXP:
2310 if ((token->type == OP_CLOSE_SUBEXP) &&
2311 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2312 {
2313 *err = REG_ERPAREN;
2314 return NULL;
2315 }
2316 /* else fall through */
2317 case OP_CLOSE_DUP_NUM:
2318 /* We treat it as a normal character. */
2319
2320 /* Then we can these characters as normal characters. */
2321 token->type = CHARACTER;
2322 /* mb_partial and word_char bits should be initialized already
2323 by peek_token. */
2324 tree = create_token_tree (dfa, NULL, NULL, token);
2325 if (BE (tree == NULL, 0))
2326 {
2327 *err = REG_ESPACE;
2328 return NULL;
2329 }
2330 break;
2331 case ANCHOR:
2332 if ((token->opr.ctx_type
2333 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2334 && dfa->word_ops_used == 0)
2335 init_word_char (dfa);
2336 if (token->opr.ctx_type == WORD_DELIM
2337 || token->opr.ctx_type == NOT_WORD_DELIM)
2338 {
2339 bin_tree_t *tree_first, *tree_last;
2340 if (token->opr.ctx_type == WORD_DELIM)
2341 {
2342 token->opr.ctx_type = WORD_FIRST;
2343 tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 token->opr.ctx_type = WORD_LAST;
2345 }
2346 else
2347 {
2348 token->opr.ctx_type = INSIDE_WORD;
2349 tree_first = create_token_tree (dfa, NULL, NULL, token);
2350 token->opr.ctx_type = INSIDE_NOTWORD;
2351 }
2352 tree_last = create_token_tree (dfa, NULL, NULL, token);
2353 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2354 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2355 {
2356 *err = REG_ESPACE;
2357 return NULL;
2358 }
2359 }
2360 else
2361 {
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2364 {
2365 *err = REG_ESPACE;
2366 return NULL;
2367 }
2368 }
2369 /* We must return here, since ANCHORs can't be followed
2370 by repetition operators.
2371 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2372 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2373 fetch_token (token, regexp, syntax);
2374 return tree;
2375 case OP_PERIOD:
2376 tree = create_token_tree (dfa, NULL, NULL, token);
2377 if (BE (tree == NULL, 0))
2378 {
2379 *err = REG_ESPACE;
2380 return NULL;
2381 }
2382 if (dfa->mb_cur_max > 1)
2383 dfa->has_mb_node = 1;
2384 break;
2385 case OP_WORD:
2386 case OP_NOTWORD:
2387 tree = build_charclass_op (dfa, regexp->trans,
2388 "alnum",
2389 "_",
2390 token->type == OP_NOTWORD, err);
2391 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392 return NULL;
2393 break;
2394 case OP_SPACE:
2395 case OP_NOTSPACE:
2396 tree = build_charclass_op (dfa, regexp->trans,
2397 "space",
2398 "",
2399 token->type == OP_NOTSPACE, err);
2400 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2401 return NULL;
2402 break;
2403 case OP_ALT:
2404 case END_OF_RE:
2405 return NULL;
2406 case BACK_SLASH:
2407 *err = REG_EESCAPE;
2408 return NULL;
2409 default:
2410 /* Must not happen? */
2411#ifdef DEBUG
2412 assert (0);
2413#endif
2414 return NULL;
2415 }
2416 fetch_token (token, regexp, syntax);
2417
2418 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2419 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2420 {
2421 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2422 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423 return NULL;
2424 /* In BRE consecutive duplications are not allowed. */
2425 if ((syntax & RE_CONTEXT_INVALID_DUP)
2426 && (token->type == OP_DUP_ASTERISK
2427 || token->type == OP_OPEN_DUP_NUM))
2428 {
2429 *err = REG_BADRPT;
2430 return NULL;
2431 }
2432 }
2433
2434 return tree;
2435}
2436
2437/* This function build the following tree, from regular expression
2438 (<reg_exp>):
2439 SUBEXP
2440 |
2441 <reg_exp>
2442*/
2443
2444static bin_tree_t *
2445parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2447{
2448 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2449 bin_tree_t *tree;
2450 size_t cur_nsub;
2451 cur_nsub = preg->re_nsub++;
2452
2453 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2454
2455 /* The subexpression may be a null string. */
2456 if (token->type == OP_CLOSE_SUBEXP)
2457 tree = NULL;
2458 else
2459 {
2460 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2462 *err = REG_EPAREN;
2463 if (BE (*err != REG_NOERROR, 0))
2464 return NULL;
2465 }
2466
2467 if (cur_nsub <= '9' - '1')
2468 dfa->completed_bkref_map |= 1 << cur_nsub;
2469
2470 tree = create_tree (dfa, tree, NULL, SUBEXP);
2471 if (BE (tree == NULL, 0))
2472 {
2473 *err = REG_ESPACE;
2474 return NULL;
2475 }
2476 tree->token.opr.idx = cur_nsub;
2477 return tree;
2478}
2479
2480/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2481
2482static bin_tree_t *
2483parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2484 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2485{
2486 bin_tree_t *tree = NULL, *old_tree = NULL;
2487 int i, start, end, start_idx = re_string_cur_idx (regexp);
2488#ifndef RE_TOKEN_INIT_BUG
2489 re_token_t start_token = *token;
2490#else
2491 re_token_t start_token;
2492
2493 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2494#endif
2495
2496 if (token->type == OP_OPEN_DUP_NUM)
2497 {
2498 end = 0;
2499 start = fetch_number (regexp, token, syntax);
2500 if (start == -1)
2501 {
2502 if (token->type == CHARACTER && token->opr.c == ',')
2503 start = 0; /* We treat "{,m}" as "{0,m}". */
2504 else
2505 {
2506 *err = REG_BADBR; /* <re>{} is invalid. */
2507 return NULL;
2508 }
2509 }
2510 if (BE (start != -2, 1))
2511 {
2512 /* We treat "{n}" as "{n,n}". */
2513 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2514 : ((token->type == CHARACTER && token->opr.c == ',')
2515 ? fetch_number (regexp, token, syntax) : -2));
2516 }
2517 if (BE (start == -2 || end == -2, 0))
2518 {
2519 /* Invalid sequence. */
2520 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2521 {
2522 if (token->type == END_OF_RE)
2523 *err = REG_EBRACE;
2524 else
2525 *err = REG_BADBR;
2526
2527 return NULL;
2528 }
2529
2530 /* If the syntax bit is set, rollback. */
2531 re_string_set_index (regexp, start_idx);
2532 *token = start_token;
2533 token->type = CHARACTER;
2534 /* mb_partial and word_char bits should be already initialized by
2535 peek_token. */
2536 return elem;
2537 }
2538
2539 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2540 {
2541 /* First number greater than second. */
2542 *err = REG_BADBR;
2543 return NULL;
2544 }
2545 }
2546 else
2547 {
2548 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2549 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2550 }
2551
2552 fetch_token (token, regexp, syntax);
2553
2554 if (BE (elem == NULL, 0))
2555 return NULL;
2556 if (BE (start == 0 && end == 0, 0))
2557 {
2558 postorder (elem, free_tree, NULL);
2559 return NULL;
2560 }
2561
2562 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2563 if (BE (start > 0, 0))
2564 {
2565 tree = elem;
2566 for (i = 2; i <= start; ++i)
2567 {
2568 elem = duplicate_tree (elem, dfa);
2569 tree = create_tree (dfa, tree, elem, CONCAT);
2570 if (BE (elem == NULL || tree == NULL, 0))
2571 goto parse_dup_op_espace;
2572 }
2573
2574 if (start == end)
2575 return tree;
2576
2577 /* Duplicate ELEM before it is marked optional. */
2578 elem = duplicate_tree (elem, dfa);
2579 old_tree = tree;
2580 }
2581 else
2582 old_tree = NULL;
2583
2584 if (elem->token.type == SUBEXP)
2585 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2586
2587 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2588 if (BE (tree == NULL, 0))
2589 goto parse_dup_op_espace;
2590
2591 /* This loop is actually executed only when end != -1,
2592 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2593 already created the start+1-th copy. */
2594 for (i = start + 2; i <= end; ++i)
2595 {
2596 elem = duplicate_tree (elem, dfa);
2597 tree = create_tree (dfa, tree, elem, CONCAT);
2598 if (BE (elem == NULL || tree == NULL, 0))
2599 goto parse_dup_op_espace;
2600
2601 tree = create_tree (dfa, tree, NULL, OP_ALT);
2602 if (BE (tree == NULL, 0))
2603 goto parse_dup_op_espace;
2604 }
2605
2606 if (old_tree)
2607 tree = create_tree (dfa, old_tree, tree, CONCAT);
2608
2609 return tree;
2610
2611 parse_dup_op_espace:
2612 *err = REG_ESPACE;
2613 return NULL;
2614}
2615
2616/* Size of the names for collating symbol/equivalence_class/character_class.
2617 I'm not sure, but maybe enough. */
2618#define BRACKET_NAME_BUF_SIZE 32
2619
2620#ifndef _LIBC
2621 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622 Build the range expression which starts from START_ELEM, and ends
2623 at END_ELEM. The result are written to MBCSET and SBCSET.
2624 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625 mbcset->range_ends, is a pointer argument sinse we may
2626 update it. */
2627
2628static reg_errcode_t
2629internal_function
2630# ifdef RE_ENABLE_I18N
2631build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633# else /* not RE_ENABLE_I18N */
2634build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635 bracket_elem_t *end_elem)
2636# endif /* not RE_ENABLE_I18N */
2637{
2638 unsigned int start_ch, end_ch;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2642 0))
2643 return REG_ERANGE;
2644
2645 /* We can handle no multi character collating elements without libc
2646 support. */
2647 if (BE ((start_elem->type == COLL_SYM
2648 && strlen ((char *) start_elem->opr.name) > 1)
2649 || (end_elem->type == COLL_SYM
2650 && strlen ((char *) end_elem->opr.name) > 1), 0))
2651 return REG_ECOLLATE;
2652
2653# ifdef RE_ENABLE_I18N
2654 {
2655 wchar_t wc;
2656 wint_t start_wc;
2657 wint_t end_wc;
2658 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659
2660 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662 : 0));
2663 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665 : 0));
2666#ifdef GAWK
2667 /*
2668 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2669 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2670 * unsigned, so we don't have sign extension problems.
2671 */
2672 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673 ? start_ch : start_elem->opr.wch);
2674 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675 ? end_ch : end_elem->opr.wch);
2676#else
2677 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2678 ? __btowc (start_ch) : start_elem->opr.wch);
2679 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2680 ? __btowc (end_ch) : end_elem->opr.wch);
2681#endif
2682 if (start_wc == WEOF || end_wc == WEOF)
2683 return REG_ECOLLATE;
2684 cmp_buf[0] = start_wc;
2685 cmp_buf[4] = end_wc;
2686 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2687 return REG_ERANGE;
2688
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2694 if (mbcset)
2695 {
2696 /* Check the space of the arrays. */
2697 if (BE (*range_alloc == mbcset->nranges, 0))
2698 {
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start, *new_array_end;
2701 int new_nranges;
2702
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges = 2 * mbcset->nranges + 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708 new_nranges);
2709 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710 new_nranges);
2711
2712 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2713 return REG_ESPACE;
2714
2715 mbcset->range_starts = new_array_start;
2716 mbcset->range_ends = new_array_end;
2717 *range_alloc = new_nranges;
2718 }
2719
2720 mbcset->range_starts[mbcset->nranges] = start_wc;
2721 mbcset->range_ends[mbcset->nranges++] = end_wc;
2722 }
2723
2724 /* Build the table for single byte characters. */
2725 for (wc = 0; wc < SBC_MAX; ++wc)
2726 {
2727 cmp_buf[2] = wc;
2728 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730 bitset_set (sbcset, wc);
2731 }
2732 }
2733# else /* not RE_ENABLE_I18N */
2734 {
2735 unsigned int ch;
2736 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2737 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2738 : 0));
2739 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2740 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2741 : 0));
2742 if (start_ch > end_ch)
2743 return REG_ERANGE;
2744 /* Build the table for single byte characters. */
2745 for (ch = 0; ch < SBC_MAX; ++ch)
2746 if (start_ch <= ch && ch <= end_ch)
2747 bitset_set (sbcset, ch);
2748 }
2749# endif /* not RE_ENABLE_I18N */
2750 return REG_NOERROR;
2751}
2752#endif /* not _LIBC */
2753
2754#ifndef _LIBC
2755/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756 Build the collating element which is represented by NAME.
2757 The result are written to MBCSET and SBCSET.
2758 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759 pointer argument since we may update it. */
2760
2761static reg_errcode_t
2762internal_function
2763# ifdef RE_ENABLE_I18N
2764build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765 int *coll_sym_alloc, const unsigned char *name)
2766# else /* not RE_ENABLE_I18N */
2767build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2768# endif /* not RE_ENABLE_I18N */
2769{
2770 size_t name_len = strlen ((const char *) name);
2771 if (BE (name_len != 1, 0))
2772 return REG_ECOLLATE;
2773 else
2774 {
2775 bitset_set (sbcset, name[0]);
2776 return REG_NOERROR;
2777 }
2778}
2779#endif /* not _LIBC */
2780
2781/* This function parse bracket expression like "[abc]", "[a-c]",
2782 "[[.a-a.]]" etc. */
2783
2784static bin_tree_t *
2785parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786 reg_syntax_t syntax, reg_errcode_t *err)
2787{
2788#ifdef _LIBC
2789 const unsigned char *collseqmb;
2790 const char *collseqwc;
2791 uint32_t nrules;
2792 int32_t table_size;
2793 const int32_t *symb_table;
2794 const unsigned char *extra;
2795
2796 /* Local function for parse_bracket_exp used in _LIBC environement.
2797 Seek the collating symbol entry correspondings to NAME.
2798 Return the index of the symbol in the SYMB_TABLE. */
2799
2800 auto inline int32_t
2801 __attribute ((always_inline))
2802 seek_collating_symbol_entry (name, name_len)
2803 const unsigned char *name;
2804 size_t name_len;
2805 {
2806 int32_t hash = elem_hash ((const char *) name, name_len);
2807 int32_t elem = hash % table_size;
2808 if (symb_table[2 * elem] != 0)
2809 {
2810 int32_t second = hash % (table_size - 2) + 1;
2811
2812 do
2813 {
2814 /* First compare the hashing value. */
2815 if (symb_table[2 * elem] == hash
2816 /* Compare the length of the name. */
2817 && name_len == extra[symb_table[2 * elem + 1]]
2818 /* Compare the name. */
2819 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2820 name_len) == 0)
2821 {
2822 /* Yep, this is the entry. */
2823 break;
2824 }
2825
2826 /* Next entry. */
2827 elem += second;
2828 }
2829 while (symb_table[2 * elem] != 0);
2830 }
2831 return elem;
2832 }
2833
2834 /* Local function for parse_bracket_exp used in _LIBC environment.
2835 Look up the collation sequence value of BR_ELEM.
2836 Return the value if succeeded, UINT_MAX otherwise. */
2837
2838 auto inline unsigned int
2839 __attribute ((always_inline))
2840 lookup_collation_sequence_value (br_elem)
2841 bracket_elem_t *br_elem;
2842 {
2843 if (br_elem->type == SB_CHAR)
2844 {
2845 /*
2846 if (MB_CUR_MAX == 1)
2847 */
2848 if (nrules == 0)
2849 return collseqmb[br_elem->opr.ch];
2850 else
2851 {
2852 wint_t wc = __btowc (br_elem->opr.ch);
2853 return __collseq_table_lookup (collseqwc, wc);
2854 }
2855 }
2856 else if (br_elem->type == MB_CHAR)
2857 {
2858 if (nrules != 0)
2859 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2860 }
2861 else if (br_elem->type == COLL_SYM)
2862 {
2863 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2864 if (nrules != 0)
2865 {
2866 int32_t elem, idx;
2867 elem = seek_collating_symbol_entry (br_elem->opr.name,
2868 sym_name_len);
2869 if (symb_table[2 * elem] != 0)
2870 {
2871 /* We found the entry. */
2872 idx = symb_table[2 * elem + 1];
2873 /* Skip the name of collating element name. */
2874 idx += 1 + extra[idx];
2875 /* Skip the byte sequence of the collating element. */
2876 idx += 1 + extra[idx];
2877 /* Adjust for the alignment. */
2878 idx = (idx + 3) & ~3;
2879 /* Skip the multibyte collation sequence value. */
2880 idx += sizeof (unsigned int);
2881 /* Skip the wide char sequence of the collating element. */
2882 idx += sizeof (unsigned int) *
2883 (1 + *(unsigned int *) (extra + idx));
2884 /* Return the collation sequence value. */
2885 return *(unsigned int *) (extra + idx);
2886 }
2887 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2888 {
2889 /* No valid character. Match it as a single byte
2890 character. */
2891 return collseqmb[br_elem->opr.name[0]];
2892 }
2893 }
2894 else if (sym_name_len == 1)
2895 return collseqmb[br_elem->opr.name[0]];
2896 }
2897 return UINT_MAX;
2898 }
2899
2900 /* Local function for parse_bracket_exp used in _LIBC environement.
2901 Build the range expression which starts from START_ELEM, and ends
2902 at END_ELEM. The result are written to MBCSET and SBCSET.
2903 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2904 mbcset->range_ends, is a pointer argument sinse we may
2905 update it. */
2906
2907 auto inline reg_errcode_t
2908 __attribute ((always_inline))
2909 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2910 re_charset_t *mbcset;
2911 int *range_alloc;
2912 bitset_t sbcset;
2913 bracket_elem_t *start_elem, *end_elem;
2914 {
2915 unsigned int ch;
2916 uint32_t start_collseq;
2917 uint32_t end_collseq;
2918
2919 /* Equivalence Classes and Character Classes can't be a range
2920 start/end. */
2921 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2922 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2923 0))
2924 return REG_ERANGE;
2925
2926 start_collseq = lookup_collation_sequence_value (start_elem);
2927 end_collseq = lookup_collation_sequence_value (end_elem);
2928 /* Check start/end collation sequence values. */
2929 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2930 return REG_ECOLLATE;
2931 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2932 return REG_ERANGE;
2933
2934 /* Got valid collation sequence values, add them as a new entry.
2935 However, if we have no collation elements, and the character set
2936 is single byte, the single byte character set that we
2937 build below suffices. */
2938 if (nrules > 0 || dfa->mb_cur_max > 1)
2939 {
2940 /* Check the space of the arrays. */
2941 if (BE (*range_alloc == mbcset->nranges, 0))
2942 {
2943 /* There is not enough space, need realloc. */
2944 uint32_t *new_array_start;
2945 uint32_t *new_array_end;
2946 int new_nranges;
2947
2948 /* +1 in case of mbcset->nranges is 0. */
2949 new_nranges = 2 * mbcset->nranges + 1;
2950 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2951 new_nranges);
2952 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2953 new_nranges);
2954
2955 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2956 return REG_ESPACE;
2957
2958 mbcset->range_starts = new_array_start;
2959 mbcset->range_ends = new_array_end;
2960 *range_alloc = new_nranges;
2961 }
2962
2963 mbcset->range_starts[mbcset->nranges] = start_collseq;
2964 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2965 }
2966
2967 /* Build the table for single byte characters. */
2968 for (ch = 0; ch < SBC_MAX; ch++)
2969 {
2970 uint32_t ch_collseq;
2971 /*
2972 if (MB_CUR_MAX == 1)
2973 */
2974 if (nrules == 0)
2975 ch_collseq = collseqmb[ch];
2976 else
2977 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2978 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2979 bitset_set (sbcset, ch);
2980 }
2981 return REG_NOERROR;
2982 }
2983
2984 /* Local function for parse_bracket_exp used in _LIBC environement.
2985 Build the collating element which is represented by NAME.
2986 The result are written to MBCSET and SBCSET.
2987 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2988 pointer argument sinse we may update it. */
2989
2990 auto inline reg_errcode_t
2991 __attribute ((always_inline))
2992 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2993 re_charset_t *mbcset;
2994 int *coll_sym_alloc;
2995 bitset_t sbcset;
2996 const unsigned char *name;
2997 {
2998 int32_t elem, idx;
2999 size_t name_len = strlen ((const char *) name);
3000 if (nrules != 0)
3001 {
3002 elem = seek_collating_symbol_entry (name, name_len);
3003 if (symb_table[2 * elem] != 0)
3004 {
3005 /* We found the entry. */
3006 idx = symb_table[2 * elem + 1];
3007 /* Skip the name of collating element name. */
3008 idx += 1 + extra[idx];
3009 }
3010 else if (symb_table[2 * elem] == 0 && name_len == 1)
3011 {
3012 /* No valid character, treat it as a normal
3013 character. */
3014 bitset_set (sbcset, name[0]);
3015 return REG_NOERROR;
3016 }
3017 else
3018 return REG_ECOLLATE;
3019
3020 /* Got valid collation sequence, add it as a new entry. */
3021 /* Check the space of the arrays. */
3022 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3023 {
3024 /* Not enough, realloc it. */
3025 /* +1 in case of mbcset->ncoll_syms is 0. */
3026 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3027 /* Use realloc since mbcset->coll_syms is NULL
3028 if *alloc == 0. */
3029 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3030 new_coll_sym_alloc);
3031 if (BE (new_coll_syms == NULL, 0))
3032 return REG_ESPACE;
3033 mbcset->coll_syms = new_coll_syms;
3034 *coll_sym_alloc = new_coll_sym_alloc;
3035 }
3036 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3037 return REG_NOERROR;
3038 }
3039 else
3040 {
3041 if (BE (name_len != 1, 0))
3042 return REG_ECOLLATE;
3043 else
3044 {
3045 bitset_set (sbcset, name[0]);
3046 return REG_NOERROR;
3047 }
3048 }
3049 }
3050#endif
3051
3052 re_token_t br_token;
3053 re_bitset_ptr_t sbcset;
3054#ifdef RE_ENABLE_I18N
3055 re_charset_t *mbcset;
3056 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3057 int equiv_class_alloc = 0, char_class_alloc = 0;
3058#endif /* not RE_ENABLE_I18N */
3059 int non_match = 0;
3060 bin_tree_t *work_tree;
3061 int token_len;
3062 int first_round = 1;
3063#ifdef _LIBC
3064 collseqmb = (const unsigned char *)
3065 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3066 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3067 if (nrules)
3068 {
3069 /*
3070 if (MB_CUR_MAX > 1)
3071 */
3072 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3073 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3074 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3075 _NL_COLLATE_SYMB_TABLEMB);
3076 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3077 _NL_COLLATE_SYMB_EXTRAMB);
3078 }
3079#endif
3080 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3081#ifdef RE_ENABLE_I18N
3082 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3083#endif /* RE_ENABLE_I18N */
3084#ifdef RE_ENABLE_I18N
3085 if (BE (sbcset == NULL || mbcset == NULL, 0))
3086#else
3087 if (BE (sbcset == NULL, 0))
3088#endif /* RE_ENABLE_I18N */
3089 {
3090 *err = REG_ESPACE;
3091 return NULL;
3092 }
3093
3094 token_len = peek_token_bracket (token, regexp, syntax);
3095 if (BE (token->type == END_OF_RE, 0))
3096 {
3097 *err = REG_BADPAT;
3098 goto parse_bracket_exp_free_return;
3099 }
3100 if (token->type == OP_NON_MATCH_LIST)
3101 {
3102#ifdef RE_ENABLE_I18N
3103 mbcset->non_match = 1;
3104#endif /* not RE_ENABLE_I18N */
3105 non_match = 1;
3106 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3107 bitset_set (sbcset, '\n');
3108 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3109 token_len = peek_token_bracket (token, regexp, syntax);
3110 if (BE (token->type == END_OF_RE, 0))
3111 {
3112 *err = REG_BADPAT;
3113 goto parse_bracket_exp_free_return;
3114 }
3115 }
3116
3117 /* We treat the first ']' as a normal character. */
3118 if (token->type == OP_CLOSE_BRACKET)
3119 token->type = CHARACTER;
3120
3121 while (1)
3122 {
3123 bracket_elem_t start_elem, end_elem;
3124 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3125 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3126 reg_errcode_t ret;
3127 int token_len2 = 0, is_range_exp = 0;
3128 re_token_t token2;
3129
3130 start_elem.opr.name = start_name_buf;
3131 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3132 syntax, first_round);
3133 if (BE (ret != REG_NOERROR, 0))
3134 {
3135 *err = ret;
3136 goto parse_bracket_exp_free_return;
3137 }
3138 first_round = 0;
3139
3140 /* Get information about the next token. We need it in any case. */
3141 token_len = peek_token_bracket (token, regexp, syntax);
3142
3143 /* Do not check for ranges if we know they are not allowed. */
3144 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3145 {
3146 if (BE (token->type == END_OF_RE, 0))
3147 {
3148 *err = REG_EBRACK;
3149 goto parse_bracket_exp_free_return;
3150 }
3151 if (token->type == OP_CHARSET_RANGE)
3152 {
3153 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3154 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3155 if (BE (token2.type == END_OF_RE, 0))
3156 {
3157 *err = REG_EBRACK;
3158 goto parse_bracket_exp_free_return;
3159 }
3160 if (token2.type == OP_CLOSE_BRACKET)
3161 {
3162 /* We treat the last '-' as a normal character. */
3163 re_string_skip_bytes (regexp, -token_len);
3164 token->type = CHARACTER;
3165 }
3166 else
3167 is_range_exp = 1;
3168 }
3169 }
3170
3171 if (is_range_exp == 1)
3172 {
3173 end_elem.opr.name = end_name_buf;
3174 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3175 dfa, syntax, 1);
3176 if (BE (ret != REG_NOERROR, 0))
3177 {
3178 *err = ret;
3179 goto parse_bracket_exp_free_return;
3180 }
3181
3182 token_len = peek_token_bracket (token, regexp, syntax);
3183
3184#ifdef _LIBC
3185 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3186 &start_elem, &end_elem);
3187#else
3188# ifdef RE_ENABLE_I18N
3189 *err = build_range_exp (sbcset,
3190 dfa->mb_cur_max > 1 ? mbcset : NULL,
3191 &range_alloc, &start_elem, &end_elem);
3192# else
3193 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3194# endif
3195#endif /* RE_ENABLE_I18N */
3196 if (BE (*err != REG_NOERROR, 0))
3197 goto parse_bracket_exp_free_return;
3198 }
3199 else
3200 {
3201 switch (start_elem.type)
3202 {
3203 case SB_CHAR:
3204 bitset_set (sbcset, start_elem.opr.ch);
3205 break;
3206#ifdef RE_ENABLE_I18N
3207 case MB_CHAR:
3208 /* Check whether the array has enough space. */
3209 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3210 {
3211 wchar_t *new_mbchars;
3212 /* Not enough, realloc it. */
3213 /* +1 in case of mbcset->nmbchars is 0. */
3214 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3215 /* Use realloc since array is NULL if *alloc == 0. */
3216 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3217 mbchar_alloc);
3218 if (BE (new_mbchars == NULL, 0))
3219 goto parse_bracket_exp_espace;
3220 mbcset->mbchars = new_mbchars;
3221 }
3222 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3223 break;
3224#endif /* RE_ENABLE_I18N */
3225 case EQUIV_CLASS:
3226 *err = build_equiv_class (sbcset,
3227#ifdef RE_ENABLE_I18N
3228 mbcset, &equiv_class_alloc,
3229#endif /* RE_ENABLE_I18N */
3230 start_elem.opr.name);
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3233 break;
3234 case COLL_SYM:
3235 *err = build_collating_symbol (sbcset,
3236#ifdef RE_ENABLE_I18N
3237 mbcset, &coll_sym_alloc,
3238#endif /* RE_ENABLE_I18N */
3239 start_elem.opr.name);
3240 if (BE (*err != REG_NOERROR, 0))
3241 goto parse_bracket_exp_free_return;
3242 break;
3243 case CHAR_CLASS:
3244 *err = build_charclass (regexp->trans, sbcset,
3245#ifdef RE_ENABLE_I18N
3246 mbcset, &char_class_alloc,
3247#endif /* RE_ENABLE_I18N */
3248 (const char *) start_elem.opr.name, syntax);
3249 if (BE (*err != REG_NOERROR, 0))
3250 goto parse_bracket_exp_free_return;
3251 break;
3252 default:
3253 assert (0);
3254 break;
3255 }
3256 }
3257 if (BE (token->type == END_OF_RE, 0))
3258 {
3259 *err = REG_EBRACK;
3260 goto parse_bracket_exp_free_return;
3261 }
3262 if (token->type == OP_CLOSE_BRACKET)
3263 break;
3264 }
3265
3266 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3267
3268 /* If it is non-matching list. */
3269 if (non_match)
3270 bitset_not (sbcset);
3271
3272#ifdef RE_ENABLE_I18N
3273 /* Ensure only single byte characters are set. */
3274 if (dfa->mb_cur_max > 1)
3275 bitset_mask (sbcset, dfa->sb_char);
3276
3277 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3278 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3279 || mbcset->non_match)))
3280 {
3281 bin_tree_t *mbc_tree;
3282 int sbc_idx;
3283 /* Build a tree for complex bracket. */
3284 dfa->has_mb_node = 1;
3285 br_token.type = COMPLEX_BRACKET;
3286 br_token.opr.mbcset = mbcset;
3287 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3288 if (BE (mbc_tree == NULL, 0))
3289 goto parse_bracket_exp_espace;
3290 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3291 if (sbcset[sbc_idx])
3292 break;
3293 /* If there are no bits set in sbcset, there is no point
3294 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3295 if (sbc_idx < BITSET_WORDS)
3296 {
3297 /* Build a tree for simple bracket. */
3298 br_token.type = SIMPLE_BRACKET;
3299 br_token.opr.sbcset = sbcset;
3300 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3303
3304 /* Then join them by ALT node. */
3305 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3306 if (BE (work_tree == NULL, 0))
3307 goto parse_bracket_exp_espace;
3308 }
3309 else
3310 {
3311 re_free (sbcset);
3312 work_tree = mbc_tree;
3313 }
3314 }
3315 else
3316#endif /* not RE_ENABLE_I18N */
3317 {
3318#ifdef RE_ENABLE_I18N
3319 free_charset (mbcset);
3320#endif
3321 /* Build a tree for simple bracket. */
3322 br_token.type = SIMPLE_BRACKET;
3323 br_token.opr.sbcset = sbcset;
3324 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3325 if (BE (work_tree == NULL, 0))
3326 goto parse_bracket_exp_espace;
3327 }
3328 return work_tree;
3329
3330 parse_bracket_exp_espace:
3331 *err = REG_ESPACE;
3332 parse_bracket_exp_free_return:
3333 re_free (sbcset);
3334#ifdef RE_ENABLE_I18N
3335 free_charset (mbcset);
3336#endif /* RE_ENABLE_I18N */
3337 return NULL;
3338}
3339
3340/* Parse an element in the bracket expression. */
3341
3342static reg_errcode_t
3343parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3344 re_token_t *token, int token_len, re_dfa_t *dfa,
3345 reg_syntax_t syntax, int accept_hyphen)
3346{
3347#ifdef RE_ENABLE_I18N
3348 int cur_char_size;
3349 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3350 if (cur_char_size > 1)
3351 {
3352 elem->type = MB_CHAR;
3353 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3354 re_string_skip_bytes (regexp, cur_char_size);
3355 return REG_NOERROR;
3356 }
3357#endif /* RE_ENABLE_I18N */
3358 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3359 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3360 || token->type == OP_OPEN_EQUIV_CLASS)
3361 return parse_bracket_symbol (elem, regexp, token);
3362 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3363 {
3364 /* A '-' must only appear as anything but a range indicator before
3365 the closing bracket. Everything else is an error. */
3366 re_token_t token2;
3367 (void) peek_token_bracket (&token2, regexp, syntax);
3368 if (token2.type != OP_CLOSE_BRACKET)
3369 /* The actual error value is not standardized since this whole
3370 case is undefined. But ERANGE makes good sense. */
3371 return REG_ERANGE;
3372 }
3373 elem->type = SB_CHAR;
3374 elem->opr.ch = token->opr.c;
3375 return REG_NOERROR;
3376}
3377
3378/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3379 such as [:<character_class>:], [.<collating_element>.], and
3380 [=<equivalent_class>=]. */
3381
3382static reg_errcode_t
3383parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3384 re_token_t *token)
3385{
3386 unsigned char ch, delim = token->opr.c;
3387 int i = 0;
3388 if (re_string_eoi(regexp))
3389 return REG_EBRACK;
3390 for (;; ++i)
3391 {
3392 if (i >= BRACKET_NAME_BUF_SIZE)
3393 return REG_EBRACK;
3394 if (token->type == OP_OPEN_CHAR_CLASS)
3395 ch = re_string_fetch_byte_case (regexp);
3396 else
3397 ch = re_string_fetch_byte (regexp);
3398 if (re_string_eoi(regexp))
3399 return REG_EBRACK;
3400 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3401 break;
3402 elem->opr.name[i] = ch;
3403 }
3404 re_string_skip_bytes (regexp, 1);
3405 elem->opr.name[i] = '\0';
3406 switch (token->type)
3407 {
3408 case OP_OPEN_COLL_ELEM:
3409 elem->type = COLL_SYM;
3410 break;
3411 case OP_OPEN_EQUIV_CLASS:
3412 elem->type = EQUIV_CLASS;
3413 break;
3414 case OP_OPEN_CHAR_CLASS:
3415 elem->type = CHAR_CLASS;
3416 break;
3417 default:
3418 break;
3419 }
3420 return REG_NOERROR;
3421}
3422
3423 /* Helper function for parse_bracket_exp.
3424 Build the equivalence class which is represented by NAME.
3425 The result are written to MBCSET and SBCSET.
3426 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3427 is a pointer argument sinse we may update it. */
3428
3429static reg_errcode_t
3430#ifdef RE_ENABLE_I18N
3431build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3432 int *equiv_class_alloc, const unsigned char *name)
3433#else /* not RE_ENABLE_I18N */
3434build_equiv_class (bitset_t sbcset, const unsigned char *name)
3435#endif /* not RE_ENABLE_I18N */
3436{
3437#ifdef _LIBC
3438 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3439 if (nrules != 0)
3440 {
3441 const int32_t *table, *indirect;
3442 const unsigned char *weights, *extra, *cp;
3443 unsigned char char_buf[2];
3444 int32_t idx1, idx2;
3445 unsigned int ch;
3446 size_t len;
3447 /* This #include defines a local function! */
3448# include <locale/weight.h>
3449 /* Calculate the index for equivalence class. */
3450 cp = name;
3451 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3452 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3453 _NL_COLLATE_WEIGHTMB);
3454 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455 _NL_COLLATE_EXTRAMB);
3456 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3457 _NL_COLLATE_INDIRECTMB);
3458 idx1 = findidx (&cp);
3459 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3460 /* This isn't a valid character. */
3461 return REG_ECOLLATE;
3462
3463 /* Build single byte matcing table for this equivalence class. */
3464 char_buf[1] = (unsigned char) '\0';
3465 len = weights[idx1 & 0xffffff];
3466 for (ch = 0; ch < SBC_MAX; ++ch)
3467 {
3468 char_buf[0] = ch;
3469 cp = char_buf;
3470 idx2 = findidx (&cp);
3471/*
3472 idx2 = table[ch];
3473*/
3474 if (idx2 == 0)
3475 /* This isn't a valid character. */
3476 continue;
3477 /* Compare only if the length matches and the collation rule
3478 index is the same. */
3479 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3480 {
3481 int cnt = 0;
3482
3483 while (cnt <= len &&
3484 weights[(idx1 & 0xffffff) + 1 + cnt]
3485 == weights[(idx2 & 0xffffff) + 1 + cnt])
3486 ++cnt;
3487
3488 if (cnt > len)
3489 bitset_set (sbcset, ch);
3490 }
3491 }
3492 /* Check whether the array has enough space. */
3493 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3494 {
3495 /* Not enough, realloc it. */
3496 /* +1 in case of mbcset->nequiv_classes is 0. */
3497 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3498 /* Use realloc since the array is NULL if *alloc == 0. */
3499 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3500 int32_t,
3501 new_equiv_class_alloc);
3502 if (BE (new_equiv_classes == NULL, 0))
3503 return REG_ESPACE;
3504 mbcset->equiv_classes = new_equiv_classes;
3505 *equiv_class_alloc = new_equiv_class_alloc;
3506 }
3507 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3508 }
3509 else
3510#endif /* _LIBC */
3511 {
3512 if (BE (strlen ((const char *) name) != 1, 0))
3513 return REG_ECOLLATE;
3514 bitset_set (sbcset, *name);
3515 }
3516 return REG_NOERROR;
3517}
3518
3519 /* Helper function for parse_bracket_exp.
3520 Build the character class which is represented by NAME.
3521 The result are written to MBCSET and SBCSET.
3522 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3523 is a pointer argument sinse we may update it. */
3524
3525static reg_errcode_t
3526#ifdef RE_ENABLE_I18N
3527build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3528 re_charset_t *mbcset, int *char_class_alloc,
3529 const char *class_name, reg_syntax_t syntax)
3530#else /* not RE_ENABLE_I18N */
3531build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3532 const char *class_name, reg_syntax_t syntax)
3533#endif /* not RE_ENABLE_I18N */
3534{
3535 int i;
3536
3537 /* In case of REG_ICASE "upper" and "lower" match the both of
3538 upper and lower cases. */
3539 if ((syntax & RE_ICASE)
3540 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3541 class_name = "alpha";
3542
3543#ifdef RE_ENABLE_I18N
3544 /* Check the space of the arrays. */
3545 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3546 {
3547 /* Not enough, realloc it. */
3548 /* +1 in case of mbcset->nchar_classes is 0. */
3549 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3550 /* Use realloc since array is NULL if *alloc == 0. */
3551 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3552 new_char_class_alloc);
3553 if (BE (new_char_classes == NULL, 0))
3554 return REG_ESPACE;
3555 mbcset->char_classes = new_char_classes;
3556 *char_class_alloc = new_char_class_alloc;
3557 }
3558 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3559#endif /* RE_ENABLE_I18N */
3560
3561#define BUILD_CHARCLASS_LOOP(ctype_func) \
3562 do { \
3563 if (BE (trans != NULL, 0)) \
3564 { \
3565 for (i = 0; i < SBC_MAX; ++i) \
3566 if (ctype_func (i)) \
3567 bitset_set (sbcset, trans[i]); \
3568 } \
3569 else \
3570 { \
3571 for (i = 0; i < SBC_MAX; ++i) \
3572 if (ctype_func (i)) \
3573 bitset_set (sbcset, i); \
3574 } \
3575 } while (0)
3576
3577 if (strcmp (class_name, "alnum") == 0)
3578 BUILD_CHARCLASS_LOOP (isalnum);
3579 else if (strcmp (class_name, "cntrl") == 0)
3580 BUILD_CHARCLASS_LOOP (iscntrl);
3581 else if (strcmp (class_name, "lower") == 0)
3582 BUILD_CHARCLASS_LOOP (islower);
3583 else if (strcmp (class_name, "space") == 0)
3584 BUILD_CHARCLASS_LOOP (isspace);
3585 else if (strcmp (class_name, "alpha") == 0)
3586 BUILD_CHARCLASS_LOOP (isalpha);
3587 else if (strcmp (class_name, "digit") == 0)
3588 BUILD_CHARCLASS_LOOP (isdigit);
3589 else if (strcmp (class_name, "print") == 0)
3590 BUILD_CHARCLASS_LOOP (isprint);
3591 else if (strcmp (class_name, "upper") == 0)
3592 BUILD_CHARCLASS_LOOP (isupper);
3593 else if (strcmp (class_name, "blank") == 0)
3594#ifndef GAWK
3595 BUILD_CHARCLASS_LOOP (isblank);
3596#else
3597 /* see comments above */
3598 BUILD_CHARCLASS_LOOP (is_blank);
3599#endif
3600 else if (strcmp (class_name, "graph") == 0)
3601 BUILD_CHARCLASS_LOOP (isgraph);
3602 else if (strcmp (class_name, "punct") == 0)
3603 BUILD_CHARCLASS_LOOP (ispunct);
3604 else if (strcmp (class_name, "xdigit") == 0)
3605 BUILD_CHARCLASS_LOOP (isxdigit);
3606 else
3607 return REG_ECTYPE;
3608
3609 return REG_NOERROR;
3610}
3611
3612static bin_tree_t *
3613build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3614 const char *class_name,
3615 const char *extra, int non_match,
3616 reg_errcode_t *err)
3617{
3618 re_bitset_ptr_t sbcset;
3619#ifdef RE_ENABLE_I18N
3620 re_charset_t *mbcset;
3621 int alloc = 0;
3622#endif /* not RE_ENABLE_I18N */
3623 reg_errcode_t ret;
3624 re_token_t br_token;
3625 bin_tree_t *tree;
3626
3627 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3628#ifdef RE_ENABLE_I18N
3629 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3630#endif /* RE_ENABLE_I18N */
3631
3632#ifdef RE_ENABLE_I18N
3633 if (BE (sbcset == NULL || mbcset == NULL, 0))
3634#else /* not RE_ENABLE_I18N */
3635 if (BE (sbcset == NULL, 0))
3636#endif /* not RE_ENABLE_I18N */
3637 {
3638 *err = REG_ESPACE;
3639 return NULL;
3640 }
3641
3642 if (non_match)
3643 {
3644#ifdef RE_ENABLE_I18N
3645 mbcset->non_match = 1;
3646#endif /* not RE_ENABLE_I18N */
3647 }
3648
3649 /* We don't care the syntax in this case. */
3650 ret = build_charclass (trans, sbcset,
3651#ifdef RE_ENABLE_I18N
3652 mbcset, &alloc,
3653#endif /* RE_ENABLE_I18N */
3654 class_name, 0);
3655
3656 if (BE (ret != REG_NOERROR, 0))
3657 {
3658 re_free (sbcset);
3659#ifdef RE_ENABLE_I18N
3660 free_charset (mbcset);
3661#endif /* RE_ENABLE_I18N */
3662 *err = ret;
3663 return NULL;
3664 }
3665 /* \w match '_' also. */
3666 for (; *extra; extra++)
3667 bitset_set (sbcset, *extra);
3668
3669 /* If it is non-matching list. */
3670 if (non_match)
3671 bitset_not (sbcset);
3672
3673#ifdef RE_ENABLE_I18N
3674 /* Ensure only single byte characters are set. */
3675 if (dfa->mb_cur_max > 1)
3676 bitset_mask (sbcset, dfa->sb_char);
3677#endif
3678
3679 /* Build a tree for simple bracket. */
3680 br_token.type = SIMPLE_BRACKET;
3681 br_token.opr.sbcset = sbcset;
3682 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3683 if (BE (tree == NULL, 0))
3684 goto build_word_op_espace;
3685
3686#ifdef RE_ENABLE_I18N
3687 if (dfa->mb_cur_max > 1)
3688 {
3689 bin_tree_t *mbc_tree;
3690 /* Build a tree for complex bracket. */
3691 br_token.type = COMPLEX_BRACKET;
3692 br_token.opr.mbcset = mbcset;
3693 dfa->has_mb_node = 1;
3694 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3695 if (BE (mbc_tree == NULL, 0))
3696 goto build_word_op_espace;
3697 /* Then join them by ALT node. */
3698 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3699 if (BE (mbc_tree != NULL, 1))
3700 return tree;
3701 }
3702 else
3703 {
3704 free_charset (mbcset);
3705 return tree;
3706 }
3707#else /* not RE_ENABLE_I18N */
3708 return tree;
3709#endif /* not RE_ENABLE_I18N */
3710
3711 build_word_op_espace:
3712 re_free (sbcset);
3713#ifdef RE_ENABLE_I18N
3714 free_charset (mbcset);
3715#endif /* RE_ENABLE_I18N */
3716 *err = REG_ESPACE;
3717 return NULL;
3718}
3719
3720/* This is intended for the expressions like "a{1,3}".
3721 Fetch a number from `input', and return the number.
3722 Return -1, if the number field is empty like "{,1}".
3723 Return -2, If an error is occured. */
3724
3725static int
3726fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3727{
3728 int num = -1;
3729 unsigned char c;
3730 while (1)
3731 {
3732 fetch_token (token, input, syntax);
3733 c = token->opr.c;
3734 if (BE (token->type == END_OF_RE, 0))
3735 return -2;
3736 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3737 break;
3738 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3739 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3740 num = (num > RE_DUP_MAX) ? -2 : num;
3741 }
3742 return num;
3743}
3744\f
3745#ifdef RE_ENABLE_I18N
3746static void
3747free_charset (re_charset_t *cset)
3748{
3749 re_free (cset->mbchars);
3750# ifdef _LIBC
3751 re_free (cset->coll_syms);
3752 re_free (cset->equiv_classes);
3753 re_free (cset->range_starts);
3754 re_free (cset->range_ends);
3755# endif
3756 re_free (cset->char_classes);
3757 re_free (cset);
3758}
3759#endif /* RE_ENABLE_I18N */
3760\f
3761/* Functions for binary tree operation. */
3762
3763/* Create a tree node. */
3764
3765static bin_tree_t *
3766create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3767 re_token_type_t type)
3768{
3769 re_token_t t;
3770 t.type = type;
3771 return create_token_tree (dfa, left, right, &t);
3772}
3773
3774static bin_tree_t *
3775create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3776 const re_token_t *token)
3777{
3778 bin_tree_t *tree;
3779 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3780 {
3781 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3782
3783 if (storage == NULL)
3784 return NULL;
3785 storage->next = dfa->str_tree_storage;
3786 dfa->str_tree_storage = storage;
3787 dfa->str_tree_storage_idx = 0;
3788 }
3789 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3790
3791 tree->parent = NULL;
3792 tree->left = left;
3793 tree->right = right;
3794 tree->token = *token;
3795 tree->token.duplicated = 0;
3796 tree->token.opt_subexp = 0;
3797 tree->first = NULL;
3798 tree->next = NULL;
3799 tree->node_idx = -1;
3800
3801 if (left != NULL)
3802 left->parent = tree;
3803 if (right != NULL)
3804 right->parent = tree;
3805 return tree;
3806}
3807
3808/* Mark the tree SRC as an optional subexpression.
3809 To be called from preorder or postorder. */
3810
3811static reg_errcode_t
3812mark_opt_subexp (void *extra, bin_tree_t *node)
3813{
3814 int idx = (int) (long) extra;
3815 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3816 node->token.opt_subexp = 1;
3817
3818 return REG_NOERROR;
3819}
3820
3821/* Free the allocated memory inside NODE. */
3822
3823static void
3824free_token (re_token_t *node)
3825{
3826#ifdef RE_ENABLE_I18N
3827 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3828 free_charset (node->opr.mbcset);
3829 else
3830#endif /* RE_ENABLE_I18N */
3831 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3832 re_free (node->opr.sbcset);
3833}
3834
3835/* Worker function for tree walking. Free the allocated memory inside NODE
3836 and its children. */
3837
3838static reg_errcode_t
3839free_tree (void *extra, bin_tree_t *node)
3840{
3841 free_token (&node->token);
3842 return REG_NOERROR;
3843}
3844
3845
3846/* Duplicate the node SRC, and return new node. This is a preorder
3847 visit similar to the one implemented by the generic visitor, but
3848 we need more infrastructure to maintain two parallel trees --- so,
3849 it's easier to duplicate. */
3850
3851static bin_tree_t *
3852duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3853{
3854 const bin_tree_t *node;
3855 bin_tree_t *dup_root;
3856 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3857
3858 for (node = root; ; )
3859 {
3860 /* Create a new tree and link it back to the current parent. */
3861 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3862 if (*p_new == NULL)
3863 return NULL;
3864 (*p_new)->parent = dup_node;
3865 (*p_new)->token.duplicated = 1;
3866 dup_node = *p_new;
3867
3868 /* Go to the left node, or up and to the right. */
3869 if (node->left)
3870 {
3871 node = node->left;
3872 p_new = &dup_node->left;
3873 }
3874 else
3875 {
3876 const bin_tree_t *prev = NULL;
3877 while (node->right == prev || node->right == NULL)
3878 {
3879 prev = node;
3880 node = node->parent;
3881 dup_node = dup_node->parent;
3882 if (!node)
3883 return dup_root;
3884 }
3885 node = node->right;
3886 p_new = &dup_node->right;
3887 }
3888 }
3889}