1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27#ifdef RE_ENABLE_I18N
28static void free_charset (re_charset_t *cset);
29#endif /* RE_ENABLE_I18N */
30static void free_workarea_compile (regex_t *preg);
31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32#ifdef RE_ENABLE_I18N
33static void optimize_utf8 (re_dfa_t *dfa);
34#endif
35static reg_errcode_t analyze (regex_t *preg);
36static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89#ifdef RE_ENABLE_I18N
90static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const char *class_name,
99 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const char *class_name,
106 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
111 const char *extra,
112 int non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120static void free_token (re_token_t *node);
121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123\f
124/* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129const char __re_error_msgid[] attribute_hidden =
130 {
131#define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203\f
204/* Entry points for GNU code. */
205
206
207#ifdef ZOS_USS
208
209/* For ZOS USS we must define btowc */
210
211wchar_t
212btowc (int c)
213{
214 wchar_t wtmp[2];
215 char tmp[2];
216
217 tmp[0] = c;
218 tmp[1] = 0;
219
220 mbtowc (wtmp, tmp, 1);
221 return wtmp[0];
222}
223#endif
224
225/* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
228
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
231
232const char *
233re_compile_pattern (pattern, length, bufp)
234 const char *pattern;
235 size_t length;
236 struct re_pattern_buffer *bufp;
237{
238 reg_errcode_t ret;
239
240 /* And GNU code determines whether or not to get register information
241 by passing null for the REGS argument to re_match, etc., not by
242 setting no_sub, unless RE_NO_SUB is set. */
243 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
244
245 /* Match anchors at newline. */
246 bufp->newline_anchor = 1;
247
248 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
249
250 if (!ret)
251 return NULL;
252 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
253}
254#ifdef _LIBC
255weak_alias (__re_compile_pattern, re_compile_pattern)
256#endif
257
258/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
259 also be assigned to arbitrarily: each pattern buffer stores its own
260 syntax, so it can be changed between regex compilations. */
261/* This has no initializer because initialized variables in Emacs
262 become read-only after dumping. */
263reg_syntax_t re_syntax_options;
264
265
266/* Specify the precise syntax of regexps for compilation. This provides
267 for compatibility for various utilities which historically have
268 different, incompatible syntaxes.
269
270 The argument SYNTAX is a bit mask comprised of the various bits
271 defined in regex.h. We return the old syntax. */
272
273reg_syntax_t
274re_set_syntax (syntax)
275 reg_syntax_t syntax;
276{
277 reg_syntax_t ret = re_syntax_options;
278
279 re_syntax_options = syntax;
280 return ret;
281}
282#ifdef _LIBC
283weak_alias (__re_set_syntax, re_set_syntax)
284#endif
285
286int
287re_compile_fastmap (bufp)
288 struct re_pattern_buffer *bufp;
289{
290 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
291 char *fastmap = bufp->fastmap;
292
293 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
294 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
295 if (dfa->init_state != dfa->init_state_word)
296 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
297 if (dfa->init_state != dfa->init_state_nl)
298 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
299 if (dfa->init_state != dfa->init_state_begbuf)
300 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
301 bufp->fastmap_accurate = 1;
302 return 0;
303}
304#ifdef _LIBC
305weak_alias (__re_compile_fastmap, re_compile_fastmap)
306#endif
307
308static inline void
309__attribute ((always_inline))
310re_set_fastmap (char *fastmap, int icase, int ch)
311{
312 fastmap[ch] = 1;
313 if (icase)
314 fastmap[tolower (ch)] = 1;
315}
316
317/* Helper function for re_compile_fastmap.
318 Compile fastmap for the initial_state INIT_STATE. */
319
320static void
321re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
322 char *fastmap)
323{
324 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 int node_cnt;
326 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
327 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
328 {
329 int node = init_state->nodes.elems[node_cnt];
330 re_token_type_t type = dfa->nodes[node].type;
331
332 if (type == CHARACTER)
333 {
334 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
335#ifdef RE_ENABLE_I18N
336 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
337 {
338 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
339 wchar_t wc;
340 mbstate_t state;
341
342 p = buf;
343 *p++ = dfa->nodes[node].opr.c;
344 while (++node < dfa->nodes_len
345 && dfa->nodes[node].type == CHARACTER
346 && dfa->nodes[node].mb_partial)
347 *p++ = dfa->nodes[node].opr.c;
348 memset (&state, '\0', sizeof (state));
349 if (__mbrtowc (&wc, (const char *) buf, p - buf,
350 &state) == p - buf
351 && (__wcrtomb ((char *) buf, towlower (wc), &state)
352 != (size_t) -1))
353 re_set_fastmap (fastmap, 0, buf[0]);
354 re_free (buf);
355 }
356#endif
357 }
358 else if (type == SIMPLE_BRACKET)
359 {
360 int i, ch;
361 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
362 {
363 int j;
364 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
365 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
366 if (w & ((bitset_word_t) 1 << j))
367 re_set_fastmap (fastmap, icase, ch);
368 }
369 }
370#ifdef RE_ENABLE_I18N
371 else if (type == COMPLEX_BRACKET)
372 {
373 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
374 int i;
375
376# ifdef _LIBC
377 /* See if we have to try all bytes which start multiple collation
378 elements.
379 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
380 collation element, and don't catch 'b' since 'b' is
381 the only collation element which starts from 'b' (and
382 it is caught by SIMPLE_BRACKET). */
383 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
384 && (cset->ncoll_syms || cset->nranges))
385 {
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0; i < SBC_MAX; ++i)
389 if (table[i] < 0)
390 re_set_fastmap (fastmap, icase, i);
391 }
392# endif /* _LIBC */
393
394 /* See if we have to start the match at all multibyte characters,
395 i.e. where we would not find an invalid sequence. This only
396 applies to multibyte character sets; for single byte character
397 sets, the SIMPLE_BRACKET again suffices. */
398 if (dfa->mb_cur_max > 1
399 && (cset->nchar_classes || cset->non_match || cset->nranges
400# ifdef _LIBC
401 || cset->nequiv_classes
402# endif /* _LIBC */
403 ))
404 {
405 unsigned char c = 0;
406 do
407 {
408 mbstate_t mbs;
409 memset (&mbs, 0, sizeof (mbs));
410 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
411 re_set_fastmap (fastmap, false, (int) c);
412 }
413 while (++c != 0);
414 }
415
416 else
417 {
418 /* ... Else catch all bytes which can start the mbchars. */
419 for (i = 0; i < cset->nmbchars; ++i)
420 {
421 char buf[256];
422 mbstate_t state;
423 memset (&state, '\0', sizeof (state));
424 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
425 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
426 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
427 {
428 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
429 != (size_t) -1)
430 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
431 }
432 }
433 }
434 }
435#endif /* RE_ENABLE_I18N */
436 else if (type == OP_PERIOD
437#ifdef RE_ENABLE_I18N
438 || type == OP_UTF8_PERIOD
439#endif /* RE_ENABLE_I18N */
440 || type == END_OF_RE)
441 {
442 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
443 if (type == END_OF_RE)
444 bufp->can_be_null = 1;
445 return;
446 }
447 }
448}
449\f
450/* Entry point for POSIX code. */
451/* regcomp takes a regular expression as a string and compiles it.
452
453 PREG is a regex_t *. We do not expect any fields to be initialized,
454 since POSIX says we shouldn't. Thus, we set
455
456 `buffer' to the compiled pattern;
457 `used' to the length of the compiled pattern;
458 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
459 REG_EXTENDED bit in CFLAGS is set; otherwise, to
460 RE_SYNTAX_POSIX_BASIC;
461 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
462 `fastmap' to an allocated space for the fastmap;
463 `fastmap_accurate' to zero;
464 `re_nsub' to the number of subexpressions in PATTERN.
465
466 PATTERN is the address of the pattern string.
467
468 CFLAGS is a series of bits which affect compilation.
469
470 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
471 use POSIX basic syntax.
472
473 If REG_NEWLINE is set, then . and [^...] don't match newline.
474 Also, regexec will try a match beginning after every newline.
475
476 If REG_ICASE is set, then we considers upper- and lowercase
477 versions of letters to be equivalent when matching.
478
479 If REG_NOSUB is set, then when PREG is passed to regexec, that
480 routine will report only success or failure, and nothing about the
481 registers.
482
483 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
484 the return codes and their meanings.) */
485
486int
487regcomp (preg, pattern, cflags)
488 regex_t *__restrict preg;
489 const char *__restrict pattern;
490 int cflags;
491{
492 reg_errcode_t ret;
493 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
494 : RE_SYNTAX_POSIX_BASIC);
495
496 preg->buffer = NULL;
497 preg->allocated = 0;
498 preg->used = 0;
499
500 /* Try to allocate space for the fastmap. */
501 preg->fastmap = re_malloc (char, SBC_MAX);
502 if (BE (preg->fastmap == NULL, 0))
503 return REG_ESPACE;
504
505 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
506
507 /* If REG_NEWLINE is set, newlines are treated differently. */
508 if (cflags & REG_NEWLINE)
509 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
510 syntax &= ~RE_DOT_NEWLINE;
511 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
512 /* It also changes the matching behavior. */
513 preg->newline_anchor = 1;
514 }
515 else
516 preg->newline_anchor = 0;
517 preg->no_sub = !!(cflags & REG_NOSUB);
518 preg->translate = NULL;
519
520 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
521
522 /* POSIX doesn't distinguish between an unmatched open-group and an
523 unmatched close-group: both are REG_EPAREN. */
524 if (ret == REG_ERPAREN)
525 ret = REG_EPAREN;
526
527 /* We have already checked preg->fastmap != NULL. */
528 if (BE (ret == REG_NOERROR, 1))
529 /* Compute the fastmap now, since regexec cannot modify the pattern
530 buffer. This function never fails in this implementation. */
531 (void) re_compile_fastmap (preg);
532 else
533 {
534 /* Some error occurred while compiling the expression. */
535 re_free (preg->fastmap);
536 preg->fastmap = NULL;
537 }
538
539 return (int) ret;
540}
541#ifdef _LIBC
542weak_alias (__regcomp, regcomp)
543#endif
544
545/* Returns a message corresponding to an error code, ERRCODE, returned
546 from either regcomp or regexec. We don't use PREG here. */
547
548size_t
549regerror (errcode, preg, errbuf, errbuf_size)
550 int errcode;
551 const regex_t *__restrict preg;
552 char *__restrict errbuf;
553 size_t errbuf_size;
554{
555 const char *msg;
556 size_t msg_size;
557
558 if (BE (errcode < 0
559 || errcode >= (int) (sizeof (__re_error_msgid_idx)
560 / sizeof (__re_error_msgid_idx[0])), 0))
561 /* Only error codes returned by the rest of the code should be passed
562 to this routine. If we are given anything else, or if other regex
563 code generates an invalid error code, then the program has a bug.
564 Dump core so we can fix it. */
565 abort ();
566
567 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
568
569 msg_size = strlen (msg) + 1; /* Includes the null. */
570
571 if (BE (errbuf_size != 0, 1))
572 {
573 if (BE (msg_size > errbuf_size, 0))
574 {
575 memcpy (errbuf, msg, errbuf_size - 1);
576 errbuf[errbuf_size - 1] = 0;
577 }
578 else
579 memcpy (errbuf, msg, msg_size);
580 }
581
582 return msg_size;
583}
584#ifdef _LIBC
585weak_alias (__regerror, regerror)
586#endif
587
588
589#ifdef RE_ENABLE_I18N
590/* This static array is used for the map to single-byte characters when
591 UTF-8 is used. Otherwise we would allocate memory just to initialize
592 it the same all the time. UTF-8 is the preferred encoding so this is
593 a worthwhile optimization. */
594#if __GNUC__ >= 3
595static const bitset_t utf8_sb_map = {
596 /* Set the first 128 bits. */
597 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
598};
599#else /* ! (__GNUC__ >= 3) */
600static bitset_t utf8_sb_map;
601#endif /* __GNUC__ >= 3 */
602#endif /* RE_ENABLE_I18N */
603
604
605static void
606free_dfa_content (re_dfa_t *dfa)
607{
608 int i, j;
609
610 if (dfa->nodes)
611 for (i = 0; i < dfa->nodes_len; ++i)
612 free_token (dfa->nodes + i);
613 re_free (dfa->nexts);
614 for (i = 0; i < dfa->nodes_len; ++i)
615 {
616 if (dfa->eclosures != NULL)
617 re_node_set_free (dfa->eclosures + i);
618 if (dfa->inveclosures != NULL)
619 re_node_set_free (dfa->inveclosures + i);
620 if (dfa->edests != NULL)
621 re_node_set_free (dfa->edests + i);
622 }
623 re_free (dfa->edests);
624 re_free (dfa->eclosures);
625 re_free (dfa->inveclosures);
626 re_free (dfa->nodes);
627
628 if (dfa->state_table)
629 for (i = 0; i <= dfa->state_hash_mask; ++i)
630 {
631 struct re_state_table_entry *entry = dfa->state_table + i;
632 for (j = 0; j < entry->num; ++j)
633 {
634 re_dfastate_t *state = entry->array[j];
635 free_state (state);
636 }
637 re_free (entry->array);
638 }
639 re_free (dfa->state_table);
640#ifdef RE_ENABLE_I18N
641 if (dfa->sb_char != utf8_sb_map)
642 re_free (dfa->sb_char);
643#endif
644 re_free (dfa->subexp_map);
645#ifdef DEBUG
646 re_free (dfa->re_str);
647#endif
648
649 re_free (dfa);
650}
651
652
653/* Free dynamically allocated space used by PREG. */
654
655void
656regfree (preg)
657 regex_t *preg;
658{
659 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
660 if (BE (dfa != NULL, 1))
661 free_dfa_content (dfa);
662 preg->buffer = NULL;
663 preg->allocated = 0;
664
665 re_free (preg->fastmap);
666 preg->fastmap = NULL;
667
668 re_free (preg->translate);
669 preg->translate = NULL;
670}
671#ifdef _LIBC
672weak_alias (__regfree, regfree)
673#endif
674\f
675/* Entry points compatible with 4.2 BSD regex library. We don't define
676 them unless specifically requested. */
677
678#if defined _REGEX_RE_COMP || defined _LIBC
679
680/* BSD has one and only one pattern buffer. */
681static struct re_pattern_buffer re_comp_buf;
682
683char *
684# ifdef _LIBC
685/* Make these definitions weak in libc, so POSIX programs can redefine
686 these names if they don't use our functions, and still use
687 regcomp/regexec above without link errors. */
688weak_function
689# endif
690re_comp (s)
691 const char *s;
692{
693 reg_errcode_t ret;
694 char *fastmap;
695
696 if (!s)
697 {
698 if (!re_comp_buf.buffer)
699 return gettext ("No previous regular expression");
700 return 0;
701 }
702
703 if (re_comp_buf.buffer)
704 {
705 fastmap = re_comp_buf.fastmap;
706 re_comp_buf.fastmap = NULL;
707 __regfree (&re_comp_buf);
708 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
709 re_comp_buf.fastmap = fastmap;
710 }
711
712 if (re_comp_buf.fastmap == NULL)
713 {
714 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
715 if (re_comp_buf.fastmap == NULL)
716 return (char *) gettext (__re_error_msgid
717 + __re_error_msgid_idx[(int) REG_ESPACE]);
718 }
719
720 /* Since `re_exec' always passes NULL for the `regs' argument, we
721 don't need to initialize the pattern buffer fields which affect it. */
722
723 /* Match anchors at newlines. */
724 re_comp_buf.newline_anchor = 1;
725
726 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
727
728 if (!ret)
729 return NULL;
730
731 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
732 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
733}
734
735#ifdef _LIBC
736libc_freeres_fn (free_mem)
737{
738 __regfree (&re_comp_buf);
739}
740#endif
741
742#endif /* _REGEX_RE_COMP */
743\f
744/* Internal entry point.
745 Compile the regular expression PATTERN, whose length is LENGTH.
746 SYNTAX indicate regular expression's syntax. */
747
748static reg_errcode_t
749re_compile_internal (regex_t *preg, const char * pattern, size_t length,
750 reg_syntax_t syntax)
751{
752 reg_errcode_t err = REG_NOERROR;
753 re_dfa_t *dfa;
754 re_string_t regexp;
755
756 /* Initialize the pattern buffer. */
757 preg->fastmap_accurate = 0;
758 preg->syntax = syntax;
759 preg->not_bol = preg->not_eol = 0;
760 preg->used = 0;
761 preg->re_nsub = 0;
762 preg->can_be_null = 0;
763 preg->regs_allocated = REGS_UNALLOCATED;
764
765 /* Initialize the dfa. */
766 dfa = (re_dfa_t *) preg->buffer;
767 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
768 {
769 /* If zero allocated, but buffer is non-null, try to realloc
770 enough space. This loses if buffer's address is bogus, but
771 that is the user's responsibility. If ->buffer is NULL this
772 is a simple allocation. */
773 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
774 if (dfa == NULL)
775 return REG_ESPACE;
776 preg->allocated = sizeof (re_dfa_t);
777 preg->buffer = (unsigned char *) dfa;
778 }
779 preg->used = sizeof (re_dfa_t);
780
781 err = init_dfa (dfa, length);
782 if (BE (err != REG_NOERROR, 0))
783 {
784 free_dfa_content (dfa);
785 preg->buffer = NULL;
786 preg->allocated = 0;
787 return err;
788 }
789#ifdef DEBUG
790 /* Note: length+1 will not overflow since it is checked in init_dfa. */
791 dfa->re_str = re_malloc (char, length + 1);
792 strncpy (dfa->re_str, pattern, length + 1);
793#endif
794
795 __libc_lock_init (dfa->lock);
796
797 err = re_string_construct (®exp, pattern, length, preg->translate,
798 syntax & RE_ICASE, dfa);
799 if (BE (err != REG_NOERROR, 0))
800 {
801 re_compile_internal_free_return:
802 free_workarea_compile (preg);
803 re_string_destruct (®exp);
804 free_dfa_content (dfa);
805 preg->buffer = NULL;
806 preg->allocated = 0;
807 return err;
808 }
809
810 /* Parse the regular expression, and build a structure tree. */
811 preg->re_nsub = 0;
812 dfa->str_tree = parse (®exp, preg, syntax, &err);
813 if (BE (dfa->str_tree == NULL, 0))
814 goto re_compile_internal_free_return;
815
816 /* Analyze the tree and create the nfa. */
817 err = analyze (preg);
818 if (BE (err != REG_NOERROR, 0))
819 goto re_compile_internal_free_return;
820
821#ifdef RE_ENABLE_I18N
822 /* If possible, do searching in single byte encoding to speed things up. */
823 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
824 optimize_utf8 (dfa);
825#endif
826
827 /* Then create the initial state of the dfa. */
828 err = create_initial_state (dfa);
829
830 /* Release work areas. */
831 free_workarea_compile (preg);
832 re_string_destruct (®exp);
833
834 if (BE (err != REG_NOERROR, 0))
835 {
836 free_dfa_content (dfa);
837 preg->buffer = NULL;
838 preg->allocated = 0;
839 }
840
841 return err;
842}
843
844/* Initialize DFA. We use the length of the regular expression PAT_LEN
845 as the initial length of some arrays. */
846
847static reg_errcode_t
848init_dfa (re_dfa_t *dfa, size_t pat_len)
849{
850 unsigned int table_size;
851#ifndef _LIBC
852 char *codeset_name;
853#endif
854
855 memset (dfa, '\0', sizeof (re_dfa_t));
856
857 /* Force allocation of str_tree_storage the first time. */
858 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
859
860 /* Avoid overflows. */
861 if (pat_len == SIZE_MAX)
862 return REG_ESPACE;
863
864 dfa->nodes_alloc = pat_len + 1;
865 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
866
867 /* table_size = 2 ^ ceil(log pat_len) */
868 for (table_size = 1; ; table_size <<= 1)
869 if (table_size > pat_len)
870 break;
871
872 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
873 dfa->state_hash_mask = table_size - 1;
874
875 dfa->mb_cur_max = MB_CUR_MAX;
876#ifdef _LIBC
877 if (dfa->mb_cur_max == 6
878 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
879 dfa->is_utf8 = 1;
880 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
881 != 0);
882#else
883# ifdef HAVE_LANGINFO_CODESET
884 codeset_name = nl_langinfo (CODESET);
885# else
886 codeset_name = getenv ("LC_ALL");
887 if (codeset_name == NULL || codeset_name[0] == '\0')
888 codeset_name = getenv ("LC_CTYPE");
889 if (codeset_name == NULL || codeset_name[0] == '\0')
890 codeset_name = getenv ("LANG");
891 if (codeset_name == NULL)
892 codeset_name = "";
893 else if (strchr (codeset_name, '.') != NULL)
894 codeset_name = strchr (codeset_name, '.') + 1;
895# endif
896
897 /* strcasecmp isn't a standard interface. brute force check */
898#if 0
899 if (strcasecmp (codeset_name, "UTF-8") == 0
900 || strcasecmp (codeset_name, "UTF8") == 0)
901 dfa->is_utf8 = 1;
902#else
903 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
904 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
905 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
906 && (codeset_name[3] == '-'
907 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
908 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
909 dfa->is_utf8 = 1;
910#endif
911
912 /* We check exhaustively in the loop below if this charset is a
913 superset of ASCII. */
914 dfa->map_notascii = 0;
915#endif
916
917#ifdef RE_ENABLE_I18N
918 if (dfa->mb_cur_max > 1)
919 {
920 if (dfa->is_utf8)
921 {
922#if !defined(__GNUC__) || __GNUC__ < 3
923 static short utf8_sb_map_inited = 0;
924
925 if (! utf8_sb_map_inited)
926 {
927 int i;
928
929 utf8_sb_map_inited = 0;
930 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
931 utf8_sb_map[i] = BITSET_WORD_MAX;
932 }
933#endif
934 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
935 }
936 else
937 {
938 int i, j, ch;
939
940 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
941 if (BE (dfa->sb_char == NULL, 0))
942 return REG_ESPACE;
943
944 /* Set the bits corresponding to single byte chars. */
945 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
946 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
947 {
948 wint_t wch = __btowc (ch);
949 if (wch != WEOF)
950 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
951# ifndef _LIBC
952 if (isascii (ch) && wch != ch)
953 dfa->map_notascii = 1;
954# endif
955 }
956 }
957 }
958#endif
959
960 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
961 return REG_ESPACE;
962 return REG_NOERROR;
963}
964
965/* Initialize WORD_CHAR table, which indicate which character is
966 "word". In this case "word" means that it is the word construction
967 character used by some operators like "\<", "\>", etc. */
968
969static void
970internal_function
971init_word_char (re_dfa_t *dfa)
972{
973 int i, j, ch;
974 dfa->word_ops_used = 1;
975 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
976 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
977 if (isalnum (ch) || ch == '_')
978 dfa->word_char[i] |= (bitset_word_t) 1 << j;
979}
980
981/* Free the work area which are only used while compiling. */
982
983static void
984free_workarea_compile (regex_t *preg)
985{
986 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
987 bin_tree_storage_t *storage, *next;
988 for (storage = dfa->str_tree_storage; storage; storage = next)
989 {
990 next = storage->next;
991 re_free (storage);
992 }
993 dfa->str_tree_storage = NULL;
994 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
995 dfa->str_tree = NULL;
996 re_free (dfa->org_indices);
997 dfa->org_indices = NULL;
998}
999
1000/* Create initial states for all contexts. */
1001
1002static reg_errcode_t
1003create_initial_state (re_dfa_t *dfa)
1004{
1005 int first, i;
1006 reg_errcode_t err;
1007 re_node_set init_nodes;
1008
1009 /* Initial states have the epsilon closure of the node which is
1010 the first node of the regular expression. */
1011 first = dfa->str_tree->first->node_idx;
1012 dfa->init_node = first;
1013 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1014 if (BE (err != REG_NOERROR, 0))
1015 return err;
1016
1017 /* The back-references which are in initial states can epsilon transit,
1018 since in this case all of the subexpressions can be null.
1019 Then we add epsilon closures of the nodes which are the next nodes of
1020 the back-references. */
1021 if (dfa->nbackref > 0)
1022 for (i = 0; i < init_nodes.nelem; ++i)
1023 {
1024 int node_idx = init_nodes.elems[i];
1025 re_token_type_t type = dfa->nodes[node_idx].type;
1026
1027 int clexp_idx;
1028 if (type != OP_BACK_REF)
1029 continue;
1030 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1031 {
1032 re_token_t *clexp_node;
1033 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1034 if (clexp_node->type == OP_CLOSE_SUBEXP
1035 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1036 break;
1037 }
1038 if (clexp_idx == init_nodes.nelem)
1039 continue;
1040
1041 if (type == OP_BACK_REF)
1042 {
1043 int dest_idx = dfa->edests[node_idx].elems[0];
1044 if (!re_node_set_contains (&init_nodes, dest_idx))
1045 {
1046 reg_errcode_t err = re_node_set_merge (&init_nodes,
1047 dfa->eclosures
1048 + dest_idx);
1049 if (err != REG_NOERROR)
1050 return err;
1051 i = 0;
1052 }
1053 }
1054 }
1055
1056 /* It must be the first time to invoke acquire_state. */
1057 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1058 /* We don't check ERR here, since the initial state must not be NULL. */
1059 if (BE (dfa->init_state == NULL, 0))
1060 return err;
1061 if (dfa->init_state->has_constraint)
1062 {
1063 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1064 CONTEXT_WORD);
1065 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1066 CONTEXT_NEWLINE);
1067 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1068 &init_nodes,
1069 CONTEXT_NEWLINE
1070 | CONTEXT_BEGBUF);
1071 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1072 || dfa->init_state_begbuf == NULL, 0))
1073 return err;
1074 }
1075 else
1076 dfa->init_state_word = dfa->init_state_nl
1077 = dfa->init_state_begbuf = dfa->init_state;
1078
1079 re_node_set_free (&init_nodes);
1080 return REG_NOERROR;
1081}
1082\f
1083#ifdef RE_ENABLE_I18N
1084/* If it is possible to do searching in single byte encoding instead of UTF-8
1085 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1086 DFA nodes where needed. */
1087
1088static void
1089optimize_utf8 (re_dfa_t *dfa)
1090{
1091 int node, i, mb_chars = 0, has_period = 0;
1092
1093 for (node = 0; node < dfa->nodes_len; ++node)
1094 switch (dfa->nodes[node].type)
1095 {
1096 case CHARACTER:
1097 if (dfa->nodes[node].opr.c >= 0x80)
1098 mb_chars = 1;
1099 break;
1100 case ANCHOR:
1101 switch (dfa->nodes[node].opr.ctx_type)
1102 {
1103 case LINE_FIRST:
1104 case LINE_LAST:
1105 case BUF_FIRST:
1106 case BUF_LAST:
1107 break;
1108 default:
1109 /* Word anchors etc. cannot be handled. It's okay to test
1110 opr.ctx_type since constraints (for all DFA nodes) are
1111 created by ORing one or more opr.ctx_type values. */
1112 return;
1113 }
1114 break;
1115 case OP_PERIOD:
1116 has_period = 1;
1117 break;
1118 case OP_BACK_REF:
1119 case OP_ALT:
1120 case END_OF_RE:
1121 case OP_DUP_ASTERISK:
1122 case OP_OPEN_SUBEXP:
1123 case OP_CLOSE_SUBEXP:
1124 break;
1125 case COMPLEX_BRACKET:
1126 return;
1127 case SIMPLE_BRACKET:
1128 /* Just double check. The non-ASCII range starts at 0x80. */
1129 assert (0x80 % BITSET_WORD_BITS == 0);
1130 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131 if (dfa->nodes[node].opr.sbcset[i])
1132 return;
1133 break;
1134 default:
1135 abort ();
1136 }
1137
1138 if (mb_chars || has_period)
1139 for (node = 0; node < dfa->nodes_len; ++node)
1140 {
1141 if (dfa->nodes[node].type == CHARACTER
1142 && dfa->nodes[node].opr.c >= 0x80)
1143 dfa->nodes[node].mb_partial = 0;
1144 else if (dfa->nodes[node].type == OP_PERIOD)
1145 dfa->nodes[node].type = OP_UTF8_PERIOD;
1146 }
1147
1148 /* The search can be in single byte locale. */
1149 dfa->mb_cur_max = 1;
1150 dfa->is_utf8 = 0;
1151 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1152}
1153#endif
1154\f
1155/* Analyze the structure tree, and calculate "first", "next", "edest",
1156 "eclosure", and "inveclosure". */
1157
1158static reg_errcode_t
1159analyze (regex_t *preg)
1160{
1161 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1162 reg_errcode_t ret;
1163
1164 /* Allocate arrays. */
1165 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1166 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1167 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1168 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1169 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1170 || dfa->eclosures == NULL, 0))
1171 return REG_ESPACE;
1172
1173 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1174 if (dfa->subexp_map != NULL)
1175 {
1176 int i;
1177 for (i = 0; i < preg->re_nsub; i++)
1178 dfa->subexp_map[i] = i;
1179 preorder (dfa->str_tree, optimize_subexps, dfa);
1180 for (i = 0; i < preg->re_nsub; i++)
1181 if (dfa->subexp_map[i] != i)
1182 break;
1183 if (i == preg->re_nsub)
1184 {
1185 free (dfa->subexp_map);
1186 dfa->subexp_map = NULL;
1187 }
1188 }
1189
1190 ret = postorder (dfa->str_tree, lower_subexps, preg);
1191 if (BE (ret != REG_NOERROR, 0))
1192 return ret;
1193 ret = postorder (dfa->str_tree, calc_first, dfa);
1194 if (BE (ret != REG_NOERROR, 0))
1195 return ret;
1196 preorder (dfa->str_tree, calc_next, dfa);
1197 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1198 if (BE (ret != REG_NOERROR, 0))
1199 return ret;
1200 ret = calc_eclosure (dfa);
1201 if (BE (ret != REG_NOERROR, 0))
1202 return ret;
1203
1204 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1205 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1206 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1207 || dfa->nbackref)
1208 {
1209 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1210 if (BE (dfa->inveclosures == NULL, 0))
1211 return REG_ESPACE;
1212 ret = calc_inveclosure (dfa);
1213 }
1214
1215 return ret;
1216}
1217
1218/* Our parse trees are very unbalanced, so we cannot use a stack to
1219 implement parse tree visits. Instead, we use parent pointers and
1220 some hairy code in these two functions. */
1221static reg_errcode_t
1222postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1223 void *extra)
1224{
1225 bin_tree_t *node, *prev;
1226
1227 for (node = root; ; )
1228 {
1229 /* Descend down the tree, preferably to the left (or to the right
1230 if that's the only child). */
1231 while (node->left || node->right)
1232 if (node->left)
1233 node = node->left;
1234 else
1235 node = node->right;
1236
1237 do
1238 {
1239 reg_errcode_t err = fn (extra, node);
1240 if (BE (err != REG_NOERROR, 0))
1241 return err;
1242 if (node->parent == NULL)
1243 return REG_NOERROR;
1244 prev = node;
1245 node = node->parent;
1246 }
1247 /* Go up while we have a node that is reached from the right. */
1248 while (node->right == prev || node->right == NULL);
1249 node = node->right;
1250 }
1251}
1252
1253static reg_errcode_t
1254preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1255 void *extra)
1256{
1257 bin_tree_t *node;
1258
1259 for (node = root; ; )
1260 {
1261 reg_errcode_t err = fn (extra, node);
1262 if (BE (err != REG_NOERROR, 0))
1263 return err;
1264
1265 /* Go to the left node, or up and to the right. */
1266 if (node->left)
1267 node = node->left;
1268 else
1269 {
1270 bin_tree_t *prev = NULL;
1271 while (node->right == prev || node->right == NULL)
1272 {
1273 prev = node;
1274 node = node->parent;
1275 if (!node)
1276 return REG_NOERROR;
1277 }
1278 node = node->right;
1279 }
1280 }
1281}
1282
1283/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1284 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1285 backreferences as well. Requires a preorder visit. */
1286static reg_errcode_t
1287optimize_subexps (void *extra, bin_tree_t *node)
1288{
1289 re_dfa_t *dfa = (re_dfa_t *) extra;
1290
1291 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1292 {
1293 int idx = node->token.opr.idx;
1294 node->token.opr.idx = dfa->subexp_map[idx];
1295 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1296 }
1297
1298 else if (node->token.type == SUBEXP
1299 && node->left && node->left->token.type == SUBEXP)
1300 {
1301 int other_idx = node->left->token.opr.idx;
1302
1303 node->left = node->left->left;
1304 if (node->left)
1305 node->left->parent = node;
1306
1307 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1308 if (other_idx < BITSET_WORD_BITS)
1309 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1310 }
1311
1312 return REG_NOERROR;
1313}
1314
1315/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1316 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1317static reg_errcode_t
1318lower_subexps (void *extra, bin_tree_t *node)
1319{
1320 regex_t *preg = (regex_t *) extra;
1321 reg_errcode_t err = REG_NOERROR;
1322
1323 if (node->left && node->left->token.type == SUBEXP)
1324 {
1325 node->left = lower_subexp (&err, preg, node->left);
1326 if (node->left)
1327 node->left->parent = node;
1328 }
1329 if (node->right && node->right->token.type == SUBEXP)
1330 {
1331 node->right = lower_subexp (&err, preg, node->right);
1332 if (node->right)
1333 node->right->parent = node;
1334 }
1335
1336 return err;
1337}
1338
1339static bin_tree_t *
1340lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1341{
1342 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1343 bin_tree_t *body = node->left;
1344 bin_tree_t *op, *cls, *tree1, *tree;
1345
1346 if (preg->no_sub
1347 /* We do not optimize empty subexpressions, because otherwise we may
1348 have bad CONCAT nodes with NULL children. This is obviously not
1349 very common, so we do not lose much. An example that triggers
1350 this case is the sed "script" /\(\)/x. */
1351 && node->left != NULL
1352 && (node->token.opr.idx >= BITSET_WORD_BITS
1353 || !(dfa->used_bkref_map
1354 & ((bitset_word_t) 1 << node->token.opr.idx))))
1355 return node->left;
1356
1357 /* Convert the SUBEXP node to the concatenation of an
1358 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1359 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1360 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1361 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1362 tree = create_tree (dfa, op, tree1, CONCAT);
1363 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1364 {
1365 *err = REG_ESPACE;
1366 return NULL;
1367 }
1368
1369 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1370 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1371 return tree;
1372}
1373
1374/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1375 nodes. Requires a postorder visit. */
1376static reg_errcode_t
1377calc_first (void *extra, bin_tree_t *node)
1378{
1379 re_dfa_t *dfa = (re_dfa_t *) extra;
1380 if (node->token.type == CONCAT)
1381 {
1382 node->first = node->left->first;
1383 node->node_idx = node->left->node_idx;
1384 }
1385 else
1386 {
1387 node->first = node;
1388 node->node_idx = re_dfa_add_node (dfa, node->token);
1389 if (BE (node->node_idx == -1, 0))
1390 return REG_ESPACE;
1391 if (node->token.type == ANCHOR)
1392 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1393 }
1394 return REG_NOERROR;
1395}
1396
1397/* Pass 2: compute NEXT on the tree. Preorder visit. */
1398static reg_errcode_t
1399calc_next (void *extra, bin_tree_t *node)
1400{
1401 switch (node->token.type)
1402 {
1403 case OP_DUP_ASTERISK:
1404 node->left->next = node;
1405 break;
1406 case CONCAT:
1407 node->left->next = node->right->first;
1408 node->right->next = node->next;
1409 break;
1410 default:
1411 if (node->left)
1412 node->left->next = node->next;
1413 if (node->right)
1414 node->right->next = node->next;
1415 break;
1416 }
1417 return REG_NOERROR;
1418}
1419
1420/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1421static reg_errcode_t
1422link_nfa_nodes (void *extra, bin_tree_t *node)
1423{
1424 re_dfa_t *dfa = (re_dfa_t *) extra;
1425 int idx = node->node_idx;
1426 reg_errcode_t err = REG_NOERROR;
1427
1428 switch (node->token.type)
1429 {
1430 case CONCAT:
1431 break;
1432
1433 case END_OF_RE:
1434 assert (node->next == NULL);
1435 break;
1436
1437 case OP_DUP_ASTERISK:
1438 case OP_ALT:
1439 {
1440 int left, right;
1441 dfa->has_plural_match = 1;
1442 if (node->left != NULL)
1443 left = node->left->first->node_idx;
1444 else
1445 left = node->next->node_idx;
1446 if (node->right != NULL)
1447 right = node->right->first->node_idx;
1448 else
1449 right = node->next->node_idx;
1450 assert (left > -1);
1451 assert (right > -1);
1452 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1453 }
1454 break;
1455
1456 case ANCHOR:
1457 case OP_OPEN_SUBEXP:
1458 case OP_CLOSE_SUBEXP:
1459 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1460 break;
1461
1462 case OP_BACK_REF:
1463 dfa->nexts[idx] = node->next->node_idx;
1464 if (node->token.type == OP_BACK_REF)
1465 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1466 break;
1467
1468 default:
1469 assert (!IS_EPSILON_NODE (node->token.type));
1470 dfa->nexts[idx] = node->next->node_idx;
1471 break;
1472 }
1473
1474 return err;
1475}
1476
1477/* Duplicate the epsilon closure of the node ROOT_NODE.
1478 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1479 to their own constraint. */
1480
1481static reg_errcode_t
1482internal_function
1483duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1484 int root_node, unsigned int init_constraint)
1485{
1486 int org_node, clone_node, ret;
1487 unsigned int constraint = init_constraint;
1488 for (org_node = top_org_node, clone_node = top_clone_node;;)
1489 {
1490 int org_dest, clone_dest;
1491 if (dfa->nodes[org_node].type == OP_BACK_REF)
1492 {
1493 /* If the back reference epsilon-transit, its destination must
1494 also have the constraint. Then duplicate the epsilon closure
1495 of the destination of the back reference, and store it in
1496 edests of the back reference. */
1497 org_dest = dfa->nexts[org_node];
1498 re_node_set_empty (dfa->edests + clone_node);
1499 clone_dest = duplicate_node (dfa, org_dest, constraint);
1500 if (BE (clone_dest == -1, 0))
1501 return REG_ESPACE;
1502 dfa->nexts[clone_node] = dfa->nexts[org_node];
1503 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504 if (BE (ret < 0, 0))
1505 return REG_ESPACE;
1506 }
1507 else if (dfa->edests[org_node].nelem == 0)
1508 {
1509 /* In case of the node can't epsilon-transit, don't duplicate the
1510 destination and store the original destination as the
1511 destination of the node. */
1512 dfa->nexts[clone_node] = dfa->nexts[org_node];
1513 break;
1514 }
1515 else if (dfa->edests[org_node].nelem == 1)
1516 {
1517 /* In case of the node can epsilon-transit, and it has only one
1518 destination. */
1519 org_dest = dfa->edests[org_node].elems[0];
1520 re_node_set_empty (dfa->edests + clone_node);
1521 /* If the node is root_node itself, it means the epsilon clsoure
1522 has a loop. Then tie it to the destination of the root_node. */
1523 if (org_node == root_node && clone_node != org_node)
1524 {
1525 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1526 if (BE (ret < 0, 0))
1527 return REG_ESPACE;
1528 break;
1529 }
1530 /* In case of the node has another constraint, add it. */
1531 constraint |= dfa->nodes[org_node].constraint;
1532 clone_dest = duplicate_node (dfa, org_dest, constraint);
1533 if (BE (clone_dest == -1, 0))
1534 return REG_ESPACE;
1535 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 if (BE (ret < 0, 0))
1537 return REG_ESPACE;
1538 }
1539 else /* dfa->edests[org_node].nelem == 2 */
1540 {
1541 /* In case of the node can epsilon-transit, and it has two
1542 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1543 org_dest = dfa->edests[org_node].elems[0];
1544 re_node_set_empty (dfa->edests + clone_node);
1545 /* Search for a duplicated node which satisfies the constraint. */
1546 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1547 if (clone_dest == -1)
1548 {
1549 /* There is no such duplicated node, create a new one. */
1550 reg_errcode_t err;
1551 clone_dest = duplicate_node (dfa, org_dest, constraint);
1552 if (BE (clone_dest == -1, 0))
1553 return REG_ESPACE;
1554 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555 if (BE (ret < 0, 0))
1556 return REG_ESPACE;
1557 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1558 root_node, constraint);
1559 if (BE (err != REG_NOERROR, 0))
1560 return err;
1561 }
1562 else
1563 {
1564 /* There is a duplicated node which satisfies the constraint,
1565 use it to avoid infinite loop. */
1566 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 if (BE (ret < 0, 0))
1568 return REG_ESPACE;
1569 }
1570
1571 org_dest = dfa->edests[org_node].elems[1];
1572 clone_dest = duplicate_node (dfa, org_dest, constraint);
1573 if (BE (clone_dest == -1, 0))
1574 return REG_ESPACE;
1575 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1576 if (BE (ret < 0, 0))
1577 return REG_ESPACE;
1578 }
1579 org_node = org_dest;
1580 clone_node = clone_dest;
1581 }
1582 return REG_NOERROR;
1583}
1584
1585/* Search for a node which is duplicated from the node ORG_NODE, and
1586 satisfies the constraint CONSTRAINT. */
1587
1588static int
1589search_duplicated_node (const re_dfa_t *dfa, int org_node,
1590 unsigned int constraint)
1591{
1592 int idx;
1593 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1594 {
1595 if (org_node == dfa->org_indices[idx]
1596 && constraint == dfa->nodes[idx].constraint)
1597 return idx; /* Found. */
1598 }
1599 return -1; /* Not found. */
1600}
1601
1602/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1603 Return the index of the new node, or -1 if insufficient storage is
1604 available. */
1605
1606static int
1607duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1608{
1609 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1610 if (BE (dup_idx != -1, 1))
1611 {
1612 dfa->nodes[dup_idx].constraint = constraint;
1613 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1614 dfa->nodes[dup_idx].duplicated = 1;
1615
1616 /* Store the index of the original node. */
1617 dfa->org_indices[dup_idx] = org_idx;
1618 }
1619 return dup_idx;
1620}
1621
1622static reg_errcode_t
1623calc_inveclosure (re_dfa_t *dfa)
1624{
1625 int src, idx, ret;
1626 for (idx = 0; idx < dfa->nodes_len; ++idx)
1627 re_node_set_init_empty (dfa->inveclosures + idx);
1628
1629 for (src = 0; src < dfa->nodes_len; ++src)
1630 {
1631 int *elems = dfa->eclosures[src].elems;
1632 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1633 {
1634 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1635 if (BE (ret == -1, 0))
1636 return REG_ESPACE;
1637 }
1638 }
1639
1640 return REG_NOERROR;
1641}
1642
1643/* Calculate "eclosure" for all the node in DFA. */
1644
1645static reg_errcode_t
1646calc_eclosure (re_dfa_t *dfa)
1647{
1648 int node_idx, incomplete;
1649#ifdef DEBUG
1650 assert (dfa->nodes_len > 0);
1651#endif
1652 incomplete = 0;
1653 /* For each nodes, calculate epsilon closure. */
1654 for (node_idx = 0; ; ++node_idx)
1655 {
1656 reg_errcode_t err;
1657 re_node_set eclosure_elem;
1658 if (node_idx == dfa->nodes_len)
1659 {
1660 if (!incomplete)
1661 break;
1662 incomplete = 0;
1663 node_idx = 0;
1664 }
1665
1666#ifdef DEBUG
1667 assert (dfa->eclosures[node_idx].nelem != -1);
1668#endif
1669
1670 /* If we have already calculated, skip it. */
1671 if (dfa->eclosures[node_idx].nelem != 0)
1672 continue;
1673 /* Calculate epsilon closure of `node_idx'. */
1674 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1675 if (BE (err != REG_NOERROR, 0))
1676 return err;
1677
1678 if (dfa->eclosures[node_idx].nelem == 0)
1679 {
1680 incomplete = 1;
1681 re_node_set_free (&eclosure_elem);
1682 }
1683 }
1684 return REG_NOERROR;
1685}
1686
1687/* Calculate epsilon closure of NODE. */
1688
1689static reg_errcode_t
1690calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1691{
1692 reg_errcode_t err;
1693 int i;
1694 re_node_set eclosure;
1695 int ret;
1696 int incomplete = 0;
1697 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1700
1701 /* This indicates that we are calculating this node now.
1702 We reference this value to avoid infinite loop. */
1703 dfa->eclosures[node].nelem = -1;
1704
1705 /* If the current node has constraints, duplicate all nodes
1706 since they must inherit the constraints. */
1707 if (dfa->nodes[node].constraint
1708 && dfa->edests[node].nelem
1709 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1710 {
1711 err = duplicate_node_closure (dfa, node, node, node,
1712 dfa->nodes[node].constraint);
1713 if (BE (err != REG_NOERROR, 0))
1714 return err;
1715 }
1716
1717 /* Expand each epsilon destination nodes. */
1718 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1719 for (i = 0; i < dfa->edests[node].nelem; ++i)
1720 {
1721 re_node_set eclosure_elem;
1722 int edest = dfa->edests[node].elems[i];
1723 /* If calculating the epsilon closure of `edest' is in progress,
1724 return intermediate result. */
1725 if (dfa->eclosures[edest].nelem == -1)
1726 {
1727 incomplete = 1;
1728 continue;
1729 }
1730 /* If we haven't calculated the epsilon closure of `edest' yet,
1731 calculate now. Otherwise use calculated epsilon closure. */
1732 if (dfa->eclosures[edest].nelem == 0)
1733 {
1734 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1735 if (BE (err != REG_NOERROR, 0))
1736 return err;
1737 }
1738 else
1739 eclosure_elem = dfa->eclosures[edest];
1740 /* Merge the epsilon closure of `edest'. */
1741 err = re_node_set_merge (&eclosure, &eclosure_elem);
1742 if (BE (err != REG_NOERROR, 0))
1743 return err;
1744 /* If the epsilon closure of `edest' is incomplete,
1745 the epsilon closure of this node is also incomplete. */
1746 if (dfa->eclosures[edest].nelem == 0)
1747 {
1748 incomplete = 1;
1749 re_node_set_free (&eclosure_elem);
1750 }
1751 }
1752
1753 /* An epsilon closure includes itself. */
1754 ret = re_node_set_insert (&eclosure, node);
1755 if (BE (ret < 0, 0))
1756 return REG_ESPACE;
1757 if (incomplete && !root)
1758 dfa->eclosures[node].nelem = 0;
1759 else
1760 dfa->eclosures[node] = eclosure;
1761 *new_set = eclosure;
1762 return REG_NOERROR;
1763}
1764\f
1765/* Functions for token which are used in the parser. */
1766
1767/* Fetch a token from INPUT.
1768 We must not use this function inside bracket expressions. */
1769
1770static void
1771internal_function
1772fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773{
1774 re_string_skip_bytes (input, peek_token (result, input, syntax));
1775}
1776
1777/* Peek a token from INPUT, and return the length of the token.
1778 We must not use this function inside bracket expressions. */
1779
1780static int
1781internal_function
1782peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1783{
1784 unsigned char c;
1785
1786 if (re_string_eoi (input))
1787 {
1788 token->type = END_OF_RE;
1789 return 0;
1790 }
1791
1792 c = re_string_peek_byte (input, 0);
1793 token->opr.c = c;
1794
1795 token->word_char = 0;
1796#ifdef RE_ENABLE_I18N
1797 token->mb_partial = 0;
1798 if (input->mb_cur_max > 1 &&
1799 !re_string_first_byte (input, re_string_cur_idx (input)))
1800 {
1801 token->type = CHARACTER;
1802 token->mb_partial = 1;
1803 return 1;
1804 }
1805#endif
1806 if (c == '\\')
1807 {
1808 unsigned char c2;
1809 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1810 {
1811 token->type = BACK_SLASH;
1812 return 1;
1813 }
1814
1815 c2 = re_string_peek_byte_case (input, 1);
1816 token->opr.c = c2;
1817 token->type = CHARACTER;
1818#ifdef RE_ENABLE_I18N
1819 if (input->mb_cur_max > 1)
1820 {
1821 wint_t wc = re_string_wchar_at (input,
1822 re_string_cur_idx (input) + 1);
1823 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1824 }
1825 else
1826#endif
1827 token->word_char = IS_WORD_CHAR (c2) != 0;
1828
1829 switch (c2)
1830 {
1831 case '|':
1832 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833 token->type = OP_ALT;
1834 break;
1835 case '1': case '2': case '3': case '4': case '5':
1836 case '6': case '7': case '8': case '9':
1837 if (!(syntax & RE_NO_BK_REFS))
1838 {
1839 token->type = OP_BACK_REF;
1840 token->opr.idx = c2 - '1';
1841 }
1842 break;
1843 case '<':
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 {
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = WORD_FIRST;
1848 }
1849 break;
1850 case '>':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = WORD_LAST;
1855 }
1856 break;
1857 case 'b':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = WORD_DELIM;
1862 }
1863 break;
1864 case 'B':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 {
1867 token->type = ANCHOR;
1868 token->opr.ctx_type = NOT_WORD_DELIM;
1869 }
1870 break;
1871 case 'w':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_WORD;
1874 break;
1875 case 'W':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTWORD;
1878 break;
1879 case 's':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_SPACE;
1882 break;
1883 case 'S':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_NOTSPACE;
1886 break;
1887 case '`':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 {
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = BUF_FIRST;
1892 }
1893 break;
1894 case '\'':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 {
1897 token->type = ANCHOR;
1898 token->opr.ctx_type = BUF_LAST;
1899 }
1900 break;
1901 case '(':
1902 if (!(syntax & RE_NO_BK_PARENS))
1903 token->type = OP_OPEN_SUBEXP;
1904 break;
1905 case ')':
1906 if (!(syntax & RE_NO_BK_PARENS))
1907 token->type = OP_CLOSE_SUBEXP;
1908 break;
1909 case '+':
1910 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_PLUS;
1912 break;
1913 case '?':
1914 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915 token->type = OP_DUP_QUESTION;
1916 break;
1917 case '{':
1918 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919 token->type = OP_OPEN_DUP_NUM;
1920 break;
1921 case '}':
1922 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923 token->type = OP_CLOSE_DUP_NUM;
1924 break;
1925 default:
1926 break;
1927 }
1928 return 2;
1929 }
1930
1931 token->type = CHARACTER;
1932#ifdef RE_ENABLE_I18N
1933 if (input->mb_cur_max > 1)
1934 {
1935 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1937 }
1938 else
1939#endif
1940 token->word_char = IS_WORD_CHAR (token->opr.c);
1941
1942 switch (c)
1943 {
1944 case '\n':
1945 if (syntax & RE_NEWLINE_ALT)
1946 token->type = OP_ALT;
1947 break;
1948 case '|':
1949 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1950 token->type = OP_ALT;
1951 break;
1952 case '*':
1953 token->type = OP_DUP_ASTERISK;
1954 break;
1955 case '+':
1956 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957 token->type = OP_DUP_PLUS;
1958 break;
1959 case '?':
1960 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961 token->type = OP_DUP_QUESTION;
1962 break;
1963 case '{':
1964 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965 token->type = OP_OPEN_DUP_NUM;
1966 break;
1967 case '}':
1968 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969 token->type = OP_CLOSE_DUP_NUM;
1970 break;
1971 case '(':
1972 if (syntax & RE_NO_BK_PARENS)
1973 token->type = OP_OPEN_SUBEXP;
1974 break;
1975 case ')':
1976 if (syntax & RE_NO_BK_PARENS)
1977 token->type = OP_CLOSE_SUBEXP;
1978 break;
1979 case '[':
1980 token->type = OP_OPEN_BRACKET;
1981 break;
1982 case '.':
1983 token->type = OP_PERIOD;
1984 break;
1985 case '^':
1986 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1987 re_string_cur_idx (input) != 0)
1988 {
1989 char prev = re_string_peek_byte (input, -1);
1990 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1991 break;
1992 }
1993 token->type = ANCHOR;
1994 token->opr.ctx_type = LINE_FIRST;
1995 break;
1996 case '$':
1997 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1998 re_string_cur_idx (input) + 1 != re_string_length (input))
1999 {
2000 re_token_t next;
2001 re_string_skip_bytes (input, 1);
2002 peek_token (&next, input, syntax);
2003 re_string_skip_bytes (input, -1);
2004 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005 break;
2006 }
2007 token->type = ANCHOR;
2008 token->opr.ctx_type = LINE_LAST;
2009 break;
2010 default:
2011 break;
2012 }
2013 return 1;
2014}
2015
2016/* Peek a token from INPUT, and return the length of the token.
2017 We must not use this function out of bracket expressions. */
2018
2019static int
2020internal_function
2021peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2022{
2023 unsigned char c;
2024 if (re_string_eoi (input))
2025 {
2026 token->type = END_OF_RE;
2027 return 0;
2028 }
2029 c = re_string_peek_byte (input, 0);
2030 token->opr.c = c;
2031
2032#ifdef RE_ENABLE_I18N
2033 if (input->mb_cur_max > 1 &&
2034 !re_string_first_byte (input, re_string_cur_idx (input)))
2035 {
2036 token->type = CHARACTER;
2037 return 1;
2038 }
2039#endif /* RE_ENABLE_I18N */
2040
2041 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2042 && re_string_cur_idx (input) + 1 < re_string_length (input))
2043 {
2044 /* In this case, '\' escape a character. */
2045 unsigned char c2;
2046 re_string_skip_bytes (input, 1);
2047 c2 = re_string_peek_byte (input, 0);
2048 token->opr.c = c2;
2049 token->type = CHARACTER;
2050 return 1;
2051 }
2052 if (c == '[') /* '[' is a special char in a bracket exps. */
2053 {
2054 unsigned char c2;
2055 int token_len;
2056 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2057 c2 = re_string_peek_byte (input, 1);
2058 else
2059 c2 = 0;
2060 token->opr.c = c2;
2061 token_len = 2;
2062 switch (c2)
2063 {
2064 case '.':
2065 token->type = OP_OPEN_COLL_ELEM;
2066 break;
2067 case '=':
2068 token->type = OP_OPEN_EQUIV_CLASS;
2069 break;
2070 case ':':
2071 if (syntax & RE_CHAR_CLASSES)
2072 {
2073 token->type = OP_OPEN_CHAR_CLASS;
2074 break;
2075 }
2076 /* else fall through. */
2077 default:
2078 token->type = CHARACTER;
2079 token->opr.c = c;
2080 token_len = 1;
2081 break;
2082 }
2083 return token_len;
2084 }
2085 switch (c)
2086 {
2087 case '-':
2088 token->type = OP_CHARSET_RANGE;
2089 break;
2090 case ']':
2091 token->type = OP_CLOSE_BRACKET;
2092 break;
2093 case '^':
2094 token->type = OP_NON_MATCH_LIST;
2095 break;
2096 default:
2097 token->type = CHARACTER;
2098 }
2099 return 1;
2100}
2101\f
2102/* Functions for parser. */
2103
2104/* Entry point of the parser.
2105 Parse the regular expression REGEXP and return the structure tree.
2106 If an error is occured, ERR is set by error code, and return NULL.
2107 This function build the following tree, from regular expression <reg_exp>:
2108 CAT
2109 / \
2110 / \
2111 <reg_exp> EOR
2112
2113 CAT means concatenation.
2114 EOR means end of regular expression. */
2115
2116static bin_tree_t *
2117parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2118 reg_errcode_t *err)
2119{
2120 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2121 bin_tree_t *tree, *eor, *root;
2122 re_token_t current_token;
2123 dfa->syntax = syntax;
2124 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2126 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2127 return NULL;
2128 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129 if (tree != NULL)
2130 root = create_tree (dfa, tree, eor, CONCAT);
2131 else
2132 root = eor;
2133 if (BE (eor == NULL || root == NULL, 0))
2134 {
2135 *err = REG_ESPACE;
2136 return NULL;
2137 }
2138 return root;
2139}
2140
2141/* This function build the following tree, from regular expression
2142 <branch1>|<branch2>:
2143 ALT
2144 / \
2145 / \
2146 <branch1> <branch2>
2147
2148 ALT means alternative, which represents the operator `|'. */
2149
2150static bin_tree_t *
2151parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2153{
2154 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2155 bin_tree_t *tree, *branch = NULL;
2156 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2157 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2158 return NULL;
2159
2160 while (token->type == OP_ALT)
2161 {
2162 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2163 if (token->type != OP_ALT && token->type != END_OF_RE
2164 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165 {
2166 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2167 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2168 return NULL;
2169 }
2170 else
2171 branch = NULL;
2172 tree = create_tree (dfa, tree, branch, OP_ALT);
2173 if (BE (tree == NULL, 0))
2174 {
2175 *err = REG_ESPACE;
2176 return NULL;
2177 }
2178 }
2179 return tree;
2180}
2181
2182/* This function build the following tree, from regular expression
2183 <exp1><exp2>:
2184 CAT
2185 / \
2186 / \
2187 <exp1> <exp2>
2188
2189 CAT means concatenation. */
2190
2191static bin_tree_t *
2192parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2193 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2194{
2195 bin_tree_t *tree, *exp;
2196 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2197 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2198 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199 return NULL;
2200
2201 while (token->type != OP_ALT && token->type != END_OF_RE
2202 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2203 {
2204 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2205 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2206 {
2207 return NULL;
2208 }
2209 if (tree != NULL && exp != NULL)
2210 {
2211 tree = create_tree (dfa, tree, exp, CONCAT);
2212 if (tree == NULL)
2213 {
2214 *err = REG_ESPACE;
2215 return NULL;
2216 }
2217 }
2218 else if (tree == NULL)
2219 tree = exp;
2220 /* Otherwise exp == NULL, we don't need to create new tree. */
2221 }
2222 return tree;
2223}
2224
2225/* This function build the following tree, from regular expression a*:
2226 *
2227 |
2228 a
2229*/
2230
2231static bin_tree_t *
2232parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2233 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2234{
2235 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2236 bin_tree_t *tree;
2237 switch (token->type)
2238 {
2239 case CHARACTER:
2240 tree = create_token_tree (dfa, NULL, NULL, token);
2241 if (BE (tree == NULL, 0))
2242 {
2243 *err = REG_ESPACE;
2244 return NULL;
2245 }
2246#ifdef RE_ENABLE_I18N
2247 if (dfa->mb_cur_max > 1)
2248 {
2249 while (!re_string_eoi (regexp)
2250 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2251 {
2252 bin_tree_t *mbc_remain;
2253 fetch_token (token, regexp, syntax);
2254 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2255 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2256 if (BE (mbc_remain == NULL || tree == NULL, 0))
2257 {
2258 *err = REG_ESPACE;
2259 return NULL;
2260 }
2261 }
2262 }
2263#endif
2264 break;
2265 case OP_OPEN_SUBEXP:
2266 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2267 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268 return NULL;
2269 break;
2270 case OP_OPEN_BRACKET:
2271 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2272 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2273 return NULL;
2274 break;
2275 case OP_BACK_REF:
2276 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2277 {
2278 *err = REG_ESUBREG;
2279 return NULL;
2280 }
2281 dfa->used_bkref_map |= 1 << token->opr.idx;
2282 tree = create_token_tree (dfa, NULL, NULL, token);
2283 if (BE (tree == NULL, 0))
2284 {
2285 *err = REG_ESPACE;
2286 return NULL;
2287 }
2288 ++dfa->nbackref;
2289 dfa->has_mb_node = 1;
2290 break;
2291 case OP_OPEN_DUP_NUM:
2292 if (syntax & RE_CONTEXT_INVALID_DUP)
2293 {
2294 *err = REG_BADRPT;
2295 return NULL;
2296 }
2297 /* FALLTHROUGH */
2298 case OP_DUP_ASTERISK:
2299 case OP_DUP_PLUS:
2300 case OP_DUP_QUESTION:
2301 if (syntax & RE_CONTEXT_INVALID_OPS)
2302 {
2303 *err = REG_BADRPT;
2304 return NULL;
2305 }
2306 else if (syntax & RE_CONTEXT_INDEP_OPS)
2307 {
2308 fetch_token (token, regexp, syntax);
2309 return parse_expression (regexp, preg, token, syntax, nest, err);
2310 }
2311 /* else fall through */
2312 case OP_CLOSE_SUBEXP:
2313 if ((token->type == OP_CLOSE_SUBEXP) &&
2314 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2315 {
2316 *err = REG_ERPAREN;
2317 return NULL;
2318 }
2319 /* else fall through */
2320 case OP_CLOSE_DUP_NUM:
2321 /* We treat it as a normal character. */
2322
2323 /* Then we can these characters as normal characters. */
2324 token->type = CHARACTER;
2325 /* mb_partial and word_char bits should be initialized already
2326 by peek_token. */
2327 tree = create_token_tree (dfa, NULL, NULL, token);
2328 if (BE (tree == NULL, 0))
2329 {
2330 *err = REG_ESPACE;
2331 return NULL;
2332 }
2333 break;
2334 case ANCHOR:
2335 if ((token->opr.ctx_type
2336 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2337 && dfa->word_ops_used == 0)
2338 init_word_char (dfa);
2339 if (token->opr.ctx_type == WORD_DELIM
2340 || token->opr.ctx_type == NOT_WORD_DELIM)
2341 {
2342 bin_tree_t *tree_first, *tree_last;
2343 if (token->opr.ctx_type == WORD_DELIM)
2344 {
2345 token->opr.ctx_type = WORD_FIRST;
2346 tree_first = create_token_tree (dfa, NULL, NULL, token);
2347 token->opr.ctx_type = WORD_LAST;
2348 }
2349 else
2350 {
2351 token->opr.ctx_type = INSIDE_WORD;
2352 tree_first = create_token_tree (dfa, NULL, NULL, token);
2353 token->opr.ctx_type = INSIDE_NOTWORD;
2354 }
2355 tree_last = create_token_tree (dfa, NULL, NULL, token);
2356 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2357 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2358 {
2359 *err = REG_ESPACE;
2360 return NULL;
2361 }
2362 }
2363 else
2364 {
2365 tree = create_token_tree (dfa, NULL, NULL, token);
2366 if (BE (tree == NULL, 0))
2367 {
2368 *err = REG_ESPACE;
2369 return NULL;
2370 }
2371 }
2372 /* We must return here, since ANCHORs can't be followed
2373 by repetition operators.
2374 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2375 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2376 fetch_token (token, regexp, syntax);
2377 return tree;
2378 case OP_PERIOD:
2379 tree = create_token_tree (dfa, NULL, NULL, token);
2380 if (BE (tree == NULL, 0))
2381 {
2382 *err = REG_ESPACE;
2383 return NULL;
2384 }
2385 if (dfa->mb_cur_max > 1)
2386 dfa->has_mb_node = 1;
2387 break;
2388 case OP_WORD:
2389 case OP_NOTWORD:
2390 tree = build_charclass_op (dfa, regexp->trans,
2391 "alnum",
2392 "_",
2393 token->type == OP_NOTWORD, err);
2394 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395 return NULL;
2396 break;
2397 case OP_SPACE:
2398 case OP_NOTSPACE:
2399 tree = build_charclass_op (dfa, regexp->trans,
2400 "space",
2401 "",
2402 token->type == OP_NOTSPACE, err);
2403 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2404 return NULL;
2405 break;
2406 case OP_ALT:
2407 case END_OF_RE:
2408 return NULL;
2409 case BACK_SLASH:
2410 *err = REG_EESCAPE;
2411 return NULL;
2412 default:
2413 /* Must not happen? */
2414#ifdef DEBUG
2415 assert (0);
2416#endif
2417 return NULL;
2418 }
2419 fetch_token (token, regexp, syntax);
2420
2421 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2422 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2423 {
2424 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2425 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426 return NULL;
2427 /* In BRE consecutive duplications are not allowed. */
2428 if ((syntax & RE_CONTEXT_INVALID_DUP)
2429 && (token->type == OP_DUP_ASTERISK
2430 || token->type == OP_OPEN_DUP_NUM))
2431 {
2432 *err = REG_BADRPT;
2433 return NULL;
2434 }
2435 }
2436
2437 return tree;
2438}
2439
2440/* This function build the following tree, from regular expression
2441 (<reg_exp>):
2442 SUBEXP
2443 |
2444 <reg_exp>
2445*/
2446
2447static bin_tree_t *
2448parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2449 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2450{
2451 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2452 bin_tree_t *tree;
2453 size_t cur_nsub;
2454 cur_nsub = preg->re_nsub++;
2455
2456 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2457
2458 /* The subexpression may be a null string. */
2459 if (token->type == OP_CLOSE_SUBEXP)
2460 tree = NULL;
2461 else
2462 {
2463 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2464 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2465 *err = REG_EPAREN;
2466 if (BE (*err != REG_NOERROR, 0))
2467 return NULL;
2468 }
2469
2470 if (cur_nsub <= '9' - '1')
2471 dfa->completed_bkref_map |= 1 << cur_nsub;
2472
2473 tree = create_tree (dfa, tree, NULL, SUBEXP);
2474 if (BE (tree == NULL, 0))
2475 {
2476 *err = REG_ESPACE;
2477 return NULL;
2478 }
2479 tree->token.opr.idx = cur_nsub;
2480 return tree;
2481}
2482
2483/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2484
2485static bin_tree_t *
2486parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2487 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2488{
2489 bin_tree_t *tree = NULL, *old_tree = NULL;
2490 int i, start, end, start_idx = re_string_cur_idx (regexp);
2491#ifndef RE_TOKEN_INIT_BUG
2492 re_token_t start_token = *token;
2493#else
2494 re_token_t start_token;
2495
2496 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2497#endif
2498
2499 if (token->type == OP_OPEN_DUP_NUM)
2500 {
2501 end = 0;
2502 start = fetch_number (regexp, token, syntax);
2503 if (start == -1)
2504 {
2505 if (token->type == CHARACTER && token->opr.c == ',')
2506 start = 0; /* We treat "{,m}" as "{0,m}". */
2507 else
2508 {
2509 *err = REG_BADBR; /* <re>{} is invalid. */
2510 return NULL;
2511 }
2512 }
2513 if (BE (start != -2, 1))
2514 {
2515 /* We treat "{n}" as "{n,n}". */
2516 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2517 : ((token->type == CHARACTER && token->opr.c == ',')
2518 ? fetch_number (regexp, token, syntax) : -2));
2519 }
2520 if (BE (start == -2 || end == -2, 0))
2521 {
2522 /* Invalid sequence. */
2523 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2524 {
2525 if (token->type == END_OF_RE)
2526 *err = REG_EBRACE;
2527 else
2528 *err = REG_BADBR;
2529
2530 return NULL;
2531 }
2532
2533 /* If the syntax bit is set, rollback. */
2534 re_string_set_index (regexp, start_idx);
2535 *token = start_token;
2536 token->type = CHARACTER;
2537 /* mb_partial and word_char bits should be already initialized by
2538 peek_token. */
2539 return elem;
2540 }
2541
2542 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2543 {
2544 /* First number greater than second. */
2545 *err = REG_BADBR;
2546 return NULL;
2547 }
2548 }
2549 else
2550 {
2551 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2552 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2553 }
2554
2555 fetch_token (token, regexp, syntax);
2556
2557 if (BE (elem == NULL, 0))
2558 return NULL;
2559 if (BE (start == 0 && end == 0, 0))
2560 {
2561 postorder (elem, free_tree, NULL);
2562 return NULL;
2563 }
2564
2565 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2566 if (BE (start > 0, 0))
2567 {
2568 tree = elem;
2569 for (i = 2; i <= start; ++i)
2570 {
2571 elem = duplicate_tree (elem, dfa);
2572 tree = create_tree (dfa, tree, elem, CONCAT);
2573 if (BE (elem == NULL || tree == NULL, 0))
2574 goto parse_dup_op_espace;
2575 }
2576
2577 if (start == end)
2578 return tree;
2579
2580 /* Duplicate ELEM before it is marked optional. */
2581 elem = duplicate_tree (elem, dfa);
2582 old_tree = tree;
2583 }
2584 else
2585 old_tree = NULL;
2586
2587 if (elem->token.type == SUBEXP)
2588 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2589
2590 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2591 if (BE (tree == NULL, 0))
2592 goto parse_dup_op_espace;
2593
2594 /* This loop is actually executed only when end != -1,
2595 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2596 already created the start+1-th copy. */
2597 for (i = start + 2; i <= end; ++i)
2598 {
2599 elem = duplicate_tree (elem, dfa);
2600 tree = create_tree (dfa, tree, elem, CONCAT);
2601 if (BE (elem == NULL || tree == NULL, 0))
2602 goto parse_dup_op_espace;
2603
2604 tree = create_tree (dfa, tree, NULL, OP_ALT);
2605 if (BE (tree == NULL, 0))
2606 goto parse_dup_op_espace;
2607 }
2608
2609 if (old_tree)
2610 tree = create_tree (dfa, old_tree, tree, CONCAT);
2611
2612 return tree;
2613
2614 parse_dup_op_espace:
2615 *err = REG_ESPACE;
2616 return NULL;
2617}
2618
2619/* Size of the names for collating symbol/equivalence_class/character_class.
2620 I'm not sure, but maybe enough. */
2621#define BRACKET_NAME_BUF_SIZE 32
2622
2623#ifndef _LIBC
2624 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2625 Build the range expression which starts from START_ELEM, and ends
2626 at END_ELEM. The result are written to MBCSET and SBCSET.
2627 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2628 mbcset->range_ends, is a pointer argument sinse we may
2629 update it. */
2630
2631static reg_errcode_t
2632internal_function
2633# ifdef RE_ENABLE_I18N
2634build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2635 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2636# else /* not RE_ENABLE_I18N */
2637build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2638 bracket_elem_t *end_elem)
2639# endif /* not RE_ENABLE_I18N */
2640{
2641 unsigned int start_ch, end_ch;
2642 /* Equivalence Classes and Character Classes can't be a range start/end. */
2643 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2644 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2645 0))
2646 return REG_ERANGE;
2647
2648 /* We can handle no multi character collating elements without libc
2649 support. */
2650 if (BE ((start_elem->type == COLL_SYM
2651 && strlen ((char *) start_elem->opr.name) > 1)
2652 || (end_elem->type == COLL_SYM
2653 && strlen ((char *) end_elem->opr.name) > 1), 0))
2654 return REG_ECOLLATE;
2655
2656# ifdef RE_ENABLE_I18N
2657 {
2658 wchar_t wc;
2659 wint_t start_wc;
2660 wint_t end_wc;
2661 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662
2663 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2664 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665 : 0));
2666 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2667 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668 : 0));
2669#ifdef GAWK
2670 /*
2671 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2672 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2673 * unsigned, so we don't have sign extension problems.
2674 */
2675 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2676 ? start_ch : start_elem->opr.wch);
2677 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2678 ? end_ch : end_elem->opr.wch);
2679#else
2680 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2681 ? __btowc (start_ch) : start_elem->opr.wch);
2682 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2683 ? __btowc (end_ch) : end_elem->opr.wch);
2684#endif
2685 if (start_wc == WEOF || end_wc == WEOF)
2686 return REG_ECOLLATE;
2687 cmp_buf[0] = start_wc;
2688 cmp_buf[4] = end_wc;
2689 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2690 return REG_ERANGE;
2691
2692 /* Got valid collation sequence values, add them as a new entry.
2693 However, for !_LIBC we have no collation elements: if the
2694 character set is single byte, the single byte character set
2695 that we build below suffices. parse_bracket_exp passes
2696 no MBCSET if dfa->mb_cur_max == 1. */
2697 if (mbcset)
2698 {
2699 /* Check the space of the arrays. */
2700 if (BE (*range_alloc == mbcset->nranges, 0))
2701 {
2702 /* There is not enough space, need realloc. */
2703 wchar_t *new_array_start, *new_array_end;
2704 int new_nranges;
2705
2706 /* +1 in case of mbcset->nranges is 0. */
2707 new_nranges = 2 * mbcset->nranges + 1;
2708 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2709 are NULL if *range_alloc == 0. */
2710 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2711 new_nranges);
2712 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2713 new_nranges);
2714
2715 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2716 return REG_ESPACE;
2717
2718 mbcset->range_starts = new_array_start;
2719 mbcset->range_ends = new_array_end;
2720 *range_alloc = new_nranges;
2721 }
2722
2723 mbcset->range_starts[mbcset->nranges] = start_wc;
2724 mbcset->range_ends[mbcset->nranges++] = end_wc;
2725 }
2726
2727 /* Build the table for single byte characters. */
2728 for (wc = 0; wc < SBC_MAX; ++wc)
2729 {
2730 cmp_buf[2] = wc;
2731 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2732 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2733 bitset_set (sbcset, wc);
2734 }
2735 }
2736# else /* not RE_ENABLE_I18N */
2737 {
2738 unsigned int ch;
2739 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2740 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2741 : 0));
2742 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2743 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2744 : 0));
2745 if (start_ch > end_ch)
2746 return REG_ERANGE;
2747 /* Build the table for single byte characters. */
2748 for (ch = 0; ch < SBC_MAX; ++ch)
2749 if (start_ch <= ch && ch <= end_ch)
2750 bitset_set (sbcset, ch);
2751 }
2752# endif /* not RE_ENABLE_I18N */
2753 return REG_NOERROR;
2754}
2755#endif /* not _LIBC */
2756
2757#ifndef _LIBC
2758/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2759 Build the collating element which is represented by NAME.
2760 The result are written to MBCSET and SBCSET.
2761 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2762 pointer argument since we may update it. */
2763
2764static reg_errcode_t
2765internal_function
2766# ifdef RE_ENABLE_I18N
2767build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2768 int *coll_sym_alloc, const unsigned char *name)
2769# else /* not RE_ENABLE_I18N */
2770build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2771# endif /* not RE_ENABLE_I18N */
2772{
2773 size_t name_len = strlen ((const char *) name);
2774 if (BE (name_len != 1, 0))
2775 return REG_ECOLLATE;
2776 else
2777 {
2778 bitset_set (sbcset, name[0]);
2779 return REG_NOERROR;
2780 }
2781}
2782#endif /* not _LIBC */
2783
2784/* This function parse bracket expression like "[abc]", "[a-c]",
2785 "[[.a-a.]]" etc. */
2786
2787static bin_tree_t *
2788parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2789 reg_syntax_t syntax, reg_errcode_t *err)
2790{
2791#ifdef _LIBC
2792 const unsigned char *collseqmb;
2793 const char *collseqwc;
2794 uint32_t nrules;
2795 int32_t table_size;
2796 const int32_t *symb_table;
2797 const unsigned char *extra;
2798
2799 /* Local function for parse_bracket_exp used in _LIBC environement.
2800 Seek the collating symbol entry correspondings to NAME.
2801 Return the index of the symbol in the SYMB_TABLE. */
2802
2803 auto inline int32_t
2804 __attribute ((always_inline))
2805 seek_collating_symbol_entry (name, name_len)
2806 const unsigned char *name;
2807 size_t name_len;
2808 {
2809 int32_t hash = elem_hash ((const char *) name, name_len);
2810 int32_t elem = hash % table_size;
2811 if (symb_table[2 * elem] != 0)
2812 {
2813 int32_t second = hash % (table_size - 2) + 1;
2814
2815 do
2816 {
2817 /* First compare the hashing value. */
2818 if (symb_table[2 * elem] == hash
2819 /* Compare the length of the name. */
2820 && name_len == extra[symb_table[2 * elem + 1]]
2821 /* Compare the name. */
2822 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2823 name_len) == 0)
2824 {
2825 /* Yep, this is the entry. */
2826 break;
2827 }
2828
2829 /* Next entry. */
2830 elem += second;
2831 }
2832 while (symb_table[2 * elem] != 0);
2833 }
2834 return elem;
2835 }
2836
2837 /* Local function for parse_bracket_exp used in _LIBC environment.
2838 Look up the collation sequence value of BR_ELEM.
2839 Return the value if succeeded, UINT_MAX otherwise. */
2840
2841 auto inline unsigned int
2842 __attribute ((always_inline))
2843 lookup_collation_sequence_value (br_elem)
2844 bracket_elem_t *br_elem;
2845 {
2846 if (br_elem->type == SB_CHAR)
2847 {
2848 /*
2849 if (MB_CUR_MAX == 1)
2850 */
2851 if (nrules == 0)
2852 return collseqmb[br_elem->opr.ch];
2853 else
2854 {
2855 wint_t wc = __btowc (br_elem->opr.ch);
2856 return __collseq_table_lookup (collseqwc, wc);
2857 }
2858 }
2859 else if (br_elem->type == MB_CHAR)
2860 {
2861 if (nrules != 0)
2862 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2863 }
2864 else if (br_elem->type == COLL_SYM)
2865 {
2866 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2867 if (nrules != 0)
2868 {
2869 int32_t elem, idx;
2870 elem = seek_collating_symbol_entry (br_elem->opr.name,
2871 sym_name_len);
2872 if (symb_table[2 * elem] != 0)
2873 {
2874 /* We found the entry. */
2875 idx = symb_table[2 * elem + 1];
2876 /* Skip the name of collating element name. */
2877 idx += 1 + extra[idx];
2878 /* Skip the byte sequence of the collating element. */
2879 idx += 1 + extra[idx];
2880 /* Adjust for the alignment. */
2881 idx = (idx + 3) & ~3;
2882 /* Skip the multibyte collation sequence value. */
2883 idx += sizeof (unsigned int);
2884 /* Skip the wide char sequence of the collating element. */
2885 idx += sizeof (unsigned int) *
2886 (1 + *(unsigned int *) (extra + idx));
2887 /* Return the collation sequence value. */
2888 return *(unsigned int *) (extra + idx);
2889 }
2890 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2891 {
2892 /* No valid character. Match it as a single byte
2893 character. */
2894 return collseqmb[br_elem->opr.name[0]];
2895 }
2896 }
2897 else if (sym_name_len == 1)
2898 return collseqmb[br_elem->opr.name[0]];
2899 }
2900 return UINT_MAX;
2901 }
2902
2903 /* Local function for parse_bracket_exp used in _LIBC environement.
2904 Build the range expression which starts from START_ELEM, and ends
2905 at END_ELEM. The result are written to MBCSET and SBCSET.
2906 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2907 mbcset->range_ends, is a pointer argument sinse we may
2908 update it. */
2909
2910 auto inline reg_errcode_t
2911 __attribute ((always_inline))
2912 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2913 re_charset_t *mbcset;
2914 int *range_alloc;
2915 bitset_t sbcset;
2916 bracket_elem_t *start_elem, *end_elem;
2917 {
2918 unsigned int ch;
2919 uint32_t start_collseq;
2920 uint32_t end_collseq;
2921
2922 /* Equivalence Classes and Character Classes can't be a range
2923 start/end. */
2924 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2925 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2926 0))
2927 return REG_ERANGE;
2928
2929 start_collseq = lookup_collation_sequence_value (start_elem);
2930 end_collseq = lookup_collation_sequence_value (end_elem);
2931 /* Check start/end collation sequence values. */
2932 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2933 return REG_ECOLLATE;
2934 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2935 return REG_ERANGE;
2936
2937 /* Got valid collation sequence values, add them as a new entry.
2938 However, if we have no collation elements, and the character set
2939 is single byte, the single byte character set that we
2940 build below suffices. */
2941 if (nrules > 0 || dfa->mb_cur_max > 1)
2942 {
2943 /* Check the space of the arrays. */
2944 if (BE (*range_alloc == mbcset->nranges, 0))
2945 {
2946 /* There is not enough space, need realloc. */
2947 uint32_t *new_array_start;
2948 uint32_t *new_array_end;
2949 int new_nranges;
2950
2951 /* +1 in case of mbcset->nranges is 0. */
2952 new_nranges = 2 * mbcset->nranges + 1;
2953 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2954 new_nranges);
2955 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2956 new_nranges);
2957
2958 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2959 return REG_ESPACE;
2960
2961 mbcset->range_starts = new_array_start;
2962 mbcset->range_ends = new_array_end;
2963 *range_alloc = new_nranges;
2964 }
2965
2966 mbcset->range_starts[mbcset->nranges] = start_collseq;
2967 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2968 }
2969
2970 /* Build the table for single byte characters. */
2971 for (ch = 0; ch < SBC_MAX; ch++)
2972 {
2973 uint32_t ch_collseq;
2974 /*
2975 if (MB_CUR_MAX == 1)
2976 */
2977 if (nrules == 0)
2978 ch_collseq = collseqmb[ch];
2979 else
2980 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2981 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2982 bitset_set (sbcset, ch);
2983 }
2984 return REG_NOERROR;
2985 }
2986
2987 /* Local function for parse_bracket_exp used in _LIBC environement.
2988 Build the collating element which is represented by NAME.
2989 The result are written to MBCSET and SBCSET.
2990 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2991 pointer argument sinse we may update it. */
2992
2993 auto inline reg_errcode_t
2994 __attribute ((always_inline))
2995 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2996 re_charset_t *mbcset;
2997 int *coll_sym_alloc;
2998 bitset_t sbcset;
2999 const unsigned char *name;
3000 {
3001 int32_t elem, idx;
3002 size_t name_len = strlen ((const char *) name);
3003 if (nrules != 0)
3004 {
3005 elem = seek_collating_symbol_entry (name, name_len);
3006 if (symb_table[2 * elem] != 0)
3007 {
3008 /* We found the entry. */
3009 idx = symb_table[2 * elem + 1];
3010 /* Skip the name of collating element name. */
3011 idx += 1 + extra[idx];
3012 }
3013 else if (symb_table[2 * elem] == 0 && name_len == 1)
3014 {
3015 /* No valid character, treat it as a normal
3016 character. */
3017 bitset_set (sbcset, name[0]);
3018 return REG_NOERROR;
3019 }
3020 else
3021 return REG_ECOLLATE;
3022
3023 /* Got valid collation sequence, add it as a new entry. */
3024 /* Check the space of the arrays. */
3025 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3026 {
3027 /* Not enough, realloc it. */
3028 /* +1 in case of mbcset->ncoll_syms is 0. */
3029 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3030 /* Use realloc since mbcset->coll_syms is NULL
3031 if *alloc == 0. */
3032 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3033 new_coll_sym_alloc);
3034 if (BE (new_coll_syms == NULL, 0))
3035 return REG_ESPACE;
3036 mbcset->coll_syms = new_coll_syms;
3037 *coll_sym_alloc = new_coll_sym_alloc;
3038 }
3039 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3040 return REG_NOERROR;
3041 }
3042 else
3043 {
3044 if (BE (name_len != 1, 0))
3045 return REG_ECOLLATE;
3046 else
3047 {
3048 bitset_set (sbcset, name[0]);
3049 return REG_NOERROR;
3050 }
3051 }
3052 }
3053#endif
3054
3055 re_token_t br_token;
3056 re_bitset_ptr_t sbcset;
3057#ifdef RE_ENABLE_I18N
3058 re_charset_t *mbcset;
3059 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3060 int equiv_class_alloc = 0, char_class_alloc = 0;
3061#endif /* not RE_ENABLE_I18N */
3062 int non_match = 0;
3063 bin_tree_t *work_tree;
3064 int token_len;
3065 int first_round = 1;
3066#ifdef _LIBC
3067 collseqmb = (const unsigned char *)
3068 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3069 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3070 if (nrules)
3071 {
3072 /*
3073 if (MB_CUR_MAX > 1)
3074 */
3075 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3076 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3077 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3078 _NL_COLLATE_SYMB_TABLEMB);
3079 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3080 _NL_COLLATE_SYMB_EXTRAMB);
3081 }
3082#endif
3083 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3084#ifdef RE_ENABLE_I18N
3085 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3086#endif /* RE_ENABLE_I18N */
3087#ifdef RE_ENABLE_I18N
3088 if (BE (sbcset == NULL || mbcset == NULL, 0))
3089#else
3090 if (BE (sbcset == NULL, 0))
3091#endif /* RE_ENABLE_I18N */
3092 {
3093 *err = REG_ESPACE;
3094 return NULL;
3095 }
3096
3097 token_len = peek_token_bracket (token, regexp, syntax);
3098 if (BE (token->type == END_OF_RE, 0))
3099 {
3100 *err = REG_BADPAT;
3101 goto parse_bracket_exp_free_return;
3102 }
3103 if (token->type == OP_NON_MATCH_LIST)
3104 {
3105#ifdef RE_ENABLE_I18N
3106 mbcset->non_match = 1;
3107#endif /* not RE_ENABLE_I18N */
3108 non_match = 1;
3109 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3110 bitset_set (sbcset, '\n');
3111 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3112 token_len = peek_token_bracket (token, regexp, syntax);
3113 if (BE (token->type == END_OF_RE, 0))
3114 {
3115 *err = REG_BADPAT;
3116 goto parse_bracket_exp_free_return;
3117 }
3118 }
3119
3120 /* We treat the first ']' as a normal character. */
3121 if (token->type == OP_CLOSE_BRACKET)
3122 token->type = CHARACTER;
3123
3124 while (1)
3125 {
3126 bracket_elem_t start_elem, end_elem;
3127 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3128 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3129 reg_errcode_t ret;
3130 int token_len2 = 0, is_range_exp = 0;
3131 re_token_t token2;
3132
3133 start_elem.opr.name = start_name_buf;
3134 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3135 syntax, first_round);
3136 if (BE (ret != REG_NOERROR, 0))
3137 {
3138 *err = ret;
3139 goto parse_bracket_exp_free_return;
3140 }
3141 first_round = 0;
3142
3143 /* Get information about the next token. We need it in any case. */
3144 token_len = peek_token_bracket (token, regexp, syntax);
3145
3146 /* Do not check for ranges if we know they are not allowed. */
3147 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3148 {
3149 if (BE (token->type == END_OF_RE, 0))
3150 {
3151 *err = REG_EBRACK;
3152 goto parse_bracket_exp_free_return;
3153 }
3154 if (token->type == OP_CHARSET_RANGE)
3155 {
3156 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3157 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3158 if (BE (token2.type == END_OF_RE, 0))
3159 {
3160 *err = REG_EBRACK;
3161 goto parse_bracket_exp_free_return;
3162 }
3163 if (token2.type == OP_CLOSE_BRACKET)
3164 {
3165 /* We treat the last '-' as a normal character. */
3166 re_string_skip_bytes (regexp, -token_len);
3167 token->type = CHARACTER;
3168 }
3169 else
3170 is_range_exp = 1;
3171 }
3172 }
3173
3174 if (is_range_exp == 1)
3175 {
3176 end_elem.opr.name = end_name_buf;
3177 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3178 dfa, syntax, 1);
3179 if (BE (ret != REG_NOERROR, 0))
3180 {
3181 *err = ret;
3182 goto parse_bracket_exp_free_return;
3183 }
3184
3185 token_len = peek_token_bracket (token, regexp, syntax);
3186
3187#ifdef _LIBC
3188 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3189 &start_elem, &end_elem);
3190#else
3191# ifdef RE_ENABLE_I18N
3192 *err = build_range_exp (sbcset,
3193 dfa->mb_cur_max > 1 ? mbcset : NULL,
3194 &range_alloc, &start_elem, &end_elem);
3195# else
3196 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3197# endif
3198#endif /* RE_ENABLE_I18N */
3199 if (BE (*err != REG_NOERROR, 0))
3200 goto parse_bracket_exp_free_return;
3201 }
3202 else
3203 {
3204 switch (start_elem.type)
3205 {
3206 case SB_CHAR:
3207 bitset_set (sbcset, start_elem.opr.ch);
3208 break;
3209#ifdef RE_ENABLE_I18N
3210 case MB_CHAR:
3211 /* Check whether the array has enough space. */
3212 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3213 {
3214 wchar_t *new_mbchars;
3215 /* Not enough, realloc it. */
3216 /* +1 in case of mbcset->nmbchars is 0. */
3217 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3218 /* Use realloc since array is NULL if *alloc == 0. */
3219 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3220 mbchar_alloc);
3221 if (BE (new_mbchars == NULL, 0))
3222 goto parse_bracket_exp_espace;
3223 mbcset->mbchars = new_mbchars;
3224 }
3225 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3226 break;
3227#endif /* RE_ENABLE_I18N */
3228 case EQUIV_CLASS:
3229 *err = build_equiv_class (sbcset,
3230#ifdef RE_ENABLE_I18N
3231 mbcset, &equiv_class_alloc,
3232#endif /* RE_ENABLE_I18N */
3233 start_elem.opr.name);
3234 if (BE (*err != REG_NOERROR, 0))
3235 goto parse_bracket_exp_free_return;
3236 break;
3237 case COLL_SYM:
3238 *err = build_collating_symbol (sbcset,
3239#ifdef RE_ENABLE_I18N
3240 mbcset, &coll_sym_alloc,
3241#endif /* RE_ENABLE_I18N */
3242 start_elem.opr.name);
3243 if (BE (*err != REG_NOERROR, 0))
3244 goto parse_bracket_exp_free_return;
3245 break;
3246 case CHAR_CLASS:
3247 *err = build_charclass (regexp->trans, sbcset,
3248#ifdef RE_ENABLE_I18N
3249 mbcset, &char_class_alloc,
3250#endif /* RE_ENABLE_I18N */
3251 (const char *) start_elem.opr.name, syntax);
3252 if (BE (*err != REG_NOERROR, 0))
3253 goto parse_bracket_exp_free_return;
3254 break;
3255 default:
3256 assert (0);
3257 break;
3258 }
3259 }
3260 if (BE (token->type == END_OF_RE, 0))
3261 {
3262 *err = REG_EBRACK;
3263 goto parse_bracket_exp_free_return;
3264 }
3265 if (token->type == OP_CLOSE_BRACKET)
3266 break;
3267 }
3268
3269 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3270
3271 /* If it is non-matching list. */
3272 if (non_match)
3273 bitset_not (sbcset);
3274
3275#ifdef RE_ENABLE_I18N
3276 /* Ensure only single byte characters are set. */
3277 if (dfa->mb_cur_max > 1)
3278 bitset_mask (sbcset, dfa->sb_char);
3279
3280 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3281 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3282 || mbcset->non_match)))
3283 {
3284 bin_tree_t *mbc_tree;
3285 int sbc_idx;
3286 /* Build a tree for complex bracket. */
3287 dfa->has_mb_node = 1;
3288 br_token.type = COMPLEX_BRACKET;
3289 br_token.opr.mbcset = mbcset;
3290 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3291 if (BE (mbc_tree == NULL, 0))
3292 goto parse_bracket_exp_espace;
3293 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3294 if (sbcset[sbc_idx])
3295 break;
3296 /* If there are no bits set in sbcset, there is no point
3297 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3298 if (sbc_idx < BITSET_WORDS)
3299 {
3300 /* Build a tree for simple bracket. */
3301 br_token.type = SIMPLE_BRACKET;
3302 br_token.opr.sbcset = sbcset;
3303 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3304 if (BE (work_tree == NULL, 0))
3305 goto parse_bracket_exp_espace;
3306
3307 /* Then join them by ALT node. */
3308 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3309 if (BE (work_tree == NULL, 0))
3310 goto parse_bracket_exp_espace;
3311 }
3312 else
3313 {
3314 re_free (sbcset);
3315 work_tree = mbc_tree;
3316 }
3317 }
3318 else
3319#endif /* not RE_ENABLE_I18N */
3320 {
3321#ifdef RE_ENABLE_I18N
3322 free_charset (mbcset);
3323#endif
3324 /* Build a tree for simple bracket. */
3325 br_token.type = SIMPLE_BRACKET;
3326 br_token.opr.sbcset = sbcset;
3327 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3328 if (BE (work_tree == NULL, 0))
3329 goto parse_bracket_exp_espace;
3330 }
3331 return work_tree;
3332
3333 parse_bracket_exp_espace:
3334 *err = REG_ESPACE;
3335 parse_bracket_exp_free_return:
3336 re_free (sbcset);
3337#ifdef RE_ENABLE_I18N
3338 free_charset (mbcset);
3339#endif /* RE_ENABLE_I18N */
3340 return NULL;
3341}
3342
3343/* Parse an element in the bracket expression. */
3344
3345static reg_errcode_t
3346parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3347 re_token_t *token, int token_len, re_dfa_t *dfa,
3348 reg_syntax_t syntax, int accept_hyphen)
3349{
3350#ifdef RE_ENABLE_I18N
3351 int cur_char_size;
3352 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3353 if (cur_char_size > 1)
3354 {
3355 elem->type = MB_CHAR;
3356 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3357 re_string_skip_bytes (regexp, cur_char_size);
3358 return REG_NOERROR;
3359 }
3360#endif /* RE_ENABLE_I18N */
3361 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3362 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3363 || token->type == OP_OPEN_EQUIV_CLASS)
3364 return parse_bracket_symbol (elem, regexp, token);
3365 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3366 {
3367 /* A '-' must only appear as anything but a range indicator before
3368 the closing bracket. Everything else is an error. */
3369 re_token_t token2;
3370 (void) peek_token_bracket (&token2, regexp, syntax);
3371 if (token2.type != OP_CLOSE_BRACKET)
3372 /* The actual error value is not standardized since this whole
3373 case is undefined. But ERANGE makes good sense. */
3374 return REG_ERANGE;
3375 }
3376 elem->type = SB_CHAR;
3377 elem->opr.ch = token->opr.c;
3378 return REG_NOERROR;
3379}
3380
3381/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3382 such as [:<character_class>:], [.<collating_element>.], and
3383 [=<equivalent_class>=]. */
3384
3385static reg_errcode_t
3386parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3387 re_token_t *token)
3388{
3389 unsigned char ch, delim = token->opr.c;
3390 int i = 0;
3391 if (re_string_eoi(regexp))
3392 return REG_EBRACK;
3393 for (;; ++i)
3394 {
3395 if (i >= BRACKET_NAME_BUF_SIZE)
3396 return REG_EBRACK;
3397 if (token->type == OP_OPEN_CHAR_CLASS)
3398 ch = re_string_fetch_byte_case (regexp);
3399 else
3400 ch = re_string_fetch_byte (regexp);
3401 if (re_string_eoi(regexp))
3402 return REG_EBRACK;
3403 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3404 break;
3405 elem->opr.name[i] = ch;
3406 }
3407 re_string_skip_bytes (regexp, 1);
3408 elem->opr.name[i] = '\0';
3409 switch (token->type)
3410 {
3411 case OP_OPEN_COLL_ELEM:
3412 elem->type = COLL_SYM;
3413 break;
3414 case OP_OPEN_EQUIV_CLASS:
3415 elem->type = EQUIV_CLASS;
3416 break;
3417 case OP_OPEN_CHAR_CLASS:
3418 elem->type = CHAR_CLASS;
3419 break;
3420 default:
3421 break;
3422 }
3423 return REG_NOERROR;
3424}
3425
3426 /* Helper function for parse_bracket_exp.
3427 Build the equivalence class which is represented by NAME.
3428 The result are written to MBCSET and SBCSET.
3429 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3430 is a pointer argument sinse we may update it. */
3431
3432static reg_errcode_t
3433#ifdef RE_ENABLE_I18N
3434build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3435 int *equiv_class_alloc, const unsigned char *name)
3436#else /* not RE_ENABLE_I18N */
3437build_equiv_class (bitset_t sbcset, const unsigned char *name)
3438#endif /* not RE_ENABLE_I18N */
3439{
3440#ifdef _LIBC
3441 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3442 if (nrules != 0)
3443 {
3444 const int32_t *table, *indirect;
3445 const unsigned char *weights, *extra, *cp;
3446 unsigned char char_buf[2];
3447 int32_t idx1, idx2;
3448 unsigned int ch;
3449 size_t len;
3450 /* This #include defines a local function! */
3451# include <locale/weight.h>
3452 /* Calculate the index for equivalence class. */
3453 cp = name;
3454 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3455 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3456 _NL_COLLATE_WEIGHTMB);
3457 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3458 _NL_COLLATE_EXTRAMB);
3459 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3460 _NL_COLLATE_INDIRECTMB);
3461 idx1 = findidx (&cp);
3462 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3463 /* This isn't a valid character. */
3464 return REG_ECOLLATE;
3465
3466 /* Build single byte matcing table for this equivalence class. */
3467 char_buf[1] = (unsigned char) '\0';
3468 len = weights[idx1 & 0xffffff];
3469 for (ch = 0; ch < SBC_MAX; ++ch)
3470 {
3471 char_buf[0] = ch;
3472 cp = char_buf;
3473 idx2 = findidx (&cp);
3474/*
3475 idx2 = table[ch];
3476*/
3477 if (idx2 == 0)
3478 /* This isn't a valid character. */
3479 continue;
3480 /* Compare only if the length matches and the collation rule
3481 index is the same. */
3482 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3483 {
3484 int cnt = 0;
3485
3486 while (cnt <= len &&
3487 weights[(idx1 & 0xffffff) + 1 + cnt]
3488 == weights[(idx2 & 0xffffff) + 1 + cnt])
3489 ++cnt;
3490
3491 if (cnt > len)
3492 bitset_set (sbcset, ch);
3493 }
3494 }
3495 /* Check whether the array has enough space. */
3496 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3497 {
3498 /* Not enough, realloc it. */
3499 /* +1 in case of mbcset->nequiv_classes is 0. */
3500 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3501 /* Use realloc since the array is NULL if *alloc == 0. */
3502 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3503 int32_t,
3504 new_equiv_class_alloc);
3505 if (BE (new_equiv_classes == NULL, 0))
3506 return REG_ESPACE;
3507 mbcset->equiv_classes = new_equiv_classes;
3508 *equiv_class_alloc = new_equiv_class_alloc;
3509 }
3510 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3511 }
3512 else
3513#endif /* _LIBC */
3514 {
3515 if (BE (strlen ((const char *) name) != 1, 0))
3516 return REG_ECOLLATE;
3517 bitset_set (sbcset, *name);
3518 }
3519 return REG_NOERROR;
3520}
3521
3522 /* Helper function for parse_bracket_exp.
3523 Build the character class which is represented by NAME.
3524 The result are written to MBCSET and SBCSET.
3525 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3526 is a pointer argument sinse we may update it. */
3527
3528static reg_errcode_t
3529#ifdef RE_ENABLE_I18N
3530build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3531 re_charset_t *mbcset, int *char_class_alloc,
3532 const char *class_name, reg_syntax_t syntax)
3533#else /* not RE_ENABLE_I18N */
3534build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3535 const char *class_name, reg_syntax_t syntax)
3536#endif /* not RE_ENABLE_I18N */
3537{
3538 int i;
3539
3540 /* In case of REG_ICASE "upper" and "lower" match the both of
3541 upper and lower cases. */
3542 if ((syntax & RE_ICASE)
3543 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3544 class_name = "alpha";
3545
3546#ifdef RE_ENABLE_I18N
3547 /* Check the space of the arrays. */
3548 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3549 {
3550 /* Not enough, realloc it. */
3551 /* +1 in case of mbcset->nchar_classes is 0. */
3552 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3553 /* Use realloc since array is NULL if *alloc == 0. */
3554 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3555 new_char_class_alloc);
3556 if (BE (new_char_classes == NULL, 0))
3557 return REG_ESPACE;
3558 mbcset->char_classes = new_char_classes;
3559 *char_class_alloc = new_char_class_alloc;
3560 }
3561 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3562#endif /* RE_ENABLE_I18N */
3563
3564#define BUILD_CHARCLASS_LOOP(ctype_func) \
3565 do { \
3566 if (BE (trans != NULL, 0)) \
3567 { \
3568 for (i = 0; i < SBC_MAX; ++i) \
3569 if (ctype_func (i)) \
3570 bitset_set (sbcset, trans[i]); \
3571 } \
3572 else \
3573 { \
3574 for (i = 0; i < SBC_MAX; ++i) \
3575 if (ctype_func (i)) \
3576 bitset_set (sbcset, i); \
3577 } \
3578 } while (0)
3579
3580 if (strcmp (class_name, "alnum") == 0)
3581 BUILD_CHARCLASS_LOOP (isalnum);
3582 else if (strcmp (class_name, "cntrl") == 0)
3583 BUILD_CHARCLASS_LOOP (iscntrl);
3584 else if (strcmp (class_name, "lower") == 0)
3585 BUILD_CHARCLASS_LOOP (islower);
3586 else if (strcmp (class_name, "space") == 0)
3587 BUILD_CHARCLASS_LOOP (isspace);
3588 else if (strcmp (class_name, "alpha") == 0)
3589 BUILD_CHARCLASS_LOOP (isalpha);
3590 else if (strcmp (class_name, "digit") == 0)
3591 BUILD_CHARCLASS_LOOP (isdigit);
3592 else if (strcmp (class_name, "print") == 0)
3593 BUILD_CHARCLASS_LOOP (isprint);
3594 else if (strcmp (class_name, "upper") == 0)
3595 BUILD_CHARCLASS_LOOP (isupper);
3596 else if (strcmp (class_name, "blank") == 0)
3597#ifndef GAWK
3598 BUILD_CHARCLASS_LOOP (isblank);
3599#else
3600 /* see comments above */
3601 BUILD_CHARCLASS_LOOP (is_blank);
3602#endif
3603 else if (strcmp (class_name, "graph") == 0)
3604 BUILD_CHARCLASS_LOOP (isgraph);
3605 else if (strcmp (class_name, "punct") == 0)
3606 BUILD_CHARCLASS_LOOP (ispunct);
3607 else if (strcmp (class_name, "xdigit") == 0)
3608 BUILD_CHARCLASS_LOOP (isxdigit);
3609 else
3610 return REG_ECTYPE;
3611
3612 return REG_NOERROR;
3613}
3614
3615static bin_tree_t *
3616build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3617 const char *class_name,
3618 const char *extra, int non_match,
3619 reg_errcode_t *err)
3620{
3621 re_bitset_ptr_t sbcset;
3622#ifdef RE_ENABLE_I18N
3623 re_charset_t *mbcset;
3624 int alloc = 0;
3625#endif /* not RE_ENABLE_I18N */
3626 reg_errcode_t ret;
3627 re_token_t br_token;
3628 bin_tree_t *tree;
3629
3630 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3631#ifdef RE_ENABLE_I18N
3632 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3633#endif /* RE_ENABLE_I18N */
3634
3635#ifdef RE_ENABLE_I18N
3636 if (BE (sbcset == NULL || mbcset == NULL, 0))
3637#else /* not RE_ENABLE_I18N */
3638 if (BE (sbcset == NULL, 0))
3639#endif /* not RE_ENABLE_I18N */
3640 {
3641 *err = REG_ESPACE;
3642 return NULL;
3643 }
3644
3645 if (non_match)
3646 {
3647#ifdef RE_ENABLE_I18N
3648 mbcset->non_match = 1;
3649#endif /* not RE_ENABLE_I18N */
3650 }
3651
3652 /* We don't care the syntax in this case. */
3653 ret = build_charclass (trans, sbcset,
3654#ifdef RE_ENABLE_I18N
3655 mbcset, &alloc,
3656#endif /* RE_ENABLE_I18N */
3657 class_name, 0);
3658
3659 if (BE (ret != REG_NOERROR, 0))
3660 {
3661 re_free (sbcset);
3662#ifdef RE_ENABLE_I18N
3663 free_charset (mbcset);
3664#endif /* RE_ENABLE_I18N */
3665 *err = ret;
3666 return NULL;
3667 }
3668 /* \w match '_' also. */
3669 for (; *extra; extra++)
3670 bitset_set (sbcset, *extra);
3671
3672 /* If it is non-matching list. */
3673 if (non_match)
3674 bitset_not (sbcset);
3675
3676#ifdef RE_ENABLE_I18N
3677 /* Ensure only single byte characters are set. */
3678 if (dfa->mb_cur_max > 1)
3679 bitset_mask (sbcset, dfa->sb_char);
3680#endif
3681
3682 /* Build a tree for simple bracket. */
3683 br_token.type = SIMPLE_BRACKET;
3684 br_token.opr.sbcset = sbcset;
3685 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3686 if (BE (tree == NULL, 0))
3687 goto build_word_op_espace;
3688
3689#ifdef RE_ENABLE_I18N
3690 if (dfa->mb_cur_max > 1)
3691 {
3692 bin_tree_t *mbc_tree;
3693 /* Build a tree for complex bracket. */
3694 br_token.type = COMPLEX_BRACKET;
3695 br_token.opr.mbcset = mbcset;
3696 dfa->has_mb_node = 1;
3697 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3698 if (BE (mbc_tree == NULL, 0))
3699 goto build_word_op_espace;
3700 /* Then join them by ALT node. */
3701 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3702 if (BE (mbc_tree != NULL, 1))
3703 return tree;
3704 }
3705 else
3706 {
3707 free_charset (mbcset);
3708 return tree;
3709 }
3710#else /* not RE_ENABLE_I18N */
3711 return tree;
3712#endif /* not RE_ENABLE_I18N */
3713
3714 build_word_op_espace:
3715 re_free (sbcset);
3716#ifdef RE_ENABLE_I18N
3717 free_charset (mbcset);
3718#endif /* RE_ENABLE_I18N */
3719 *err = REG_ESPACE;
3720 return NULL;
3721}
3722
3723/* This is intended for the expressions like "a{1,3}".
3724 Fetch a number from `input', and return the number.
3725 Return -1, if the number field is empty like "{,1}".
3726 Return -2, If an error is occured. */
3727
3728static int
3729fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3730{
3731 int num = -1;
3732 unsigned char c;
3733 while (1)
3734 {
3735 fetch_token (token, input, syntax);
3736 c = token->opr.c;
3737 if (BE (token->type == END_OF_RE, 0))
3738 return -2;
3739 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3740 break;
3741 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3742 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3743 num = (num > RE_DUP_MAX) ? -2 : num;
3744 }
3745 return num;
3746}
3747\f
3748#ifdef RE_ENABLE_I18N
3749static void
3750free_charset (re_charset_t *cset)
3751{
3752 re_free (cset->mbchars);
3753# ifdef _LIBC
3754 re_free (cset->coll_syms);
3755 re_free (cset->equiv_classes);
3756 re_free (cset->range_starts);
3757 re_free (cset->range_ends);
3758# endif
3759 re_free (cset->char_classes);
3760 re_free (cset);
3761}
3762#endif /* RE_ENABLE_I18N */
3763\f
3764/* Functions for binary tree operation. */
3765
3766/* Create a tree node. */
3767
3768static bin_tree_t *
3769create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3770 re_token_type_t type)
3771{
3772 re_token_t t;
3773 t.type = type;
3774 return create_token_tree (dfa, left, right, &t);
3775}
3776
3777static bin_tree_t *
3778create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3779 const re_token_t *token)
3780{
3781 bin_tree_t *tree;
3782 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3783 {
3784 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3785
3786 if (storage == NULL)
3787 return NULL;
3788 storage->next = dfa->str_tree_storage;
3789 dfa->str_tree_storage = storage;
3790 dfa->str_tree_storage_idx = 0;
3791 }
3792 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3793
3794 tree->parent = NULL;
3795 tree->left = left;
3796 tree->right = right;
3797 tree->token = *token;
3798 tree->token.duplicated = 0;
3799 tree->token.opt_subexp = 0;
3800 tree->first = NULL;
3801 tree->next = NULL;
3802 tree->node_idx = -1;
3803
3804 if (left != NULL)
3805 left->parent = tree;
3806 if (right != NULL)
3807 right->parent = tree;
3808 return tree;
3809}
3810
3811/* Mark the tree SRC as an optional subexpression.
3812 To be called from preorder or postorder. */
3813
3814static reg_errcode_t
3815mark_opt_subexp (void *extra, bin_tree_t *node)
3816{
3817 int idx = (int) (long) extra;
3818 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3819 node->token.opt_subexp = 1;
3820
3821 return REG_NOERROR;
3822}
3823
3824/* Free the allocated memory inside NODE. */
3825
3826static void
3827free_token (re_token_t *node)
3828{
3829#ifdef RE_ENABLE_I18N
3830 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3831 free_charset (node->opr.mbcset);
3832 else
3833#endif /* RE_ENABLE_I18N */
3834 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3835 re_free (node->opr.sbcset);
3836}
3837
3838/* Worker function for tree walking. Free the allocated memory inside NODE
3839 and its children. */
3840
3841static reg_errcode_t
3842free_tree (void *extra, bin_tree_t *node)
3843{
3844 free_token (&node->token);
3845 return REG_NOERROR;
3846}
3847
3848
3849/* Duplicate the node SRC, and return new node. This is a preorder
3850 visit similar to the one implemented by the generic visitor, but
3851 we need more infrastructure to maintain two parallel trees --- so,
3852 it's easier to duplicate. */
3853
3854static bin_tree_t *
3855duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3856{
3857 const bin_tree_t *node;
3858 bin_tree_t *dup_root;
3859 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3860
3861 for (node = root; ; )
3862 {
3863 /* Create a new tree and link it back to the current parent. */
3864 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3865 if (*p_new == NULL)
3866 return NULL;
3867 (*p_new)->parent = dup_node;
3868 (*p_new)->token.duplicated = 1;
3869 dup_node = *p_new;
3870
3871 /* Go to the left node, or up and to the right. */
3872 if (node->left)
3873 {
3874 node = node->left;
3875 p_new = &dup_node->left;
3876 }
3877 else
3878 {
3879 const bin_tree_t *prev = NULL;
3880 while (node->right == prev || node->right == NULL)
3881 {
3882 prev = node;
3883 node = node->parent;
3884 dup_node = dup_node->parent;
3885 if (!node)
3886 return dup_root;
3887 }
3888 node = node->right;
3889 p_new = &dup_node->right;
3890 }
3891 }
3892}