Andrew's git - gitweb.git/blob - compat/regex/regcomp.c

   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                          size_t length, reg_syntax_t syntax);
  23static void re_compile_fastmap_iter (regex_t *bufp,
  24                                     const re_dfastate_t *init_state,
  25                                     char *fastmap);
  26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27#ifdef RE_ENABLE_I18N
  28static void free_charset (re_charset_t *cset);
  29#endif /* RE_ENABLE_I18N */
  30static void free_workarea_compile (regex_t *preg);
  31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32#ifdef RE_ENABLE_I18N
  33static void optimize_utf8 (re_dfa_t *dfa);
  34#endif
  35static reg_errcode_t analyze (regex_t *preg);
  36static reg_errcode_t preorder (bin_tree_t *root,
  37                               reg_errcode_t (fn (void *, bin_tree_t *)),
  38                               void *extra);
  39static reg_errcode_t postorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                 bin_tree_t *node);
  46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                   unsigned int constraint);
  52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                         int node, int root);
  55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56static int fetch_number (re_string_t *input, re_token_t *token,
  57                         reg_syntax_t syntax);
  58static int peek_token (re_token_t *token, re_string_t *input,
  59                        reg_syntax_t syntax) internal_function;
  60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                          reg_syntax_t syntax, reg_errcode_t *err);
  62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                  re_token_t *token, reg_syntax_t syntax,
  64                                  int nest, reg_errcode_t *err);
  65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                 re_token_t *token, reg_syntax_t syntax,
  67                                 int nest, reg_errcode_t *err);
  68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                     re_token_t *token, reg_syntax_t syntax,
  70                                     int nest, reg_errcode_t *err);
  71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                  re_token_t *token, reg_syntax_t syntax,
  73                                  int nest, reg_errcode_t *err);
  74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                 re_dfa_t *dfa, re_token_t *token,
  76                                 reg_syntax_t syntax, reg_errcode_t *err);
  77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                      re_token_t *token, reg_syntax_t syntax,
  79                                      reg_errcode_t *err);
  80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                            re_string_t *regexp,
  82                                            re_token_t *token, int token_len,
  83                                            re_dfa_t *dfa,
  84                                            reg_syntax_t syntax,
  85                                            int accept_hyphen);
  86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                          re_string_t *regexp,
  88                                          re_token_t *token);
  89#ifdef RE_ENABLE_I18N
  90static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                        re_charset_t *mbcset,
  92                                        int *equiv_class_alloc,
  93                                        const unsigned char *name);
  94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                      bitset_t sbcset,
  96                                      re_charset_t *mbcset,
  97                                      int *char_class_alloc,
  98                                      const char *class_name,
  99                                      reg_syntax_t syntax);
 100#else  /* not RE_ENABLE_I18N */
 101static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                        const unsigned char *name);
 103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                      bitset_t sbcset,
 105                                      const char *class_name,
 106                                      reg_syntax_t syntax);
 107#endif /* not RE_ENABLE_I18N */
 108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                       RE_TRANSLATE_TYPE trans,
 110                                       const char *class_name,
 111                                       const char *extra,
 112                                       int non_match, reg_errcode_t *err);
 113static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                bin_tree_t *left, bin_tree_t *right,
 115                                re_token_type_t type);
 116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                      bin_tree_t *left, bin_tree_t *right,
 118                                      const re_token_t *token);
 119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120static void free_token (re_token_t *node);
 121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123\f
 124/* This table gives an error message for each of the error codes listed
 125   in regex.h.  Obviously the order here has to be same as there.
 126   POSIX doesn't require that we do anything for REG_NOERROR,
 127   but why not be nice?  */
 128
 129const char __re_error_msgid[] attribute_hidden =
 130  {
 131#define REG_NOERROR_IDX 0
 132    gettext_noop ("Success")    /* REG_NOERROR */
 133    "\0"
 134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135    gettext_noop ("No match")   /* REG_NOMATCH */
 136    "\0"
 137#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139    "\0"
 140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142    "\0"
 143#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145    "\0"
 146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148    "\0"
 149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151    "\0"
 152#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153    gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154    "\0"
 155#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157    "\0"
 158#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160    "\0"
 161#define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163    "\0"
 164#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165    gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166    "\0"
 167#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168    gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169    "\0"
 170#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172    "\0"
 173#define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174    gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175    "\0"
 176#define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177    gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178    "\0"
 179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181  };
 182
 183const size_t __re_error_msgid_idx[] attribute_hidden =
 184  {
 185    REG_NOERROR_IDX,
 186    REG_NOMATCH_IDX,
 187    REG_BADPAT_IDX,
 188    REG_ECOLLATE_IDX,
 189    REG_ECTYPE_IDX,
 190    REG_EESCAPE_IDX,
 191    REG_ESUBREG_IDX,
 192    REG_EBRACK_IDX,
 193    REG_EPAREN_IDX,
 194    REG_EBRACE_IDX,
 195    REG_BADBR_IDX,
 196    REG_ERANGE_IDX,
 197    REG_ESPACE_IDX,
 198    REG_BADRPT_IDX,
 199    REG_EEND_IDX,
 200    REG_ESIZE_IDX,
 201    REG_ERPAREN_IDX
 202  };
 203\f
 204/* Entry points for GNU code.  */
 205
 206
 207#ifdef ZOS_USS
 208
 209/* For ZOS USS we must define btowc */
 210
 211wchar_t 
 212btowc (int c)
 213{
 214   wchar_t wtmp[2];
 215   char tmp[2];
 216
 217   tmp[0] = c;
 218   tmp[1] = 0;
 219
 220   mbtowc (wtmp, tmp, 1);
 221   return wtmp[0];
 222}
 223#endif
 224
 225/* re_compile_pattern is the GNU regular expression compiler: it
 226   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 227   Returns 0 if the pattern was valid, otherwise an error string.
 228
 229   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 230   are set in BUFP on entry.  */
 231
 232const char *
 233re_compile_pattern (pattern, length, bufp)
 234    const char *pattern;
 235    size_t length;
 236    struct re_pattern_buffer *bufp;
 237{
 238  reg_errcode_t ret;
 239
 240  /* And GNU code determines whether or not to get register information
 241     by passing null for the REGS argument to re_match, etc., not by
 242     setting no_sub, unless RE_NO_SUB is set.  */
 243  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 244
 245  /* Match anchors at newline.  */
 246  bufp->newline_anchor = 1;
 247
 248  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 249
 250  if (!ret)
 251    return NULL;
 252  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 253}
 254#ifdef _LIBC
 255weak_alias (__re_compile_pattern, re_compile_pattern)
 256#endif
 257
 258/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 259   also be assigned to arbitrarily: each pattern buffer stores its own
 260   syntax, so it can be changed between regex compilations.  */
 261/* This has no initializer because initialized variables in Emacs
 262   become read-only after dumping.  */
 263reg_syntax_t re_syntax_options;
 264
 265
 266/* Specify the precise syntax of regexps for compilation.  This provides
 267   for compatibility for various utilities which historically have
 268   different, incompatible syntaxes.
 269
 270   The argument SYNTAX is a bit mask comprised of the various bits
 271   defined in regex.h.  We return the old syntax.  */
 272
 273reg_syntax_t
 274re_set_syntax (syntax)
 275    reg_syntax_t syntax;
 276{
 277  reg_syntax_t ret = re_syntax_options;
 278
 279  re_syntax_options = syntax;
 280  return ret;
 281}
 282#ifdef _LIBC
 283weak_alias (__re_set_syntax, re_set_syntax)
 284#endif
 285
 286int
 287re_compile_fastmap (bufp)
 288    struct re_pattern_buffer *bufp;
 289{
 290  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 291  char *fastmap = bufp->fastmap;
 292
 293  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 294  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 295  if (dfa->init_state != dfa->init_state_word)
 296    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 297  if (dfa->init_state != dfa->init_state_nl)
 298    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 299  if (dfa->init_state != dfa->init_state_begbuf)
 300    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 301  bufp->fastmap_accurate = 1;
 302  return 0;
 303}
 304#ifdef _LIBC
 305weak_alias (__re_compile_fastmap, re_compile_fastmap)
 306#endif
 307
 308static inline void
 309__attribute ((always_inline))
 310re_set_fastmap (char *fastmap, int icase, int ch)
 311{
 312  fastmap[ch] = 1;
 313  if (icase)
 314    fastmap[tolower (ch)] = 1;
 315}
 316
 317/* Helper function for re_compile_fastmap.
 318   Compile fastmap for the initial_state INIT_STATE.  */
 319
 320static void
 321re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 322                         char *fastmap)
 323{
 324  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 325  int node_cnt;
 326  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 327  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 328    {
 329      int node = init_state->nodes.elems[node_cnt];
 330      re_token_type_t type = dfa->nodes[node].type;
 331
 332      if (type == CHARACTER)
 333        {
 334          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 335#ifdef RE_ENABLE_I18N
 336          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 337            {
 338              unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 339              wchar_t wc;
 340              mbstate_t state;
 341
 342              p = buf;
 343              *p++ = dfa->nodes[node].opr.c;
 344              while (++node < dfa->nodes_len
 345                     && dfa->nodes[node].type == CHARACTER
 346                     && dfa->nodes[node].mb_partial)
 347                *p++ = dfa->nodes[node].opr.c;
 348              memset (&state, '\0', sizeof (state));
 349              if (__mbrtowc (&wc, (const char *) buf, p - buf,
 350                             &state) == p - buf
 351                  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 352                      != (size_t) -1))
 353                re_set_fastmap (fastmap, 0, buf[0]);
 354              re_free (buf);
 355            }
 356#endif
 357        }
 358      else if (type == SIMPLE_BRACKET)
 359        {
 360          int i, ch;
 361          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 362            {
 363              int j;
 364              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 365              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 366                if (w & ((bitset_word_t) 1 << j))
 367                  re_set_fastmap (fastmap, icase, ch);
 368            }
 369        }
 370#ifdef RE_ENABLE_I18N
 371      else if (type == COMPLEX_BRACKET)
 372        {
 373          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 374          int i;
 375
 376# ifdef _LIBC
 377          /* See if we have to try all bytes which start multiple collation
 378             elements.
 379             e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 380                  collation element, and don't catch 'b' since 'b' is
 381                  the only collation element which starts from 'b' (and
 382                  it is caught by SIMPLE_BRACKET).  */
 383              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 384                  && (cset->ncoll_syms || cset->nranges))
 385                {
 386                  const int32_t *table = (const int32_t *)
 387                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 388                  for (i = 0; i < SBC_MAX; ++i)
 389                    if (table[i] < 0)
 390                      re_set_fastmap (fastmap, icase, i);
 391                }
 392# endif /* _LIBC */
 393
 394          /* See if we have to start the match at all multibyte characters,
 395             i.e. where we would not find an invalid sequence.  This only
 396             applies to multibyte character sets; for single byte character
 397             sets, the SIMPLE_BRACKET again suffices.  */
 398          if (dfa->mb_cur_max > 1
 399              && (cset->nchar_classes || cset->non_match || cset->nranges
 400# ifdef _LIBC
 401                  || cset->nequiv_classes
 402# endif /* _LIBC */
 403                 ))
 404            {
 405              unsigned char c = 0;
 406              do
 407                {
 408                  mbstate_t mbs;
 409                  memset (&mbs, 0, sizeof (mbs));
 410                  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 411                    re_set_fastmap (fastmap, false, (int) c);
 412                }
 413              while (++c != 0);
 414            }
 415
 416          else
 417            {
 418              /* ... Else catch all bytes which can start the mbchars.  */
 419              for (i = 0; i < cset->nmbchars; ++i)
 420                {
 421                  char buf[256];
 422                  mbstate_t state;
 423                  memset (&state, '\0', sizeof (state));
 424                  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 425                    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 426                  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 427                    {
 428                      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 429                          != (size_t) -1)
 430                        re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 431                    }
 432                }
 433            }
 434        }
 435#endif /* RE_ENABLE_I18N */
 436      else if (type == OP_PERIOD
 437#ifdef RE_ENABLE_I18N
 438               || type == OP_UTF8_PERIOD
 439#endif /* RE_ENABLE_I18N */
 440               || type == END_OF_RE)
 441        {
 442          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 443          if (type == END_OF_RE)
 444            bufp->can_be_null = 1;
 445          return;
 446        }
 447    }
 448}
 449\f
 450/* Entry point for POSIX code.  */
 451/* regcomp takes a regular expression as a string and compiles it.
 452
 453   PREG is a regex_t *.  We do not expect any fields to be initialized,
 454   since POSIX says we shouldn't.  Thus, we set
 455
 456     `buffer' to the compiled pattern;
 457     `used' to the length of the compiled pattern;
 458     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 459       REG_EXTENDED bit in CFLAGS is set; otherwise, to
 460       RE_SYNTAX_POSIX_BASIC;
 461     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 462     `fastmap' to an allocated space for the fastmap;
 463     `fastmap_accurate' to zero;
 464     `re_nsub' to the number of subexpressions in PATTERN.
 465
 466   PATTERN is the address of the pattern string.
 467
 468   CFLAGS is a series of bits which affect compilation.
 469
 470     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 471     use POSIX basic syntax.
 472
 473     If REG_NEWLINE is set, then . and [^...] don't match newline.
 474     Also, regexec will try a match beginning after every newline.
 475
 476     If REG_ICASE is set, then we considers upper- and lowercase
 477     versions of letters to be equivalent when matching.
 478
 479     If REG_NOSUB is set, then when PREG is passed to regexec, that
 480     routine will report only success or failure, and nothing about the
 481     registers.
 482
 483   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 484   the return codes and their meanings.)  */
 485
 486int
 487regcomp (preg, pattern, cflags)
 488    regex_t *__restrict preg;
 489    const char *__restrict pattern;
 490    int cflags;
 491{
 492  reg_errcode_t ret;
 493  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 494                         : RE_SYNTAX_POSIX_BASIC);
 495
 496  preg->buffer = NULL;
 497  preg->allocated = 0;
 498  preg->used = 0;
 499
 500  /* Try to allocate space for the fastmap.  */
 501  preg->fastmap = re_malloc (char, SBC_MAX);
 502  if (BE (preg->fastmap == NULL, 0))
 503    return REG_ESPACE;
 504
 505  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 506
 507  /* If REG_NEWLINE is set, newlines are treated differently.  */
 508  if (cflags & REG_NEWLINE)
 509    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 510      syntax &= ~RE_DOT_NEWLINE;
 511      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 512      /* It also changes the matching behavior.  */
 513      preg->newline_anchor = 1;
 514    }
 515  else
 516    preg->newline_anchor = 0;
 517  preg->no_sub = !!(cflags & REG_NOSUB);
 518  preg->translate = NULL;
 519
 520  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 521
 522  /* POSIX doesn't distinguish between an unmatched open-group and an
 523     unmatched close-group: both are REG_EPAREN.  */
 524  if (ret == REG_ERPAREN)
 525    ret = REG_EPAREN;
 526
 527  /* We have already checked preg->fastmap != NULL.  */
 528  if (BE (ret == REG_NOERROR, 1))
 529    /* Compute the fastmap now, since regexec cannot modify the pattern
 530       buffer.  This function never fails in this implementation.  */
 531    (void) re_compile_fastmap (preg);
 532  else
 533    {
 534      /* Some error occurred while compiling the expression.  */
 535      re_free (preg->fastmap);
 536      preg->fastmap = NULL;
 537    }
 538
 539  return (int) ret;
 540}
 541#ifdef _LIBC
 542weak_alias (__regcomp, regcomp)
 543#endif
 544
 545/* Returns a message corresponding to an error code, ERRCODE, returned
 546   from either regcomp or regexec.   We don't use PREG here.  */
 547
 548size_t
 549regerror(int errcode, const regex_t *__restrict preg,
 550         char *__restrict errbuf, size_t errbuf_size)
 551{
 552  const char *msg;
 553  size_t msg_size;
 554
 555  if (BE (errcode < 0
 556          || errcode >= (int) (sizeof (__re_error_msgid_idx)
 557                               / sizeof (__re_error_msgid_idx[0])), 0))
 558    /* Only error codes returned by the rest of the code should be passed
 559       to this routine.  If we are given anything else, or if other regex
 560       code generates an invalid error code, then the program has a bug.
 561       Dump core so we can fix it.  */
 562    abort ();
 563
 564  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 565
 566  msg_size = strlen (msg) + 1; /* Includes the null.  */
 567
 568  if (BE (errbuf_size != 0, 1))
 569    {
 570      if (BE (msg_size > errbuf_size, 0))
 571        {
 572          memcpy (errbuf, msg, errbuf_size - 1);
 573          errbuf[errbuf_size - 1] = 0;
 574        }
 575      else
 576        memcpy (errbuf, msg, msg_size);
 577    }
 578
 579  return msg_size;
 580}
 581#ifdef _LIBC
 582weak_alias (__regerror, regerror)
 583#endif
 584
 585
 586#ifdef RE_ENABLE_I18N
 587/* This static array is used for the map to single-byte characters when
 588   UTF-8 is used.  Otherwise we would allocate memory just to initialize
 589   it the same all the time.  UTF-8 is the preferred encoding so this is
 590   a worthwhile optimization.  */
 591#if __GNUC__ >= 3
 592static const bitset_t utf8_sb_map = {
 593  /* Set the first 128 bits.  */
 594  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 595};
 596#else /* ! (__GNUC__ >= 3) */
 597static bitset_t utf8_sb_map;
 598#endif /* __GNUC__ >= 3 */
 599#endif /* RE_ENABLE_I18N */
 600
 601
 602static void
 603free_dfa_content (re_dfa_t *dfa)
 604{
 605  int i, j;
 606
 607  if (dfa->nodes)
 608    for (i = 0; i < dfa->nodes_len; ++i)
 609      free_token (dfa->nodes + i);
 610  re_free (dfa->nexts);
 611  for (i = 0; i < dfa->nodes_len; ++i)
 612    {
 613      if (dfa->eclosures != NULL)
 614        re_node_set_free (dfa->eclosures + i);
 615      if (dfa->inveclosures != NULL)
 616        re_node_set_free (dfa->inveclosures + i);
 617      if (dfa->edests != NULL)
 618        re_node_set_free (dfa->edests + i);
 619    }
 620  re_free (dfa->edests);
 621  re_free (dfa->eclosures);
 622  re_free (dfa->inveclosures);
 623  re_free (dfa->nodes);
 624
 625  if (dfa->state_table)
 626    for (i = 0; i <= dfa->state_hash_mask; ++i)
 627      {
 628        struct re_state_table_entry *entry = dfa->state_table + i;
 629        for (j = 0; j < entry->num; ++j)
 630          {
 631            re_dfastate_t *state = entry->array[j];
 632            free_state (state);
 633          }
 634        re_free (entry->array);
 635      }
 636  re_free (dfa->state_table);
 637#ifdef RE_ENABLE_I18N
 638  if (dfa->sb_char != utf8_sb_map)
 639    re_free (dfa->sb_char);
 640#endif
 641  re_free (dfa->subexp_map);
 642#ifdef DEBUG
 643  re_free (dfa->re_str);
 644#endif
 645
 646  re_free (dfa);
 647}
 648
 649
 650/* Free dynamically allocated space used by PREG.  */
 651
 652void
 653regfree (preg)
 654    regex_t *preg;
 655{
 656  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 657  if (BE (dfa != NULL, 1))
 658    free_dfa_content (dfa);
 659  preg->buffer = NULL;
 660  preg->allocated = 0;
 661
 662  re_free (preg->fastmap);
 663  preg->fastmap = NULL;
 664
 665  re_free (preg->translate);
 666  preg->translate = NULL;
 667}
 668#ifdef _LIBC
 669weak_alias (__regfree, regfree)
 670#endif
 671\f
 672/* Entry points compatible with 4.2 BSD regex library.  We don't define
 673   them unless specifically requested.  */
 674
 675#if defined _REGEX_RE_COMP || defined _LIBC
 676
 677/* BSD has one and only one pattern buffer.  */
 678static struct re_pattern_buffer re_comp_buf;
 679
 680char *
 681# ifdef _LIBC
 682/* Make these definitions weak in libc, so POSIX programs can redefine
 683   these names if they don't use our functions, and still use
 684   regcomp/regexec above without link errors.  */
 685weak_function
 686# endif
 687re_comp (s)
 688     const char *s;
 689{
 690  reg_errcode_t ret;
 691  char *fastmap;
 692
 693  if (!s)
 694    {
 695      if (!re_comp_buf.buffer)
 696        return gettext ("No previous regular expression");
 697      return 0;
 698    }
 699
 700  if (re_comp_buf.buffer)
 701    {
 702      fastmap = re_comp_buf.fastmap;
 703      re_comp_buf.fastmap = NULL;
 704      __regfree (&re_comp_buf);
 705      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 706      re_comp_buf.fastmap = fastmap;
 707    }
 708
 709  if (re_comp_buf.fastmap == NULL)
 710    {
 711      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 712      if (re_comp_buf.fastmap == NULL)
 713        return (char *) gettext (__re_error_msgid
 714                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
 715    }
 716
 717  /* Since `re_exec' always passes NULL for the `regs' argument, we
 718     don't need to initialize the pattern buffer fields which affect it.  */
 719
 720  /* Match anchors at newlines.  */
 721  re_comp_buf.newline_anchor = 1;
 722
 723  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 724
 725  if (!ret)
 726    return NULL;
 727
 728  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 729  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 730}
 731
 732#ifdef _LIBC
 733libc_freeres_fn (free_mem)
 734{
 735  __regfree (&re_comp_buf);
 736}
 737#endif
 738
 739#endif /* _REGEX_RE_COMP */
 740\f
 741/* Internal entry point.
 742   Compile the regular expression PATTERN, whose length is LENGTH.
 743   SYNTAX indicate regular expression's syntax.  */
 744
 745static reg_errcode_t
 746re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 747                     reg_syntax_t syntax)
 748{
 749  reg_errcode_t err = REG_NOERROR;
 750  re_dfa_t *dfa;
 751  re_string_t regexp;
 752
 753  /* Initialize the pattern buffer.  */
 754  preg->fastmap_accurate = 0;
 755  preg->syntax = syntax;
 756  preg->not_bol = preg->not_eol = 0;
 757  preg->used = 0;
 758  preg->re_nsub = 0;
 759  preg->can_be_null = 0;
 760  preg->regs_allocated = REGS_UNALLOCATED;
 761
 762  /* Initialize the dfa.  */
 763  dfa = (re_dfa_t *) preg->buffer;
 764  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 765    {
 766      /* If zero allocated, but buffer is non-null, try to realloc
 767         enough space.  This loses if buffer's address is bogus, but
 768         that is the user's responsibility.  If ->buffer is NULL this
 769         is a simple allocation.  */
 770      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 771      if (dfa == NULL)
 772        return REG_ESPACE;
 773      preg->allocated = sizeof (re_dfa_t);
 774      preg->buffer = (unsigned char *) dfa;
 775    }
 776  preg->used = sizeof (re_dfa_t);
 777
 778  err = init_dfa (dfa, length);
 779  if (BE (err != REG_NOERROR, 0))
 780    {
 781      free_dfa_content (dfa);
 782      preg->buffer = NULL;
 783      preg->allocated = 0;
 784      return err;
 785    }
 786#ifdef DEBUG
 787  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 788  dfa->re_str = re_malloc (char, length + 1);
 789  strncpy (dfa->re_str, pattern, length + 1);
 790#endif
 791
 792  __libc_lock_init (dfa->lock);
 793
 794  err = re_string_construct (&regexp, pattern, length, preg->translate,
 795                             syntax & RE_ICASE, dfa);
 796  if (BE (err != REG_NOERROR, 0))
 797    {
 798    re_compile_internal_free_return:
 799      free_workarea_compile (preg);
 800      re_string_destruct (&regexp);
 801      free_dfa_content (dfa);
 802      preg->buffer = NULL;
 803      preg->allocated = 0;
 804      return err;
 805    }
 806
 807  /* Parse the regular expression, and build a structure tree.  */
 808  preg->re_nsub = 0;
 809  dfa->str_tree = parse (&regexp, preg, syntax, &err);
 810  if (BE (dfa->str_tree == NULL, 0))
 811    goto re_compile_internal_free_return;
 812
 813  /* Analyze the tree and create the nfa.  */
 814  err = analyze (preg);
 815  if (BE (err != REG_NOERROR, 0))
 816    goto re_compile_internal_free_return;
 817
 818#ifdef RE_ENABLE_I18N
 819  /* If possible, do searching in single byte encoding to speed things up.  */
 820  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 821    optimize_utf8 (dfa);
 822#endif
 823
 824  /* Then create the initial state of the dfa.  */
 825  err = create_initial_state (dfa);
 826
 827  /* Release work areas.  */
 828  free_workarea_compile (preg);
 829  re_string_destruct (&regexp);
 830
 831  if (BE (err != REG_NOERROR, 0))
 832    {
 833      free_dfa_content (dfa);
 834      preg->buffer = NULL;
 835      preg->allocated = 0;
 836    }
 837
 838  return err;
 839}
 840
 841/* Initialize DFA.  We use the length of the regular expression PAT_LEN
 842   as the initial length of some arrays.  */
 843
 844static reg_errcode_t
 845init_dfa (re_dfa_t *dfa, size_t pat_len)
 846{
 847  unsigned int table_size;
 848#ifndef _LIBC
 849  char *codeset_name;
 850#endif
 851
 852  memset (dfa, '\0', sizeof (re_dfa_t));
 853
 854  /* Force allocation of str_tree_storage the first time.  */
 855  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 856
 857  /* Avoid overflows.  */
 858  if (pat_len == SIZE_MAX)
 859    return REG_ESPACE;
 860
 861  dfa->nodes_alloc = pat_len + 1;
 862  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 863
 864  /*  table_size = 2 ^ ceil(log pat_len) */
 865  for (table_size = 1; ; table_size <<= 1)
 866    if (table_size > pat_len)
 867      break;
 868
 869  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 870  dfa->state_hash_mask = table_size - 1;
 871
 872  dfa->mb_cur_max = MB_CUR_MAX;
 873#ifdef _LIBC
 874  if (dfa->mb_cur_max == 6
 875      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 876    dfa->is_utf8 = 1;
 877  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 878                       != 0);
 879#else
 880# ifdef HAVE_LANGINFO_CODESET
 881  codeset_name = nl_langinfo (CODESET);
 882# else
 883  codeset_name = getenv ("LC_ALL");
 884  if (codeset_name == NULL || codeset_name[0] == '\0')
 885    codeset_name = getenv ("LC_CTYPE");
 886  if (codeset_name == NULL || codeset_name[0] == '\0')
 887    codeset_name = getenv ("LANG");
 888  if (codeset_name == NULL)
 889    codeset_name = "";
 890  else if (strchr (codeset_name, '.') !=  NULL)
 891    codeset_name = strchr (codeset_name, '.') + 1;
 892# endif
 893
 894  /* strcasecmp isn't a standard interface. brute force check */
 895#if 0
 896  if (strcasecmp (codeset_name, "UTF-8") == 0
 897      || strcasecmp (codeset_name, "UTF8") == 0)
 898    dfa->is_utf8 = 1;
 899#else
 900  if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 901      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 902      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 903      && (codeset_name[3] == '-'
 904          ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 905          : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 906    dfa->is_utf8 = 1;
 907#endif
 908
 909  /* We check exhaustively in the loop below if this charset is a
 910     superset of ASCII.  */
 911  dfa->map_notascii = 0;
 912#endif
 913
 914#ifdef RE_ENABLE_I18N
 915  if (dfa->mb_cur_max > 1)
 916    {
 917      if (dfa->is_utf8)
 918        {
 919#if !defined(__GNUC__) || __GNUC__ < 3
 920          static short utf8_sb_map_inited = 0;
 921
 922          if (! utf8_sb_map_inited)
 923            {
 924                int i;
 925
 926                utf8_sb_map_inited = 0;
 927                for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 928                  utf8_sb_map[i] = BITSET_WORD_MAX;
 929            }
 930#endif
 931          dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 932        }
 933      else
 934        {
 935          int i, j, ch;
 936
 937          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 938          if (BE (dfa->sb_char == NULL, 0))
 939            return REG_ESPACE;
 940
 941          /* Set the bits corresponding to single byte chars.  */
 942          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 943            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 944              {
 945                wint_t wch = __btowc (ch);
 946                if (wch != WEOF)
 947                  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 948# ifndef _LIBC
 949                if (isascii (ch) && wch != ch)
 950                  dfa->map_notascii = 1;
 951# endif
 952              }
 953        }
 954    }
 955#endif
 956
 957  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 958    return REG_ESPACE;
 959  return REG_NOERROR;
 960}
 961
 962/* Initialize WORD_CHAR table, which indicate which character is
 963   "word".  In this case "word" means that it is the word construction
 964   character used by some operators like "\<", "\>", etc.  */
 965
 966static void
 967internal_function
 968init_word_char (re_dfa_t *dfa)
 969{
 970  int i, j, ch;
 971  dfa->word_ops_used = 1;
 972  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 973    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 974      if (isalnum (ch) || ch == '_')
 975        dfa->word_char[i] |= (bitset_word_t) 1 << j;
 976}
 977
 978/* Free the work area which are only used while compiling.  */
 979
 980static void
 981free_workarea_compile (regex_t *preg)
 982{
 983  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 984  bin_tree_storage_t *storage, *next;
 985  for (storage = dfa->str_tree_storage; storage; storage = next)
 986    {
 987      next = storage->next;
 988      re_free (storage);
 989    }
 990  dfa->str_tree_storage = NULL;
 991  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 992  dfa->str_tree = NULL;
 993  re_free (dfa->org_indices);
 994  dfa->org_indices = NULL;
 995}
 996
 997/* Create initial states for all contexts.  */
 998
 999static reg_errcode_t
1000create_initial_state (re_dfa_t *dfa)
1001{
1002  int first, i;
1003  reg_errcode_t err;
1004  re_node_set init_nodes;
1005
1006  /* Initial states have the epsilon closure of the node which is
1007     the first node of the regular expression.  */
1008  first = dfa->str_tree->first->node_idx;
1009  dfa->init_node = first;
1010  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1011  if (BE (err != REG_NOERROR, 0))
1012    return err;
1013
1014  /* The back-references which are in initial states can epsilon transit,
1015     since in this case all of the subexpressions can be null.
1016     Then we add epsilon closures of the nodes which are the next nodes of
1017     the back-references.  */
1018  if (dfa->nbackref > 0)
1019    for (i = 0; i < init_nodes.nelem; ++i)
1020      {
1021        int node_idx = init_nodes.elems[i];
1022        re_token_type_t type = dfa->nodes[node_idx].type;
1023
1024        int clexp_idx;
1025        if (type != OP_BACK_REF)
1026          continue;
1027        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1028          {
1029            re_token_t *clexp_node;
1030            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1031            if (clexp_node->type == OP_CLOSE_SUBEXP
1032                && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1033              break;
1034          }
1035        if (clexp_idx == init_nodes.nelem)
1036          continue;
1037
1038        if (type == OP_BACK_REF)
1039          {
1040            int dest_idx = dfa->edests[node_idx].elems[0];
1041            if (!re_node_set_contains (&init_nodes, dest_idx))
1042              {
1043                reg_errcode_t err = re_node_set_merge (&init_nodes,
1044                                                       dfa->eclosures
1045                                                       + dest_idx);
1046                if (err != REG_NOERROR)
1047                  return err;
1048                i = 0;
1049              }
1050          }
1051      }
1052
1053  /* It must be the first time to invoke acquire_state.  */
1054  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1055  /* We don't check ERR here, since the initial state must not be NULL.  */
1056  if (BE (dfa->init_state == NULL, 0))
1057    return err;
1058  if (dfa->init_state->has_constraint)
1059    {
1060      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1061                                                       CONTEXT_WORD);
1062      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1063                                                     CONTEXT_NEWLINE);
1064      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1065                                                         &init_nodes,
1066                                                         CONTEXT_NEWLINE
1067                                                         | CONTEXT_BEGBUF);
1068      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1069              || dfa->init_state_begbuf == NULL, 0))
1070        return err;
1071    }
1072  else
1073    dfa->init_state_word = dfa->init_state_nl
1074      = dfa->init_state_begbuf = dfa->init_state;
1075
1076  re_node_set_free (&init_nodes);
1077  return REG_NOERROR;
1078}
1079\f
1080#ifdef RE_ENABLE_I18N
1081/* If it is possible to do searching in single byte encoding instead of UTF-8
1082   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1083   DFA nodes where needed.  */
1084
1085static void
1086optimize_utf8 (re_dfa_t *dfa)
1087{
1088  int node, i, mb_chars = 0, has_period = 0;
1089
1090  for (node = 0; node < dfa->nodes_len; ++node)
1091    switch (dfa->nodes[node].type)
1092      {
1093      case CHARACTER:
1094        if (dfa->nodes[node].opr.c >= 0x80)
1095          mb_chars = 1;
1096        break;
1097      case ANCHOR:
1098        switch (dfa->nodes[node].opr.ctx_type)
1099          {
1100          case LINE_FIRST:
1101          case LINE_LAST:
1102          case BUF_FIRST:
1103          case BUF_LAST:
1104            break;
1105          default:
1106            /* Word anchors etc. cannot be handled.  It's okay to test
1107               opr.ctx_type since constraints (for all DFA nodes) are
1108               created by ORing one or more opr.ctx_type values.  */
1109            return;
1110          }
1111        break;
1112      case OP_PERIOD:
1113        has_period = 1;
1114        break;
1115      case OP_BACK_REF:
1116      case OP_ALT:
1117      case END_OF_RE:
1118      case OP_DUP_ASTERISK:
1119      case OP_OPEN_SUBEXP:
1120      case OP_CLOSE_SUBEXP:
1121        break;
1122      case COMPLEX_BRACKET:
1123        return;
1124      case SIMPLE_BRACKET:
1125        /* Just double check.  The non-ASCII range starts at 0x80.  */
1126        assert (0x80 % BITSET_WORD_BITS == 0);
1127        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1128          if (dfa->nodes[node].opr.sbcset[i])
1129            return;
1130        break;
1131      default:
1132        abort ();
1133      }
1134
1135  if (mb_chars || has_period)
1136    for (node = 0; node < dfa->nodes_len; ++node)
1137      {
1138        if (dfa->nodes[node].type == CHARACTER
1139            && dfa->nodes[node].opr.c >= 0x80)
1140          dfa->nodes[node].mb_partial = 0;
1141        else if (dfa->nodes[node].type == OP_PERIOD)
1142          dfa->nodes[node].type = OP_UTF8_PERIOD;
1143      }
1144
1145  /* The search can be in single byte locale.  */
1146  dfa->mb_cur_max = 1;
1147  dfa->is_utf8 = 0;
1148  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1149}
1150#endif
1151\f
1152/* Analyze the structure tree, and calculate "first", "next", "edest",
1153   "eclosure", and "inveclosure".  */
1154
1155static reg_errcode_t
1156analyze (regex_t *preg)
1157{
1158  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1159  reg_errcode_t ret;
1160
1161  /* Allocate arrays.  */
1162  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1163  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1164  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1165  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1166  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1167          || dfa->eclosures == NULL, 0))
1168    return REG_ESPACE;
1169
1170  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1171  if (dfa->subexp_map != NULL)
1172    {
1173      int i;
1174      for (i = 0; i < preg->re_nsub; i++)
1175        dfa->subexp_map[i] = i;
1176      preorder (dfa->str_tree, optimize_subexps, dfa);
1177      for (i = 0; i < preg->re_nsub; i++)
1178        if (dfa->subexp_map[i] != i)
1179          break;
1180      if (i == preg->re_nsub)
1181        {
1182          free (dfa->subexp_map);
1183          dfa->subexp_map = NULL;
1184        }
1185    }
1186
1187  ret = postorder (dfa->str_tree, lower_subexps, preg);
1188  if (BE (ret != REG_NOERROR, 0))
1189    return ret;
1190  ret = postorder (dfa->str_tree, calc_first, dfa);
1191  if (BE (ret != REG_NOERROR, 0))
1192    return ret;
1193  preorder (dfa->str_tree, calc_next, dfa);
1194  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1195  if (BE (ret != REG_NOERROR, 0))
1196    return ret;
1197  ret = calc_eclosure (dfa);
1198  if (BE (ret != REG_NOERROR, 0))
1199    return ret;
1200
1201  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1202     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1203  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1204      || dfa->nbackref)
1205    {
1206      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1207      if (BE (dfa->inveclosures == NULL, 0))
1208        return REG_ESPACE;
1209      ret = calc_inveclosure (dfa);
1210    }
1211
1212  return ret;
1213}
1214
1215/* Our parse trees are very unbalanced, so we cannot use a stack to
1216   implement parse tree visits.  Instead, we use parent pointers and
1217   some hairy code in these two functions.  */
1218static reg_errcode_t
1219postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1220           void *extra)
1221{
1222  bin_tree_t *node, *prev;
1223
1224  for (node = root; ; )
1225    {
1226      /* Descend down the tree, preferably to the left (or to the right
1227         if that's the only child).  */
1228      while (node->left || node->right)
1229        if (node->left)
1230          node = node->left;
1231        else
1232          node = node->right;
1233
1234      do
1235        {
1236          reg_errcode_t err = fn (extra, node);
1237          if (BE (err != REG_NOERROR, 0))
1238            return err;
1239          if (node->parent == NULL)
1240            return REG_NOERROR;
1241          prev = node;
1242          node = node->parent;
1243        }
1244      /* Go up while we have a node that is reached from the right.  */
1245      while (node->right == prev || node->right == NULL);
1246      node = node->right;
1247    }
1248}
1249
1250static reg_errcode_t
1251preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1252          void *extra)
1253{
1254  bin_tree_t *node;
1255
1256  for (node = root; ; )
1257    {
1258      reg_errcode_t err = fn (extra, node);
1259      if (BE (err != REG_NOERROR, 0))
1260        return err;
1261
1262      /* Go to the left node, or up and to the right.  */
1263      if (node->left)
1264        node = node->left;
1265      else
1266        {
1267          bin_tree_t *prev = NULL;
1268          while (node->right == prev || node->right == NULL)
1269            {
1270              prev = node;
1271              node = node->parent;
1272              if (!node)
1273                return REG_NOERROR;
1274            }
1275          node = node->right;
1276        }
1277    }
1278}
1279
1280/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1281   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1282   backreferences as well.  Requires a preorder visit.  */
1283static reg_errcode_t
1284optimize_subexps (void *extra, bin_tree_t *node)
1285{
1286  re_dfa_t *dfa = (re_dfa_t *) extra;
1287
1288  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1289    {
1290      int idx = node->token.opr.idx;
1291      node->token.opr.idx = dfa->subexp_map[idx];
1292      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1293    }
1294
1295  else if (node->token.type == SUBEXP
1296           && node->left && node->left->token.type == SUBEXP)
1297    {
1298      int other_idx = node->left->token.opr.idx;
1299
1300      node->left = node->left->left;
1301      if (node->left)
1302        node->left->parent = node;
1303
1304      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1305      if (other_idx < BITSET_WORD_BITS)
1306          dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1307    }
1308
1309  return REG_NOERROR;
1310}
1311
1312/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1313   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1314static reg_errcode_t
1315lower_subexps (void *extra, bin_tree_t *node)
1316{
1317  regex_t *preg = (regex_t *) extra;
1318  reg_errcode_t err = REG_NOERROR;
1319
1320  if (node->left && node->left->token.type == SUBEXP)
1321    {
1322      node->left = lower_subexp (&err, preg, node->left);
1323      if (node->left)
1324        node->left->parent = node;
1325    }
1326  if (node->right && node->right->token.type == SUBEXP)
1327    {
1328      node->right = lower_subexp (&err, preg, node->right);
1329      if (node->right)
1330        node->right->parent = node;
1331    }
1332
1333  return err;
1334}
1335
1336static bin_tree_t *
1337lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1338{
1339  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1340  bin_tree_t *body = node->left;
1341  bin_tree_t *op, *cls, *tree1, *tree;
1342
1343  if (preg->no_sub
1344      /* We do not optimize empty subexpressions, because otherwise we may
1345         have bad CONCAT nodes with NULL children.  This is obviously not
1346         very common, so we do not lose much.  An example that triggers
1347         this case is the sed "script" /\(\)/x.  */
1348      && node->left != NULL
1349      && (node->token.opr.idx >= BITSET_WORD_BITS
1350          || !(dfa->used_bkref_map
1351               & ((bitset_word_t) 1 << node->token.opr.idx))))
1352    return node->left;
1353
1354  /* Convert the SUBEXP node to the concatenation of an
1355     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1356  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1357  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1358  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1359  tree = create_tree (dfa, op, tree1, CONCAT);
1360  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1361    {
1362      *err = REG_ESPACE;
1363      return NULL;
1364    }
1365
1366  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1367  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1368  return tree;
1369}
1370
1371/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1372   nodes.  Requires a postorder visit.  */
1373static reg_errcode_t
1374calc_first (void *extra, bin_tree_t *node)
1375{
1376  re_dfa_t *dfa = (re_dfa_t *) extra;
1377  if (node->token.type == CONCAT)
1378    {
1379      node->first = node->left->first;
1380      node->node_idx = node->left->node_idx;
1381    }
1382  else
1383    {
1384      node->first = node;
1385      node->node_idx = re_dfa_add_node (dfa, node->token);
1386      if (BE (node->node_idx == -1, 0))
1387        return REG_ESPACE;
1388      if (node->token.type == ANCHOR)
1389        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1390    }
1391  return REG_NOERROR;
1392}
1393
1394/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1395static reg_errcode_t
1396calc_next (void *extra, bin_tree_t *node)
1397{
1398  switch (node->token.type)
1399    {
1400    case OP_DUP_ASTERISK:
1401      node->left->next = node;
1402      break;
1403    case CONCAT:
1404      node->left->next = node->right->first;
1405      node->right->next = node->next;
1406      break;
1407    default:
1408      if (node->left)
1409        node->left->next = node->next;
1410      if (node->right)
1411        node->right->next = node->next;
1412      break;
1413    }
1414  return REG_NOERROR;
1415}
1416
1417/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1418static reg_errcode_t
1419link_nfa_nodes (void *extra, bin_tree_t *node)
1420{
1421  re_dfa_t *dfa = (re_dfa_t *) extra;
1422  int idx = node->node_idx;
1423  reg_errcode_t err = REG_NOERROR;
1424
1425  switch (node->token.type)
1426    {
1427    case CONCAT:
1428      break;
1429
1430    case END_OF_RE:
1431      assert (node->next == NULL);
1432      break;
1433
1434    case OP_DUP_ASTERISK:
1435    case OP_ALT:
1436      {
1437        int left, right;
1438        dfa->has_plural_match = 1;
1439        if (node->left != NULL)
1440          left = node->left->first->node_idx;
1441        else
1442          left = node->next->node_idx;
1443        if (node->right != NULL)
1444          right = node->right->first->node_idx;
1445        else
1446          right = node->next->node_idx;
1447        assert (left > -1);
1448        assert (right > -1);
1449        err = re_node_set_init_2 (dfa->edests + idx, left, right);
1450      }
1451      break;
1452
1453    case ANCHOR:
1454    case OP_OPEN_SUBEXP:
1455    case OP_CLOSE_SUBEXP:
1456      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1457      break;
1458
1459    case OP_BACK_REF:
1460      dfa->nexts[idx] = node->next->node_idx;
1461      if (node->token.type == OP_BACK_REF)
1462        err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1463      break;
1464
1465    default:
1466      assert (!IS_EPSILON_NODE (node->token.type));
1467      dfa->nexts[idx] = node->next->node_idx;
1468      break;
1469    }
1470
1471  return err;
1472}
1473
1474/* Duplicate the epsilon closure of the node ROOT_NODE.
1475   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1476   to their own constraint.  */
1477
1478static reg_errcode_t
1479internal_function
1480duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1481                        int root_node, unsigned int init_constraint)
1482{
1483  int org_node, clone_node, ret;
1484  unsigned int constraint = init_constraint;
1485  for (org_node = top_org_node, clone_node = top_clone_node;;)
1486    {
1487      int org_dest, clone_dest;
1488      if (dfa->nodes[org_node].type == OP_BACK_REF)
1489        {
1490          /* If the back reference epsilon-transit, its destination must
1491             also have the constraint.  Then duplicate the epsilon closure
1492             of the destination of the back reference, and store it in
1493             edests of the back reference.  */
1494          org_dest = dfa->nexts[org_node];
1495          re_node_set_empty (dfa->edests + clone_node);
1496          clone_dest = duplicate_node (dfa, org_dest, constraint);
1497          if (BE (clone_dest == -1, 0))
1498            return REG_ESPACE;
1499          dfa->nexts[clone_node] = dfa->nexts[org_node];
1500          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1501          if (BE (ret < 0, 0))
1502            return REG_ESPACE;
1503        }
1504      else if (dfa->edests[org_node].nelem == 0)
1505        {
1506          /* In case of the node can't epsilon-transit, don't duplicate the
1507             destination and store the original destination as the
1508             destination of the node.  */
1509          dfa->nexts[clone_node] = dfa->nexts[org_node];
1510          break;
1511        }
1512      else if (dfa->edests[org_node].nelem == 1)
1513        {
1514          /* In case of the node can epsilon-transit, and it has only one
1515             destination.  */
1516          org_dest = dfa->edests[org_node].elems[0];
1517          re_node_set_empty (dfa->edests + clone_node);
1518          /* If the node is root_node itself, it means the epsilon clsoure
1519             has a loop.   Then tie it to the destination of the root_node.  */
1520          if (org_node == root_node && clone_node != org_node)
1521            {
1522              ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1523              if (BE (ret < 0, 0))
1524                return REG_ESPACE;
1525              break;
1526            }
1527          /* In case of the node has another constraint, add it.  */
1528          constraint |= dfa->nodes[org_node].constraint;
1529          clone_dest = duplicate_node (dfa, org_dest, constraint);
1530          if (BE (clone_dest == -1, 0))
1531            return REG_ESPACE;
1532          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1533          if (BE (ret < 0, 0))
1534            return REG_ESPACE;
1535        }
1536      else /* dfa->edests[org_node].nelem == 2 */
1537        {
1538          /* In case of the node can epsilon-transit, and it has two
1539             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1540          org_dest = dfa->edests[org_node].elems[0];
1541          re_node_set_empty (dfa->edests + clone_node);
1542          /* Search for a duplicated node which satisfies the constraint.  */
1543          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1544          if (clone_dest == -1)
1545            {
1546              /* There is no such duplicated node, create a new one.  */
1547              reg_errcode_t err;
1548              clone_dest = duplicate_node (dfa, org_dest, constraint);
1549              if (BE (clone_dest == -1, 0))
1550                return REG_ESPACE;
1551              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552              if (BE (ret < 0, 0))
1553                return REG_ESPACE;
1554              err = duplicate_node_closure (dfa, org_dest, clone_dest,
1555                                            root_node, constraint);
1556              if (BE (err != REG_NOERROR, 0))
1557                return err;
1558            }
1559          else
1560            {
1561              /* There is a duplicated node which satisfies the constraint,
1562                 use it to avoid infinite loop.  */
1563              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564              if (BE (ret < 0, 0))
1565                return REG_ESPACE;
1566            }
1567
1568          org_dest = dfa->edests[org_node].elems[1];
1569          clone_dest = duplicate_node (dfa, org_dest, constraint);
1570          if (BE (clone_dest == -1, 0))
1571            return REG_ESPACE;
1572          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573          if (BE (ret < 0, 0))
1574            return REG_ESPACE;
1575        }
1576      org_node = org_dest;
1577      clone_node = clone_dest;
1578    }
1579  return REG_NOERROR;
1580}
1581
1582/* Search for a node which is duplicated from the node ORG_NODE, and
1583   satisfies the constraint CONSTRAINT.  */
1584
1585static int
1586search_duplicated_node (const re_dfa_t *dfa, int org_node,
1587                        unsigned int constraint)
1588{
1589  int idx;
1590  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1591    {
1592      if (org_node == dfa->org_indices[idx]
1593          && constraint == dfa->nodes[idx].constraint)
1594        return idx; /* Found.  */
1595    }
1596  return -1; /* Not found.  */
1597}
1598
1599/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1600   Return the index of the new node, or -1 if insufficient storage is
1601   available.  */
1602
1603static int
1604duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1605{
1606  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1607  if (BE (dup_idx != -1, 1))
1608    {
1609      dfa->nodes[dup_idx].constraint = constraint;
1610      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1611      dfa->nodes[dup_idx].duplicated = 1;
1612
1613      /* Store the index of the original node.  */
1614      dfa->org_indices[dup_idx] = org_idx;
1615    }
1616  return dup_idx;
1617}
1618
1619static reg_errcode_t
1620calc_inveclosure (re_dfa_t *dfa)
1621{
1622  int src, idx, ret;
1623  for (idx = 0; idx < dfa->nodes_len; ++idx)
1624    re_node_set_init_empty (dfa->inveclosures + idx);
1625
1626  for (src = 0; src < dfa->nodes_len; ++src)
1627    {
1628      int *elems = dfa->eclosures[src].elems;
1629      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1630        {
1631          ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632          if (BE (ret == -1, 0))
1633            return REG_ESPACE;
1634        }
1635    }
1636
1637  return REG_NOERROR;
1638}
1639
1640/* Calculate "eclosure" for all the node in DFA.  */
1641
1642static reg_errcode_t
1643calc_eclosure (re_dfa_t *dfa)
1644{
1645  int node_idx, incomplete;
1646#ifdef DEBUG
1647  assert (dfa->nodes_len > 0);
1648#endif
1649  incomplete = 0;
1650  /* For each nodes, calculate epsilon closure.  */
1651  for (node_idx = 0; ; ++node_idx)
1652    {
1653      reg_errcode_t err;
1654      re_node_set eclosure_elem;
1655      if (node_idx == dfa->nodes_len)
1656        {
1657          if (!incomplete)
1658            break;
1659          incomplete = 0;
1660          node_idx = 0;
1661        }
1662
1663#ifdef DEBUG
1664      assert (dfa->eclosures[node_idx].nelem != -1);
1665#endif
1666
1667      /* If we have already calculated, skip it.  */
1668      if (dfa->eclosures[node_idx].nelem != 0)
1669        continue;
1670      /* Calculate epsilon closure of `node_idx'.  */
1671      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672      if (BE (err != REG_NOERROR, 0))
1673        return err;
1674
1675      if (dfa->eclosures[node_idx].nelem == 0)
1676        {
1677          incomplete = 1;
1678          re_node_set_free (&eclosure_elem);
1679        }
1680    }
1681  return REG_NOERROR;
1682}
1683
1684/* Calculate epsilon closure of NODE.  */
1685
1686static reg_errcode_t
1687calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1688{
1689  reg_errcode_t err;
1690  int i;
1691  re_node_set eclosure;
1692  int ret;
1693  int incomplete = 0;
1694  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1695  if (BE (err != REG_NOERROR, 0))
1696    return err;
1697
1698  /* This indicates that we are calculating this node now.
1699     We reference this value to avoid infinite loop.  */
1700  dfa->eclosures[node].nelem = -1;
1701
1702  /* If the current node has constraints, duplicate all nodes
1703     since they must inherit the constraints.  */
1704  if (dfa->nodes[node].constraint
1705      && dfa->edests[node].nelem
1706      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1707    {
1708      err = duplicate_node_closure (dfa, node, node, node,
1709                                    dfa->nodes[node].constraint);
1710      if (BE (err != REG_NOERROR, 0))
1711        return err;
1712    }
1713
1714  /* Expand each epsilon destination nodes.  */
1715  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1716    for (i = 0; i < dfa->edests[node].nelem; ++i)
1717      {
1718        re_node_set eclosure_elem;
1719        int edest = dfa->edests[node].elems[i];
1720        /* If calculating the epsilon closure of `edest' is in progress,
1721           return intermediate result.  */
1722        if (dfa->eclosures[edest].nelem == -1)
1723          {
1724            incomplete = 1;
1725            continue;
1726          }
1727        /* If we haven't calculated the epsilon closure of `edest' yet,
1728           calculate now. Otherwise use calculated epsilon closure.  */
1729        if (dfa->eclosures[edest].nelem == 0)
1730          {
1731            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1732            if (BE (err != REG_NOERROR, 0))
1733              return err;
1734          }
1735        else
1736          eclosure_elem = dfa->eclosures[edest];
1737        /* Merge the epsilon closure of `edest'.  */
1738        err = re_node_set_merge (&eclosure, &eclosure_elem);
1739        if (BE (err != REG_NOERROR, 0))
1740          return err;
1741        /* If the epsilon closure of `edest' is incomplete,
1742           the epsilon closure of this node is also incomplete.  */
1743        if (dfa->eclosures[edest].nelem == 0)
1744          {
1745            incomplete = 1;
1746            re_node_set_free (&eclosure_elem);
1747          }
1748      }
1749
1750  /* An epsilon closure includes itself.  */
1751  ret = re_node_set_insert (&eclosure, node);
1752  if (BE (ret < 0, 0))
1753    return REG_ESPACE;
1754  if (incomplete && !root)
1755    dfa->eclosures[node].nelem = 0;
1756  else
1757    dfa->eclosures[node] = eclosure;
1758  *new_set = eclosure;
1759  return REG_NOERROR;
1760}
1761\f
1762/* Functions for token which are used in the parser.  */
1763
1764/* Fetch a token from INPUT.
1765   We must not use this function inside bracket expressions.  */
1766
1767static void
1768internal_function
1769fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1770{
1771  re_string_skip_bytes (input, peek_token (result, input, syntax));
1772}
1773
1774/* Peek a token from INPUT, and return the length of the token.
1775   We must not use this function inside bracket expressions.  */
1776
1777static int
1778internal_function
1779peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1780{
1781  unsigned char c;
1782
1783  if (re_string_eoi (input))
1784    {
1785      token->type = END_OF_RE;
1786      return 0;
1787    }
1788
1789  c = re_string_peek_byte (input, 0);
1790  token->opr.c = c;
1791
1792  token->word_char = 0;
1793#ifdef RE_ENABLE_I18N
1794  token->mb_partial = 0;
1795  if (input->mb_cur_max > 1 &&
1796      !re_string_first_byte (input, re_string_cur_idx (input)))
1797    {
1798      token->type = CHARACTER;
1799      token->mb_partial = 1;
1800      return 1;
1801    }
1802#endif
1803  if (c == '\\')
1804    {
1805      unsigned char c2;
1806      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1807        {
1808          token->type = BACK_SLASH;
1809          return 1;
1810        }
1811
1812      c2 = re_string_peek_byte_case (input, 1);
1813      token->opr.c = c2;
1814      token->type = CHARACTER;
1815#ifdef RE_ENABLE_I18N
1816      if (input->mb_cur_max > 1)
1817        {
1818          wint_t wc = re_string_wchar_at (input,
1819                                          re_string_cur_idx (input) + 1);
1820          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1821        }
1822      else
1823#endif
1824        token->word_char = IS_WORD_CHAR (c2) != 0;
1825
1826      switch (c2)
1827        {
1828        case '|':
1829          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1830            token->type = OP_ALT;
1831          break;
1832        case '1': case '2': case '3': case '4': case '5':
1833        case '6': case '7': case '8': case '9':
1834          if (!(syntax & RE_NO_BK_REFS))
1835            {
1836              token->type = OP_BACK_REF;
1837              token->opr.idx = c2 - '1';
1838            }
1839          break;
1840        case '<':
1841          if (!(syntax & RE_NO_GNU_OPS))
1842            {
1843              token->type = ANCHOR;
1844              token->opr.ctx_type = WORD_FIRST;
1845            }
1846          break;
1847        case '>':
1848          if (!(syntax & RE_NO_GNU_OPS))
1849            {
1850              token->type = ANCHOR;
1851              token->opr.ctx_type = WORD_LAST;
1852            }
1853          break;
1854        case 'b':
1855          if (!(syntax & RE_NO_GNU_OPS))
1856            {
1857              token->type = ANCHOR;
1858              token->opr.ctx_type = WORD_DELIM;
1859            }
1860          break;
1861        case 'B':
1862          if (!(syntax & RE_NO_GNU_OPS))
1863            {
1864              token->type = ANCHOR;
1865              token->opr.ctx_type = NOT_WORD_DELIM;
1866            }
1867          break;
1868        case 'w':
1869          if (!(syntax & RE_NO_GNU_OPS))
1870            token->type = OP_WORD;
1871          break;
1872        case 'W':
1873          if (!(syntax & RE_NO_GNU_OPS))
1874            token->type = OP_NOTWORD;
1875          break;
1876        case 's':
1877          if (!(syntax & RE_NO_GNU_OPS))
1878            token->type = OP_SPACE;
1879          break;
1880        case 'S':
1881          if (!(syntax & RE_NO_GNU_OPS))
1882            token->type = OP_NOTSPACE;
1883          break;
1884        case '`':
1885          if (!(syntax & RE_NO_GNU_OPS))
1886            {
1887              token->type = ANCHOR;
1888              token->opr.ctx_type = BUF_FIRST;
1889            }
1890          break;
1891        case '\'':
1892          if (!(syntax & RE_NO_GNU_OPS))
1893            {
1894              token->type = ANCHOR;
1895              token->opr.ctx_type = BUF_LAST;
1896            }
1897          break;
1898        case '(':
1899          if (!(syntax & RE_NO_BK_PARENS))
1900            token->type = OP_OPEN_SUBEXP;
1901          break;
1902        case ')':
1903          if (!(syntax & RE_NO_BK_PARENS))
1904            token->type = OP_CLOSE_SUBEXP;
1905          break;
1906        case '+':
1907          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908            token->type = OP_DUP_PLUS;
1909          break;
1910        case '?':
1911          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912            token->type = OP_DUP_QUESTION;
1913          break;
1914        case '{':
1915          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916            token->type = OP_OPEN_DUP_NUM;
1917          break;
1918        case '}':
1919          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920            token->type = OP_CLOSE_DUP_NUM;
1921          break;
1922        default:
1923          break;
1924        }
1925      return 2;
1926    }
1927
1928  token->type = CHARACTER;
1929#ifdef RE_ENABLE_I18N
1930  if (input->mb_cur_max > 1)
1931    {
1932      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1933      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1934    }
1935  else
1936#endif
1937    token->word_char = IS_WORD_CHAR (token->opr.c);
1938
1939  switch (c)
1940    {
1941    case '\n':
1942      if (syntax & RE_NEWLINE_ALT)
1943        token->type = OP_ALT;
1944      break;
1945    case '|':
1946      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1947        token->type = OP_ALT;
1948      break;
1949    case '*':
1950      token->type = OP_DUP_ASTERISK;
1951      break;
1952    case '+':
1953      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954        token->type = OP_DUP_PLUS;
1955      break;
1956    case '?':
1957      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958        token->type = OP_DUP_QUESTION;
1959      break;
1960    case '{':
1961      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962        token->type = OP_OPEN_DUP_NUM;
1963      break;
1964    case '}':
1965      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966        token->type = OP_CLOSE_DUP_NUM;
1967      break;
1968    case '(':
1969      if (syntax & RE_NO_BK_PARENS)
1970        token->type = OP_OPEN_SUBEXP;
1971      break;
1972    case ')':
1973      if (syntax & RE_NO_BK_PARENS)
1974        token->type = OP_CLOSE_SUBEXP;
1975      break;
1976    case '[':
1977      token->type = OP_OPEN_BRACKET;
1978      break;
1979    case '.':
1980      token->type = OP_PERIOD;
1981      break;
1982    case '^':
1983      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1984          re_string_cur_idx (input) != 0)
1985        {
1986          char prev = re_string_peek_byte (input, -1);
1987          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1988            break;
1989        }
1990      token->type = ANCHOR;
1991      token->opr.ctx_type = LINE_FIRST;
1992      break;
1993    case '$':
1994      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1995          re_string_cur_idx (input) + 1 != re_string_length (input))
1996        {
1997          re_token_t next;
1998          re_string_skip_bytes (input, 1);
1999          peek_token (&next, input, syntax);
2000          re_string_skip_bytes (input, -1);
2001          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2002            break;
2003        }
2004      token->type = ANCHOR;
2005      token->opr.ctx_type = LINE_LAST;
2006      break;
2007    default:
2008      break;
2009    }
2010  return 1;
2011}
2012
2013/* Peek a token from INPUT, and return the length of the token.
2014   We must not use this function out of bracket expressions.  */
2015
2016static int
2017internal_function
2018peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2019{
2020  unsigned char c;
2021  if (re_string_eoi (input))
2022    {
2023      token->type = END_OF_RE;
2024      return 0;
2025    }
2026  c = re_string_peek_byte (input, 0);
2027  token->opr.c = c;
2028
2029#ifdef RE_ENABLE_I18N
2030  if (input->mb_cur_max > 1 &&
2031      !re_string_first_byte (input, re_string_cur_idx (input)))
2032    {
2033      token->type = CHARACTER;
2034      return 1;
2035    }
2036#endif /* RE_ENABLE_I18N */
2037
2038  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2039      && re_string_cur_idx (input) + 1 < re_string_length (input))
2040    {
2041      /* In this case, '\' escape a character.  */
2042      unsigned char c2;
2043      re_string_skip_bytes (input, 1);
2044      c2 = re_string_peek_byte (input, 0);
2045      token->opr.c = c2;
2046      token->type = CHARACTER;
2047      return 1;
2048    }
2049  if (c == '[') /* '[' is a special char in a bracket exps.  */
2050    {
2051      unsigned char c2;
2052      int token_len;
2053      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2054        c2 = re_string_peek_byte (input, 1);
2055      else
2056        c2 = 0;
2057      token->opr.c = c2;
2058      token_len = 2;
2059      switch (c2)
2060        {
2061        case '.':
2062          token->type = OP_OPEN_COLL_ELEM;
2063          break;
2064        case '=':
2065          token->type = OP_OPEN_EQUIV_CLASS;
2066          break;
2067        case ':':
2068          if (syntax & RE_CHAR_CLASSES)
2069            {
2070              token->type = OP_OPEN_CHAR_CLASS;
2071              break;
2072            }
2073          /* else fall through.  */
2074        default:
2075          token->type = CHARACTER;
2076          token->opr.c = c;
2077          token_len = 1;
2078          break;
2079        }
2080      return token_len;
2081    }
2082  switch (c)
2083    {
2084    case '-':
2085      token->type = OP_CHARSET_RANGE;
2086      break;
2087    case ']':
2088      token->type = OP_CLOSE_BRACKET;
2089      break;
2090    case '^':
2091      token->type = OP_NON_MATCH_LIST;
2092      break;
2093    default:
2094      token->type = CHARACTER;
2095    }
2096  return 1;
2097}
2098\f
2099/* Functions for parser.  */
2100
2101/* Entry point of the parser.
2102   Parse the regular expression REGEXP and return the structure tree.
2103   If an error is occured, ERR is set by error code, and return NULL.
2104   This function build the following tree, from regular expression <reg_exp>:
2105           CAT
2106           / \
2107          /   \
2108   <reg_exp>  EOR
2109
2110   CAT means concatenation.
2111   EOR means end of regular expression.  */
2112
2113static bin_tree_t *
2114parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2115       reg_errcode_t *err)
2116{
2117  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118  bin_tree_t *tree, *eor, *root;
2119  re_token_t current_token;
2120  dfa->syntax = syntax;
2121  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2122  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2123  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124    return NULL;
2125  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2126  if (tree != NULL)
2127    root = create_tree (dfa, tree, eor, CONCAT);
2128  else
2129    root = eor;
2130  if (BE (eor == NULL || root == NULL, 0))
2131    {
2132      *err = REG_ESPACE;
2133      return NULL;
2134    }
2135  return root;
2136}
2137
2138/* This function build the following tree, from regular expression
2139   <branch1>|<branch2>:
2140           ALT
2141           / \
2142          /   \
2143   <branch1> <branch2>
2144
2145   ALT means alternative, which represents the operator `|'.  */
2146
2147static bin_tree_t *
2148parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150{
2151  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2152  bin_tree_t *tree, *branch = NULL;
2153  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2154  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155    return NULL;
2156
2157  while (token->type == OP_ALT)
2158    {
2159      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2160      if (token->type != OP_ALT && token->type != END_OF_RE
2161          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2162        {
2163          branch = parse_branch (regexp, preg, token, syntax, nest, err);
2164          if (BE (*err != REG_NOERROR && branch == NULL, 0))
2165            return NULL;
2166        }
2167      else
2168        branch = NULL;
2169      tree = create_tree (dfa, tree, branch, OP_ALT);
2170      if (BE (tree == NULL, 0))
2171        {
2172          *err = REG_ESPACE;
2173          return NULL;
2174        }
2175    }
2176  return tree;
2177}
2178
2179/* This function build the following tree, from regular expression
2180   <exp1><exp2>:
2181        CAT
2182        / \
2183       /   \
2184   <exp1> <exp2>
2185
2186   CAT means concatenation.  */
2187
2188static bin_tree_t *
2189parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190              reg_syntax_t syntax, int nest, reg_errcode_t *err)
2191{
2192  bin_tree_t *tree, *exp;
2193  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2194  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2195  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196    return NULL;
2197
2198  while (token->type != OP_ALT && token->type != END_OF_RE
2199         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2200    {
2201      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2202      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2203        {
2204          return NULL;
2205        }
2206      if (tree != NULL && exp != NULL)
2207        {
2208          tree = create_tree (dfa, tree, exp, CONCAT);
2209          if (tree == NULL)
2210            {
2211              *err = REG_ESPACE;
2212              return NULL;
2213            }
2214        }
2215      else if (tree == NULL)
2216        tree = exp;
2217      /* Otherwise exp == NULL, we don't need to create new tree.  */
2218    }
2219  return tree;
2220}
2221
2222/* This function build the following tree, from regular expression a*:
2223         *
2224         |
2225         a
2226*/
2227
2228static bin_tree_t *
2229parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2230                  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2231{
2232  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2233  bin_tree_t *tree;
2234  switch (token->type)
2235    {
2236    case CHARACTER:
2237      tree = create_token_tree (dfa, NULL, NULL, token);
2238      if (BE (tree == NULL, 0))
2239        {
2240          *err = REG_ESPACE;
2241          return NULL;
2242        }
2243#ifdef RE_ENABLE_I18N
2244      if (dfa->mb_cur_max > 1)
2245        {
2246          while (!re_string_eoi (regexp)
2247                 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2248            {
2249              bin_tree_t *mbc_remain;
2250              fetch_token (token, regexp, syntax);
2251              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2252              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2253              if (BE (mbc_remain == NULL || tree == NULL, 0))
2254                {
2255                  *err = REG_ESPACE;
2256                  return NULL;
2257                }
2258            }
2259        }
2260#endif
2261      break;
2262    case OP_OPEN_SUBEXP:
2263      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2264      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265        return NULL;
2266      break;
2267    case OP_OPEN_BRACKET:
2268      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2269      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2270        return NULL;
2271      break;
2272    case OP_BACK_REF:
2273      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2274        {
2275          *err = REG_ESUBREG;
2276          return NULL;
2277        }
2278      dfa->used_bkref_map |= 1 << token->opr.idx;
2279      tree = create_token_tree (dfa, NULL, NULL, token);
2280      if (BE (tree == NULL, 0))
2281        {
2282          *err = REG_ESPACE;
2283          return NULL;
2284        }
2285      ++dfa->nbackref;
2286      dfa->has_mb_node = 1;
2287      break;
2288    case OP_OPEN_DUP_NUM:
2289      if (syntax & RE_CONTEXT_INVALID_DUP)
2290        {
2291          *err = REG_BADRPT;
2292          return NULL;
2293        }
2294      /* FALLTHROUGH */
2295    case OP_DUP_ASTERISK:
2296    case OP_DUP_PLUS:
2297    case OP_DUP_QUESTION:
2298      if (syntax & RE_CONTEXT_INVALID_OPS)
2299        {
2300          *err = REG_BADRPT;
2301          return NULL;
2302        }
2303      else if (syntax & RE_CONTEXT_INDEP_OPS)
2304        {
2305          fetch_token (token, regexp, syntax);
2306          return parse_expression (regexp, preg, token, syntax, nest, err);
2307        }
2308      /* else fall through  */
2309    case OP_CLOSE_SUBEXP:
2310      if ((token->type == OP_CLOSE_SUBEXP) &&
2311          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2312        {
2313          *err = REG_ERPAREN;
2314          return NULL;
2315        }
2316      /* else fall through  */
2317    case OP_CLOSE_DUP_NUM:
2318      /* We treat it as a normal character.  */
2319
2320      /* Then we can these characters as normal characters.  */
2321      token->type = CHARACTER;
2322      /* mb_partial and word_char bits should be initialized already
2323         by peek_token.  */
2324      tree = create_token_tree (dfa, NULL, NULL, token);
2325      if (BE (tree == NULL, 0))
2326        {
2327          *err = REG_ESPACE;
2328          return NULL;
2329        }
2330      break;
2331    case ANCHOR:
2332      if ((token->opr.ctx_type
2333           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2334          && dfa->word_ops_used == 0)
2335        init_word_char (dfa);
2336      if (token->opr.ctx_type == WORD_DELIM
2337          || token->opr.ctx_type == NOT_WORD_DELIM)
2338        {
2339          bin_tree_t *tree_first, *tree_last;
2340          if (token->opr.ctx_type == WORD_DELIM)
2341            {
2342              token->opr.ctx_type = WORD_FIRST;
2343              tree_first = create_token_tree (dfa, NULL, NULL, token);
2344              token->opr.ctx_type = WORD_LAST;
2345            }
2346          else
2347            {
2348              token->opr.ctx_type = INSIDE_WORD;
2349              tree_first = create_token_tree (dfa, NULL, NULL, token);
2350              token->opr.ctx_type = INSIDE_NOTWORD;
2351            }
2352          tree_last = create_token_tree (dfa, NULL, NULL, token);
2353          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2354          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2355            {
2356              *err = REG_ESPACE;
2357              return NULL;
2358            }
2359        }
2360      else
2361        {
2362          tree = create_token_tree (dfa, NULL, NULL, token);
2363          if (BE (tree == NULL, 0))
2364            {
2365              *err = REG_ESPACE;
2366              return NULL;
2367            }
2368        }
2369      /* We must return here, since ANCHORs can't be followed
2370         by repetition operators.
2371         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2372             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2373      fetch_token (token, regexp, syntax);
2374      return tree;
2375    case OP_PERIOD:
2376      tree = create_token_tree (dfa, NULL, NULL, token);
2377      if (BE (tree == NULL, 0))
2378        {
2379          *err = REG_ESPACE;
2380          return NULL;
2381        }
2382      if (dfa->mb_cur_max > 1)
2383        dfa->has_mb_node = 1;
2384      break;
2385    case OP_WORD:
2386    case OP_NOTWORD:
2387      tree = build_charclass_op (dfa, regexp->trans,
2388                                 "alnum",
2389                                 "_",
2390                                 token->type == OP_NOTWORD, err);
2391      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392        return NULL;
2393      break;
2394    case OP_SPACE:
2395    case OP_NOTSPACE:
2396      tree = build_charclass_op (dfa, regexp->trans,
2397                                 "space",
2398                                 "",
2399                                 token->type == OP_NOTSPACE, err);
2400      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2401        return NULL;
2402      break;
2403    case OP_ALT:
2404    case END_OF_RE:
2405      return NULL;
2406    case BACK_SLASH:
2407      *err = REG_EESCAPE;
2408      return NULL;
2409    default:
2410      /* Must not happen?  */
2411#ifdef DEBUG
2412      assert (0);
2413#endif
2414      return NULL;
2415    }
2416  fetch_token (token, regexp, syntax);
2417
2418  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2419         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2420    {
2421      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2422      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423        return NULL;
2424      /* In BRE consecutive duplications are not allowed.  */
2425      if ((syntax & RE_CONTEXT_INVALID_DUP)
2426          && (token->type == OP_DUP_ASTERISK
2427              || token->type == OP_OPEN_DUP_NUM))
2428        {
2429          *err = REG_BADRPT;
2430          return NULL;
2431        }
2432    }
2433
2434  return tree;
2435}
2436
2437/* This function build the following tree, from regular expression
2438   (<reg_exp>):
2439         SUBEXP
2440            |
2441        <reg_exp>
2442*/
2443
2444static bin_tree_t *
2445parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2447{
2448  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2449  bin_tree_t *tree;
2450  size_t cur_nsub;
2451  cur_nsub = preg->re_nsub++;
2452
2453  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2454
2455  /* The subexpression may be a null string.  */
2456  if (token->type == OP_CLOSE_SUBEXP)
2457    tree = NULL;
2458  else
2459    {
2460      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2462        *err = REG_EPAREN;
2463      if (BE (*err != REG_NOERROR, 0))
2464        return NULL;
2465    }
2466
2467  if (cur_nsub <= '9' - '1')
2468    dfa->completed_bkref_map |= 1 << cur_nsub;
2469
2470  tree = create_tree (dfa, tree, NULL, SUBEXP);
2471  if (BE (tree == NULL, 0))
2472    {
2473      *err = REG_ESPACE;
2474      return NULL;
2475    }
2476  tree->token.opr.idx = cur_nsub;
2477  return tree;
2478}
2479
2480/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2481
2482static bin_tree_t *
2483parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2484              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2485{
2486  bin_tree_t *tree = NULL, *old_tree = NULL;
2487  int i, start, end, start_idx = re_string_cur_idx (regexp);
2488#ifndef RE_TOKEN_INIT_BUG
2489  re_token_t start_token = *token;
2490#else
2491  re_token_t start_token;
2492
2493  memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2494#endif
2495
2496  if (token->type == OP_OPEN_DUP_NUM)
2497    {
2498      end = 0;
2499      start = fetch_number (regexp, token, syntax);
2500      if (start == -1)
2501        {
2502          if (token->type == CHARACTER && token->opr.c == ',')
2503            start = 0; /* We treat "{,m}" as "{0,m}".  */
2504          else
2505            {
2506              *err = REG_BADBR; /* <re>{} is invalid.  */
2507              return NULL;
2508            }
2509        }
2510      if (BE (start != -2, 1))
2511        {
2512          /* We treat "{n}" as "{n,n}".  */
2513          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2514                 : ((token->type == CHARACTER && token->opr.c == ',')
2515                    ? fetch_number (regexp, token, syntax) : -2));
2516        }
2517      if (BE (start == -2 || end == -2, 0))
2518        {
2519          /* Invalid sequence.  */
2520          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2521            {
2522              if (token->type == END_OF_RE)
2523                *err = REG_EBRACE;
2524              else
2525                *err = REG_BADBR;
2526
2527              return NULL;
2528            }
2529
2530          /* If the syntax bit is set, rollback.  */
2531          re_string_set_index (regexp, start_idx);
2532          *token = start_token;
2533          token->type = CHARACTER;
2534          /* mb_partial and word_char bits should be already initialized by
2535             peek_token.  */
2536          return elem;
2537        }
2538
2539      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2540        {
2541          /* First number greater than second.  */
2542          *err = REG_BADBR;
2543          return NULL;
2544        }
2545    }
2546  else
2547    {
2548      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2549      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2550    }
2551
2552  fetch_token (token, regexp, syntax);
2553
2554  if (BE (elem == NULL, 0))
2555    return NULL;
2556  if (BE (start == 0 && end == 0, 0))
2557    {
2558      postorder (elem, free_tree, NULL);
2559      return NULL;
2560    }
2561
2562  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2563  if (BE (start > 0, 0))
2564    {
2565      tree = elem;
2566      for (i = 2; i <= start; ++i)
2567        {
2568          elem = duplicate_tree (elem, dfa);
2569          tree = create_tree (dfa, tree, elem, CONCAT);
2570          if (BE (elem == NULL || tree == NULL, 0))
2571            goto parse_dup_op_espace;
2572        }
2573
2574      if (start == end)
2575        return tree;
2576
2577      /* Duplicate ELEM before it is marked optional.  */
2578      elem = duplicate_tree (elem, dfa);
2579      old_tree = tree;
2580    }
2581  else
2582    old_tree = NULL;
2583
2584  if (elem->token.type == SUBEXP)
2585    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2586
2587  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2588  if (BE (tree == NULL, 0))
2589    goto parse_dup_op_espace;
2590
2591  /* This loop is actually executed only when end != -1,
2592     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2593     already created the start+1-th copy.  */
2594  for (i = start + 2; i <= end; ++i)
2595    {
2596      elem = duplicate_tree (elem, dfa);
2597      tree = create_tree (dfa, tree, elem, CONCAT);
2598      if (BE (elem == NULL || tree == NULL, 0))
2599        goto parse_dup_op_espace;
2600
2601      tree = create_tree (dfa, tree, NULL, OP_ALT);
2602      if (BE (tree == NULL, 0))
2603        goto parse_dup_op_espace;
2604    }
2605
2606  if (old_tree)
2607    tree = create_tree (dfa, old_tree, tree, CONCAT);
2608
2609  return tree;
2610
2611 parse_dup_op_espace:
2612  *err = REG_ESPACE;
2613  return NULL;
2614}
2615
2616/* Size of the names for collating symbol/equivalence_class/character_class.
2617   I'm not sure, but maybe enough.  */
2618#define BRACKET_NAME_BUF_SIZE 32
2619
2620#ifndef _LIBC
2621  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622     Build the range expression which starts from START_ELEM, and ends
2623     at END_ELEM.  The result are written to MBCSET and SBCSET.
2624     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625     mbcset->range_ends, is a pointer argument sinse we may
2626     update it.  */
2627
2628static reg_errcode_t
2629internal_function
2630# ifdef RE_ENABLE_I18N
2631build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633# else /* not RE_ENABLE_I18N */
2634build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635                 bracket_elem_t *end_elem)
2636# endif /* not RE_ENABLE_I18N */
2637{
2638  unsigned int start_ch, end_ch;
2639  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2640  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2642          0))
2643    return REG_ERANGE;
2644
2645  /* We can handle no multi character collating elements without libc
2646     support.  */
2647  if (BE ((start_elem->type == COLL_SYM
2648           && strlen ((char *) start_elem->opr.name) > 1)
2649          || (end_elem->type == COLL_SYM
2650              && strlen ((char *) end_elem->opr.name) > 1), 0))
2651    return REG_ECOLLATE;
2652
2653# ifdef RE_ENABLE_I18N
2654  {
2655    wchar_t wc;
2656    wint_t start_wc;
2657    wint_t end_wc;
2658    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659
2660    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662                   : 0));
2663    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665                 : 0));
2666#ifdef GAWK
2667    /*
2668     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2669     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2670     * unsigned, so we don't have sign extension problems.
2671     */
2672    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673                ? start_ch : start_elem->opr.wch);
2674    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675              ? end_ch : end_elem->opr.wch);
2676#else
2677    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2678                ? __btowc (start_ch) : start_elem->opr.wch);
2679    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2680              ? __btowc (end_ch) : end_elem->opr.wch);
2681#endif
2682    if (start_wc == WEOF || end_wc == WEOF)
2683      return REG_ECOLLATE;
2684    cmp_buf[0] = start_wc;
2685    cmp_buf[4] = end_wc;
2686    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2687      return REG_ERANGE;
2688
2689    /* Got valid collation sequence values, add them as a new entry.
2690       However, for !_LIBC we have no collation elements: if the
2691       character set is single byte, the single byte character set
2692       that we build below suffices.  parse_bracket_exp passes
2693       no MBCSET if dfa->mb_cur_max == 1.  */
2694    if (mbcset)
2695      {
2696        /* Check the space of the arrays.  */
2697        if (BE (*range_alloc == mbcset->nranges, 0))
2698          {
2699            /* There is not enough space, need realloc.  */
2700            wchar_t *new_array_start, *new_array_end;
2701            int new_nranges;
2702
2703            /* +1 in case of mbcset->nranges is 0.  */
2704            new_nranges = 2 * mbcset->nranges + 1;
2705            /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706               are NULL if *range_alloc == 0.  */
2707            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708                                          new_nranges);
2709            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710                                        new_nranges);
2711
2712            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2713              return REG_ESPACE;
2714
2715            mbcset->range_starts = new_array_start;
2716            mbcset->range_ends = new_array_end;
2717            *range_alloc = new_nranges;
2718          }
2719
2720        mbcset->range_starts[mbcset->nranges] = start_wc;
2721        mbcset->range_ends[mbcset->nranges++] = end_wc;
2722      }
2723
2724    /* Build the table for single byte characters.  */
2725    for (wc = 0; wc < SBC_MAX; ++wc)
2726      {
2727        cmp_buf[2] = wc;
2728        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730          bitset_set (sbcset, wc);
2731      }
2732  }
2733# else /* not RE_ENABLE_I18N */
2734  {
2735    unsigned int ch;
2736    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2737                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2738                   : 0));
2739    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2740              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2741                 : 0));
2742    if (start_ch > end_ch)
2743      return REG_ERANGE;
2744    /* Build the table for single byte characters.  */
2745    for (ch = 0; ch < SBC_MAX; ++ch)
2746      if (start_ch <= ch  && ch <= end_ch)
2747        bitset_set (sbcset, ch);
2748  }
2749# endif /* not RE_ENABLE_I18N */
2750  return REG_NOERROR;
2751}
2752#endif /* not _LIBC */
2753
2754#ifndef _LIBC
2755/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756   Build the collating element which is represented by NAME.
2757   The result are written to MBCSET and SBCSET.
2758   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759   pointer argument since we may update it.  */
2760
2761static reg_errcode_t
2762internal_function
2763# ifdef RE_ENABLE_I18N
2764build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765                        int *coll_sym_alloc, const unsigned char *name)
2766# else /* not RE_ENABLE_I18N */
2767build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2768# endif /* not RE_ENABLE_I18N */
2769{
2770  size_t name_len = strlen ((const char *) name);
2771  if (BE (name_len != 1, 0))
2772    return REG_ECOLLATE;
2773  else
2774    {
2775      bitset_set (sbcset, name[0]);
2776      return REG_NOERROR;
2777    }
2778}
2779#endif /* not _LIBC */
2780
2781/* This function parse bracket expression like "[abc]", "[a-c]",
2782   "[[.a-a.]]" etc.  */
2783
2784static bin_tree_t *
2785parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786                   reg_syntax_t syntax, reg_errcode_t *err)
2787{
2788#ifdef _LIBC
2789  const unsigned char *collseqmb;
2790  const char *collseqwc;
2791  uint32_t nrules;
2792  int32_t table_size;
2793  const int32_t *symb_table;
2794  const unsigned char *extra;
2795
2796  /* Local function for parse_bracket_exp used in _LIBC environement.
2797     Seek the collating symbol entry correspondings to NAME.
2798     Return the index of the symbol in the SYMB_TABLE.  */
2799
2800  auto inline int32_t
2801  __attribute ((always_inline))
2802  seek_collating_symbol_entry (name, name_len)
2803         const unsigned char *name;
2804         size_t name_len;
2805    {
2806      int32_t hash = elem_hash ((const char *) name, name_len);
2807      int32_t elem = hash % table_size;
2808      if (symb_table[2 * elem] != 0)
2809        {
2810          int32_t second = hash % (table_size - 2) + 1;
2811
2812          do
2813            {
2814              /* First compare the hashing value.  */
2815              if (symb_table[2 * elem] == hash
2816                  /* Compare the length of the name.  */
2817                  && name_len == extra[symb_table[2 * elem + 1]]
2818                  /* Compare the name.  */
2819                  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2820                             name_len) == 0)
2821                {
2822                  /* Yep, this is the entry.  */
2823                  break;
2824                }
2825
2826              /* Next entry.  */
2827              elem += second;
2828            }
2829          while (symb_table[2 * elem] != 0);
2830        }
2831      return elem;
2832    }
2833
2834  /* Local function for parse_bracket_exp used in _LIBC environment.
2835     Look up the collation sequence value of BR_ELEM.
2836     Return the value if succeeded, UINT_MAX otherwise.  */
2837
2838  auto inline unsigned int
2839  __attribute ((always_inline))
2840  lookup_collation_sequence_value (br_elem)
2841         bracket_elem_t *br_elem;
2842    {
2843      if (br_elem->type == SB_CHAR)
2844        {
2845          /*
2846          if (MB_CUR_MAX == 1)
2847          */
2848          if (nrules == 0)
2849            return collseqmb[br_elem->opr.ch];
2850          else
2851            {
2852              wint_t wc = __btowc (br_elem->opr.ch);
2853              return __collseq_table_lookup (collseqwc, wc);
2854            }
2855        }
2856      else if (br_elem->type == MB_CHAR)
2857        {
2858          if (nrules != 0)
2859            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2860        }
2861      else if (br_elem->type == COLL_SYM)
2862        {
2863          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2864          if (nrules != 0)
2865            {
2866              int32_t elem, idx;
2867              elem = seek_collating_symbol_entry (br_elem->opr.name,
2868                                                  sym_name_len);
2869              if (symb_table[2 * elem] != 0)
2870                {
2871                  /* We found the entry.  */
2872                  idx = symb_table[2 * elem + 1];
2873                  /* Skip the name of collating element name.  */
2874                  idx += 1 + extra[idx];
2875                  /* Skip the byte sequence of the collating element.  */
2876                  idx += 1 + extra[idx];
2877                  /* Adjust for the alignment.  */
2878                  idx = (idx + 3) & ~3;
2879                  /* Skip the multibyte collation sequence value.  */
2880                  idx += sizeof (unsigned int);
2881                  /* Skip the wide char sequence of the collating element.  */
2882                  idx += sizeof (unsigned int) *
2883                    (1 + *(unsigned int *) (extra + idx));
2884                  /* Return the collation sequence value.  */
2885                  return *(unsigned int *) (extra + idx);
2886                }
2887              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2888                {
2889                  /* No valid character.  Match it as a single byte
2890                     character.  */
2891                  return collseqmb[br_elem->opr.name[0]];
2892                }
2893            }
2894          else if (sym_name_len == 1)
2895            return collseqmb[br_elem->opr.name[0]];
2896        }
2897      return UINT_MAX;
2898    }
2899
2900  /* Local function for parse_bracket_exp used in _LIBC environement.
2901     Build the range expression which starts from START_ELEM, and ends
2902     at END_ELEM.  The result are written to MBCSET and SBCSET.
2903     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2904     mbcset->range_ends, is a pointer argument sinse we may
2905     update it.  */
2906
2907  auto inline reg_errcode_t
2908  __attribute ((always_inline))
2909  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2910         re_charset_t *mbcset;
2911         int *range_alloc;
2912         bitset_t sbcset;
2913         bracket_elem_t *start_elem, *end_elem;
2914    {
2915      unsigned int ch;
2916      uint32_t start_collseq;
2917      uint32_t end_collseq;
2918
2919      /* Equivalence Classes and Character Classes can't be a range
2920         start/end.  */
2921      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2922              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2923              0))
2924        return REG_ERANGE;
2925
2926      start_collseq = lookup_collation_sequence_value (start_elem);
2927      end_collseq = lookup_collation_sequence_value (end_elem);
2928      /* Check start/end collation sequence values.  */
2929      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2930        return REG_ECOLLATE;
2931      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2932        return REG_ERANGE;
2933
2934      /* Got valid collation sequence values, add them as a new entry.
2935         However, if we have no collation elements, and the character set
2936         is single byte, the single byte character set that we
2937         build below suffices. */
2938      if (nrules > 0 || dfa->mb_cur_max > 1)
2939        {
2940          /* Check the space of the arrays.  */
2941          if (BE (*range_alloc == mbcset->nranges, 0))
2942            {
2943              /* There is not enough space, need realloc.  */
2944              uint32_t *new_array_start;
2945              uint32_t *new_array_end;
2946              int new_nranges;
2947
2948              /* +1 in case of mbcset->nranges is 0.  */
2949              new_nranges = 2 * mbcset->nranges + 1;
2950              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2951                                            new_nranges);
2952              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2953                                          new_nranges);
2954
2955              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2956                return REG_ESPACE;
2957
2958              mbcset->range_starts = new_array_start;
2959              mbcset->range_ends = new_array_end;
2960              *range_alloc = new_nranges;
2961            }
2962
2963          mbcset->range_starts[mbcset->nranges] = start_collseq;
2964          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2965        }
2966
2967      /* Build the table for single byte characters.  */
2968      for (ch = 0; ch < SBC_MAX; ch++)
2969        {
2970          uint32_t ch_collseq;
2971          /*
2972          if (MB_CUR_MAX == 1)
2973          */
2974          if (nrules == 0)
2975            ch_collseq = collseqmb[ch];
2976          else
2977            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2978          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2979            bitset_set (sbcset, ch);
2980        }
2981      return REG_NOERROR;
2982    }
2983
2984  /* Local function for parse_bracket_exp used in _LIBC environement.
2985     Build the collating element which is represented by NAME.
2986     The result are written to MBCSET and SBCSET.
2987     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2988     pointer argument sinse we may update it.  */
2989
2990  auto inline reg_errcode_t
2991  __attribute ((always_inline))
2992  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2993         re_charset_t *mbcset;
2994         int *coll_sym_alloc;
2995         bitset_t sbcset;
2996         const unsigned char *name;
2997    {
2998      int32_t elem, idx;
2999      size_t name_len = strlen ((const char *) name);
3000      if (nrules != 0)
3001        {
3002          elem = seek_collating_symbol_entry (name, name_len);
3003          if (symb_table[2 * elem] != 0)
3004            {
3005              /* We found the entry.  */
3006              idx = symb_table[2 * elem + 1];
3007              /* Skip the name of collating element name.  */
3008              idx += 1 + extra[idx];
3009            }
3010          else if (symb_table[2 * elem] == 0 && name_len == 1)
3011            {
3012              /* No valid character, treat it as a normal
3013                 character.  */
3014              bitset_set (sbcset, name[0]);
3015              return REG_NOERROR;
3016            }
3017          else
3018            return REG_ECOLLATE;
3019
3020          /* Got valid collation sequence, add it as a new entry.  */
3021          /* Check the space of the arrays.  */
3022          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3023            {
3024              /* Not enough, realloc it.  */
3025              /* +1 in case of mbcset->ncoll_syms is 0.  */
3026              int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3027              /* Use realloc since mbcset->coll_syms is NULL
3028                 if *alloc == 0.  */
3029              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3030                                                   new_coll_sym_alloc);
3031              if (BE (new_coll_syms == NULL, 0))
3032                return REG_ESPACE;
3033              mbcset->coll_syms = new_coll_syms;
3034              *coll_sym_alloc = new_coll_sym_alloc;
3035            }
3036          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3037          return REG_NOERROR;
3038        }
3039      else
3040        {
3041          if (BE (name_len != 1, 0))
3042            return REG_ECOLLATE;
3043          else
3044            {
3045              bitset_set (sbcset, name[0]);
3046              return REG_NOERROR;
3047            }
3048        }
3049    }
3050#endif
3051
3052  re_token_t br_token;
3053  re_bitset_ptr_t sbcset;
3054#ifdef RE_ENABLE_I18N
3055  re_charset_t *mbcset;
3056  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3057  int equiv_class_alloc = 0, char_class_alloc = 0;
3058#endif /* not RE_ENABLE_I18N */
3059  int non_match = 0;
3060  bin_tree_t *work_tree;
3061  int token_len;
3062  int first_round = 1;
3063#ifdef _LIBC
3064  collseqmb = (const unsigned char *)
3065    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3066  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3067  if (nrules)
3068    {
3069      /*
3070      if (MB_CUR_MAX > 1)
3071      */
3072      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3073      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3074      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3075                                                  _NL_COLLATE_SYMB_TABLEMB);
3076      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3077                                                   _NL_COLLATE_SYMB_EXTRAMB);
3078    }
3079#endif
3080  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3081#ifdef RE_ENABLE_I18N
3082  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3083#endif /* RE_ENABLE_I18N */
3084#ifdef RE_ENABLE_I18N
3085  if (BE (sbcset == NULL || mbcset == NULL, 0))
3086#else
3087  if (BE (sbcset == NULL, 0))
3088#endif /* RE_ENABLE_I18N */
3089    {
3090      *err = REG_ESPACE;
3091      return NULL;
3092    }
3093
3094  token_len = peek_token_bracket (token, regexp, syntax);
3095  if (BE (token->type == END_OF_RE, 0))
3096    {
3097      *err = REG_BADPAT;
3098      goto parse_bracket_exp_free_return;
3099    }
3100  if (token->type == OP_NON_MATCH_LIST)
3101    {
3102#ifdef RE_ENABLE_I18N
3103      mbcset->non_match = 1;
3104#endif /* not RE_ENABLE_I18N */
3105      non_match = 1;
3106      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3107        bitset_set (sbcset, '\n');
3108      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3109      token_len = peek_token_bracket (token, regexp, syntax);
3110      if (BE (token->type == END_OF_RE, 0))
3111        {
3112          *err = REG_BADPAT;
3113          goto parse_bracket_exp_free_return;
3114        }
3115    }
3116
3117  /* We treat the first ']' as a normal character.  */
3118  if (token->type == OP_CLOSE_BRACKET)
3119    token->type = CHARACTER;
3120
3121  while (1)
3122    {
3123      bracket_elem_t start_elem, end_elem;
3124      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3125      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3126      reg_errcode_t ret;
3127      int token_len2 = 0, is_range_exp = 0;
3128      re_token_t token2;
3129
3130      start_elem.opr.name = start_name_buf;
3131      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3132                                   syntax, first_round);
3133      if (BE (ret != REG_NOERROR, 0))
3134        {
3135          *err = ret;
3136          goto parse_bracket_exp_free_return;
3137        }
3138      first_round = 0;
3139
3140      /* Get information about the next token.  We need it in any case.  */
3141      token_len = peek_token_bracket (token, regexp, syntax);
3142
3143      /* Do not check for ranges if we know they are not allowed.  */
3144      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3145        {
3146          if (BE (token->type == END_OF_RE, 0))
3147            {
3148              *err = REG_EBRACK;
3149              goto parse_bracket_exp_free_return;
3150            }
3151          if (token->type == OP_CHARSET_RANGE)
3152            {
3153              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3154              token_len2 = peek_token_bracket (&token2, regexp, syntax);
3155              if (BE (token2.type == END_OF_RE, 0))
3156                {
3157                  *err = REG_EBRACK;
3158                  goto parse_bracket_exp_free_return;
3159                }
3160              if (token2.type == OP_CLOSE_BRACKET)
3161                {
3162                  /* We treat the last '-' as a normal character.  */
3163                  re_string_skip_bytes (regexp, -token_len);
3164                  token->type = CHARACTER;
3165                }
3166              else
3167                is_range_exp = 1;
3168            }
3169        }
3170
3171      if (is_range_exp == 1)
3172        {
3173          end_elem.opr.name = end_name_buf;
3174          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3175                                       dfa, syntax, 1);
3176          if (BE (ret != REG_NOERROR, 0))
3177            {
3178              *err = ret;
3179              goto parse_bracket_exp_free_return;
3180            }
3181
3182          token_len = peek_token_bracket (token, regexp, syntax);
3183
3184#ifdef _LIBC
3185          *err = build_range_exp (sbcset, mbcset, &range_alloc,
3186                                  &start_elem, &end_elem);
3187#else
3188# ifdef RE_ENABLE_I18N
3189          *err = build_range_exp (sbcset,
3190                                  dfa->mb_cur_max > 1 ? mbcset : NULL,
3191                                  &range_alloc, &start_elem, &end_elem);
3192# else
3193          *err = build_range_exp (sbcset, &start_elem, &end_elem);
3194# endif
3195#endif /* RE_ENABLE_I18N */
3196          if (BE (*err != REG_NOERROR, 0))
3197            goto parse_bracket_exp_free_return;
3198        }
3199      else
3200        {
3201          switch (start_elem.type)
3202            {
3203            case SB_CHAR:
3204              bitset_set (sbcset, start_elem.opr.ch);
3205              break;
3206#ifdef RE_ENABLE_I18N
3207            case MB_CHAR:
3208              /* Check whether the array has enough space.  */
3209              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3210                {
3211                  wchar_t *new_mbchars;
3212                  /* Not enough, realloc it.  */
3213                  /* +1 in case of mbcset->nmbchars is 0.  */
3214                  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3215                  /* Use realloc since array is NULL if *alloc == 0.  */
3216                  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3217                                            mbchar_alloc);
3218                  if (BE (new_mbchars == NULL, 0))
3219                    goto parse_bracket_exp_espace;
3220                  mbcset->mbchars = new_mbchars;
3221                }
3222              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3223              break;
3224#endif /* RE_ENABLE_I18N */
3225            case EQUIV_CLASS:
3226              *err = build_equiv_class (sbcset,
3227#ifdef RE_ENABLE_I18N
3228                                        mbcset, &equiv_class_alloc,
3229#endif /* RE_ENABLE_I18N */
3230                                        start_elem.opr.name);
3231              if (BE (*err != REG_NOERROR, 0))
3232                goto parse_bracket_exp_free_return;
3233              break;
3234            case COLL_SYM:
3235              *err = build_collating_symbol (sbcset,
3236#ifdef RE_ENABLE_I18N
3237                                             mbcset, &coll_sym_alloc,
3238#endif /* RE_ENABLE_I18N */
3239                                             start_elem.opr.name);
3240              if (BE (*err != REG_NOERROR, 0))
3241                goto parse_bracket_exp_free_return;
3242              break;
3243            case CHAR_CLASS:
3244              *err = build_charclass (regexp->trans, sbcset,
3245#ifdef RE_ENABLE_I18N
3246                                      mbcset, &char_class_alloc,
3247#endif /* RE_ENABLE_I18N */
3248                                      (const char *) start_elem.opr.name, syntax);
3249              if (BE (*err != REG_NOERROR, 0))
3250               goto parse_bracket_exp_free_return;
3251              break;
3252            default:
3253              assert (0);
3254              break;
3255            }
3256        }
3257      if (BE (token->type == END_OF_RE, 0))
3258        {
3259          *err = REG_EBRACK;
3260          goto parse_bracket_exp_free_return;
3261        }
3262      if (token->type == OP_CLOSE_BRACKET)
3263        break;
3264    }
3265
3266  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3267
3268  /* If it is non-matching list.  */
3269  if (non_match)
3270    bitset_not (sbcset);
3271
3272#ifdef RE_ENABLE_I18N
3273  /* Ensure only single byte characters are set.  */
3274  if (dfa->mb_cur_max > 1)
3275    bitset_mask (sbcset, dfa->sb_char);
3276
3277  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3278      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3279                                                     || mbcset->non_match)))
3280    {
3281      bin_tree_t *mbc_tree;
3282      int sbc_idx;
3283      /* Build a tree for complex bracket.  */
3284      dfa->has_mb_node = 1;
3285      br_token.type = COMPLEX_BRACKET;
3286      br_token.opr.mbcset = mbcset;
3287      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3288      if (BE (mbc_tree == NULL, 0))
3289        goto parse_bracket_exp_espace;
3290      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3291        if (sbcset[sbc_idx])
3292          break;
3293      /* If there are no bits set in sbcset, there is no point
3294         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3295      if (sbc_idx < BITSET_WORDS)
3296        {
3297          /* Build a tree for simple bracket.  */
3298          br_token.type = SIMPLE_BRACKET;
3299          br_token.opr.sbcset = sbcset;
3300          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301          if (BE (work_tree == NULL, 0))
3302            goto parse_bracket_exp_espace;
3303
3304          /* Then join them by ALT node.  */
3305          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3306          if (BE (work_tree == NULL, 0))
3307            goto parse_bracket_exp_espace;
3308        }
3309      else
3310        {
3311          re_free (sbcset);
3312          work_tree = mbc_tree;
3313        }
3314    }
3315  else
3316#endif /* not RE_ENABLE_I18N */
3317    {
3318#ifdef RE_ENABLE_I18N
3319      free_charset (mbcset);
3320#endif
3321      /* Build a tree for simple bracket.  */
3322      br_token.type = SIMPLE_BRACKET;
3323      br_token.opr.sbcset = sbcset;
3324      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3325      if (BE (work_tree == NULL, 0))
3326        goto parse_bracket_exp_espace;
3327    }
3328  return work_tree;
3329
3330 parse_bracket_exp_espace:
3331  *err = REG_ESPACE;
3332 parse_bracket_exp_free_return:
3333  re_free (sbcset);
3334#ifdef RE_ENABLE_I18N
3335  free_charset (mbcset);
3336#endif /* RE_ENABLE_I18N */
3337  return NULL;
3338}
3339
3340/* Parse an element in the bracket expression.  */
3341
3342static reg_errcode_t
3343parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3344                       re_token_t *token, int token_len, re_dfa_t *dfa,
3345                       reg_syntax_t syntax, int accept_hyphen)
3346{
3347#ifdef RE_ENABLE_I18N
3348  int cur_char_size;
3349  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3350  if (cur_char_size > 1)
3351    {
3352      elem->type = MB_CHAR;
3353      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3354      re_string_skip_bytes (regexp, cur_char_size);
3355      return REG_NOERROR;
3356    }
3357#endif /* RE_ENABLE_I18N */
3358  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3359  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3360      || token->type == OP_OPEN_EQUIV_CLASS)
3361    return parse_bracket_symbol (elem, regexp, token);
3362  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3363    {
3364      /* A '-' must only appear as anything but a range indicator before
3365         the closing bracket.  Everything else is an error.  */
3366      re_token_t token2;
3367      (void) peek_token_bracket (&token2, regexp, syntax);
3368      if (token2.type != OP_CLOSE_BRACKET)
3369        /* The actual error value is not standardized since this whole
3370           case is undefined.  But ERANGE makes good sense.  */
3371        return REG_ERANGE;
3372    }
3373  elem->type = SB_CHAR;
3374  elem->opr.ch = token->opr.c;
3375  return REG_NOERROR;
3376}
3377
3378/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3379   such as [:<character_class>:], [.<collating_element>.], and
3380   [=<equivalent_class>=].  */
3381
3382static reg_errcode_t
3383parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3384                      re_token_t *token)
3385{
3386  unsigned char ch, delim = token->opr.c;
3387  int i = 0;
3388  if (re_string_eoi(regexp))
3389    return REG_EBRACK;
3390  for (;; ++i)
3391    {
3392      if (i >= BRACKET_NAME_BUF_SIZE)
3393        return REG_EBRACK;
3394      if (token->type == OP_OPEN_CHAR_CLASS)
3395        ch = re_string_fetch_byte_case (regexp);
3396      else
3397        ch = re_string_fetch_byte (regexp);
3398      if (re_string_eoi(regexp))
3399        return REG_EBRACK;
3400      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3401        break;
3402      elem->opr.name[i] = ch;
3403    }
3404  re_string_skip_bytes (regexp, 1);
3405  elem->opr.name[i] = '\0';
3406  switch (token->type)
3407    {
3408    case OP_OPEN_COLL_ELEM:
3409      elem->type = COLL_SYM;
3410      break;
3411    case OP_OPEN_EQUIV_CLASS:
3412      elem->type = EQUIV_CLASS;
3413      break;
3414    case OP_OPEN_CHAR_CLASS:
3415      elem->type = CHAR_CLASS;
3416      break;
3417    default:
3418      break;
3419    }
3420  return REG_NOERROR;
3421}
3422
3423  /* Helper function for parse_bracket_exp.
3424     Build the equivalence class which is represented by NAME.
3425     The result are written to MBCSET and SBCSET.
3426     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3427     is a pointer argument sinse we may update it.  */
3428
3429static reg_errcode_t
3430#ifdef RE_ENABLE_I18N
3431build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3432                   int *equiv_class_alloc, const unsigned char *name)
3433#else /* not RE_ENABLE_I18N */
3434build_equiv_class (bitset_t sbcset, const unsigned char *name)
3435#endif /* not RE_ENABLE_I18N */
3436{
3437#ifdef _LIBC
3438  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3439  if (nrules != 0)
3440    {
3441      const int32_t *table, *indirect;
3442      const unsigned char *weights, *extra, *cp;
3443      unsigned char char_buf[2];
3444      int32_t idx1, idx2;
3445      unsigned int ch;
3446      size_t len;
3447      /* This #include defines a local function!  */
3448# include <locale/weight.h>
3449      /* Calculate the index for equivalence class.  */
3450      cp = name;
3451      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3452      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3453                                               _NL_COLLATE_WEIGHTMB);
3454      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455                                                   _NL_COLLATE_EXTRAMB);
3456      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3457                                                _NL_COLLATE_INDIRECTMB);
3458      idx1 = findidx (&cp);
3459      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3460        /* This isn't a valid character.  */
3461        return REG_ECOLLATE;
3462
3463      /* Build single byte matcing table for this equivalence class.  */
3464      char_buf[1] = (unsigned char) '\0';
3465      len = weights[idx1 & 0xffffff];
3466      for (ch = 0; ch < SBC_MAX; ++ch)
3467        {
3468          char_buf[0] = ch;
3469          cp = char_buf;
3470          idx2 = findidx (&cp);
3471/*
3472          idx2 = table[ch];
3473*/
3474          if (idx2 == 0)
3475            /* This isn't a valid character.  */
3476            continue;
3477          /* Compare only if the length matches and the collation rule
3478             index is the same.  */
3479          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3480            {
3481              int cnt = 0;
3482
3483              while (cnt <= len &&
3484                     weights[(idx1 & 0xffffff) + 1 + cnt]
3485                     == weights[(idx2 & 0xffffff) + 1 + cnt])
3486                ++cnt;
3487
3488              if (cnt > len)
3489                bitset_set (sbcset, ch);
3490            }
3491        }
3492      /* Check whether the array has enough space.  */
3493      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3494        {
3495          /* Not enough, realloc it.  */
3496          /* +1 in case of mbcset->nequiv_classes is 0.  */
3497          int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3498          /* Use realloc since the array is NULL if *alloc == 0.  */
3499          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3500                                                   int32_t,
3501                                                   new_equiv_class_alloc);
3502          if (BE (new_equiv_classes == NULL, 0))
3503            return REG_ESPACE;
3504          mbcset->equiv_classes = new_equiv_classes;
3505          *equiv_class_alloc = new_equiv_class_alloc;
3506        }
3507      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3508    }
3509  else
3510#endif /* _LIBC */
3511    {
3512      if (BE (strlen ((const char *) name) != 1, 0))
3513        return REG_ECOLLATE;
3514      bitset_set (sbcset, *name);
3515    }
3516  return REG_NOERROR;
3517}
3518
3519  /* Helper function for parse_bracket_exp.
3520     Build the character class which is represented by NAME.
3521     The result are written to MBCSET and SBCSET.
3522     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3523     is a pointer argument sinse we may update it.  */
3524
3525static reg_errcode_t
3526#ifdef RE_ENABLE_I18N
3527build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3528                 re_charset_t *mbcset, int *char_class_alloc,
3529                 const char *class_name, reg_syntax_t syntax)
3530#else /* not RE_ENABLE_I18N */
3531build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3532                 const char *class_name, reg_syntax_t syntax)
3533#endif /* not RE_ENABLE_I18N */
3534{
3535  int i;
3536
3537  /* In case of REG_ICASE "upper" and "lower" match the both of
3538     upper and lower cases.  */
3539  if ((syntax & RE_ICASE)
3540      && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3541    class_name = "alpha";
3542
3543#ifdef RE_ENABLE_I18N
3544  /* Check the space of the arrays.  */
3545  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3546    {
3547      /* Not enough, realloc it.  */
3548      /* +1 in case of mbcset->nchar_classes is 0.  */
3549      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3550      /* Use realloc since array is NULL if *alloc == 0.  */
3551      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3552                                               new_char_class_alloc);
3553      if (BE (new_char_classes == NULL, 0))
3554        return REG_ESPACE;
3555      mbcset->char_classes = new_char_classes;
3556      *char_class_alloc = new_char_class_alloc;
3557    }
3558  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3559#endif /* RE_ENABLE_I18N */
3560
3561#define BUILD_CHARCLASS_LOOP(ctype_func)        \
3562  do {                                          \
3563    if (BE (trans != NULL, 0))                  \
3564      {                                         \
3565        for (i = 0; i < SBC_MAX; ++i)           \
3566          if (ctype_func (i))                   \
3567            bitset_set (sbcset, trans[i]);      \
3568      }                                         \
3569    else                                        \
3570      {                                         \
3571        for (i = 0; i < SBC_MAX; ++i)           \
3572          if (ctype_func (i))                   \
3573            bitset_set (sbcset, i);             \
3574      }                                         \
3575  } while (0)
3576
3577  if (strcmp (class_name, "alnum") == 0)
3578    BUILD_CHARCLASS_LOOP (isalnum);
3579  else if (strcmp (class_name, "cntrl") == 0)
3580    BUILD_CHARCLASS_LOOP (iscntrl);
3581  else if (strcmp (class_name, "lower") == 0)
3582    BUILD_CHARCLASS_LOOP (islower);
3583  else if (strcmp (class_name, "space") == 0)
3584    BUILD_CHARCLASS_LOOP (isspace);
3585  else if (strcmp (class_name, "alpha") == 0)
3586    BUILD_CHARCLASS_LOOP (isalpha);
3587  else if (strcmp (class_name, "digit") == 0)
3588    BUILD_CHARCLASS_LOOP (isdigit);
3589  else if (strcmp (class_name, "print") == 0)
3590    BUILD_CHARCLASS_LOOP (isprint);
3591  else if (strcmp (class_name, "upper") == 0)
3592    BUILD_CHARCLASS_LOOP (isupper);
3593  else if (strcmp (class_name, "blank") == 0)
3594#ifndef GAWK
3595    BUILD_CHARCLASS_LOOP (isblank);
3596#else
3597    /* see comments above */
3598    BUILD_CHARCLASS_LOOP (is_blank);
3599#endif
3600  else if (strcmp (class_name, "graph") == 0)
3601    BUILD_CHARCLASS_LOOP (isgraph);
3602  else if (strcmp (class_name, "punct") == 0)
3603    BUILD_CHARCLASS_LOOP (ispunct);
3604  else if (strcmp (class_name, "xdigit") == 0)
3605    BUILD_CHARCLASS_LOOP (isxdigit);
3606  else
3607    return REG_ECTYPE;
3608
3609  return REG_NOERROR;
3610}
3611
3612static bin_tree_t *
3613build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3614                    const char *class_name,
3615                    const char *extra, int non_match,
3616                    reg_errcode_t *err)
3617{
3618  re_bitset_ptr_t sbcset;
3619#ifdef RE_ENABLE_I18N
3620  re_charset_t *mbcset;
3621  int alloc = 0;
3622#endif /* not RE_ENABLE_I18N */
3623  reg_errcode_t ret;
3624  re_token_t br_token;
3625  bin_tree_t *tree;
3626
3627  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3628#ifdef RE_ENABLE_I18N
3629  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3630#endif /* RE_ENABLE_I18N */
3631
3632#ifdef RE_ENABLE_I18N
3633  if (BE (sbcset == NULL || mbcset == NULL, 0))
3634#else /* not RE_ENABLE_I18N */
3635  if (BE (sbcset == NULL, 0))
3636#endif /* not RE_ENABLE_I18N */
3637    {
3638      *err = REG_ESPACE;
3639      return NULL;
3640    }
3641
3642  if (non_match)
3643    {
3644#ifdef RE_ENABLE_I18N
3645      mbcset->non_match = 1;
3646#endif /* not RE_ENABLE_I18N */
3647    }
3648
3649  /* We don't care the syntax in this case.  */
3650  ret = build_charclass (trans, sbcset,
3651#ifdef RE_ENABLE_I18N
3652                         mbcset, &alloc,
3653#endif /* RE_ENABLE_I18N */
3654                         class_name, 0);
3655
3656  if (BE (ret != REG_NOERROR, 0))
3657    {
3658      re_free (sbcset);
3659#ifdef RE_ENABLE_I18N
3660      free_charset (mbcset);
3661#endif /* RE_ENABLE_I18N */
3662      *err = ret;
3663      return NULL;
3664    }
3665  /* \w match '_' also.  */
3666  for (; *extra; extra++)
3667    bitset_set (sbcset, *extra);
3668
3669  /* If it is non-matching list.  */
3670  if (non_match)
3671    bitset_not (sbcset);
3672
3673#ifdef RE_ENABLE_I18N
3674  /* Ensure only single byte characters are set.  */
3675  if (dfa->mb_cur_max > 1)
3676    bitset_mask (sbcset, dfa->sb_char);
3677#endif
3678
3679  /* Build a tree for simple bracket.  */
3680  br_token.type = SIMPLE_BRACKET;
3681  br_token.opr.sbcset = sbcset;
3682  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3683  if (BE (tree == NULL, 0))
3684    goto build_word_op_espace;
3685
3686#ifdef RE_ENABLE_I18N
3687  if (dfa->mb_cur_max > 1)
3688    {
3689      bin_tree_t *mbc_tree;
3690      /* Build a tree for complex bracket.  */
3691      br_token.type = COMPLEX_BRACKET;
3692      br_token.opr.mbcset = mbcset;
3693      dfa->has_mb_node = 1;
3694      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3695      if (BE (mbc_tree == NULL, 0))
3696        goto build_word_op_espace;
3697      /* Then join them by ALT node.  */
3698      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3699      if (BE (mbc_tree != NULL, 1))
3700        return tree;
3701    }
3702  else
3703    {
3704      free_charset (mbcset);
3705      return tree;
3706    }
3707#else /* not RE_ENABLE_I18N */
3708  return tree;
3709#endif /* not RE_ENABLE_I18N */
3710
3711 build_word_op_espace:
3712  re_free (sbcset);
3713#ifdef RE_ENABLE_I18N
3714  free_charset (mbcset);
3715#endif /* RE_ENABLE_I18N */
3716  *err = REG_ESPACE;
3717  return NULL;
3718}
3719
3720/* This is intended for the expressions like "a{1,3}".
3721   Fetch a number from `input', and return the number.
3722   Return -1, if the number field is empty like "{,1}".
3723   Return -2, If an error is occured.  */
3724
3725static int
3726fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3727{
3728  int num = -1;
3729  unsigned char c;
3730  while (1)
3731    {
3732      fetch_token (token, input, syntax);
3733      c = token->opr.c;
3734      if (BE (token->type == END_OF_RE, 0))
3735        return -2;
3736      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3737        break;
3738      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3739             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3740      num = (num > RE_DUP_MAX) ? -2 : num;
3741    }
3742  return num;
3743}
3744\f
3745#ifdef RE_ENABLE_I18N
3746static void
3747free_charset (re_charset_t *cset)
3748{
3749  re_free (cset->mbchars);
3750# ifdef _LIBC
3751  re_free (cset->coll_syms);
3752  re_free (cset->equiv_classes);
3753  re_free (cset->range_starts);
3754  re_free (cset->range_ends);
3755# endif
3756  re_free (cset->char_classes);
3757  re_free (cset);
3758}
3759#endif /* RE_ENABLE_I18N */
3760\f
3761/* Functions for binary tree operation.  */
3762
3763/* Create a tree node.  */
3764
3765static bin_tree_t *
3766create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3767             re_token_type_t type)
3768{
3769  re_token_t t;
3770  t.type = type;
3771  return create_token_tree (dfa, left, right, &t);
3772}
3773
3774static bin_tree_t *
3775create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3776                   const re_token_t *token)
3777{
3778  bin_tree_t *tree;
3779  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3780    {
3781      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3782
3783      if (storage == NULL)
3784        return NULL;
3785      storage->next = dfa->str_tree_storage;
3786      dfa->str_tree_storage = storage;
3787      dfa->str_tree_storage_idx = 0;
3788    }
3789  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3790
3791  tree->parent = NULL;
3792  tree->left = left;
3793  tree->right = right;
3794  tree->token = *token;
3795  tree->token.duplicated = 0;
3796  tree->token.opt_subexp = 0;
3797  tree->first = NULL;
3798  tree->next = NULL;
3799  tree->node_idx = -1;
3800
3801  if (left != NULL)
3802    left->parent = tree;
3803  if (right != NULL)
3804    right->parent = tree;
3805  return tree;
3806}
3807
3808/* Mark the tree SRC as an optional subexpression.
3809   To be called from preorder or postorder.  */
3810
3811static reg_errcode_t
3812mark_opt_subexp (void *extra, bin_tree_t *node)
3813{
3814  int idx = (int) (long) extra;
3815  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3816    node->token.opt_subexp = 1;
3817
3818  return REG_NOERROR;
3819}
3820
3821/* Free the allocated memory inside NODE. */
3822
3823static void
3824free_token (re_token_t *node)
3825{
3826#ifdef RE_ENABLE_I18N
3827  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3828    free_charset (node->opr.mbcset);
3829  else
3830#endif /* RE_ENABLE_I18N */
3831    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3832      re_free (node->opr.sbcset);
3833}
3834
3835/* Worker function for tree walking.  Free the allocated memory inside NODE
3836   and its children. */
3837
3838static reg_errcode_t
3839free_tree (void *extra, bin_tree_t *node)
3840{
3841  free_token (&node->token);
3842  return REG_NOERROR;
3843}
3844
3845
3846/* Duplicate the node SRC, and return new node.  This is a preorder
3847   visit similar to the one implemented by the generic visitor, but
3848   we need more infrastructure to maintain two parallel trees --- so,
3849   it's easier to duplicate.  */
3850
3851static bin_tree_t *
3852duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3853{
3854  const bin_tree_t *node;
3855  bin_tree_t *dup_root;
3856  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3857
3858  for (node = root; ; )
3859    {
3860      /* Create a new tree and link it back to the current parent.  */
3861      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3862      if (*p_new == NULL)
3863        return NULL;
3864      (*p_new)->parent = dup_node;
3865      (*p_new)->token.duplicated = 1;
3866      dup_node = *p_new;
3867
3868      /* Go to the left node, or up and to the right.  */
3869      if (node->left)
3870        {
3871          node = node->left;
3872          p_new = &dup_node->left;
3873        }
3874      else
3875        {
3876          const bin_tree_t *prev = NULL;
3877          while (node->right == prev || node->right == NULL)
3878            {
3879              prev = node;
3880              node = node->parent;
3881              dup_node = dup_node->parent;
3882              if (!node)
3883                return dup_root;
3884            }
3885          node = node->right;
3886          p_new = &dup_node->right;
3887        }
3888    }
3889}