compat / regex / regcomp.con commit compat/regex: use the regex engine from gawk for compat (d18f76d)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                          size_t length, reg_syntax_t syntax);
  23static void re_compile_fastmap_iter (regex_t *bufp,
  24                                     const re_dfastate_t *init_state,
  25                                     char *fastmap);
  26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27#ifdef RE_ENABLE_I18N
  28static void free_charset (re_charset_t *cset);
  29#endif /* RE_ENABLE_I18N */
  30static void free_workarea_compile (regex_t *preg);
  31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32#ifdef RE_ENABLE_I18N
  33static void optimize_utf8 (re_dfa_t *dfa);
  34#endif
  35static reg_errcode_t analyze (regex_t *preg);
  36static reg_errcode_t preorder (bin_tree_t *root,
  37                               reg_errcode_t (fn (void *, bin_tree_t *)),
  38                               void *extra);
  39static reg_errcode_t postorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                 bin_tree_t *node);
  46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                   unsigned int constraint);
  52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                         int node, int root);
  55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56static int fetch_number (re_string_t *input, re_token_t *token,
  57                         reg_syntax_t syntax);
  58static int peek_token (re_token_t *token, re_string_t *input,
  59                        reg_syntax_t syntax) internal_function;
  60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                          reg_syntax_t syntax, reg_errcode_t *err);
  62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                  re_token_t *token, reg_syntax_t syntax,
  64                                  int nest, reg_errcode_t *err);
  65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                 re_token_t *token, reg_syntax_t syntax,
  67                                 int nest, reg_errcode_t *err);
  68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                     re_token_t *token, reg_syntax_t syntax,
  70                                     int nest, reg_errcode_t *err);
  71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                  re_token_t *token, reg_syntax_t syntax,
  73                                  int nest, reg_errcode_t *err);
  74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                 re_dfa_t *dfa, re_token_t *token,
  76                                 reg_syntax_t syntax, reg_errcode_t *err);
  77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                      re_token_t *token, reg_syntax_t syntax,
  79                                      reg_errcode_t *err);
  80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                            re_string_t *regexp,
  82                                            re_token_t *token, int token_len,
  83                                            re_dfa_t *dfa,
  84                                            reg_syntax_t syntax,
  85                                            int accept_hyphen);
  86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                          re_string_t *regexp,
  88                                          re_token_t *token);
  89#ifdef RE_ENABLE_I18N
  90static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                        re_charset_t *mbcset,
  92                                        int *equiv_class_alloc,
  93                                        const unsigned char *name);
  94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                      bitset_t sbcset,
  96                                      re_charset_t *mbcset,
  97                                      int *char_class_alloc,
  98                                      const char *class_name,
  99                                      reg_syntax_t syntax);
 100#else  /* not RE_ENABLE_I18N */
 101static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                        const unsigned char *name);
 103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                      bitset_t sbcset,
 105                                      const char *class_name,
 106                                      reg_syntax_t syntax);
 107#endif /* not RE_ENABLE_I18N */
 108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                       RE_TRANSLATE_TYPE trans,
 110                                       const char *class_name,
 111                                       const char *extra,
 112                                       int non_match, reg_errcode_t *err);
 113static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                bin_tree_t *left, bin_tree_t *right,
 115                                re_token_type_t type);
 116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                      bin_tree_t *left, bin_tree_t *right,
 118                                      const re_token_t *token);
 119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120static void free_token (re_token_t *node);
 121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123\f
 124/* This table gives an error message for each of the error codes listed
 125   in regex.h.  Obviously the order here has to be same as there.
 126   POSIX doesn't require that we do anything for REG_NOERROR,
 127   but why not be nice?  */
 128
 129const char __re_error_msgid[] attribute_hidden =
 130  {
 131#define REG_NOERROR_IDX 0
 132    gettext_noop ("Success")    /* REG_NOERROR */
 133    "\0"
 134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135    gettext_noop ("No match")   /* REG_NOMATCH */
 136    "\0"
 137#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139    "\0"
 140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142    "\0"
 143#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145    "\0"
 146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148    "\0"
 149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151    "\0"
 152#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153    gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154    "\0"
 155#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157    "\0"
 158#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160    "\0"
 161#define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163    "\0"
 164#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165    gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166    "\0"
 167#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168    gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169    "\0"
 170#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172    "\0"
 173#define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174    gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175    "\0"
 176#define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177    gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178    "\0"
 179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181  };
 182
 183const size_t __re_error_msgid_idx[] attribute_hidden =
 184  {
 185    REG_NOERROR_IDX,
 186    REG_NOMATCH_IDX,
 187    REG_BADPAT_IDX,
 188    REG_ECOLLATE_IDX,
 189    REG_ECTYPE_IDX,
 190    REG_EESCAPE_IDX,
 191    REG_ESUBREG_IDX,
 192    REG_EBRACK_IDX,
 193    REG_EPAREN_IDX,
 194    REG_EBRACE_IDX,
 195    REG_BADBR_IDX,
 196    REG_ERANGE_IDX,
 197    REG_ESPACE_IDX,
 198    REG_BADRPT_IDX,
 199    REG_EEND_IDX,
 200    REG_ESIZE_IDX,
 201    REG_ERPAREN_IDX
 202  };
 203\f
 204/* Entry points for GNU code.  */
 205
 206
 207#ifdef ZOS_USS
 208
 209/* For ZOS USS we must define btowc */
 210
 211wchar_t 
 212btowc (int c)
 213{
 214   wchar_t wtmp[2];
 215   char tmp[2];
 216
 217   tmp[0] = c;
 218   tmp[1] = 0;
 219
 220   mbtowc (wtmp, tmp, 1);
 221   return wtmp[0];
 222}
 223#endif
 224
 225/* re_compile_pattern is the GNU regular expression compiler: it
 226   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 227   Returns 0 if the pattern was valid, otherwise an error string.
 228
 229   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 230   are set in BUFP on entry.  */
 231
 232const char *
 233re_compile_pattern (pattern, length, bufp)
 234    const char *pattern;
 235    size_t length;
 236    struct re_pattern_buffer *bufp;
 237{
 238  reg_errcode_t ret;
 239
 240  /* And GNU code determines whether or not to get register information
 241     by passing null for the REGS argument to re_match, etc., not by
 242     setting no_sub, unless RE_NO_SUB is set.  */
 243  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 244
 245  /* Match anchors at newline.  */
 246  bufp->newline_anchor = 1;
 247
 248  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 249
 250  if (!ret)
 251    return NULL;
 252  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 253}
 254#ifdef _LIBC
 255weak_alias (__re_compile_pattern, re_compile_pattern)
 256#endif
 257
 258/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 259   also be assigned to arbitrarily: each pattern buffer stores its own
 260   syntax, so it can be changed between regex compilations.  */
 261/* This has no initializer because initialized variables in Emacs
 262   become read-only after dumping.  */
 263reg_syntax_t re_syntax_options;
 264
 265
 266/* Specify the precise syntax of regexps for compilation.  This provides
 267   for compatibility for various utilities which historically have
 268   different, incompatible syntaxes.
 269
 270   The argument SYNTAX is a bit mask comprised of the various bits
 271   defined in regex.h.  We return the old syntax.  */
 272
 273reg_syntax_t
 274re_set_syntax (syntax)
 275    reg_syntax_t syntax;
 276{
 277  reg_syntax_t ret = re_syntax_options;
 278
 279  re_syntax_options = syntax;
 280  return ret;
 281}
 282#ifdef _LIBC
 283weak_alias (__re_set_syntax, re_set_syntax)
 284#endif
 285
 286int
 287re_compile_fastmap (bufp)
 288    struct re_pattern_buffer *bufp;
 289{
 290  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 291  char *fastmap = bufp->fastmap;
 292
 293  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 294  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 295  if (dfa->init_state != dfa->init_state_word)
 296    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 297  if (dfa->init_state != dfa->init_state_nl)
 298    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 299  if (dfa->init_state != dfa->init_state_begbuf)
 300    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 301  bufp->fastmap_accurate = 1;
 302  return 0;
 303}
 304#ifdef _LIBC
 305weak_alias (__re_compile_fastmap, re_compile_fastmap)
 306#endif
 307
 308static inline void
 309__attribute ((always_inline))
 310re_set_fastmap (char *fastmap, int icase, int ch)
 311{
 312  fastmap[ch] = 1;
 313  if (icase)
 314    fastmap[tolower (ch)] = 1;
 315}
 316
 317/* Helper function for re_compile_fastmap.
 318   Compile fastmap for the initial_state INIT_STATE.  */
 319
 320static void
 321re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 322                         char *fastmap)
 323{
 324  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 325  int node_cnt;
 326  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 327  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 328    {
 329      int node = init_state->nodes.elems[node_cnt];
 330      re_token_type_t type = dfa->nodes[node].type;
 331
 332      if (type == CHARACTER)
 333        {
 334          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 335#ifdef RE_ENABLE_I18N
 336          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 337            {
 338              unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 339              wchar_t wc;
 340              mbstate_t state;
 341
 342              p = buf;
 343              *p++ = dfa->nodes[node].opr.c;
 344              while (++node < dfa->nodes_len
 345                     && dfa->nodes[node].type == CHARACTER
 346                     && dfa->nodes[node].mb_partial)
 347                *p++ = dfa->nodes[node].opr.c;
 348              memset (&state, '\0', sizeof (state));
 349              if (__mbrtowc (&wc, (const char *) buf, p - buf,
 350                             &state) == p - buf
 351                  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 352                      != (size_t) -1))
 353                re_set_fastmap (fastmap, 0, buf[0]);
 354              re_free (buf);
 355            }
 356#endif
 357        }
 358      else if (type == SIMPLE_BRACKET)
 359        {
 360          int i, ch;
 361          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 362            {
 363              int j;
 364              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 365              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 366                if (w & ((bitset_word_t) 1 << j))
 367                  re_set_fastmap (fastmap, icase, ch);
 368            }
 369        }
 370#ifdef RE_ENABLE_I18N
 371      else if (type == COMPLEX_BRACKET)
 372        {
 373          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 374          int i;
 375
 376# ifdef _LIBC
 377          /* See if we have to try all bytes which start multiple collation
 378             elements.
 379             e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 380                  collation element, and don't catch 'b' since 'b' is
 381                  the only collation element which starts from 'b' (and
 382                  it is caught by SIMPLE_BRACKET).  */
 383              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 384                  && (cset->ncoll_syms || cset->nranges))
 385                {
 386                  const int32_t *table = (const int32_t *)
 387                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 388                  for (i = 0; i < SBC_MAX; ++i)
 389                    if (table[i] < 0)
 390                      re_set_fastmap (fastmap, icase, i);
 391                }
 392# endif /* _LIBC */
 393
 394          /* See if we have to start the match at all multibyte characters,
 395             i.e. where we would not find an invalid sequence.  This only
 396             applies to multibyte character sets; for single byte character
 397             sets, the SIMPLE_BRACKET again suffices.  */
 398          if (dfa->mb_cur_max > 1
 399              && (cset->nchar_classes || cset->non_match || cset->nranges
 400# ifdef _LIBC
 401                  || cset->nequiv_classes
 402# endif /* _LIBC */
 403                 ))
 404            {
 405              unsigned char c = 0;
 406              do
 407                {
 408                  mbstate_t mbs;
 409                  memset (&mbs, 0, sizeof (mbs));
 410                  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 411                    re_set_fastmap (fastmap, false, (int) c);
 412                }
 413              while (++c != 0);
 414            }
 415
 416          else
 417            {
 418              /* ... Else catch all bytes which can start the mbchars.  */
 419              for (i = 0; i < cset->nmbchars; ++i)
 420                {
 421                  char buf[256];
 422                  mbstate_t state;
 423                  memset (&state, '\0', sizeof (state));
 424                  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 425                    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 426                  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 427                    {
 428                      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 429                          != (size_t) -1)
 430                        re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 431                    }
 432                }
 433            }
 434        }
 435#endif /* RE_ENABLE_I18N */
 436      else if (type == OP_PERIOD
 437#ifdef RE_ENABLE_I18N
 438               || type == OP_UTF8_PERIOD
 439#endif /* RE_ENABLE_I18N */
 440               || type == END_OF_RE)
 441        {
 442          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 443          if (type == END_OF_RE)
 444            bufp->can_be_null = 1;
 445          return;
 446        }
 447    }
 448}
 449\f
 450/* Entry point for POSIX code.  */
 451/* regcomp takes a regular expression as a string and compiles it.
 452
 453   PREG is a regex_t *.  We do not expect any fields to be initialized,
 454   since POSIX says we shouldn't.  Thus, we set
 455
 456     `buffer' to the compiled pattern;
 457     `used' to the length of the compiled pattern;
 458     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 459       REG_EXTENDED bit in CFLAGS is set; otherwise, to
 460       RE_SYNTAX_POSIX_BASIC;
 461     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 462     `fastmap' to an allocated space for the fastmap;
 463     `fastmap_accurate' to zero;
 464     `re_nsub' to the number of subexpressions in PATTERN.
 465
 466   PATTERN is the address of the pattern string.
 467
 468   CFLAGS is a series of bits which affect compilation.
 469
 470     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 471     use POSIX basic syntax.
 472
 473     If REG_NEWLINE is set, then . and [^...] don't match newline.
 474     Also, regexec will try a match beginning after every newline.
 475
 476     If REG_ICASE is set, then we considers upper- and lowercase
 477     versions of letters to be equivalent when matching.
 478
 479     If REG_NOSUB is set, then when PREG is passed to regexec, that
 480     routine will report only success or failure, and nothing about the
 481     registers.
 482
 483   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 484   the return codes and their meanings.)  */
 485
 486int
 487regcomp (preg, pattern, cflags)
 488    regex_t *__restrict preg;
 489    const char *__restrict pattern;
 490    int cflags;
 491{
 492  reg_errcode_t ret;
 493  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 494                         : RE_SYNTAX_POSIX_BASIC);
 495
 496  preg->buffer = NULL;
 497  preg->allocated = 0;
 498  preg->used = 0;
 499
 500  /* Try to allocate space for the fastmap.  */
 501  preg->fastmap = re_malloc (char, SBC_MAX);
 502  if (BE (preg->fastmap == NULL, 0))
 503    return REG_ESPACE;
 504
 505  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 506
 507  /* If REG_NEWLINE is set, newlines are treated differently.  */
 508  if (cflags & REG_NEWLINE)
 509    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 510      syntax &= ~RE_DOT_NEWLINE;
 511      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 512      /* It also changes the matching behavior.  */
 513      preg->newline_anchor = 1;
 514    }
 515  else
 516    preg->newline_anchor = 0;
 517  preg->no_sub = !!(cflags & REG_NOSUB);
 518  preg->translate = NULL;
 519
 520  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 521
 522  /* POSIX doesn't distinguish between an unmatched open-group and an
 523     unmatched close-group: both are REG_EPAREN.  */
 524  if (ret == REG_ERPAREN)
 525    ret = REG_EPAREN;
 526
 527  /* We have already checked preg->fastmap != NULL.  */
 528  if (BE (ret == REG_NOERROR, 1))
 529    /* Compute the fastmap now, since regexec cannot modify the pattern
 530       buffer.  This function never fails in this implementation.  */
 531    (void) re_compile_fastmap (preg);
 532  else
 533    {
 534      /* Some error occurred while compiling the expression.  */
 535      re_free (preg->fastmap);
 536      preg->fastmap = NULL;
 537    }
 538
 539  return (int) ret;
 540}
 541#ifdef _LIBC
 542weak_alias (__regcomp, regcomp)
 543#endif
 544
 545/* Returns a message corresponding to an error code, ERRCODE, returned
 546   from either regcomp or regexec.   We don't use PREG here.  */
 547
 548size_t
 549regerror (errcode, preg, errbuf, errbuf_size)
 550    int errcode;
 551    const regex_t *__restrict preg;
 552    char *__restrict errbuf;
 553    size_t errbuf_size;
 554{
 555  const char *msg;
 556  size_t msg_size;
 557
 558  if (BE (errcode < 0
 559          || errcode >= (int) (sizeof (__re_error_msgid_idx)
 560                               / sizeof (__re_error_msgid_idx[0])), 0))
 561    /* Only error codes returned by the rest of the code should be passed
 562       to this routine.  If we are given anything else, or if other regex
 563       code generates an invalid error code, then the program has a bug.
 564       Dump core so we can fix it.  */
 565    abort ();
 566
 567  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 568
 569  msg_size = strlen (msg) + 1; /* Includes the null.  */
 570
 571  if (BE (errbuf_size != 0, 1))
 572    {
 573      if (BE (msg_size > errbuf_size, 0))
 574        {
 575          memcpy (errbuf, msg, errbuf_size - 1);
 576          errbuf[errbuf_size - 1] = 0;
 577        }
 578      else
 579        memcpy (errbuf, msg, msg_size);
 580    }
 581
 582  return msg_size;
 583}
 584#ifdef _LIBC
 585weak_alias (__regerror, regerror)
 586#endif
 587
 588
 589#ifdef RE_ENABLE_I18N
 590/* This static array is used for the map to single-byte characters when
 591   UTF-8 is used.  Otherwise we would allocate memory just to initialize
 592   it the same all the time.  UTF-8 is the preferred encoding so this is
 593   a worthwhile optimization.  */
 594#if __GNUC__ >= 3
 595static const bitset_t utf8_sb_map = {
 596  /* Set the first 128 bits.  */
 597  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 598};
 599#else /* ! (__GNUC__ >= 3) */
 600static bitset_t utf8_sb_map;
 601#endif /* __GNUC__ >= 3 */
 602#endif /* RE_ENABLE_I18N */
 603
 604
 605static void
 606free_dfa_content (re_dfa_t *dfa)
 607{
 608  int i, j;
 609
 610  if (dfa->nodes)
 611    for (i = 0; i < dfa->nodes_len; ++i)
 612      free_token (dfa->nodes + i);
 613  re_free (dfa->nexts);
 614  for (i = 0; i < dfa->nodes_len; ++i)
 615    {
 616      if (dfa->eclosures != NULL)
 617        re_node_set_free (dfa->eclosures + i);
 618      if (dfa->inveclosures != NULL)
 619        re_node_set_free (dfa->inveclosures + i);
 620      if (dfa->edests != NULL)
 621        re_node_set_free (dfa->edests + i);
 622    }
 623  re_free (dfa->edests);
 624  re_free (dfa->eclosures);
 625  re_free (dfa->inveclosures);
 626  re_free (dfa->nodes);
 627
 628  if (dfa->state_table)
 629    for (i = 0; i <= dfa->state_hash_mask; ++i)
 630      {
 631        struct re_state_table_entry *entry = dfa->state_table + i;
 632        for (j = 0; j < entry->num; ++j)
 633          {
 634            re_dfastate_t *state = entry->array[j];
 635            free_state (state);
 636          }
 637        re_free (entry->array);
 638      }
 639  re_free (dfa->state_table);
 640#ifdef RE_ENABLE_I18N
 641  if (dfa->sb_char != utf8_sb_map)
 642    re_free (dfa->sb_char);
 643#endif
 644  re_free (dfa->subexp_map);
 645#ifdef DEBUG
 646  re_free (dfa->re_str);
 647#endif
 648
 649  re_free (dfa);
 650}
 651
 652
 653/* Free dynamically allocated space used by PREG.  */
 654
 655void
 656regfree (preg)
 657    regex_t *preg;
 658{
 659  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 660  if (BE (dfa != NULL, 1))
 661    free_dfa_content (dfa);
 662  preg->buffer = NULL;
 663  preg->allocated = 0;
 664
 665  re_free (preg->fastmap);
 666  preg->fastmap = NULL;
 667
 668  re_free (preg->translate);
 669  preg->translate = NULL;
 670}
 671#ifdef _LIBC
 672weak_alias (__regfree, regfree)
 673#endif
 674\f
 675/* Entry points compatible with 4.2 BSD regex library.  We don't define
 676   them unless specifically requested.  */
 677
 678#if defined _REGEX_RE_COMP || defined _LIBC
 679
 680/* BSD has one and only one pattern buffer.  */
 681static struct re_pattern_buffer re_comp_buf;
 682
 683char *
 684# ifdef _LIBC
 685/* Make these definitions weak in libc, so POSIX programs can redefine
 686   these names if they don't use our functions, and still use
 687   regcomp/regexec above without link errors.  */
 688weak_function
 689# endif
 690re_comp (s)
 691     const char *s;
 692{
 693  reg_errcode_t ret;
 694  char *fastmap;
 695
 696  if (!s)
 697    {
 698      if (!re_comp_buf.buffer)
 699        return gettext ("No previous regular expression");
 700      return 0;
 701    }
 702
 703  if (re_comp_buf.buffer)
 704    {
 705      fastmap = re_comp_buf.fastmap;
 706      re_comp_buf.fastmap = NULL;
 707      __regfree (&re_comp_buf);
 708      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 709      re_comp_buf.fastmap = fastmap;
 710    }
 711
 712  if (re_comp_buf.fastmap == NULL)
 713    {
 714      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 715      if (re_comp_buf.fastmap == NULL)
 716        return (char *) gettext (__re_error_msgid
 717                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
 718    }
 719
 720  /* Since `re_exec' always passes NULL for the `regs' argument, we
 721     don't need to initialize the pattern buffer fields which affect it.  */
 722
 723  /* Match anchors at newlines.  */
 724  re_comp_buf.newline_anchor = 1;
 725
 726  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 727
 728  if (!ret)
 729    return NULL;
 730
 731  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 732  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 733}
 734
 735#ifdef _LIBC
 736libc_freeres_fn (free_mem)
 737{
 738  __regfree (&re_comp_buf);
 739}
 740#endif
 741
 742#endif /* _REGEX_RE_COMP */
 743\f
 744/* Internal entry point.
 745   Compile the regular expression PATTERN, whose length is LENGTH.
 746   SYNTAX indicate regular expression's syntax.  */
 747
 748static reg_errcode_t
 749re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 750                     reg_syntax_t syntax)
 751{
 752  reg_errcode_t err = REG_NOERROR;
 753  re_dfa_t *dfa;
 754  re_string_t regexp;
 755
 756  /* Initialize the pattern buffer.  */
 757  preg->fastmap_accurate = 0;
 758  preg->syntax = syntax;
 759  preg->not_bol = preg->not_eol = 0;
 760  preg->used = 0;
 761  preg->re_nsub = 0;
 762  preg->can_be_null = 0;
 763  preg->regs_allocated = REGS_UNALLOCATED;
 764
 765  /* Initialize the dfa.  */
 766  dfa = (re_dfa_t *) preg->buffer;
 767  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 768    {
 769      /* If zero allocated, but buffer is non-null, try to realloc
 770         enough space.  This loses if buffer's address is bogus, but
 771         that is the user's responsibility.  If ->buffer is NULL this
 772         is a simple allocation.  */
 773      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 774      if (dfa == NULL)
 775        return REG_ESPACE;
 776      preg->allocated = sizeof (re_dfa_t);
 777      preg->buffer = (unsigned char *) dfa;
 778    }
 779  preg->used = sizeof (re_dfa_t);
 780
 781  err = init_dfa (dfa, length);
 782  if (BE (err != REG_NOERROR, 0))
 783    {
 784      free_dfa_content (dfa);
 785      preg->buffer = NULL;
 786      preg->allocated = 0;
 787      return err;
 788    }
 789#ifdef DEBUG
 790  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 791  dfa->re_str = re_malloc (char, length + 1);
 792  strncpy (dfa->re_str, pattern, length + 1);
 793#endif
 794
 795  __libc_lock_init (dfa->lock);
 796
 797  err = re_string_construct (&regexp, pattern, length, preg->translate,
 798                             syntax & RE_ICASE, dfa);
 799  if (BE (err != REG_NOERROR, 0))
 800    {
 801    re_compile_internal_free_return:
 802      free_workarea_compile (preg);
 803      re_string_destruct (&regexp);
 804      free_dfa_content (dfa);
 805      preg->buffer = NULL;
 806      preg->allocated = 0;
 807      return err;
 808    }
 809
 810  /* Parse the regular expression, and build a structure tree.  */
 811  preg->re_nsub = 0;
 812  dfa->str_tree = parse (&regexp, preg, syntax, &err);
 813  if (BE (dfa->str_tree == NULL, 0))
 814    goto re_compile_internal_free_return;
 815
 816  /* Analyze the tree and create the nfa.  */
 817  err = analyze (preg);
 818  if (BE (err != REG_NOERROR, 0))
 819    goto re_compile_internal_free_return;
 820
 821#ifdef RE_ENABLE_I18N
 822  /* If possible, do searching in single byte encoding to speed things up.  */
 823  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 824    optimize_utf8 (dfa);
 825#endif
 826
 827  /* Then create the initial state of the dfa.  */
 828  err = create_initial_state (dfa);
 829
 830  /* Release work areas.  */
 831  free_workarea_compile (preg);
 832  re_string_destruct (&regexp);
 833
 834  if (BE (err != REG_NOERROR, 0))
 835    {
 836      free_dfa_content (dfa);
 837      preg->buffer = NULL;
 838      preg->allocated = 0;
 839    }
 840
 841  return err;
 842}
 843
 844/* Initialize DFA.  We use the length of the regular expression PAT_LEN
 845   as the initial length of some arrays.  */
 846
 847static reg_errcode_t
 848init_dfa (re_dfa_t *dfa, size_t pat_len)
 849{
 850  unsigned int table_size;
 851#ifndef _LIBC
 852  char *codeset_name;
 853#endif
 854
 855  memset (dfa, '\0', sizeof (re_dfa_t));
 856
 857  /* Force allocation of str_tree_storage the first time.  */
 858  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 859
 860  /* Avoid overflows.  */
 861  if (pat_len == SIZE_MAX)
 862    return REG_ESPACE;
 863
 864  dfa->nodes_alloc = pat_len + 1;
 865  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 866
 867  /*  table_size = 2 ^ ceil(log pat_len) */
 868  for (table_size = 1; ; table_size <<= 1)
 869    if (table_size > pat_len)
 870      break;
 871
 872  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 873  dfa->state_hash_mask = table_size - 1;
 874
 875  dfa->mb_cur_max = MB_CUR_MAX;
 876#ifdef _LIBC
 877  if (dfa->mb_cur_max == 6
 878      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 879    dfa->is_utf8 = 1;
 880  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 881                       != 0);
 882#else
 883# ifdef HAVE_LANGINFO_CODESET
 884  codeset_name = nl_langinfo (CODESET);
 885# else
 886  codeset_name = getenv ("LC_ALL");
 887  if (codeset_name == NULL || codeset_name[0] == '\0')
 888    codeset_name = getenv ("LC_CTYPE");
 889  if (codeset_name == NULL || codeset_name[0] == '\0')
 890    codeset_name = getenv ("LANG");
 891  if (codeset_name == NULL)
 892    codeset_name = "";
 893  else if (strchr (codeset_name, '.') !=  NULL)
 894    codeset_name = strchr (codeset_name, '.') + 1;
 895# endif
 896
 897  /* strcasecmp isn't a standard interface. brute force check */
 898#if 0
 899  if (strcasecmp (codeset_name, "UTF-8") == 0
 900      || strcasecmp (codeset_name, "UTF8") == 0)
 901    dfa->is_utf8 = 1;
 902#else
 903  if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 904      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 905      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 906      && (codeset_name[3] == '-'
 907          ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 908          : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 909    dfa->is_utf8 = 1;
 910#endif
 911
 912  /* We check exhaustively in the loop below if this charset is a
 913     superset of ASCII.  */
 914  dfa->map_notascii = 0;
 915#endif
 916
 917#ifdef RE_ENABLE_I18N
 918  if (dfa->mb_cur_max > 1)
 919    {
 920      if (dfa->is_utf8)
 921        {
 922#if !defined(__GNUC__) || __GNUC__ < 3
 923          static short utf8_sb_map_inited = 0;
 924
 925          if (! utf8_sb_map_inited)
 926            {
 927                int i;
 928
 929                utf8_sb_map_inited = 0;
 930                for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 931                  utf8_sb_map[i] = BITSET_WORD_MAX;
 932            }
 933#endif
 934          dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 935        }
 936      else
 937        {
 938          int i, j, ch;
 939
 940          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 941          if (BE (dfa->sb_char == NULL, 0))
 942            return REG_ESPACE;
 943
 944          /* Set the bits corresponding to single byte chars.  */
 945          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 946            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 947              {
 948                wint_t wch = __btowc (ch);
 949                if (wch != WEOF)
 950                  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 951# ifndef _LIBC
 952                if (isascii (ch) && wch != ch)
 953                  dfa->map_notascii = 1;
 954# endif
 955              }
 956        }
 957    }
 958#endif
 959
 960  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 961    return REG_ESPACE;
 962  return REG_NOERROR;
 963}
 964
 965/* Initialize WORD_CHAR table, which indicate which character is
 966   "word".  In this case "word" means that it is the word construction
 967   character used by some operators like "\<", "\>", etc.  */
 968
 969static void
 970internal_function
 971init_word_char (re_dfa_t *dfa)
 972{
 973  int i, j, ch;
 974  dfa->word_ops_used = 1;
 975  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 976    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 977      if (isalnum (ch) || ch == '_')
 978        dfa->word_char[i] |= (bitset_word_t) 1 << j;
 979}
 980
 981/* Free the work area which are only used while compiling.  */
 982
 983static void
 984free_workarea_compile (regex_t *preg)
 985{
 986  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 987  bin_tree_storage_t *storage, *next;
 988  for (storage = dfa->str_tree_storage; storage; storage = next)
 989    {
 990      next = storage->next;
 991      re_free (storage);
 992    }
 993  dfa->str_tree_storage = NULL;
 994  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 995  dfa->str_tree = NULL;
 996  re_free (dfa->org_indices);
 997  dfa->org_indices = NULL;
 998}
 999
1000/* Create initial states for all contexts.  */
1001
1002static reg_errcode_t
1003create_initial_state (re_dfa_t *dfa)
1004{
1005  int first, i;
1006  reg_errcode_t err;
1007  re_node_set init_nodes;
1008
1009  /* Initial states have the epsilon closure of the node which is
1010     the first node of the regular expression.  */
1011  first = dfa->str_tree->first->node_idx;
1012  dfa->init_node = first;
1013  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1014  if (BE (err != REG_NOERROR, 0))
1015    return err;
1016
1017  /* The back-references which are in initial states can epsilon transit,
1018     since in this case all of the subexpressions can be null.
1019     Then we add epsilon closures of the nodes which are the next nodes of
1020     the back-references.  */
1021  if (dfa->nbackref > 0)
1022    for (i = 0; i < init_nodes.nelem; ++i)
1023      {
1024        int node_idx = init_nodes.elems[i];
1025        re_token_type_t type = dfa->nodes[node_idx].type;
1026
1027        int clexp_idx;
1028        if (type != OP_BACK_REF)
1029          continue;
1030        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1031          {
1032            re_token_t *clexp_node;
1033            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1034            if (clexp_node->type == OP_CLOSE_SUBEXP
1035                && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1036              break;
1037          }
1038        if (clexp_idx == init_nodes.nelem)
1039          continue;
1040
1041        if (type == OP_BACK_REF)
1042          {
1043            int dest_idx = dfa->edests[node_idx].elems[0];
1044            if (!re_node_set_contains (&init_nodes, dest_idx))
1045              {
1046                reg_errcode_t err = re_node_set_merge (&init_nodes,
1047                                                       dfa->eclosures
1048                                                       + dest_idx);
1049                if (err != REG_NOERROR)
1050                  return err;
1051                i = 0;
1052              }
1053          }
1054      }
1055
1056  /* It must be the first time to invoke acquire_state.  */
1057  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1058  /* We don't check ERR here, since the initial state must not be NULL.  */
1059  if (BE (dfa->init_state == NULL, 0))
1060    return err;
1061  if (dfa->init_state->has_constraint)
1062    {
1063      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1064                                                       CONTEXT_WORD);
1065      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1066                                                     CONTEXT_NEWLINE);
1067      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1068                                                         &init_nodes,
1069                                                         CONTEXT_NEWLINE
1070                                                         | CONTEXT_BEGBUF);
1071      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1072              || dfa->init_state_begbuf == NULL, 0))
1073        return err;
1074    }
1075  else
1076    dfa->init_state_word = dfa->init_state_nl
1077      = dfa->init_state_begbuf = dfa->init_state;
1078
1079  re_node_set_free (&init_nodes);
1080  return REG_NOERROR;
1081}
1082\f
1083#ifdef RE_ENABLE_I18N
1084/* If it is possible to do searching in single byte encoding instead of UTF-8
1085   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1086   DFA nodes where needed.  */
1087
1088static void
1089optimize_utf8 (re_dfa_t *dfa)
1090{
1091  int node, i, mb_chars = 0, has_period = 0;
1092
1093  for (node = 0; node < dfa->nodes_len; ++node)
1094    switch (dfa->nodes[node].type)
1095      {
1096      case CHARACTER:
1097        if (dfa->nodes[node].opr.c >= 0x80)
1098          mb_chars = 1;
1099        break;
1100      case ANCHOR:
1101        switch (dfa->nodes[node].opr.ctx_type)
1102          {
1103          case LINE_FIRST:
1104          case LINE_LAST:
1105          case BUF_FIRST:
1106          case BUF_LAST:
1107            break;
1108          default:
1109            /* Word anchors etc. cannot be handled.  It's okay to test
1110               opr.ctx_type since constraints (for all DFA nodes) are
1111               created by ORing one or more opr.ctx_type values.  */
1112            return;
1113          }
1114        break;
1115      case OP_PERIOD:
1116        has_period = 1;
1117        break;
1118      case OP_BACK_REF:
1119      case OP_ALT:
1120      case END_OF_RE:
1121      case OP_DUP_ASTERISK:
1122      case OP_OPEN_SUBEXP:
1123      case OP_CLOSE_SUBEXP:
1124        break;
1125      case COMPLEX_BRACKET:
1126        return;
1127      case SIMPLE_BRACKET:
1128        /* Just double check.  The non-ASCII range starts at 0x80.  */
1129        assert (0x80 % BITSET_WORD_BITS == 0);
1130        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131          if (dfa->nodes[node].opr.sbcset[i])
1132            return;
1133        break;
1134      default:
1135        abort ();
1136      }
1137
1138  if (mb_chars || has_period)
1139    for (node = 0; node < dfa->nodes_len; ++node)
1140      {
1141        if (dfa->nodes[node].type == CHARACTER
1142            && dfa->nodes[node].opr.c >= 0x80)
1143          dfa->nodes[node].mb_partial = 0;
1144        else if (dfa->nodes[node].type == OP_PERIOD)
1145          dfa->nodes[node].type = OP_UTF8_PERIOD;
1146      }
1147
1148  /* The search can be in single byte locale.  */
1149  dfa->mb_cur_max = 1;
1150  dfa->is_utf8 = 0;
1151  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1152}
1153#endif
1154\f
1155/* Analyze the structure tree, and calculate "first", "next", "edest",
1156   "eclosure", and "inveclosure".  */
1157
1158static reg_errcode_t
1159analyze (regex_t *preg)
1160{
1161  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1162  reg_errcode_t ret;
1163
1164  /* Allocate arrays.  */
1165  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1166  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1167  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1168  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1169  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1170          || dfa->eclosures == NULL, 0))
1171    return REG_ESPACE;
1172
1173  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1174  if (dfa->subexp_map != NULL)
1175    {
1176      int i;
1177      for (i = 0; i < preg->re_nsub; i++)
1178        dfa->subexp_map[i] = i;
1179      preorder (dfa->str_tree, optimize_subexps, dfa);
1180      for (i = 0; i < preg->re_nsub; i++)
1181        if (dfa->subexp_map[i] != i)
1182          break;
1183      if (i == preg->re_nsub)
1184        {
1185          free (dfa->subexp_map);
1186          dfa->subexp_map = NULL;
1187        }
1188    }
1189
1190  ret = postorder (dfa->str_tree, lower_subexps, preg);
1191  if (BE (ret != REG_NOERROR, 0))
1192    return ret;
1193  ret = postorder (dfa->str_tree, calc_first, dfa);
1194  if (BE (ret != REG_NOERROR, 0))
1195    return ret;
1196  preorder (dfa->str_tree, calc_next, dfa);
1197  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1198  if (BE (ret != REG_NOERROR, 0))
1199    return ret;
1200  ret = calc_eclosure (dfa);
1201  if (BE (ret != REG_NOERROR, 0))
1202    return ret;
1203
1204  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1205     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1206  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1207      || dfa->nbackref)
1208    {
1209      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1210      if (BE (dfa->inveclosures == NULL, 0))
1211        return REG_ESPACE;
1212      ret = calc_inveclosure (dfa);
1213    }
1214
1215  return ret;
1216}
1217
1218/* Our parse trees are very unbalanced, so we cannot use a stack to
1219   implement parse tree visits.  Instead, we use parent pointers and
1220   some hairy code in these two functions.  */
1221static reg_errcode_t
1222postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1223           void *extra)
1224{
1225  bin_tree_t *node, *prev;
1226
1227  for (node = root; ; )
1228    {
1229      /* Descend down the tree, preferably to the left (or to the right
1230         if that's the only child).  */
1231      while (node->left || node->right)
1232        if (node->left)
1233          node = node->left;
1234        else
1235          node = node->right;
1236
1237      do
1238        {
1239          reg_errcode_t err = fn (extra, node);
1240          if (BE (err != REG_NOERROR, 0))
1241            return err;
1242          if (node->parent == NULL)
1243            return REG_NOERROR;
1244          prev = node;
1245          node = node->parent;
1246        }
1247      /* Go up while we have a node that is reached from the right.  */
1248      while (node->right == prev || node->right == NULL);
1249      node = node->right;
1250    }
1251}
1252
1253static reg_errcode_t
1254preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1255          void *extra)
1256{
1257  bin_tree_t *node;
1258
1259  for (node = root; ; )
1260    {
1261      reg_errcode_t err = fn (extra, node);
1262      if (BE (err != REG_NOERROR, 0))
1263        return err;
1264
1265      /* Go to the left node, or up and to the right.  */
1266      if (node->left)
1267        node = node->left;
1268      else
1269        {
1270          bin_tree_t *prev = NULL;
1271          while (node->right == prev || node->right == NULL)
1272            {
1273              prev = node;
1274              node = node->parent;
1275              if (!node)
1276                return REG_NOERROR;
1277            }
1278          node = node->right;
1279        }
1280    }
1281}
1282
1283/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1284   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1285   backreferences as well.  Requires a preorder visit.  */
1286static reg_errcode_t
1287optimize_subexps (void *extra, bin_tree_t *node)
1288{
1289  re_dfa_t *dfa = (re_dfa_t *) extra;
1290
1291  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1292    {
1293      int idx = node->token.opr.idx;
1294      node->token.opr.idx = dfa->subexp_map[idx];
1295      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1296    }
1297
1298  else if (node->token.type == SUBEXP
1299           && node->left && node->left->token.type == SUBEXP)
1300    {
1301      int other_idx = node->left->token.opr.idx;
1302
1303      node->left = node->left->left;
1304      if (node->left)
1305        node->left->parent = node;
1306
1307      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1308      if (other_idx < BITSET_WORD_BITS)
1309          dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1310    }
1311
1312  return REG_NOERROR;
1313}
1314
1315/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1316   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1317static reg_errcode_t
1318lower_subexps (void *extra, bin_tree_t *node)
1319{
1320  regex_t *preg = (regex_t *) extra;
1321  reg_errcode_t err = REG_NOERROR;
1322
1323  if (node->left && node->left->token.type == SUBEXP)
1324    {
1325      node->left = lower_subexp (&err, preg, node->left);
1326      if (node->left)
1327        node->left->parent = node;
1328    }
1329  if (node->right && node->right->token.type == SUBEXP)
1330    {
1331      node->right = lower_subexp (&err, preg, node->right);
1332      if (node->right)
1333        node->right->parent = node;
1334    }
1335
1336  return err;
1337}
1338
1339static bin_tree_t *
1340lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1341{
1342  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1343  bin_tree_t *body = node->left;
1344  bin_tree_t *op, *cls, *tree1, *tree;
1345
1346  if (preg->no_sub
1347      /* We do not optimize empty subexpressions, because otherwise we may
1348         have bad CONCAT nodes with NULL children.  This is obviously not
1349         very common, so we do not lose much.  An example that triggers
1350         this case is the sed "script" /\(\)/x.  */
1351      && node->left != NULL
1352      && (node->token.opr.idx >= BITSET_WORD_BITS
1353          || !(dfa->used_bkref_map
1354               & ((bitset_word_t) 1 << node->token.opr.idx))))
1355    return node->left;
1356
1357  /* Convert the SUBEXP node to the concatenation of an
1358     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1359  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1360  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1361  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1362  tree = create_tree (dfa, op, tree1, CONCAT);
1363  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1364    {
1365      *err = REG_ESPACE;
1366      return NULL;
1367    }
1368
1369  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1370  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1371  return tree;
1372}
1373
1374/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1375   nodes.  Requires a postorder visit.  */
1376static reg_errcode_t
1377calc_first (void *extra, bin_tree_t *node)
1378{
1379  re_dfa_t *dfa = (re_dfa_t *) extra;
1380  if (node->token.type == CONCAT)
1381    {
1382      node->first = node->left->first;
1383      node->node_idx = node->left->node_idx;
1384    }
1385  else
1386    {
1387      node->first = node;
1388      node->node_idx = re_dfa_add_node (dfa, node->token);
1389      if (BE (node->node_idx == -1, 0))
1390        return REG_ESPACE;
1391      if (node->token.type == ANCHOR)
1392        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1393    }
1394  return REG_NOERROR;
1395}
1396
1397/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1398static reg_errcode_t
1399calc_next (void *extra, bin_tree_t *node)
1400{
1401  switch (node->token.type)
1402    {
1403    case OP_DUP_ASTERISK:
1404      node->left->next = node;
1405      break;
1406    case CONCAT:
1407      node->left->next = node->right->first;
1408      node->right->next = node->next;
1409      break;
1410    default:
1411      if (node->left)
1412        node->left->next = node->next;
1413      if (node->right)
1414        node->right->next = node->next;
1415      break;
1416    }
1417  return REG_NOERROR;
1418}
1419
1420/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1421static reg_errcode_t
1422link_nfa_nodes (void *extra, bin_tree_t *node)
1423{
1424  re_dfa_t *dfa = (re_dfa_t *) extra;
1425  int idx = node->node_idx;
1426  reg_errcode_t err = REG_NOERROR;
1427
1428  switch (node->token.type)
1429    {
1430    case CONCAT:
1431      break;
1432
1433    case END_OF_RE:
1434      assert (node->next == NULL);
1435      break;
1436
1437    case OP_DUP_ASTERISK:
1438    case OP_ALT:
1439      {
1440        int left, right;
1441        dfa->has_plural_match = 1;
1442        if (node->left != NULL)
1443          left = node->left->first->node_idx;
1444        else
1445          left = node->next->node_idx;
1446        if (node->right != NULL)
1447          right = node->right->first->node_idx;
1448        else
1449          right = node->next->node_idx;
1450        assert (left > -1);
1451        assert (right > -1);
1452        err = re_node_set_init_2 (dfa->edests + idx, left, right);
1453      }
1454      break;
1455
1456    case ANCHOR:
1457    case OP_OPEN_SUBEXP:
1458    case OP_CLOSE_SUBEXP:
1459      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1460      break;
1461
1462    case OP_BACK_REF:
1463      dfa->nexts[idx] = node->next->node_idx;
1464      if (node->token.type == OP_BACK_REF)
1465        err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1466      break;
1467
1468    default:
1469      assert (!IS_EPSILON_NODE (node->token.type));
1470      dfa->nexts[idx] = node->next->node_idx;
1471      break;
1472    }
1473
1474  return err;
1475}
1476
1477/* Duplicate the epsilon closure of the node ROOT_NODE.
1478   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1479   to their own constraint.  */
1480
1481static reg_errcode_t
1482internal_function
1483duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1484                        int root_node, unsigned int init_constraint)
1485{
1486  int org_node, clone_node, ret;
1487  unsigned int constraint = init_constraint;
1488  for (org_node = top_org_node, clone_node = top_clone_node;;)
1489    {
1490      int org_dest, clone_dest;
1491      if (dfa->nodes[org_node].type == OP_BACK_REF)
1492        {
1493          /* If the back reference epsilon-transit, its destination must
1494             also have the constraint.  Then duplicate the epsilon closure
1495             of the destination of the back reference, and store it in
1496             edests of the back reference.  */
1497          org_dest = dfa->nexts[org_node];
1498          re_node_set_empty (dfa->edests + clone_node);
1499          clone_dest = duplicate_node (dfa, org_dest, constraint);
1500          if (BE (clone_dest == -1, 0))
1501            return REG_ESPACE;
1502          dfa->nexts[clone_node] = dfa->nexts[org_node];
1503          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504          if (BE (ret < 0, 0))
1505            return REG_ESPACE;
1506        }
1507      else if (dfa->edests[org_node].nelem == 0)
1508        {
1509          /* In case of the node can't epsilon-transit, don't duplicate the
1510             destination and store the original destination as the
1511             destination of the node.  */
1512          dfa->nexts[clone_node] = dfa->nexts[org_node];
1513          break;
1514        }
1515      else if (dfa->edests[org_node].nelem == 1)
1516        {
1517          /* In case of the node can epsilon-transit, and it has only one
1518             destination.  */
1519          org_dest = dfa->edests[org_node].elems[0];
1520          re_node_set_empty (dfa->edests + clone_node);
1521          /* If the node is root_node itself, it means the epsilon clsoure
1522             has a loop.   Then tie it to the destination of the root_node.  */
1523          if (org_node == root_node && clone_node != org_node)
1524            {
1525              ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1526              if (BE (ret < 0, 0))
1527                return REG_ESPACE;
1528              break;
1529            }
1530          /* In case of the node has another constraint, add it.  */
1531          constraint |= dfa->nodes[org_node].constraint;
1532          clone_dest = duplicate_node (dfa, org_dest, constraint);
1533          if (BE (clone_dest == -1, 0))
1534            return REG_ESPACE;
1535          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536          if (BE (ret < 0, 0))
1537            return REG_ESPACE;
1538        }
1539      else /* dfa->edests[org_node].nelem == 2 */
1540        {
1541          /* In case of the node can epsilon-transit, and it has two
1542             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1543          org_dest = dfa->edests[org_node].elems[0];
1544          re_node_set_empty (dfa->edests + clone_node);
1545          /* Search for a duplicated node which satisfies the constraint.  */
1546          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1547          if (clone_dest == -1)
1548            {
1549              /* There is no such duplicated node, create a new one.  */
1550              reg_errcode_t err;
1551              clone_dest = duplicate_node (dfa, org_dest, constraint);
1552              if (BE (clone_dest == -1, 0))
1553                return REG_ESPACE;
1554              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555              if (BE (ret < 0, 0))
1556                return REG_ESPACE;
1557              err = duplicate_node_closure (dfa, org_dest, clone_dest,
1558                                            root_node, constraint);
1559              if (BE (err != REG_NOERROR, 0))
1560                return err;
1561            }
1562          else
1563            {
1564              /* There is a duplicated node which satisfies the constraint,
1565                 use it to avoid infinite loop.  */
1566              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567              if (BE (ret < 0, 0))
1568                return REG_ESPACE;
1569            }
1570
1571          org_dest = dfa->edests[org_node].elems[1];
1572          clone_dest = duplicate_node (dfa, org_dest, constraint);
1573          if (BE (clone_dest == -1, 0))
1574            return REG_ESPACE;
1575          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1576          if (BE (ret < 0, 0))
1577            return REG_ESPACE;
1578        }
1579      org_node = org_dest;
1580      clone_node = clone_dest;
1581    }
1582  return REG_NOERROR;
1583}
1584
1585/* Search for a node which is duplicated from the node ORG_NODE, and
1586   satisfies the constraint CONSTRAINT.  */
1587
1588static int
1589search_duplicated_node (const re_dfa_t *dfa, int org_node,
1590                        unsigned int constraint)
1591{
1592  int idx;
1593  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1594    {
1595      if (org_node == dfa->org_indices[idx]
1596          && constraint == dfa->nodes[idx].constraint)
1597        return idx; /* Found.  */
1598    }
1599  return -1; /* Not found.  */
1600}
1601
1602/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1603   Return the index of the new node, or -1 if insufficient storage is
1604   available.  */
1605
1606static int
1607duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1608{
1609  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1610  if (BE (dup_idx != -1, 1))
1611    {
1612      dfa->nodes[dup_idx].constraint = constraint;
1613      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1614      dfa->nodes[dup_idx].duplicated = 1;
1615
1616      /* Store the index of the original node.  */
1617      dfa->org_indices[dup_idx] = org_idx;
1618    }
1619  return dup_idx;
1620}
1621
1622static reg_errcode_t
1623calc_inveclosure (re_dfa_t *dfa)
1624{
1625  int src, idx, ret;
1626  for (idx = 0; idx < dfa->nodes_len; ++idx)
1627    re_node_set_init_empty (dfa->inveclosures + idx);
1628
1629  for (src = 0; src < dfa->nodes_len; ++src)
1630    {
1631      int *elems = dfa->eclosures[src].elems;
1632      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1633        {
1634          ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1635          if (BE (ret == -1, 0))
1636            return REG_ESPACE;
1637        }
1638    }
1639
1640  return REG_NOERROR;
1641}
1642
1643/* Calculate "eclosure" for all the node in DFA.  */
1644
1645static reg_errcode_t
1646calc_eclosure (re_dfa_t *dfa)
1647{
1648  int node_idx, incomplete;
1649#ifdef DEBUG
1650  assert (dfa->nodes_len > 0);
1651#endif
1652  incomplete = 0;
1653  /* For each nodes, calculate epsilon closure.  */
1654  for (node_idx = 0; ; ++node_idx)
1655    {
1656      reg_errcode_t err;
1657      re_node_set eclosure_elem;
1658      if (node_idx == dfa->nodes_len)
1659        {
1660          if (!incomplete)
1661            break;
1662          incomplete = 0;
1663          node_idx = 0;
1664        }
1665
1666#ifdef DEBUG
1667      assert (dfa->eclosures[node_idx].nelem != -1);
1668#endif
1669
1670      /* If we have already calculated, skip it.  */
1671      if (dfa->eclosures[node_idx].nelem != 0)
1672        continue;
1673      /* Calculate epsilon closure of `node_idx'.  */
1674      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1675      if (BE (err != REG_NOERROR, 0))
1676        return err;
1677
1678      if (dfa->eclosures[node_idx].nelem == 0)
1679        {
1680          incomplete = 1;
1681          re_node_set_free (&eclosure_elem);
1682        }
1683    }
1684  return REG_NOERROR;
1685}
1686
1687/* Calculate epsilon closure of NODE.  */
1688
1689static reg_errcode_t
1690calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1691{
1692  reg_errcode_t err;
1693  int i;
1694  re_node_set eclosure;
1695  int ret;
1696  int incomplete = 0;
1697  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698  if (BE (err != REG_NOERROR, 0))
1699    return err;
1700
1701  /* This indicates that we are calculating this node now.
1702     We reference this value to avoid infinite loop.  */
1703  dfa->eclosures[node].nelem = -1;
1704
1705  /* If the current node has constraints, duplicate all nodes
1706     since they must inherit the constraints.  */
1707  if (dfa->nodes[node].constraint
1708      && dfa->edests[node].nelem
1709      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1710    {
1711      err = duplicate_node_closure (dfa, node, node, node,
1712                                    dfa->nodes[node].constraint);
1713      if (BE (err != REG_NOERROR, 0))
1714        return err;
1715    }
1716
1717  /* Expand each epsilon destination nodes.  */
1718  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1719    for (i = 0; i < dfa->edests[node].nelem; ++i)
1720      {
1721        re_node_set eclosure_elem;
1722        int edest = dfa->edests[node].elems[i];
1723        /* If calculating the epsilon closure of `edest' is in progress,
1724           return intermediate result.  */
1725        if (dfa->eclosures[edest].nelem == -1)
1726          {
1727            incomplete = 1;
1728            continue;
1729          }
1730        /* If we haven't calculated the epsilon closure of `edest' yet,
1731           calculate now. Otherwise use calculated epsilon closure.  */
1732        if (dfa->eclosures[edest].nelem == 0)
1733          {
1734            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1735            if (BE (err != REG_NOERROR, 0))
1736              return err;
1737          }
1738        else
1739          eclosure_elem = dfa->eclosures[edest];
1740        /* Merge the epsilon closure of `edest'.  */
1741        err = re_node_set_merge (&eclosure, &eclosure_elem);
1742        if (BE (err != REG_NOERROR, 0))
1743          return err;
1744        /* If the epsilon closure of `edest' is incomplete,
1745           the epsilon closure of this node is also incomplete.  */
1746        if (dfa->eclosures[edest].nelem == 0)
1747          {
1748            incomplete = 1;
1749            re_node_set_free (&eclosure_elem);
1750          }
1751      }
1752
1753  /* An epsilon closure includes itself.  */
1754  ret = re_node_set_insert (&eclosure, node);
1755  if (BE (ret < 0, 0))
1756    return REG_ESPACE;
1757  if (incomplete && !root)
1758    dfa->eclosures[node].nelem = 0;
1759  else
1760    dfa->eclosures[node] = eclosure;
1761  *new_set = eclosure;
1762  return REG_NOERROR;
1763}
1764\f
1765/* Functions for token which are used in the parser.  */
1766
1767/* Fetch a token from INPUT.
1768   We must not use this function inside bracket expressions.  */
1769
1770static void
1771internal_function
1772fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773{
1774  re_string_skip_bytes (input, peek_token (result, input, syntax));
1775}
1776
1777/* Peek a token from INPUT, and return the length of the token.
1778   We must not use this function inside bracket expressions.  */
1779
1780static int
1781internal_function
1782peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1783{
1784  unsigned char c;
1785
1786  if (re_string_eoi (input))
1787    {
1788      token->type = END_OF_RE;
1789      return 0;
1790    }
1791
1792  c = re_string_peek_byte (input, 0);
1793  token->opr.c = c;
1794
1795  token->word_char = 0;
1796#ifdef RE_ENABLE_I18N
1797  token->mb_partial = 0;
1798  if (input->mb_cur_max > 1 &&
1799      !re_string_first_byte (input, re_string_cur_idx (input)))
1800    {
1801      token->type = CHARACTER;
1802      token->mb_partial = 1;
1803      return 1;
1804    }
1805#endif
1806  if (c == '\\')
1807    {
1808      unsigned char c2;
1809      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1810        {
1811          token->type = BACK_SLASH;
1812          return 1;
1813        }
1814
1815      c2 = re_string_peek_byte_case (input, 1);
1816      token->opr.c = c2;
1817      token->type = CHARACTER;
1818#ifdef RE_ENABLE_I18N
1819      if (input->mb_cur_max > 1)
1820        {
1821          wint_t wc = re_string_wchar_at (input,
1822                                          re_string_cur_idx (input) + 1);
1823          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1824        }
1825      else
1826#endif
1827        token->word_char = IS_WORD_CHAR (c2) != 0;
1828
1829      switch (c2)
1830        {
1831        case '|':
1832          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833            token->type = OP_ALT;
1834          break;
1835        case '1': case '2': case '3': case '4': case '5':
1836        case '6': case '7': case '8': case '9':
1837          if (!(syntax & RE_NO_BK_REFS))
1838            {
1839              token->type = OP_BACK_REF;
1840              token->opr.idx = c2 - '1';
1841            }
1842          break;
1843        case '<':
1844          if (!(syntax & RE_NO_GNU_OPS))
1845            {
1846              token->type = ANCHOR;
1847              token->opr.ctx_type = WORD_FIRST;
1848            }
1849          break;
1850        case '>':
1851          if (!(syntax & RE_NO_GNU_OPS))
1852            {
1853              token->type = ANCHOR;
1854              token->opr.ctx_type = WORD_LAST;
1855            }
1856          break;
1857        case 'b':
1858          if (!(syntax & RE_NO_GNU_OPS))
1859            {
1860              token->type = ANCHOR;
1861              token->opr.ctx_type = WORD_DELIM;
1862            }
1863          break;
1864        case 'B':
1865          if (!(syntax & RE_NO_GNU_OPS))
1866            {
1867              token->type = ANCHOR;
1868              token->opr.ctx_type = NOT_WORD_DELIM;
1869            }
1870          break;
1871        case 'w':
1872          if (!(syntax & RE_NO_GNU_OPS))
1873            token->type = OP_WORD;
1874          break;
1875        case 'W':
1876          if (!(syntax & RE_NO_GNU_OPS))
1877            token->type = OP_NOTWORD;
1878          break;
1879        case 's':
1880          if (!(syntax & RE_NO_GNU_OPS))
1881            token->type = OP_SPACE;
1882          break;
1883        case 'S':
1884          if (!(syntax & RE_NO_GNU_OPS))
1885            token->type = OP_NOTSPACE;
1886          break;
1887        case '`':
1888          if (!(syntax & RE_NO_GNU_OPS))
1889            {
1890              token->type = ANCHOR;
1891              token->opr.ctx_type = BUF_FIRST;
1892            }
1893          break;
1894        case '\'':
1895          if (!(syntax & RE_NO_GNU_OPS))
1896            {
1897              token->type = ANCHOR;
1898              token->opr.ctx_type = BUF_LAST;
1899            }
1900          break;
1901        case '(':
1902          if (!(syntax & RE_NO_BK_PARENS))
1903            token->type = OP_OPEN_SUBEXP;
1904          break;
1905        case ')':
1906          if (!(syntax & RE_NO_BK_PARENS))
1907            token->type = OP_CLOSE_SUBEXP;
1908          break;
1909        case '+':
1910          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911            token->type = OP_DUP_PLUS;
1912          break;
1913        case '?':
1914          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915            token->type = OP_DUP_QUESTION;
1916          break;
1917        case '{':
1918          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919            token->type = OP_OPEN_DUP_NUM;
1920          break;
1921        case '}':
1922          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923            token->type = OP_CLOSE_DUP_NUM;
1924          break;
1925        default:
1926          break;
1927        }
1928      return 2;
1929    }
1930
1931  token->type = CHARACTER;
1932#ifdef RE_ENABLE_I18N
1933  if (input->mb_cur_max > 1)
1934    {
1935      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1937    }
1938  else
1939#endif
1940    token->word_char = IS_WORD_CHAR (token->opr.c);
1941
1942  switch (c)
1943    {
1944    case '\n':
1945      if (syntax & RE_NEWLINE_ALT)
1946        token->type = OP_ALT;
1947      break;
1948    case '|':
1949      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1950        token->type = OP_ALT;
1951      break;
1952    case '*':
1953      token->type = OP_DUP_ASTERISK;
1954      break;
1955    case '+':
1956      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957        token->type = OP_DUP_PLUS;
1958      break;
1959    case '?':
1960      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961        token->type = OP_DUP_QUESTION;
1962      break;
1963    case '{':
1964      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965        token->type = OP_OPEN_DUP_NUM;
1966      break;
1967    case '}':
1968      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969        token->type = OP_CLOSE_DUP_NUM;
1970      break;
1971    case '(':
1972      if (syntax & RE_NO_BK_PARENS)
1973        token->type = OP_OPEN_SUBEXP;
1974      break;
1975    case ')':
1976      if (syntax & RE_NO_BK_PARENS)
1977        token->type = OP_CLOSE_SUBEXP;
1978      break;
1979    case '[':
1980      token->type = OP_OPEN_BRACKET;
1981      break;
1982    case '.':
1983      token->type = OP_PERIOD;
1984      break;
1985    case '^':
1986      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1987          re_string_cur_idx (input) != 0)
1988        {
1989          char prev = re_string_peek_byte (input, -1);
1990          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1991            break;
1992        }
1993      token->type = ANCHOR;
1994      token->opr.ctx_type = LINE_FIRST;
1995      break;
1996    case '$':
1997      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1998          re_string_cur_idx (input) + 1 != re_string_length (input))
1999        {
2000          re_token_t next;
2001          re_string_skip_bytes (input, 1);
2002          peek_token (&next, input, syntax);
2003          re_string_skip_bytes (input, -1);
2004          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005            break;
2006        }
2007      token->type = ANCHOR;
2008      token->opr.ctx_type = LINE_LAST;
2009      break;
2010    default:
2011      break;
2012    }
2013  return 1;
2014}
2015
2016/* Peek a token from INPUT, and return the length of the token.
2017   We must not use this function out of bracket expressions.  */
2018
2019static int
2020internal_function
2021peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2022{
2023  unsigned char c;
2024  if (re_string_eoi (input))
2025    {
2026      token->type = END_OF_RE;
2027      return 0;
2028    }
2029  c = re_string_peek_byte (input, 0);
2030  token->opr.c = c;
2031
2032#ifdef RE_ENABLE_I18N
2033  if (input->mb_cur_max > 1 &&
2034      !re_string_first_byte (input, re_string_cur_idx (input)))
2035    {
2036      token->type = CHARACTER;
2037      return 1;
2038    }
2039#endif /* RE_ENABLE_I18N */
2040
2041  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2042      && re_string_cur_idx (input) + 1 < re_string_length (input))
2043    {
2044      /* In this case, '\' escape a character.  */
2045      unsigned char c2;
2046      re_string_skip_bytes (input, 1);
2047      c2 = re_string_peek_byte (input, 0);
2048      token->opr.c = c2;
2049      token->type = CHARACTER;
2050      return 1;
2051    }
2052  if (c == '[') /* '[' is a special char in a bracket exps.  */
2053    {
2054      unsigned char c2;
2055      int token_len;
2056      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2057        c2 = re_string_peek_byte (input, 1);
2058      else
2059        c2 = 0;
2060      token->opr.c = c2;
2061      token_len = 2;
2062      switch (c2)
2063        {
2064        case '.':
2065          token->type = OP_OPEN_COLL_ELEM;
2066          break;
2067        case '=':
2068          token->type = OP_OPEN_EQUIV_CLASS;
2069          break;
2070        case ':':
2071          if (syntax & RE_CHAR_CLASSES)
2072            {
2073              token->type = OP_OPEN_CHAR_CLASS;
2074              break;
2075            }
2076          /* else fall through.  */
2077        default:
2078          token->type = CHARACTER;
2079          token->opr.c = c;
2080          token_len = 1;
2081          break;
2082        }
2083      return token_len;
2084    }
2085  switch (c)
2086    {
2087    case '-':
2088      token->type = OP_CHARSET_RANGE;
2089      break;
2090    case ']':
2091      token->type = OP_CLOSE_BRACKET;
2092      break;
2093    case '^':
2094      token->type = OP_NON_MATCH_LIST;
2095      break;
2096    default:
2097      token->type = CHARACTER;
2098    }
2099  return 1;
2100}
2101\f
2102/* Functions for parser.  */
2103
2104/* Entry point of the parser.
2105   Parse the regular expression REGEXP and return the structure tree.
2106   If an error is occured, ERR is set by error code, and return NULL.
2107   This function build the following tree, from regular expression <reg_exp>:
2108           CAT
2109           / \
2110          /   \
2111   <reg_exp>  EOR
2112
2113   CAT means concatenation.
2114   EOR means end of regular expression.  */
2115
2116static bin_tree_t *
2117parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2118       reg_errcode_t *err)
2119{
2120  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2121  bin_tree_t *tree, *eor, *root;
2122  re_token_t current_token;
2123  dfa->syntax = syntax;
2124  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2126  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2127    return NULL;
2128  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129  if (tree != NULL)
2130    root = create_tree (dfa, tree, eor, CONCAT);
2131  else
2132    root = eor;
2133  if (BE (eor == NULL || root == NULL, 0))
2134    {
2135      *err = REG_ESPACE;
2136      return NULL;
2137    }
2138  return root;
2139}
2140
2141/* This function build the following tree, from regular expression
2142   <branch1>|<branch2>:
2143           ALT
2144           / \
2145          /   \
2146   <branch1> <branch2>
2147
2148   ALT means alternative, which represents the operator `|'.  */
2149
2150static bin_tree_t *
2151parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2153{
2154  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2155  bin_tree_t *tree, *branch = NULL;
2156  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2157  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2158    return NULL;
2159
2160  while (token->type == OP_ALT)
2161    {
2162      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2163      if (token->type != OP_ALT && token->type != END_OF_RE
2164          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165        {
2166          branch = parse_branch (regexp, preg, token, syntax, nest, err);
2167          if (BE (*err != REG_NOERROR && branch == NULL, 0))
2168            return NULL;
2169        }
2170      else
2171        branch = NULL;
2172      tree = create_tree (dfa, tree, branch, OP_ALT);
2173      if (BE (tree == NULL, 0))
2174        {
2175          *err = REG_ESPACE;
2176          return NULL;
2177        }
2178    }
2179  return tree;
2180}
2181
2182/* This function build the following tree, from regular expression
2183   <exp1><exp2>:
2184        CAT
2185        / \
2186       /   \
2187   <exp1> <exp2>
2188
2189   CAT means concatenation.  */
2190
2191static bin_tree_t *
2192parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2193              reg_syntax_t syntax, int nest, reg_errcode_t *err)
2194{
2195  bin_tree_t *tree, *exp;
2196  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2197  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2198  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199    return NULL;
2200
2201  while (token->type != OP_ALT && token->type != END_OF_RE
2202         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2203    {
2204      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2205      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2206        {
2207          return NULL;
2208        }
2209      if (tree != NULL && exp != NULL)
2210        {
2211          tree = create_tree (dfa, tree, exp, CONCAT);
2212          if (tree == NULL)
2213            {
2214              *err = REG_ESPACE;
2215              return NULL;
2216            }
2217        }
2218      else if (tree == NULL)
2219        tree = exp;
2220      /* Otherwise exp == NULL, we don't need to create new tree.  */
2221    }
2222  return tree;
2223}
2224
2225/* This function build the following tree, from regular expression a*:
2226         *
2227         |
2228         a
2229*/
2230
2231static bin_tree_t *
2232parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2233                  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2234{
2235  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2236  bin_tree_t *tree;
2237  switch (token->type)
2238    {
2239    case CHARACTER:
2240      tree = create_token_tree (dfa, NULL, NULL, token);
2241      if (BE (tree == NULL, 0))
2242        {
2243          *err = REG_ESPACE;
2244          return NULL;
2245        }
2246#ifdef RE_ENABLE_I18N
2247      if (dfa->mb_cur_max > 1)
2248        {
2249          while (!re_string_eoi (regexp)
2250                 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2251            {
2252              bin_tree_t *mbc_remain;
2253              fetch_token (token, regexp, syntax);
2254              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2255              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2256              if (BE (mbc_remain == NULL || tree == NULL, 0))
2257                {
2258                  *err = REG_ESPACE;
2259                  return NULL;
2260                }
2261            }
2262        }
2263#endif
2264      break;
2265    case OP_OPEN_SUBEXP:
2266      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2267      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268        return NULL;
2269      break;
2270    case OP_OPEN_BRACKET:
2271      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2272      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2273        return NULL;
2274      break;
2275    case OP_BACK_REF:
2276      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2277        {
2278          *err = REG_ESUBREG;
2279          return NULL;
2280        }
2281      dfa->used_bkref_map |= 1 << token->opr.idx;
2282      tree = create_token_tree (dfa, NULL, NULL, token);
2283      if (BE (tree == NULL, 0))
2284        {
2285          *err = REG_ESPACE;
2286          return NULL;
2287        }
2288      ++dfa->nbackref;
2289      dfa->has_mb_node = 1;
2290      break;
2291    case OP_OPEN_DUP_NUM:
2292      if (syntax & RE_CONTEXT_INVALID_DUP)
2293        {
2294          *err = REG_BADRPT;
2295          return NULL;
2296        }
2297      /* FALLTHROUGH */
2298    case OP_DUP_ASTERISK:
2299    case OP_DUP_PLUS:
2300    case OP_DUP_QUESTION:
2301      if (syntax & RE_CONTEXT_INVALID_OPS)
2302        {
2303          *err = REG_BADRPT;
2304          return NULL;
2305        }
2306      else if (syntax & RE_CONTEXT_INDEP_OPS)
2307        {
2308          fetch_token (token, regexp, syntax);
2309          return parse_expression (regexp, preg, token, syntax, nest, err);
2310        }
2311      /* else fall through  */
2312    case OP_CLOSE_SUBEXP:
2313      if ((token->type == OP_CLOSE_SUBEXP) &&
2314          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2315        {
2316          *err = REG_ERPAREN;
2317          return NULL;
2318        }
2319      /* else fall through  */
2320    case OP_CLOSE_DUP_NUM:
2321      /* We treat it as a normal character.  */
2322
2323      /* Then we can these characters as normal characters.  */
2324      token->type = CHARACTER;
2325      /* mb_partial and word_char bits should be initialized already
2326         by peek_token.  */
2327      tree = create_token_tree (dfa, NULL, NULL, token);
2328      if (BE (tree == NULL, 0))
2329        {
2330          *err = REG_ESPACE;
2331          return NULL;
2332        }
2333      break;
2334    case ANCHOR:
2335      if ((token->opr.ctx_type
2336           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2337          && dfa->word_ops_used == 0)
2338        init_word_char (dfa);
2339      if (token->opr.ctx_type == WORD_DELIM
2340          || token->opr.ctx_type == NOT_WORD_DELIM)
2341        {
2342          bin_tree_t *tree_first, *tree_last;
2343          if (token->opr.ctx_type == WORD_DELIM)
2344            {
2345              token->opr.ctx_type = WORD_FIRST;
2346              tree_first = create_token_tree (dfa, NULL, NULL, token);
2347              token->opr.ctx_type = WORD_LAST;
2348            }
2349          else
2350            {
2351              token->opr.ctx_type = INSIDE_WORD;
2352              tree_first = create_token_tree (dfa, NULL, NULL, token);
2353              token->opr.ctx_type = INSIDE_NOTWORD;
2354            }
2355          tree_last = create_token_tree (dfa, NULL, NULL, token);
2356          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2357          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2358            {
2359              *err = REG_ESPACE;
2360              return NULL;
2361            }
2362        }
2363      else
2364        {
2365          tree = create_token_tree (dfa, NULL, NULL, token);
2366          if (BE (tree == NULL, 0))
2367            {
2368              *err = REG_ESPACE;
2369              return NULL;
2370            }
2371        }
2372      /* We must return here, since ANCHORs can't be followed
2373         by repetition operators.
2374         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2375             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2376      fetch_token (token, regexp, syntax);
2377      return tree;
2378    case OP_PERIOD:
2379      tree = create_token_tree (dfa, NULL, NULL, token);
2380      if (BE (tree == NULL, 0))
2381        {
2382          *err = REG_ESPACE;
2383          return NULL;
2384        }
2385      if (dfa->mb_cur_max > 1)
2386        dfa->has_mb_node = 1;
2387      break;
2388    case OP_WORD:
2389    case OP_NOTWORD:
2390      tree = build_charclass_op (dfa, regexp->trans,
2391                                 "alnum",
2392                                 "_",
2393                                 token->type == OP_NOTWORD, err);
2394      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395        return NULL;
2396      break;
2397    case OP_SPACE:
2398    case OP_NOTSPACE:
2399      tree = build_charclass_op (dfa, regexp->trans,
2400                                 "space",
2401                                 "",
2402                                 token->type == OP_NOTSPACE, err);
2403      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2404        return NULL;
2405      break;
2406    case OP_ALT:
2407    case END_OF_RE:
2408      return NULL;
2409    case BACK_SLASH:
2410      *err = REG_EESCAPE;
2411      return NULL;
2412    default:
2413      /* Must not happen?  */
2414#ifdef DEBUG
2415      assert (0);
2416#endif
2417      return NULL;
2418    }
2419  fetch_token (token, regexp, syntax);
2420
2421  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2422         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2423    {
2424      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2425      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426        return NULL;
2427      /* In BRE consecutive duplications are not allowed.  */
2428      if ((syntax & RE_CONTEXT_INVALID_DUP)
2429          && (token->type == OP_DUP_ASTERISK
2430              || token->type == OP_OPEN_DUP_NUM))
2431        {
2432          *err = REG_BADRPT;
2433          return NULL;
2434        }
2435    }
2436
2437  return tree;
2438}
2439
2440/* This function build the following tree, from regular expression
2441   (<reg_exp>):
2442         SUBEXP
2443            |
2444        <reg_exp>
2445*/
2446
2447static bin_tree_t *
2448parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2449               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2450{
2451  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2452  bin_tree_t *tree;
2453  size_t cur_nsub;
2454  cur_nsub = preg->re_nsub++;
2455
2456  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2457
2458  /* The subexpression may be a null string.  */
2459  if (token->type == OP_CLOSE_SUBEXP)
2460    tree = NULL;
2461  else
2462    {
2463      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2464      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2465        *err = REG_EPAREN;
2466      if (BE (*err != REG_NOERROR, 0))
2467        return NULL;
2468    }
2469
2470  if (cur_nsub <= '9' - '1')
2471    dfa->completed_bkref_map |= 1 << cur_nsub;
2472
2473  tree = create_tree (dfa, tree, NULL, SUBEXP);
2474  if (BE (tree == NULL, 0))
2475    {
2476      *err = REG_ESPACE;
2477      return NULL;
2478    }
2479  tree->token.opr.idx = cur_nsub;
2480  return tree;
2481}
2482
2483/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2484
2485static bin_tree_t *
2486parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2487              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2488{
2489  bin_tree_t *tree = NULL, *old_tree = NULL;
2490  int i, start, end, start_idx = re_string_cur_idx (regexp);
2491#ifndef RE_TOKEN_INIT_BUG
2492  re_token_t start_token = *token;
2493#else
2494  re_token_t start_token;
2495
2496  memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2497#endif
2498
2499  if (token->type == OP_OPEN_DUP_NUM)
2500    {
2501      end = 0;
2502      start = fetch_number (regexp, token, syntax);
2503      if (start == -1)
2504        {
2505          if (token->type == CHARACTER && token->opr.c == ',')
2506            start = 0; /* We treat "{,m}" as "{0,m}".  */
2507          else
2508            {
2509              *err = REG_BADBR; /* <re>{} is invalid.  */
2510              return NULL;
2511            }
2512        }
2513      if (BE (start != -2, 1))
2514        {
2515          /* We treat "{n}" as "{n,n}".  */
2516          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2517                 : ((token->type == CHARACTER && token->opr.c == ',')
2518                    ? fetch_number (regexp, token, syntax) : -2));
2519        }
2520      if (BE (start == -2 || end == -2, 0))
2521        {
2522          /* Invalid sequence.  */
2523          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2524            {
2525              if (token->type == END_OF_RE)
2526                *err = REG_EBRACE;
2527              else
2528                *err = REG_BADBR;
2529
2530              return NULL;
2531            }
2532
2533          /* If the syntax bit is set, rollback.  */
2534          re_string_set_index (regexp, start_idx);
2535          *token = start_token;
2536          token->type = CHARACTER;
2537          /* mb_partial and word_char bits should be already initialized by
2538             peek_token.  */
2539          return elem;
2540        }
2541
2542      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2543        {
2544          /* First number greater than second.  */
2545          *err = REG_BADBR;
2546          return NULL;
2547        }
2548    }
2549  else
2550    {
2551      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2552      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2553    }
2554
2555  fetch_token (token, regexp, syntax);
2556
2557  if (BE (elem == NULL, 0))
2558    return NULL;
2559  if (BE (start == 0 && end == 0, 0))
2560    {
2561      postorder (elem, free_tree, NULL);
2562      return NULL;
2563    }
2564
2565  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2566  if (BE (start > 0, 0))
2567    {
2568      tree = elem;
2569      for (i = 2; i <= start; ++i)
2570        {
2571          elem = duplicate_tree (elem, dfa);
2572          tree = create_tree (dfa, tree, elem, CONCAT);
2573          if (BE (elem == NULL || tree == NULL, 0))
2574            goto parse_dup_op_espace;
2575        }
2576
2577      if (start == end)
2578        return tree;
2579
2580      /* Duplicate ELEM before it is marked optional.  */
2581      elem = duplicate_tree (elem, dfa);
2582      old_tree = tree;
2583    }
2584  else
2585    old_tree = NULL;
2586
2587  if (elem->token.type == SUBEXP)
2588    postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2589
2590  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2591  if (BE (tree == NULL, 0))
2592    goto parse_dup_op_espace;
2593
2594  /* This loop is actually executed only when end != -1,
2595     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2596     already created the start+1-th copy.  */
2597  for (i = start + 2; i <= end; ++i)
2598    {
2599      elem = duplicate_tree (elem, dfa);
2600      tree = create_tree (dfa, tree, elem, CONCAT);
2601      if (BE (elem == NULL || tree == NULL, 0))
2602        goto parse_dup_op_espace;
2603
2604      tree = create_tree (dfa, tree, NULL, OP_ALT);
2605      if (BE (tree == NULL, 0))
2606        goto parse_dup_op_espace;
2607    }
2608
2609  if (old_tree)
2610    tree = create_tree (dfa, old_tree, tree, CONCAT);
2611
2612  return tree;
2613
2614 parse_dup_op_espace:
2615  *err = REG_ESPACE;
2616  return NULL;
2617}
2618
2619/* Size of the names for collating symbol/equivalence_class/character_class.
2620   I'm not sure, but maybe enough.  */
2621#define BRACKET_NAME_BUF_SIZE 32
2622
2623#ifndef _LIBC
2624  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2625     Build the range expression which starts from START_ELEM, and ends
2626     at END_ELEM.  The result are written to MBCSET and SBCSET.
2627     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2628     mbcset->range_ends, is a pointer argument sinse we may
2629     update it.  */
2630
2631static reg_errcode_t
2632internal_function
2633# ifdef RE_ENABLE_I18N
2634build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2635                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2636# else /* not RE_ENABLE_I18N */
2637build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2638                 bracket_elem_t *end_elem)
2639# endif /* not RE_ENABLE_I18N */
2640{
2641  unsigned int start_ch, end_ch;
2642  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2643  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2644          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2645          0))
2646    return REG_ERANGE;
2647
2648  /* We can handle no multi character collating elements without libc
2649     support.  */
2650  if (BE ((start_elem->type == COLL_SYM
2651           && strlen ((char *) start_elem->opr.name) > 1)
2652          || (end_elem->type == COLL_SYM
2653              && strlen ((char *) end_elem->opr.name) > 1), 0))
2654    return REG_ECOLLATE;
2655
2656# ifdef RE_ENABLE_I18N
2657  {
2658    wchar_t wc;
2659    wint_t start_wc;
2660    wint_t end_wc;
2661    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662
2663    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2664                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665                   : 0));
2666    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2667              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668                 : 0));
2669#ifdef GAWK
2670    /*
2671     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2672     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2673     * unsigned, so we don't have sign extension problems.
2674     */
2675    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2676                ? start_ch : start_elem->opr.wch);
2677    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2678              ? end_ch : end_elem->opr.wch);
2679#else
2680    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2681                ? __btowc (start_ch) : start_elem->opr.wch);
2682    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2683              ? __btowc (end_ch) : end_elem->opr.wch);
2684#endif
2685    if (start_wc == WEOF || end_wc == WEOF)
2686      return REG_ECOLLATE;
2687    cmp_buf[0] = start_wc;
2688    cmp_buf[4] = end_wc;
2689    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2690      return REG_ERANGE;
2691
2692    /* Got valid collation sequence values, add them as a new entry.
2693       However, for !_LIBC we have no collation elements: if the
2694       character set is single byte, the single byte character set
2695       that we build below suffices.  parse_bracket_exp passes
2696       no MBCSET if dfa->mb_cur_max == 1.  */
2697    if (mbcset)
2698      {
2699        /* Check the space of the arrays.  */
2700        if (BE (*range_alloc == mbcset->nranges, 0))
2701          {
2702            /* There is not enough space, need realloc.  */
2703            wchar_t *new_array_start, *new_array_end;
2704            int new_nranges;
2705
2706            /* +1 in case of mbcset->nranges is 0.  */
2707            new_nranges = 2 * mbcset->nranges + 1;
2708            /* Use realloc since mbcset->range_starts and mbcset->range_ends
2709               are NULL if *range_alloc == 0.  */
2710            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2711                                          new_nranges);
2712            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2713                                        new_nranges);
2714
2715            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2716              return REG_ESPACE;
2717
2718            mbcset->range_starts = new_array_start;
2719            mbcset->range_ends = new_array_end;
2720            *range_alloc = new_nranges;
2721          }
2722
2723        mbcset->range_starts[mbcset->nranges] = start_wc;
2724        mbcset->range_ends[mbcset->nranges++] = end_wc;
2725      }
2726
2727    /* Build the table for single byte characters.  */
2728    for (wc = 0; wc < SBC_MAX; ++wc)
2729      {
2730        cmp_buf[2] = wc;
2731        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2732            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2733          bitset_set (sbcset, wc);
2734      }
2735  }
2736# else /* not RE_ENABLE_I18N */
2737  {
2738    unsigned int ch;
2739    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2740                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2741                   : 0));
2742    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2743              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2744                 : 0));
2745    if (start_ch > end_ch)
2746      return REG_ERANGE;
2747    /* Build the table for single byte characters.  */
2748    for (ch = 0; ch < SBC_MAX; ++ch)
2749      if (start_ch <= ch  && ch <= end_ch)
2750        bitset_set (sbcset, ch);
2751  }
2752# endif /* not RE_ENABLE_I18N */
2753  return REG_NOERROR;
2754}
2755#endif /* not _LIBC */
2756
2757#ifndef _LIBC
2758/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2759   Build the collating element which is represented by NAME.
2760   The result are written to MBCSET and SBCSET.
2761   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2762   pointer argument since we may update it.  */
2763
2764static reg_errcode_t
2765internal_function
2766# ifdef RE_ENABLE_I18N
2767build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2768                        int *coll_sym_alloc, const unsigned char *name)
2769# else /* not RE_ENABLE_I18N */
2770build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2771# endif /* not RE_ENABLE_I18N */
2772{
2773  size_t name_len = strlen ((const char *) name);
2774  if (BE (name_len != 1, 0))
2775    return REG_ECOLLATE;
2776  else
2777    {
2778      bitset_set (sbcset, name[0]);
2779      return REG_NOERROR;
2780    }
2781}
2782#endif /* not _LIBC */
2783
2784/* This function parse bracket expression like "[abc]", "[a-c]",
2785   "[[.a-a.]]" etc.  */
2786
2787static bin_tree_t *
2788parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2789                   reg_syntax_t syntax, reg_errcode_t *err)
2790{
2791#ifdef _LIBC
2792  const unsigned char *collseqmb;
2793  const char *collseqwc;
2794  uint32_t nrules;
2795  int32_t table_size;
2796  const int32_t *symb_table;
2797  const unsigned char *extra;
2798
2799  /* Local function for parse_bracket_exp used in _LIBC environement.
2800     Seek the collating symbol entry correspondings to NAME.
2801     Return the index of the symbol in the SYMB_TABLE.  */
2802
2803  auto inline int32_t
2804  __attribute ((always_inline))
2805  seek_collating_symbol_entry (name, name_len)
2806         const unsigned char *name;
2807         size_t name_len;
2808    {
2809      int32_t hash = elem_hash ((const char *) name, name_len);
2810      int32_t elem = hash % table_size;
2811      if (symb_table[2 * elem] != 0)
2812        {
2813          int32_t second = hash % (table_size - 2) + 1;
2814
2815          do
2816            {
2817              /* First compare the hashing value.  */
2818              if (symb_table[2 * elem] == hash
2819                  /* Compare the length of the name.  */
2820                  && name_len == extra[symb_table[2 * elem + 1]]
2821                  /* Compare the name.  */
2822                  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2823                             name_len) == 0)
2824                {
2825                  /* Yep, this is the entry.  */
2826                  break;
2827                }
2828
2829              /* Next entry.  */
2830              elem += second;
2831            }
2832          while (symb_table[2 * elem] != 0);
2833        }
2834      return elem;
2835    }
2836
2837  /* Local function for parse_bracket_exp used in _LIBC environment.
2838     Look up the collation sequence value of BR_ELEM.
2839     Return the value if succeeded, UINT_MAX otherwise.  */
2840
2841  auto inline unsigned int
2842  __attribute ((always_inline))
2843  lookup_collation_sequence_value (br_elem)
2844         bracket_elem_t *br_elem;
2845    {
2846      if (br_elem->type == SB_CHAR)
2847        {
2848          /*
2849          if (MB_CUR_MAX == 1)
2850          */
2851          if (nrules == 0)
2852            return collseqmb[br_elem->opr.ch];
2853          else
2854            {
2855              wint_t wc = __btowc (br_elem->opr.ch);
2856              return __collseq_table_lookup (collseqwc, wc);
2857            }
2858        }
2859      else if (br_elem->type == MB_CHAR)
2860        {
2861          if (nrules != 0)
2862            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2863        }
2864      else if (br_elem->type == COLL_SYM)
2865        {
2866          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2867          if (nrules != 0)
2868            {
2869              int32_t elem, idx;
2870              elem = seek_collating_symbol_entry (br_elem->opr.name,
2871                                                  sym_name_len);
2872              if (symb_table[2 * elem] != 0)
2873                {
2874                  /* We found the entry.  */
2875                  idx = symb_table[2 * elem + 1];
2876                  /* Skip the name of collating element name.  */
2877                  idx += 1 + extra[idx];
2878                  /* Skip the byte sequence of the collating element.  */
2879                  idx += 1 + extra[idx];
2880                  /* Adjust for the alignment.  */
2881                  idx = (idx + 3) & ~3;
2882                  /* Skip the multibyte collation sequence value.  */
2883                  idx += sizeof (unsigned int);
2884                  /* Skip the wide char sequence of the collating element.  */
2885                  idx += sizeof (unsigned int) *
2886                    (1 + *(unsigned int *) (extra + idx));
2887                  /* Return the collation sequence value.  */
2888                  return *(unsigned int *) (extra + idx);
2889                }
2890              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2891                {
2892                  /* No valid character.  Match it as a single byte
2893                     character.  */
2894                  return collseqmb[br_elem->opr.name[0]];
2895                }
2896            }
2897          else if (sym_name_len == 1)
2898            return collseqmb[br_elem->opr.name[0]];
2899        }
2900      return UINT_MAX;
2901    }
2902
2903  /* Local function for parse_bracket_exp used in _LIBC environement.
2904     Build the range expression which starts from START_ELEM, and ends
2905     at END_ELEM.  The result are written to MBCSET and SBCSET.
2906     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2907     mbcset->range_ends, is a pointer argument sinse we may
2908     update it.  */
2909
2910  auto inline reg_errcode_t
2911  __attribute ((always_inline))
2912  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2913         re_charset_t *mbcset;
2914         int *range_alloc;
2915         bitset_t sbcset;
2916         bracket_elem_t *start_elem, *end_elem;
2917    {
2918      unsigned int ch;
2919      uint32_t start_collseq;
2920      uint32_t end_collseq;
2921
2922      /* Equivalence Classes and Character Classes can't be a range
2923         start/end.  */
2924      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2925              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2926              0))
2927        return REG_ERANGE;
2928
2929      start_collseq = lookup_collation_sequence_value (start_elem);
2930      end_collseq = lookup_collation_sequence_value (end_elem);
2931      /* Check start/end collation sequence values.  */
2932      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2933        return REG_ECOLLATE;
2934      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2935        return REG_ERANGE;
2936
2937      /* Got valid collation sequence values, add them as a new entry.
2938         However, if we have no collation elements, and the character set
2939         is single byte, the single byte character set that we
2940         build below suffices. */
2941      if (nrules > 0 || dfa->mb_cur_max > 1)
2942        {
2943          /* Check the space of the arrays.  */
2944          if (BE (*range_alloc == mbcset->nranges, 0))
2945            {
2946              /* There is not enough space, need realloc.  */
2947              uint32_t *new_array_start;
2948              uint32_t *new_array_end;
2949              int new_nranges;
2950
2951              /* +1 in case of mbcset->nranges is 0.  */
2952              new_nranges = 2 * mbcset->nranges + 1;
2953              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2954                                            new_nranges);
2955              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2956                                          new_nranges);
2957
2958              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2959                return REG_ESPACE;
2960
2961              mbcset->range_starts = new_array_start;
2962              mbcset->range_ends = new_array_end;
2963              *range_alloc = new_nranges;
2964            }
2965
2966          mbcset->range_starts[mbcset->nranges] = start_collseq;
2967          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2968        }
2969
2970      /* Build the table for single byte characters.  */
2971      for (ch = 0; ch < SBC_MAX; ch++)
2972        {
2973          uint32_t ch_collseq;
2974          /*
2975          if (MB_CUR_MAX == 1)
2976          */
2977          if (nrules == 0)
2978            ch_collseq = collseqmb[ch];
2979          else
2980            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2981          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2982            bitset_set (sbcset, ch);
2983        }
2984      return REG_NOERROR;
2985    }
2986
2987  /* Local function for parse_bracket_exp used in _LIBC environement.
2988     Build the collating element which is represented by NAME.
2989     The result are written to MBCSET and SBCSET.
2990     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2991     pointer argument sinse we may update it.  */
2992
2993  auto inline reg_errcode_t
2994  __attribute ((always_inline))
2995  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2996         re_charset_t *mbcset;
2997         int *coll_sym_alloc;
2998         bitset_t sbcset;
2999         const unsigned char *name;
3000    {
3001      int32_t elem, idx;
3002      size_t name_len = strlen ((const char *) name);
3003      if (nrules != 0)
3004        {
3005          elem = seek_collating_symbol_entry (name, name_len);
3006          if (symb_table[2 * elem] != 0)
3007            {
3008              /* We found the entry.  */
3009              idx = symb_table[2 * elem + 1];
3010              /* Skip the name of collating element name.  */
3011              idx += 1 + extra[idx];
3012            }
3013          else if (symb_table[2 * elem] == 0 && name_len == 1)
3014            {
3015              /* No valid character, treat it as a normal
3016                 character.  */
3017              bitset_set (sbcset, name[0]);
3018              return REG_NOERROR;
3019            }
3020          else
3021            return REG_ECOLLATE;
3022
3023          /* Got valid collation sequence, add it as a new entry.  */
3024          /* Check the space of the arrays.  */
3025          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3026            {
3027              /* Not enough, realloc it.  */
3028              /* +1 in case of mbcset->ncoll_syms is 0.  */
3029              int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3030              /* Use realloc since mbcset->coll_syms is NULL
3031                 if *alloc == 0.  */
3032              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3033                                                   new_coll_sym_alloc);
3034              if (BE (new_coll_syms == NULL, 0))
3035                return REG_ESPACE;
3036              mbcset->coll_syms = new_coll_syms;
3037              *coll_sym_alloc = new_coll_sym_alloc;
3038            }
3039          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3040          return REG_NOERROR;
3041        }
3042      else
3043        {
3044          if (BE (name_len != 1, 0))
3045            return REG_ECOLLATE;
3046          else
3047            {
3048              bitset_set (sbcset, name[0]);
3049              return REG_NOERROR;
3050            }
3051        }
3052    }
3053#endif
3054
3055  re_token_t br_token;
3056  re_bitset_ptr_t sbcset;
3057#ifdef RE_ENABLE_I18N
3058  re_charset_t *mbcset;
3059  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3060  int equiv_class_alloc = 0, char_class_alloc = 0;
3061#endif /* not RE_ENABLE_I18N */
3062  int non_match = 0;
3063  bin_tree_t *work_tree;
3064  int token_len;
3065  int first_round = 1;
3066#ifdef _LIBC
3067  collseqmb = (const unsigned char *)
3068    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3069  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3070  if (nrules)
3071    {
3072      /*
3073      if (MB_CUR_MAX > 1)
3074      */
3075      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3076      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3077      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3078                                                  _NL_COLLATE_SYMB_TABLEMB);
3079      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3080                                                   _NL_COLLATE_SYMB_EXTRAMB);
3081    }
3082#endif
3083  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3084#ifdef RE_ENABLE_I18N
3085  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3086#endif /* RE_ENABLE_I18N */
3087#ifdef RE_ENABLE_I18N
3088  if (BE (sbcset == NULL || mbcset == NULL, 0))
3089#else
3090  if (BE (sbcset == NULL, 0))
3091#endif /* RE_ENABLE_I18N */
3092    {
3093      *err = REG_ESPACE;
3094      return NULL;
3095    }
3096
3097  token_len = peek_token_bracket (token, regexp, syntax);
3098  if (BE (token->type == END_OF_RE, 0))
3099    {
3100      *err = REG_BADPAT;
3101      goto parse_bracket_exp_free_return;
3102    }
3103  if (token->type == OP_NON_MATCH_LIST)
3104    {
3105#ifdef RE_ENABLE_I18N
3106      mbcset->non_match = 1;
3107#endif /* not RE_ENABLE_I18N */
3108      non_match = 1;
3109      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3110        bitset_set (sbcset, '\n');
3111      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3112      token_len = peek_token_bracket (token, regexp, syntax);
3113      if (BE (token->type == END_OF_RE, 0))
3114        {
3115          *err = REG_BADPAT;
3116          goto parse_bracket_exp_free_return;
3117        }
3118    }
3119
3120  /* We treat the first ']' as a normal character.  */
3121  if (token->type == OP_CLOSE_BRACKET)
3122    token->type = CHARACTER;
3123
3124  while (1)
3125    {
3126      bracket_elem_t start_elem, end_elem;
3127      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3128      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3129      reg_errcode_t ret;
3130      int token_len2 = 0, is_range_exp = 0;
3131      re_token_t token2;
3132
3133      start_elem.opr.name = start_name_buf;
3134      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3135                                   syntax, first_round);
3136      if (BE (ret != REG_NOERROR, 0))
3137        {
3138          *err = ret;
3139          goto parse_bracket_exp_free_return;
3140        }
3141      first_round = 0;
3142
3143      /* Get information about the next token.  We need it in any case.  */
3144      token_len = peek_token_bracket (token, regexp, syntax);
3145
3146      /* Do not check for ranges if we know they are not allowed.  */
3147      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3148        {
3149          if (BE (token->type == END_OF_RE, 0))
3150            {
3151              *err = REG_EBRACK;
3152              goto parse_bracket_exp_free_return;
3153            }
3154          if (token->type == OP_CHARSET_RANGE)
3155            {
3156              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3157              token_len2 = peek_token_bracket (&token2, regexp, syntax);
3158              if (BE (token2.type == END_OF_RE, 0))
3159                {
3160                  *err = REG_EBRACK;
3161                  goto parse_bracket_exp_free_return;
3162                }
3163              if (token2.type == OP_CLOSE_BRACKET)
3164                {
3165                  /* We treat the last '-' as a normal character.  */
3166                  re_string_skip_bytes (regexp, -token_len);
3167                  token->type = CHARACTER;
3168                }
3169              else
3170                is_range_exp = 1;
3171            }
3172        }
3173
3174      if (is_range_exp == 1)
3175        {
3176          end_elem.opr.name = end_name_buf;
3177          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3178                                       dfa, syntax, 1);
3179          if (BE (ret != REG_NOERROR, 0))
3180            {
3181              *err = ret;
3182              goto parse_bracket_exp_free_return;
3183            }
3184
3185          token_len = peek_token_bracket (token, regexp, syntax);
3186
3187#ifdef _LIBC
3188          *err = build_range_exp (sbcset, mbcset, &range_alloc,
3189                                  &start_elem, &end_elem);
3190#else
3191# ifdef RE_ENABLE_I18N
3192          *err = build_range_exp (sbcset,
3193                                  dfa->mb_cur_max > 1 ? mbcset : NULL,
3194                                  &range_alloc, &start_elem, &end_elem);
3195# else
3196          *err = build_range_exp (sbcset, &start_elem, &end_elem);
3197# endif
3198#endif /* RE_ENABLE_I18N */
3199          if (BE (*err != REG_NOERROR, 0))
3200            goto parse_bracket_exp_free_return;
3201        }
3202      else
3203        {
3204          switch (start_elem.type)
3205            {
3206            case SB_CHAR:
3207              bitset_set (sbcset, start_elem.opr.ch);
3208              break;
3209#ifdef RE_ENABLE_I18N
3210            case MB_CHAR:
3211              /* Check whether the array has enough space.  */
3212              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3213                {
3214                  wchar_t *new_mbchars;
3215                  /* Not enough, realloc it.  */
3216                  /* +1 in case of mbcset->nmbchars is 0.  */
3217                  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3218                  /* Use realloc since array is NULL if *alloc == 0.  */
3219                  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3220                                            mbchar_alloc);
3221                  if (BE (new_mbchars == NULL, 0))
3222                    goto parse_bracket_exp_espace;
3223                  mbcset->mbchars = new_mbchars;
3224                }
3225              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3226              break;
3227#endif /* RE_ENABLE_I18N */
3228            case EQUIV_CLASS:
3229              *err = build_equiv_class (sbcset,
3230#ifdef RE_ENABLE_I18N
3231                                        mbcset, &equiv_class_alloc,
3232#endif /* RE_ENABLE_I18N */
3233                                        start_elem.opr.name);
3234              if (BE (*err != REG_NOERROR, 0))
3235                goto parse_bracket_exp_free_return;
3236              break;
3237            case COLL_SYM:
3238              *err = build_collating_symbol (sbcset,
3239#ifdef RE_ENABLE_I18N
3240                                             mbcset, &coll_sym_alloc,
3241#endif /* RE_ENABLE_I18N */
3242                                             start_elem.opr.name);
3243              if (BE (*err != REG_NOERROR, 0))
3244                goto parse_bracket_exp_free_return;
3245              break;
3246            case CHAR_CLASS:
3247              *err = build_charclass (regexp->trans, sbcset,
3248#ifdef RE_ENABLE_I18N
3249                                      mbcset, &char_class_alloc,
3250#endif /* RE_ENABLE_I18N */
3251                                      (const char *) start_elem.opr.name, syntax);
3252              if (BE (*err != REG_NOERROR, 0))
3253               goto parse_bracket_exp_free_return;
3254              break;
3255            default:
3256              assert (0);
3257              break;
3258            }
3259        }
3260      if (BE (token->type == END_OF_RE, 0))
3261        {
3262          *err = REG_EBRACK;
3263          goto parse_bracket_exp_free_return;
3264        }
3265      if (token->type == OP_CLOSE_BRACKET)
3266        break;
3267    }
3268
3269  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3270
3271  /* If it is non-matching list.  */
3272  if (non_match)
3273    bitset_not (sbcset);
3274
3275#ifdef RE_ENABLE_I18N
3276  /* Ensure only single byte characters are set.  */
3277  if (dfa->mb_cur_max > 1)
3278    bitset_mask (sbcset, dfa->sb_char);
3279
3280  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3281      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3282                                                     || mbcset->non_match)))
3283    {
3284      bin_tree_t *mbc_tree;
3285      int sbc_idx;
3286      /* Build a tree for complex bracket.  */
3287      dfa->has_mb_node = 1;
3288      br_token.type = COMPLEX_BRACKET;
3289      br_token.opr.mbcset = mbcset;
3290      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3291      if (BE (mbc_tree == NULL, 0))
3292        goto parse_bracket_exp_espace;
3293      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3294        if (sbcset[sbc_idx])
3295          break;
3296      /* If there are no bits set in sbcset, there is no point
3297         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3298      if (sbc_idx < BITSET_WORDS)
3299        {
3300          /* Build a tree for simple bracket.  */
3301          br_token.type = SIMPLE_BRACKET;
3302          br_token.opr.sbcset = sbcset;
3303          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3304          if (BE (work_tree == NULL, 0))
3305            goto parse_bracket_exp_espace;
3306
3307          /* Then join them by ALT node.  */
3308          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3309          if (BE (work_tree == NULL, 0))
3310            goto parse_bracket_exp_espace;
3311        }
3312      else
3313        {
3314          re_free (sbcset);
3315          work_tree = mbc_tree;
3316        }
3317    }
3318  else
3319#endif /* not RE_ENABLE_I18N */
3320    {
3321#ifdef RE_ENABLE_I18N
3322      free_charset (mbcset);
3323#endif
3324      /* Build a tree for simple bracket.  */
3325      br_token.type = SIMPLE_BRACKET;
3326      br_token.opr.sbcset = sbcset;
3327      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3328      if (BE (work_tree == NULL, 0))
3329        goto parse_bracket_exp_espace;
3330    }
3331  return work_tree;
3332
3333 parse_bracket_exp_espace:
3334  *err = REG_ESPACE;
3335 parse_bracket_exp_free_return:
3336  re_free (sbcset);
3337#ifdef RE_ENABLE_I18N
3338  free_charset (mbcset);
3339#endif /* RE_ENABLE_I18N */
3340  return NULL;
3341}
3342
3343/* Parse an element in the bracket expression.  */
3344
3345static reg_errcode_t
3346parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3347                       re_token_t *token, int token_len, re_dfa_t *dfa,
3348                       reg_syntax_t syntax, int accept_hyphen)
3349{
3350#ifdef RE_ENABLE_I18N
3351  int cur_char_size;
3352  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3353  if (cur_char_size > 1)
3354    {
3355      elem->type = MB_CHAR;
3356      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3357      re_string_skip_bytes (regexp, cur_char_size);
3358      return REG_NOERROR;
3359    }
3360#endif /* RE_ENABLE_I18N */
3361  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3362  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3363      || token->type == OP_OPEN_EQUIV_CLASS)
3364    return parse_bracket_symbol (elem, regexp, token);
3365  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3366    {
3367      /* A '-' must only appear as anything but a range indicator before
3368         the closing bracket.  Everything else is an error.  */
3369      re_token_t token2;
3370      (void) peek_token_bracket (&token2, regexp, syntax);
3371      if (token2.type != OP_CLOSE_BRACKET)
3372        /* The actual error value is not standardized since this whole
3373           case is undefined.  But ERANGE makes good sense.  */
3374        return REG_ERANGE;
3375    }
3376  elem->type = SB_CHAR;
3377  elem->opr.ch = token->opr.c;
3378  return REG_NOERROR;
3379}
3380
3381/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3382   such as [:<character_class>:], [.<collating_element>.], and
3383   [=<equivalent_class>=].  */
3384
3385static reg_errcode_t
3386parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3387                      re_token_t *token)
3388{
3389  unsigned char ch, delim = token->opr.c;
3390  int i = 0;
3391  if (re_string_eoi(regexp))
3392    return REG_EBRACK;
3393  for (;; ++i)
3394    {
3395      if (i >= BRACKET_NAME_BUF_SIZE)
3396        return REG_EBRACK;
3397      if (token->type == OP_OPEN_CHAR_CLASS)
3398        ch = re_string_fetch_byte_case (regexp);
3399      else
3400        ch = re_string_fetch_byte (regexp);
3401      if (re_string_eoi(regexp))
3402        return REG_EBRACK;
3403      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3404        break;
3405      elem->opr.name[i] = ch;
3406    }
3407  re_string_skip_bytes (regexp, 1);
3408  elem->opr.name[i] = '\0';
3409  switch (token->type)
3410    {
3411    case OP_OPEN_COLL_ELEM:
3412      elem->type = COLL_SYM;
3413      break;
3414    case OP_OPEN_EQUIV_CLASS:
3415      elem->type = EQUIV_CLASS;
3416      break;
3417    case OP_OPEN_CHAR_CLASS:
3418      elem->type = CHAR_CLASS;
3419      break;
3420    default:
3421      break;
3422    }
3423  return REG_NOERROR;
3424}
3425
3426  /* Helper function for parse_bracket_exp.
3427     Build the equivalence class which is represented by NAME.
3428     The result are written to MBCSET and SBCSET.
3429     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3430     is a pointer argument sinse we may update it.  */
3431
3432static reg_errcode_t
3433#ifdef RE_ENABLE_I18N
3434build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3435                   int *equiv_class_alloc, const unsigned char *name)
3436#else /* not RE_ENABLE_I18N */
3437build_equiv_class (bitset_t sbcset, const unsigned char *name)
3438#endif /* not RE_ENABLE_I18N */
3439{
3440#ifdef _LIBC
3441  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3442  if (nrules != 0)
3443    {
3444      const int32_t *table, *indirect;
3445      const unsigned char *weights, *extra, *cp;
3446      unsigned char char_buf[2];
3447      int32_t idx1, idx2;
3448      unsigned int ch;
3449      size_t len;
3450      /* This #include defines a local function!  */
3451# include <locale/weight.h>
3452      /* Calculate the index for equivalence class.  */
3453      cp = name;
3454      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3455      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3456                                               _NL_COLLATE_WEIGHTMB);
3457      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3458                                                   _NL_COLLATE_EXTRAMB);
3459      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3460                                                _NL_COLLATE_INDIRECTMB);
3461      idx1 = findidx (&cp);
3462      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3463        /* This isn't a valid character.  */
3464        return REG_ECOLLATE;
3465
3466      /* Build single byte matcing table for this equivalence class.  */
3467      char_buf[1] = (unsigned char) '\0';
3468      len = weights[idx1 & 0xffffff];
3469      for (ch = 0; ch < SBC_MAX; ++ch)
3470        {
3471          char_buf[0] = ch;
3472          cp = char_buf;
3473          idx2 = findidx (&cp);
3474/*
3475          idx2 = table[ch];
3476*/
3477          if (idx2 == 0)
3478            /* This isn't a valid character.  */
3479            continue;
3480          /* Compare only if the length matches and the collation rule
3481             index is the same.  */
3482          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3483            {
3484              int cnt = 0;
3485
3486              while (cnt <= len &&
3487                     weights[(idx1 & 0xffffff) + 1 + cnt]
3488                     == weights[(idx2 & 0xffffff) + 1 + cnt])
3489                ++cnt;
3490
3491              if (cnt > len)
3492                bitset_set (sbcset, ch);
3493            }
3494        }
3495      /* Check whether the array has enough space.  */
3496      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3497        {
3498          /* Not enough, realloc it.  */
3499          /* +1 in case of mbcset->nequiv_classes is 0.  */
3500          int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3501          /* Use realloc since the array is NULL if *alloc == 0.  */
3502          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3503                                                   int32_t,
3504                                                   new_equiv_class_alloc);
3505          if (BE (new_equiv_classes == NULL, 0))
3506            return REG_ESPACE;
3507          mbcset->equiv_classes = new_equiv_classes;
3508          *equiv_class_alloc = new_equiv_class_alloc;
3509        }
3510      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3511    }
3512  else
3513#endif /* _LIBC */
3514    {
3515      if (BE (strlen ((const char *) name) != 1, 0))
3516        return REG_ECOLLATE;
3517      bitset_set (sbcset, *name);
3518    }
3519  return REG_NOERROR;
3520}
3521
3522  /* Helper function for parse_bracket_exp.
3523     Build the character class which is represented by NAME.
3524     The result are written to MBCSET and SBCSET.
3525     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3526     is a pointer argument sinse we may update it.  */
3527
3528static reg_errcode_t
3529#ifdef RE_ENABLE_I18N
3530build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3531                 re_charset_t *mbcset, int *char_class_alloc,
3532                 const char *class_name, reg_syntax_t syntax)
3533#else /* not RE_ENABLE_I18N */
3534build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3535                 const char *class_name, reg_syntax_t syntax)
3536#endif /* not RE_ENABLE_I18N */
3537{
3538  int i;
3539
3540  /* In case of REG_ICASE "upper" and "lower" match the both of
3541     upper and lower cases.  */
3542  if ((syntax & RE_ICASE)
3543      && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3544    class_name = "alpha";
3545
3546#ifdef RE_ENABLE_I18N
3547  /* Check the space of the arrays.  */
3548  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3549    {
3550      /* Not enough, realloc it.  */
3551      /* +1 in case of mbcset->nchar_classes is 0.  */
3552      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3553      /* Use realloc since array is NULL if *alloc == 0.  */
3554      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3555                                               new_char_class_alloc);
3556      if (BE (new_char_classes == NULL, 0))
3557        return REG_ESPACE;
3558      mbcset->char_classes = new_char_classes;
3559      *char_class_alloc = new_char_class_alloc;
3560    }
3561  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3562#endif /* RE_ENABLE_I18N */
3563
3564#define BUILD_CHARCLASS_LOOP(ctype_func)        \
3565  do {                                          \
3566    if (BE (trans != NULL, 0))                  \
3567      {                                         \
3568        for (i = 0; i < SBC_MAX; ++i)           \
3569          if (ctype_func (i))                   \
3570            bitset_set (sbcset, trans[i]);      \
3571      }                                         \
3572    else                                        \
3573      {                                         \
3574        for (i = 0; i < SBC_MAX; ++i)           \
3575          if (ctype_func (i))                   \
3576            bitset_set (sbcset, i);             \
3577      }                                         \
3578  } while (0)
3579
3580  if (strcmp (class_name, "alnum") == 0)
3581    BUILD_CHARCLASS_LOOP (isalnum);
3582  else if (strcmp (class_name, "cntrl") == 0)
3583    BUILD_CHARCLASS_LOOP (iscntrl);
3584  else if (strcmp (class_name, "lower") == 0)
3585    BUILD_CHARCLASS_LOOP (islower);
3586  else if (strcmp (class_name, "space") == 0)
3587    BUILD_CHARCLASS_LOOP (isspace);
3588  else if (strcmp (class_name, "alpha") == 0)
3589    BUILD_CHARCLASS_LOOP (isalpha);
3590  else if (strcmp (class_name, "digit") == 0)
3591    BUILD_CHARCLASS_LOOP (isdigit);
3592  else if (strcmp (class_name, "print") == 0)
3593    BUILD_CHARCLASS_LOOP (isprint);
3594  else if (strcmp (class_name, "upper") == 0)
3595    BUILD_CHARCLASS_LOOP (isupper);
3596  else if (strcmp (class_name, "blank") == 0)
3597#ifndef GAWK
3598    BUILD_CHARCLASS_LOOP (isblank);
3599#else
3600    /* see comments above */
3601    BUILD_CHARCLASS_LOOP (is_blank);
3602#endif
3603  else if (strcmp (class_name, "graph") == 0)
3604    BUILD_CHARCLASS_LOOP (isgraph);
3605  else if (strcmp (class_name, "punct") == 0)
3606    BUILD_CHARCLASS_LOOP (ispunct);
3607  else if (strcmp (class_name, "xdigit") == 0)
3608    BUILD_CHARCLASS_LOOP (isxdigit);
3609  else
3610    return REG_ECTYPE;
3611
3612  return REG_NOERROR;
3613}
3614
3615static bin_tree_t *
3616build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3617                    const char *class_name,
3618                    const char *extra, int non_match,
3619                    reg_errcode_t *err)
3620{
3621  re_bitset_ptr_t sbcset;
3622#ifdef RE_ENABLE_I18N
3623  re_charset_t *mbcset;
3624  int alloc = 0;
3625#endif /* not RE_ENABLE_I18N */
3626  reg_errcode_t ret;
3627  re_token_t br_token;
3628  bin_tree_t *tree;
3629
3630  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3631#ifdef RE_ENABLE_I18N
3632  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3633#endif /* RE_ENABLE_I18N */
3634
3635#ifdef RE_ENABLE_I18N
3636  if (BE (sbcset == NULL || mbcset == NULL, 0))
3637#else /* not RE_ENABLE_I18N */
3638  if (BE (sbcset == NULL, 0))
3639#endif /* not RE_ENABLE_I18N */
3640    {
3641      *err = REG_ESPACE;
3642      return NULL;
3643    }
3644
3645  if (non_match)
3646    {
3647#ifdef RE_ENABLE_I18N
3648      mbcset->non_match = 1;
3649#endif /* not RE_ENABLE_I18N */
3650    }
3651
3652  /* We don't care the syntax in this case.  */
3653  ret = build_charclass (trans, sbcset,
3654#ifdef RE_ENABLE_I18N
3655                         mbcset, &alloc,
3656#endif /* RE_ENABLE_I18N */
3657                         class_name, 0);
3658
3659  if (BE (ret != REG_NOERROR, 0))
3660    {
3661      re_free (sbcset);
3662#ifdef RE_ENABLE_I18N
3663      free_charset (mbcset);
3664#endif /* RE_ENABLE_I18N */
3665      *err = ret;
3666      return NULL;
3667    }
3668  /* \w match '_' also.  */
3669  for (; *extra; extra++)
3670    bitset_set (sbcset, *extra);
3671
3672  /* If it is non-matching list.  */
3673  if (non_match)
3674    bitset_not (sbcset);
3675
3676#ifdef RE_ENABLE_I18N
3677  /* Ensure only single byte characters are set.  */
3678  if (dfa->mb_cur_max > 1)
3679    bitset_mask (sbcset, dfa->sb_char);
3680#endif
3681
3682  /* Build a tree for simple bracket.  */
3683  br_token.type = SIMPLE_BRACKET;
3684  br_token.opr.sbcset = sbcset;
3685  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3686  if (BE (tree == NULL, 0))
3687    goto build_word_op_espace;
3688
3689#ifdef RE_ENABLE_I18N
3690  if (dfa->mb_cur_max > 1)
3691    {
3692      bin_tree_t *mbc_tree;
3693      /* Build a tree for complex bracket.  */
3694      br_token.type = COMPLEX_BRACKET;
3695      br_token.opr.mbcset = mbcset;
3696      dfa->has_mb_node = 1;
3697      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3698      if (BE (mbc_tree == NULL, 0))
3699        goto build_word_op_espace;
3700      /* Then join them by ALT node.  */
3701      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3702      if (BE (mbc_tree != NULL, 1))
3703        return tree;
3704    }
3705  else
3706    {
3707      free_charset (mbcset);
3708      return tree;
3709    }
3710#else /* not RE_ENABLE_I18N */
3711  return tree;
3712#endif /* not RE_ENABLE_I18N */
3713
3714 build_word_op_espace:
3715  re_free (sbcset);
3716#ifdef RE_ENABLE_I18N
3717  free_charset (mbcset);
3718#endif /* RE_ENABLE_I18N */
3719  *err = REG_ESPACE;
3720  return NULL;
3721}
3722
3723/* This is intended for the expressions like "a{1,3}".
3724   Fetch a number from `input', and return the number.
3725   Return -1, if the number field is empty like "{,1}".
3726   Return -2, If an error is occured.  */
3727
3728static int
3729fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3730{
3731  int num = -1;
3732  unsigned char c;
3733  while (1)
3734    {
3735      fetch_token (token, input, syntax);
3736      c = token->opr.c;
3737      if (BE (token->type == END_OF_RE, 0))
3738        return -2;
3739      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3740        break;
3741      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3742             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3743      num = (num > RE_DUP_MAX) ? -2 : num;
3744    }
3745  return num;
3746}
3747\f
3748#ifdef RE_ENABLE_I18N
3749static void
3750free_charset (re_charset_t *cset)
3751{
3752  re_free (cset->mbchars);
3753# ifdef _LIBC
3754  re_free (cset->coll_syms);
3755  re_free (cset->equiv_classes);
3756  re_free (cset->range_starts);
3757  re_free (cset->range_ends);
3758# endif
3759  re_free (cset->char_classes);
3760  re_free (cset);
3761}
3762#endif /* RE_ENABLE_I18N */
3763\f
3764/* Functions for binary tree operation.  */
3765
3766/* Create a tree node.  */
3767
3768static bin_tree_t *
3769create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3770             re_token_type_t type)
3771{
3772  re_token_t t;
3773  t.type = type;
3774  return create_token_tree (dfa, left, right, &t);
3775}
3776
3777static bin_tree_t *
3778create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3779                   const re_token_t *token)
3780{
3781  bin_tree_t *tree;
3782  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3783    {
3784      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3785
3786      if (storage == NULL)
3787        return NULL;
3788      storage->next = dfa->str_tree_storage;
3789      dfa->str_tree_storage = storage;
3790      dfa->str_tree_storage_idx = 0;
3791    }
3792  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3793
3794  tree->parent = NULL;
3795  tree->left = left;
3796  tree->right = right;
3797  tree->token = *token;
3798  tree->token.duplicated = 0;
3799  tree->token.opt_subexp = 0;
3800  tree->first = NULL;
3801  tree->next = NULL;
3802  tree->node_idx = -1;
3803
3804  if (left != NULL)
3805    left->parent = tree;
3806  if (right != NULL)
3807    right->parent = tree;
3808  return tree;
3809}
3810
3811/* Mark the tree SRC as an optional subexpression.
3812   To be called from preorder or postorder.  */
3813
3814static reg_errcode_t
3815mark_opt_subexp (void *extra, bin_tree_t *node)
3816{
3817  int idx = (int) (long) extra;
3818  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3819    node->token.opt_subexp = 1;
3820
3821  return REG_NOERROR;
3822}
3823
3824/* Free the allocated memory inside NODE. */
3825
3826static void
3827free_token (re_token_t *node)
3828{
3829#ifdef RE_ENABLE_I18N
3830  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3831    free_charset (node->opr.mbcset);
3832  else
3833#endif /* RE_ENABLE_I18N */
3834    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3835      re_free (node->opr.sbcset);
3836}
3837
3838/* Worker function for tree walking.  Free the allocated memory inside NODE
3839   and its children. */
3840
3841static reg_errcode_t
3842free_tree (void *extra, bin_tree_t *node)
3843{
3844  free_token (&node->token);
3845  return REG_NOERROR;
3846}
3847
3848
3849/* Duplicate the node SRC, and return new node.  This is a preorder
3850   visit similar to the one implemented by the generic visitor, but
3851   we need more infrastructure to maintain two parallel trees --- so,
3852   it's easier to duplicate.  */
3853
3854static bin_tree_t *
3855duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3856{
3857  const bin_tree_t *node;
3858  bin_tree_t *dup_root;
3859  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3860
3861  for (node = root; ; )
3862    {
3863      /* Create a new tree and link it back to the current parent.  */
3864      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3865      if (*p_new == NULL)
3866        return NULL;
3867      (*p_new)->parent = dup_node;
3868      (*p_new)->token.duplicated = 1;
3869      dup_node = *p_new;
3870
3871      /* Go to the left node, or up and to the right.  */
3872      if (node->left)
3873        {
3874          node = node->left;
3875          p_new = &dup_node->left;
3876        }
3877      else
3878        {
3879          const bin_tree_t *prev = NULL;
3880          while (node->right == prev || node->right == NULL)
3881            {
3882              prev = node;
3883              node = node->parent;
3884              dup_node = dup_node->parent;
3885              if (!node)
3886                return dup_root;
3887            }
3888          node = node->right;
3889          p_new = &dup_node->right;
3890        }
3891    }
3892}