grep.c on commit builtin-grep: make pieces of it available as library. (83b5d2f)
   1#include "cache.h"
   2#include <regex.h>
   3#include "grep.h"
   4
   5void append_grep_pattern(struct grep_opt *opt, const char *pat,
   6                         const char *origin, int no, enum grep_pat_token t)
   7{
   8        struct grep_pat *p = xcalloc(1, sizeof(*p));
   9        p->pattern = pat;
  10        p->origin = origin;
  11        p->no = no;
  12        p->token = t;
  13        *opt->pattern_tail = p;
  14        opt->pattern_tail = &p->next;
  15        p->next = NULL;
  16}
  17
  18static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
  19{
  20        int err = regcomp(&p->regexp, p->pattern, opt->regflags);
  21        if (err) {
  22                char errbuf[1024];
  23                char where[1024];
  24                if (p->no)
  25                        sprintf(where, "In '%s' at %d, ",
  26                                p->origin, p->no);
  27                else if (p->origin)
  28                        sprintf(where, "%s, ", p->origin);
  29                else
  30                        where[0] = 0;
  31                regerror(err, &p->regexp, errbuf, 1024);
  32                regfree(&p->regexp);
  33                die("%s'%s': %s", where, p->pattern, errbuf);
  34        }
  35}
  36
  37static struct grep_expr *compile_pattern_expr(struct grep_pat **);
  38static struct grep_expr *compile_pattern_atom(struct grep_pat **list)
  39{
  40        struct grep_pat *p;
  41        struct grep_expr *x;
  42
  43        p = *list;
  44        switch (p->token) {
  45        case GREP_PATTERN: /* atom */
  46                x = xcalloc(1, sizeof (struct grep_expr));
  47                x->node = GREP_NODE_ATOM;
  48                x->u.atom = p;
  49                *list = p->next;
  50                return x;
  51        case GREP_OPEN_PAREN:
  52                *list = p->next;
  53                x = compile_pattern_expr(list);
  54                if (!x)
  55                        return NULL;
  56                if (!*list || (*list)->token != GREP_CLOSE_PAREN)
  57                        die("unmatched parenthesis");
  58                *list = (*list)->next;
  59                return x;
  60        default:
  61                return NULL;
  62        }
  63}
  64
  65static struct grep_expr *compile_pattern_not(struct grep_pat **list)
  66{
  67        struct grep_pat *p;
  68        struct grep_expr *x;
  69
  70        p = *list;
  71        switch (p->token) {
  72        case GREP_NOT:
  73                if (!p->next)
  74                        die("--not not followed by pattern expression");
  75                *list = p->next;
  76                x = xcalloc(1, sizeof (struct grep_expr));
  77                x->node = GREP_NODE_NOT;
  78                x->u.unary = compile_pattern_not(list);
  79                if (!x->u.unary)
  80                        die("--not followed by non pattern expression");
  81                return x;
  82        default:
  83                return compile_pattern_atom(list);
  84        }
  85}
  86
  87static struct grep_expr *compile_pattern_and(struct grep_pat **list)
  88{
  89        struct grep_pat *p;
  90        struct grep_expr *x, *y, *z;
  91
  92        x = compile_pattern_not(list);
  93        p = *list;
  94        if (p && p->token == GREP_AND) {
  95                if (!p->next)
  96                        die("--and not followed by pattern expression");
  97                *list = p->next;
  98                y = compile_pattern_and(list);
  99                if (!y)
 100                        die("--and not followed by pattern expression");
 101                z = xcalloc(1, sizeof (struct grep_expr));
 102                z->node = GREP_NODE_AND;
 103                z->u.binary.left = x;
 104                z->u.binary.right = y;
 105                return z;
 106        }
 107        return x;
 108}
 109
 110static struct grep_expr *compile_pattern_or(struct grep_pat **list)
 111{
 112        struct grep_pat *p;
 113        struct grep_expr *x, *y, *z;
 114
 115        x = compile_pattern_and(list);
 116        p = *list;
 117        if (x && p && p->token != GREP_CLOSE_PAREN) {
 118                y = compile_pattern_or(list);
 119                if (!y)
 120                        die("not a pattern expression %s", p->pattern);
 121                z = xcalloc(1, sizeof (struct grep_expr));
 122                z->node = GREP_NODE_OR;
 123                z->u.binary.left = x;
 124                z->u.binary.right = y;
 125                return z;
 126        }
 127        return x;
 128}
 129
 130static struct grep_expr *compile_pattern_expr(struct grep_pat **list)
 131{
 132        return compile_pattern_or(list);
 133}
 134
 135void compile_grep_patterns(struct grep_opt *opt)
 136{
 137        struct grep_pat *p;
 138
 139        if (opt->fixed)
 140                return;
 141
 142        /* First compile regexps */
 143        for (p = opt->pattern_list; p; p = p->next) {
 144                if (p->token == GREP_PATTERN)
 145                        compile_regexp(p, opt);
 146                else
 147                        opt->extended = 1;
 148        }
 149
 150        if (!opt->extended)
 151                return;
 152
 153        /* Then bundle them up in an expression.
 154         * A classic recursive descent parser would do.
 155         */
 156        p = opt->pattern_list;
 157        opt->pattern_expression = compile_pattern_expr(&p);
 158        if (p)
 159                die("incomplete pattern expression: %s", p->pattern);
 160}
 161
 162static char *end_of_line(char *cp, unsigned long *left)
 163{
 164        unsigned long l = *left;
 165        while (l && *cp != '\n') {
 166                l--;
 167                cp++;
 168        }
 169        *left = l;
 170        return cp;
 171}
 172
 173static int word_char(char ch)
 174{
 175        return isalnum(ch) || ch == '_';
 176}
 177
 178static void show_line(struct grep_opt *opt, const char *bol, const char *eol,
 179                      const char *name, unsigned lno, char sign)
 180{
 181        if (opt->pathname)
 182                printf("%s%c", name, sign);
 183        if (opt->linenum)
 184                printf("%d%c", lno, sign);
 185        printf("%.*s\n", (int)(eol-bol), bol);
 186}
 187
 188/*
 189 * NEEDSWORK: share code with diff.c
 190 */
 191#define FIRST_FEW_BYTES 8000
 192static int buffer_is_binary(const char *ptr, unsigned long size)
 193{
 194        if (FIRST_FEW_BYTES < size)
 195                size = FIRST_FEW_BYTES;
 196        return !!memchr(ptr, 0, size);
 197}
 198
 199static int fixmatch(const char *pattern, char *line, regmatch_t *match)
 200{
 201        char *hit = strstr(line, pattern);
 202        if (!hit) {
 203                match->rm_so = match->rm_eo = -1;
 204                return REG_NOMATCH;
 205        }
 206        else {
 207                match->rm_so = hit - line;
 208                match->rm_eo = match->rm_so + strlen(pattern);
 209                return 0;
 210        }
 211}
 212
 213static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol, char *eol)
 214{
 215        int hit = 0;
 216        int at_true_bol = 1;
 217        regmatch_t pmatch[10];
 218
 219 again:
 220        if (!opt->fixed) {
 221                regex_t *exp = &p->regexp;
 222                hit = !regexec(exp, bol, ARRAY_SIZE(pmatch),
 223                               pmatch, 0);
 224        }
 225        else {
 226                hit = !fixmatch(p->pattern, bol, pmatch);
 227        }
 228
 229        if (hit && opt->word_regexp) {
 230                if ((pmatch[0].rm_so < 0) ||
 231                    (eol - bol) <= pmatch[0].rm_so ||
 232                    (pmatch[0].rm_eo < 0) ||
 233                    (eol - bol) < pmatch[0].rm_eo)
 234                        die("regexp returned nonsense");
 235
 236                /* Match beginning must be either beginning of the
 237                 * line, or at word boundary (i.e. the last char must
 238                 * not be a word char).  Similarly, match end must be
 239                 * either end of the line, or at word boundary
 240                 * (i.e. the next char must not be a word char).
 241                 */
 242                if ( ((pmatch[0].rm_so == 0 && at_true_bol) ||
 243                      !word_char(bol[pmatch[0].rm_so-1])) &&
 244                     ((pmatch[0].rm_eo == (eol-bol)) ||
 245                      !word_char(bol[pmatch[0].rm_eo])) )
 246                        ;
 247                else
 248                        hit = 0;
 249
 250                if (!hit && pmatch[0].rm_so + bol + 1 < eol) {
 251                        /* There could be more than one match on the
 252                         * line, and the first match might not be
 253                         * strict word match.  But later ones could be!
 254                         */
 255                        bol = pmatch[0].rm_so + bol + 1;
 256                        at_true_bol = 0;
 257                        goto again;
 258                }
 259        }
 260        return hit;
 261}
 262
 263static int match_expr_eval(struct grep_opt *opt,
 264                           struct grep_expr *x,
 265                           char *bol, char *eol)
 266{
 267        switch (x->node) {
 268        case GREP_NODE_ATOM:
 269                return match_one_pattern(opt, x->u.atom, bol, eol);
 270                break;
 271        case GREP_NODE_NOT:
 272                return !match_expr_eval(opt, x->u.unary, bol, eol);
 273        case GREP_NODE_AND:
 274                return (match_expr_eval(opt, x->u.binary.left, bol, eol) &&
 275                        match_expr_eval(opt, x->u.binary.right, bol, eol));
 276        case GREP_NODE_OR:
 277                return (match_expr_eval(opt, x->u.binary.left, bol, eol) ||
 278                        match_expr_eval(opt, x->u.binary.right, bol, eol));
 279        }
 280        die("Unexpected node type (internal error) %d\n", x->node);
 281}
 282
 283static int match_expr(struct grep_opt *opt, char *bol, char *eol)
 284{
 285        struct grep_expr *x = opt->pattern_expression;
 286        return match_expr_eval(opt, x, bol, eol);
 287}
 288
 289static int match_line(struct grep_opt *opt, char *bol, char *eol)
 290{
 291        struct grep_pat *p;
 292        if (opt->extended)
 293                return match_expr(opt, bol, eol);
 294        for (p = opt->pattern_list; p; p = p->next) {
 295                if (match_one_pattern(opt, p, bol, eol))
 296                        return 1;
 297        }
 298        return 0;
 299}
 300
 301int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size)
 302{
 303        char *bol = buf;
 304        unsigned long left = size;
 305        unsigned lno = 1;
 306        struct pre_context_line {
 307                char *bol;
 308                char *eol;
 309        } *prev = NULL, *pcl;
 310        unsigned last_hit = 0;
 311        unsigned last_shown = 0;
 312        int binary_match_only = 0;
 313        const char *hunk_mark = "";
 314        unsigned count = 0;
 315
 316        if (buffer_is_binary(buf, size)) {
 317                switch (opt->binary) {
 318                case GREP_BINARY_DEFAULT:
 319                        binary_match_only = 1;
 320                        break;
 321                case GREP_BINARY_NOMATCH:
 322                        return 0; /* Assume unmatch */
 323                        break;
 324                default:
 325                        break;
 326                }
 327        }
 328
 329        if (opt->pre_context)
 330                prev = xcalloc(opt->pre_context, sizeof(*prev));
 331        if (opt->pre_context || opt->post_context)
 332                hunk_mark = "--\n";
 333
 334        while (left) {
 335                char *eol, ch;
 336                int hit = 0;
 337
 338                eol = end_of_line(bol, &left);
 339                ch = *eol;
 340                *eol = 0;
 341
 342                hit = match_line(opt, bol, eol);
 343                *eol = ch;
 344
 345                /* "grep -v -e foo -e bla" should list lines
 346                 * that do not have either, so inversion should
 347                 * be done outside.
 348                 */
 349                if (opt->invert)
 350                        hit = !hit;
 351                if (opt->unmatch_name_only) {
 352                        if (hit)
 353                                return 0;
 354                        goto next_line;
 355                }
 356                if (hit) {
 357                        count++;
 358                        if (opt->status_only)
 359                                return 1;
 360                        if (binary_match_only) {
 361                                printf("Binary file %s matches\n", name);
 362                                return 1;
 363                        }
 364                        if (opt->name_only) {
 365                                printf("%s\n", name);
 366                                return 1;
 367                        }
 368                        /* Hit at this line.  If we haven't shown the
 369                         * pre-context lines, we would need to show them.
 370                         * When asked to do "count", this still show
 371                         * the context which is nonsense, but the user
 372                         * deserves to get that ;-).
 373                         */
 374                        if (opt->pre_context) {
 375                                unsigned from;
 376                                if (opt->pre_context < lno)
 377                                        from = lno - opt->pre_context;
 378                                else
 379                                        from = 1;
 380                                if (from <= last_shown)
 381                                        from = last_shown + 1;
 382                                if (last_shown && from != last_shown + 1)
 383                                        printf(hunk_mark);
 384                                while (from < lno) {
 385                                        pcl = &prev[lno-from-1];
 386                                        show_line(opt, pcl->bol, pcl->eol,
 387                                                  name, from, '-');
 388                                        from++;
 389                                }
 390                                last_shown = lno-1;
 391                        }
 392                        if (last_shown && lno != last_shown + 1)
 393                                printf(hunk_mark);
 394                        if (!opt->count)
 395                                show_line(opt, bol, eol, name, lno, ':');
 396                        last_shown = last_hit = lno;
 397                }
 398                else if (last_hit &&
 399                         lno <= last_hit + opt->post_context) {
 400                        /* If the last hit is within the post context,
 401                         * we need to show this line.
 402                         */
 403                        if (last_shown && lno != last_shown + 1)
 404                                printf(hunk_mark);
 405                        show_line(opt, bol, eol, name, lno, '-');
 406                        last_shown = lno;
 407                }
 408                if (opt->pre_context) {
 409                        memmove(prev+1, prev,
 410                                (opt->pre_context-1) * sizeof(*prev));
 411                        prev->bol = bol;
 412                        prev->eol = eol;
 413                }
 414
 415        next_line:
 416                bol = eol + 1;
 417                if (!left)
 418                        break;
 419                left--;
 420                lno++;
 421        }
 422
 423        if (opt->status_only)
 424                return 0;
 425        if (opt->unmatch_name_only) {
 426                /* We did not see any hit, so we want to show this */
 427                printf("%s\n", name);
 428                return 1;
 429        }
 430
 431        /* NEEDSWORK:
 432         * The real "grep -c foo *.c" gives many "bar.c:0" lines,
 433         * which feels mostly useless but sometimes useful.  Maybe
 434         * make it another option?  For now suppress them.
 435         */
 436        if (opt->count && count)
 437                printf("%s:%u\n", name, count);
 438        return !!last_hit;
 439}
 440