Merge branch 'kw/patch-ids-optim'
authorJunio C Hamano <gitster@pobox.com>
Fri, 12 Aug 2016 16:47:39 +0000 (09:47 -0700)
committerJunio C Hamano <gitster@pobox.com>
Fri, 12 Aug 2016 16:47:39 +0000 (09:47 -0700)
When "git rebase" tries to compare set of changes on the updated
upstream and our own branch, it computes patch-id for all of these
changes and attempts to find matches. This has been optimized by
lazily computing the full patch-id (which is expensive) to be
compared only for changes that touch the same set of paths.

* kw/patch-ids-optim:
rebase: avoid computing unnecessary patch IDs
patch-ids: add flag to create the diff patch id using header only data
patch-ids: replace the seen indicator with a commit pointer
patch-ids: stop using a hand-rolled hashmap implementation

builtin/log.c
diff.c
diff.h
patch-ids.c
patch-ids.h
revision.c
t/perf/p3400-rebase.sh [new file with mode: 0644]
t/t6007-rev-list-cherry-pick-file.sh
index 1f116bea8c8873d652f573db2047c558fa51bbd3..92dc34dcb0cc54df27b99c91d08786b6c0d5b62a 100644 (file)
@@ -1343,7 +1343,7 @@ static void prepare_bases(struct base_tree_info *bases,
                struct object_id *patch_id;
                if (commit->util)
                        continue;
-               if (commit_patch_id(commit, &diffopt, sha1))
+               if (commit_patch_id(commit, &diffopt, sha1, 0))
                        die(_("cannot get patch id"));
                ALLOC_GROW(bases->patch_id, bases->nr_patch_id + 1, bases->alloc_patch_id);
                patch_id = bases->patch_id + bases->nr_patch_id;
diff --git a/diff.c b/diff.c
index b43d3dd2ecb7b2154ce6c445983e23ce89c69111..534c12e28ea8b077c9fa305a634f7a73cf30589d 100644 (file)
--- a/diff.c
+++ b/diff.c
@@ -4462,7 +4462,7 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
+static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1, int diff_header_only)
 {
        struct diff_queue_struct *q = &diff_queued_diff;
        int i;
@@ -4497,9 +4497,6 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 
                diff_fill_sha1_info(p->one);
                diff_fill_sha1_info(p->two);
-               if (fill_mmfile(&mf1, p->one) < 0 ||
-                               fill_mmfile(&mf2, p->two) < 0)
-                       return error("unable to read files to diff");
 
                len1 = remove_space(p->one->path, strlen(p->one->path));
                len2 = remove_space(p->two->path, strlen(p->two->path));
@@ -4534,6 +4531,13 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
                                        len2, p->two->path);
                git_SHA1_Update(&ctx, buffer, len1);
 
+               if (diff_header_only)
+                       continue;
+
+               if (fill_mmfile(&mf1, p->one) < 0 ||
+                   fill_mmfile(&mf2, p->two) < 0)
+                       return error("unable to read files to diff");
+
                if (diff_filespec_is_binary(p->one) ||
                    diff_filespec_is_binary(p->two)) {
                        git_SHA1_Update(&ctx, oid_to_hex(&p->one->oid),
@@ -4556,11 +4560,11 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
        return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
+int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1, int diff_header_only)
 {
        struct diff_queue_struct *q = &diff_queued_diff;
        int i;
-       int result = diff_get_patch_id(options, sha1);
+       int result = diff_get_patch_id(options, sha1, diff_header_only);
 
        for (i = 0; i < q->nr; i++)
                diff_free_filepair(q->queue[i]);
diff --git a/diff.h b/diff.h
index 125447be09ebf1d79beb7312785c2e89654fbdb0..7883729edf10ae2888ebc672f62f6015dee21550 100644 (file)
--- a/diff.h
+++ b/diff.h
@@ -342,7 +342,7 @@ extern int run_diff_files(struct rev_info *revs, unsigned int option);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *);
+extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int);
 
 extern int diff_result_code(struct diff_options *, int);
 
index a4d00166646d3dc354f3c06e9e75ce7dc3724f6a..082412aca66be6ce826034c5177e2f7f4b3bda58 100644 (file)
@@ -5,7 +5,7 @@
 #include "patch-ids.h"
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
-                   unsigned char *sha1)
+                   unsigned char *sha1, int diff_header_only)
 {
        if (commit->parents)
                diff_tree_sha1(commit->parents->item->object.oid.hash,
@@ -13,93 +13,86 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
        else
                diff_root_tree_sha1(commit->object.oid.hash, "", options);
        diffcore_std(options);
-       return diff_flush_patch_id(options, sha1);
+       return diff_flush_patch_id(options, sha1, diff_header_only);
 }
 
-static const unsigned char *patch_id_access(size_t index, void *table)
+/*
+ * When we cannot load the full patch-id for both commits for whatever
+ * reason, the function returns -1 (i.e. return error(...)). Despite
+ * the "cmp" in the name of this function, the caller only cares about
+ * the return value being zero (a and b are equivalent) or non-zero (a
+ * and b are different), and returning non-zero would keep both in the
+ * result, even if they actually were equivalent, in order to err on
+ * the side of safety.  The actual value being negative does not have
+ * any significance; only that it is non-zero matters.
+ */
+static int patch_id_cmp(struct patch_id *a,
+                       struct patch_id *b,
+                       struct diff_options *opt)
 {
-       struct patch_id **id_table = table;
-       return id_table[index]->patch_id;
+       if (is_null_sha1(a->patch_id) &&
+           commit_patch_id(a->commit, opt, a->patch_id, 0))
+               return error("Could not get patch ID for %s",
+                       oid_to_hex(&a->commit->object.oid));
+       if (is_null_sha1(b->patch_id) &&
+           commit_patch_id(b->commit, opt, b->patch_id, 0))
+               return error("Could not get patch ID for %s",
+                       oid_to_hex(&b->commit->object.oid));
+       return hashcmp(a->patch_id, b->patch_id);
 }
 
-static int patch_pos(struct patch_id **table, int nr, const unsigned char *id)
-{
-       return sha1_pos(id, table, nr, patch_id_access);
-}
-
-#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */
-struct patch_id_bucket {
-       struct patch_id_bucket *next;
-       int nr;
-       struct patch_id bucket[BUCKET_SIZE];
-};
-
 int init_patch_ids(struct patch_ids *ids)
 {
        memset(ids, 0, sizeof(*ids));
        diff_setup(&ids->diffopts);
        DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
        diff_setup_done(&ids->diffopts);
+       hashmap_init(&ids->patches, (hashmap_cmp_fn)patch_id_cmp, 256);
        return 0;
 }
 
 int free_patch_ids(struct patch_ids *ids)
 {
-       struct patch_id_bucket *next, *patches;
-
-       free(ids->table);
-       for (patches = ids->patches; patches; patches = next) {
-               next = patches->next;
-               free(patches);
-       }
+       hashmap_free(&ids->patches, 1);
        return 0;
 }
 
-static struct patch_id *add_commit(struct commit *commit,
-                                  struct patch_ids *ids,
-                                  int no_add)
+static int init_patch_id_entry(struct patch_id *patch,
+                              struct commit *commit,
+                              struct patch_ids *ids)
 {
-       struct patch_id_bucket *bucket;
-       struct patch_id *ent;
-       unsigned char sha1[20];
-       int pos;
+       unsigned char header_only_patch_id[GIT_SHA1_RAWSZ];
 
-       if (commit_patch_id(commit, &ids->diffopts, sha1))
-               return NULL;
-       pos = patch_pos(ids->table, ids->nr, sha1);
-       if (0 <= pos)
-               return ids->table[pos];
-       if (no_add)
-               return NULL;
+       patch->commit = commit;
+       if (commit_patch_id(commit, &ids->diffopts, header_only_patch_id, 1))
+               return -1;
 
-       pos = -1 - pos;
-
-       bucket = ids->patches;
-       if (!bucket || (BUCKET_SIZE <= bucket->nr)) {
-               bucket = xcalloc(1, sizeof(*bucket));
-               bucket->next = ids->patches;
-               ids->patches = bucket;
-       }
-       ent = &bucket->bucket[bucket->nr++];
-       hashcpy(ent->patch_id, sha1);
-
-       ALLOC_GROW(ids->table, ids->nr + 1, ids->alloc);
-       if (pos < ids->nr)
-               memmove(ids->table + pos + 1, ids->table + pos,
-                       sizeof(ent) * (ids->nr - pos));
-       ids->nr++;
-       ids->table[pos] = ent;
-       return ids->table[pos];
+       hashmap_entry_init(patch, sha1hash(header_only_patch_id));
+       return 0;
 }
 
 struct patch_id *has_commit_patch_id(struct commit *commit,
                                     struct patch_ids *ids)
 {
-       return add_commit(commit, ids, 1);
+       struct patch_id patch;
+
+       memset(&patch, 0, sizeof(patch));
+       if (init_patch_id_entry(&patch, commit, ids))
+               return NULL;
+
+       return hashmap_get(&ids->patches, &patch, &ids->diffopts);
 }
 
 struct patch_id *add_commit_patch_id(struct commit *commit,
                                     struct patch_ids *ids)
 {
-       return add_commit(commit, ids, 0);
+       struct patch_id *key = xcalloc(1, sizeof(*key));
+
+       if (init_patch_id_entry(key, commit, ids)) {
+               free(key);
+               return NULL;
+       }
+
+       hashmap_add(&ids->patches, key);
+       return key;
 }
index eeb56b307f602773a41ea3b8cb22d89959c4f9d9..0f34ea11eadffea14620ea2ad45d69894469aef7 100644 (file)
@@ -2,19 +2,18 @@
 #define PATCH_IDS_H
 
 struct patch_id {
-       unsigned char patch_id[20];
-       char seen;
+       struct hashmap_entry ent;
+       unsigned char patch_id[GIT_SHA1_RAWSZ];
+       struct commit *commit;
 };
 
 struct patch_ids {
+       struct hashmap patches;
        struct diff_options diffopts;
-       int nr, alloc;
-       struct patch_id **table;
-       struct patch_id_bucket *patches;
 };
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
-                   unsigned char *sha1);
+                   unsigned char *sha1, int);
 int init_patch_ids(struct patch_ids *);
 int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
index 15873bf24d72175b026e28071e83b1db7a7ed004..8a29cb03c51be119589a763b170809b8208e1857 100644 (file)
@@ -846,7 +846,7 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
                 */
                if (left_first != !!(flags & SYMMETRIC_LEFT))
                        continue;
-               commit->util = add_commit_patch_id(commit, &ids);
+               add_commit_patch_id(commit, &ids);
        }
 
        /* either cherry_mark or cherry_pick are true */
@@ -873,21 +873,9 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
                id = has_commit_patch_id(commit, &ids);
                if (!id)
                        continue;
-               id->seen = 1;
-               commit->object.flags |= cherry_flag;
-       }
 
-       /* Now check the original side for seen ones */
-       for (p = list; p; p = p->next) {
-               struct commit *commit = p->item;
-               struct patch_id *ent;
-
-               ent = commit->util;
-               if (!ent)
-                       continue;
-               if (ent->seen)
-                       commit->object.flags |= cherry_flag;
-               commit->util = NULL;
+               commit->object.flags |= cherry_flag;
+               id->commit->object.flags |= cherry_flag;
        }
 
        free_patch_ids(&ids);
diff --git a/t/perf/p3400-rebase.sh b/t/perf/p3400-rebase.sh
new file mode 100644 (file)
index 0000000..b3e7d52
--- /dev/null
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+test_description='Tests rebase performance'
+. ./perf-lib.sh
+
+test_perf_default_repo
+
+test_expect_success 'setup' '
+       git checkout -f -b base &&
+       git checkout -b to-rebase &&
+       git checkout -b upstream &&
+       for i in $(seq 100)
+       do
+               # simulate huge diffs
+               echo change$i >unrelated-file$i &&
+               seq 1000 >>unrelated-file$i &&
+               git add unrelated-file$i &&
+               test_tick &&
+               git commit -m commit$i unrelated-file$i &&
+               echo change$i >unrelated-file$i &&
+               seq 1000 | tac >>unrelated-file$i &&
+               git add unrelated-file$i &&
+               test_tick &&
+               git commit -m commit$i-reverse unrelated-file$i ||
+               break
+       done &&
+       git checkout to-rebase &&
+       test_commit our-patch interesting-file
+'
+
+test_perf 'rebase on top of a lot of unrelated changes' '
+       git rebase --onto upstream HEAD^ &&
+       git rebase --onto base HEAD^
+'
+
+test_done
index 28d4f6b259c1696435698cb3826c837c8eaf90e3..1408b608eb030f0e6bcc3d531aad426da5f06b12 100755 (executable)
@@ -207,4 +207,25 @@ test_expect_success '--count --left-right' '
        test_cmp expect actual
 '
 
+# Corrupt the object store deliberately to make sure
+# the object is not even checked for its existence.
+remove_loose_object () {
+       sha1="$(git rev-parse "$1")" &&
+       remainder=${sha1#??} &&
+       firsttwo=${sha1%$remainder} &&
+       rm .git/objects/$firsttwo/$remainder
+}
+
+test_expect_success '--cherry-pick avoids looking at full diffs' '
+       git checkout -b shy-diff &&
+       test_commit dont-look-at-me &&
+       echo Hello >dont-look-at-me.t &&
+       test_tick &&
+       git commit -m tip dont-look-at-me.t &&
+       git checkout -b mainline HEAD^ &&
+       test_commit to-cherry-pick &&
+       remove_loose_object shy-diff^:dont-look-at-me.t &&
+       git rev-list --cherry-pick ...shy-diff
+'
+
 test_done