Merge branch 'jk/fewer-pack-rescan'
authorJunio C Hamano <gitster@pobox.com>
Wed, 6 Dec 2017 17:23:42 +0000 (09:23 -0800)
committerJunio C Hamano <gitster@pobox.com>
Wed, 6 Dec 2017 17:23:42 +0000 (09:23 -0800)
Internaly we use 0{40} as a placeholder object name to signal the
codepath that there is no such object (e.g. the fast-forward check
while "git fetch" stores a new remote-tracking ref says "we know
there is no 'old' thing pointed at by the ref, as we are creating
it anew" by passing 0{40} for the 'old' side), and expect that a
codepath to locate an in-core object to return NULL as a sign that
the object does not exist. A look-up for an object that does not
exist however is quite costly with a repository with large number
of packfiles. This access pattern has been optimized.

* jk/fewer-pack-rescan:
sha1_file: fast-path null sha1 as a missing object
everything_local: use "quick" object existence check
p5551: add a script to test fetch pack-dir rescans
t/perf/lib-pack: use fast-import checkpoint to create packs
p5550: factor out nonsense-pack creation

fetch-pack.c
sha1_file.c
t/perf/lib-pack.sh [new file with mode: 0644]
t/perf/p5550-fetch-tags.sh
t/perf/p5551-fetch-rescan.sh [new file with mode: 0755]
index 008b25d3db08726b0fc1569f88fba76694bbcc12..9f6b07ad91f8c2c17e85004a0e71a8b69752d120 100644 (file)
@@ -716,7 +716,8 @@ static int everything_local(struct fetch_pack_args *args,
        for (ref = *refs; ref; ref = ref->next) {
                struct object *o;
 
-               if (!has_object_file(&ref->old_oid))
+               if (!has_object_file_with_flags(&ref->old_oid,
+                                               OBJECT_INFO_QUICK))
                        continue;
 
                o = parse_object(&ref->old_oid);
index 8ae6cb6285a86fe3117200cb789bbf57a2063923..afe4b90f6ee0e6c00af3e57c7c92ac5245bd1792 100644 (file)
@@ -1164,6 +1164,9 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi,
                                    lookup_replace_object(sha1) :
                                    sha1;
 
+       if (is_null_sha1(real))
+               return -1;
+
        if (!oi)
                oi = &blank_oi;
 
diff --git a/t/perf/lib-pack.sh b/t/perf/lib-pack.sh
new file mode 100644 (file)
index 0000000..d3865db
--- /dev/null
@@ -0,0 +1,25 @@
+# Helpers for dealing with large numbers of packs.
+
+# create $1 nonsense packs, each with a single blob
+create_packs () {
+       perl -le '
+               my ($n) = @ARGV;
+               for (1..$n) {
+                       print "blob";
+                       print "data <<EOF";
+                       print "$_";
+                       print "EOF";
+                       print "checkpoint"
+               }
+       ' "$@" |
+       git fast-import
+}
+
+# create a large number of packs, disabling any gc which might
+# cause us to repack them
+setup_many_packs () {
+       git config gc.auto 0 &&
+       git config gc.autopacklimit 0 &&
+       git config fastimport.unpacklimit 0 &&
+       create_packs 500
+}
index a5dc39f86af65d6f6b200b95c4f3c66d5f482ec1..d0e0e019ea364cb1cef3863d256aa5596e01eb56 100755 (executable)
@@ -20,6 +20,7 @@ start to show a noticeable performance problem on my machine, but without
 taking too long to set up and run the tests.
 '
 . ./perf-lib.sh
+. "$TEST_DIRECTORY/perf/lib-pack.sh"
 
 # make a long nonsense history on branch $1, consisting of $2 commits, each
 # with a unique file pointing to the blob at $2.
@@ -44,26 +45,6 @@ create_tags () {
        git update-ref --stdin
 }
 
-# create $1 nonsense packs, each with a single blob
-create_packs () {
-       perl -le '
-               my ($n) = @ARGV;
-               for (1..$n) {
-                       print "blob";
-                       print "data <<EOF";
-                       print "$_";
-                       print "EOF";
-               }
-       ' "$@" |
-       git fast-import &&
-
-       git cat-file --batch-all-objects --batch-check='%(objectname)' |
-       while read sha1
-       do
-               echo $sha1 | git pack-objects .git/objects/pack/pack
-       done
-}
-
 test_expect_success 'create parent and child' '
        git init parent &&
        git -C parent commit --allow-empty -m base &&
@@ -84,9 +65,7 @@ test_expect_success 'populate parent tags' '
 test_expect_success 'create child packs' '
        (
                cd child &&
-               git config gc.auto 0 &&
-               git config gc.autopacklimit 0 &&
-               create_packs 500
+               setup_many_packs
        )
 '
 
diff --git a/t/perf/p5551-fetch-rescan.sh b/t/perf/p5551-fetch-rescan.sh
new file mode 100755 (executable)
index 0000000..b99dc23
--- /dev/null
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+test_description='fetch performance with many packs
+
+It is common for fetch to consider objects that we might not have, and it is an
+easy mistake for the code to use a function like `parse_object` that might
+give the correct _answer_ on such an object, but do so slowly (due to
+re-scanning the pack directory for lookup failures).
+
+The resulting performance drop can be hard to notice in a real repository, but
+becomes quite large in a repository with a large number of packs. So this
+test creates a more pathological case, since any mistakes would produce a more
+noticeable slowdown.
+'
+. ./perf-lib.sh
+. "$TEST_DIRECTORY"/perf/lib-pack.sh
+
+test_expect_success 'create parent and child' '
+       git init parent &&
+       git clone parent child
+'
+
+
+test_expect_success 'create refs in the parent' '
+       (
+               cd parent &&
+               git commit --allow-empty -m foo &&
+               head=$(git rev-parse HEAD) &&
+               test_seq 1000 |
+               sed "s,.*,update refs/heads/& $head," |
+               $MODERN_GIT update-ref --stdin
+       )
+'
+
+test_expect_success 'create many packs in the child' '
+       (
+               cd child &&
+               setup_many_packs
+       )
+'
+
+test_perf 'fetch' '
+       # start at the same state for each iteration
+       obj=$($MODERN_GIT -C parent rev-parse HEAD) &&
+       (
+               cd child &&
+               $MODERN_GIT for-each-ref --format="delete %(refname)" refs/remotes |
+               $MODERN_GIT update-ref --stdin &&
+               rm -vf .git/objects/$(echo $obj | sed "s|^..|&/|") &&
+
+               git fetch
+       )
+'
+
+test_done