1/*2* Copyright (C) 2005 Junio C Hamano3*/4#include "cache.h"5#include "diff.h"6#include "diffcore.h"7#include "delta.h"8#include "count-delta.h"910/* Table of rename/copy destinations */1112static struct diff_rename_dst {13struct diff_filespec *two;14struct diff_filepair *pair;15} *rename_dst;16static int rename_dst_nr, rename_dst_alloc;1718static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,19int insert_ok)20{21int first, last;2223first = 0;24last = rename_dst_nr;25while (last > first) {26int next = (last + first) >> 1;27struct diff_rename_dst *dst = &(rename_dst[next]);28int cmp = strcmp(two->path, dst->two->path);29if (!cmp)30return dst;31if (cmp < 0) {32last = next;33continue;34}35first = next+1;36}37/* not found */38if (!insert_ok)39return NULL;40/* insert to make it at "first" */41if (rename_dst_alloc <= rename_dst_nr) {42rename_dst_alloc = alloc_nr(rename_dst_alloc);43rename_dst = xrealloc(rename_dst,44rename_dst_alloc * sizeof(*rename_dst));45}46rename_dst_nr++;47if (first < rename_dst_nr)48memmove(rename_dst + first + 1, rename_dst + first,49(rename_dst_nr - first - 1) * sizeof(*rename_dst));50rename_dst[first].two = alloc_filespec(two->path);51fill_filespec(rename_dst[first].two, two->sha1, two->mode);52rename_dst[first].pair = NULL;53return &(rename_dst[first]);54}5556/* Table of rename/copy src files */57static struct diff_rename_src {58struct diff_filespec *one;59unsigned src_path_left : 1;60} *rename_src;61static int rename_src_nr, rename_src_alloc;6263static struct diff_rename_src *register_rename_src(struct diff_filespec *one,64int src_path_left)65{66int first, last;6768first = 0;69last = rename_src_nr;70while (last > first) {71int next = (last + first) >> 1;72struct diff_rename_src *src = &(rename_src[next]);73int cmp = strcmp(one->path, src->one->path);74if (!cmp)75return src;76if (cmp < 0) {77last = next;78continue;79}80first = next+1;81}8283/* insert to make it at "first" */84if (rename_src_alloc <= rename_src_nr) {85rename_src_alloc = alloc_nr(rename_src_alloc);86rename_src = xrealloc(rename_src,87rename_src_alloc * sizeof(*rename_src));88}89rename_src_nr++;90if (first < rename_src_nr)91memmove(rename_src + first + 1, rename_src + first,92(rename_src_nr - first - 1) * sizeof(*rename_src));93rename_src[first].one = one;94rename_src[first].src_path_left = src_path_left;95return &(rename_src[first]);96}9798static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst)99{100if (src->sha1_valid && dst->sha1_valid &&101!memcmp(src->sha1, dst->sha1, 20))102return 1;103if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))104return 0;105if (src->size != dst->size)106return 0;107if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))108return 0;109if (src->size == dst->size &&110!memcmp(src->data, dst->data, src->size))111return 1;112return 0;113}114115struct diff_score {116int src; /* index in rename_src */117int dst; /* index in rename_dst */118int score;119};120121static int estimate_similarity(struct diff_filespec *src,122struct diff_filespec *dst,123int minimum_score)124{125/* src points at a file that existed in the original tree (or126* optionally a file in the destination tree) and dst points127* at a newly created file. They may be quite similar, in which128* case we want to say src is renamed to dst or src is copied into129* dst, and then some edit has been applied to dst.130*131* Compare them and return how similar they are, representing132* the score as an integer between 0 and MAX_SCORE.133*134* When there is an exact match, it is considered a better135* match than anything else; the destination does not even136* call into this function in that case.137*/138void *delta;139unsigned long delta_size, base_size, src_copied, literal_added;140unsigned long delta_limit;141int score;142143/* We deal only with regular files. Symlink renames are handled144* only when they are exact matches --- in other words, no edits145* after renaming.146*/147if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))148return 0;149150delta_size = ((src->size < dst->size) ?151(dst->size - src->size) : (src->size - dst->size));152base_size = ((src->size < dst->size) ? src->size : dst->size);153154/* We would not consider edits that change the file size so155* drastically. delta_size must be smaller than156* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).157*158* Note that base_size == 0 case is handled here already159* and the final score computation below would not have a160* divide-by-zero issue.161*/162if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)163return 0;164165if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))166return 0; /* error but caught downstream */167168delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;169delta = diff_delta(src->data, src->size,170dst->data, dst->size,171&delta_size, delta_limit);172if (!delta)173/* If delta_limit is exceeded, we have too much differences */174return 0;175176/* A delta that has a lot of literal additions would have177* big delta_size no matter what else it does.178*/179if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)180return 0;181182/* Estimate the edit size by interpreting delta. */183if (count_delta(delta, delta_size, &src_copied, &literal_added)) {184free(delta);185return 0;186}187free(delta);188189/* Extent of damage */190if (src->size + literal_added < src_copied)191delta_size = 0;192else193delta_size = (src->size - src_copied) + literal_added;194195/*196* Now we will give some score to it. 100% edit gets 0 points197* and 0% edit gets MAX_SCORE points.198*/199score = MAX_SCORE - (MAX_SCORE * delta_size / base_size);200if (score < 0) return 0;201if (MAX_SCORE < score) return MAX_SCORE;202return score;203}204205static void record_rename_pair(int dst_index, int src_index, int score)206{207struct diff_filespec *one, *two, *src, *dst;208struct diff_filepair *dp;209210if (rename_dst[dst_index].pair)211die("internal error: dst already matched.");212213src = rename_src[src_index].one;214one = alloc_filespec(src->path);215fill_filespec(one, src->sha1, src->mode);216217dst = rename_dst[dst_index].two;218two = alloc_filespec(dst->path);219fill_filespec(two, dst->sha1, dst->mode);220221dp = diff_queue(NULL, one, two);222dp->score = score;223dp->source_stays = rename_src[src_index].src_path_left;224rename_dst[dst_index].pair = dp;225}226227/*228* We sort the rename similarity matrix with the score, in descending229* order (the most similar first).230*/231static int score_compare(const void *a_, const void *b_)232{233const struct diff_score *a = a_, *b = b_;234return b->score - a->score;235}236237static int compute_stays(struct diff_queue_struct *q,238struct diff_filespec *one)239{240int i;241for (i = 0; i < q->nr; i++) {242struct diff_filepair *p = q->queue[i];243if (strcmp(one->path, p->two->path))244continue;245if (DIFF_PAIR_RENAME(p)) {246return 0; /* something else is renamed into this */247}248}249return 1;250}251252void diffcore_rename(int detect_rename, int minimum_score)253{254struct diff_queue_struct *q = &diff_queued_diff;255struct diff_queue_struct outq;256struct diff_score *mx;257int i, j, rename_count;258int num_create, num_src, dst_cnt;259260if (!minimum_score)261minimum_score = DEFAULT_RENAME_SCORE;262rename_count = 0;263264for (i = 0; i < q->nr; i++) {265struct diff_filepair *p = q->queue[i];266if (!DIFF_FILE_VALID(p->one))267if (!DIFF_FILE_VALID(p->two))268continue; /* unmerged */269else270locate_rename_dst(p->two, 1);271else if (!DIFF_FILE_VALID(p->two)) {272/* If the source is a broken "delete", and273* they did not really want to get broken,274* that means the source actually stays.275*/276int stays = (p->broken_pair && !p->score);277register_rename_src(p->one, stays);278}279else if (detect_rename == DIFF_DETECT_COPY)280register_rename_src(p->one, 1);281}282if (rename_dst_nr == 0)283goto cleanup; /* nothing to do */284285/* We really want to cull the candidates list early286* with cheap tests in order to avoid doing deltas.287*/288for (i = 0; i < rename_dst_nr; i++) {289struct diff_filespec *two = rename_dst[i].two;290for (j = 0; j < rename_src_nr; j++) {291struct diff_filespec *one = rename_src[j].one;292if (!is_exact_match(one, two))293continue;294record_rename_pair(i, j, MAX_SCORE);295rename_count++;296break; /* we are done with this entry */297}298}299300/* Have we run out the created file pool? If so we can avoid301* doing the delta matrix altogether.302*/303if (rename_count == rename_dst_nr)304goto cleanup;305306num_create = (rename_dst_nr - rename_count);307num_src = rename_src_nr;308mx = xmalloc(sizeof(*mx) * num_create * num_src);309for (dst_cnt = i = 0; i < rename_dst_nr; i++) {310int base = dst_cnt * num_src;311struct diff_filespec *two = rename_dst[i].two;312if (rename_dst[i].pair)313continue; /* dealt with exact match already. */314for (j = 0; j < rename_src_nr; j++) {315struct diff_filespec *one = rename_src[j].one;316struct diff_score *m = &mx[base+j];317m->src = j;318m->dst = i;319m->score = estimate_similarity(one, two,320minimum_score);321}322dst_cnt++;323}324/* cost matrix sorted by most to least similar pair */325qsort(mx, num_create * num_src, sizeof(*mx), score_compare);326for (i = 0; i < num_create * num_src; i++) {327struct diff_rename_dst *dst = &rename_dst[mx[i].dst];328if (dst->pair)329continue; /* already done, either exact or fuzzy. */330if (mx[i].score < minimum_score)331break; /* there is no more usable pair. */332record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);333rename_count++;334}335free(mx);336337cleanup:338/* At this point, we have found some renames and copies and they339* are recorded in rename_dst. The original list is still in *q.340*/341outq.queue = NULL;342outq.nr = outq.alloc = 0;343for (i = 0; i < q->nr; i++) {344struct diff_filepair *p = q->queue[i];345struct diff_filepair *pair_to_free = NULL;346347if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {348/*349* Creation350*351* We would output this create record if it has352* not been turned into a rename/copy already.353*/354struct diff_rename_dst *dst =355locate_rename_dst(p->two, 0);356if (dst && dst->pair) {357diff_q(&outq, dst->pair);358pair_to_free = p;359}360else361/* no matching rename/copy source, so362* record this as a creation.363*/364diff_q(&outq, p);365}366else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {367/*368* Deletion369*370* We would output this delete record if:371*372* (1) this is a broken delete and the counterpart373* broken create remains in the output; or374* (2) this is not a broken delete, and rename_dst375* does not have a rename/copy to move p->one->path376* out of existence.377*378* Otherwise, the counterpart broken create379* has been turned into a rename-edit; or380* delete did not have a matching create to381* begin with.382*/383if (DIFF_PAIR_BROKEN(p)) {384/* broken delete */385struct diff_rename_dst *dst =386locate_rename_dst(p->one, 0);387if (dst && dst->pair)388/* counterpart is now rename/copy */389pair_to_free = p;390}391else {392for (j = 0; j < rename_dst_nr; j++) {393if (!rename_dst[j].pair)394continue;395if (strcmp(rename_dst[j].pair->396one->path,397p->one->path))398continue;399break;400}401if (j < rename_dst_nr)402/* this path remains */403pair_to_free = p;404}405406if (pair_to_free)407;408else409diff_q(&outq, p);410}411else if (!diff_unmodified_pair(p))412/* all the usual ones need to be kept */413diff_q(&outq, p);414else415/* no need to keep unmodified pairs */416pair_to_free = p;417418if (pair_to_free)419diff_free_filepair(pair_to_free);420}421diff_debug_queue("done copying original", &outq);422423free(q->queue);424*q = outq;425diff_debug_queue("done collapsing", q);426427/* We need to see which rename source really stays here;428* earlier we only checked if the path is left in the result,429* but even if a path remains in the result, if that is coming430* from copying something else on top of it, then the original431* source is lost and does not stay.432*/433for (i = 0; i < q->nr; i++) {434struct diff_filepair *p = q->queue[i];435if (DIFF_PAIR_RENAME(p) && p->source_stays) {436/* If one appears as the target of a rename-copy,437* then mark p->source_stays = 0; otherwise438* leave it as is.439*/440p->source_stays = compute_stays(q, p->one);441}442}443444for (i = 0; i < rename_dst_nr; i++) {445diff_free_filespec_data(rename_dst[i].two);446free(rename_dst[i].two);447}448449free(rename_dst);450rename_dst = NULL;451rename_dst_nr = rename_dst_alloc = 0;452free(rename_src);453rename_src = NULL;454rename_src_nr = rename_src_alloc = 0;455return;456}