1/* 2 * Copyright (C) 2005 Junio C Hamano 3 */ 4#include"cache.h" 5#include"diff.h" 6#include"diffcore.h" 7#include"delta.h" 8 9struct diff_rename_pool { 10struct diff_filespec **s; 11int nr, alloc; 12}; 13 14static voiddiff_rename_pool_clear(struct diff_rename_pool *pool) 15{ 16 pool->s = NULL; pool->nr = pool->alloc =0; 17} 18 19static voiddiff_rename_pool_add(struct diff_rename_pool *pool, 20struct diff_filespec *s) 21{ 22if(S_ISDIR(s->mode)) 23return;/* rename/copy patch for tree does not make sense. */ 24 25if(pool->alloc <= pool->nr) { 26 pool->alloc =alloc_nr(pool->alloc); 27 pool->s =xrealloc(pool->s, 28sizeof(*(pool->s)) * pool->alloc); 29} 30 pool->s[pool->nr] = s; 31 pool->nr++; 32} 33 34static intis_exact_match(struct diff_filespec *src,struct diff_filespec *dst) 35{ 36if(src->sha1_valid && dst->sha1_valid && 37!memcmp(src->sha1, dst->sha1,20)) 38return1; 39if(diff_populate_filespec(src) ||diff_populate_filespec(dst)) 40/* this is an error but will be caught downstream */ 41return0; 42if(src->size == dst->size && 43!memcmp(src->data, dst->data, src->size)) 44return1; 45return0; 46} 47 48struct diff_score { 49struct diff_filespec *src; 50struct diff_filespec *dst; 51int score; 52int rank; 53}; 54 55static intestimate_similarity(struct diff_filespec *src, 56struct diff_filespec *dst, 57int minimum_score) 58{ 59/* src points at a file that existed in the original tree (or 60 * optionally a file in the destination tree) and dst points 61 * at a newly created file. They may be quite similar, in which 62 * case we want to say src is renamed to dst or src is copied into 63 * dst, and then some edit has been applied to dst. 64 * 65 * Compare them and return how similar they are, representing 66 * the score as an integer between 0 and 10000, except 67 * where they match exactly it is considered better than anything 68 * else. 69 */ 70void*delta; 71unsigned long delta_size; 72int score; 73 74 delta_size = ((src->size < dst->size) ? 75(dst->size - src->size) : (src->size - dst->size)); 76 77/* We would not consider rename followed by more than 78 * minimum_score/MAX_SCORE edits; that is, delta_size must be smaller 79 * than (src->size + dst->size)/2 * minimum_score/MAX_SCORE, 80 * which means... 81 */ 82 83if((src->size+dst->size)*minimum_score < delta_size*MAX_SCORE*2) 84return0; 85 86 delta =diff_delta(src->data, src->size, 87 dst->data, dst->size, 88&delta_size); 89free(delta); 90 91/* This "delta" is really xdiff with adler32 and all the 92 * overheads but it is a quick and dirty approximation. 93 * 94 * Now we will give some score to it. 100% edit gets 95 * 0 points and 0% edit gets MAX_SCORE points. That is, every 96 * 1/MAX_SCORE edit gets 1 point penalty. The amount of penalty is: 97 * 98 * (delta_size * 2 / (src->size + dst->size)) * MAX_SCORE 99 * 100 */ 101 score = MAX_SCORE-(MAX_SCORE*2*delta_size/(src->size+dst->size)); 102if(score <0)return0; 103if(MAX_SCORE < score)return MAX_SCORE; 104return score; 105} 106 107static voidrecord_rename_pair(struct diff_queue_struct *outq, 108struct diff_filespec *src, 109struct diff_filespec *dst, 110int rank, 111int score) 112{ 113/* The rank is used to sort the final output, because there 114 * are certain dependencies. 115 * 116 * - rank #0 depends on deleted ones. 117 * - rank #1 depends on kept files before they are modified. 118 * - rank #2 depends on kept files after they are modified; 119 * currently not used. 120 * 121 * Therefore, the final output order should be: 122 * 123 * 1. rank #0 rename/copy diffs. 124 * 2. deletions in the original. 125 * 3. rank #1 rename/copy diffs. 126 * 4. additions and modifications in the original. 127 * 5. rank #2 rename/copy diffs; currently not used. 128 * 129 * To achieve this sort order, we give xform_work the number 130 * above. 131 */ 132struct diff_file_pair *dp =diff_queue(outq, src, dst); 133 dp->xfrm_work = (rank *2+1) | (score<<RENAME_SCORE_SHIFT); 134 dst->xfrm_flags |= RENAME_DST_MATCHED; 135} 136 137#if 0 138static voiddebug_filespec(struct diff_filespec *s,int x,const char*one) 139{ 140fprintf(stderr,"queue[%d]%s(%s)%s %06o%s\n", 141 x, one, 142 s->path, 143 s->file_valid ?"valid":"invalid", 144 s->mode, 145 s->sha1_valid ?sha1_to_hex(s->sha1) :""); 146fprintf(stderr,"queue[%d]%ssize%lu flags%d\n", 147 x, one, 148 s->size, s->xfrm_flags); 149} 150 151static voiddebug_filepair(const struct diff_file_pair *p,int i) 152{ 153debug_filespec(p->one, i,"one"); 154debug_filespec(p->two, i,"two"); 155fprintf(stderr,"pair flags%d, orig order%d, score%d\n", 156(p->xfrm_work & ((1<<RENAME_SCORE_SHIFT) -1)), 157 p->orig_order, 158(p->xfrm_work >> RENAME_SCORE_SHIFT)); 159} 160 161static voiddebug_queue(const char*msg,struct diff_queue_struct *q) 162{ 163int i; 164if(msg) 165fprintf(stderr,"%s\n", msg); 166fprintf(stderr,"q->nr =%d\n", q->nr); 167for(i =0; i < q->nr; i++) { 168struct diff_file_pair *p = q->queue[i]; 169debug_filepair(p, i); 170} 171} 172#else 173#define debug_queue(a,b) do { ;/*nothing*/ } while(0) 174#endif 175 176/* 177 * We sort the outstanding diff entries according to the rank (see 178 * comment at the beginning of record_rename_pair) and tiebreak with 179 * the order in the original input. 180 */ 181static intrank_compare(const void*a_,const void*b_) 182{ 183const struct diff_file_pair *a = *(const struct diff_file_pair **)a_; 184const struct diff_file_pair *b = *(const struct diff_file_pair **)b_; 185int a_rank = a->xfrm_work & ((1<<RENAME_SCORE_SHIFT) -1); 186int b_rank = b->xfrm_work & ((1<<RENAME_SCORE_SHIFT) -1); 187 188if(a_rank != b_rank) 189return a_rank - b_rank; 190return a->orig_order - b->orig_order; 191} 192 193/* 194 * We sort the rename similarity matrix with the score, in descending 195 * order (more similar first). 196 */ 197static intscore_compare(const void*a_,const void*b_) 198{ 199const struct diff_score *a = a_, *b = b_; 200return b->score - a->score; 201} 202 203static intneeds_to_stay(struct diff_queue_struct *q,int i, 204struct diff_filespec *it) 205{ 206/* If it will be used in later entry (either stay or used 207 * as the source of rename/copy), we need to copy, not rename. 208 */ 209while(i < q->nr) { 210struct diff_file_pair *p = q->queue[i++]; 211if(!p->two->file_valid) 212continue;/* removed is fine */ 213if(strcmp(p->one->path, it->path)) 214continue;/* not relevant */ 215 216/* p has its src set to *it and it is not a delete; 217 * it will be used for in-place change or rename/copy, 218 * so we cannot rename it out. 219 */ 220return1; 221} 222return0; 223} 224 225voiddiff_detect_rename(struct diff_queue_struct *q, 226int detect_rename, 227int minimum_score) 228{ 229struct diff_queue_struct outq; 230struct diff_rename_pool created, deleted, stay; 231struct diff_rename_pool *(srcs[2]); 232struct diff_score *mx; 233int h, i, j; 234int num_create, num_src, dst_cnt, src_cnt; 235 236 outq.queue = NULL; 237 outq.nr = outq.alloc =0; 238 239diff_rename_pool_clear(&created); 240diff_rename_pool_clear(&deleted); 241diff_rename_pool_clear(&stay); 242 243 srcs[0] = &deleted; 244 srcs[1] = &stay; 245 246/* NEEDSWORK: 247 * (1) make sure we properly ignore but pass trees. 248 * 249 * (2) make sure we do right thing on the same path deleted 250 * and created in the same patch. 251 */ 252 253for(i =0; i < q->nr; i++) { 254struct diff_file_pair *p = q->queue[i]; 255if(!p->one->file_valid) 256if(!p->two->file_valid) 257continue;/* ignore nonsense */ 258else 259diff_rename_pool_add(&created, p->two); 260else if(!p->two->file_valid) 261diff_rename_pool_add(&deleted, p->one); 262else if(1< detect_rename)/* find copy, too */ 263diff_rename_pool_add(&stay, p->one); 264} 265if(created.nr ==0) 266goto cleanup;/* nothing to do */ 267 268/* We really want to cull the candidates list early 269 * with cheap tests in order to avoid doing deltas. 270 * 271 * With the current callers, we should not have already 272 * matched entries at this point, but it is nonetheless 273 * checked for sanity. 274 */ 275for(i =0; i < created.nr; i++) { 276if(created.s[i]->xfrm_flags & RENAME_DST_MATCHED) 277continue;/* we have matched exactly already */ 278for(h =0; h <sizeof(srcs)/sizeof(srcs[0]); h++) { 279struct diff_rename_pool *p = srcs[h]; 280for(j =0; j < p->nr; j++) { 281if(!is_exact_match(p->s[j], created.s[i])) 282continue; 283record_rename_pair(&outq, 284 p->s[j], created.s[i], h, 285 MAX_SCORE); 286break;/* we are done with this entry */ 287} 288} 289} 290debug_queue("done detecting exact", &outq); 291 292/* Have we run out the created file pool? If so we can avoid 293 * doing the delta matrix altogether. 294 */ 295if(outq.nr == created.nr) 296goto flush_rest; 297 298 num_create = (created.nr - outq.nr); 299 num_src = deleted.nr + stay.nr; 300 mx =xmalloc(sizeof(*mx) * num_create * num_src); 301for(dst_cnt = i =0; i < created.nr; i++) { 302int base = dst_cnt * num_src; 303if(created.s[i]->xfrm_flags & RENAME_DST_MATCHED) 304continue;/* dealt with exact match already. */ 305for(src_cnt = h =0; h <sizeof(srcs)/sizeof(srcs[0]); h++) { 306struct diff_rename_pool *p = srcs[h]; 307for(j =0; j < p->nr; j++, src_cnt++) { 308struct diff_score *m = &mx[base + src_cnt]; 309 m->src = p->s[j]; 310 m->dst = created.s[i]; 311 m->score =estimate_similarity(m->src, m->dst, 312 minimum_score); 313 m->rank = h; 314} 315} 316 dst_cnt++; 317} 318/* cost matrix sorted by most to least similar pair */ 319qsort(mx, num_create * num_src,sizeof(*mx), score_compare); 320for(i =0; i < num_create * num_src; i++) { 321if(mx[i].dst->xfrm_flags & RENAME_DST_MATCHED) 322continue;/* alreayd done, either exact or fuzzy. */ 323if(mx[i].score < minimum_score) 324continue; 325record_rename_pair(&outq, 326 mx[i].src, mx[i].dst, mx[i].rank, 327 mx[i].score); 328} 329free(mx); 330debug_queue("done detecting fuzzy", &outq); 331 332 flush_rest: 333/* At this point, we have found some renames and copies and they 334 * are kept in outq. The original list is still in *q. 335 * 336 * Scan the original list and move them into the outq; we will sort 337 * outq and swap it into the queue supplied to pass that to 338 * downstream, so we assign the sort keys in this loop. 339 * 340 * See comments at the top of record_rename_pair for numbers used 341 * to assign xfrm_work. 342 * 343 * Note that we have not annotated the diff_file_pair with any comment 344 * so there is nothing other than p to free. 345 */ 346for(i =0; i < q->nr; i++) { 347struct diff_file_pair *dp, *p = q->queue[i]; 348if(!p->one->file_valid) { 349if(p->two->file_valid) { 350/* creation */ 351 dp =diff_queue(&outq, p->one, p->two); 352 dp->xfrm_work =4; 353} 354/* otherwise it is a nonsense; just ignore it */ 355} 356else if(!p->two->file_valid) { 357/* deletion */ 358 dp =diff_queue(&outq, p->one, p->two); 359 dp->xfrm_work =2; 360} 361else{ 362/* modification, or stay as is */ 363 dp =diff_queue(&outq, p->one, p->two); 364 dp->xfrm_work =4; 365} 366free(p); 367} 368debug_queue("done copying original", &outq); 369 370/* Sort outq */ 371qsort(outq.queue, outq.nr,sizeof(outq.queue[0]), rank_compare); 372 373debug_queue("done sorting", &outq); 374 375free(q->queue); 376 q->nr = q->alloc =0; 377 q->queue = NULL; 378 379/* Copy it out to q, removing duplicates. */ 380for(i =0; i < outq.nr; i++) { 381struct diff_file_pair *p = outq.queue[i]; 382if(!p->one->file_valid) { 383/* created */ 384if(p->two->xfrm_flags & RENAME_DST_MATCHED) 385;/* rename/copy created it already */ 386else 387diff_queue(q, p->one, p->two); 388} 389else if(!p->two->file_valid) { 390/* deleted */ 391if(p->one->xfrm_flags & RENAME_SRC_GONE) 392;/* rename/copy deleted it already */ 393else 394diff_queue(q, p->one, p->two); 395} 396else if(strcmp(p->one->path, p->two->path)) { 397/* rename or copy */ 398struct diff_file_pair *dp = 399diff_queue(q, p->one, p->two); 400int msglen = (strlen(p->one->path) + 401strlen(p->two->path) +100); 402int score = (p->xfrm_work >> RENAME_SCORE_SHIFT); 403 dp->xfrm_msg =xmalloc(msglen); 404 405/* if we have a later entry that is a rename/copy 406 * that depends on p->one, then we copy here. 407 * otherwise we rename it. 408 */ 409if(needs_to_stay(&outq, i+1, p->one)) { 410/* copy it */ 411sprintf(dp->xfrm_msg, 412"similarity index%d%%\n" 413"copy from%s\n" 414"copy to%s\n", 415(int)(0.5+ score *100/ MAX_SCORE), 416 p->one->path, p->two->path); 417} 418else{ 419/* rename it, and mark it as gone. */ 420 p->one->xfrm_flags |= RENAME_SRC_GONE; 421sprintf(dp->xfrm_msg, 422"similarity index%d%%\n" 423"rename old%s\n" 424"rename new%s\n", 425(int)(0.5+ score *100/ MAX_SCORE), 426 p->one->path, p->two->path); 427} 428} 429else 430/* otherwise it is a modified (or stayed) entry */ 431diff_queue(q, p->one, p->two); 432free(p); 433} 434 435free(outq.queue); 436debug_queue("done collapsing", q); 437 438 cleanup: 439free(created.s); 440free(deleted.s); 441free(stay.s); 442return; 443}