From cee7f245dcaef6dade28464f59420095a9949aac Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 19 Oct 2006 16:00:04 -0700 Subject: git-pickaxe: blame rewritten. Currently it does what git-blame does, but only faster. More importantly, its internal structure is designed to support content movement (aka cut-and-paste) more easily by allowing more than one paths to be taken from the same commit. Signed-off-by: Junio C Hamano diff --git a/Documentation/git-pickaxe.txt b/Documentation/git-pickaxe.txt new file mode 100644 index 0000000..7685bd0 --- /dev/null +++ b/Documentation/git-pickaxe.txt @@ -0,0 +1,104 @@ +git-pickaxe(1) +============== + +NAME +---- +git-pickaxe - Show what revision and author last modified each line of a file + +SYNOPSIS +-------- +'git-pickaxe' [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [] [--] + +DESCRIPTION +----------- + +Annotates each line in the given file with information from the revision which +last modified the line. Optionally, start annotating from the given revision. + +Also it can limit the range of lines annotated. + +This report doesn't tell you anything about lines which have been deleted or +replaced; you need to use a tool such as gitlink:git-diff[1] or the "pickaxe" +interface briefly mentioned in the following paragraph. + +Apart from supporting file annotation, git also supports searching the +development history for when a code snippet occured in a change. This makes it +possible to track when a code snippet was added to a file, moved or copied +between files, and eventually deleted or replaced. It works by searching for +a text string in the diff. A small example: + +----------------------------------------------------------------------------- +$ git log --pretty=oneline -S'blame_usage' +5040f17eba15504bad66b14a645bddd9b015ebb7 blame -S +ea4c7f9bf69e781dd0cd88d2bccb2bf5cc15c9a7 git-blame: Make the output +----------------------------------------------------------------------------- + +OPTIONS +------- +-c, --compatibility:: + Use the same output mode as gitlink:git-annotate[1] (Default: off). + +-L n,m:: + Annotate only the specified line range (lines count from 1). + +-l, --long:: + Show long rev (Default: off). + +-t, --time:: + Show raw timestamp (Default: off). + +-S, --rev-file :: + Use revs from revs-file instead of calling gitlink:git-rev-list[1]. + +-f, --show-name:: + Show filename in the original commit. By default + filename is shown if there is any line that came from a + file with different name, due to rename detection. + +-n, --show-number:: + Show line number in the original commit (Default: off). + +-p, --porcelain:: + Show in a format designed for machine consumption. + +-h, --help:: + Show help message. + + +THE PORCELAIN FORMAT +-------------------- + +In this format, each line is output after a header; the +header at the minumum has the first line which has: + +- 40-byte SHA-1 of the commit the line is attributed to; +- the line number of the line in the original file; +- the line number of the line in the final file; +- on a line that starts a group of line from a different + commit than the previous one, the number of lines in this + group. On subsequent lines this field is absent. + +This header line is followed by the following information +at least once for each commit: + +- author name ("author"), email ("author-mail"), time + ("author-time"), and timezone ("author-tz"); similarly + for committer. +- filename in the commit the line is attributed to. +- the first line of the commit log message ("summary"). + +The contents of the actual line is output after the above +header, prefixed by a TAB. This is to allow adding more +header elements later. + +SEE ALSO +-------- +gitlink:git-blame[1] + +AUTHOR +------ +Written by Junio C Hamano + +GIT +--- +Part of the gitlink:git[7] suite diff --git a/Documentation/git.txt b/Documentation/git.txt index 3af6fc6..7074e32 100644 --- a/Documentation/git.txt +++ b/Documentation/git.txt @@ -430,6 +430,9 @@ gitlink:git-annotate[1]:: gitlink:git-blame[1]:: Blame file lines on commits. +gitlink:git-pickaxe[1]:: + Find out where each line in a file came from. + gitlink:git-check-ref-format[1]:: Make sure ref name is well formed. diff --git a/Makefile b/Makefile index 66c8b4b..461fef6 100644 --- a/Makefile +++ b/Makefile @@ -288,6 +288,7 @@ BUILTIN_OBJS = \ builtin-mv.o \ builtin-name-rev.o \ builtin-pack-objects.o \ + builtin-pickaxe.o \ builtin-prune.o \ builtin-prune-packed.o \ builtin-push.o \ diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c new file mode 100644 index 0000000..cb69fcc --- /dev/null +++ b/builtin-pickaxe.c @@ -0,0 +1,1194 @@ +/* + * Pickaxe + * + * Copyright (c) 2006, Junio C Hamano + */ + +#include "cache.h" +#include "builtin.h" +#include "blob.h" +#include "commit.h" +#include "tag.h" +#include "tree-walk.h" +#include "diff.h" +#include "diffcore.h" +#include "revision.h" +#include "xdiff-interface.h" + +#include +#include + +static char pickaxe_usage[] = +"git-pickaxe [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [commit] [--] file\n" +" -c, --compatibility Use the same output mode as git-annotate (Default: off)\n" +" -l, --long Show long commit SHA1 (Default: off)\n" +" -t, --time Show raw timestamp (Default: off)\n" +" -f, --show-name Show original filename (Default: auto)\n" +" -n, --show-number Show original linenumber (Default: off)\n" +" -p, --porcelain Show in a format designed for machine consumption\n" +" -L n,m Process only line range n,m, counting from 1\n" +" -S revs-file Use revisions from revs-file instead of calling git-rev-list\n"; + +static int longest_file; +static int longest_author; +static int max_orig_digits; +static int max_digits; + +#define DEBUG 0 + +/* bits #0..7 in revision.h, #8..11 used for merge_bases() in commit.c */ +#define METAINFO_SHOWN (1u<<12) +#define MORE_THAN_ONE_PATH (1u<<13) + +/* + * One blob in a commit + */ +struct origin { + struct commit *commit; + unsigned char blob_sha1[20]; + char path[FLEX_ARRAY]; +}; + +struct blame_entry { + struct blame_entry *prev; + struct blame_entry *next; + + /* the first line of this group in the final image; + * internally all line numbers are 0 based. + */ + int lno; + + /* how many lines this group has */ + int num_lines; + + /* the commit that introduced this group into the final image */ + struct origin *suspect; + + /* true if the suspect is truly guilty; false while we have not + * checked if the group came from one of its parents. + */ + char guilty; + + /* the line number of the first line of this group in the + * suspect's file; internally all line numbers are 0 based. + */ + int s_lno; +}; + +struct scoreboard { + /* the final commit (i.e. where we started digging from) */ + struct commit *final; + + const char *path; + + /* the contents in the final; pointed into by buf pointers of + * blame_entries + */ + const char *final_buf; + unsigned long final_buf_size; + + /* linked list of blames */ + struct blame_entry *ent; + + int num_lines; + int *lineno; +}; + +static void coalesce(struct scoreboard *sb) +{ + struct blame_entry *ent, *next; + + for (ent = sb->ent; ent && (next = ent->next); ent = next) { + if (ent->suspect == next->suspect && + ent->guilty == next->guilty && + ent->s_lno + ent->num_lines == next->s_lno) { + ent->num_lines += next->num_lines; + ent->next = next->next; + if (ent->next) + ent->next->prev = ent; + free(next); + next = ent; /* again */ + } + } +} + +static void free_origin(struct origin *o) +{ + free(o); +} + +static struct origin *find_origin(struct scoreboard *sb, + struct commit *commit, + const char *path) +{ + struct blame_entry *ent; + struct origin *o; + unsigned mode; + char type[10]; + + for (ent = sb->ent; ent; ent = ent->next) { + if (ent->suspect->commit == commit && + !strcmp(ent->suspect->path, path)) + return ent->suspect; + } + + o = xcalloc(1, sizeof(*o) + strlen(path) + 1); + o->commit = commit; + strcpy(o->path, path); + if (get_tree_entry(commit->object.sha1, path, o->blob_sha1, &mode)) + goto err_out; + if (sha1_object_info(o->blob_sha1, type, NULL) || + strcmp(type, blob_type)) + goto err_out; + return o; + err_out: + free_origin(o); + return NULL; +} + +static struct origin *find_rename(struct scoreboard *sb, + struct commit *parent, + struct origin *origin) +{ + struct origin *porigin = NULL; + struct diff_options diff_opts; + int i; + const char *paths[1]; + + diff_setup(&diff_opts); + diff_opts.recursive = 1; + diff_opts.detect_rename = DIFF_DETECT_RENAME; + diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; + paths[0] = NULL; + diff_tree_setup_paths(paths, &diff_opts); + if (diff_setup_done(&diff_opts) < 0) + die("diff-setup"); + diff_tree_sha1(origin->commit->tree->object.sha1, + parent->tree->object.sha1, + "", &diff_opts); + diffcore_std(&diff_opts); + + for (i = 0; i < diff_queued_diff.nr; i++) { + struct diff_filepair *p = diff_queued_diff.queue[i]; + if (p->status == 'R' && !strcmp(p->one->path, origin->path)) { + porigin = find_origin(sb, parent, p->two->path); + break; + } + } + diff_flush(&diff_opts); + return porigin; +} + +struct chunk { + /* line number in postimage; up to but not including this + * line is the same as preimage + */ + int same; + + /* preimage line number after this chunk */ + int p_next; + + /* postimage line number after this chunk */ + int t_next; +}; + +struct patch { + struct chunk *chunks; + int num; +}; + +struct blame_diff_state { + struct xdiff_emit_state xm; + struct patch *ret; + unsigned hunk_post_context; + unsigned hunk_in_pre_context : 1; +}; + +static void process_u_diff(void *state_, char *line, unsigned long len) +{ + struct blame_diff_state *state = state_; + struct chunk *chunk; + int off1, off2, len1, len2, num; + + if (DEBUG) + fprintf(stderr, "%.*s", (int) len, line); + + num = state->ret->num; + if (len < 4 || line[0] != '@' || line[1] != '@') { + if (state->hunk_in_pre_context && line[0] == ' ') + state->ret->chunks[num - 1].same++; + else { + state->hunk_in_pre_context = 0; + if (line[0] == ' ') + state->hunk_post_context++; + else + state->hunk_post_context = 0; + } + return; + } + + if (num && state->hunk_post_context) { + chunk = &state->ret->chunks[num - 1]; + chunk->p_next -= state->hunk_post_context; + chunk->t_next -= state->hunk_post_context; + } + state->ret->num = ++num; + state->ret->chunks = xrealloc(state->ret->chunks, + sizeof(struct chunk) * num); + chunk = &state->ret->chunks[num - 1]; + if (parse_hunk_header(line, len, &off1, &len1, &off2, &len2)) { + state->ret->num--; + return; + } + + /* Line numbers in patch output are one based. */ + off1--; + off2--; + + chunk->same = len2 ? off2 : (off2 + 1); + + chunk->p_next = off1 + (len1 ? len1 : 1); + chunk->t_next = chunk->same + len2; + state->hunk_in_pre_context = 1; + state->hunk_post_context = 0; +} + +static struct patch *compare_buffer(mmfile_t *file_p, mmfile_t *file_o, + int context) +{ + struct blame_diff_state state; + xpparam_t xpp; + xdemitconf_t xecfg; + xdemitcb_t ecb; + + xpp.flags = XDF_NEED_MINIMAL; + xecfg.ctxlen = context; + xecfg.flags = 0; + ecb.outf = xdiff_outf; + ecb.priv = &state; + memset(&state, 0, sizeof(state)); + state.xm.consume = process_u_diff; + state.ret = xmalloc(sizeof(struct patch)); + state.ret->chunks = NULL; + state.ret->num = 0; + + xdl_diff(file_p, file_o, &xpp, &xecfg, &ecb); + + if (state.ret->num) { + struct chunk *chunk; + chunk = &state.ret->chunks[state.ret->num - 1]; + chunk->p_next -= state.hunk_post_context; + chunk->t_next -= state.hunk_post_context; + } + return state.ret; +} + +static struct patch *get_patch(struct origin *parent, struct origin *origin) +{ + mmfile_t file_p, file_o; + char type[10]; + char *blob_p, *blob_o; + struct patch *patch; + + if (DEBUG) fprintf(stderr, "get patch %.8s %.8s\n", + sha1_to_hex(parent->commit->object.sha1), + sha1_to_hex(origin->commit->object.sha1)); + + blob_p = read_sha1_file(parent->blob_sha1, type, + (unsigned long *) &file_p.size); + blob_o = read_sha1_file(origin->blob_sha1, type, + (unsigned long *) &file_o.size); + file_p.ptr = blob_p; + file_o.ptr = blob_o; + if (!file_p.ptr || !file_o.ptr) { + free(blob_p); + free(blob_o); + return NULL; + } + + patch = compare_buffer(&file_p, &file_o, 0); + free(blob_p); + free(blob_o); + return patch; +} + +static void free_patch(struct patch *p) +{ + free(p->chunks); + free(p); +} + +static void add_blame_entry(struct scoreboard *sb, struct blame_entry *e) +{ + struct blame_entry *ent, *prev = NULL; + + for (ent = sb->ent; ent && ent->lno < e->lno; ent = ent->next) + prev = ent; + + /* prev, if not NULL, is the last one that is below e */ + e->prev = prev; + if (prev) { + e->next = prev->next; + prev->next = e; + } + else { + e->next = sb->ent; + sb->ent = e; + } + if (e->next) + e->next->prev = e; +} + +static void dup_entry(struct blame_entry *dst, struct blame_entry *src) +{ + struct blame_entry *p, *n; + p = dst->prev; + n = dst->next; + memcpy(dst, src, sizeof(*src)); + dst->prev = p; + dst->next = n; +} + +static const char *nth_line(struct scoreboard *sb, int lno) +{ + return sb->final_buf + sb->lineno[lno]; +} + +static void split_overlap(struct blame_entry split[3], + struct blame_entry *e, + int tlno, int plno, int same, + struct origin *parent) +{ + /* it is known that lines between tlno to same came from + * parent, and e has an overlap with that range. it also is + * known that parent's line plno corresponds to e's line tlno. + * + * <---- e -----> + * <------> + * <------------> + * <------------> + * <------------------> + * + * Potentially we need to split e into three parts; before + * this chunk, the chunk to be blamed for parent, and after + * that portion. + */ + int chunk_end_lno; + memset(split, 0, sizeof(struct blame_entry [3])); + + if (e->s_lno < tlno) { + /* there is a pre-chunk part not blamed on parent */ + split[0].suspect = e->suspect; + split[0].lno = e->lno; + split[0].s_lno = e->s_lno; + split[0].num_lines = tlno - e->s_lno; + split[1].lno = e->lno + tlno - e->s_lno; + split[1].s_lno = plno; + } + else { + split[1].lno = e->lno; + split[1].s_lno = plno + (e->s_lno - tlno); + } + + if (same < e->s_lno + e->num_lines) { + /* there is a post-chunk part not blamed on parent */ + split[2].suspect = e->suspect; + split[2].lno = e->lno + (same - e->s_lno); + split[2].s_lno = e->s_lno + (same - e->s_lno); + split[2].num_lines = e->s_lno + e->num_lines - same; + chunk_end_lno = split[2].lno; + } + else + chunk_end_lno = e->lno + e->num_lines; + split[1].num_lines = chunk_end_lno - split[1].lno; + + if (split[1].num_lines < 1) + return; + split[1].suspect = parent; +} + +static void split_blame(struct scoreboard *sb, + struct blame_entry split[3], + struct blame_entry *e) +{ + struct blame_entry *new_entry; + + if (split[0].suspect && split[2].suspect) { + /* we need to split e into two and add another for parent */ + dup_entry(e, &split[0]); + + new_entry = xmalloc(sizeof(*new_entry)); + memcpy(new_entry, &(split[2]), sizeof(struct blame_entry)); + add_blame_entry(sb, new_entry); + + new_entry = xmalloc(sizeof(*new_entry)); + memcpy(new_entry, &(split[1]), sizeof(struct blame_entry)); + add_blame_entry(sb, new_entry); + } + else if (!split[0].suspect && !split[2].suspect) + /* parent covers the entire area */ + dup_entry(e, &split[1]); + else if (split[0].suspect) { + dup_entry(e, &split[0]); + + new_entry = xmalloc(sizeof(*new_entry)); + memcpy(new_entry, &(split[1]), sizeof(struct blame_entry)); + add_blame_entry(sb, new_entry); + } + else { + dup_entry(e, &split[1]); + + new_entry = xmalloc(sizeof(*new_entry)); + memcpy(new_entry, &(split[2]), sizeof(struct blame_entry)); + add_blame_entry(sb, new_entry); + } + + if (DEBUG) { + struct blame_entry *ent; + int lno = 0, corrupt = 0; + + for (ent = sb->ent; ent; ent = ent->next) { + if (lno != ent->lno) + corrupt = 1; + if (ent->s_lno < 0) + corrupt = 1; + lno += ent->num_lines; + } + if (corrupt) { + lno = 0; + for (ent = sb->ent; ent; ent = ent->next) { + printf("L %8d l %8d n %8d\n", + lno, ent->lno, ent->num_lines); + lno = ent->lno + ent->num_lines; + } + die("oops"); + } + } +} + +static void blame_overlap(struct scoreboard *sb, struct blame_entry *e, + int tlno, int plno, int same, + struct origin *parent) +{ + struct blame_entry split[3]; + + split_overlap(split, e, tlno, plno, same, parent); + if (!split[1].suspect) + return; + split_blame(sb, split, e); +} + +static int find_last_in_target(struct scoreboard *sb, struct origin *target) +{ + struct blame_entry *e; + int last_in_target = -1; + + for (e = sb->ent; e; e = e->next) { + if (e->guilty || e->suspect != target) + continue; + if (last_in_target < e->s_lno + e->num_lines) + last_in_target = e->s_lno + e->num_lines; + } + return last_in_target; +} + +static void blame_chunk(struct scoreboard *sb, + int tlno, int plno, int same, + struct origin *target, struct origin *parent) +{ + struct blame_entry *e, *n; + + for (e = sb->ent; e; e = n) { + n = e->next; + if (e->guilty || e->suspect != target) + continue; + if (same <= e->s_lno) + continue; + if (tlno < e->s_lno + e->num_lines) + blame_overlap(sb, e, tlno, plno, same, parent); + } +} + +static int pass_blame_to_parent(struct scoreboard *sb, + struct origin *target, + struct origin *parent) +{ + int i, last_in_target, plno, tlno; + struct patch *patch; + + last_in_target = find_last_in_target(sb, target); + if (last_in_target < 0) + return 1; /* nothing remains for this target */ + + patch = get_patch(parent, target); + plno = tlno = 0; + for (i = 0; i < patch->num; i++) { + struct chunk *chunk = &patch->chunks[i]; + + if (DEBUG) + fprintf(stderr, + "plno = %d, tlno = %d, " + "same as parent up to %d, resync %d and %d\n", + plno, tlno, + chunk->same, chunk->p_next, chunk->t_next); + blame_chunk(sb, tlno, plno, chunk->same, target, parent); + plno = chunk->p_next; + tlno = chunk->t_next; + } + /* rest (i.e. anything above tlno) are the same as parent */ + blame_chunk(sb, tlno, plno, last_in_target, target, parent); + + free_patch(patch); + return 0; +} + +#define MAXPARENT 16 + +static void pass_blame(struct scoreboard *sb, struct origin *origin) +{ + int i; + struct commit *commit = origin->commit; + struct commit_list *parent; + struct origin *parent_origin[MAXPARENT], *porigin; + + memset(parent_origin, 0, sizeof(parent_origin)); + for (i = 0, parent = commit->parents; + i < MAXPARENT && parent; + parent = parent->next, i++) { + struct commit *p = parent->item; + + if (parse_commit(p)) + continue; + porigin = find_origin(sb, parent->item, origin->path); + if (!porigin) + porigin = find_rename(sb, parent->item, origin); + if (!porigin) + continue; + if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { + struct blame_entry *e; + for (e = sb->ent; e; e = e->next) + if (e->suspect == origin) + e->suspect = porigin; + /* now everything blamed for origin is blamed for + * porigin, we do not need to keep it anymore. + * Do not free porigin (or the ones we got from + * earlier round); they may still be used elsewhere. + */ + free_origin(origin); + return; + } + parent_origin[i] = porigin; + } + + for (i = 0, parent = commit->parents; + i < MAXPARENT && parent; + parent = parent->next, i++) { + struct origin *porigin = parent_origin[i]; + if (!porigin) + continue; + if (pass_blame_to_parent(sb, origin, porigin)) + return; + } +} + +static void assign_blame(struct scoreboard *sb, struct rev_info *revs) +{ + while (1) { + struct blame_entry *ent; + struct commit *commit; + struct origin *suspect = NULL; + + /* find one suspect to break down */ + for (ent = sb->ent; !suspect && ent; ent = ent->next) + if (!ent->guilty) + suspect = ent->suspect; + if (!suspect) + return; /* all done */ + + commit = suspect->commit; + parse_commit(commit); + if (!(commit->object.flags & UNINTERESTING) && + !(revs->max_age != -1 && commit->date < revs->max_age)) + pass_blame(sb, suspect); + + /* Take responsibility for the remaining entries */ + for (ent = sb->ent; ent; ent = ent->next) + if (ent->suspect == suspect) + ent->guilty = 1; + } +} + +static const char *format_time(unsigned long time, const char *tz_str, + int show_raw_time) +{ + static char time_buf[128]; + time_t t = time; + int minutes, tz; + struct tm *tm; + + if (show_raw_time) { + sprintf(time_buf, "%lu %s", time, tz_str); + return time_buf; + } + + tz = atoi(tz_str); + minutes = tz < 0 ? -tz : tz; + minutes = (minutes / 100)*60 + (minutes % 100); + minutes = tz < 0 ? -minutes : minutes; + t = time + minutes * 60; + tm = gmtime(&t); + + strftime(time_buf, sizeof(time_buf), "%Y-%m-%d %H:%M:%S ", tm); + strcat(time_buf, tz_str); + return time_buf; +} + +struct commit_info +{ + char *author; + char *author_mail; + unsigned long author_time; + char *author_tz; + + /* filled only when asked for details */ + char *committer; + char *committer_mail; + unsigned long committer_time; + char *committer_tz; + + char *summary; +}; + +static void get_ac_line(const char *inbuf, const char *what, + int bufsz, char *person, char **mail, + unsigned long *time, char **tz) +{ + int len; + char *tmp, *endp; + + tmp = strstr(inbuf, what); + if (!tmp) + goto error_out; + tmp += strlen(what); + endp = strchr(tmp, '\n'); + if (!endp) + len = strlen(tmp); + else + len = endp - tmp; + if (bufsz <= len) { + error_out: + /* Ugh */ + person = *mail = *tz = "(unknown)"; + *time = 0; + return; + } + memcpy(person, tmp, len); + + tmp = person; + tmp += len; + *tmp = 0; + while (*tmp != ' ') + tmp--; + *tz = tmp+1; + + *tmp = 0; + while (*tmp != ' ') + tmp--; + *time = strtoul(tmp, NULL, 10); + + *tmp = 0; + while (*tmp != ' ') + tmp--; + *mail = tmp + 1; + *tmp = 0; +} + +static void get_commit_info(struct commit *commit, + struct commit_info *ret, + int detailed) +{ + int len; + char *tmp, *endp; + static char author_buf[1024]; + static char committer_buf[1024]; + static char summary_buf[1024]; + + ret->author = author_buf; + get_ac_line(commit->buffer, "\nauthor ", + sizeof(author_buf), author_buf, &ret->author_mail, + &ret->author_time, &ret->author_tz); + + if (!detailed) + return; + + ret->committer = committer_buf; + get_ac_line(commit->buffer, "\ncommitter ", + sizeof(committer_buf), committer_buf, &ret->committer_mail, + &ret->committer_time, &ret->committer_tz); + + ret->summary = summary_buf; + tmp = strstr(commit->buffer, "\n\n"); + if (!tmp) { + error_out: + sprintf(summary_buf, "(%s)", sha1_to_hex(commit->object.sha1)); + return; + } + tmp += 2; + endp = strchr(tmp, '\n'); + if (!endp) + goto error_out; + len = endp - tmp; + if (len >= sizeof(summary_buf)) + goto error_out; + memcpy(summary_buf, tmp, len); + summary_buf[len] = 0; +} + +#define OUTPUT_ANNOTATE_COMPAT 001 +#define OUTPUT_LONG_OBJECT_NAME 002 +#define OUTPUT_RAW_TIMESTAMP 004 +#define OUTPUT_PORCELAIN 010 +#define OUTPUT_SHOW_NAME 020 +#define OUTPUT_SHOW_NUMBER 040 + +static void emit_porcelain(struct scoreboard *sb, struct blame_entry *ent) +{ + int cnt; + const char *cp; + struct origin *suspect = ent->suspect; + char hex[41]; + + strcpy(hex, sha1_to_hex(suspect->commit->object.sha1)); + printf("%s%c%d %d %d\n", + hex, + ent->guilty ? ' ' : '*', // purely for debugging + ent->s_lno + 1, + ent->lno + 1, + ent->num_lines); + if (!(suspect->commit->object.flags & METAINFO_SHOWN)) { + struct commit_info ci; + suspect->commit->object.flags |= METAINFO_SHOWN; + get_commit_info(suspect->commit, &ci, 1); + printf("author %s\n", ci.author); + printf("author-mail %s\n", ci.author_mail); + printf("author-time %lu\n", ci.author_time); + printf("author-tz %s\n", ci.author_tz); + printf("committer %s\n", ci.committer); + printf("committer-mail %s\n", ci.committer_mail); + printf("committer-time %lu\n", ci.committer_time); + printf("committer-tz %s\n", ci.committer_tz); + printf("filename %s\n", suspect->path); + printf("summary %s\n", ci.summary); + } + else if (suspect->commit->object.flags & MORE_THAN_ONE_PATH) + printf("filename %s\n", suspect->path); + + cp = nth_line(sb, ent->lno); + for (cnt = 0; cnt < ent->num_lines; cnt++) { + char ch; + if (cnt) + printf("%s %d %d\n", hex, + ent->s_lno + 1 + cnt, + ent->lno + 1 + cnt); + putchar('\t'); + do { + ch = *cp++; + putchar(ch); + } while (ch != '\n' && + cp < sb->final_buf + sb->final_buf_size); + } +} + +static void emit_other(struct scoreboard *sb, struct blame_entry *ent, int opt) +{ + int cnt; + const char *cp; + struct origin *suspect = ent->suspect; + struct commit_info ci; + char hex[41]; + int show_raw_time = !!(opt & OUTPUT_RAW_TIMESTAMP); + + get_commit_info(suspect->commit, &ci, 1); + strcpy(hex, sha1_to_hex(suspect->commit->object.sha1)); + + cp = nth_line(sb, ent->lno); + for (cnt = 0; cnt < ent->num_lines; cnt++) { + char ch; + + printf("%.*s", (opt & OUTPUT_LONG_OBJECT_NAME) ? 40 : 8, hex); + if (opt & OUTPUT_ANNOTATE_COMPAT) + printf("\t(%10s\t%10s\t%d)", ci.author, + format_time(ci.author_time, ci.author_tz, + show_raw_time), + ent->lno + 1 + cnt); + else { + if (opt & OUTPUT_SHOW_NAME) + printf(" %-*.*s", longest_file, longest_file, + suspect->path); + if (opt & OUTPUT_SHOW_NUMBER) + printf(" %*d", max_orig_digits, + ent->s_lno + 1 + cnt); + printf(" (%-*.*s %10s %*d) ", + longest_author, longest_author, ci.author, + format_time(ci.author_time, ci.author_tz, + show_raw_time), + max_digits, ent->lno + 1 + cnt); + } + do { + ch = *cp++; + putchar(ch); + } while (ch != '\n' && + cp < sb->final_buf + sb->final_buf_size); + } +} + +static void output(struct scoreboard *sb, int option) +{ + struct blame_entry *ent; + + if (option & OUTPUT_PORCELAIN) { + for (ent = sb->ent; ent; ent = ent->next) { + struct blame_entry *oth; + struct origin *suspect = ent->suspect; + struct commit *commit = suspect->commit; + if (commit->object.flags & MORE_THAN_ONE_PATH) + continue; + for (oth = ent->next; oth; oth = oth->next) { + if ((oth->suspect->commit != commit) || + !strcmp(oth->suspect->path, suspect->path)) + continue; + commit->object.flags |= MORE_THAN_ONE_PATH; + break; + } + } + } + + for (ent = sb->ent; ent; ent = ent->next) { + if (option & OUTPUT_PORCELAIN) + emit_porcelain(sb, ent); + else + emit_other(sb, ent, option); + } +} + +static int prepare_lines(struct scoreboard *sb) +{ + const char *buf = sb->final_buf; + unsigned long len = sb->final_buf_size; + int num = 0, incomplete = 0, bol = 1; + + if (len && buf[len-1] != '\n') + incomplete++; /* incomplete line at the end */ + while (len--) { + if (bol) { + sb->lineno = xrealloc(sb->lineno, + sizeof(int* ) * (num + 1)); + sb->lineno[num] = buf - sb->final_buf; + bol = 0; + } + if (*buf++ == '\n') { + num++; + bol = 1; + } + } + sb->num_lines = num + incomplete; + return sb->num_lines; +} + +static int read_ancestry(const char *graft_file) +{ + FILE *fp = fopen(graft_file, "r"); + char buf[1024]; + if (!fp) + return -1; + while (fgets(buf, sizeof(buf), fp)) { + /* The format is just "Commit Parent1 Parent2 ...\n" */ + int len = strlen(buf); + struct commit_graft *graft = read_graft_line(buf, len); + register_commit_graft(graft, 0); + } + fclose(fp); + return 0; +} + +static int lineno_width(int lines) +{ + int i, width; + + for (width = 1, i = 10; i <= lines + 1; width++) + i *= 10; + return width; +} + +static void find_alignment(struct scoreboard *sb, int *option) +{ + int longest_src_lines = 0; + int longest_dst_lines = 0; + struct blame_entry *e; + + for (e = sb->ent; e; e = e->next) { + struct origin *suspect = e->suspect; + struct commit_info ci; + int num; + + if (!(suspect->commit->object.flags & METAINFO_SHOWN)) { + suspect->commit->object.flags |= METAINFO_SHOWN; + get_commit_info(suspect->commit, &ci, 1); + if (strcmp(suspect->path, sb->path)) + *option |= OUTPUT_SHOW_NAME; + num = strlen(suspect->path); + if (longest_file < num) + longest_file = num; + num = strlen(ci.author); + if (longest_author < num) + longest_author = num; + } + num = e->s_lno + e->num_lines; + if (longest_src_lines < num) + longest_src_lines = num; + num = e->lno + e->num_lines; + if (longest_dst_lines < num) + longest_dst_lines = num; + } + max_orig_digits = lineno_width(longest_src_lines); + max_digits = lineno_width(longest_dst_lines); +} + +static int has_path_in_work_tree(const char *path) +{ + struct stat st; + return !lstat(path, &st); +} + +int cmd_pickaxe(int argc, const char **argv, const char *prefix) +{ + struct rev_info revs; + const char *path; + struct scoreboard sb; + struct origin *o; + struct blame_entry *ent; + int i, seen_dashdash, unk; + long bottom, top, lno; + int output_option = 0; + const char *revs_file = NULL; + const char *final_commit_name = NULL; + char type[10]; + + bottom = top = 0; + seen_dashdash = 0; + for (unk = i = 1; i < argc; i++) { + const char *arg = argv[i]; + if (*arg != '-') + break; + else if (!strcmp("-c", arg)) + output_option |= OUTPUT_ANNOTATE_COMPAT; + else if (!strcmp("-t", arg)) + output_option |= OUTPUT_RAW_TIMESTAMP; + else if (!strcmp("-l", arg)) + output_option |= OUTPUT_LONG_OBJECT_NAME; + else if (!strcmp("-S", arg) && ++i < argc) + revs_file = argv[i]; + else if (!strcmp("-L", arg) && ++i < argc) { + char *term; + arg = argv[i]; + if (bottom || top) + die("More than one '-L n,m' option given"); + bottom = strtol(arg, &term, 10); + if (*term == ',') { + top = strtol(term + 1, &term, 10); + if (*term) + usage(pickaxe_usage); + } + if (bottom && top && top < bottom) { + unsigned long tmp; + tmp = top; top = bottom; bottom = tmp; + } + } + else if (!strcmp("-f", arg) || + !strcmp("--show-name", arg)) + output_option |= OUTPUT_SHOW_NAME; + else if (!strcmp("-n", arg) || + !strcmp("--show-number", arg)) + output_option |= OUTPUT_SHOW_NUMBER; + else if (!strcmp("-p", arg) || + !strcmp("--porcelain", arg)) + output_option |= OUTPUT_PORCELAIN; + else if (!strcmp("--", arg)) { + seen_dashdash = 1; + i++; + break; + } + else + argv[unk++] = arg; + } + + /* We have collected options unknown to us in argv[1..unk] + * which are to be passed to revision machinery if we are + * going to do the "bottom" procesing. + * + * The remaining are: + * + * (1) if seen_dashdash, its either + * "-options -- " or + * "-options -- ". + * but the latter is allowed only if there is no + * options that we passed to revision machinery. + * + * (2) otherwise, we may have "--" somewhere later and + * might be looking at the first one of multiple 'rev' + * parameters (e.g. " master ^next ^maint -- path"). + * See if there is a dashdash first, and give the + * arguments before that to revision machinery. + * After that there must be one 'path'. + * + * (3) otherwise, its one of the three: + * "-options " + * "-options " + * "-options " + * but again the first one is allowed only if + * there is no options that we passed to revision + * machinery. + */ + + if (seen_dashdash) { + /* (1) */ + if (argc <= i) + usage(pickaxe_usage); + path = argv[i]; + if (i + 1 == argc - 1) { + if (unk != 1) + usage(pickaxe_usage); + argv[unk++] = argv[i + 1]; + } + else if (i + 1 != argc) + /* garbage at end */ + usage(pickaxe_usage); + } + else { + int j; + for (j = i; !seen_dashdash && j < argc; j++) + if (!strcmp(argv[j], "--")) + seen_dashdash = j; + if (seen_dashdash) { + if (seen_dashdash + 1 != argc - 1) + usage(pickaxe_usage); + path = argv[seen_dashdash + 1]; + for (j = i; j < seen_dashdash; j++) + argv[unk++] = argv[j]; + } + else { + /* (3) */ + path = argv[i]; + if (i + 1 == argc - 1) { + final_commit_name = argv[i + 1]; + + /* if (unk == 1) we could be getting + * old-style + */ + if (unk == 1 && !has_path_in_work_tree(path)) { + path = argv[i + 1]; + final_commit_name = argv[i]; + } + } + else if (i != argc - 1) + usage(pickaxe_usage); /* garbage at end */ + + if (!has_path_in_work_tree(path)) + die("cannot stat path %s: %s", + path, strerror(errno)); + } + } + + if (final_commit_name) + argv[unk++] = final_commit_name; + + /* Now we got rev and path. We do not want the path pruning + * but we may want "bottom" processing. + */ + argv[unk] = NULL; + + init_revisions(&revs, NULL); + setup_revisions(unk, argv, &revs, "HEAD"); + memset(&sb, 0, sizeof(sb)); + + /* There must be one and only one positive commit in the + * revs->pending array. + */ + for (i = 0; i < revs.pending.nr; i++) { + struct object *obj = revs.pending.objects[i].item; + if (obj->flags & UNINTERESTING) + continue; + while (obj->type == OBJ_TAG) + obj = deref_tag(obj, NULL, 0); + if (obj->type != OBJ_COMMIT) + die("Non commit %s?", + revs.pending.objects[i].name); + if (sb.final) + die("More than one commit to dig from %s and %s?", + revs.pending.objects[i].name, + final_commit_name); + sb.final = (struct commit *) obj; + final_commit_name = revs.pending.objects[i].name; + } + + if (!sb.final) { + /* "--not A B -- path" without anything positive */ + unsigned char head_sha1[20]; + + final_commit_name = "HEAD"; + if (get_sha1(final_commit_name, head_sha1)) + die("No such ref: HEAD"); + sb.final = lookup_commit_reference(head_sha1); + add_pending_object(&revs, &(sb.final->object), "HEAD"); + } + + /* If we have bottom, this will mark the ancestors of the + * bottom commits we would reach while traversing as + * uninteresting. + */ + prepare_revision_walk(&revs); + + o = find_origin(&sb, sb.final, path); + if (!o) + die("no such path %s in %s", path, final_commit_name); + + sb.final_buf = read_sha1_file(o->blob_sha1, type, &sb.final_buf_size); + lno = prepare_lines(&sb); + + if (bottom < 1) + bottom = 1; + if (top < 1) + top = lno; + bottom--; + if (lno < top) + die("file %s has only %lu lines", path, lno); + + ent = xcalloc(1, sizeof(*ent)); + ent->lno = bottom; + ent->num_lines = top - bottom; + ent->suspect = o; + ent->s_lno = bottom; + + sb.ent = ent; + sb.path = path; + + if (revs_file && read_ancestry(revs_file)) + die("reading graft file %s failed: %s", + revs_file, strerror(errno)); + + assign_blame(&sb, &revs); + + coalesce(&sb); + + if (!(output_option & OUTPUT_PORCELAIN)) + find_alignment(&sb, &output_option); + + output(&sb, output_option); + free((void *)sb.final_buf); + for (ent = sb.ent; ent; ) { + struct blame_entry *e = ent->next; + free(ent); + ent = e; + } + return 0; +} diff --git a/builtin.h b/builtin.h index f9fa9ff..7451ce6 100644 --- a/builtin.h +++ b/builtin.h @@ -39,6 +39,7 @@ extern int cmd_mailsplit(int argc, const char **argv, const char *prefix); extern int cmd_mv(int argc, const char **argv, const char *prefix); extern int cmd_name_rev(int argc, const char **argv, const char *prefix); extern int cmd_pack_objects(int argc, const char **argv, const char *prefix); +extern int cmd_pickaxe(int argc, const char **argv, const char *prefix); extern int cmd_prune(int argc, const char **argv, const char *prefix); extern int cmd_prune_packed(int argc, const char **argv, const char *prefix); extern int cmd_push(int argc, const char **argv, const char *prefix); diff --git a/git.c b/git.c index e089b53..6164380 100644 --- a/git.c +++ b/git.c @@ -245,6 +245,7 @@ static void handle_internal_command(int argc, const char **argv, char **envp) { "mv", cmd_mv, RUN_SETUP }, { "name-rev", cmd_name_rev, RUN_SETUP }, { "pack-objects", cmd_pack_objects, RUN_SETUP }, + { "pickaxe", cmd_pickaxe, RUN_SETUP }, { "prune", cmd_prune, RUN_SETUP }, { "prune-packed", cmd_prune_packed, RUN_SETUP }, { "push", cmd_push, RUN_SETUP }, diff --git a/t/annotate-tests.sh b/t/annotate-tests.sh index 8baf2fe..b5ceba4 100644 --- a/t/annotate-tests.sh +++ b/t/annotate-tests.sh @@ -4,6 +4,7 @@ check_count () { head= case "$1" in -h) head="$2"; shift; shift ;; esac + echo "$PROG file $head" >&4 $PROG file $head >.result || return 1 cat .result | perl -e ' my %expect = (@ARGV); diff --git a/t/t8003-pickaxe.sh b/t/t8003-pickaxe.sh new file mode 100755 index 0000000..d09d1c9 --- /dev/null +++ b/t/t8003-pickaxe.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +test_description='git-pickaxe' +. ./test-lib.sh + +PROG='git pickaxe -c' +. ../annotate-tests.sh + +test_done -- cgit v0.10.2-6-g49f6 From d24bba8008d8e537cb48e9760f7621cbe7ae9e38 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 19 Oct 2006 18:49:30 -0700 Subject: git-pickaxe -M: blame line movements within a file. This makes pickaxe more intelligent than the classic blame. A typical example is a change that moves one static C function from lower part of the file to upper part of the same file, because you added a new caller in the middle. The versions in the parent and the child would look like this: parent child A static foo() { B ... C } D A E B F C G D static foo() { ... call foo(); ... E } F H G H With the classic blame algorithm, we can blame lines A B C D E F G and H to the parent. The child is guilty of introducing the line "... call foo();", and the blame is placed on the child. However, the classic blame algorithm fails to notice that the implementation of foo() at the top of the file is not new, and moved from the lower part of the parent. This commit introduces detection of such line movements, and correctly blames the lines that were simply moved in the file to the parent. Signed-off-by: Junio C Hamano diff --git a/Documentation/git-pickaxe.txt b/Documentation/git-pickaxe.txt index 7685bd0..ebae20f 100644 --- a/Documentation/git-pickaxe.txt +++ b/Documentation/git-pickaxe.txt @@ -7,7 +7,9 @@ git-pickaxe - Show what revision and author last modified each line of a file SYNOPSIS -------- -'git-pickaxe' [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [] [--] +[verse] +'git-pickaxe' [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] + [-M] [--since=] [] [--] DESCRIPTION ----------- @@ -61,6 +63,16 @@ OPTIONS -p, --porcelain:: Show in a format designed for machine consumption. +-M:: + Detect moving lines in the file as well. When a commit + moves a block of lines in a file (e.g. the original file + has A and then B, and the commit changes it to B and + then A), traditional 'blame' algorithm typically blames + the lines that were moved up (i.e. B) to the parent and + assigns blame to the lines that were moved down (i.e. A) + to the child commit. With this option, both groups of + lines are blamed on the parent. + -h, --help:: Show help message. diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index cb69fcc..e6ce655 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -19,7 +19,7 @@ #include static char pickaxe_usage[] = -"git-pickaxe [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [commit] [--] file\n" +"git-pickaxe [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [-M] [commit] [--] file\n" " -c, --compatibility Use the same output mode as git-annotate (Default: off)\n" " -l, --long Show long commit SHA1 (Default: off)\n" " -t, --time Show raw timestamp (Default: off)\n" @@ -27,6 +27,7 @@ static char pickaxe_usage[] = " -n, --show-number Show original linenumber (Default: off)\n" " -p, --porcelain Show in a format designed for machine consumption\n" " -L n,m Process only line range n,m, counting from 1\n" +" -M Find line movements within the file\n" " -S revs-file Use revisions from revs-file instead of calling git-rev-list\n"; static int longest_file; @@ -36,6 +37,8 @@ static int max_digits; #define DEBUG 0 +#define PICKAXE_BLAME_MOVE 01 + /* bits #0..7 in revision.h, #8..11 used for merge_bases() in commit.c */ #define METAINFO_SHOWN (1u<<12) #define MORE_THAN_ONE_PATH (1u<<13) @@ -542,9 +545,99 @@ static int pass_blame_to_parent(struct scoreboard *sb, return 0; } +static void copy_split_if_better(struct blame_entry best_so_far[3], + struct blame_entry this[3]) +{ + if (!this[1].suspect) + return; + if (best_so_far[1].suspect && + (this[1].num_lines < best_so_far[1].num_lines)) + return; + memcpy(best_so_far, this, sizeof(struct blame_entry [3])); +} + +static void find_copy_in_blob(struct scoreboard *sb, + struct blame_entry *ent, + struct origin *parent, + struct blame_entry split[3], + mmfile_t *file_p) +{ + const char *cp; + int cnt; + mmfile_t file_o; + struct patch *patch; + int i, plno, tlno; + + cp = nth_line(sb, ent->lno); + file_o.ptr = (char*) cp; + cnt = ent->num_lines; + + while (cnt && cp < sb->final_buf + sb->final_buf_size) { + if (*cp++ == '\n') + cnt--; + } + file_o.size = cp - file_o.ptr; + + patch = compare_buffer(file_p, &file_o, 1); + + memset(split, 0, sizeof(struct blame_entry [3])); + plno = tlno = 0; + for (i = 0; i < patch->num; i++) { + struct chunk *chunk = &patch->chunks[i]; + + /* tlno to chunk->same are the same as ent */ + if (ent->num_lines <= tlno) + break; + if (tlno < chunk->same) { + struct blame_entry this[3]; + split_overlap(this, ent, + tlno + ent->s_lno, plno, + chunk->same + ent->s_lno, + parent); + copy_split_if_better(split, this); + } + plno = chunk->p_next; + tlno = chunk->t_next; + } + free_patch(patch); +} + +static int find_move_in_parent(struct scoreboard *sb, + struct origin *target, + struct origin *parent) +{ + int last_in_target; + struct blame_entry *ent, split[3]; + mmfile_t file_p; + char type[10]; + char *blob_p; + + last_in_target = find_last_in_target(sb, target); + if (last_in_target < 0) + return 1; /* nothing remains for this target */ + + blob_p = read_sha1_file(parent->blob_sha1, type, + (unsigned long *) &file_p.size); + file_p.ptr = blob_p; + if (!file_p.ptr) { + free(blob_p); + return 0; + } + + for (ent = sb->ent; ent; ent = ent->next) { + if (ent->suspect != target || ent->guilty) + continue; + find_copy_in_blob(sb, ent, parent, split, &file_p); + if (split[1].suspect) + split_blame(sb, split, ent); + } + free(blob_p); + return 0; +} + #define MAXPARENT 16 -static void pass_blame(struct scoreboard *sb, struct origin *origin) +static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) { int i; struct commit *commit = origin->commit; @@ -589,9 +682,24 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin) if (pass_blame_to_parent(sb, origin, porigin)) return; } + + /* + * Optionally run "miff" to find moves in parents' files here. + */ + if (opt & PICKAXE_BLAME_MOVE) + for (i = 0, parent = commit->parents; + i < MAXPARENT && parent; + parent = parent->next, i++) { + struct origin *porigin = parent_origin[i]; + if (!porigin) + continue; + if (find_move_in_parent(sb, origin, porigin)) + return; + } + } -static void assign_blame(struct scoreboard *sb, struct rev_info *revs) +static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) { while (1) { struct blame_entry *ent; @@ -609,7 +717,7 @@ static void assign_blame(struct scoreboard *sb, struct rev_info *revs) parse_commit(commit); if (!(commit->object.flags & UNINTERESTING) && !(revs->max_age != -1 && commit->date < revs->max_age)) - pass_blame(sb, suspect); + pass_blame(sb, suspect, opt); /* Take responsibility for the remaining entries */ for (ent = sb->ent; ent; ent = ent->next) @@ -967,13 +1075,14 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) struct scoreboard sb; struct origin *o; struct blame_entry *ent; - int i, seen_dashdash, unk; + int i, seen_dashdash, unk, opt; long bottom, top, lno; int output_option = 0; const char *revs_file = NULL; const char *final_commit_name = NULL; char type[10]; + opt = 0; bottom = top = 0; seen_dashdash = 0; for (unk = i = 1; i < argc; i++) { @@ -988,6 +1097,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) output_option |= OUTPUT_LONG_OBJECT_NAME; else if (!strcmp("-S", arg) && ++i < argc) revs_file = argv[i]; + else if (!strcmp("-M", arg)) + opt |= PICKAXE_BLAME_MOVE; else if (!strcmp("-L", arg) && ++i < argc) { char *term; arg = argv[i]; @@ -1176,7 +1287,7 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) die("reading graft file %s failed: %s", revs_file, strerror(errno)); - assign_blame(&sb, &revs); + assign_blame(&sb, &revs, opt); coalesce(&sb); -- cgit v0.10.2-6-g49f6 From 18abd745a05197f498219f5ba88ce238a3d51580 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 19 Oct 2006 18:50:17 -0700 Subject: git-pickaxe -C: blame cut-and-pasted lines. This completes the initial round of git-pickaxe. In addition to the detection of line movements we already have, this finds new lines that were created by moving or cutting-and-pasting lines from different files in the parent. With this, git pickaxe -f -n -C v1.4.0 -- revision.c finds that a major part of that file actually came from rev-list.c when Linus split the latter at commit ae563642 and blames them to earlier commits that touch rev-list.c. Signed-off-by: Junio C Hamano diff --git a/Documentation/git-pickaxe.txt b/Documentation/git-pickaxe.txt index ebae20f..6d22fd9 100644 --- a/Documentation/git-pickaxe.txt +++ b/Documentation/git-pickaxe.txt @@ -9,7 +9,7 @@ SYNOPSIS -------- [verse] 'git-pickaxe' [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] - [-M] [--since=] [] [--] + [-M] [-C] [-C] [--since=] [] [--] DESCRIPTION ----------- @@ -73,6 +73,14 @@ OPTIONS to the child commit. With this option, both groups of lines are blamed on the parent. +-C:: + In addition to `-M`, detect lines copied from other + files that were modified in the same commit. This is + useful when you reorganize your program and move code + around across files. When this option is given twice, + the command looks for copies from all other files in the + parent for the commit that creates the file in addition. + -h, --help:: Show help message. diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index e6ce655..74c7c9a 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -19,7 +19,7 @@ #include static char pickaxe_usage[] = -"git-pickaxe [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [-M] [commit] [--] file\n" +"git-pickaxe [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [-M] [-C] [-C] [commit] [--] file\n" " -c, --compatibility Use the same output mode as git-annotate (Default: off)\n" " -l, --long Show long commit SHA1 (Default: off)\n" " -t, --time Show raw timestamp (Default: off)\n" @@ -27,7 +27,7 @@ static char pickaxe_usage[] = " -n, --show-number Show original linenumber (Default: off)\n" " -p, --porcelain Show in a format designed for machine consumption\n" " -L n,m Process only line range n,m, counting from 1\n" -" -M Find line movements within the file\n" +" -M, -C Find line movements within and across files\n" " -S revs-file Use revisions from revs-file instead of calling git-rev-list\n"; static int longest_file; @@ -38,6 +38,8 @@ static int max_digits; #define DEBUG 0 #define PICKAXE_BLAME_MOVE 01 +#define PICKAXE_BLAME_COPY 02 +#define PICKAXE_BLAME_COPY_HARDER 04 /* bits #0..7 in revision.h, #8..11 used for merge_bases() in commit.c */ #define METAINFO_SHOWN (1u<<12) @@ -635,6 +637,78 @@ static int find_move_in_parent(struct scoreboard *sb, return 0; } +static int find_copy_in_parent(struct scoreboard *sb, + struct origin *target, + struct commit *parent, + struct origin *porigin, + int opt) +{ + struct diff_options diff_opts; + const char *paths[1]; + struct blame_entry *ent; + int i; + + if (find_last_in_target(sb, target) < 0) + return 1; /* nothing remains for this target */ + + diff_setup(&diff_opts); + diff_opts.recursive = 1; + diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; + + /* Try "find copies harder" on new path */ + if ((opt & PICKAXE_BLAME_COPY_HARDER) && + (!porigin || strcmp(target->path, porigin->path))) { + diff_opts.detect_rename = DIFF_DETECT_COPY; + diff_opts.find_copies_harder = 1; + } + paths[0] = NULL; + diff_tree_setup_paths(paths, &diff_opts); + if (diff_setup_done(&diff_opts) < 0) + die("diff-setup"); + diff_tree_sha1(parent->tree->object.sha1, + target->commit->tree->object.sha1, + "", &diff_opts); + diffcore_std(&diff_opts); + + for (ent = sb->ent; ent; ent = ent->next) { + struct blame_entry split[3]; + if (ent->suspect != target || ent->guilty) + continue; + + memset(split, 0, sizeof(split)); + for (i = 0; i < diff_queued_diff.nr; i++) { + struct diff_filepair *p = diff_queued_diff.queue[i]; + struct origin *norigin; + mmfile_t file_p; + char type[10]; + char *blob; + struct blame_entry this[3]; + + if (!DIFF_FILE_VALID(p->one)) + continue; /* does not exist in parent */ + if (porigin && !strcmp(p->one->path, porigin->path)) + /* find_move already dealt with this path */ + continue; + norigin = find_origin(sb, parent, p->one->path); + + blob = read_sha1_file(norigin->blob_sha1, type, + (unsigned long *) &file_p.size); + file_p.ptr = blob; + if (!file_p.ptr) { + free(blob); + continue; + } + find_copy_in_blob(sb, ent, norigin, this, &file_p); + copy_split_if_better(split, this); + } + if (split[1].suspect) + split_blame(sb, split, ent); + } + diff_flush(&diff_opts); + + return 0; +} + #define MAXPARENT 16 static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) @@ -697,6 +771,18 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) return; } + /* + * Optionally run "ciff" to find copies from parents' files here. + */ + if (opt & PICKAXE_BLAME_COPY) + for (i = 0, parent = commit->parents; + i < MAXPARENT && parent; + parent = parent->next, i++) { + struct origin *porigin = parent_origin[i]; + if (find_copy_in_parent(sb, origin, parent->item, + porigin, opt)) + return; + } } static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) @@ -1099,6 +1185,11 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) revs_file = argv[i]; else if (!strcmp("-M", arg)) opt |= PICKAXE_BLAME_MOVE; + else if (!strcmp("-C", arg)) { + if (opt & PICKAXE_BLAME_COPY) + opt |= PICKAXE_BLAME_COPY_HARDER; + opt |= PICKAXE_BLAME_COPY | PICKAXE_BLAME_MOVE; + } else if (!strcmp("-L", arg) && ++i < argc) { char *term; arg = argv[i]; -- cgit v0.10.2-6-g49f6 From 1617baa5878c9b3886926f0f2e6dc52d067ada4a Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 18:37:10 -0700 Subject: git-pickaxe: pagenate output by default. Signed-off-by: Junio C Hamano diff --git a/git.c b/git.c index 6164380..b944c37 100644 --- a/git.c +++ b/git.c @@ -245,7 +245,7 @@ static void handle_internal_command(int argc, const char **argv, char **envp) { "mv", cmd_mv, RUN_SETUP }, { "name-rev", cmd_name_rev, RUN_SETUP }, { "pack-objects", cmd_pack_objects, RUN_SETUP }, - { "pickaxe", cmd_pickaxe, RUN_SETUP }, + { "pickaxe", cmd_pickaxe, RUN_SETUP | USE_PAGER }, { "prune", cmd_prune, RUN_SETUP }, { "prune-packed", cmd_prune_packed, RUN_SETUP }, { "push", cmd_push, RUN_SETUP }, -- cgit v0.10.2-6-g49f6 From 1ca6ca876e3553d9609823a70d168c50d7beba7e Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 18:48:18 -0700 Subject: git-pickaxe: fix nth_line() We would want to be able to refer to the end of the file as "the beginning of Nth line" for a file that is N lines long. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 74c7c9a..b595299 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -1085,6 +1085,9 @@ static int prepare_lines(struct scoreboard *sb) bol = 1; } } + sb->lineno = xrealloc(sb->lineno, + sizeof(int* ) * (num + incomplete + 1)); + sb->lineno[num + incomplete] = buf - sb->final_buf; sb->num_lines = num + incomplete; return sb->num_lines; } -- cgit v0.10.2-6-g49f6 From 5ff62c300209d761e8e7b3c78da19b910cd9b860 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 14:51:12 -0700 Subject: git-pickaxe: improve "best match" heuristics Instead of comparing number of lines matched, look at the matched characters and count alnums, so that we do not pass blame on not-so-interesting lines, such as an empty line and a line that is indentation followed by a closing brace. Add an option --score-debug to show the score of each blame_entry while we cook this further on the "next" branch. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index b595299..52e32dc 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -34,8 +34,7 @@ static int longest_file; static int longest_author; static int max_orig_digits; static int max_digits; - -#define DEBUG 0 +static int max_score_digits; #define PICKAXE_BLAME_MOVE 01 #define PICKAXE_BLAME_COPY 02 @@ -78,6 +77,11 @@ struct blame_entry { * suspect's file; internally all line numbers are 0 based. */ int s_lno; + + /* how significant this entry is -- cached to avoid + * scanning the lines over and over + */ + unsigned score; }; struct scoreboard { @@ -215,9 +219,6 @@ static void process_u_diff(void *state_, char *line, unsigned long len) struct chunk *chunk; int off1, off2, len1, len2, num; - if (DEBUG) - fprintf(stderr, "%.*s", (int) len, line); - num = state->ret->num; if (len < 4 || line[0] != '@' || line[1] != '@') { if (state->hunk_in_pre_context && line[0] == ' ') @@ -295,10 +296,6 @@ static struct patch *get_patch(struct origin *parent, struct origin *origin) char *blob_p, *blob_o; struct patch *patch; - if (DEBUG) fprintf(stderr, "get patch %.8s %.8s\n", - sha1_to_hex(parent->commit->object.sha1), - sha1_to_hex(origin->commit->object.sha1)); - blob_p = read_sha1_file(parent->blob_sha1, type, (unsigned long *) &file_p.size); blob_o = read_sha1_file(origin->blob_sha1, type, @@ -352,6 +349,7 @@ static void dup_entry(struct blame_entry *dst, struct blame_entry *src) memcpy(dst, src, sizeof(*src)); dst->prev = p; dst->next = n; + dst->score = 0; } static const char *nth_line(struct scoreboard *sb, int lno) @@ -448,7 +446,7 @@ static void split_blame(struct scoreboard *sb, add_blame_entry(sb, new_entry); } - if (DEBUG) { + if (1) { /* sanity */ struct blame_entry *ent; int lno = 0, corrupt = 0; @@ -530,12 +528,6 @@ static int pass_blame_to_parent(struct scoreboard *sb, for (i = 0; i < patch->num; i++) { struct chunk *chunk = &patch->chunks[i]; - if (DEBUG) - fprintf(stderr, - "plno = %d, tlno = %d, " - "same as parent up to %d, resync %d and %d\n", - plno, tlno, - chunk->same, chunk->p_next, chunk->t_next); blame_chunk(sb, tlno, plno, chunk->same, target, parent); plno = chunk->p_next; tlno = chunk->t_next; @@ -547,14 +539,37 @@ static int pass_blame_to_parent(struct scoreboard *sb, return 0; } -static void copy_split_if_better(struct blame_entry best_so_far[3], +static unsigned ent_score(struct scoreboard *sb, struct blame_entry *e) +{ + unsigned score; + const char *cp, *ep; + + if (e->score) + return e->score; + + score = 0; + cp = nth_line(sb, e->lno); + ep = nth_line(sb, e->lno + e->num_lines); + while (cp < ep) { + unsigned ch = *((unsigned char *)cp); + if (isalnum(ch)) + score++; + cp++; + } + e->score = score; + return score; +} + +static void copy_split_if_better(struct scoreboard *sb, + struct blame_entry best_so_far[3], struct blame_entry this[3]) { if (!this[1].suspect) return; - if (best_so_far[1].suspect && - (this[1].num_lines < best_so_far[1].num_lines)) - return; + if (best_so_far[1].suspect) { + if (ent_score(sb, &this[1]) < ent_score(sb, &best_so_far[1])) + return; + } memcpy(best_so_far, this, sizeof(struct blame_entry [3])); } @@ -596,7 +611,7 @@ static void find_copy_in_blob(struct scoreboard *sb, tlno + ent->s_lno, plno, chunk->same + ent->s_lno, parent); - copy_split_if_better(split, this); + copy_split_if_better(sb, split, this); } plno = chunk->p_next; tlno = chunk->t_next; @@ -699,7 +714,7 @@ static int find_copy_in_parent(struct scoreboard *sb, continue; } find_copy_in_blob(sb, ent, norigin, this, &file_p); - copy_split_if_better(split, this); + copy_split_if_better(sb, split, this); } if (split[1].suspect) split_blame(sb, split, ent); @@ -944,6 +959,7 @@ static void get_commit_info(struct commit *commit, #define OUTPUT_PORCELAIN 010 #define OUTPUT_SHOW_NAME 020 #define OUTPUT_SHOW_NUMBER 040 +#define OUTPUT_SHOW_SCORE 0100 static void emit_porcelain(struct scoreboard *sb, struct blame_entry *ent) { @@ -1016,6 +1032,8 @@ static void emit_other(struct scoreboard *sb, struct blame_entry *ent, int opt) show_raw_time), ent->lno + 1 + cnt); else { + if (opt & OUTPUT_SHOW_SCORE) + printf(" %*d", max_score_digits, ent->score); if (opt & OUTPUT_SHOW_NAME) printf(" %-*.*s", longest_file, longest_file, suspect->path); @@ -1060,8 +1078,9 @@ static void output(struct scoreboard *sb, int option) for (ent = sb->ent; ent; ent = ent->next) { if (option & OUTPUT_PORCELAIN) emit_porcelain(sb, ent); - else + else { emit_other(sb, ent, option); + } } } @@ -1121,6 +1140,7 @@ static void find_alignment(struct scoreboard *sb, int *option) { int longest_src_lines = 0; int longest_dst_lines = 0; + unsigned largest_score = 0; struct blame_entry *e; for (e = sb->ent; e; e = e->next) { @@ -1146,9 +1166,12 @@ static void find_alignment(struct scoreboard *sb, int *option) num = e->lno + e->num_lines; if (longest_dst_lines < num) longest_dst_lines = num; + if (largest_score < ent_score(sb, e)) + largest_score = ent_score(sb, e); } max_orig_digits = lineno_width(longest_src_lines); max_digits = lineno_width(longest_dst_lines); + max_score_digits = lineno_width(largest_score); } static int has_path_in_work_tree(const char *path) @@ -1209,6 +1232,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) tmp = top; top = bottom; bottom = tmp; } } + else if (!strcmp("--score-debug", arg)) + output_option |= OUTPUT_SHOW_SCORE; else if (!strcmp("-f", arg) || !strcmp("--show-name", arg)) output_option |= OUTPUT_SHOW_NAME; -- cgit v0.10.2-6-g49f6 From 4a0fc95f182dbf5b0c58e6d6cfe8cd7459da7ab6 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 15:37:12 -0700 Subject: git-pickaxe: introduce heuristics to avoid "trivial" chunks This adds scoring logic to blame_entry to prevent blames on very trivial chunks (e.g. lots of empty lines, indent followed by a closing brace) from being passed down to unrelated lines in the parent. The current heuristics are quite simple and may need to be tweaked later, but we need to start somewhere. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 52e32dc..e5a50b9 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -40,6 +40,15 @@ static int max_score_digits; #define PICKAXE_BLAME_COPY 02 #define PICKAXE_BLAME_COPY_HARDER 04 +/* + * blame for a blame_entry with score lower than these thresholds + * is not passed to the parent using move/copy logic. + */ +static unsigned blame_move_score; +static unsigned blame_copy_score; +#define BLAME_DEFAULT_MOVE_SCORE 20 +#define BLAME_DEFAULT_COPY_SCORE 40 + /* bits #0..7 in revision.h, #8..11 used for merge_bases() in commit.c */ #define METAINFO_SHOWN (1u<<12) #define MORE_THAN_ONE_PATH (1u<<13) @@ -645,7 +654,8 @@ static int find_move_in_parent(struct scoreboard *sb, if (ent->suspect != target || ent->guilty) continue; find_copy_in_blob(sb, ent, parent, split, &file_p); - if (split[1].suspect) + if (split[1].suspect && + blame_move_score < ent_score(sb, &split[1])) split_blame(sb, split, ent); } free(blob_p); @@ -716,7 +726,8 @@ static int find_copy_in_parent(struct scoreboard *sb, find_copy_in_blob(sb, ent, norigin, this, &file_p); copy_split_if_better(sb, split, this); } - if (split[1].suspect) + if (split[1].suspect && + blame_copy_score < ent_score(sb, &split[1])) split_blame(sb, split, ent); } diff_flush(&diff_opts); @@ -1180,6 +1191,15 @@ static int has_path_in_work_tree(const char *path) return !lstat(path, &st); } +static unsigned parse_score(const char *arg) +{ + char *end; + unsigned long score = strtoul(arg, &end, 10); + if (*end) + return 0; + return score; +} + int cmd_pickaxe(int argc, const char **argv, const char *prefix) { struct rev_info revs; @@ -1209,12 +1229,15 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) output_option |= OUTPUT_LONG_OBJECT_NAME; else if (!strcmp("-S", arg) && ++i < argc) revs_file = argv[i]; - else if (!strcmp("-M", arg)) + else if (!strncmp("-M", arg, 2)) { opt |= PICKAXE_BLAME_MOVE; - else if (!strcmp("-C", arg)) { + blame_move_score = parse_score(arg+2); + } + else if (!strncmp("-C", arg, 2)) { if (opt & PICKAXE_BLAME_COPY) opt |= PICKAXE_BLAME_COPY_HARDER; opt |= PICKAXE_BLAME_COPY | PICKAXE_BLAME_MOVE; + blame_copy_score = parse_score(arg+2); } else if (!strcmp("-L", arg) && ++i < argc) { char *term; @@ -1252,6 +1275,11 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) argv[unk++] = arg; } + if (!blame_move_score) + blame_move_score = BLAME_DEFAULT_MOVE_SCORE; + if (!blame_copy_score) + blame_copy_score = BLAME_DEFAULT_COPY_SCORE; + /* We have collected options unknown to us in argv[1..unk] * which are to be passed to revision machinery if we are * going to do the "bottom" procesing. -- cgit v0.10.2-6-g49f6 From 612702e8ea3aa070586001f1aa0a2070058664e0 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Oct 2006 23:49:31 -0700 Subject: git-pickaxe: do not keep commit buffer. We need the commit buffer data while generating the final result, but until then we do not need them. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index e5a50b9..e628b5a 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -108,6 +108,7 @@ struct scoreboard { /* linked list of blames */ struct blame_entry *ent; + /* look-up a line in the final buffer */ int num_lines; int *lineno; }; @@ -188,7 +189,8 @@ static struct origin *find_rename(struct scoreboard *sb, for (i = 0; i < diff_queued_diff.nr; i++) { struct diff_filepair *p = diff_queued_diff.queue[i]; - if (p->status == 'R' && !strcmp(p->one->path, origin->path)) { + if ((p->status == 'R' || p->status == 'C') && + !strcmp(p->one->path, origin->path)) { porigin = find_origin(sb, parent, p->two->path); break; } @@ -457,7 +459,7 @@ static void split_blame(struct scoreboard *sb, if (1) { /* sanity */ struct blame_entry *ent; - int lno = 0, corrupt = 0; + int lno = sb->ent->lno, corrupt = 0; for (ent = sb->ent; ent; ent = ent->next) { if (lno != ent->lno) @@ -467,7 +469,7 @@ static void split_blame(struct scoreboard *sb, lno += ent->num_lines; } if (corrupt) { - lno = 0; + lno = sb->ent->lno; for (ent = sb->ent; ent; ent = ent->next) { printf("L %8d l %8d n %8d\n", lno, ent->lno, ent->num_lines); @@ -508,10 +510,9 @@ static void blame_chunk(struct scoreboard *sb, int tlno, int plno, int same, struct origin *target, struct origin *parent) { - struct blame_entry *e, *n; + struct blame_entry *e; - for (e = sb->ent; e; e = n) { - n = e->next; + for (e = sb->ent; e; e = e->next) { if (e->guilty || e->suspect != target) continue; if (same <= e->s_lno) @@ -556,7 +557,7 @@ static unsigned ent_score(struct scoreboard *sb, struct blame_entry *e) if (e->score) return e->score; - score = 0; + score = 1; cp = nth_line(sb, e->lno); ep = nth_line(sb, e->lno + e->num_lines); while (cp < ep) { @@ -933,6 +934,15 @@ static void get_commit_info(struct commit *commit, static char committer_buf[1024]; static char summary_buf[1024]; + /* We've operated without save_commit_buffer, so + * we now need to populate them for output. + */ + if (!commit->buffer) { + char type[20]; + unsigned long size; + commit->buffer = + read_sha1_file(commit->object.sha1, type, &size); + } ret->author = author_buf; get_ac_line(commit->buffer, "\nauthor ", sizeof(author_buf), author_buf, &ret->author_mail, @@ -1214,6 +1224,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) const char *final_commit_name = NULL; char type[10]; + save_commit_buffer = 0; + opt = 0; bottom = top = 0; seen_dashdash = 0; -- cgit v0.10.2-6-g49f6 From 46014766bdacec8ad22355ce78898b895bf172d6 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 21 Oct 2006 00:41:38 -0700 Subject: git-pickaxe: do not confuse two origins that are the same. It used to be that we can compare the address of the origin structure to determine if they are the same because they are always registered with scoreboard. After introduction of the loop to try finding the best split, that is not true anymore. The current code has rather serious leaks with origin structure, but more importantly it gets confused when two origins that points at the same commit and same path. We might eventually have to refcount and gc origin, but let's fix the correctness issue first. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index e628b5a..3ab87ef 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -113,12 +113,20 @@ struct scoreboard { int *lineno; }; +static int cmp_suspect(struct origin *a, struct origin *b) +{ + int cmp = hashcmp(a->commit->object.sha1, b->commit->object.sha1); + if (cmp) + return cmp; + return strcmp(a->path, b->path); +} + static void coalesce(struct scoreboard *sb) { struct blame_entry *ent, *next; for (ent = sb->ent; ent && (next = ent->next); ent = next) { - if (ent->suspect == next->suspect && + if (!cmp_suspect(ent->suspect, next->suspect) && ent->guilty == next->guilty && ent->s_lno + ent->num_lines == next->s_lno) { ent->num_lines += next->num_lines; @@ -126,6 +134,7 @@ static void coalesce(struct scoreboard *sb) if (ent->next) ent->next->prev = ent; free(next); + ent->score = 0; next = ent; /* again */ } } @@ -498,7 +507,7 @@ static int find_last_in_target(struct scoreboard *sb, struct origin *target) int last_in_target = -1; for (e = sb->ent; e; e = e->next) { - if (e->guilty || e->suspect != target) + if (e->guilty || cmp_suspect(e->suspect, target)) continue; if (last_in_target < e->s_lno + e->num_lines) last_in_target = e->s_lno + e->num_lines; @@ -513,7 +522,7 @@ static void blame_chunk(struct scoreboard *sb, struct blame_entry *e; for (e = sb->ent; e; e = e->next) { - if (e->guilty || e->suspect != target) + if (e->guilty || cmp_suspect(e->suspect, target)) continue; if (same <= e->s_lno) continue; @@ -634,7 +643,7 @@ static int find_move_in_parent(struct scoreboard *sb, struct origin *parent) { int last_in_target; - struct blame_entry *ent, split[3]; + struct blame_entry *e, split[3]; mmfile_t file_p; char type[10]; char *blob_p; @@ -651,13 +660,13 @@ static int find_move_in_parent(struct scoreboard *sb, return 0; } - for (ent = sb->ent; ent; ent = ent->next) { - if (ent->suspect != target || ent->guilty) + for (e = sb->ent; e; e = e->next) { + if (e->guilty || cmp_suspect(e->suspect, target)) continue; - find_copy_in_blob(sb, ent, parent, split, &file_p); + find_copy_in_blob(sb, e, parent, split, &file_p); if (split[1].suspect && blame_move_score < ent_score(sb, &split[1])) - split_blame(sb, split, ent); + split_blame(sb, split, e); } free(blob_p); return 0; @@ -671,7 +680,7 @@ static int find_copy_in_parent(struct scoreboard *sb, { struct diff_options diff_opts; const char *paths[1]; - struct blame_entry *ent; + struct blame_entry *e; int i; if (find_last_in_target(sb, target) < 0) @@ -696,9 +705,9 @@ static int find_copy_in_parent(struct scoreboard *sb, "", &diff_opts); diffcore_std(&diff_opts); - for (ent = sb->ent; ent; ent = ent->next) { + for (e = sb->ent; e; e = e->next) { struct blame_entry split[3]; - if (ent->suspect != target || ent->guilty) + if (e->guilty || cmp_suspect(e->suspect, target)) continue; memset(split, 0, sizeof(split)); @@ -724,12 +733,12 @@ static int find_copy_in_parent(struct scoreboard *sb, free(blob); continue; } - find_copy_in_blob(sb, ent, norigin, this, &file_p); + find_copy_in_blob(sb, e, norigin, this, &file_p); copy_split_if_better(sb, split, this); } if (split[1].suspect && blame_copy_score < ent_score(sb, &split[1])) - split_blame(sb, split, ent); + split_blame(sb, split, e); } diff_flush(&diff_opts); @@ -763,12 +772,6 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) for (e = sb->ent; e; e = e->next) if (e->suspect == origin) e->suspect = porigin; - /* now everything blamed for origin is blamed for - * porigin, we do not need to keep it anymore. - * Do not free porigin (or the ones we got from - * earlier round); they may still be used elsewhere. - */ - free_origin(origin); return; } parent_origin[i] = porigin; @@ -834,8 +837,10 @@ static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) /* Take responsibility for the remaining entries */ for (ent = sb->ent; ent; ent = ent->next) - if (ent->suspect == suspect) + if (!cmp_suspect(ent->suspect, suspect)) ent->guilty = 1; + + coalesce(sb); } } -- cgit v0.10.2-6-g49f6 From f6c0e191020ad330c06438c144e0ea787ca964fd Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 21 Oct 2006 02:56:33 -0700 Subject: git-pickaxe: get rid of wasteful find_origin(). After finding out which path in the parent to scan to pass blames, using get_tree_entry() to extract the blob information again was quite wasteful, since diff-tree already gave us that information. Separate the function to create an origin out as get_origin(). You'll never know what is more efficient unless you try and/or think hard. I somehow thought that extracting one known path out of commit's tree is cheaper than running a diff-tree for the current path between the commit and its parent, but it is not the case. In real, non-toy projects, most commits do not touch the path you are interested in, and if the path is a few levels away from the toplevel, whole-subdirectory comparison logic diff-tree allows us to skip opening lower subdirectories. This commit rewrites find_origin() function to use a single-path diff-tree to see if the parent has the same blob as the current suspect, which is cheaper than extracting the blob information using get_tree_entry() and comparing it with what the current suspect has. This shaves about 6% overhead when annotating kernel/sched.c in the Linux kernel repository on my machine. The saving rises to 25% for arch/i386/kernel/Makefile. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 3ab87ef..cf474b0 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -140,48 +140,103 @@ static void coalesce(struct scoreboard *sb) } } -static void free_origin(struct origin *o) +static struct origin *get_origin(struct scoreboard *sb, + struct commit *commit, + const char *path) { - free(o); -} - -static struct origin *find_origin(struct scoreboard *sb, - struct commit *commit, - const char *path) -{ - struct blame_entry *ent; + struct blame_entry *e; struct origin *o; - unsigned mode; - char type[10]; - for (ent = sb->ent; ent; ent = ent->next) { - if (ent->suspect->commit == commit && - !strcmp(ent->suspect->path, path)) - return ent->suspect; + for (e = sb->ent; e; e = e->next) { + if (e->suspect->commit == commit && + !strcmp(e->suspect->path, path)) + return e->suspect; } - o = xcalloc(1, sizeof(*o) + strlen(path) + 1); o->commit = commit; strcpy(o->path, path); - if (get_tree_entry(commit->object.sha1, path, o->blob_sha1, &mode)) - goto err_out; - if (sha1_object_info(o->blob_sha1, type, NULL) || - strcmp(type, blob_type)) - goto err_out; return o; - err_out: - free_origin(o); - return NULL; } -static struct origin *find_rename(struct scoreboard *sb, +static int fill_blob_sha1(struct origin *origin) +{ + unsigned mode; + char type[10]; + + if (!is_null_sha1(origin->blob_sha1)) + return 0; + if (get_tree_entry(origin->commit->object.sha1, + origin->path, + origin->blob_sha1, &mode)) + goto error_out; + if (sha1_object_info(origin->blob_sha1, type, NULL) || + strcmp(type, blob_type)) + goto error_out; + return 0; + error_out: + hashclr(origin->blob_sha1); + return -1; +} + +static struct origin *find_origin(struct scoreboard *sb, struct commit *parent, struct origin *origin) { struct origin *porigin = NULL; struct diff_options diff_opts; int i; - const char *paths[1]; + const char *paths[2]; + + /* See if the origin->path is different between parent + * and origin first. Most of the time they are the + * same and diff-tree is fairly efficient about this. + */ + diff_setup(&diff_opts); + diff_opts.recursive = 1; + diff_opts.detect_rename = 0; + diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; + paths[0] = origin->path; + paths[1] = NULL; + + diff_tree_setup_paths(paths, &diff_opts); + if (diff_setup_done(&diff_opts) < 0) + die("diff-setup"); + diff_tree_sha1(parent->tree->object.sha1, + origin->commit->tree->object.sha1, + "", &diff_opts); + diffcore_std(&diff_opts); + + /* It is either one entry that says "modified", or "created", + * or nothing. + */ + if (!diff_queued_diff.nr) { + /* The path is the same as parent */ + porigin = get_origin(sb, parent, origin->path); + hashcpy(porigin->blob_sha1, origin->blob_sha1); + } + else if (diff_queued_diff.nr != 1) + die("internal error in pickaxe::find_origin"); + else { + struct diff_filepair *p = diff_queued_diff.queue[0]; + switch (p->status) { + default: + die("internal error in pickaxe::find_origin (%c)", + p->status); + case 'M': + porigin = get_origin(sb, parent, origin->path); + hashcpy(porigin->blob_sha1, p->one->sha1); + break; + case 'A': + case 'T': + /* Did not exist in parent, or type changed */ + break; + } + } + diff_flush(&diff_opts); + if (porigin) + return porigin; + + /* Otherwise we would look for a rename */ diff_setup(&diff_opts); diff_opts.recursive = 1; @@ -191,16 +246,17 @@ static struct origin *find_rename(struct scoreboard *sb, diff_tree_setup_paths(paths, &diff_opts); if (diff_setup_done(&diff_opts) < 0) die("diff-setup"); - diff_tree_sha1(origin->commit->tree->object.sha1, - parent->tree->object.sha1, + diff_tree_sha1(parent->tree->object.sha1, + origin->commit->tree->object.sha1, "", &diff_opts); diffcore_std(&diff_opts); for (i = 0; i < diff_queued_diff.nr; i++) { struct diff_filepair *p = diff_queued_diff.queue[i]; if ((p->status == 'R' || p->status == 'C') && - !strcmp(p->one->path, origin->path)) { - porigin = find_origin(sb, parent, p->two->path); + !strcmp(p->two->path, origin->path)) { + porigin = get_origin(sb, parent, p->one->path); + hashcpy(porigin->blob_sha1, p->one->sha1); break; } } @@ -705,6 +761,15 @@ static int find_copy_in_parent(struct scoreboard *sb, "", &diff_opts); diffcore_std(&diff_opts); + + /* + * NEEDSWORK: This loop is wasteful in that it opens the same + * blob in the parent number of times. We should swap the + * loop inside out, which would require keeping track of + * "best blame so far" for blame entries that the current + * "target" is being suspected. + */ + for (e = sb->ent; e; e = e->next) { struct blame_entry split[3]; if (e->guilty || cmp_suspect(e->suspect, target)) @@ -724,8 +789,8 @@ static int find_copy_in_parent(struct scoreboard *sb, if (porigin && !strcmp(p->one->path, porigin->path)) /* find_move already dealt with this path */ continue; - norigin = find_origin(sb, parent, p->one->path); - + norigin = get_origin(sb, parent, p->one->path); + hashcpy(norigin->blob_sha1, p->one->sha1); blob = read_sha1_file(norigin->blob_sha1, type, (unsigned long *) &file_p.size); file_p.ptr = blob; @@ -735,6 +800,7 @@ static int find_copy_in_parent(struct scoreboard *sb, } find_copy_in_blob(sb, e, norigin, this, &file_p); copy_split_if_better(sb, split, this); + free(blob); } if (split[1].suspect && blame_copy_score < ent_score(sb, &split[1])) @@ -762,9 +828,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) if (parse_commit(p)) continue; - porigin = find_origin(sb, parent->item, origin->path); - if (!porigin) - porigin = find_rename(sb, parent->item, origin); + porigin = find_origin(sb, parent->item, origin); if (!porigin) continue; if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { @@ -1423,8 +1487,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) */ prepare_revision_walk(&revs); - o = find_origin(&sb, sb.final, path); - if (!o) + o = get_origin(&sb, sb.final, path); + if (fill_blob_sha1(o)) die("no such path %s in %s", path, final_commit_name); sb.final_buf = read_sha1_file(o->blob_sha1, type, &sb.final_buf_size); -- cgit v0.10.2-6-g49f6 From aec8fa1f587d68a0e50ada2720c8dc6e09709a9c Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 21 Oct 2006 03:30:53 -0700 Subject: git-pickaxe: swap comparison loop used for -C When assigning blames for code movements across file boundaries, we used to iterate over blame entries (i.e. groups of lines to be blamed) in the outer loop and compared each entry with paths in the parent commit in an inner loop. This meant that we opened the blob data from each path number of times. Reorganize the loop so that we read the same path only once, and compare it against all relevant blame entries. This should perform better, but seems to give mixed results, though. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index cf474b0..663b96d 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -737,11 +737,27 @@ static int find_copy_in_parent(struct scoreboard *sb, struct diff_options diff_opts; const char *paths[1]; struct blame_entry *e; - int i; + int i, j; + struct blame_list { + struct blame_entry *ent; + struct blame_entry split[3]; + } *blame_list; + int num_ents; - if (find_last_in_target(sb, target) < 0) + /* Count the number of entries the target is suspected for, + * and prepare a list of entry and the best split. + */ + for (e = sb->ent, num_ents = 0; e; e = e->next) + if (!e->guilty && !cmp_suspect(e->suspect, target)) + num_ents++; + if (!num_ents) return 1; /* nothing remains for this target */ + blame_list = xcalloc(num_ents, sizeof(struct blame_list)); + for (e = sb->ent, i = 0; e; e = e->next) + if (!e->guilty && !cmp_suspect(e->suspect, target)) + blame_list[i++].ent = e; + diff_setup(&diff_opts); diff_opts.recursive = 1; diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; @@ -761,52 +777,45 @@ static int find_copy_in_parent(struct scoreboard *sb, "", &diff_opts); diffcore_std(&diff_opts); - - /* - * NEEDSWORK: This loop is wasteful in that it opens the same - * blob in the parent number of times. We should swap the - * loop inside out, which would require keeping track of - * "best blame so far" for blame entries that the current - * "target" is being suspected. - */ - - for (e = sb->ent; e; e = e->next) { - struct blame_entry split[3]; - if (e->guilty || cmp_suspect(e->suspect, target)) + for (i = 0; i < diff_queued_diff.nr; i++) { + struct diff_filepair *p = diff_queued_diff.queue[i]; + struct origin *norigin; + mmfile_t file_p; + char type[10]; + char *blob; + struct blame_entry this[3]; + + if (!DIFF_FILE_VALID(p->one)) + continue; /* does not exist in parent */ + if (porigin && !strcmp(p->one->path, porigin->path)) + /* find_move already dealt with this path */ continue; - memset(split, 0, sizeof(split)); - for (i = 0; i < diff_queued_diff.nr; i++) { - struct diff_filepair *p = diff_queued_diff.queue[i]; - struct origin *norigin; - mmfile_t file_p; - char type[10]; - char *blob; - struct blame_entry this[3]; + norigin = get_origin(sb, parent, p->one->path); + hashcpy(norigin->blob_sha1, p->one->sha1); + blob = read_sha1_file(norigin->blob_sha1, type, + (unsigned long *) &file_p.size); + file_p.ptr = blob; + if (!file_p.ptr) + continue; - if (!DIFF_FILE_VALID(p->one)) - continue; /* does not exist in parent */ - if (porigin && !strcmp(p->one->path, porigin->path)) - /* find_move already dealt with this path */ - continue; - norigin = get_origin(sb, parent, p->one->path); - hashcpy(norigin->blob_sha1, p->one->sha1); - blob = read_sha1_file(norigin->blob_sha1, type, - (unsigned long *) &file_p.size); - file_p.ptr = blob; - if (!file_p.ptr) { - free(blob); - continue; - } - find_copy_in_blob(sb, e, norigin, this, &file_p); - copy_split_if_better(sb, split, this); - free(blob); + for (j = 0; j < num_ents; j++) { + find_copy_in_blob(sb, blame_list[j].ent, norigin, + this, &file_p); + copy_split_if_better(sb, blame_list[j].split, + this); } + free(blob); + } + diff_flush(&diff_opts); + + for (j = 0; j < num_ents; j++) { + struct blame_entry *split = blame_list[j].split; if (split[1].suspect && blame_copy_score < ent_score(sb, &split[1])) - split_blame(sb, split, e); + split_blame(sb, split, blame_list[j].ent); } - diff_flush(&diff_opts); + free(blame_list); return 0; } -- cgit v0.10.2-6-g49f6 From 54a4c6173e30774125016f7ff4976eb4a7485d0c Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 29 Oct 2006 03:07:40 -0800 Subject: git-pickaxe: WIP to refcount origin structure. The origin structure is allocated for each commit and path while the code traverse down it is copied into different blame entries. To avoid leaks, try refcounting them. This still seems to leak, which I haven't tracked down fully yet. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 663b96d..b53c7b0 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -36,6 +36,10 @@ static int max_orig_digits; static int max_digits; static int max_score_digits; +#ifndef DEBUG +#define DEBUG 0 +#endif + #define PICKAXE_BLAME_MOVE 01 #define PICKAXE_BLAME_COPY 02 #define PICKAXE_BLAME_COPY_HARDER 04 @@ -54,14 +58,30 @@ static unsigned blame_copy_score; #define MORE_THAN_ONE_PATH (1u<<13) /* - * One blob in a commit + * One blob in a commit that is being suspected */ struct origin { + int refcnt; struct commit *commit; unsigned char blob_sha1[20]; char path[FLEX_ARRAY]; }; +static inline struct origin *origin_incref(struct origin *o) +{ + if (o) + o->refcnt++; + return o; +} + +static void origin_decref(struct origin *o) +{ + if (o && --o->refcnt <= 0) { + memset(o, 0, sizeof(*o)); + free(o); + } +} + struct blame_entry { struct blame_entry *prev; struct blame_entry *next; @@ -121,6 +141,8 @@ static int cmp_suspect(struct origin *a, struct origin *b) return strcmp(a->path, b->path); } +static void sanity_check_refcnt(struct scoreboard *); + static void coalesce(struct scoreboard *sb) { struct blame_entry *ent, *next; @@ -133,11 +155,15 @@ static void coalesce(struct scoreboard *sb) ent->next = next->next; if (ent->next) ent->next->prev = ent; + origin_decref(next->suspect); free(next); ent->score = 0; next = ent; /* again */ } } + + if (DEBUG) /* sanity */ + sanity_check_refcnt(sb); } static struct origin *get_origin(struct scoreboard *sb, @@ -150,10 +176,11 @@ static struct origin *get_origin(struct scoreboard *sb, for (e = sb->ent; e; e = e->next) { if (e->suspect->commit == commit && !strcmp(e->suspect->path, path)) - return e->suspect; + return origin_incref(e->suspect); } o = xcalloc(1, sizeof(*o) + strlen(path) + 1); o->commit = commit; + o->refcnt = 1; strcpy(o->path, path); return o; } @@ -400,6 +427,8 @@ static void add_blame_entry(struct scoreboard *sb, struct blame_entry *e) { struct blame_entry *ent, *prev = NULL; + origin_incref(e->suspect); + for (ent = sb->ent; ent && ent->lno < e->lno; ent = ent->next) prev = ent; @@ -420,8 +449,11 @@ static void add_blame_entry(struct scoreboard *sb, struct blame_entry *e) static void dup_entry(struct blame_entry *dst, struct blame_entry *src) { struct blame_entry *p, *n; + p = dst->prev; n = dst->next; + origin_incref(src->suspect); + origin_decref(dst->suspect); memcpy(dst, src, sizeof(*src)); dst->prev = p; dst->next = n; @@ -433,7 +465,7 @@ static const char *nth_line(struct scoreboard *sb, int lno) return sb->final_buf + sb->lineno[lno]; } -static void split_overlap(struct blame_entry split[3], +static void split_overlap(struct blame_entry *split, struct blame_entry *e, int tlno, int plno, int same, struct origin *parent) @@ -457,7 +489,7 @@ static void split_overlap(struct blame_entry split[3], if (e->s_lno < tlno) { /* there is a pre-chunk part not blamed on parent */ - split[0].suspect = e->suspect; + split[0].suspect = origin_incref(e->suspect); split[0].lno = e->lno; split[0].s_lno = e->s_lno; split[0].num_lines = tlno - e->s_lno; @@ -471,7 +503,7 @@ static void split_overlap(struct blame_entry split[3], if (same < e->s_lno + e->num_lines) { /* there is a post-chunk part not blamed on parent */ - split[2].suspect = e->suspect; + split[2].suspect = origin_incref(e->suspect); split[2].lno = e->lno + (same - e->s_lno); split[2].s_lno = e->s_lno + (same - e->s_lno); split[2].num_lines = e->s_lno + e->num_lines - same; @@ -483,11 +515,11 @@ static void split_overlap(struct blame_entry split[3], if (split[1].num_lines < 1) return; - split[1].suspect = parent; + split[1].suspect = origin_incref(parent); } static void split_blame(struct scoreboard *sb, - struct blame_entry split[3], + struct blame_entry *split, struct blame_entry *e) { struct blame_entry *new_entry; @@ -522,7 +554,7 @@ static void split_blame(struct scoreboard *sb, add_blame_entry(sb, new_entry); } - if (1) { /* sanity */ + if (DEBUG) { /* sanity */ struct blame_entry *ent; int lno = sb->ent->lno, corrupt = 0; @@ -545,6 +577,14 @@ static void split_blame(struct scoreboard *sb, } } +static void decref_split(struct blame_entry *split) +{ + int i; + + for (i = 0; i < 3; i++) + origin_decref(split[i].suspect); +} + static void blame_overlap(struct scoreboard *sb, struct blame_entry *e, int tlno, int plno, int same, struct origin *parent) @@ -552,9 +592,9 @@ static void blame_overlap(struct scoreboard *sb, struct blame_entry *e, struct blame_entry split[3]; split_overlap(split, e, tlno, plno, same, parent); - if (!split[1].suspect) - return; - split_blame(sb, split, e); + if (split[1].suspect) + split_blame(sb, split, e); + decref_split(split); } static int find_last_in_target(struct scoreboard *sb, struct origin *target) @@ -636,22 +676,28 @@ static unsigned ent_score(struct scoreboard *sb, struct blame_entry *e) } static void copy_split_if_better(struct scoreboard *sb, - struct blame_entry best_so_far[3], - struct blame_entry this[3]) + struct blame_entry *best_so_far, + struct blame_entry *this) { + int i; + if (!this[1].suspect) return; if (best_so_far[1].suspect) { if (ent_score(sb, &this[1]) < ent_score(sb, &best_so_far[1])) return; } + + for (i = 0; i < 3; i++) + origin_incref(this[i].suspect); + decref_split(best_so_far); memcpy(best_so_far, this, sizeof(struct blame_entry [3])); } static void find_copy_in_blob(struct scoreboard *sb, struct blame_entry *ent, struct origin *parent, - struct blame_entry split[3], + struct blame_entry *split, mmfile_t *file_p) { const char *cp; @@ -687,6 +733,7 @@ static void find_copy_in_blob(struct scoreboard *sb, chunk->same + ent->s_lno, parent); copy_split_if_better(sb, split, this); + decref_split(this); } plno = chunk->p_next; tlno = chunk->t_next; @@ -723,6 +770,7 @@ static int find_move_in_parent(struct scoreboard *sb, if (split[1].suspect && blame_move_score < ent_score(sb, &split[1])) split_blame(sb, split, e); + decref_split(split); } free(blob_p); return 0; @@ -806,6 +854,7 @@ static int find_copy_in_parent(struct scoreboard *sb, this); } free(blob); + origin_decref(norigin); } diff_flush(&diff_opts); @@ -814,6 +863,7 @@ static int find_copy_in_parent(struct scoreboard *sb, if (split[1].suspect && blame_copy_score < ent_score(sb, &split[1])) split_blame(sb, split, blame_list[j].ent); + decref_split(split); } free(blame_list); @@ -843,9 +893,13 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { struct blame_entry *e; for (e = sb->ent; e; e = e->next) - if (e->suspect == origin) + if (e->suspect == origin) { + origin_incref(porigin); + origin_decref(e->suspect); e->suspect = porigin; - return; + } + origin_decref(porigin); + goto finish; } parent_origin[i] = porigin; } @@ -857,7 +911,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) if (!porigin) continue; if (pass_blame_to_parent(sb, origin, porigin)) - return; + goto finish; } /* @@ -871,7 +925,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) if (!porigin) continue; if (find_move_in_parent(sb, origin, porigin)) - return; + goto finish; } /* @@ -884,8 +938,12 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) struct origin *porigin = parent_origin[i]; if (find_copy_in_parent(sb, origin, parent->item, porigin, opt)) - return; + goto finish; } + + finish: + for (i = 0; i < MAXPARENT; i++) + origin_decref(parent_origin[i]); } static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) @@ -902,6 +960,7 @@ static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) if (!suspect) return; /* all done */ + origin_incref(suspect); commit = suspect->commit; parse_commit(commit); if (!(commit->object.flags & UNINTERESTING) && @@ -912,7 +971,7 @@ static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) for (ent = sb->ent; ent; ent = ent->next) if (!cmp_suspect(ent->suspect, suspect)) ent->guilty = 1; - + origin_decref(suspect); coalesce(sb); } } @@ -1132,7 +1191,9 @@ static void emit_other(struct scoreboard *sb, struct blame_entry *ent, int opt) ent->lno + 1 + cnt); else { if (opt & OUTPUT_SHOW_SCORE) - printf(" %*d", max_score_digits, ent->score); + printf(" %*d %02d", + max_score_digits, ent->score, + ent->suspect->refcnt); if (opt & OUTPUT_SHOW_NAME) printf(" %-*.*s", longest_file, longest_file, suspect->path); @@ -1273,6 +1334,45 @@ static void find_alignment(struct scoreboard *sb, int *option) max_score_digits = lineno_width(largest_score); } +static void sanity_check_refcnt(struct scoreboard *sb) +{ + int baa = 0; + struct blame_entry *ent; + + for (ent = sb->ent; ent; ent = ent->next) { + /* first mark the ones that haven't been checked */ + if (0 < ent->suspect->refcnt) + ent->suspect->refcnt = -ent->suspect->refcnt; + else if (!ent->suspect->refcnt) + baa = 1; + } + for (ent = sb->ent; ent; ent = ent->next) { + /* then pick each and see if they have the the + * correct refcnt + */ + int found; + struct blame_entry *e; + struct origin *suspect = ent->suspect; + + if (0 < suspect->refcnt) + continue; + suspect->refcnt = -suspect->refcnt; + for (found = 0, e = sb->ent; e; e = e->next) { + if (e->suspect != suspect) + continue; + found++; + } + if (suspect->refcnt != found) + baa = 1; + } + if (baa) { + int opt = 0160; + find_alignment(sb, &opt); + output(sb, opt); + die("Baa!"); + } +} + static int has_path_in_work_tree(const char *path) { struct stat st; -- cgit v0.10.2-6-g49f6 From 2c40f98439e07b8579d0549109a4feed54da9e40 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 29 Oct 2006 23:50:38 -0800 Subject: git-pickaxe: allow -Ln,m as well as -L n,m The command rejects -L1,10 as an invalid line range specifier and I got frustrated enough by it, so this makes it allow both forms of input. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index b53c7b0..200772c 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -1429,9 +1429,15 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) opt |= PICKAXE_BLAME_COPY | PICKAXE_BLAME_MOVE; blame_copy_score = parse_score(arg+2); } - else if (!strcmp("-L", arg) && ++i < argc) { + else if (!strncmp("-L", arg, 2)) { char *term; - arg = argv[i]; + if (!arg[2]) { + if (++i >= argc) + usage(pickaxe_usage); + arg = argv[i]; + } + else + arg += 2; if (bottom || top) die("More than one '-L n,m' option given"); bottom = strtol(arg, &term, 10); -- cgit v0.10.2-6-g49f6 From f5f75c652b9c2347522159a87297820103e593e4 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 29 Oct 2006 23:56:12 -0800 Subject: git-pickaxe: refcount origin correctly in find_copy_in_parent() This makes "git-pickaxe -C master -- revision.c" to finish with proper refcounts for all origins. I am reasonably happy with it. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 200772c..3e7277d 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -852,6 +852,7 @@ static int find_copy_in_parent(struct scoreboard *sb, this, &file_p); copy_split_if_better(sb, blame_list[j].split, this); + decref_split(this); } free(blob); origin_decref(norigin); -- cgit v0.10.2-6-g49f6 From ae86ad65757b857337f10f25ff31cb3348d3943d Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 30 Oct 2006 14:27:52 -0800 Subject: git-pickaxe: tighten sanity checks. When compiled for debugging, make sure that refcnt sanity check code detects underflows in origin reference counting. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 3e7277d..8286832 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -973,7 +973,9 @@ static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) if (!cmp_suspect(ent->suspect, suspect)) ent->guilty = 1; origin_decref(suspect); - coalesce(sb); + + if (DEBUG) /* sanity */ + sanity_check_refcnt(sb); } } @@ -1341,11 +1343,14 @@ static void sanity_check_refcnt(struct scoreboard *sb) struct blame_entry *ent; for (ent = sb->ent; ent; ent = ent->next) { - /* first mark the ones that haven't been checked */ + /* Nobody should have zero or negative refcnt */ + if (ent->suspect->refcnt <= 0) + baa = 1; + } + for (ent = sb->ent; ent; ent = ent->next) { + /* Mark the ones that haven't been checked */ if (0 < ent->suspect->refcnt) ent->suspect->refcnt = -ent->suspect->refcnt; - else if (!ent->suspect->refcnt) - baa = 1; } for (ent = sb->ent; ent; ent = ent->next) { /* then pick each and see if they have the the @@ -1357,7 +1362,7 @@ static void sanity_check_refcnt(struct scoreboard *sb) if (0 < suspect->refcnt) continue; - suspect->refcnt = -suspect->refcnt; + suspect->refcnt = -suspect->refcnt; /* Unmark */ for (found = 0, e = sb->ent; e; e = e->next) { if (e->suspect != suspect) continue; -- cgit v0.10.2-6-g49f6 From f69e743d97cdb3f6647ea0620a22311e7bf36051 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 30 Oct 2006 17:17:41 -0800 Subject: git-pickaxe: split find_origin() into find_rename() and find_origin(). When a merge adds a new file from the second parent, the earlier code tried to find renames in the first parent before noticing that the vertion from the second parent was added without modification. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 8286832..32dc393 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -211,7 +211,6 @@ static struct origin *find_origin(struct scoreboard *sb, { struct origin *porigin = NULL; struct diff_options diff_opts; - int i; const char *paths[2]; /* See if the origin->path is different between parent @@ -260,10 +259,17 @@ static struct origin *find_origin(struct scoreboard *sb, } } diff_flush(&diff_opts); - if (porigin) - return porigin; + return porigin; +} - /* Otherwise we would look for a rename */ +static struct origin *find_rename(struct scoreboard *sb, + struct commit *parent, + struct origin *origin) +{ + struct origin *porigin = NULL; + struct diff_options diff_opts; + int i; + const char *paths[2]; diff_setup(&diff_opts); diff_opts.recursive = 1; @@ -875,34 +881,46 @@ static int find_copy_in_parent(struct scoreboard *sb, static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) { - int i; + int i, pass; struct commit *commit = origin->commit; struct commit_list *parent; struct origin *parent_origin[MAXPARENT], *porigin; memset(parent_origin, 0, sizeof(parent_origin)); - for (i = 0, parent = commit->parents; - i < MAXPARENT && parent; - parent = parent->next, i++) { - struct commit *p = parent->item; - if (parse_commit(p)) - continue; - porigin = find_origin(sb, parent->item, origin); - if (!porigin) - continue; - if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { - struct blame_entry *e; - for (e = sb->ent; e; e = e->next) - if (e->suspect == origin) { - origin_incref(porigin); - origin_decref(e->suspect); - e->suspect = porigin; - } - origin_decref(porigin); - goto finish; + /* The first pass looks for unrenamed path to optimize for + * common cases, then we look for renames in the second pass. + */ + for (pass = 0; pass < 2; pass++) { + struct origin *(*find)(struct scoreboard *, + struct commit *, struct origin *); + find = pass ? find_rename : find_origin; + + for (i = 0, parent = commit->parents; + i < MAXPARENT && parent; + parent = parent->next, i++) { + struct commit *p = parent->item; + + if (parent_origin[i]) + continue; + if (parse_commit(p)) + continue; + porigin = find(sb, parent->item, origin); + if (!porigin) + continue; + if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { + struct blame_entry *e; + for (e = sb->ent; e; e = e->next) + if (e->suspect == origin) { + origin_incref(porigin); + origin_decref(e->suspect); + e->suspect = porigin; + } + origin_decref(porigin); + goto finish; + } + parent_origin[i] = porigin; } - parent_origin[i] = porigin; } for (i = 0, parent = commit->parents; -- cgit v0.10.2-6-g49f6 From 0d981c67d8bcf4610e165744059c1e9e14609baa Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 31 Oct 2006 01:00:01 -0800 Subject: git-pickaxe: cache one already found path per commit. Depending on how bushy the commit DAG is, this saves calls to the internal diff-tree for fork-point commits. For example, annotating Makefile in the kernel repository saves about a third of such diff-tree calls. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 32dc393..c9405e9 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -141,6 +141,8 @@ static int cmp_suspect(struct origin *a, struct origin *b) return strcmp(a->path, b->path); } +#define cmp_suspect(a, b) ( ((a)==(b)) ? 0 : cmp_suspect(a,b) ) + static void sanity_check_refcnt(struct scoreboard *); static void coalesce(struct scoreboard *sb) @@ -213,6 +215,12 @@ static struct origin *find_origin(struct scoreboard *sb, struct diff_options diff_opts; const char *paths[2]; + if (parent->util) { + struct origin *cached = parent->util; + if (!strcmp(cached->path, origin->path)) + return origin_incref(cached); + } + /* See if the origin->path is different between parent * and origin first. Most of the time they are the * same and diff-tree is fairly efficient about this. @@ -259,6 +267,12 @@ static struct origin *find_origin(struct scoreboard *sb, } } diff_flush(&diff_opts); + if (porigin) { + origin_incref(porigin); + if (parent->util) + origin_decref(parent->util); + parent->util = porigin; + } return porigin; } @@ -905,7 +919,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) continue; if (parse_commit(p)) continue; - porigin = find(sb, parent->item, origin); + porigin = find(sb, p, origin); if (!porigin) continue; if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { @@ -1371,8 +1385,10 @@ static void sanity_check_refcnt(struct scoreboard *sb) ent->suspect->refcnt = -ent->suspect->refcnt; } for (ent = sb->ent; ent; ent = ent->next) { - /* then pick each and see if they have the the - * correct refcnt + /* then pick each and see if they have the the correct + * refcnt; note that ->util caching means origin's refcnt + * may well be greater than the number of blame entries + * that use it. */ int found; struct blame_entry *e; @@ -1386,7 +1402,7 @@ static void sanity_check_refcnt(struct scoreboard *sb) continue; found++; } - if (suspect->refcnt != found) + if (suspect->refcnt < found) baa = 1; } if (baa) { -- cgit v0.10.2-6-g49f6 From 62476c8e331a22e224d87c18830913129f5f303b Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 31 Oct 2006 14:22:34 -0800 Subject: Introduce a new revision set operator ^! This is a shorthand for " --not ^@", i.e. "include this commit but exclude any of its parents". When a new file $F is introduced by revision $R, this notation can be used to find a copy-and-paste from existing file in the parents of that revision without annotating the ancestry of the lines that were copied from: git pickaxe -f -C $R^! -- $F Signed-off-by: Junio C Hamano diff --git a/Documentation/git-pickaxe.txt b/Documentation/git-pickaxe.txt index 6d22fd9..c08fdec 100644 --- a/Documentation/git-pickaxe.txt +++ b/Documentation/git-pickaxe.txt @@ -111,6 +111,44 @@ The contents of the actual line is output after the above header, prefixed by a TAB. This is to allow adding more header elements later. + +SPECIFIYING RANGES +------------------ + +Unlike `git-blame` and `git-annotate` in older git, the extent +of annotation can be limited to both line ranges and revision +ranges. When you are interested in finding the origin for +ll. 40-60 for file `foo`, you can use `-L` option like this: + + git pickaxe -L 40,60 foo + +When you are not interested in changes older than the version +v2.6.18, or changes older than 3 weeks, you can use revision +range specifiers similar to `git-rev-list`: + + git pickaxe v2.6.18.. -- foo + git pickaxe --since=3.weeks -- foo + +When revision range specifiers are used to limit the annotation, +lines that have not changed since the range boundary (either the +commit v2.6.18 or the most recent commit that is more than 3 +weeks old in the above example) are blamed for that range +boundary commit. + +A particularly useful way is to see if an added file have lines +created by copy-and-paste from existing files. Sometimes this +indicates that the developer was being sloppy and did not +refactor the code properly. You can first find the commit that +introduced the file with: + + git log --diff-filter=A --pretty=short -- foo + +and then annotate the change between the commit and its +parents, using `commit{caret}!` notation: + + git pickaxe -C -C -f $commit^! -- foo + + SEE ALSO -------- gitlink:git-blame[1] diff --git a/Documentation/git-rev-parse.txt b/Documentation/git-rev-parse.txt index 5d42570..cda80b1 100644 --- a/Documentation/git-rev-parse.txt +++ b/Documentation/git-rev-parse.txt @@ -222,14 +222,21 @@ of `r1` and `r2` and is defined as It it the set of commits that are reachable from either one of `r1` or `r2` but not from both. -Here are a few examples: +Two other shorthands for naming a set that is formed by a commit +and its parent commits exists. `r1{caret}@` notation means all +parents of `r1`. `r1{caret}!` includes commit `r1` but excludes +its all parents. + +Here are a handful examples: D A B D D F A B C D F - ^A G B D + ^A G B D ^A F B C F G...I C D F G I - ^B G I C D F G I + ^B G I C D F G I + F^@ A B C + F^! H D F H Author ------ diff --git a/revision.c b/revision.c index f1e0caa..b021d33 100644 --- a/revision.c +++ b/revision.c @@ -660,6 +660,13 @@ int handle_revision_arg(const char *arg, struct rev_info *revs, return 0; *dotdot = '^'; } + dotdot = strstr(arg, "^!"); + if (dotdot && !dotdot[2]) { + *dotdot = 0; + if (!add_parents_only(revs, arg, flags ^ UNINTERESTING)) + *dotdot = '^'; + } + local_flags = 0; if (*arg == '^') { local_flags = UNINTERESTING; -- cgit v0.10.2-6-g49f6 From 20239bae943733d766e6475cf0916966c868246b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 Nov 2006 02:22:49 -0500 Subject: git-pickaxe: work properly in a subdirectory. We forgot to add prefix to the given path. [jc: interestingly enough, Jeff King had the same idea after I pushed mine out to "pu", and his patch was cleaner, so I dropped mine.] Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index c9405e9..f6e861a 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -1428,6 +1428,13 @@ static unsigned parse_score(const char *arg) return score; } +static const char *add_prefix(const char *prefix, const char *path) +{ + if (!prefix || !prefix[0]) + return path; + return prefix_path(prefix, strlen(prefix), path); +} + int cmd_pickaxe(int argc, const char **argv, const char *prefix) { struct rev_info revs; @@ -1548,7 +1555,7 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) /* (1) */ if (argc <= i) usage(pickaxe_usage); - path = argv[i]; + path = add_prefix(prefix, argv[i]); if (i + 1 == argc - 1) { if (unk != 1) usage(pickaxe_usage); @@ -1566,13 +1573,13 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) if (seen_dashdash) { if (seen_dashdash + 1 != argc - 1) usage(pickaxe_usage); - path = argv[seen_dashdash + 1]; + path = add_prefix(prefix, argv[seen_dashdash + 1]); for (j = i; j < seen_dashdash; j++) argv[unk++] = argv[j]; } else { /* (3) */ - path = argv[i]; + path = add_prefix(prefix, argv[i]); if (i + 1 == argc - 1) { final_commit_name = argv[i + 1]; @@ -1580,7 +1587,7 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) * old-style */ if (unk == 1 && !has_path_in_work_tree(path)) { - path = argv[i + 1]; + path = add_prefix(prefix, argv[i + 1]); final_commit_name = argv[i]; } } -- cgit v0.10.2-6-g49f6 From 2f3f8b218abae6fc0574d0e22d3614fc0f5e983d Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 2 Nov 2006 00:02:11 -0800 Subject: git-pickaxe: rename detection optimization The idea is that we are interested in renaming into only one path, so we do not care about renames that happen elsewhere. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index f6e861a..97b3732 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -289,6 +289,7 @@ static struct origin *find_rename(struct scoreboard *sb, diff_opts.recursive = 1; diff_opts.detect_rename = DIFF_DETECT_RENAME; diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; + diff_opts.single_follow = origin->path; paths[0] = NULL; diff_tree_setup_paths(paths, &diff_opts); if (diff_setup_done(&diff_opts) < 0) diff --git a/diff.h b/diff.h index ce3058e..6fda92a 100644 --- a/diff.h +++ b/diff.h @@ -46,6 +46,7 @@ struct diff_options { const char *filter; const char *orderfile; const char *pickaxe; + const char *single_follow; unsigned recursive:1, tree_in_recursive:1, binary:1, diff --git a/diffcore-rename.c b/diffcore-rename.c index ef23901..57a74b6 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -256,11 +256,15 @@ void diffcore_rename(struct diff_options *options) for (i = 0; i < q->nr; i++) { struct diff_filepair *p = q->queue[i]; - if (!DIFF_FILE_VALID(p->one)) + if (!DIFF_FILE_VALID(p->one)) { if (!DIFF_FILE_VALID(p->two)) continue; /* unmerged */ + else if (options->single_follow && + strcmp(options->single_follow, p->two->path)) + continue; /* not interested */ else locate_rename_dst(p->two, 1); + } else if (!DIFF_FILE_VALID(p->two)) { /* If the source is a broken "delete", and * they did not really want to get broken, -- cgit v0.10.2-6-g49f6 From 0421d9f812acc819dcf9e9c6707618aca7e80bf4 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 4 Nov 2006 12:20:09 -0800 Subject: git-pickaxe: simplify Octopus merges further If more than one parents in an Octopus merge have the same origin, ignore later ones because it would not make any difference in the outcome. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 97b3732..082ff45 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -915,6 +915,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) i < MAXPARENT && parent; parent = parent->next, i++) { struct commit *p = parent->item; + int j, same; if (parent_origin[i]) continue; @@ -934,7 +935,16 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) origin_decref(porigin); goto finish; } - parent_origin[i] = porigin; + for (j = same = 0; j < i; j++) + if (!hashcmp(parent_origin[j]->blob_sha1, + porigin->blob_sha1)) { + same = 1; + break; + } + if (!same) + parent_origin[i] = porigin; + else + origin_decref(porigin); } } -- cgit v0.10.2-6-g49f6 From 650e2f6752ad29b0cba394535d3b098cf4bb102b Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 4 Nov 2006 12:37:02 -0800 Subject: git-pickaxe: re-scan the blob after making progress with -M Otherwise we would miss copied lines that are contained in the parts before or after the part that we find after splitting the blame_entry (i.e. split[0] and split[2]). Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 082ff45..29839cd 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -766,7 +766,7 @@ static int find_move_in_parent(struct scoreboard *sb, struct origin *target, struct origin *parent) { - int last_in_target; + int last_in_target, made_progress; struct blame_entry *e, split[3]; mmfile_t file_p; char type[10]; @@ -784,14 +784,20 @@ static int find_move_in_parent(struct scoreboard *sb, return 0; } - for (e = sb->ent; e; e = e->next) { - if (e->guilty || cmp_suspect(e->suspect, target)) - continue; - find_copy_in_blob(sb, e, parent, split, &file_p); - if (split[1].suspect && - blame_move_score < ent_score(sb, &split[1])) - split_blame(sb, split, e); - decref_split(split); + made_progress = 1; + while (made_progress) { + made_progress = 0; + for (e = sb->ent; e; e = e->next) { + if (e->guilty || cmp_suspect(e->suspect, target)) + continue; + find_copy_in_blob(sb, e, parent, split, &file_p); + if (split[1].suspect && + blame_move_score < ent_score(sb, &split[1])) { + split_blame(sb, split, e); + made_progress = 1; + } + decref_split(split); + } } free(blob_p); return 0; -- cgit v0.10.2-6-g49f6 From 334947843cbd50895f1dc58d8bd2f217e1f1ef77 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 4 Nov 2006 16:39:03 -0800 Subject: git-pickaxe: re-scan the blob after making progress with -C The reason to do this is the same as in the previous change for line copy detection within the same file (-M). Also this fixes -C and -C -C (aka find-copies-harder) logic; in this application we are not interested in the similarity matching diffcore-rename makes, because we are only interested in scanning files that were modified, or in the case of -C -C, scanning all files in the parent and we want to do that ourselves. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 29839cd..b25e039 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -803,6 +803,36 @@ static int find_move_in_parent(struct scoreboard *sb, return 0; } + +struct blame_list { + struct blame_entry *ent; + struct blame_entry split[3]; +}; + +static struct blame_list *setup_blame_list(struct scoreboard *sb, + struct origin *target, + int *num_ents_p) +{ + struct blame_entry *e; + int num_ents, i; + struct blame_list *blame_list = NULL; + + /* Count the number of entries the target is suspected for, + * and prepare a list of entry and the best split. + */ + for (e = sb->ent, num_ents = 0; e; e = e->next) + if (!e->guilty && !cmp_suspect(e->suspect, target)) + num_ents++; + if (num_ents) { + blame_list = xcalloc(num_ents, sizeof(struct blame_list)); + for (e = sb->ent, i = 0; e; e = e->next) + if (!e->guilty && !cmp_suspect(e->suspect, target)) + blame_list[i++].ent = e; + } + *num_ents_p = num_ents; + return blame_list; +} + static int find_copy_in_parent(struct scoreboard *sb, struct origin *target, struct commit *parent, @@ -811,91 +841,101 @@ static int find_copy_in_parent(struct scoreboard *sb, { struct diff_options diff_opts; const char *paths[1]; - struct blame_entry *e; int i, j; - struct blame_list { - struct blame_entry *ent; - struct blame_entry split[3]; - } *blame_list; + int retval; + struct blame_list *blame_list; int num_ents; - /* Count the number of entries the target is suspected for, - * and prepare a list of entry and the best split. - */ - for (e = sb->ent, num_ents = 0; e; e = e->next) - if (!e->guilty && !cmp_suspect(e->suspect, target)) - num_ents++; - if (!num_ents) + blame_list = setup_blame_list(sb, target, &num_ents); + if (!blame_list) return 1; /* nothing remains for this target */ - blame_list = xcalloc(num_ents, sizeof(struct blame_list)); - for (e = sb->ent, i = 0; e; e = e->next) - if (!e->guilty && !cmp_suspect(e->suspect, target)) - blame_list[i++].ent = e; - diff_setup(&diff_opts); diff_opts.recursive = 1; diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; - /* Try "find copies harder" on new path */ - if ((opt & PICKAXE_BLAME_COPY_HARDER) && - (!porigin || strcmp(target->path, porigin->path))) { - diff_opts.detect_rename = DIFF_DETECT_COPY; - diff_opts.find_copies_harder = 1; - } paths[0] = NULL; diff_tree_setup_paths(paths, &diff_opts); if (diff_setup_done(&diff_opts) < 0) die("diff-setup"); + + /* Try "find copies harder" on new path if requested; + * we do not want to use diffcore_rename() actually to + * match things up; find_copies_harder is set only to + * force diff_tree_sha1() to feed all filepairs to diff_queue, + * and this code needs to be after diff_setup_done(), which + * usually makes find-copies-harder imply copy detection. + */ + if ((opt & PICKAXE_BLAME_COPY_HARDER) && + (!porigin || strcmp(target->path, porigin->path))) + diff_opts.find_copies_harder = 1; + diff_tree_sha1(parent->tree->object.sha1, target->commit->tree->object.sha1, "", &diff_opts); - diffcore_std(&diff_opts); - for (i = 0; i < diff_queued_diff.nr; i++) { - struct diff_filepair *p = diff_queued_diff.queue[i]; - struct origin *norigin; - mmfile_t file_p; - char type[10]; - char *blob; - struct blame_entry this[3]; - - if (!DIFF_FILE_VALID(p->one)) - continue; /* does not exist in parent */ - if (porigin && !strcmp(p->one->path, porigin->path)) - /* find_move already dealt with this path */ - continue; + if (!diff_opts.find_copies_harder) + diffcore_std(&diff_opts); - norigin = get_origin(sb, parent, p->one->path); - hashcpy(norigin->blob_sha1, p->one->sha1); - blob = read_sha1_file(norigin->blob_sha1, type, - (unsigned long *) &file_p.size); - file_p.ptr = blob; - if (!file_p.ptr) - continue; + retval = 0; + while (1) { + int made_progress = 0; + + for (i = 0; i < diff_queued_diff.nr; i++) { + struct diff_filepair *p = diff_queued_diff.queue[i]; + struct origin *norigin; + mmfile_t file_p; + char type[10]; + char *blob; + struct blame_entry this[3]; + + if (!DIFF_FILE_VALID(p->one)) + continue; /* does not exist in parent */ + if (porigin && !strcmp(p->one->path, porigin->path)) + /* find_move already dealt with this path */ + continue; + + norigin = get_origin(sb, parent, p->one->path); + hashcpy(norigin->blob_sha1, p->one->sha1); + blob = read_sha1_file(norigin->blob_sha1, type, + (unsigned long *) &file_p.size); + file_p.ptr = blob; + if (!file_p.ptr) + continue; + + for (j = 0; j < num_ents; j++) { + find_copy_in_blob(sb, blame_list[j].ent, + norigin, this, &file_p); + copy_split_if_better(sb, blame_list[j].split, + this); + decref_split(this); + } + free(blob); + origin_decref(norigin); + } for (j = 0; j < num_ents; j++) { - find_copy_in_blob(sb, blame_list[j].ent, norigin, - this, &file_p); - copy_split_if_better(sb, blame_list[j].split, - this); - decref_split(this); + struct blame_entry *split = blame_list[j].split; + if (split[1].suspect && + blame_copy_score < ent_score(sb, &split[1])) { + split_blame(sb, split, blame_list[j].ent); + made_progress = 1; + } + decref_split(split); } - free(blob); - origin_decref(norigin); - } - diff_flush(&diff_opts); + free(blame_list); - for (j = 0; j < num_ents; j++) { - struct blame_entry *split = blame_list[j].split; - if (split[1].suspect && - blame_copy_score < ent_score(sb, &split[1])) - split_blame(sb, split, blame_list[j].ent); - decref_split(split); + if (!made_progress) + break; + blame_list = setup_blame_list(sb, target, &num_ents); + if (!blame_list) { + retval = 1; + break; + } } - free(blame_list); + diff_flush(&diff_opts); - return 0; + return retval; } #define MAXPARENT 16 @@ -942,7 +982,8 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) goto finish; } for (j = same = 0; j < i; j++) - if (!hashcmp(parent_origin[j]->blob_sha1, + if (parent_origin[j] && + !hashcmp(parent_origin[j]->blob_sha1, porigin->blob_sha1)) { same = 1; break; -- cgit v0.10.2-6-g49f6 From 854b97f6eb3bcbeb76b2705bfbffead1558bde77 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 4 Nov 2006 19:18:50 -0800 Subject: git-pickaxe: fix origin refcounting When we introduced the cached origin per commit, we gave up proper garbage collecting because it meant that commits hold onto their cached copy. There is no need to do so. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index b25e039..332e6a2 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -168,23 +168,28 @@ static void coalesce(struct scoreboard *sb) sanity_check_refcnt(sb); } +static struct origin *make_origin(struct commit *commit, const char *path) +{ + struct origin *o; + o = xcalloc(1, sizeof(*o) + strlen(path) + 1); + o->commit = commit; + o->refcnt = 1; + strcpy(o->path, path); + return o; +} + static struct origin *get_origin(struct scoreboard *sb, struct commit *commit, const char *path) { struct blame_entry *e; - struct origin *o; for (e = sb->ent; e; e = e->next) { if (e->suspect->commit == commit && !strcmp(e->suspect->path, path)) return origin_incref(e->suspect); } - o = xcalloc(1, sizeof(*o) + strlen(path) + 1); - o->commit = commit; - o->refcnt = 1; - strcpy(o->path, path); - return o; + return make_origin(commit, path); } static int fill_blob_sha1(struct origin *origin) @@ -216,9 +221,19 @@ static struct origin *find_origin(struct scoreboard *sb, const char *paths[2]; if (parent->util) { + /* This is a freestanding copy of origin and not + * refcounted. + */ struct origin *cached = parent->util; - if (!strcmp(cached->path, origin->path)) - return origin_incref(cached); + if (!strcmp(cached->path, origin->path)) { + porigin = get_origin(sb, parent, cached->path); + if (porigin->refcnt == 1) + hashcpy(porigin->blob_sha1, cached->blob_sha1); + return porigin; + } + /* otherwise it was not very useful; free it */ + free(parent->util); + parent->util = NULL; } /* See if the origin->path is different between parent @@ -268,10 +283,10 @@ static struct origin *find_origin(struct scoreboard *sb, } diff_flush(&diff_opts); if (porigin) { - origin_incref(porigin); - if (parent->util) - origin_decref(parent->util); - parent->util = porigin; + struct origin *cached; + cached = make_origin(porigin->commit, porigin->path); + hashcpy(cached->blob_sha1, porigin->blob_sha1); + parent->util = cached; } return porigin; } @@ -1434,8 +1449,13 @@ static void sanity_check_refcnt(struct scoreboard *sb) for (ent = sb->ent; ent; ent = ent->next) { /* Nobody should have zero or negative refcnt */ - if (ent->suspect->refcnt <= 0) + if (ent->suspect->refcnt <= 0) { + fprintf(stderr, "%s in %s has negative refcnt %d\n", + ent->suspect->path, + sha1_to_hex(ent->suspect->commit->object.sha1), + ent->suspect->refcnt); baa = 1; + } } for (ent = sb->ent; ent; ent = ent->next) { /* Mark the ones that haven't been checked */ @@ -1444,9 +1464,7 @@ static void sanity_check_refcnt(struct scoreboard *sb) } for (ent = sb->ent; ent; ent = ent->next) { /* then pick each and see if they have the the correct - * refcnt; note that ->util caching means origin's refcnt - * may well be greater than the number of blame entries - * that use it. + * refcnt. */ int found; struct blame_entry *e; @@ -1460,14 +1478,19 @@ static void sanity_check_refcnt(struct scoreboard *sb) continue; found++; } - if (suspect->refcnt < found) - baa = 1; + if (suspect->refcnt != found) { + fprintf(stderr, "%s in %s has refcnt %d, not %d\n", + ent->suspect->path, + sha1_to_hex(ent->suspect->commit->object.sha1), + ent->suspect->refcnt, found); + baa = 2; + } } if (baa) { int opt = 0160; find_alignment(sb, &opt); output(sb, opt); - die("Baa!"); + die("Baa %d!", baa); } } -- cgit v0.10.2-6-g49f6 From 2bc45477a53b9b7a5202a816b96c14428f683c38 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 5 Nov 2006 11:47:53 -0800 Subject: git-blame: add internal statistics to count read blobs. diff --git a/blame.c b/blame.c index 8cfd5d9..d0e506c 100644 --- a/blame.c +++ b/blame.c @@ -59,6 +59,7 @@ static void get_blob(struct commit *commit); static int num_get_patch; static int num_commits; static int patch_time; +static int num_read_blob; struct blame_diff_state { struct xdiff_emit_state xm; @@ -204,6 +205,7 @@ static void get_blob(struct commit *commit) return; info->buf = read_sha1_file(info->sha1, type, &info->size); + num_read_blob++; assert(!strcmp(type, blob_type)); } @@ -910,6 +912,7 @@ int main(int argc, const char **argv) } if (DEBUG) { + printf("num read blob: %d\n", num_read_blob); printf("num get patch: %d\n", num_get_patch); printf("num commits: %d\n", num_commits); printf("patch time: %f\n", patch_time / 1000000.0); -- cgit v0.10.2-6-g49f6 From c2e525d97f81bc178567cdf4dd7056ce6224eb58 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 5 Nov 2006 11:51:41 -0800 Subject: git-pickaxe: optimize by avoiding repeated read_sha1_file(). It turns out that pickaxe reads the same blob repeatedly while blame can reuse the blob already read for the parent when handling a child commit when it's parent's turn to pass its blame to the grandparent. Have a cache in the origin structure to keep the blob there, which will be garbage collected when the origin loses the last reference to it. Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 332e6a2..f12b2d4 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -40,6 +40,11 @@ static int max_score_digits; #define DEBUG 0 #endif +/* stats */ +static int num_read_blob; +static int num_get_patch; +static int num_commits; + #define PICKAXE_BLAME_MOVE 01 #define PICKAXE_BLAME_COPY 02 #define PICKAXE_BLAME_COPY_HARDER 04 @@ -63,10 +68,25 @@ static unsigned blame_copy_score; struct origin { int refcnt; struct commit *commit; + mmfile_t file; unsigned char blob_sha1[20]; char path[FLEX_ARRAY]; }; +static char *fill_origin_blob(struct origin *o, mmfile_t *file) +{ + if (!o->file.ptr) { + char type[10]; + num_read_blob++; + file->ptr = read_sha1_file(o->blob_sha1, type, + (unsigned long *)(&(file->size))); + o->file = *file; + } + else + *file = o->file; + return file->ptr; +} + static inline struct origin *origin_incref(struct origin *o) { if (o) @@ -77,6 +97,8 @@ static inline struct origin *origin_incref(struct origin *o) static void origin_decref(struct origin *o) { if (o && --o->refcnt <= 0) { + if (o->file.ptr) + free(o->file.ptr); memset(o, 0, sizeof(*o)); free(o); } @@ -431,25 +453,14 @@ static struct patch *compare_buffer(mmfile_t *file_p, mmfile_t *file_o, static struct patch *get_patch(struct origin *parent, struct origin *origin) { mmfile_t file_p, file_o; - char type[10]; - char *blob_p, *blob_o; struct patch *patch; - blob_p = read_sha1_file(parent->blob_sha1, type, - (unsigned long *) &file_p.size); - blob_o = read_sha1_file(origin->blob_sha1, type, - (unsigned long *) &file_o.size); - file_p.ptr = blob_p; - file_o.ptr = blob_o; - if (!file_p.ptr || !file_o.ptr) { - free(blob_p); - free(blob_o); + fill_origin_blob(parent, &file_p); + fill_origin_blob(origin, &file_o); + if (!file_p.ptr || !file_o.ptr) return NULL; - } - patch = compare_buffer(&file_p, &file_o, 0); - free(blob_p); - free(blob_o); + num_get_patch++; return patch; } @@ -784,20 +795,14 @@ static int find_move_in_parent(struct scoreboard *sb, int last_in_target, made_progress; struct blame_entry *e, split[3]; mmfile_t file_p; - char type[10]; - char *blob_p; last_in_target = find_last_in_target(sb, target); if (last_in_target < 0) return 1; /* nothing remains for this target */ - blob_p = read_sha1_file(parent->blob_sha1, type, - (unsigned long *) &file_p.size); - file_p.ptr = blob_p; - if (!file_p.ptr) { - free(blob_p); + fill_origin_blob(parent, &file_p); + if (!file_p.ptr) return 0; - } made_progress = 1; while (made_progress) { @@ -814,7 +819,6 @@ static int find_move_in_parent(struct scoreboard *sb, decref_split(split); } } - free(blob_p); return 0; } @@ -900,8 +904,6 @@ static int find_copy_in_parent(struct scoreboard *sb, struct diff_filepair *p = diff_queued_diff.queue[i]; struct origin *norigin; mmfile_t file_p; - char type[10]; - char *blob; struct blame_entry this[3]; if (!DIFF_FILE_VALID(p->one)) @@ -912,9 +914,7 @@ static int find_copy_in_parent(struct scoreboard *sb, norigin = get_origin(sb, parent, p->one->path); hashcpy(norigin->blob_sha1, p->one->sha1); - blob = read_sha1_file(norigin->blob_sha1, type, - (unsigned long *) &file_p.size); - file_p.ptr = blob; + fill_origin_blob(norigin, &file_p); if (!file_p.ptr) continue; @@ -925,7 +925,6 @@ static int find_copy_in_parent(struct scoreboard *sb, this); decref_split(this); } - free(blob); origin_decref(norigin); } @@ -953,6 +952,28 @@ static int find_copy_in_parent(struct scoreboard *sb, return retval; } +/* The blobs of origin and porigin exactly match, so everything + * origin is suspected for can be blamed on the parent. + */ +static void pass_whole_blame(struct scoreboard *sb, + struct origin *origin, struct origin *porigin) +{ + struct blame_entry *e; + + if (!porigin->file.ptr && origin->file.ptr) { + /* Steal its file */ + porigin->file = origin->file; + origin->file.ptr = NULL; + } + for (e = sb->ent; e; e = e->next) { + if (cmp_suspect(e->suspect, origin)) + continue; + origin_incref(porigin); + origin_decref(e->suspect); + e->suspect = porigin; + } +} + #define MAXPARENT 16 static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) @@ -986,13 +1007,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) if (!porigin) continue; if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { - struct blame_entry *e; - for (e = sb->ent; e; e = e->next) - if (e->suspect == origin) { - origin_incref(porigin); - origin_decref(e->suspect); - e->suspect = porigin; - } + pass_whole_blame(sb, origin, porigin); origin_decref(porigin); goto finish; } @@ -1010,6 +1025,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) } } + num_commits++; for (i = 0, parent = commit->parents; i < MAXPARENT && parent; parent = parent->next, i++) { @@ -1068,7 +1084,8 @@ static void assign_blame(struct scoreboard *sb, struct rev_info *revs, int opt) origin_incref(suspect); commit = suspect->commit; - parse_commit(commit); + if (!commit->object.parsed) + parse_commit(commit); if (!(commit->object.flags & UNINTERESTING) && !(revs->max_age != -1 && commit->date < revs->max_age)) pass_blame(sb, suspect, opt); @@ -1735,6 +1752,7 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) die("no such path %s in %s", path, final_commit_name); sb.final_buf = read_sha1_file(o->blob_sha1, type, &sb.final_buf_size); + num_read_blob++; lno = prepare_lines(&sb); if (bottom < 1) @@ -1772,5 +1790,11 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) free(ent); ent = e; } + + if (DEBUG) { + printf("num read blob: %d\n", num_read_blob); + printf("num get patch: %d\n", num_get_patch); + printf("num commits: %d\n", num_commits); + } return 0; } -- cgit v0.10.2-6-g49f6 From 931233bc666b40de039d50f82f12279c94e5e5f0 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 6 Nov 2006 17:08:32 -0800 Subject: git-pickaxe: -L /regexp/,/regexp/ With this change, you can specify the beginning and the ending line of the range you wish to inspect with pattern matching. For example, these are equivalent with the git.git sources: git pickaxe -L 7,21 v1.4.0 -- commit.c git pickaxe -L '/^struct sort_node/,/^}/' v1.4.0 -- commit.c git pickaxe -L '7,/^}/' v1.4.0 -- commit.c git pickaxe -L '/^struct sort_node/,21' v1.4.0 -- commit.c Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index f12b2d4..673185f 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -17,6 +17,7 @@ #include #include +#include static char pickaxe_usage[] = "git-pickaxe [-c] [-l] [-t] [-f] [-n] [-p] [-L n,m] [-S ] [-M] [-C] [-C] [commit] [--] file\n" @@ -1533,6 +1534,78 @@ static const char *add_prefix(const char *prefix, const char *path) return prefix_path(prefix, strlen(prefix), path); } +static const char *parse_loc(const char *spec, + struct scoreboard *sb, long lno, + long begin, long *ret) +{ + char *term; + const char *line; + long num; + int reg_error; + regex_t regexp; + regmatch_t match[1]; + + num = strtol(spec, &term, 10); + if (term != spec) { + *ret = num; + return term; + } + if (spec[0] != '/') + return spec; + + /* it could be a regexp of form /.../ */ + for (term = (char*) spec + 1; *term && *term != '/'; term++) { + if (*term == '\\') + term++; + } + if (*term != '/') + return spec; + + /* try [spec+1 .. term-1] as regexp */ + *term = 0; + begin--; /* input is in human terms */ + line = nth_line(sb, begin); + + if (!(reg_error = regcomp(®exp, spec + 1, REG_NEWLINE)) && + !(reg_error = regexec(®exp, line, 1, match, 0))) { + const char *cp = line + match[0].rm_so; + const char *nline; + + while (begin++ < lno) { + nline = nth_line(sb, begin); + if (line <= cp && cp < nline) + break; + line = nline; + } + *ret = begin; + regfree(®exp); + *term++ = '/'; + return term; + } + else { + char errbuf[1024]; + regerror(reg_error, ®exp, errbuf, 1024); + die("-L parameter '%s': %s", spec + 1, errbuf); + } +} + +static void prepare_blame_range(struct scoreboard *sb, + const char *bottomtop, + long lno, + long *bottom, long *top) +{ + const char *term; + + term = parse_loc(bottomtop, sb, lno, 1, bottom); + if (*term == ',') { + term = parse_loc(term + 1, sb, lno, *bottom + 1, top); + if (*term) + usage(pickaxe_usage); + } + if (*term) + usage(pickaxe_usage); +} + int cmd_pickaxe(int argc, const char **argv, const char *prefix) { struct rev_info revs; @@ -1546,11 +1619,11 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) const char *revs_file = NULL; const char *final_commit_name = NULL; char type[10]; + const char *bottomtop = NULL; save_commit_buffer = 0; opt = 0; - bottom = top = 0; seen_dashdash = 0; for (unk = i = 1; i < argc; i++) { const char *arg = argv[i]; @@ -1575,7 +1648,6 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) blame_copy_score = parse_score(arg+2); } else if (!strncmp("-L", arg, 2)) { - char *term; if (!arg[2]) { if (++i >= argc) usage(pickaxe_usage); @@ -1583,18 +1655,9 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) } else arg += 2; - if (bottom || top) + if (bottomtop) die("More than one '-L n,m' option given"); - bottom = strtol(arg, &term, 10); - if (*term == ',') { - top = strtol(term + 1, &term, 10); - if (*term) - usage(pickaxe_usage); - } - if (bottom && top && top < bottom) { - unsigned long tmp; - tmp = top; top = bottom; bottom = tmp; - } + bottomtop = arg; } else if (!strcmp("--score-debug", arg)) output_option |= OUTPUT_SHOW_SCORE; @@ -1755,6 +1818,13 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) num_read_blob++; lno = prepare_lines(&sb); + bottom = top = 0; + if (bottomtop) + prepare_blame_range(&sb, bottomtop, lno, &bottom, &top); + if (bottom && top && top < bottom) { + long tmp; + tmp = top; top = bottom; bottom = tmp; + } if (bottom < 1) bottom = 1; if (top < 1) -- cgit v0.10.2-6-g49f6 From 7bd9641df5b7cca91b21bfdc587962c59700786c Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 7 Nov 2006 16:20:02 -0800 Subject: git-pickaxe: allow "-L ,+N" With this, git pickaxe -L '/--progress/,+20' v1.4.0 -- pack-objects.c gives you 20 lines starting from the first occurrence of '--progress' in pack-objects, digging from v1.4.0 version. You can also say git pickaxe -L '/--progress/,-5' v1.4.0 -- pack-objects.c Signed-off-by: Junio C Hamano diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 673185f..64999f3 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -1545,6 +1545,25 @@ static const char *parse_loc(const char *spec, regex_t regexp; regmatch_t match[1]; + /* Allow "-L ,+20" to mean starting at + * for 20 lines, or "-L ,-5" for 5 lines ending at + * . + */ + if (1 < begin && (spec[0] == '+' || spec[0] == '-')) { + num = strtol(spec + 1, &term, 10); + if (term != spec + 1) { + if (spec[0] == '-') + num = 0 - num; + if (0 < num) + *ret = begin + num - 2; + else if (!num) + *ret = begin; + else + *ret = begin + num; + return term; + } + return spec; + } num = strtol(spec, &term, 10); if (term != spec) { *ret = num; -- cgit v0.10.2-6-g49f6