# # patch "ChangeLog" # from [c5b6af776442ffcda8b25b0b94a9785795da0817] # to [34847c5d937ac67773f07cab8b9d83cd45ab627c] # # patch "rcs_import.cc" # from [12affac881ab416406a9b81ec4700242bb5a0829] # to [0b7f304fb69c215b7555bca255db16bc89106489] # --- ChangeLog +++ ChangeLog @@ -1,3 +1,7 @@ +2005-06-19 graydon hoare + + * rcs_import.cc: Rewrite change set inference logic. + 2005-06-14 Richard Levitte * std_hooks.lua (get_preferred_merge2_command, --- rcs_import.cc +++ rcs_import.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,8 @@ #include "transforms.hh" #include "ui.hh" +int window = 3600 * 3; + using namespace std; using boost::shared_ptr; using boost::scoped_ptr; @@ -52,161 +55,85 @@ typedef unsigned long cvs_changelog; typedef unsigned long cvs_version; typedef unsigned long cvs_path; +typedef unsigned long cvs_tag; struct cvs_history; -struct -cvs_key +struct +cvs_commit { - cvs_key() {} - cvs_key(rcs_file const & r, - string const & version, - cvs_history & cvs); + cvs_commit(time_t t, + cvs_author a, + cvs_changelog c, + cvs_version v, + cvs_path p) + : time(t), + author(a), + changelog(c), + version(v), + path(p) + {} - inline bool similar_enough(cvs_key const & other) const - { - L(F("Checking similarity of %d and %d\n") % id % other.id); - if (changelog != other.changelog) - return false; - if (author != other.author) - return false; - if (labs(time - other.time) > constants::cvs_window) - return false; - for (map::const_iterator it = files.begin(); it!=files.end(); it++) - { - map::const_iterator otherit; - - L(F("checking %s %s\n") % it->first % it->second); - otherit = other.files.find(it->first); - if (otherit != other.files.end() && it->second!=otherit->second) - { - L(F("!similar_enough: %d/%d\n") % id % other.id); - return false; - } - else if (otherit != other.files.end()) - { - L(F("Same file, different version: %s and %s\n") % it->second % otherit->second); - } - } - L(F("similar_enough: %d/%d\n") % id % other.id); - return true; - } + cvs_commit(rcs_file const & r, + string const & rcs_version, + file_id const & ident, + cvs_history & cvs); - inline bool operator==(cvs_key const & other) const + time_t time; + bool alive; + cvs_author author; + cvs_changelog changelog; + cvs_version version; + cvs_path path; + + bool operator<(cvs_commit const & other) const { - L(F("Checking equality of %d and %d\n") % id % other.id); - return is_synthetic_branch_founding_commit == other.is_synthetic_branch_founding_commit && - branch == other.branch && - changelog == other.changelog && - author == other.author && - time == other.time; + return time < other.time; } - inline bool operator<(cvs_key const & other) const - { - // nb: this must sort as > to construct the edges in the right direction +}; - if (is_synthetic_branch_founding_commit) - { - I(!other.is_synthetic_branch_founding_commit); - return false; - } +struct +cvs_branch +{ + bool has_a_branchpoint; + bool has_a_commit; + time_t last_branchpoint; + time_t first_commit; - if (other.is_synthetic_branch_founding_commit) - { - I(!is_synthetic_branch_founding_commit); - return true; - } - - return time > other.time || + map live_at_beginning; + vector lineage; - (time == other.time - && author > other.author) || - - (time == other.time - && author == other.author - && changelog > other.changelog) || - - (time == other.time - && author == other.author - && changelog == other.changelog - && branch > other.branch); + void note_commit(time_t now) + { + has_a_commit = true; + if (now < first_commit) + first_commit = now; } - inline void add_file(file_path const &file, string const &version) + void note_branchpoint(time_t now) { - L(F("Adding file %s version %s to CVS key %d\n") % file % version % id); - files.insert( make_pair(file, version) ); + has_a_branchpoint = true; + if (now > last_branchpoint) + last_branchpoint = now; } - bool is_synthetic_branch_founding_commit; - cvs_branchname branch; - cvs_changelog changelog; - cvs_author author; - time_t time; - map files; // Maps file to version - int id; // Only used for debug output - - static int nextid; // Used to initialise id -}; - -int cvs_key::nextid = 0; - -struct -cvs_file_edge -{ - cvs_file_edge (file_id const & pv, - file_path const & pp, - bool pl, - file_id const & cv, - file_path const & cp, - bool cl, - cvs_history & cvs); - cvs_version parent_version; - cvs_path parent_path; - bool parent_live_p; - cvs_version child_version; - cvs_path child_path; - bool child_live_p; - inline bool operator<(cvs_file_edge const & other) const + time_t beginning() const { -#if 0 - return (parent_path < other.parent_path) - || ((parent_path == other.parent_path) - && ((parent_version < other.parent_version) - || ((parent_version == other.parent_version) - && ((parent_live_p < other.parent_live_p) - || ((parent_live_p == other.parent_live_p) - && ((child_path < other.child_path) - || ((child_path == other.child_path) - && ((child_version < other.child_version) - || ((child_version == other.child_version) - && (child_live_p < other.child_live_p) ))))))))); -#else - return (parent_path < other.parent_path) - || ((parent_path == other.parent_path) - && ((parent_version < other.parent_version) - || ((parent_version == other.parent_version) - && ((parent_live_p < other.parent_live_p) - || ((parent_live_p == other.parent_live_p) - && ((child_path < other.child_path) - || ((child_path == other.child_path) - && ((child_version < other.child_version) - || ((child_version == other.child_version) - && (child_live_p < other.child_live_p) ))))))))); -#endif + I(has_a_branchpoint || has_a_commit); + if (has_a_commit) + return first_commit; + else + return last_branchpoint; } + + void append_commit(cvs_commit const & c) + { + note_commit(c.time); + lineage.push_back(c); + } }; struct -cvs_state -{ - set in_edges; -}; - -typedef map > -cvs_branch; - -struct cvs_history { @@ -219,12 +146,6 @@ cycle_detector manifest_cycle_detector; - bool find_key_and_state(rcs_file const & r, - string const & version, - cvs_key & key, - shared_ptr & state); - - // assume admin has foo:X.Y.0.N in it, then // this multimap contains entries of the form // X.Y -> foo @@ -236,16 +157,14 @@ // branch name -> branch map > branches; + shared_ptr trunk; - // branch name -> whether there are any commits on the - // branch (as opposed to just branchpoints) - map branch_has_commit; - // stack of branches we're injecting states into stack< shared_ptr > stk; stack< cvs_branchname > bstk; file_path curr_file; + cvs_path curr_file_interned; string base_branch; @@ -262,22 +181,52 @@ enum note_type { note_branchpoint, note_branch_first_commit }; - void note_state_at_branch_beginning(rcs_file const & r, - string const & branchname, - string const & version, - file_id const & ident, - note_type nt); + void push_branch(string const & branch_name, bool private_branch); + void pop_branch(); +}; - void push_branch(string const & branch_name, bool private_branch); - void note_file_edge(rcs_file const & r, - string const & prev_rcs_version_num, - string const & next_rcs_version_num, - file_id const & prev_version, - file_id const & next_version); +cvs_commit::cvs_commit(rcs_file const & r, + string const & rcs_version, + file_id const & ident, + cvs_history & cvs) +{ + map >::const_iterator delta = + r.deltas.find(rcs_version); + I(delta != r.deltas.end()); + + map >::const_iterator deltatext = + r.deltatexts.find(rcs_version); + I(deltatext != r.deltatexts.end()); + + struct tm t; + // We need to initialize t to all zeros, because strptime has a habit of + // leaving bits of the data structure alone, letting garbage sneak into + // our output. + memset(&t, 0, sizeof(t)); + char const * dp = delta->second->date.c_str(); + L(F("Calculating time of %s\n") % dp); +#ifdef WIN32 + I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon), + &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6); + t.tm_mon--; + // Apparently some RCS files have 2 digit years, others four; tm always + // wants a 2 (or 3) digit year (years since 1900). + if (t.tm_year > 1900) + t.tm_year-=1900; +#else + if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL) + I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL); +#endif + time = mktime(&t); + L(F("= %i\n") % time); - void pop_branch(); -}; + alive = delta->second->state != "dead"; + changelog = cvs.changelog_interner.intern(deltatext->second->log); + author = cvs.author_interner.intern(delta->second->author); + path = cvs.curr_file_interned; + version = cvs.file_version_interner.intern(ident.inner()()); +} // piece table stuff @@ -505,6 +454,65 @@ rcs_put_raw_file_edge(next_id, curr_id, del, db); } + + +/* + +please read this exhaustingly long comment and understand it +before mucking with the branch inference logic. + +we are processing a file version. a branch might begin here. if +the current version is X.Y, then there is a branch B starting +here iff there is a symbol in the admin section called X.Y.0.Z, +where Z is the branch number (or if there is a private branch +called X.Y.Z, which is either an import branch or some private +RCS cruft). + +the version X.Y is then considered the branchpoint of B in the +current file. this does *not* mean that the CVS key -- an +abstraction representing whole-tree operations -- of X.Y is the +branchpoint across the CVS archive we're processing. + +in fact, CVS does not record the occurrence of a branching +action (tag -b). we have no idea who executed that command and +when. what we know instead is the commit X.Y immediately +preceeding the branch -- CVS consideres this the branchpoint -- +in this file's reduced view of history. we also know the first +commit X.Y.Z.1 inside the branch (which might not exist). + +our old strategy was to consider all branches nested in a +hierarchy, which was a super-tree of all the branch trees in all +the CVS files in a repository. this involved considering X.Y as +the parent version of branch X.Y.Z, an selecting "the" +branchpoint connecting the two as the least CVS key X.Y.Z.1 +committed inside the branch B. + +this was a mistake, for two significant reasons. + +first, some files do not *have* any commit inside the branch B, +only a branchpoint X.Y.0.Z. this branchpoint is actually the +last commit *before* the user branched, and could be a very old +commit, long before the branch was formed, so it is useless in +determining the branch structure. + +second, some files do not have a branch B, or worse, have +branched into B from an "ancestor" branch A, where a different +file branches into B from a different ancestor branch C. in +other words, while there *is* a tree structure within the X.Y.Z +branches of each file, there is *no* shared tree structure +between the branch names across a repository. in one file A can +be an ancestor of B, in another file B can be an ancestor of A. + +thus, we give up on establishing a hierarchy between branches +altogether. all branches exist in a flat namespace, and all are +direct descendents of the empty revision at the root of +history. each branchpoint symbol mentioned in the +administrative section of a file is considered the root of a new +lineage. + +*/ + + static void process_branch(string const & begin_version, vector< piece > const & begin_lines, @@ -525,10 +533,15 @@ while(! (r.deltas.find(curr_version) == r.deltas.end())) { L(F("version %s has %d lines\n") % curr_version % curr_lines->size()); + + cvs_commit curr_commit(r, curr_version, curr_id, cvs); + cvs.stk.top()->append_commit(curr_commit); + ++cvs.n_versions; string next_version = r.deltas.find(curr_version)->second->next; - if (!next_version.empty()) - { // construct this edge on our own branch + + if (! next_version.empty()) + { L(F("following RCS edge %s -> %s\n") % curr_version % next_version); construct_version(*curr_lines, next_version, *next_lines, r); @@ -537,100 +550,26 @@ insert_into_db(curr_data, curr_id, *next_lines, next_data, next_id, db); - - cvs.note_file_edge (r, curr_version, next_version, - file_id(curr_id), file_id(next_id)); } - else - { L(F("revision %s has no successor\n") % curr_version); - if (curr_version=="1.1") - { // mark this file as newly present since this commit - // (and as not present before) - - // perhaps this should get a member function of cvs_history ? - L(F("marking %s as not present in older manifests\n") % curr_version); - cvs_key k; - shared_ptr s; - cvs.find_key_and_state(r, curr_version, k, s); - I(r.deltas.find(curr_version) != r.deltas.end()); - bool live_p = r.deltas.find(curr_version)->second->state != "dead"; - s->in_edges.insert(cvs_file_edge(curr_id, cvs.curr_file, false, - curr_id, cvs.curr_file, live_p, - cvs)); - ++cvs.n_versions; - } - } - - /* - - please read this exhaustingly long comment and understand it - before mucking with the branch inference logic. - - we are processing a file version. a branch might begin here. if - the current version is X.Y, then there is a branch B starting - here iff there is a symbol in the admin section called X.Y.0.Z, - where Z is the branch number (or if there is a private branch - called X.Y.Z, which is either an import branch or some private - RCS cruft). - - the version X.Y is then considered the branchpoint of B in the - current file. this does *not* mean that the CVS key -- an - abstraction representing whole-tree operations -- of X.Y is the - branchpoint across the CVS archive we're processing. - - in fact, CVS does not record the occurrence of a branching - action (tag -b). we have no idea who executed that command and - when. what we know instead is the commit X.Y immediately - preceeding the branch -- CVS consideres this the branchpoint -- - in this file's reduced view of history. we also know the first - commit X.Y.Z.1 inside the branch (which might not exist). - - our old strategy was to consider all branches nested in a - hierarchy, which was a super-tree of all the branch trees in all - the CVS files in a repository. this involved considering X.Y as - the parent version of branch X.Y.Z, an selecting "the" - branchpoint connecting the two as the least CVS key X.Y.Z.1 - committed inside the branch B. - - this was a mistake, for two significant reasons. - - first, some files do not *have* any commit inside the branch B, - only a branchpoint X.Y.0.Z. this branchpoint is actually the - last commit *before* the user branched, and could be a very old - commit, long before the branch was formed, so it is useless in - determining the branch structure. - - second, some files do not have a branch B, or worse, have - branched into B from an "ancestor" branch A, where a different - file branches into B from a different ancestor branch C. in - other words, while there *is* a tree structure within the X.Y.Z - branches of each file, there is *no* shared tree structure - between the branch names across a repository. in one file A can - be an ancestor of B, in another file B can be an ancestor of A. - - thus, we give up on establishing a hierarchy between branches - altogether. all branches exist in a flat namespace, and all are - direct descendents of the empty revision at the root of - history. each branchpoint symbol mentioned in the - administrative section of a file is considered the root of a new - lineage. - - */ - + // mark the beginning-of-branch time and state of this file if + // we're at a branchpoint typedef multimap::const_iterator ity; pair range = cvs.branchpoints.equal_range(curr_version); if (range.first != cvs.branchpoints.end() - && range.first->first == curr_version) - { - for (ity branch = range.first; branch != range.second; ++branch) - { - cvs.note_state_at_branch_beginning(r, branch->second, - curr_version, - curr_id, - cvs_history::note_branchpoint); - } - } + && range.first->first == curr_version) + { + for (ity i = range.first; i != range.second; ++i) + { + shared_ptr b = cvs.stk.top(); + cvs.push_branch(i->second, false); + if (curr_commit.alive) + b->live_at_beginning[cvs.curr_file_interned] = curr_commit.version; + b->note_branchpoint(curr_commit.time); + cvs.pop_branch(); + } + } + // recursively follow any branch commits coming from the branchpoint boost::shared_ptr curr_delta = r.deltas.find(curr_version)->second; @@ -638,50 +577,41 @@ i != curr_delta->branches.end(); ++i) { string branch; + data branch_data; + hexenc branch_id; + vector< piece > branch_lines; bool priv = false; map::const_iterator be = cvs.branch_first_entries.find(*i); - + if (be != cvs.branch_first_entries.end()) branch = be->second; else priv = true; L(F("following RCS branch %s = '%s'\n") % (*i) % branch); - vector< piece > branch_lines; - construct_version(*curr_lines, *i, branch_lines, r); - - data branch_data; - hexenc branch_id; + + construct_version(*curr_lines, *i, branch_lines, r); insert_into_db(curr_data, curr_id, - branch_lines, branch_data, branch_id, db); - - // update the branch beginning time to reflect improved - // information, if there was a commit on the branch - if (!priv) - cvs.note_state_at_branch_beginning(r, branch, - *i, - branch_id, - cvs_history::note_branch_first_commit); - - cvs.push_branch (branch, priv); - cvs.note_file_edge (r, curr_version, *i, - file_id(curr_id), file_id(branch_id)); + branch_lines, branch_data, branch_id, db); + + cvs.push_branch(branch, priv); process_branch(*i, branch_lines, branch_data, branch_id, r, db, cvs); cvs.pop_branch(); L(F("finished RCS branch %s = '%s'\n") % (*i) % branch); } - + if (!r.deltas.find(curr_version)->second->next.empty()) - { // advance - curr_data = next_data; - curr_id = next_id; - curr_version = next_version; - swap(next_lines, curr_lines); - next_lines->clear(); - } + { + // advance + curr_data = next_data; + curr_id = next_id; + curr_version = next_version; + swap(next_lines, curr_lines); + next_lines->clear(); + } else break; - } + } } @@ -713,10 +643,10 @@ { // create the head state in case it is a loner - cvs_key k; - shared_ptr s; - L(F("noting head version %s : %s\n") % cvs.curr_file % r.admin.head); - cvs.find_key_and_state (r, r.admin.head, k, s); + // cvs_key k; + // shared_ptr s; + // L(F("noting head version %s : %s\n") % cvs.curr_file % r.admin.head); + // cvs.find_key_and_state (r, r.admin.head, k, s); } global_pieces.reset(); @@ -755,42 +685,7 @@ // CVS importing stuff follows -/* - we define a "cvs key" as a triple of author, commit time and - changelog. the equality of keys is a bit blurry due to a window of time - in which "the same" commit may begin and end. the window is evaluated - during the multimap walk though; for insertion in the multimap a true > - is used. a key identifies a particular commit. - - we reconstruct the history of a CVS archive by accumulating file edges - into archive nodes. each node is called a "cvs_state", but it is really a - collection of file *edges* leading into that archive state. we accumulate - file edges by walking up the trunk and down the branches of each RCS file. - - once we've got all the edges accumulated into archive nodes, we walk the - tree of cvs_states, up through the trunk and down through the branches, - carrying a manifest_map with us during the walk. for each edge, we - construct either the parent or child state of the edge (depending on - which way we're walking) and then calculate and write out a manifest - delta for the difference between the previous and current manifest map. we - also write out manifest certs, though the direction of ancestry changes - depending on whether we're going up the trunk or down the branches. - - */ - -cvs_file_edge::cvs_file_edge (file_id const & pv, file_path const & pp, bool pl, - file_id const & cv, file_path const & cp, bool cl, - cvs_history & cvs) : - parent_version(cvs.file_version_interner.intern(pv.inner()())), - parent_path(cvs.path_interner.intern(pp())), - parent_live_p(pl), - child_version(cvs.file_version_interner.intern(cv.inner()())), - child_path(cvs.path_interner.intern(cp())), - child_live_p(cl) -{ -} - static void split_version(string const & v, vector & vs) { @@ -814,49 +709,6 @@ } } -cvs_key::cvs_key(rcs_file const & r, string const & version, - cvs_history & cvs) : - is_synthetic_branch_founding_commit(false) -{ - map >::const_iterator delta = - r.deltas.find(version); - I(delta != r.deltas.end()); - - map >::const_iterator deltatext = - r.deltatexts.find(version); - I(deltatext != r.deltatexts.end()); - - { - struct tm t; - // We need to initialize t to all zeros, because strptime has a habit of - // leaving bits of the data structure alone, letting garbage sneak into - // our output. - memset(&t, 0, sizeof(t)); - char const * dp = delta->second->date.c_str(); - L(F("Calculating time of %s\n") % dp); -#ifdef WIN32 - I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon), - &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6); - t.tm_mon--; - // Apparently some RCS files have 2 digit years, others four; tm always - // wants a 2 (or 3) digit year (years since 1900). - if (t.tm_year > 1900) - t.tm_year-=1900; -#else - if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL) - I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL); -#endif - time=mktime(&t); - L(F("= %i\n") % time); - id = nextid++; - } - - branch = cvs.bstk.top(); - changelog = cvs.changelog_interner.intern(deltatext->second->log); - author = cvs.author_interner.intern(delta->second->author); -} - - cvs_history::cvs_history() : n_versions("versions", "v", 1), n_tree_branches("branches", "b", 1) @@ -879,6 +731,7 @@ && ss.substr(last_slash-5,6)=="Attic/") ss.erase(last_slash-5,6); curr_file = file_path(ss); + curr_file_interned = path_interner.intern(ss); } void cvs_history::index_branchpoint_symbols(rcs_file const & r) @@ -917,243 +770,37 @@ } } -void -cvs_history::note_state_at_branch_beginning(rcs_file const & r, - string const & branchname, - string const & version, - file_id const & ident, - note_type nt) -{ - // here we manufacture a single synthetic commit -- the "branch - // birth" commit -- representing the cumulative affect of all the - // tag -b operations the user once performed. it has a synthetic - // author ("cvs_import") and a synthetic log message ("beginning of - // branch foo"), and occurs at the time of the *last* branchpoint of - // any files which entered this branch. - // - // note that this does not establish a revision-ancestry - // relationship between the branchpoint and the branch. the branch - // is considered a child of the null revision, as far as monotone is - // concerned. - L(F("noting branchpoint for %s = %s\n") % branchname % version); - push_branch(branchname, false); - - cvs_key k; - shared_ptr s; - I(stk.size() > 0); - shared_ptr branch = stk.top(); - - string branch_birth_message = "beginning of branch " + branchname; - string branch_birth_author = "cvs_import"; - - cvs_changelog clog = changelog_interner.intern(branch_birth_message); - cvs_author auth = author_interner.intern(branch_birth_author); - - // note: SBFC is short for "synthetic branch-founding commit" - - if (branch->empty()) - { - I(nt == note_branchpoint); - find_key_and_state (r, version, k, s); - branch->erase(k); - k.changelog = clog; - k.author = auth; - k.add_file(curr_file, version); - k.is_synthetic_branch_founding_commit = true; - branch->insert(make_pair(k, s)); - L(F("added SBFC for %s at id %d, time %d\n") - % branchname % k.id % k.time); - } - else - { - cvs_key nk(r, version, *this); - - cvs_branch::iterator i = branch->end(); - i--; - I(i->first.is_synthetic_branch_founding_commit); - I(i->first.author == auth); - I(i->first.changelog == clog); - - k = i->first; - s = i->second; - - L(F("found existing SBFC at id %d, time %d\n") - % k.id % k.time); - if (nt == note_branchpoint - && nk.time > k.time - && branch_has_commit[branchname] == false) - { - L(F("moving SBFC for %s to later branchpoint at %d\n") - % branchname % nk.time); - branch->erase(i); - k.time = nk.time; - k.add_file(curr_file, version); - branch->insert(make_pair(k, s)); - } - else if (nt == note_branch_first_commit - && nk.time < k.time) - { - L(F("moving SBFC for %s to earlier branch commit at %d\n") - % branchname % nk.time); - branch->erase(i); - k.time = nk.time; - branch->insert(make_pair(k, s)); - branch_has_commit[branchname] = true; - } - - } - - if (nt == note_branchpoint) - { - map >::const_iterator del; - del = r.deltas.find(version); - I(del != r.deltas.end()); - bool alive = del->second->state != "dead"; - - s->in_edges.insert(cvs_file_edge(file_id(), curr_file, alive, - ident, curr_file, alive, - *this)); - } - - pop_branch(); -} - -bool -cvs_history::find_key_and_state(rcs_file const & r, - string const & version, - cvs_key & key, - shared_ptr & state) -{ - I(stk.size() > 0); - shared_ptr branch = stk.top(); - cvs_key nk(r, version, *this); - - nk.add_file(curr_file, version); - // key+(window/2) is in the future, key-(window/2) is in the past. the - // past is considered "greater than" the future in this map, so we take: - // - // - new, the lower bound of key+(window/2) in the map - // - old, the upper bound of key-(window/2) in the map - // - // and search all the nodes inside this section, from new to old bound. - - map< cvs_key, shared_ptr >::iterator i_new, i_old, i; - cvs_key k_new(nk), k_old(nk); - - if (static_cast(k_new.time + constants::cvs_window / 2) > k_new.time) - k_new.time += constants::cvs_window / 2; - - if (static_cast(k_old.time - constants::cvs_window / 2) < k_old.time) - k_old.time -= constants::cvs_window / 2; - - i_new = branch->lower_bound(k_new); - i_old = branch->upper_bound(k_old); - - for (i = i_new; i != i_old; ++i) - { - if (i->first.similar_enough(nk)) - { - key = i->first; - state = i->second; - branch->erase(i); - key.add_file(curr_file, version); - branch->insert(make_pair(key,state)); - return true; - } - } - key = nk; - state = shared_ptr(new cvs_state()); - branch->insert(make_pair(key, state)); - return false; -} - void cvs_history::push_branch(string const & branch_name, bool private_branch) -{ +{ shared_ptr branch; - + + string bname = base_branch + "." + branch_name; I(stk.size() > 0); - map >::const_iterator b = branches.find(branch_name); - if (b == branches.end()) - { - branch = shared_ptr(new cvs_branch()); - if (!private_branch) - branches.insert(make_pair(branch_name, branch)); - } - else - branch = b->second; - - stk.push(branch); - if (private_branch) - bstk.push(bstk.top()); - else - bstk.push(branch_interner.intern(base_branch + "." + branch_name)); -} - -void -cvs_history::note_file_edge(rcs_file const & r, - string const & prev_rcs_version_num, - string const & next_rcs_version_num, - file_id const & prev_version, - file_id const & next_version) -{ - - cvs_key k; - shared_ptr s; - - I(stk.size() > 0); - I(! curr_file().empty()); - - L(F("noting file edge %s -> %s\n") % prev_rcs_version_num % next_rcs_version_num); - - // we can't use operator[] since it is non-const - std::map >::const_iterator - prev_delta = r.deltas.find(prev_rcs_version_num), - next_delta = r.deltas.find(next_rcs_version_num); - I(prev_delta!=r.deltas.end()); - I(next_delta!=r.deltas.end()); - bool prev_alive = prev_delta->second->state!="dead"; - bool next_alive = next_delta->second->state!="dead"; - - L(F("note_file_edge %s %d -> %s %d\n") - % prev_rcs_version_num % prev_alive - % next_rcs_version_num % next_alive); - - // we always aggregate in-edges in children, but we will also create - // parents as we encounter them. - if (stk.size() == 1) { - // we are on the trunk, prev is child, next is parent. - L(F("noting trunk edge %s : %s -> %s\n") % curr_file - % next_rcs_version_num - % prev_rcs_version_num); - find_key_and_state (r, next_rcs_version_num, k, s); // just to create it if necessary - find_key_and_state (r, prev_rcs_version_num, k, s); - - L(F("trunk edge entering key state %d\n") % k.id); - s->in_edges.insert(cvs_file_edge(next_version, curr_file, next_alive, - prev_version, curr_file, prev_alive, - *this)); + stk.push(stk.top()); + bstk.push(bstk.top()); + return; } else - { - // we are on a branch, prev is parent, next is child. - L(F("noting branch edge %s : %s -> %s\n") % curr_file - % prev_rcs_version_num - % next_rcs_version_num); - find_key_and_state (r, next_rcs_version_num, k, s); - L(F("branch edge on %s entering key state %d\n") - % branch_interner.lookup(k.branch) % k.id); - s->in_edges.insert(cvs_file_edge(prev_version, curr_file, prev_alive, - next_version, curr_file, next_alive, - *this)); + { + map >::const_iterator b = branches.find(bname); + if (b == branches.end()) + { + branch = shared_ptr(new cvs_branch()); + branches.insert(make_pair(bname, branch)); + ++n_tree_branches; + } + else + branch = b->second; + + stk.push(branch); + bstk.push(branch_interner.intern(bname)); } - - ++n_versions; } void @@ -1190,35 +837,414 @@ }; -static void -store_manifest_edge(manifest_map const & parent, - manifest_map const & child, - manifest_id const & parent_mid, - manifest_id const & child_mid, - app_state & app, - cvs_history & cvs, - bool head_manifest_p) + + +// +// our task here is to produce a sequence of revision descriptions +// from the per-file commit records we have. we do this by rolling +// forwards through the temporally sorted file-commit list +// accumulating file-commits into revisions and flushing the +// revisions when we feel they are "complete". +// +// revisions have to have a time associated with them. this time +// will be the first time of any commit associated with the +// revision. they have an author and a changelog, which is shared +// by all the file-commits in the revision. +// +// there might be multiple revisions overlapping in time. this is +// legal wrt. CVS. we keep a set, and search all members of the set +// for the best match. +// +// consider this situation of overlapping revisions: +// +// +---------------+ +---------------+ +---------------+ +// | rev #1 @ 0011 | | rev #2 @ 0012 | | rev #3 @ 0013 | +// |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~| +// | patch foo.txt | | patch bar.txt | | patch baz.txt | +// +---------------+ +---------------+ +---------------+ +// +// suppose you have this situation and you run across a "patch +// bar.txt" commit at timestamp 0014. what do you do? +// +// - you know that rev #2 cannot accept this commit, simply because +// two commits on the same file makes *two* revisions, not one. +// +// - perhaps rev #3 could accept it; after all, it could be that the +// commit associated with rev #2 released its commit lock, and the +// commit associated with rev #3 quickly updated and committed at +// 0013, finishing off at 0014. +// +// - can rev #1 accept it? no. because CVS calcualted the version it +// expected to see in bar.txt before calling up the server, when +// committing rev #1. the version it expected to see was the version +// in bar.txt *before* time 0012; that is, before rev #2 had any affect +// on bar.txt. when it contacted the server, the commit associated +// with rev #1 would have aborted if it had seen any other number. +// so rev #1 could not start before an edit to bar.txt and then +// include its own edit to bar.txt. +// +// so we have only one case where bar.txt can be accepted. if the +// commit is not accepted into a legal rev (outside the window, +// wrong changelog/author) it starts a new revision. +// +// as we scan forwards, if we hit timestamps which lie beyond rev #n's +// window, we flush rev #n. +// +// if there are multiple coincident and legal revs to direct a +// commit to (all with the same author/changelog), we direct the +// commit to the rev with the closest initial timestamp. that is, +// the *latest* beginning time. + +struct +cvs_cluster { + time_t first_time; + cvs_author author; + cvs_changelog changelog; - L(F("storing manifest %s (base %s)\n") % parent_mid % child_mid); + cvs_cluster(time_t t, + cvs_author a, + cvs_changelog c) + : first_time(t), + author(a), + changelog(c) + {} - if (head_manifest_p) + struct entry + { + bool live; + cvs_version version; + time_t time; + entry(bool l, cvs_version v, time_t t) + : live(l), + version(v), + time(t) + {} + }; + + typedef map entry_map; + entry_map entries; +}; + + +struct +cluster_consumer +{ + cvs_history & cvs; + app_state & app; + string const & branchname; + cvs_branch const & branch; + map live_files; + ticker & n_revisions; + + struct prepared_revision + { + prepared_revision(revision_id i, + shared_ptr r, + cvs_cluster const & c); + revision_id rid; + shared_ptr rev; + time_t time; + cvs_author author; + cvs_changelog changelog; + }; + + vector preps; + + manifest_map parent_map, child_map; + manifest_id parent_mid, child_mid; + revision_id parent_rid, child_rid; + + cluster_consumer(cvs_history & cvs, + app_state & app, + string const & branchname, + cvs_branch const & branch, + ticker & n_revs); + + void consume_cluster(cvs_cluster const & c, + bool head_p); + void build_change_set(cvs_cluster const & c, + change_set & cs); + void store_manifest_edge(bool head_p); + void store_auxiliary_certs(prepared_revision const & p); + void store_revisions(); +}; + +typedef shared_ptr +cluster_ptr; + +struct +cluster_ptr_lt +{ + bool operator()(cluster_ptr const & a, + cluster_ptr const & b) const + { + return a->first_time < b->first_time; + } +}; + +typedef set +cluster_set; + +void +import_branch(cvs_history & cvs, + app_state & app, + string const & branchname, + shared_ptr const & branch, + ticker & n_revs) +{ + cluster_set clusters; + cluster_consumer cons(cvs, app, branchname, *branch, n_revs); + unsigned long commits_remaining = branch->lineage.size(); + + // step 1: sort the lineage + sort(branch->lineage.begin(), branch->lineage.end()); + + for (vector::const_iterator i = branch->lineage.begin(); + i != branch->lineage.end(); ++i) + { + commits_remaining--; + + L(F("examining next commit [t:%d] [a:%d] [c:%d]\n") + % i->time % i->author % i->changelog); + + // step 2: expire all clusters from the beginning of the set which + // have passed the window size + while (!clusters.empty()) + { + cluster_set::const_iterator j = clusters.begin(); + if ((*j)->first_time + window < i->time) + { + L(F("expiring cluster\n")); + cons.consume_cluster(**j, false); + clusters.erase(j); + } + else + break; + } + + // step 3: find the last still-live cluster to have touched this + // file + time_t last_modify_time = 0; + for (cluster_set::const_iterator j = clusters.begin(); + j != clusters.end(); ++j) + { + cvs_cluster::entry_map::const_iterator k = (*j)->entries.find(i->path); + if (k != (*j)->entries.end() && + k->second.time > last_modify_time) + last_modify_time = k->second.time; + } + L(F("last modification time is %d\n") % last_modify_time); + + // step 4: find a cluster which starts after the + // last_modify_time, which doesn't modify the file in question, + // and which contains the same author and changelog as our + // commit + cluster_ptr target; + for (cluster_set::const_iterator j = clusters.begin(); + j != clusters.end(); ++j) + { + if (((*j)->first_time > last_modify_time) + && ((*j)->author == i->author) + && ((*j)->changelog == i->changelog) + && ((*j)->entries.find(i->path) == (*j)->entries.end())) + { + L(F("picked existing cluster target\n")); + target = (*j); + } + } + + // if we're still not finding an active cluster, + // this is probably the first commit in it. make + // a new one. + if (!target) + { + L(F("building new cluster [t:%d] [a:%d] [c:%d]\n") + % i->time % i->author % i->changelog); + target = cluster_ptr(new cvs_cluster(i->time, + i->author, + i->changelog)); + clusters.insert(target); + } + + I(target); + target->entries.insert(make_pair(i->path, + cvs_cluster::entry(i->alive, + i->version, + i->time))); + } + + + // now we are done this lineage; flush all remaining clusters + L(F("finished branch commits, writing all pending clusters\n")); + while (!clusters.empty()) { + cons.consume_cluster(**clusters.begin(), clusters.size() == 1); + clusters.erase(clusters.begin()); + } + L(F("finished writing pending clusters\n")); + + cons.store_revisions(); + +} + + +void +import_cvs_repo(fs::path const & cvsroot, + app_state & app) +{ + N(!fs::exists(cvsroot / "CVSROOT"), + F("%s appears to be a CVS repository root directory\n" + "try importing a module instead, with 'cvs_import %s/") + % cvsroot.native_directory_string() % cvsroot.native_directory_string()); + + { + // early short-circuit to avoid failure after lots of work + rsa_keypair_id key; + N(guess_default_key(key,app), + F("no unique private key for cert construction")); + require_password(key, app); + } + + cvs_history cvs; + N(app.branch_name() != "", F("need base --branch argument for importing")); + cvs.base_branch = app.branch_name(); + + // push the trunk + cvs.trunk = shared_ptr(new cvs_branch()); + cvs.stk.push(cvs.trunk); + cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch)); + + { + transaction_guard guard(app.db); + cvs_tree_walker walker(cvs, app.db); + N( fs::exists(cvsroot), + F("path %s does not exist") % cvsroot.string()); + N( fs::is_directory(cvsroot), + F("path %s is not a directory") % cvsroot.string()); + app.db.ensure_open(); + N(chdir(cvsroot.native_directory_string().c_str()) == 0, + F("could not change directory to %s") % cvsroot.string()); + walk_tree(walker); + guard.commit(); + } + + I(cvs.stk.size() == 1); + + ticker n_revs("finished revisions", "r", 1); + + for (map >::const_iterator i = cvs.branches.begin(); + i != cvs.branches.end(); ++i) + { + L(F("branch %s has %d entries\n") % i->first % i->second->lineage.size()); + import_branch(cvs, app, i->first, i->second, n_revs); + } + + L(F("trunk has %d entries\n") % cvs.trunk->lineage.size()); + import_branch(cvs, app, cvs.base_branch, cvs.trunk, n_revs); + + return; + +} + +cluster_consumer::cluster_consumer(cvs_history & cvs, + app_state & app, + string const & branchname, + cvs_branch const & branch, + ticker & n_revs) + : cvs(cvs), + app(app), + branchname(branchname), + branch(branch), + n_revisions(n_revs) +{ + if (!branch.live_at_beginning.empty()) + { + cvs_author synthetic_author = + cvs.author_interner.intern("cvs_import"); + + cvs_changelog synthetic_cl = + cvs.changelog_interner.intern("beginning of branch " + + branchname); + + time_t synthetic_time = branch.beginning(); + cvs_cluster initial_cluster(synthetic_time, + synthetic_author, + synthetic_cl); + + L(F("initial cluster on branch %s has %d live entries\n") % + branchname % branch.live_at_beginning.size()); + + for (map::const_iterator i = branch.live_at_beginning.begin(); + i != branch.live_at_beginning.end(); ++i) + { + cvs_cluster::entry e(true, i->second, synthetic_time); + L(F("initial cluster contains %s at %s\n") % + cvs.path_interner.lookup(i->first) % + cvs.file_version_interner.lookup(i->second)); + initial_cluster.entries.insert(make_pair(i->first, e)); + } + consume_cluster(initial_cluster, false); + } +} + +cluster_consumer::prepared_revision::prepared_revision(revision_id i, + shared_ptr r, + cvs_cluster const & c) + : rid(i), + rev(r), + time(c.first_time), + author(c.author), + changelog(c.changelog) +{ +} + + +void +cluster_consumer::store_revisions() +{ + for (vector::const_iterator i = preps.begin(); + i != preps.end(); ++i) + { + if (! app.db.revision_exists(i->rid)) + { + data tmp; + write_revision_set(*(i->rev), tmp); + /* + cout << "+++WRITING REVISION" << endl; + cout << tmp << endl; + cout << "---WRITING REVISION" << endl; + */ + app.db.put_revision(i->rid, *(i->rev)); + store_auxiliary_certs(*i); + ++n_revisions; + } + } +} + +void +cluster_consumer::store_manifest_edge(bool head_p) +{ + L(F("storing manifest '%s' (base %s)\n") % parent_mid % child_mid); + + if (head_p) + { L(F("storing head %s\n") % child_mid); // a branch has one very important manifest: the head. this is // the "newest" of all manifests within the branch (including - // the trunk), and we store it in its entirety. + // the trunk), and we store it in its entirety, before the + // cluster consumer is destroyed. if (! app.db.manifest_version_exists(child_mid)) { manifest_data mdat; - write_manifest_map(child, mdat); + write_manifest_map(child_map, mdat); app.db.put_manifest(child_mid, mdat); } } if (null_id(parent_mid)) { - L(F("skipping null manifest\n")); + L(F("skipping delta to null manifest\n")); return; } @@ -1238,7 +1264,7 @@ { L(F("writing full manifest %s\n") % parent_mid); manifest_data mdat; - write_manifest_map(parent, mdat); + write_manifest_map(parent_map, mdat); app.db.put_manifest(parent_mid, mdat); } return; @@ -1256,214 +1282,84 @@ // run from child (new) -> parent (old). delta del; - diff(child, parent, del); + diff(child_map, parent_map, del); rcs_put_raw_manifest_edge(parent_mid.inner(), child_mid.inner(), del, app.db); } - -static void -store_auxiliary_certs(cvs_key const & key, - revision_id const & id, - app_state & app, - cvs_history const & cvs) +void +cluster_consumer::store_auxiliary_certs(prepared_revision const & p) { packet_db_writer dbw(app); - cert_revision_in_branch(id, cert_value(cvs.branch_interner.lookup(key.branch)), app, dbw); - cert_revision_author(id, cvs.author_interner.lookup(key.author), app, dbw); - cert_revision_changelog(id, cvs.changelog_interner.lookup(key.changelog), app, dbw); - cert_revision_date_time(id, key.time, app, dbw); + cert_revision_in_branch(p.rid, cert_value(branchname), app, dbw); + cert_revision_author(p.rid, cvs.author_interner.lookup(p.author), app, dbw); + cert_revision_changelog(p.rid, cvs.changelog_interner.lookup(p.changelog), app, dbw); + cert_revision_date_time(p.rid, p.time, app, dbw); } -static void -build_change_set(shared_ptr state, - manifest_map const & state_map, - cvs_history & cvs, - change_set & cs) +void +cluster_consumer::build_change_set(cvs_cluster const & c, + change_set & cs) { - change_set empty; - cs = empty; - - for (set::const_iterator f = state->in_edges.begin(); - f != state->in_edges.end(); ++f) + for (cvs_cluster::entry_map::const_iterator i = c.entries.begin(); + i != c.entries.end(); ++i) { - file_id fid(cvs.file_version_interner.lookup(f->child_version)); - file_path pth(cvs.path_interner.lookup(f->child_path)); - if (!f->child_live_p) - { - if (f->parent_live_p) + file_path pth(cvs.path_interner.lookup(i->first)); + file_id fid(cvs.file_version_interner.lookup(i->second.version)); + if (i->second.live) + { + map::const_iterator e = live_files.find(i->first); + if (e == live_files.end()) { - L(F("deleting entry state '%s' on '%s'\n") % fid % pth); - cs.delete_file(pth); + L(F("adding entry state '%s' on '%s'\n") % fid % pth); + cs.add_file(pth, fid); + live_files[i->first] = i->second.version; } - else + else if (e->second != i->second.version) { - // it can actually happen that we have a file that went from - // dead to dead. when a file is created on a branch, cvs first - // _commits a deleted file_ on mainline, and then branches from - // it and resurrects it. In such cases, we should just ignore - // the file, it doesn't actually exist. So, in this block, we - // do nothing. + file_id old_fid(cvs.file_version_interner.lookup(e->second)); + L(F("applying state delta on '%s' : '%s' -> '%s'\n") + % pth % old_fid % fid); + cs.apply_delta(pth, old_fid, fid); + live_files[i->first] = i->second.version; } } - else + else { - manifest_map::const_iterator i = state_map.find(pth); - if (i == state_map.end()) + map::const_iterator e = live_files.find(i->first); + if (e != live_files.end()) { - L(F("adding entry state '%s' on '%s'\n") % fid % pth); - cs.add_file(pth, fid); + L(F("deleting entry state '%s' on '%s'\n") % fid % pth); + cs.delete_file(pth); + live_files.erase(i->first); } - else if (manifest_entry_id(i) == fid) - { - L(F("skipping preserved entry state '%s' on '%s'\n") - % fid % pth); - } - else - { - L(F("applying state delta on '%s' : '%s' -> '%s'\n") - % pth % manifest_entry_id(i) % fid); - cs.apply_delta(pth, manifest_entry_id(i), fid); - } - } + } } - L(F("logical changeset from parent -> child has %d file state changes\n") - % state->in_edges.size()); } -static void -import_branch_states(ticker & n_edges, - cvs_branch & branch, - cvs_history & cvs, - app_state & app, - vector< pair > & revisions) -{ - manifest_map parent_map, child_map; - manifest_id parent_mid, child_mid; - revision_id parent_rid, child_rid; - - // we look through the branch temporally *backwards* from oldest to - // newest +void +cluster_consumer::consume_cluster(cvs_cluster const & c, + bool head_p) +{ + shared_ptr rev(new revision_set()); + boost::shared_ptr cs(new change_set()); + build_change_set(c, *cs); - for (cvs_branch::reverse_iterator i = branch.rbegin(); - i != branch.rend(); ++i) - { - L(F("importing branch %s, state [%d: %s @ %d]\n") - % cvs.branch_interner.lookup(i->first.branch) - % i->first.id - % cvs.author_interner.lookup(i->first.author) - % i->first.time); + apply_change_set(*cs, child_map); + calculate_ident(child_map, child_mid); - revision_set rev; - boost::shared_ptr cs(new change_set()); - build_change_set(i->second, parent_map, cvs, *cs); + rev->new_manifest = child_mid; + rev->edges.insert(make_pair(parent_rid, make_pair(parent_mid, cs))); + calculate_ident(*rev, child_rid); - apply_change_set(*cs, child_map); - calculate_ident(child_map, child_mid); + store_manifest_edge(head_p); - rev.new_manifest = child_mid; - rev.edges.insert(make_pair(parent_rid, make_pair(parent_mid, cs))); - calculate_ident(rev, child_rid); + L(F("consumed cluster %s (parent '%s')\n") % child_rid % rev->edges.begin()->first); + preps.push_back(prepared_revision(child_rid, rev, c)); - revisions.push_back(make_pair(i->first, rev)); - - store_manifest_edge(parent_map, child_map, - parent_mid, child_mid, - app, cvs, i->first == branch.begin()->first); - - // now apply same change set to parent_map, making parent_map == child_map - apply_change_set(*cs, parent_map); - parent_mid = child_mid; - parent_rid = child_rid; - ++n_edges; - } + // now apply same change set to parent_map, making parent_map == child_map + apply_change_set(*cs, parent_map); + parent_mid = child_mid; + parent_rid = child_rid; } - -void -import_cvs_repo(fs::path const & cvsroot, - app_state & app) -{ - N(!fs::exists(cvsroot / "CVSROOT"), - F("%s appears to be a CVS repository root directory\n" - "try importing a module instead, with 'cvs_import %s/") - % cvsroot.native_directory_string() % cvsroot.native_directory_string()); - - { - // early short-circuit to avoid failure after lots of work - rsa_keypair_id key; - N(guess_default_key(key,app), - F("no unique private key for cert construction")); - require_password(key, app); - } - - cvs_history cvs; - N(app.branch_name() != "", F("need base --branch argument for importing")); - cvs.base_branch = app.branch_name(); - - // push the trunk - cvs.stk.push(shared_ptr(new cvs_branch())); - cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch)); - - { - transaction_guard guard(app.db); - cvs_tree_walker walker(cvs, app.db); - N( fs::exists(cvsroot), - F("path %s does not exist") % cvsroot.string()); - N( fs::is_directory(cvsroot), - F("path %s is not a directory") % cvsroot.string()); - app.db.ensure_open(); - N(chdir(cvsroot.native_directory_string().c_str()) == 0, - F("could not change directory to %s") % cvsroot.string()); - walk_tree(walker); - guard.commit(); - } - - P(F("phase 1 (version import) complete\n")); - - I(cvs.stk.size() == 1); - - vector< pair > revisions; - { - ticker n_branches("finished branches", "b", 1); - ticker n_edges("finished edges", "e", 1); - transaction_guard guard(app.db); - manifest_map root_manifest; - manifest_id root_mid; - revision_id root_rid; - - - ui.set_tick_trailer("building trunk"); - import_branch_states(n_edges, *cvs.stk.top(), cvs, app, revisions); - - for(map >::const_iterator branch = cvs.branches.begin(); - branch != cvs.branches.end(); ++branch) - { - ui.set_tick_trailer("building branch " + branch->first); - ++n_branches; - import_branch_states(n_edges, *(branch->second), cvs, app, revisions); - } - - P(F("phase 2 (ancestry reconstruction) complete\n")); - guard.commit(); - } - - { - ticker n_revisions("written revisions", "r", 1); - ui.set_tick_trailer(""); - transaction_guard guard(app.db); - for (vector< pair >::const_iterator - i = revisions.begin(); i != revisions.end(); ++i) - { - revision_id rid; - calculate_ident(i->second, rid); - if (! app.db.revision_exists(rid)) - app.db.put_revision(rid, i->second); - store_auxiliary_certs(i->first, rid, app, cvs); - ++n_revisions; - } - P(F("phase 3 (writing revisions) complete\n")); - guard.commit(); - } -} -