# 
# patch "ChangeLog"
#  from [c5b6af776442ffcda8b25b0b94a9785795da0817]
#    to [34847c5d937ac67773f07cab8b9d83cd45ab627c]
# 
# patch "rcs_import.cc"
#  from [12affac881ab416406a9b81ec4700242bb5a0829]
#    to [0b7f304fb69c215b7555bca255db16bc89106489]
# 
--- ChangeLog
+++ ChangeLog
@@ -1,3 +1,7 @@
+2005-06-19  graydon hoare  <address@hidden>
+
+	* rcs_import.cc: Rewrite change set inference logic.
+
 2005-06-14  Richard Levitte  <address@hidden>
 
 	* std_hooks.lua (get_preferred_merge2_command,
--- rcs_import.cc
+++ rcs_import.cc
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <iostream>
 #include <iterator>
+#include <list>
 #include <map>
 #include <set>
 #include <sstream>
@@ -41,6 +42,8 @@
 #include "transforms.hh"
 #include "ui.hh"
 
+int window = 3600 * 3;
+
 using namespace std;
 using boost::shared_ptr;
 using boost::scoped_ptr;
@@ -52,161 +55,85 @@
 typedef unsigned long cvs_changelog;
 typedef unsigned long cvs_version;
 typedef unsigned long cvs_path;
+typedef unsigned long cvs_tag;
 
 struct cvs_history;
 
-struct 
-cvs_key
+struct
+cvs_commit
 {
-  cvs_key() {}
-  cvs_key(rcs_file const & r, 
-          string const & version, 
-          cvs_history & cvs);
+  cvs_commit(time_t t,
+             cvs_author a,
+             cvs_changelog c,
+             cvs_version v,
+             cvs_path p)
+    : time(t),
+      author(a),
+      changelog(c),
+      version(v),
+      path(p)
+  {}
 
-  inline bool similar_enough(cvs_key const & other) const
-  {
-    L(F("Checking similarity of %d and %d\n") % id % other.id);
-    if (changelog != other.changelog)
-      return false;
-    if (author != other.author)
-      return false;
-    if (labs(time - other.time) > constants::cvs_window)
-      return false;
-    for (map<file_path,string>::const_iterator it = files.begin(); it!=files.end(); it++)
-      {
-        map<file_path,string>::const_iterator otherit;
-        
-        L(F("checking %s %s\n") % it->first % it->second);
-        otherit = other.files.find(it->first);
-        if (otherit != other.files.end() && it->second!=otherit->second)
-          {
-            L(F("!similar_enough: %d/%d\n") % id % other.id);
-            return false;
-          }
-        else if (otherit != other.files.end())
-          {
-            L(F("Same file, different version: %s and %s\n") % it->second % otherit->second);
-          }
-      }
-    L(F("similar_enough: %d/%d\n") % id % other.id);
-    return true;
-  }
+  cvs_commit(rcs_file const & r, 
+             string const & rcs_version,
+             file_id const & ident,
+             cvs_history & cvs);
 
-  inline bool operator==(cvs_key const & other) const
+  time_t time;
+  bool alive;
+  cvs_author author;
+  cvs_changelog changelog;
+  cvs_version version;
+  cvs_path path;
+
+  bool operator<(cvs_commit const & other) const 
   {
-    L(F("Checking equality of %d and %d\n") % id % other.id);
-    return is_synthetic_branch_founding_commit == other.is_synthetic_branch_founding_commit &&
-      branch == other.branch &&
-      changelog == other.changelog &&
-      author == other.author &&
-      time == other.time;
+    return time < other.time;
   }
-  inline bool operator<(cvs_key const & other) const
-  {
-    // nb: this must sort as > to construct the edges in the right direction
+};
 
-    if (is_synthetic_branch_founding_commit)
-      {
-        I(!other.is_synthetic_branch_founding_commit);
-        return false;
-      }
+struct
+cvs_branch
+{
+  bool has_a_branchpoint;
+  bool has_a_commit;
+  time_t last_branchpoint;
+  time_t first_commit;
 
-    if (other.is_synthetic_branch_founding_commit)
-      {
-        I(!is_synthetic_branch_founding_commit);
-        return true;
-      }
-    
-    return time > other.time ||
+  map<cvs_path, cvs_version> live_at_beginning;
+  vector<cvs_commit> lineage;
 
-      (time == other.time 
-       && author > other.author) ||
-
-      (time == other.time 
-       && author == other.author 
-       && changelog > other.changelog) ||
-
-      (time == other.time 
-       && author == other.author 
-       && changelog == other.changelog
-       && branch > other.branch);
+  void note_commit(time_t now)
+  {
+    has_a_commit = true;
+    if (now < first_commit)
+      first_commit = now;
   }
 
-  inline void add_file(file_path const &file, string const &version)
+  void note_branchpoint(time_t now)
   {
-    L(F("Adding file %s version %s to CVS key %d\n") % file % version % id);
-    files.insert( make_pair(file, version) );
+    has_a_branchpoint = true;
+    if (now > last_branchpoint)
+      last_branchpoint = now;
   }
 
-  bool is_synthetic_branch_founding_commit;
-  cvs_branchname branch;
-  cvs_changelog changelog;
-  cvs_author author;
-  time_t time;
-  map<file_path, string> files; // Maps file to version
-  int id; // Only used for debug output
-
-  static int nextid; // Used to initialise id
-};
-
-int cvs_key::nextid = 0;
-
-struct 
-cvs_file_edge
-{
-  cvs_file_edge (file_id const & pv, 
-                 file_path const & pp,
-                 bool pl,
-                 file_id const & cv, 
-                 file_path const & cp,
-                 bool cl,
-                 cvs_history & cvs);
-  cvs_version parent_version;
-  cvs_path parent_path;
-  bool parent_live_p;
-  cvs_version child_version;
-  cvs_path child_path;
-  bool child_live_p;
-  inline bool operator<(cvs_file_edge const & other) const
+  time_t beginning() const
   {
-#if 0
-    return (parent_path < other.parent_path) 
-                       || ((parent_path == other.parent_path) 
-       && ((parent_version < other.parent_version) 
-                       || ((parent_version == other.parent_version) 
-       && ((parent_live_p < other.parent_live_p) 
-                       || ((parent_live_p == other.parent_live_p) 
-       && ((child_path < other.child_path) 
-                       || ((child_path == other.child_path) 
-       && ((child_version < other.child_version) 
-                       || ((child_version == other.child_version) 
-       && (child_live_p < other.child_live_p) )))))))));
-#else
-    return (parent_path < other.parent_path) 
-                        || ((parent_path == other.parent_path) 
-        && ((parent_version < other.parent_version) 
-                        || ((parent_version == other.parent_version) 
-        && ((parent_live_p < other.parent_live_p) 
-                        || ((parent_live_p == other.parent_live_p) 
-        && ((child_path < other.child_path) 
-                        || ((child_path == other.child_path) 
-        && ((child_version < other.child_version) 
-                        || ((child_version == other.child_version) 
-        && (child_live_p < other.child_live_p) )))))))));
-#endif
+    I(has_a_branchpoint || has_a_commit);
+    if (has_a_commit)
+      return first_commit;
+    else
+      return last_branchpoint;
   }
+  
+  void append_commit(cvs_commit const & c) 
+  {
+    note_commit(c.time);
+    lineage.push_back(c);
+  }
 };
 
 struct 
-cvs_state
-{
-  set<cvs_file_edge> in_edges;
-};
-
-typedef map<cvs_key, shared_ptr<cvs_state> > 
-cvs_branch;
-
-struct 
 cvs_history
 {
 
@@ -219,12 +146,6 @@
 
   cycle_detector<unsigned long> manifest_cycle_detector;
 
-  bool find_key_and_state(rcs_file const & r, 
-                          string const & version,
-                          cvs_key & key,
-                          shared_ptr<cvs_state> & state);
-
-
   // assume admin has foo:X.Y.0.N in it, then 
   // this multimap contains entries of the form
   // X.Y   -> foo
@@ -236,16 +157,14 @@
 
   // branch name -> branch
   map<string, shared_ptr<cvs_branch> > branches;
+  shared_ptr<cvs_branch> trunk;
 
-  // branch name -> whether there are any commits on the
-  //                branch (as opposed to just branchpoints)
-  map<string, bool> branch_has_commit;
-
   // stack of branches we're injecting states into
   stack< shared_ptr<cvs_branch> > stk;
   stack< cvs_branchname > bstk;
 
   file_path curr_file;
+  cvs_path curr_file_interned;
   
   string base_branch;
 
@@ -262,22 +181,52 @@
   enum note_type { note_branchpoint,
                    note_branch_first_commit };
 
-  void note_state_at_branch_beginning(rcs_file const & r,
-				      string const & branchname,
-				      string const & version, 
-				      file_id const & ident,
-                                      note_type nt);
+  void push_branch(string const & branch_name, bool private_branch);
+  void pop_branch();
+};
 
-  void push_branch(string const & branch_name, bool private_branch);
 
-  void note_file_edge(rcs_file const & r, 
-                      string const & prev_rcs_version_num,
-                      string const & next_rcs_version_num,
-                      file_id const & prev_version,
-                      file_id const & next_version);
+cvs_commit::cvs_commit(rcs_file const & r, 
+                       string const & rcs_version,
+                       file_id const & ident,
+                       cvs_history & cvs)
+{
+  map<string, shared_ptr<rcs_delta> >::const_iterator delta = 
+    r.deltas.find(rcs_version);
+  I(delta != r.deltas.end());
+    
+  map<string, shared_ptr<rcs_deltatext> >::const_iterator deltatext = 
+    r.deltatexts.find(rcs_version);
+  I(deltatext != r.deltatexts.end());
+    
+  struct tm t;
+  // We need to initialize t to all zeros, because strptime has a habit of
+  // leaving bits of the data structure alone, letting garbage sneak into
+  // our output.
+  memset(&t, 0, sizeof(t));
+  char const * dp = delta->second->date.c_str();
+  L(F("Calculating time of %s\n") % dp);
+#ifdef WIN32
+  I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon), 
+           &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6);
+  t.tm_mon--;
+  // Apparently some RCS files have 2 digit years, others four; tm always
+  // wants a 2 (or 3) digit year (years since 1900).
+  if (t.tm_year > 1900)
+    t.tm_year-=1900;
+#else
+  if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL)
+    I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL);
+#endif
+  time = mktime(&t);
+  L(F("= %i\n") % time);
 
-  void pop_branch();
-};
+  alive = delta->second->state != "dead";
+  changelog = cvs.changelog_interner.intern(deltatext->second->log);
+  author = cvs.author_interner.intern(delta->second->author);
+  path = cvs.curr_file_interned;
+  version = cvs.file_version_interner.intern(ident.inner()());
+}
 
 
 // piece table stuff
@@ -505,6 +454,65 @@
   rcs_put_raw_file_edge(next_id, curr_id, del, db);
 }
 
+
+
+/*
+       
+please read this exhaustingly long comment and understand it
+before mucking with the branch inference logic.
+
+we are processing a file version. a branch might begin here. if
+the current version is X.Y, then there is a branch B starting
+here iff there is a symbol in the admin section called X.Y.0.Z,
+where Z is the branch number (or if there is a private branch 
+called X.Y.Z, which is either an import branch or some private
+RCS cruft).
+
+the version X.Y is then considered the branchpoint of B in the
+current file. this does *not* mean that the CVS key -- an
+abstraction representing whole-tree operations -- of X.Y is the
+branchpoint across the CVS archive we're processing.
+
+in fact, CVS does not record the occurrence of a branching
+action (tag -b). we have no idea who executed that command and
+when. what we know instead is the commit X.Y immediately
+preceeding the branch -- CVS consideres this the branchpoint --
+in this file's reduced view of history. we also know the first
+commit X.Y.Z.1 inside the branch (which might not exist).
+
+our old strategy was to consider all branches nested in a
+hierarchy, which was a super-tree of all the branch trees in all
+the CVS files in a repository. this involved considering X.Y as
+the parent version of branch X.Y.Z, an selecting "the"
+branchpoint connecting the two as the least CVS key X.Y.Z.1
+committed inside the branch B.
+
+this was a mistake, for two significant reasons.
+
+first, some files do not *have* any commit inside the branch B,
+only a branchpoint X.Y.0.Z. this branchpoint is actually the
+last commit *before* the user branched, and could be a very old
+commit, long before the branch was formed, so it is useless in
+determining the branch structure.
+
+second, some files do not have a branch B, or worse, have
+branched into B from an "ancestor" branch A, where a different
+file branches into B from a different ancestor branch C. in
+other words, while there *is* a tree structure within the X.Y.Z
+branches of each file, there is *no* shared tree structure
+between the branch names across a repository. in one file A can
+be an ancestor of B, in another file B can be an ancestor of A.
+
+thus, we give up on establishing a hierarchy between branches
+altogether. all branches exist in a flat namespace, and all are
+direct descendents of the empty revision at the root of
+history. each branchpoint symbol mentioned in the
+administrative section of a file is considered the root of a new
+lineage.
+      
+*/
+
+
 static void 
 process_branch(string const & begin_version, 
                vector< piece > const & begin_lines,
@@ -525,10 +533,15 @@
   while(! (r.deltas.find(curr_version) == r.deltas.end()))
     {
       L(F("version %s has %d lines\n") % curr_version % curr_lines->size());
+
+      cvs_commit curr_commit(r, curr_version, curr_id, cvs);
+      cvs.stk.top()->append_commit(curr_commit);
+      ++cvs.n_versions;
       
       string next_version = r.deltas.find(curr_version)->second->next;
-      if (!next_version.empty())
-      {  // construct this edge on our own branch
+
+      if (! next_version.empty())
+      {  
          L(F("following RCS edge %s -> %s\n") % curr_version % next_version);
 
          construct_version(*curr_lines, next_version, *next_lines, r);
@@ -537,100 +550,26 @@
 
          insert_into_db(curr_data, curr_id, 
                      *next_lines, next_data, next_id, db);
-
-          cvs.note_file_edge (r, curr_version, next_version, 
-                          file_id(curr_id), file_id(next_id));
       }
-      else
-      {  L(F("revision %s has no successor\n") % curr_version);
-         if (curr_version=="1.1")
-         {  // mark this file as newly present since this commit
-            // (and as not present before)
-            
-            // perhaps this should get a member function of cvs_history ?
-            L(F("marking %s as not present in older manifests\n") % curr_version);
-            cvs_key k;
-            shared_ptr<cvs_state> s;
-            cvs.find_key_and_state(r, curr_version, k, s);
-            I(r.deltas.find(curr_version) != r.deltas.end());
-            bool live_p = r.deltas.find(curr_version)->second->state != "dead";
-            s->in_edges.insert(cvs_file_edge(curr_id, cvs.curr_file, false,
-                                             curr_id, cvs.curr_file, live_p,
-                                             cvs));
-            ++cvs.n_versions;
-         }
-      }
 
-
-      /*
-       
-      please read this exhaustingly long comment and understand it
-      before mucking with the branch inference logic.
-
-      we are processing a file version. a branch might begin here. if
-      the current version is X.Y, then there is a branch B starting
-      here iff there is a symbol in the admin section called X.Y.0.Z,
-      where Z is the branch number (or if there is a private branch 
-      called X.Y.Z, which is either an import branch or some private
-      RCS cruft).
-
-      the version X.Y is then considered the branchpoint of B in the
-      current file. this does *not* mean that the CVS key -- an
-      abstraction representing whole-tree operations -- of X.Y is the
-      branchpoint across the CVS archive we're processing.
-
-      in fact, CVS does not record the occurrence of a branching
-      action (tag -b). we have no idea who executed that command and
-      when. what we know instead is the commit X.Y immediately
-      preceeding the branch -- CVS consideres this the branchpoint --
-      in this file's reduced view of history. we also know the first
-      commit X.Y.Z.1 inside the branch (which might not exist).
-
-      our old strategy was to consider all branches nested in a
-      hierarchy, which was a super-tree of all the branch trees in all
-      the CVS files in a repository. this involved considering X.Y as
-      the parent version of branch X.Y.Z, an selecting "the"
-      branchpoint connecting the two as the least CVS key X.Y.Z.1
-      committed inside the branch B.
-
-      this was a mistake, for two significant reasons.
-
-      first, some files do not *have* any commit inside the branch B,
-      only a branchpoint X.Y.0.Z. this branchpoint is actually the
-      last commit *before* the user branched, and could be a very old
-      commit, long before the branch was formed, so it is useless in
-      determining the branch structure.
-
-      second, some files do not have a branch B, or worse, have
-      branched into B from an "ancestor" branch A, where a different
-      file branches into B from a different ancestor branch C. in
-      other words, while there *is* a tree structure within the X.Y.Z
-      branches of each file, there is *no* shared tree structure
-      between the branch names across a repository. in one file A can
-      be an ancestor of B, in another file B can be an ancestor of A.
-
-      thus, we give up on establishing a hierarchy between branches
-      altogether. all branches exist in a flat namespace, and all are
-      direct descendents of the empty revision at the root of
-      history. each branchpoint symbol mentioned in the
-      administrative section of a file is considered the root of a new
-      lineage.
-      
-      */
-
+      // mark the beginning-of-branch time and state of this file if
+      // we're at a branchpoint
       typedef multimap<string,string>::const_iterator ity;
       pair<ity,ity> range = cvs.branchpoints.equal_range(curr_version);
       if (range.first != cvs.branchpoints.end() 
-	  && range.first->first == curr_version)
-	{
-	  for (ity branch = range.first; branch != range.second; ++branch)
-	    {
-	      cvs.note_state_at_branch_beginning(r, branch->second,
-						 curr_version, 
-						 curr_id,
-                                                 cvs_history::note_branchpoint);
-	    }
-	}
+          && range.first->first == curr_version)
+        {
+          for (ity i = range.first; i != range.second; ++i)
+            {
+              shared_ptr<cvs_branch> b = cvs.stk.top();
+              cvs.push_branch(i->second, false);   
+              if (curr_commit.alive)
+                b->live_at_beginning[cvs.curr_file_interned] = curr_commit.version;
+              b->note_branchpoint(curr_commit.time);
+              cvs.pop_branch();
+            }
+        }
+                
 
       // recursively follow any branch commits coming from the branchpoint
       boost::shared_ptr<rcs_delta> curr_delta = r.deltas.find(curr_version)->second;
@@ -638,50 +577,41 @@
 	  i != curr_delta->branches.end(); ++i)
 	{
 	  string branch;
+	  data branch_data;
+	  hexenc<id> branch_id;
+	  vector< piece > branch_lines;
 	  bool priv = false;
 	  map<string, string>::const_iterator be = cvs.branch_first_entries.find(*i);
-
+          
 	  if (be != cvs.branch_first_entries.end())
 	    branch = be->second;
 	  else
 	    priv = true;
 	  
 	  L(F("following RCS branch %s = '%s'\n") % (*i) % branch);
-	  vector< piece > branch_lines;
-	  construct_version(*curr_lines, *i, branch_lines, r);
-          	  
-	  data branch_data;
-	  hexenc<id> branch_id;
+          
+	  construct_version(*curr_lines, *i, branch_lines, r);          	  
 	  insert_into_db(curr_data, curr_id, 
-			 branch_lines, branch_data, branch_id, db);
-
-          // update the branch beginning time to reflect improved
-          // information, if there was a commit on the branch
-          if (!priv)
-            cvs.note_state_at_branch_beginning(r, branch,
-                                               *i, 
-                                               branch_id,
-                                               cvs_history::note_branch_first_commit);
-
-	  cvs.push_branch (branch, priv);
-	  cvs.note_file_edge (r, curr_version, *i,
-			      file_id(curr_id), file_id(branch_id));	      
+                         branch_lines, branch_data, branch_id, db);
+          
+          cvs.push_branch(branch, priv);
 	  process_branch(*i, branch_lines, branch_data, branch_id, r, db, cvs);
 	  cvs.pop_branch();
 	  
 	  L(F("finished RCS branch %s = '%s'\n") % (*i) % branch);
 	}
-
+      
       if (!r.deltas.find(curr_version)->second->next.empty())
-      {  // advance
-         curr_data = next_data;
-         curr_id = next_id;
-         curr_version = next_version;
-         swap(next_lines, curr_lines);
-         next_lines->clear();
-      }
+        {  
+          // advance
+          curr_data = next_data;
+          curr_id = next_id;
+          curr_version = next_version;
+          swap(next_lines, curr_lines);
+          next_lines->clear();
+        }
       else break;
-    }
+    }  
 } 
 
 
@@ -713,10 +643,10 @@
         
     {
       // create the head state in case it is a loner
-      cvs_key k;
-      shared_ptr<cvs_state> s;
-      L(F("noting head version %s : %s\n") % cvs.curr_file % r.admin.head);
-      cvs.find_key_and_state (r, r.admin.head, k, s);
+      //       cvs_key k;
+      //       shared_ptr<cvs_state> s;
+      //       L(F("noting head version %s : %s\n") % cvs.curr_file % r.admin.head);
+      //       cvs.find_key_and_state (r, r.admin.head, k, s);
     }
     
     global_pieces.reset();
@@ -755,42 +685,7 @@
 
 // CVS importing stuff follows
 
-/*
 
-  we define a "cvs key" as a triple of author, commit time and
-  changelog. the equality of keys is a bit blurry due to a window of time
-  in which "the same" commit may begin and end. the window is evaluated
-  during the multimap walk though; for insertion in the multimap a true >
-  is used. a key identifies a particular commit.
-
-  we reconstruct the history of a CVS archive by accumulating file edges
-  into archive nodes. each node is called a "cvs_state", but it is really a
-  collection of file *edges* leading into that archive state. we accumulate
-  file edges by walking up the trunk and down the branches of each RCS file.
-
-  once we've got all the edges accumulated into archive nodes, we walk the
-  tree of cvs_states, up through the trunk and down through the branches,
-  carrying a manifest_map with us during the walk. for each edge, we
-  construct either the parent or child state of the edge (depending on
-  which way we're walking) and then calculate and write out a manifest
-  delta for the difference between the previous and current manifest map. we
-  also write out manifest certs, though the direction of ancestry changes
-  depending on whether we're going up the trunk or down the branches.
-
- */
-
-cvs_file_edge::cvs_file_edge (file_id const & pv, file_path const & pp, bool pl,
-                              file_id const & cv, file_path const & cp, bool cl,
-                              cvs_history & cvs) :
-  parent_version(cvs.file_version_interner.intern(pv.inner()())), 
-  parent_path(cvs.path_interner.intern(pp())),
-  parent_live_p(pl),
-  child_version(cvs.file_version_interner.intern(cv.inner()())), 
-  child_path(cvs.path_interner.intern(cp())),
-  child_live_p(cl)
-{
-}
-
 static void
 split_version(string const & v, vector<string> & vs)
 {
@@ -814,49 +709,6 @@
     }
 }
 
-cvs_key::cvs_key(rcs_file const & r, string const & version,
-                 cvs_history & cvs) :
-  is_synthetic_branch_founding_commit(false)
-{
-  map<string, shared_ptr<rcs_delta> >::const_iterator delta = 
-    r.deltas.find(version);
-  I(delta != r.deltas.end());
-
-  map<string, shared_ptr<rcs_deltatext> >::const_iterator deltatext = 
-    r.deltatexts.find(version);
-  I(deltatext != r.deltatexts.end());
-
-  {    
-    struct tm t;
-    // We need to initialize t to all zeros, because strptime has a habit of
-    // leaving bits of the data structure alone, letting garbage sneak into
-    // our output.
-    memset(&t, 0, sizeof(t));
-    char const * dp = delta->second->date.c_str();
-    L(F("Calculating time of %s\n") % dp);
-#ifdef WIN32
-    I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon), 
-             &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6);
-    t.tm_mon--;
-    // Apparently some RCS files have 2 digit years, others four; tm always
-    // wants a 2 (or 3) digit year (years since 1900).
-    if (t.tm_year > 1900)
-        t.tm_year-=1900;
-#else
-    if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL)
-      I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL);
-#endif
-    time=mktime(&t);
-    L(F("= %i\n") % time);
-    id = nextid++;
-  }
-
-  branch = cvs.bstk.top();
-  changelog = cvs.changelog_interner.intern(deltatext->second->log);
-  author = cvs.author_interner.intern(delta->second->author);
-}
-
-
 cvs_history::cvs_history() :
   n_versions("versions", "v", 1),
   n_tree_branches("branches", "b", 1)
@@ -879,6 +731,7 @@
         && ss.substr(last_slash-5,6)=="Attic/")
      ss.erase(last_slash-5,6);
   curr_file = file_path(ss);
+  curr_file_interned = path_interner.intern(ss);
 }
 
 void cvs_history::index_branchpoint_symbols(rcs_file const & r)
@@ -917,243 +770,37 @@
     }
 }
 
-void
-cvs_history::note_state_at_branch_beginning(rcs_file const & r,
-					    string const & branchname,
-					    string const & version, 
-					    file_id const & ident,
-                                            note_type nt)
-{
-  // here we manufacture a single synthetic commit -- the "branch
-  // birth" commit -- representing the cumulative affect of all the
-  // tag -b operations the user once performed. it has a synthetic
-  // author ("cvs_import") and a synthetic log message ("beginning of
-  // branch foo"), and occurs at the time of the *last* branchpoint of
-  // any files which entered this branch.
-  //
-  // note that this does not establish a revision-ancestry
-  // relationship between the branchpoint and the branch. the branch
-  // is considered a child of the null revision, as far as monotone is
-  // concerned.
 
-  L(F("noting branchpoint for %s = %s\n") % branchname % version);
 
-  push_branch(branchname, false);
-
-  cvs_key k;
-  shared_ptr<cvs_state> s;
-  I(stk.size() > 0);
-  shared_ptr<cvs_branch> branch = stk.top();
-
-  string branch_birth_message = "beginning of branch " + branchname;
-  string branch_birth_author = "cvs_import";
-
-  cvs_changelog clog = changelog_interner.intern(branch_birth_message);
-  cvs_author auth = author_interner.intern(branch_birth_author);
-
-  // note: SBFC is short for "synthetic branch-founding commit"
-
-  if (branch->empty())
-    {
-      I(nt == note_branchpoint);
-      find_key_and_state (r, version, k, s);
-      branch->erase(k);
-      k.changelog = clog;
-      k.author = auth;
-      k.add_file(curr_file, version);
-      k.is_synthetic_branch_founding_commit = true;
-      branch->insert(make_pair(k, s));
-      L(F("added SBFC for %s at id %d, time %d\n") 
-        % branchname % k.id % k.time);
-    }
-  else
-    {
-      cvs_key nk(r, version, *this);
-      
-      cvs_branch::iterator i = branch->end(); 
-      i--;
-      I(i->first.is_synthetic_branch_founding_commit);
-      I(i->first.author == auth);
-      I(i->first.changelog == clog);
-      
-      k = i->first;
-      s = i->second;
-
-      L(F("found existing SBFC at id %d, time %d\n") 
-        % k.id % k.time);
-      if (nt == note_branchpoint 
-          && nk.time > k.time
-          && branch_has_commit[branchname] == false)
-	{
-          L(F("moving SBFC for %s to later branchpoint at %d\n") 
-            % branchname % nk.time);
-	  branch->erase(i);
-	  k.time = nk.time;
-	  k.add_file(curr_file, version);
-	  branch->insert(make_pair(k, s));
-	}
-      else if (nt == note_branch_first_commit
-               && nk.time < k.time)
-	{
-          L(F("moving SBFC for %s to earlier branch commit at %d\n") 
-            % branchname % nk.time);
-	  branch->erase(i);
-	  k.time = nk.time;
-	  branch->insert(make_pair(k, s));
-          branch_has_commit[branchname] = true;
-	}
-
-    }
-
-  if (nt == note_branchpoint)
-    {
-      map<string, shared_ptr<rcs_delta> >::const_iterator del;
-      del = r.deltas.find(version);
-      I(del != r.deltas.end());
-      bool alive = del->second->state != "dead";
-      
-      s->in_edges.insert(cvs_file_edge(file_id(), curr_file, alive,
-                                       ident, curr_file, alive,
-                                       *this));
-    }
-
-  pop_branch();
-}
-
-bool 
-cvs_history::find_key_and_state(rcs_file const & r, 
-                                string const & version,
-                                cvs_key & key,
-                                shared_ptr<cvs_state> & state)
-{
-  I(stk.size() > 0);
-  shared_ptr<cvs_branch> branch = stk.top();
-  cvs_key nk(r, version, *this);
-
-  nk.add_file(curr_file, version);
-  // key+(window/2) is in the future, key-(window/2) is in the past. the
-  // past is considered "greater than" the future in this map, so we take:
-  // 
-  //  - new, the lower bound of key+(window/2) in the map
-  //  - old, the upper bound of key-(window/2) in the map
-  //
-  // and search all the nodes inside this section, from new to old bound.
-
-  map< cvs_key, shared_ptr<cvs_state> >::iterator i_new, i_old, i;
-  cvs_key k_new(nk), k_old(nk);
-
-  if (static_cast<time_t>(k_new.time + constants::cvs_window / 2) > k_new.time)
-    k_new.time += constants::cvs_window / 2;
-
-  if (static_cast<time_t>(k_old.time - constants::cvs_window / 2) < k_old.time)
-    k_old.time -= constants::cvs_window / 2;
-  
-  i_new = branch->lower_bound(k_new);
-  i_old = branch->upper_bound(k_old);
-
-  for (i = i_new; i != i_old; ++i)
-    {
-      if (i->first.similar_enough(nk))
-        {
-          key = i->first;
-          state = i->second;
-	  branch->erase(i);
-          key.add_file(curr_file, version);
-	  branch->insert(make_pair(key,state));
-          return true;
-        }
-    }
-  key = nk;
-  state = shared_ptr<cvs_state>(new cvs_state());
-  branch->insert(make_pair(key, state));
-  return false;
-}
-
 void 
 cvs_history::push_branch(string const & branch_name, bool private_branch)
-{      
+{
   shared_ptr<cvs_branch> branch;
-
+  
+  string bname = base_branch + "." + branch_name;
   I(stk.size() > 0);
 
-  map<string, shared_ptr<cvs_branch> >::const_iterator b = branches.find(branch_name);
-  if (b == branches.end())
-    {
-      branch = shared_ptr<cvs_branch>(new cvs_branch());
-      if (!private_branch)
-	branches.insert(make_pair(branch_name, branch));      
-    }
-  else
-    branch = b->second;
-
-  stk.push(branch);
-
   if (private_branch)
-    bstk.push(bstk.top());
-  else
-    bstk.push(branch_interner.intern(base_branch + "." + branch_name));
-}
-
-void 
-cvs_history::note_file_edge(rcs_file const & r, 
-                            string const & prev_rcs_version_num,
-                            string const & next_rcs_version_num,
-                            file_id const & prev_version,
-                            file_id const & next_version) 
-{
-
-  cvs_key k;
-  shared_ptr<cvs_state> s;
-
-  I(stk.size() > 0);
-  I(! curr_file().empty());
-
-  L(F("noting file edge %s -> %s\n") % prev_rcs_version_num % next_rcs_version_num);
-
-  // we can't use operator[] since it is non-const
-  std::map<std::string, boost::shared_ptr<rcs_delta> >::const_iterator
-        prev_delta = r.deltas.find(prev_rcs_version_num),
-        next_delta = r.deltas.find(next_rcs_version_num);
-  I(prev_delta!=r.deltas.end());
-  I(next_delta!=r.deltas.end());
-  bool prev_alive = prev_delta->second->state!="dead";
-  bool next_alive = next_delta->second->state!="dead";
-  
-  L(F("note_file_edge %s %d -> %s %d\n") 
-    % prev_rcs_version_num % prev_alive
-    % next_rcs_version_num % next_alive);
-
-  // we always aggregate in-edges in children, but we will also create
-  // parents as we encounter them.
-  if (stk.size() == 1)
     {
-      // we are on the trunk, prev is child, next is parent.
-      L(F("noting trunk edge %s : %s -> %s\n") % curr_file
-        % next_rcs_version_num
-        % prev_rcs_version_num);
-      find_key_and_state (r, next_rcs_version_num, k, s); // just to create it if necessary      
-      find_key_and_state (r, prev_rcs_version_num, k, s);
-
-      L(F("trunk edge entering key state %d\n") % k.id);
-      s->in_edges.insert(cvs_file_edge(next_version, curr_file, next_alive,
-                                       prev_version, curr_file, prev_alive,
-                                       *this));
+      stk.push(stk.top());
+      bstk.push(bstk.top());
+      return;
     }
   else
-    {
-      // we are on a branch, prev is parent, next is child.
-      L(F("noting branch edge %s : %s -> %s\n") % curr_file
-        % prev_rcs_version_num
-        % next_rcs_version_num);
-      find_key_and_state (r, next_rcs_version_num, k, s);
-      L(F("branch edge on %s entering key state %d\n") 
-        % branch_interner.lookup(k.branch) % k.id);
-      s->in_edges.insert(cvs_file_edge(prev_version, curr_file, prev_alive,
-                                       next_version, curr_file, next_alive,
-                                       *this));
+    {  
+      map<string, shared_ptr<cvs_branch> >::const_iterator b = branches.find(bname);
+      if (b == branches.end())
+        {
+          branch = shared_ptr<cvs_branch>(new cvs_branch());
+          branches.insert(make_pair(bname, branch));
+          ++n_tree_branches;
+        }
+      else
+        branch = b->second;
+      
+      stk.push(branch);
+      bstk.push(branch_interner.intern(bname));
     }
-    
-  ++n_versions;
 }
 
 void 
@@ -1190,35 +837,414 @@
 };
 
 
-static void 
-store_manifest_edge(manifest_map const & parent,
-                    manifest_map const & child,
-                    manifest_id const & parent_mid,
-                    manifest_id const & child_mid,
-                    app_state & app,
-                    cvs_history & cvs,
-                    bool head_manifest_p)
+
+
+//
+// our task here is to produce a sequence of revision descriptions
+// from the per-file commit records we have. we do this by rolling
+// forwards through the temporally sorted file-commit list
+// accumulating file-commits into revisions and flushing the
+// revisions when we feel they are "complete".
+//
+// revisions have to have a time associated with them. this time
+// will be the first time of any commit associated with the
+// revision. they have an author and a changelog, which is shared
+// by all the file-commits in the revision.
+//
+// there might be multiple revisions overlapping in time. this is
+// legal wrt. CVS. we keep a set, and search all members of the set
+// for the best match.
+//
+// consider this situation of overlapping revisions:
+//
+//    +---------------+   +---------------+   +---------------+
+//    | rev #1 @ 0011 |   | rev #2 @ 0012 |   | rev #3 @ 0013 |
+//    |~~~~~~~~~~~~~~~|   |~~~~~~~~~~~~~~~|   |~~~~~~~~~~~~~~~|
+//    | patch foo.txt |   | patch bar.txt |   | patch baz.txt |
+//    +---------------+   +---------------+   +---------------+
+//
+// suppose you have this situation and you run across a "patch
+// bar.txt" commit at timestamp 0014. what do you do?
+//
+// - you know that rev #2 cannot accept this commit, simply because
+//   two commits on the same file makes *two* revisions, not one.
+//
+// - perhaps rev #3 could accept it; after all, it could be that the
+//   commit associated with rev #2 released its commit lock, and the
+//   commit associated with rev #3 quickly updated and committed at
+//   0013, finishing off at 0014.
+//
+// - can rev #1 accept it? no. because CVS calcualted the version it 
+//   expected to see in bar.txt before calling up the server, when
+//   committing rev #1. the version it expected to see was the version
+//   in bar.txt *before* time 0012; that is, before rev #2 had any affect
+//   on bar.txt. when it contacted the server, the commit associated 
+//   with rev #1 would have aborted if it had seen any other number. 
+//   so rev #1 could not start before an edit to bar.txt and then
+//   include its own edit to bar.txt.
+//
+// so we have only one case where bar.txt can be accepted. if the
+// commit is not accepted into a legal rev (outside the window,
+// wrong changelog/author) it starts a new revision.
+//
+// as we scan forwards, if we hit timestamps which lie beyond rev #n's
+// window, we flush rev #n. 
+//
+// if there are multiple coincident and legal revs to direct a
+// commit to (all with the same author/changelog), we direct the
+// commit to the rev with the closest initial timestamp. that is,
+// the *latest* beginning time.
+
+struct
+cvs_cluster
 {
+  time_t first_time;
+  cvs_author author;
+  cvs_changelog changelog;
 
-  L(F("storing manifest %s (base %s)\n") % parent_mid % child_mid);
+  cvs_cluster(time_t t, 
+              cvs_author a,
+              cvs_changelog c) 
+    : first_time(t),
+      author(a),
+      changelog(c)
+  {}
 
-  if (head_manifest_p)
+  struct entry
+  {
+    bool live;
+    cvs_version version;
+    time_t time;
+    entry(bool l, cvs_version v, time_t t)
+      : live(l),
+        version(v),
+        time(t)
+    {}
+  };
+
+  typedef map<cvs_path, entry> entry_map;
+  entry_map entries;
+};
+
+
+struct
+cluster_consumer
+{
+  cvs_history & cvs;
+  app_state & app;
+  string const & branchname;
+  cvs_branch const & branch;
+  map<cvs_path, cvs_version> live_files;
+  ticker & n_revisions;
+
+  struct prepared_revision
+  {
+    prepared_revision(revision_id i, 
+                      shared_ptr<revision_set> r,
+                      cvs_cluster const & c);
+    revision_id rid;
+    shared_ptr<revision_set> rev;
+    time_t time;
+    cvs_author author;
+    cvs_changelog changelog;
+  };
+
+  vector<prepared_revision> preps;
+
+  manifest_map parent_map, child_map;
+  manifest_id parent_mid, child_mid;
+  revision_id parent_rid, child_rid;
+
+  cluster_consumer(cvs_history & cvs,
+                   app_state & app,
+                   string const & branchname,
+                   cvs_branch const & branch,
+                   ticker & n_revs);
+
+  void consume_cluster(cvs_cluster const & c, 
+                       bool head_p);
+  void build_change_set(cvs_cluster const & c,
+                        change_set & cs);
+  void store_manifest_edge(bool head_p);
+  void store_auxiliary_certs(prepared_revision const & p);
+  void store_revisions();
+};
+
+typedef shared_ptr<cvs_cluster> 
+cluster_ptr;
+
+struct
+cluster_ptr_lt
+{
+  bool operator()(cluster_ptr const & a,
+                  cluster_ptr const & b) const
+  {
+    return a->first_time < b->first_time;
+  }
+};
+
+typedef set<cluster_ptr, cluster_ptr_lt>
+cluster_set;
+
+void 
+import_branch(cvs_history & cvs, 
+              app_state & app,
+              string const & branchname,
+              shared_ptr<cvs_branch> const & branch,
+              ticker & n_revs)
+{
+  cluster_set clusters;
+  cluster_consumer cons(cvs, app, branchname, *branch, n_revs);
+  unsigned long commits_remaining = branch->lineage.size();
+
+  // step 1: sort the lineage
+  sort(branch->lineage.begin(), branch->lineage.end());
+
+  for (vector<cvs_commit>::const_iterator i = branch->lineage.begin();
+       i != branch->lineage.end(); ++i)
+    {      
+      commits_remaining--;
+
+      L(F("examining next commit [t:%d] [a:%d] [c:%d]\n")
+        % i->time % i->author % i->changelog);
+      
+      // step 2: expire all clusters from the beginning of the set which
+      // have passed the window size
+      while (!clusters.empty())
+        {
+          cluster_set::const_iterator j = clusters.begin();
+          if ((*j)->first_time + window < i->time)
+            {
+              L(F("expiring cluster\n"));
+              cons.consume_cluster(**j, false);
+              clusters.erase(j);
+            }
+          else
+            break;
+        }
+      
+      // step 3: find the last still-live cluster to have touched this
+      // file
+      time_t last_modify_time = 0;
+      for (cluster_set::const_iterator j = clusters.begin();
+           j != clusters.end(); ++j)
+        {
+          cvs_cluster::entry_map::const_iterator k = (*j)->entries.find(i->path);
+          if (k != (*j)->entries.end() &&
+              k->second.time > last_modify_time)
+            last_modify_time = k->second.time;
+        }
+      L(F("last modification time is %d\n") % last_modify_time);
+      
+      // step 4: find a cluster which starts after the
+      // last_modify_time, which doesn't modify the file in question,
+      // and which contains the same author and changelog as our
+      // commit
+      cluster_ptr target;
+      for (cluster_set::const_iterator j = clusters.begin();
+           j != clusters.end(); ++j)
+        {
+          if (((*j)->first_time > last_modify_time)
+              && ((*j)->author == i->author)
+              && ((*j)->changelog == i->changelog)
+              && ((*j)->entries.find(i->path) == (*j)->entries.end()))
+            {              
+              L(F("picked existing cluster target\n"));
+              target = (*j);
+            }
+        }
+      
+      // if we're still not finding an active cluster,
+      // this is probably the first commit in it. make 
+      // a new one.
+      if (!target)
+        {
+          L(F("building new cluster [t:%d] [a:%d] [c:%d]\n")
+            % i->time % i->author % i->changelog);
+          target = cluster_ptr(new cvs_cluster(i->time, 
+                                               i->author, 
+                                               i->changelog));
+          clusters.insert(target);
+        }
+      
+      I(target);
+      target->entries.insert(make_pair(i->path, 
+                                       cvs_cluster::entry(i->alive, 
+                                                          i->version,
+                                                          i->time)));
+    }
+
+
+  // now we are done this lineage; flush all remaining clusters
+  L(F("finished branch commits, writing all pending clusters\n"));
+  while (!clusters.empty())
     {
+      cons.consume_cluster(**clusters.begin(), clusters.size() == 1);
+      clusters.erase(clusters.begin());
+    }
+  L(F("finished writing pending clusters\n"));
+
+  cons.store_revisions();
+
+}
+
+
+void 
+import_cvs_repo(fs::path const & cvsroot, 
+                app_state & app)
+{
+  N(!fs::exists(cvsroot / "CVSROOT"),
+    F("%s appears to be a CVS repository root directory\n"
+      "try importing a module instead, with 'cvs_import %s/<module_name>")
+    % cvsroot.native_directory_string() % cvsroot.native_directory_string());
+  
+  {
+    // early short-circuit to avoid failure after lots of work
+    rsa_keypair_id key;
+    N(guess_default_key(key,app),
+      F("no unique private key for cert construction"));
+    require_password(key, app);
+  }
+
+  cvs_history cvs;
+  N(app.branch_name() != "", F("need base --branch argument for importing"));
+  cvs.base_branch = app.branch_name();
+
+  // push the trunk
+  cvs.trunk = shared_ptr<cvs_branch>(new cvs_branch());
+  cvs.stk.push(cvs.trunk);
+  cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch));
+
+  {
+    transaction_guard guard(app.db);
+    cvs_tree_walker walker(cvs, app.db);
+    N( fs::exists(cvsroot),
+       F("path %s does not exist") % cvsroot.string());
+    N( fs::is_directory(cvsroot),
+       F("path %s is not a directory") % cvsroot.string());
+    app.db.ensure_open();
+    N(chdir(cvsroot.native_directory_string().c_str()) == 0,
+      F("could not change directory to %s") % cvsroot.string());
+    walk_tree(walker);
+    guard.commit();
+  }
+
+  I(cvs.stk.size() == 1);
+
+  ticker n_revs("finished revisions", "r", 1);
+
+  for (map<string, shared_ptr<cvs_branch> >::const_iterator i = cvs.branches.begin();
+       i != cvs.branches.end(); ++i)
+    {
+      L(F("branch %s has %d entries\n") % i->first % i->second->lineage.size());
+      import_branch(cvs, app, i->first, i->second, n_revs);
+    }
+
+  L(F("trunk has %d entries\n") % cvs.trunk->lineage.size());
+  import_branch(cvs, app, cvs.base_branch, cvs.trunk, n_revs);
+
+  return;
+
+}
+
+cluster_consumer::cluster_consumer(cvs_history & cvs,
+                                   app_state & app,
+                                   string const & branchname,
+                                   cvs_branch const & branch,
+                                   ticker & n_revs)
+  : cvs(cvs), 
+    app(app), 
+    branchname(branchname),
+    branch(branch),
+    n_revisions(n_revs)
+{
+  if (!branch.live_at_beginning.empty())
+    {
+      cvs_author synthetic_author = 
+        cvs.author_interner.intern("cvs_import");
+      
+      cvs_changelog synthetic_cl = 
+        cvs.changelog_interner.intern("beginning of branch " 
+                                      + branchname);
+      
+      time_t synthetic_time = branch.beginning();
+      cvs_cluster initial_cluster(synthetic_time, 
+                                  synthetic_author, 
+                                  synthetic_cl);
+      
+      L(F("initial cluster on branch %s has %d live entries\n") % 
+        branchname % branch.live_at_beginning.size());
+
+      for (map<cvs_path, cvs_version>::const_iterator i = branch.live_at_beginning.begin();
+           i != branch.live_at_beginning.end(); ++i)
+        {
+          cvs_cluster::entry e(true, i->second, synthetic_time);
+          L(F("initial cluster contains %s at %s\n") % 
+            cvs.path_interner.lookup(i->first) %
+            cvs.file_version_interner.lookup(i->second));
+          initial_cluster.entries.insert(make_pair(i->first, e));
+        }
+      consume_cluster(initial_cluster, false);
+    }
+}
+  
+cluster_consumer::prepared_revision::prepared_revision(revision_id i, 
+                                                       shared_ptr<revision_set> r,
+                                                       cvs_cluster const & c)
+  : rid(i), 
+    rev(r), 
+    time(c.first_time), 
+    author(c.author), 
+    changelog(c.changelog)
+{
+}
+
+
+void 
+cluster_consumer::store_revisions()
+{
+  for (vector<prepared_revision>::const_iterator i = preps.begin();
+       i != preps.end(); ++i)
+    {
+      if (! app.db.revision_exists(i->rid))
+        {
+          data tmp;
+          write_revision_set(*(i->rev), tmp);
+          /*
+          cout << "+++WRITING REVISION" << endl;
+          cout << tmp << endl;
+          cout << "---WRITING REVISION" << endl;
+          */
+          app.db.put_revision(i->rid, *(i->rev));
+          store_auxiliary_certs(*i);
+          ++n_revisions;
+        }
+    }
+}
+
+void
+cluster_consumer::store_manifest_edge(bool head_p)
+{
+  L(F("storing manifest '%s' (base %s)\n") % parent_mid % child_mid);
+
+  if (head_p)
+    {
       L(F("storing head %s\n") % child_mid);
       // a branch has one very important manifest: the head.  this is
       // the "newest" of all manifests within the branch (including
-      // the trunk), and we store it in its entirety.
+      // the trunk), and we store it in its entirety, before the
+      // cluster consumer is destroyed.
       if (! app.db.manifest_version_exists(child_mid))
         {
           manifest_data mdat;
-          write_manifest_map(child, mdat);
+          write_manifest_map(child_map, mdat);
           app.db.put_manifest(child_mid, mdat);
         }
     }
 
   if (null_id(parent_mid))
     {
-      L(F("skipping null manifest\n"));
+      L(F("skipping delta to null manifest\n"));
       return;
     }
 
@@ -1238,7 +1264,7 @@
 	{
 	  L(F("writing full manifest %s\n") % parent_mid);
 	  manifest_data mdat;
-	  write_manifest_map(parent, mdat);
+	  write_manifest_map(parent_map, mdat);
 	  app.db.put_manifest(parent_mid, mdat);
 	}
       return;
@@ -1256,214 +1282,84 @@
   // run from child (new) -> parent (old).
       
   delta del;
-  diff(child, parent, del);
+  diff(child_map, parent_map, del);
   rcs_put_raw_manifest_edge(parent_mid.inner(),
 			    child_mid.inner(),
 			    del, app.db);
 }
 
-
-static void 
-store_auxiliary_certs(cvs_key const & key, 
-                      revision_id const & id, 
-                      app_state & app, 
-                      cvs_history const & cvs)
+void 
+cluster_consumer::store_auxiliary_certs(prepared_revision const & p)
 {
   packet_db_writer dbw(app);
-  cert_revision_in_branch(id, cert_value(cvs.branch_interner.lookup(key.branch)), app, dbw); 
-  cert_revision_author(id, cvs.author_interner.lookup(key.author), app, dbw); 
-  cert_revision_changelog(id, cvs.changelog_interner.lookup(key.changelog), app, dbw);
-  cert_revision_date_time(id, key.time, app, dbw);
+  cert_revision_in_branch(p.rid, cert_value(branchname), app, dbw); 
+  cert_revision_author(p.rid, cvs.author_interner.lookup(p.author), app, dbw); 
+  cert_revision_changelog(p.rid, cvs.changelog_interner.lookup(p.changelog), app, dbw);
+  cert_revision_date_time(p.rid, p.time, app, dbw);
 }
 
-static void 
-build_change_set(shared_ptr<cvs_state> state,
-                 manifest_map const & state_map,
-                 cvs_history & cvs,
-                 change_set & cs)
+void
+cluster_consumer::build_change_set(cvs_cluster const & c,
+                                   change_set & cs)
 {
-  change_set empty;
-  cs = empty;
-
-  for (set<cvs_file_edge>::const_iterator f = state->in_edges.begin();
-       f != state->in_edges.end(); ++f)
+  for (cvs_cluster::entry_map::const_iterator i = c.entries.begin();
+       i != c.entries.end(); ++i)
     {
-      file_id fid(cvs.file_version_interner.lookup(f->child_version));
-      file_path pth(cvs.path_interner.lookup(f->child_path));
-      if (!f->child_live_p)
-        {  
-          if (f->parent_live_p)
+      file_path pth(cvs.path_interner.lookup(i->first));
+      file_id fid(cvs.file_version_interner.lookup(i->second.version));
+      if (i->second.live)
+        {
+          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
+          if (e == live_files.end())
             {
-              L(F("deleting entry state '%s' on '%s'\n") % fid % pth);              
-              cs.delete_file(pth);
+              L(F("adding entry state '%s' on '%s'\n") % fid % pth);          
+              cs.add_file(pth, fid);
+              live_files[i->first] = i->second.version;
             }
-          else
+          else if (e->second != i->second.version)
             {
-              // it can actually happen that we have a file that went from
-              // dead to dead.  when a file is created on a branch, cvs first
-              // _commits a deleted file_ on mainline, and then branches from
-              // it and resurrects it.  In such cases, we should just ignore
-              // the file, it doesn't actually exist.  So, in this block, we
-              // do nothing.
+              file_id old_fid(cvs.file_version_interner.lookup(e->second));
+              L(F("applying state delta on '%s' : '%s' -> '%s'\n") 
+                % pth % old_fid % fid);
+              cs.apply_delta(pth, old_fid, fid);
+              live_files[i->first] = i->second.version;
             }
         }
-      else 
+      else
         {
-          manifest_map::const_iterator i = state_map.find(pth);
-          if (i == state_map.end())
+          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
+          if (e != live_files.end())
             {
-              L(F("adding entry state '%s' on '%s'\n") % fid % pth);          
-              cs.add_file(pth, fid);          
+              L(F("deleting entry state '%s' on '%s'\n") % fid % pth);              
+              cs.delete_file(pth);
+              live_files.erase(i->first);
             }
-          else if (manifest_entry_id(i) == fid)
-            {
-              L(F("skipping preserved entry state '%s' on '%s'\n")
-                % fid % pth);         
-            }
-          else
-            {
-              L(F("applying state delta on '%s' : '%s' -> '%s'\n") 
-                % pth % manifest_entry_id(i) % fid);          
-              cs.apply_delta(pth, manifest_entry_id(i), fid);
-            }
-        }  
+        }
     }
-  L(F("logical changeset from parent -> child has %d file state changes\n") 
-    % state->in_edges.size());
 }
 
-static void 
-import_branch_states(ticker & n_edges, 
-		     cvs_branch & branch,
-		     cvs_history & cvs,
-		     app_state & app,
-		     vector< pair<cvs_key, revision_set> > & revisions)
-{
-  manifest_map parent_map, child_map;
-  manifest_id parent_mid, child_mid;
-  revision_id parent_rid, child_rid;
-  
-  // we look through the branch temporally *backwards* from oldest to
-  // newest
+void
+cluster_consumer::consume_cluster(cvs_cluster const & c,
+                                  bool head_p)
+{  
+  shared_ptr<revision_set> rev(new revision_set());
+  boost::shared_ptr<change_set> cs(new change_set());
+  build_change_set(c, *cs);
 
-  for (cvs_branch::reverse_iterator i = branch.rbegin(); 
-       i != branch.rend(); ++i)
-    {
-      L(F("importing branch %s, state [%d: %s @ %d]\n")
-        % cvs.branch_interner.lookup(i->first.branch)
-        % i->first.id
-        % cvs.author_interner.lookup(i->first.author)
-        % i->first.time);
+  apply_change_set(*cs, child_map);
+  calculate_ident(child_map, child_mid);
 
-      revision_set rev;
-      boost::shared_ptr<change_set> cs(new change_set());
-      build_change_set(i->second, parent_map, cvs, *cs);
+  rev->new_manifest = child_mid;
+  rev->edges.insert(make_pair(parent_rid, make_pair(parent_mid, cs)));
+  calculate_ident(*rev, child_rid);
 
-      apply_change_set(*cs, child_map);
-      calculate_ident(child_map, child_mid);
+  store_manifest_edge(head_p);
 
-      rev.new_manifest = child_mid;
-      rev.edges.insert(make_pair(parent_rid, make_pair(parent_mid, cs)));
-      calculate_ident(rev, child_rid);
+  L(F("consumed cluster %s (parent '%s')\n") % child_rid % rev->edges.begin()->first);  
+  preps.push_back(prepared_revision(child_rid, rev, c));
 
-      revisions.push_back(make_pair(i->first, rev));
-
-      store_manifest_edge(parent_map, child_map, 
-                          parent_mid, child_mid, 
-                          app, cvs, i->first == branch.begin()->first);
-
-      // now apply same change set to parent_map, making parent_map == child_map
-      apply_change_set(*cs, parent_map);
-      parent_mid = child_mid;
-      parent_rid = child_rid;
-      ++n_edges;
-    }
+  // now apply same change set to parent_map, making parent_map == child_map
+  apply_change_set(*cs, parent_map);
+  parent_mid = child_mid;
+  parent_rid = child_rid;
 }
-
-void 
-import_cvs_repo(fs::path const & cvsroot, 
-                app_state & app)
-{
-  N(!fs::exists(cvsroot / "CVSROOT"),
-    F("%s appears to be a CVS repository root directory\n"
-      "try importing a module instead, with 'cvs_import %s/<module_name>")
-    % cvsroot.native_directory_string() % cvsroot.native_directory_string());
-  
-  {
-    // early short-circuit to avoid failure after lots of work
-    rsa_keypair_id key;
-    N(guess_default_key(key,app),
-      F("no unique private key for cert construction"));
-    require_password(key, app);
-  }
-
-  cvs_history cvs;
-  N(app.branch_name() != "", F("need base --branch argument for importing"));
-  cvs.base_branch = app.branch_name();
-
-  // push the trunk
-  cvs.stk.push(shared_ptr<cvs_branch>(new cvs_branch()));  
-  cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch));
-
-  {
-    transaction_guard guard(app.db);
-    cvs_tree_walker walker(cvs, app.db);
-    N( fs::exists(cvsroot),
-       F("path %s does not exist") % cvsroot.string());
-    N( fs::is_directory(cvsroot),
-       F("path %s is not a directory") % cvsroot.string());
-    app.db.ensure_open();
-    N(chdir(cvsroot.native_directory_string().c_str()) == 0,
-      F("could not change directory to %s") % cvsroot.string());
-    walk_tree(walker);
-    guard.commit();
-  }
-
-  P(F("phase 1 (version import) complete\n"));
-
-  I(cvs.stk.size() == 1);
-
-  vector< pair<cvs_key, revision_set> > revisions;
-  {
-    ticker n_branches("finished branches", "b", 1);
-    ticker n_edges("finished edges", "e", 1);
-    transaction_guard guard(app.db);
-    manifest_map root_manifest;
-    manifest_id root_mid;
-    revision_id root_rid; 
-    
-
-    ui.set_tick_trailer("building trunk");
-    import_branch_states(n_edges, *cvs.stk.top(), cvs, app, revisions);
-
-    for(map<string, shared_ptr<cvs_branch> >::const_iterator branch = cvs.branches.begin();
-	branch != cvs.branches.end(); ++branch)
-      {
-	ui.set_tick_trailer("building branch " + branch->first);
-	++n_branches;
-	import_branch_states(n_edges, *(branch->second), cvs, app, revisions);
-      }
-
-    P(F("phase 2 (ancestry reconstruction) complete\n"));
-    guard.commit();
-  }
-  
-  {
-    ticker n_revisions("written revisions", "r", 1);
-    ui.set_tick_trailer("");
-    transaction_guard guard(app.db);
-    for (vector< pair<cvs_key, revision_set> >::const_iterator
-           i = revisions.begin(); i != revisions.end(); ++i)
-      {
-        revision_id rid;
-        calculate_ident(i->second, rid);
-        if (! app.db.revision_exists(rid))
-          app.db.put_revision(rid, i->second);
-        store_auxiliary_certs(i->first, rid, app, cvs);
-        ++n_revisions;
-      }
-    P(F("phase 3 (writing revisions) complete\n"));
-    guard.commit();
-  }
-}
-