#
#
# add_file "mtn_benchmark/random_content.py"
#  content [654582744bd8a67d13964429f5330cab51e306ae]
#
# patch "mtn_benchmark/benchmarks.py"
#  from [27e5079920b8abad19ad7bad2a2153a385a57014]
#    to [3a942d0be3b8d79dd0e2c0460e878be547321b1d]
#
# patch "mtn_benchmark/instrumenters.py"
#  from [5ce55a21058a7ebc43fe0df2163b4e6f7a514d9e]
#    to [7c5ebdf22732696a15f3e92207d8216879965edc]
#
# patch "mtn_benchmark/mtn.py"
#  from [773ecd0cb44dd07077bf972e0bd38ed2e33e8e5f]
#    to [6fbfc58a51891b3f96c1022432c34804566d57cd]
#
# patch "mtn_benchmark/repo.py"
#  from [6f250d61245dc45bb1815a3864c454532ed2b69c]
#    to [4242d0244903f51354a513e9d36dc400c1391f73]
#
# NOTE(review): hunks in this patch were hand-edited during review, so the
# content hashes listed here describe the originally recorded revision and
# no longer match the patched result.
#
============================================================
--- mtn_benchmark/random_content.py 654582744bd8a67d13964429f5330cab51e306ae
+++ mtn_benchmark/random_content.py 654582744bd8a67d13964429f5330cab51e306ae
@@ -0,0 +1,98 @@
+try:
+    import scipy.stats
+except ImportError:
+    # Allow importing, in case the user isn't actually going to use us... but
+    # we probably won't work if they do.
+    pass
+
+# Requires scipy, and contortions.
+def binomial_variate(r, n, p):
+    scipy.stats.seed(r.randrange(1 << 31), 1)
+    return scipy.stats.binom.rvs(n, p)[0]
+
+class RandomContentSource(object):
+    # We generate lines as printable ascii nonsense, with uniformly distributed
+    # lengths. This isn't that great a model, but it's unlikely that what we do
+    # here matters that much...
+    def genline(self, r):
+        length = r.randrange(20, 70)
+        return "".join([chr(r.randint(ord(" "), ord("}"))) for i in xrange(length)]) + "\n"
+
+    def genbytes(self, r, size):
+        total = 0
+        lines = []
+        while total < size:
+            line = self.genline(r)
+            total += len(line)
+            lines.append(line)
+        return ("".join(lines))[:size]
+
+class FileSource(RandomContentSource):
+    def __init__(self, meanlog, sdlog):
+        self.meanlog = meanlog
+        self.sdlog = sdlog
+
+    # This model is much more important, and much better -- we generate file
+    # sizes as log normal. This provides a rather tight fit to many source
+    # trees -- I checked monotone, the linux kernel, the uclinux distribution
+    # (which includes 100,000 files, mostly other projects' source), pygame,
+    # xfree86, diffutils, and they all eyeballed "close enough". (These were
+    # just some tarballs I had lying around.) There are definitely things in
+    # this data that this model does not capture --
+    # http://www.cs.hmc.edu/~geoff/filesizes.html discusses the kinds of
+    # phenomena that occur reasonably well -- but it's a good start.
+    #
+    # For the above-mentioned source trees, I found:
+    #
+    #   tree           mean(log(data))   sd(log(data))
+    #   -----------    ---------------   -------------
+    #   monotone       7.59              1.80
+    #   linux kernel   8.21              1.62
+    #   uclinux        7.87              1.69
+    #   pygame         8.57              1.53
+    #   diffutils      8.21              1.77
+    #   xfree86        8.40              1.83
+    #
+    # Which is surprisingly consistent -- picking a meanlog of ~8 and a sdlog
+    # of ~1.7 should give quite reasonable sizes.
+    def generate(self, r):
+        size = int(r.lognormvariate(self.meanlog, self.sdlog))
+        return self.genbytes(r, size)
+
+class EditSource(RandomContentSource):
+    # Arguments are parameters to define two binomial distributions -- one
+    # describes how many hunks we edit, and the other how many bytes we edit
+    # in each hunk. This is totally ad-hoc.
+    # NB: the binomial requires: sd_size**2 < mean_size
+    def __init__(self, mean_num, sd_num, mean_size, sd_size):
+        self.n_num, self.p_num = self.mean_sd_to_n_p(mean_num, sd_num)
+        self.n_size, self.p_size = self.mean_sd_to_n_p(mean_size, sd_size)
+
+    def mean_sd_to_n_p(self, mean, sd):
+        # n*p = mean
+        # sqrt(n*p*(1-p)) = sd
+        # -->
+        # 1 - (sd^2 / mean) = p
+        # mean / p = n
+        p = 1 - (sd ** 2 / float(mean))
+        assert 1 > p > 0
+        n = int(mean / p)
+        return (n, p)
+
+    def edit(self, r, start_text):
+        text = start_text
+        num = binomial_variate(r, self.n_num, self.p_num)
+        for i in xrange(num):
+            del_size = binomial_variate(r, self.n_size, self.p_size)
+            if not del_size:
+                continue
+            # Clamp and place each hunk against the current text, not
+            # start_text: earlier hunks may already have changed its length.
+            if del_size > len(text):
+                del_size = len(text)
+            pivot1 = r.randint(0, len(text) - del_size)
+            pivot2 = pivot1 + del_size
+            insert_size = binomial_variate(r, self.n_size, self.p_size)
+            replacement = self.genbytes(r, insert_size)
+            text = text[:pivot1] + replacement + text[pivot2:]
+        return text
============================================================
--- mtn_benchmark/benchmarks.py 27e5079920b8abad19ad7bad2a2153a385a57014
+++ mtn_benchmark/benchmarks.py 3a942d0be3b8d79dd0e2c0460e878be547321b1d
@@ -17,7 +17,7 @@
 
     def setup(self, vcs):
         vcs.init_repo("target")
-        self.repo_source.setup()
+        self.repo_source.setup(vcs)
 
     def run(self, vcs):
         vcs.pull(self.repo_source.repo_path(), "target")
============================================================
--- mtn_benchmark/instrumenters.py 5ce55a21058a7ebc43fe0df2163b4e6f7a514d9e
+++ mtn_benchmark/instrumenters.py 7c5ebdf22732696a15f3e92207d8216879965edc
@@ -5,6 +5,7 @@
 class NullInstrumenter(instrumenter.Instrumenter):
     objclass = instrumenter.InstrumenterObj
 
+
 class TimingInstrumenterObj(instrumenter.RecordingInstrumenterObj):
     def parse_time_str(self, s):
         # 1.7 -> 1.7
@@ -34,11 +35,11 @@
         process.hook(timing_hook)
         return process
 
-
 class TimingInstrumenter(instrumenter.Instrumenter):
     repeats = 3
     objclass = TimingInstrumenterObj
 
+
 class MemTimingInstrumenterObj(instrumenter.RecordingInstrumenterObj):
     def __init__(self, record_dir, path=None):
         super(MemTimingInstrumenterObj, self).__init__(record_dir)
@@ -69,7 +70,6 @@
         process.hook(timing_hook)
         return process
 
-
 class MemTimingInstrumenter(instrumenter.Instrumenter):
     repeats = 3
     objclass = MemTimingInstrumenterObj
============================================================
--- mtn_benchmark/mtn.py 773ecd0cb44dd07077bf972e0bd38ed2e33e8e5f
+++ mtn_benchmark/mtn.py 6fbfc58a51891b3f96c1022432c34804566d57cd
@@ -1,7 +1,8 @@
 import os
 import os.path
 import shutil
 import time
+import mtn_benchmark.util
 
 # VCSes all have a factory, with method: .new(instrumenter)
 
@@ -78,4 +79,21 @@
         self.instrumenter.run("init_repo",
                               ["./mtn", "db", "init", "-d", repo])
 
+    def init_ws(self, ws, repo):
+        self.instrumenter.run("init_ws",
+                              ["./mtn", "setup", ws, "-d", repo, "-b", "testbranch"])
+    # Must be run in a workspace
+    # Creates a file and schedules it for addition at the next commit
+    def add_file(self, name, contents):
+        d, basename = os.path.split(name)
+        if d:
+            mtn_benchmark.util.ensure_dir(d)
+        f = open(name, "w")
+        f.write(contents)
+        f.close()
+        self.instrumenter.run("add_file", ["./mtn", "add", name])
+
+    def add_dir(self, name):
+        mtn_benchmark.util.ensure_dir(name)
+        self.instrumenter.run("add_dir", ["./mtn", "add", name])
 
============================================================
--- mtn_benchmark/repo.py 6f250d61245dc45bb1815a3864c454532ed2b69c
+++ mtn_benchmark/repo.py 4242d0244903f51354a513e9d36dc400c1391f73
@@ -1,4 +1,6 @@
+import os
 import os.path
+import random
 
 # repos have setup() and repo_path() methods.
 
@@ -6,11 +8,91 @@
     def __init__(self, path):
         self.path = os.path.abspath(path)
 
-    def setup(self):
+    def setup(self, vcs):
         pass
 
     def repo_path(self):
         return self.path
 
+# Fancy random-history generation
+
+DIR = "dir"
+FILE = "file"
+
+# One file or directory; parent is the containing Node, or a false value
+# (e.g. None) for a top-level entry.
+class Node(object):
+    def __init__(self, parent, name, type):
+        self.parent = parent
+        self.name = name
+        self.type = type
+
+    def fullname(self):
+        if self.parent:
+            return self.parent.fullname() + "/" + self.name
+        else:
+            return self.name
+
+# Records the nodes added so far (by name), and forwards each addition to
+# the vcs.
+class Tree(object):
+    def __init__(self, vcs):
+        self.vcs = vcs
+        self.nodes = {}
+        self.files = {}
+        self.dirs = {}
+
+    # Returns a new Tree on the same vcs, with shallow copies of the tables.
+    def clone(self):
+        t = Tree(self.vcs)
+        t.nodes = dict(self.nodes)
+        t.files = dict(self.files)
+        t.dirs = dict(self.dirs)
+        return t
+
+    def add_node(self, parent, name, type):
+        n = Node(parent, name, type)
+        assert name not in self.nodes
+        self.nodes[name] = n
+        return n
+
+    def add_file(self, parent, name, contents):
+        n = self.add_node(parent, name, FILE)
+        self.files[name] = n
+        self.vcs.add_file(n.fullname(), contents)
+
+    def add_dir(self, parent, name):
+        n = self.add_node(parent, name, DIR)
+        self.dirs[name] = n
+        self.vcs.add_dir(n.fullname())
+
+class SimpleRandomRepo(object):
+    magic_numbers = {
+        "seed": 0,
+        "start_dirs": 20,
+        "start_files": 400,
+        }
+
+    def __init__(self, **kwargs):
+        # Copy before updating, so overrides stay on this instance rather
+        # than leaking into the class-level defaults.
+        self.magic_numbers = dict(self.magic_numbers)
+        self.magic_numbers.update(kwargs)
+        self.random = random.Random(self.magic_numbers["seed"])
+        self.path = "randrepo"
+
+    def setup(self, vcs):
+        vcs.init_repo(self.path)
+
+        scratchdir = "randrepo-scratch"
+        vcs.init_ws(scratchdir, self.path)
+
+        startdir = os.getcwd()
+        os.chdir(scratchdir)
+
+        os.chdir(startdir)
+
+    def repo_path(self):
+        return self.path
+
 
-# TODO: add synthetic repo generation classes