Mercurial > public > mercurial-scm > hg-stable
view tests/artifacts/scripts/generate-churning-bundle.py @ 52470:83f87912c5e0
test-sparse-revlog: commit the repo content in a single in-mem transaction
I don't like to use the internal API like that, but it makes the whole generation
twice as fast on my machine and suitable to always run during tests. However, it
also means the `--pure` version of the test will run this with the pure source
code and be unbearably slow. However, if another flavor generated the file, pure
will be able to simply reuse it, so it seems fine.
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Wed, 04 Dec 2024 05:29:28 +0100 |
parents | 9feb175c028d |
children | 24c3b3dbab08 |
line wrap: on
line source
#!/usr/bin/env python3 # # generate-branchy-bundle - generate a branch for a "large" branchy repository # # Copyright 2018 Octobus, contact@octobus.net # # This software may be used and distributed according to the terms of the # GNU General Public License version 2 or any later version. # # This script generates a repository suitable for testing delta computation # strategies. # # The repository update a single "large" file with many updates. One fixed part # of the files always get updated while the rest of the lines get updated over # time. This update happens over many topological branches, some getting merged # back. import hashlib import os import shutil import subprocess import sys import tempfile import mercurial.context import mercurial.hg import mercurial.ui BUNDLE_NAME = 'big-file-churn.hg' # constants for generating the repository NB_CHANGESET = 5000 PERIOD_MERGING = 8 PERIOD_BRANCHING = 7 MOVE_BACK_MIN = 3 MOVE_BACK_RANGE = 5 # constants for generating the large file we keep updating # # At each revision, the beginning on the file change, # and set of other lines changes too. 
FILENAME = 'SPARSE-REVLOG-TEST-FILE' NB_LINES = 10500 ALWAYS_CHANGE_LINES = 500 OTHER_CHANGES = 300 def build_graph(): heads = {0} graph = {0: (None, None)} for idx in range(1, NB_CHANGESET + 1): p, _ = parents = [idx - 1, None] if (idx % PERIOD_BRANCHING) == 0: back = MOVE_BACK_MIN + (idx % MOVE_BACK_RANGE) for _ in range(back): p = graph.get(p, (p,))[0] parents[0] = p if (idx % PERIOD_MERGING) == 0: parents[1] = min(heads) for p in parents: heads.discard(p) heads.add(idx) graph[idx] = tuple(parents) return graph GRAPH = build_graph() def nextcontent(previous_content): """utility to produce a new file content from the previous one""" return hashlib.md5(previous_content).hexdigest().encode('ascii') def filecontent(iteridx, oldcontent): """generate a new file content The content is generated according the iteration index and previous content""" # initial call if iteridx == 0: current = b'' else: current = b"%d" % iteridx for idx in range(NB_LINES): do_change_line = True if oldcontent is not None and ALWAYS_CHANGE_LINES < idx: do_change_line = not ((idx - iteridx) % OTHER_CHANGES) if do_change_line: to_write = current + b'\n' current = nextcontent(current) else: to_write = oldcontent[idx] yield to_write def merge_content(base, left, right): """merge two file content to produce a new one use unambiguous update on each side when possible, and produce a new line whenever a merge is needed. Similar to what the manifest would do. 
""" for old, left, right in zip(base, left, right): if old == left and old == right: yield old elif old == left and old != right: yield right elif old != left and old == right: yield left else: yield nextcontent(left + right) def ancestors(graph, rev): """return the set of ancestors of revision <rev>""" to_proceed = {rev} seen = set(to_proceed) while to_proceed: current = to_proceed.pop() for p in graph[current]: if p is None: continue if p in seen: continue to_proceed.add(p) seen.add(p) return seen def gca(graph, left, right): """find the greater common ancestors of left and right Note that the algorithm is stupid and N² when run on all merge, however this should not be a too much issue given the current scale. """ return max(ancestors(graph, left) & ancestors(graph, right)) def make_one_content_fn(idx, base, left, right): """build a function that build the content on demand The dependency are kept are reference to make sure they are not garbage-collected until we use them. Once we computed the current content, we make sure to drop their reference to allow them to be garbage collected. """ def content_fn(idx=idx, base=base, left=left, right=right): if left is None: new = filecontent(idx, None) elif base is None: new = filecontent(idx, left()) else: merged = merge_content(base(), left(), right()) new = filecontent(idx, list(merged)) return list(new) del idx del base del left del right value = None cf = [content_fn] del content_fn def final_fn(): nonlocal value if value is None: content_fn = cf.pop() value = list(content_fn()) del content_fn return value return final_fn def build_content_graph(graph): """produce file content for all revision The content will be generated on demande and cached. Cleanup the dictionnary are you use it to reduce memory usage. 
""" content = {} for idx, (p1, p2) in graph.items(): base = left = right = None if p1 is not None: left = content[p1] if p2 is not None: right = content[p2] base_rev = gca(graph, p1, p2) base = content[base_rev] content[idx] = make_one_content_fn(idx, base, left, right) return content CONTENT = build_content_graph(GRAPH) def hg(command, *args): """call a mercurial command with appropriate config and argument""" env = os.environ.copy() if 'CHGHG' in env: full_cmd = ['chg'] else: full_cmd = ['hg'] full_cmd.append('--quiet') full_cmd.append(command) if command == 'commit': # reproducible commit metadata full_cmd.extend(['--date', '0 0', '--user', 'test']) elif command == 'merge': # avoid conflicts by picking the local variant full_cmd.extend(['--tool', ':merge-local']) full_cmd.extend(args) env['HGRCPATH'] = '' return subprocess.check_call(full_cmd, env=env) def write_repo(path): """write repository content in memory""" repo = mercurial.hg.repository( mercurial.ui.ui.load(), path=path.encode('utf-8'), ) nodemap = {None: repo.nodeconstants.nullid} with repo.lock(), repo.transaction(b'bundle-generation'): for idx, (p1, p2) in GRAPH.items(): if sys.stdout.isatty(): print("generating commit #%d/%d" % (idx, NB_CHANGESET)) file_fn = lambda repo, memctx, path: mercurial.context.memfilectx( repo, memctx, path, data=b''.join(CONTENT.pop(idx)()), ) mc = mercurial.context.memctx( repo, (nodemap[p1], nodemap[p2]), b'commit #%d' % idx if idx else b'initial commit', [FILENAME.encode('ascii')], file_fn, user=b"test", date=(0, 0), ) nodemap[idx] = repo.commitctx(mc) def run(target): tmpdir = tempfile.mkdtemp(prefix='tmp-hg-test-big-file-bundle-') try: os.chdir(tmpdir) hg( 'init', '--config', 'format.maxchainlen=%d' % NB_CHANGESET, ) write_repo(tmpdir) hg('bundle', '--all', target, '--config', 'devel.bundle.delta=p1') with open(target, 'rb') as bundle: data = bundle.read() digest = hashlib.md5(data).hexdigest() with open(target + '.md5', 'wb') as md5file: 
md5file.write(digest.encode('ascii') + b'\n') if sys.stdout.isatty(): print('bundle generated at "%s" md5: %s' % (target, digest)) finally: shutil.rmtree(tmpdir) return 0 if __name__ == '__main__': orig = os.path.realpath(os.path.dirname(sys.argv[0])) target = os.path.join(orig, os.pardir, 'cache', BUNDLE_NAME) sys.exit(run(target))