view tests/artifacts/scripts/generate-churning-bundle.py @ 52470:83f87912c5e0

test-sparse-revlog: commit the repo content in a single in-mem transaction

I don't like using the internal API like that, but it makes the whole
generation twice as fast on my machine and suitable to always run during
tests. However, it also means the `--pure` variant of the test will run this
with the pure source code and be unbearably slow. That said, if another flavor
already generated the file, pure can simply reuse it, so it seems fine.
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Wed, 04 Dec 2024 05:29:28 +0100
parents 9feb175c028d
children 24c3b3dbab08

#!/usr/bin/env python3
#
# generate-churning-bundle - generate a bundle with a "large" churning file
#
# Copyright 2018 Octobus, contact@octobus.net
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
#
# This script generates a repository suitable for testing delta computation
# strategies.
#
# The repository updates a single "large" file over many changesets. One fixed
# part of the file always gets updated while the rest of the lines get updated
# over time. These updates happen across many topological branches, some of
# which get merged back.


import hashlib
import os
import shutil
import subprocess
import sys
import tempfile

import mercurial.context
import mercurial.hg
import mercurial.ui

BUNDLE_NAME = 'big-file-churn.hg'

# constants for generating the repository
NB_CHANGESET = 5000
PERIOD_MERGING = 8
PERIOD_BRANCHING = 7
MOVE_BACK_MIN = 3
MOVE_BACK_RANGE = 5
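#
# Concretely, every PERIOD_BRANCHING-th changeset starts a new topological
# branch from an ancestor between MOVE_BACK_MIN and
# MOVE_BACK_MIN + MOVE_BACK_RANGE - 1 first-parent steps back, and every
# PERIOD_MERGING-th changeset merges the oldest remaining head back in
# (see build_graph below).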

# constants for generating the large file we keep updating
#
# At each revision, the beginning of the file changes, and a set of other
# lines changes too.
FILENAME = 'SPARSE-REVLOG-TEST-FILE'
NB_LINES = 10500
ALWAYS_CHANGE_LINES = 500
OTHER_CHANGES = 300
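#
# In other words, every revision rewrites lines 0..ALWAYS_CHANGE_LINES and,
# past that point, one line out of every OTHER_CHANGES, the affected offsets
# shifting with the revision number (see filecontent below).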


def build_graph():
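    """build a branchy DAG of NB_CHANGESET + 1 revisions

    Return a {revision-index: (p1, p2)} mapping. Revision 0 is the root with
    (None, None) as parents; p2 is None for non-merge revisions.
    """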
    heads = {0}
    graph = {0: (None, None)}
    for idx in range(1, NB_CHANGESET + 1):
        p, _ = parents = [idx - 1, None]
        if (idx % PERIOD_BRANCHING) == 0:
            back = MOVE_BACK_MIN + (idx % MOVE_BACK_RANGE)
            for _ in range(back):
                p = graph.get(p, (p,))[0]
                parents[0] = p
        if (idx % PERIOD_MERGING) == 0:
            parents[1] = min(heads)
        for p in parents:
            heads.discard(p)
        heads.add(idx)
        graph[idx] = tuple(parents)
    return graph


GRAPH = build_graph()


def nextcontent(previous_content):
    """utility to produce a new file content from the previous one"""
    return hashlib.md5(previous_content).hexdigest().encode('ascii')


def filecontent(iteridx, oldcontent):
    """generate a new file content

    The content is generated according to the iteration index and the previous
    content"""

    # initial call
    if iteridx == 0:
        current = b''
    else:
        current = b"%d" % iteridx

    for idx in range(NB_LINES):
        do_change_line = True
        if oldcontent is not None and ALWAYS_CHANGE_LINES < idx:
            do_change_line = not ((idx - iteridx) % OTHER_CHANGES)

        if do_change_line:
            to_write = current + b'\n'
            current = nextcontent(current)
        else:
            to_write = oldcontent[idx]
        yield to_write


def merge_content(base, left, right):
    """merge two file content to produce a new one

    use unambiguous update on each side when possible, and produce a new line
    whenever a merge is needed. Similar to what the manifest would do.
    """
    for old, left, right in zip(base, left, right):
        if old == left and old == right:
            yield old
        elif old == left and old != right:
            yield right
        elif old != left and old == right:
            yield left
        else:
            yield nextcontent(left + right)


def ancestors(graph, rev):
    """return the set of ancestors of revision <rev>"""
    to_proceed = {rev}
    seen = set(to_proceed)
    while to_proceed:
        current = to_proceed.pop()
        for p in graph[current]:
            if p is None:
                continue
            if p in seen:
                continue
            to_proceed.add(p)
            seen.add(p)
    return seen


def gca(graph, left, right):
    """find the greater common ancestors of left and right

    Note that the algorithm is stupid and N² when run on all merge, however
    this should not be a too much issue given the current scale.
    """
    return max(ancestors(graph, left) & ancestors(graph, right))


def make_one_content_fn(idx, base, left, right):
    """build a function that build the content on demand

    The dependency are kept are reference to make sure they are not
    garbage-collected until we use them. Once we computed the current content,
    we make sure to drop their reference to allow them to be garbage collected.
    """

    def content_fn(idx=idx, base=base, left=left, right=right):
        if left is None:
            new = filecontent(idx, None)
        elif base is None:
            new = filecontent(idx, left())
        else:
            merged = merge_content(base(), left(), right())
            new = filecontent(idx, list(merged))
        return list(new)

    del idx
    del base
    del left
    del right

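    # content_fn is stashed in a single-element list and popped on first use,
    # so that once the result is cached in `value` this closure no longer
    # holds references to the parents' content functions.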
    value = None
    cf = [content_fn]
    del content_fn

    def final_fn():
        nonlocal value
        if value is None:
            content_fn = cf.pop()
            value = list(content_fn())
            del content_fn
        return value

    return final_fn


def build_content_graph(graph):
    """produce file content for all revision

    The content will be generated on demande and cached. Cleanup the
    dictionnary are you use it to reduce memory usage.
    """
    content = {}
    for idx, (p1, p2) in graph.items():
        base = left = right = None
        if p1 is not None:
            left = content[p1]
            if p2 is not None:
                right = content[p2]
                base_rev = gca(graph, p1, p2)
                base = content[base_rev]
        content[idx] = make_one_content_fn(idx, base, left, right)
    return content


CONTENT = build_content_graph(GRAPH)


def hg(command, *args):
    """call a mercurial command with appropriate config and argument"""
    env = os.environ.copy()
    if 'CHGHG' in env:
        full_cmd = ['chg']
    else:
        full_cmd = ['hg']
    full_cmd.append('--quiet')
    full_cmd.append(command)
    if command == 'commit':
        # reproducible commit metadata
        full_cmd.extend(['--date', '0 0', '--user', 'test'])
    elif command == 'merge':
        # avoid conflicts by picking the local variant
        full_cmd.extend(['--tool', ':merge-local'])
    full_cmd.extend(args)
    env['HGRCPATH'] = ''
    return subprocess.check_call(full_cmd, env=env)


def write_repo(path):
    """write repository content in memory"""
    repo = mercurial.hg.repository(
        mercurial.ui.ui.load(),
        path=path.encode('utf-8'),
    )
    nodemap = {None: repo.nodeconstants.nullid}
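    # Hold the lock and a single transaction for the whole generation:
    # committing in-memory contexts in one transaction is what makes this
    # about twice as fast, per the changeset description above.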
    with repo.lock(), repo.transaction(b'bundle-generation'):
        for idx, (p1, p2) in GRAPH.items():
            if sys.stdout.isatty():
                print("generating commit #%d/%d" % (idx, NB_CHANGESET))

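            # pop this revision's content callback out of CONTENT so its
            # cached lines can be garbage-collected once the commit is created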
            file_fn = lambda repo, memctx, path: mercurial.context.memfilectx(
                repo,
                memctx,
                path,
                data=b''.join(CONTENT.pop(idx)()),
            )

            mc = mercurial.context.memctx(
                repo,
                (nodemap[p1], nodemap[p2]),
                b'commit #%d' % idx if idx else b'initial commit',
                [FILENAME.encode('ascii')],
                file_fn,
                user=b"test",
                date=(0, 0),
            )
            nodemap[idx] = repo.commitctx(mc)


def run(target):
    tmpdir = tempfile.mkdtemp(prefix='tmp-hg-test-big-file-bundle-')
    try:
        os.chdir(tmpdir)
        hg(
            'init',
            '--config',
            'format.maxchainlen=%d' % NB_CHANGESET,
        )
        write_repo(tmpdir)
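        # devel.bundle.delta=p1 stores every revision as a delta against its
        # first parent, presumably so the bundle does not depend on the delta
        # strategy used by the generating repository.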
        hg('bundle', '--all', target, '--config', 'devel.bundle.delta=p1')
        with open(target, 'rb') as bundle:
            data = bundle.read()
            digest = hashlib.md5(data).hexdigest()
        with open(target + '.md5', 'wb') as md5file:
            md5file.write(digest.encode('ascii') + b'\n')
        if sys.stdout.isatty():
            print('bundle generated at "%s" md5: %s' % (target, digest))

    finally:
        shutil.rmtree(tmpdir)
    return 0


if __name__ == '__main__':
    orig = os.path.realpath(os.path.dirname(sys.argv[0]))
    target = os.path.join(orig, os.pardir, 'cache', BUNDLE_NAME)
    sys.exit(run(target))