#!/usr/bin/env python3
#
# generate-churning-bundle - generate a bundle for a "large" branchy repository
#
# Copyright 2018 Octobus, contact@octobus.net
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
#
# This script generates a repository suitable for testing delta computation
# strategies.
#
# The repository updates a single "large" file with many changesets. One fixed
# part of the file always gets updated while the rest of the lines get updated
# over time. These updates happen over many topological branches, some getting
# merged back.
#
# --lazy     will skip generating the file if one already exists with the
#            right content.
# --validate will make sure the generated bundle has the expected content.
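#
# A typical invocation (the bundle is written to ../cache/big-file-churn.hg
# relative to this script's directory):
#
#   python3 generate-churning-bundle.py --lazy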


import hashlib
import os
import shutil
import subprocess
import sys
import tempfile

import mercurial.context
import mercurial.hg
import mercurial.ui

BUNDLE_NAME = 'big-file-churn.hg'

# constants for generating the repository
NB_CHANGESET = 5000
PERIOD_MERGING = 8
PERIOD_BRANCHING = 7
MOVE_BACK_MIN = 3
MOVE_BACK_RANGE = 5
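
# Roughly: every PERIOD_BRANCHING changesets a new topological branch starts
# from an older revision (MOVE_BACK_MIN to MOVE_BACK_MIN + MOVE_BACK_RANGE - 1
# revisions back), and every PERIOD_MERGING changesets the lowest-numbered
# head is merged back in.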

# constants for generating the large file we keep updating
#
# At each revision, the beginning of the file changes,
# and a set of other lines changes too.
FILENAME = 'SPARSE-REVLOG-TEST-FILE'
NB_LINES = 10500
ALWAYS_CHANGE_LINES = 500
OTHER_CHANGES = 300


def build_graph():
    heads = {0}
    graph = {0: (None, None)}
    for idx in range(1, NB_CHANGESET + 1):
        p, _ = parents = [idx - 1, None]
        if (idx % PERIOD_BRANCHING) == 0:
            back = MOVE_BACK_MIN + (idx % MOVE_BACK_RANGE)
            for _ in range(back):
                p = graph.get(p, (p,))[0]
                parents[0] = p
        if (idx % PERIOD_MERGING) == 0:
            parents[1] = min(heads)
        for p in parents:
            heads.discard(p)
        heads.add(idx)
        graph[idx] = tuple(parents)
    return graph


GRAPH = build_graph()
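# GRAPH maps each revision index to its (p1, p2) parents: the first entries
# are {0: (None, None), 1: (0, None), ...}, revision 7 is the first to branch
# off an older revision, and revision 8 is the first merge of two heads.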


def nextcontent(previous_content):
    """utility to produce a new file content from the previous one"""
    return hashlib.md5(previous_content).hexdigest().encode('ascii')


def filecontent(iteridx, oldcontent):
    """generate a new file content

    The content is generated according to the iteration index and the
    previous content"""

    # initial call
    if iteridx == 0:
        current = b''
    else:
        current = b"%d" % iteridx

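    # Lines up to index ALWAYS_CHANGE_LINES change on every iteration; past
    # that, a line changes only when (idx - iteridx) is a multiple of
    # OTHER_CHANGES, so the set of churned lines drifts through the file.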
    for idx in range(NB_LINES):
        do_change_line = True
        if oldcontent is not None and ALWAYS_CHANGE_LINES < idx:
            do_change_line = not ((idx - iteridx) % OTHER_CHANGES)

        if do_change_line:
            to_write = current + b'\n'
            current = nextcontent(current)
        else:
            to_write = oldcontent[idx]
        yield to_write


def merge_content(base, left, right):
    """merge two file contents to produce a new one

    Use the unambiguous update from either side when possible, and produce a
    new line whenever a real merge is needed. Similar to what the manifest
    merge would do.
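
    For instance, with illustrative single-line values, the unambiguous
    change wins:

    >>> list(merge_content([b'old'], [b'old'], [b'new']))
    [b'new']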
    """
    for old, new_left, new_right in zip(base, left, right):
        if old == new_left and old == new_right:
            yield old
        elif old == new_left and old != new_right:
            yield new_right
        elif old != new_left and old == new_right:
            yield new_left
        else:
            yield nextcontent(new_left + new_right)


def ancestors(graph, rev):
    """return the set of ancestors of revision <rev>"""
    to_proceed = {rev}
    seen = set(to_proceed)
    while to_proceed:
        current = to_proceed.pop()
        for p in graph[current]:
            if p is None:
                continue
            if p in seen:
                continue
            to_proceed.add(p)
            seen.add(p)
    return seen


def gca(graph, left, right):
    """find the greatest common ancestor of left and right

    Note that the algorithm is naive and O(N²) when run on all merges;
    however, this should not be too much of an issue at the current scale.
    """
    return max(ancestors(graph, left) & ancestors(graph, right))


def make_one_content_fn(idx, base, left, right):
    """build a function that builds the content on demand

    The dependencies are kept as references to make sure they are not
    garbage-collected until we use them. Once we have computed the current
    content, we drop those references to allow them to be garbage-collected.
    """

    def content_fn(idx=idx, base=base, left=left, right=right):
        if left is None:
            new = filecontent(idx, None)
        elif base is None:
            new = filecontent(idx, left())
        else:
            merged = merge_content(base(), left(), right())
            new = filecontent(idx, list(merged))
        return list(new)

    del idx
    del base
    del left
    del right

    value = None
    cf = [content_fn]
    del content_fn
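    # Keeping content_fn in a one-element list lets final_fn drop the last
    # reference to it (and, transitively, to base/left/right) once the value
    # has been computed and cached.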

    def final_fn():
        nonlocal value
        if value is None:
            content_fn = cf.pop()
            value = list(content_fn())
            del content_fn
        return value

    return final_fn


def build_content_graph(graph):
    """produce file content for all revisions

    The content will be generated on demand and cached. Clean up the
    dictionary as you use it to reduce memory usage.
    """
    content = {}
    for idx, (p1, p2) in graph.items():
        base = left = right = None
        if p1 is not None:
            left = content[p1]
            if p2 is not None:
                right = content[p2]
                base_rev = gca(graph, p1, p2)
                base = content[base_rev]
        content[idx] = make_one_content_fn(idx, base, left, right)
    return content


CONTENT = build_content_graph(GRAPH)


def hg(command, *args):
    """call a mercurial command with appropriate config and argument"""
    env = os.environ.copy()
    if 'CHGHG' in env:
        full_cmd = ['chg']
    else:
        full_cmd = ['hg']
    full_cmd.append('--quiet')
    full_cmd.append(command)
    if command == 'commit':
        # reproducible commit metadata
        full_cmd.extend(['--date', '0 0', '--user', 'test'])
    elif command == 'merge':
        # avoid conflicts by picking the local variant
        full_cmd.extend(['--tool', ':merge-local'])
    full_cmd.extend(args)
    env['HGRCPATH'] = ''
    return subprocess.check_call(full_cmd, env=env)


def write_repo(path):
    """write repository content in memory"""
    repo = mercurial.hg.repository(
        mercurial.ui.ui.load(),
        path=path.encode('utf-8'),
    )
    nodemap = {None: repo.nodeconstants.nullid}
    with repo.lock(), repo.transaction(b'bundle-generation'):
        for idx, (p1, p2) in GRAPH.items():
            if sys.stdout.isatty():
                print("generating commit #%d/%d" % (idx, NB_CHANGESET))

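            # Pop the content function out of CONTENT before calling it, so
            # data for already-committed revisions can be garbage-collected.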
            file_fn = lambda repo, memctx, path: mercurial.context.memfilectx(
                repo,
                memctx,
                path,
                data=b''.join(CONTENT.pop(idx)()),
            )

            mc = mercurial.context.memctx(
                repo,
                (nodemap[p1], nodemap[p2]),
                b'commit #%d' % idx if idx else b'initial commit',
                [FILENAME.encode('ascii')],
                file_fn,
                user=b"test",
                date=(0, 0),
            )
            nodemap[idx] = repo.commitctx(mc)


def compute_md5(target):
    with open(target, 'rb') as bundle:
        data = bundle.read()
        return hashlib.md5(data).hexdigest()


def write_md5(target, md5):
    with open(target + '.md5', 'wb') as md5file:
        md5file.write(md5.encode('ascii') + b'\n')


def read_md5(target):
    with open(target + '.md5', 'rb') as md5file:
        return md5file.read().strip().decode('ascii')


def up_to_date_target(target):
    """return True if the file already exists with the right content"""
    try:
        found = compute_md5(target)
        expected = read_md5(target)
    except OSError:
        return False
    return found == expected


def run(target, validate=False):
    tmpdir = tempfile.mkdtemp(prefix='tmp-hg-test-big-file-bundle-')
    cwd = os.getcwd()
    try:
        os.chdir(tmpdir)
        hg(
            'init',
            '--config',
            'format.maxchainlen=%d' % NB_CHANGESET,
        )
        write_repo(tmpdir)
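        # 'devel.bundle.delta=p1' forces every delta in the bundle to be
        # computed against the first parent, keeping the output predictable.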
        hg('bundle', '--all', target, '--config', 'devel.bundle.delta=p1')
        digest = compute_md5(target)
        if not validate:
            write_md5(target, digest)
        else:
            expected = read_md5(target)
            if expected != digest:
                msg = "bundle generated does not match the expected content\n"
                msg += "    expected: %s\n" % expected
                msg += "    got:      %s" % digest
                print(msg, file=sys.stderr)
                return 1
    finally:
        # Windows does not let you remove the current working directory
        os.chdir(cwd)
        shutil.rmtree(tmpdir)
    return 0


if __name__ == '__main__':
    orig = os.path.realpath(os.path.dirname(sys.argv[0]))
    target = os.path.join(orig, os.pardir, 'cache', BUNDLE_NAME)
    lazy = '--lazy' in sys.argv[1:]
    validate = '--validate' in sys.argv[1:]
    if lazy and up_to_date_target(target):
        sys.exit(0)
    sys.exit(run(target, validate=validate))