comparison tests/artifacts/scripts/generate-churning-bundle.py @ 39508:4ca7a67c94c8

sparse-revlog: add a test checking revlog deltas for a churning file The test repository contains 5000 revisions and is therefore slow to build: five minutes with CHG, over fifteen minutes without. It is too slow to build during the test. Bundling all content produces a sizeable result, 20 MB, too large to be committed. Instead, we commit a script to build the expected bundle, and the test checks whether the bundle is available. Any run of the script will produce the same repository content, resulting in the same hashes. Using smaller repositories was tried; however, it misses most of the cases we are planning to improve. Having them in a 5000-revision repository is already nice; we usually see these cases in repositories on the order of magnitude of one million revisions. This test will be very useful to check various strategies for building deltas to store in a sparse-revlog. In this series we will focus our attention on the following metrics: The ones that will impact the final storage performance (size, space): * size of the revlog data file (".hg/store/data/*.d") * chain length info The ones that describe the delta patterns: * number of snapshot revisions (and their level) * size taken by snapshot revisions (and their level)
author Boris Feld <boris.feld@octobus.net>
date Mon, 10 Sep 2018 09:08:24 -0700
parents
children b59676077654 56a0de3d581c
comparison
equal deleted inserted replaced
39507:966950954fda 39508:4ca7a67c94c8
1 #!/usr/bin/env python
2 #
# generate-churning-bundle - generate a bundle for a "large" repository with a churning file
4 #
5 # Copyright 2018 Octobus, contact@octobus.net
6 #
7 # This software may be used and distributed according to the terms of the
8 # GNU General Public License version 2 or any later version.
9 #
10 # This script generates a repository suitable for testing delta computation
11 # strategies.
12 #
13 # The repository update a single "large" file with many updates. One fixed part
14 # of the files always get updated while the rest of the lines get updated over
15 # time. This update happens over many topological branches, some getting merged
16 # back.
17 #
18 # Running with `chg` in your path and `CHGHG` set is recommended for speed.
19
20 from __future__ import absolute_import, print_function
21
22 import hashlib
23 import os
24 import shutil
25 import subprocess
26 import sys
27 import tempfile
28
BUNDLE_NAME = 'big-file-churn.hg'

# constants for generating the repository
NB_CHANGESET = 5000
PERIOD_MERGING = 8
PERIOD_BRANCHING = 7
MOVE_BACK_MIN = 3
MOVE_BACK_RANGE = 5

# constants for generating the large file we keep updating
#
# At each revision, the beginning of the file changes,
# and a set of other lines changes too.
FILENAME = 'SPARSE-REVLOG-TEST-FILE'
NB_LINES = 10500
ALWAYS_CHANGE_LINES = 500
OTHER_CHANGES = 300
47
def nextcontent(previous_content):
    """utility to produce a new file content from the previous one

    Returns the hex md5 digest of <previous_content>. Accepts bytes or
    text: text is encoded as ascii first, so the helper works on both
    Python 2 and Python 3 (hashlib requires bytes on Python 3; the
    content fed in is always ascii hex digests or decimal indexes).
    """
    if not isinstance(previous_content, bytes):
        # no-op on Python 2 (bytes is str); required on Python 3
        previous_content = previous_content.encode('ascii')
    return hashlib.md5(previous_content).hexdigest()
51
def filecontent(iteridx, oldcontent):
    """generate a new file content

    Yields NB_LINES newline-terminated lines, derived from the iteration
    index <iteridx> and the previous content <oldcontent> (a list of
    lines, or None for the initial revision).

    Lines up to index ALWAYS_CHANGE_LINES always change; past that,
    only one line in every OTHER_CHANGES changes and the others are
    copied verbatim from <oldcontent>.  Pass iteridx=None (with
    oldcontent=None) for the initial content."""

    # seed of the md5 chain: empty for the initial call, the iteration
    # index afterwards, so every revision produces distinct content
    if iteridx is None:
        current = ''
    else:
        current = str(iteridx)

    # range instead of xrange: works on both Python 2 and Python 3
    for idx in range(NB_LINES):
        do_change_line = True
        if oldcontent is not None and ALWAYS_CHANGE_LINES < idx:
            do_change_line = not ((idx - iteridx) % OTHER_CHANGES)

        if do_change_line:
            to_write = current + '\n'
            current = nextcontent(current)
        else:
            # keep the previous line (readlines() kept its newline)
            to_write = oldcontent[idx]
        yield to_write
75
def updatefile(filename, idx):
    """update <filename> to be at appropriate content for iteration <idx>"""
    # the previous content is needed to carry unchanged lines forward;
    # for the initial revision (idx is None) there is nothing to read
    old_lines = None
    if idx is not None:
        with open(filename, 'rb') as previous:
            old_lines = previous.readlines()
    with open(filename, 'wb') as out:
        out.writelines(filecontent(idx, old_lines))
85
def hg(command, *args):
    """call a mercurial command with appropriate config and argument"""
    env = os.environ.copy()
    # prefer chg for speed when the caller set CHGHG (see module header)
    binary = 'chg' if 'CHGHG' in env else 'hg'
    full_cmd = [binary, '--quiet', command]
    if command == 'commit':
        # reproducible commit metadata
        full_cmd.extend(['--date', '0 0', '--user', 'test'])
    elif command == 'merge':
        # avoid conflicts by picking the local variant
        full_cmd.extend(['--tool', ':merge-local'])
    full_cmd.extend(args)
    # ignore any user configuration that could change the output
    env['HGRCPATH'] = ''
    return subprocess.check_call(full_cmd, env=env)
104
def run(target):
    """build the churning repository in a temp dir and bundle it to <target>

    Also writes the bundle's md5 hex digest to <target>.md5 so the test
    can validate a cached copy.  The temporary repository is always
    removed.  Returns 0 on success (hg failures raise
    subprocess.CalledProcessError from the hg() helper)."""
    cwd = os.getcwd()
    tmpdir = tempfile.mkdtemp(prefix='tmp-hg-test-big-file-bundle-')
    try:
        os.chdir(tmpdir)
        hg('init')
        # revision #0: seed the large file
        updatefile(FILENAME, None)
        hg('commit', '--addremove', '--message', 'initial commit')
        # range instead of xrange: works on both Python 2 and Python 3
        for idx in range(1, NB_CHANGESET + 1):
            if sys.stdout.isatty():
                print("generating commit #%d/%d" % (idx, NB_CHANGESET))
            # periodically jump back a few revisions to start a new branch
            if (idx % PERIOD_BRANCHING) == 0:
                move_back = MOVE_BACK_MIN + (idx % MOVE_BACK_RANGE)
                hg('update', ".~%d" % move_back)
            # periodically merge another head back in
            if (idx % PERIOD_MERGING) == 0:
                hg('merge', 'min(head())')
            updatefile(FILENAME, idx)
            hg('commit', '--message', 'commit #%d' % idx)
        hg('bundle', '--all', target)
        with open(target, 'rb') as bundle:
            data = bundle.read()
            digest = hashlib.md5(data).hexdigest()
        # write bytes explicitly so the 'wb' mode also works on Python 3
        with open(target + '.md5', 'wb') as md5file:
            md5file.write((digest + '\n').encode('ascii'))
        if sys.stdout.isatty():
            print('bundle generated at "%s" md5: %s' % (target, digest))

    finally:
        # restore cwd first: we chdir'ed into the directory being removed
        os.chdir(cwd)
        shutil.rmtree(tmpdir)
    return 0
134
if __name__ == '__main__':
    # place the bundle in the sibling "cache" directory, next to the
    # directory holding this script
    script_dir = os.path.realpath(os.path.dirname(sys.argv[0]))
    bundle_path = os.path.join(script_dir, os.pardir, 'cache', BUNDLE_NAME)
    sys.exit(run(bundle_path))
139