Mercurial > public > mercurial-scm > hg
comparison mercurial/revlogutils/deltas.py @ 40642:9c3c697267db
sparse-revlog: rework the way we enforce chunk size limit
We move from a O(N) algorithm to a O(log(N)) algorithm.
The previous algorithm was traversing the whole delta chain, looking for the
exact point where it became too big. This would result in most of the delta
chain to be traversed.
Instead, we now use a "binary" approach, slicing the chain in two until we
have a chunk of the appropriate size.
We still keep the previous algorithm for the snapshots part. There are few of
them and they are large bits of data distant from each other. So the previous
algorithm should work well in that case.
To take a practical example of restoring manifest revision '59547c40bc4c' for
a reference NetBeans repository (using sparse-revlog). The media time of the
step `slice-sparse-chain` of `perfrevlogrevision` improve from 1.109 ms to
0.660 ms.
author | Boris Feld <boris.feld@octobus.net> |
---|---|
date | Fri, 09 Nov 2018 17:58:37 +0100 |
parents | 85b14f0dc334 |
children | fd1d41ccbe38 |
comparison
equal
deleted
inserted
replaced
40641:85b14f0dc334 | 40642:9c3c697267db |
---|---|
77 `revlog._srmingapsize`. | 77 `revlog._srmingapsize`. |
78 | 78 |
79 If individual revisions chunk are larger than this limit, they will still | 79 If individual revisions chunk are larger than this limit, they will still |
80 be raised individually. | 80 be raised individually. |
81 | 81 |
82 >>> revlog = _testrevlog([ | 82 >>> data = [ |
83 ... 5, #00 (5) | 83 ... 5, #00 (5) |
84 ... 10, #01 (5) | 84 ... 10, #01 (5) |
85 ... 12, #02 (2) | 85 ... 12, #02 (2) |
86 ... 12, #03 (empty) | 86 ... 12, #03 (empty) |
87 ... 27, #04 (15) | 87 ... 27, #04 (15) |
94 ... 51, #11 (3) | 94 ... 51, #11 (3) |
95 ... 74, #12 (23) | 95 ... 74, #12 (23) |
96 ... 85, #13 (11) | 96 ... 85, #13 (11) |
97 ... 86, #14 (1) | 97 ... 86, #14 (1) |
98 ... 91, #15 (5) | 98 ... 91, #15 (5) |
99 ... ]) | 99 ... ] |
100 >>> revlog = _testrevlog(data, snapshot=range(16)) | |
100 | 101 |
101 >>> list(slicechunk(revlog, list(range(16)))) | 102 >>> list(slicechunk(revlog, list(range(16)))) |
102 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] | 103 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] |
103 >>> list(slicechunk(revlog, [0, 15])) | 104 >>> list(slicechunk(revlog, [0, 15])) |
104 [[0], [15]] | 105 [[0], [15]] |
131 This is intended to be used on chunk that density slicing selected by that | 132 This is intended to be used on chunk that density slicing selected by that |
132 are still too large compared to the read garantee of revlog. This might | 133 are still too large compared to the read garantee of revlog. This might |
133 happens when "minimal gap size" interrupted the slicing or when chain are | 134 happens when "minimal gap size" interrupted the slicing or when chain are |
134 built in a way that create large blocks next to each other. | 135 built in a way that create large blocks next to each other. |
135 | 136 |
136 >>> revlog = _testrevlog([ | 137 >>> data = [ |
137 ... 3, #0 (3) | 138 ... 3, #0 (3) |
138 ... 5, #1 (2) | 139 ... 5, #1 (2) |
139 ... 6, #2 (1) | 140 ... 6, #2 (1) |
140 ... 8, #3 (2) | 141 ... 8, #3 (2) |
141 ... 8, #4 (empty) | 142 ... 8, #4 (empty) |
142 ... 11, #5 (3) | 143 ... 11, #5 (3) |
143 ... 12, #6 (1) | 144 ... 12, #6 (1) |
144 ... 13, #7 (1) | 145 ... 13, #7 (1) |
145 ... 14, #8 (1) | 146 ... 14, #8 (1) |
146 ... ]) | 147 ... ] |
148 | |
149 == All snapshots cases == | |
150 >>> revlog = _testrevlog(data, snapshot=range(9)) | |
147 | 151 |
148 Cases where chunk is already small enough | 152 Cases where chunk is already small enough |
149 >>> list(_slicechunktosize(revlog, [0], 3)) | 153 >>> list(_slicechunktosize(revlog, [0], 3)) |
150 [[0]] | 154 [[0]] |
151 >>> list(_slicechunktosize(revlog, [6, 7], 3)) | 155 >>> list(_slicechunktosize(revlog, [6, 7], 3)) |
176 [[0], [1]] | 180 [[0], [1]] |
177 >>> list(_slicechunktosize(revlog, [1, 3], 1)) | 181 >>> list(_slicechunktosize(revlog, [1, 3], 1)) |
178 [[1], [3]] | 182 [[1], [3]] |
179 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2)) | 183 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2)) |
180 [[3], [5]] | 184 [[3], [5]] |
185 | |
186 == No Snapshot cases == | |
187 >>> revlog = _testrevlog(data) | |
188 | |
189 Cases where chunk is already small enough | |
190 >>> list(_slicechunktosize(revlog, [0], 3)) | |
191 [[0]] | |
192 >>> list(_slicechunktosize(revlog, [6, 7], 3)) | |
193 [[6, 7]] | |
194 >>> list(_slicechunktosize(revlog, [0], None)) | |
195 [[0]] | |
196 >>> list(_slicechunktosize(revlog, [6, 7], None)) | |
197 [[6, 7]] | |
198 | |
199 cases where we need actual slicing | |
200 >>> list(_slicechunktosize(revlog, [0, 1], 3)) | |
201 [[0], [1]] | |
202 >>> list(_slicechunktosize(revlog, [1, 3], 3)) | |
203 [[1], [3]] | |
204 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3)) | |
205 [[1], [2, 3]] | |
206 >>> list(_slicechunktosize(revlog, [3, 5], 3)) | |
207 [[3], [5]] | |
208 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3)) | |
209 [[3], [4, 5]] | |
210 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3)) | |
211 [[5], [6, 7, 8]] | |
212 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3)) | |
213 [[0], [1, 2], [3], [5], [6, 7, 8]] | |
214 | |
215 Case with too large individual chunk (must return valid chunk) | |
216 >>> list(_slicechunktosize(revlog, [0, 1], 2)) | |
217 [[0], [1]] | |
218 >>> list(_slicechunktosize(revlog, [1, 3], 1)) | |
219 [[1], [3]] | |
220 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2)) | |
221 [[3], [5]] | |
222 | |
223 == mixed case == | |
224 >>> revlog = _testrevlog(data, snapshot=[0, 1, 2]) | |
225 >>> list(_slicechunktosize(revlog, list(range(9)), 5)) | |
226 [[0, 1], [2], [3, 4, 5], [6, 7, 8]] | |
181 """ | 227 """ |
182 assert targetsize is None or 0 <= targetsize | 228 assert targetsize is None or 0 <= targetsize |
183 if targetsize is None or segmentspan(revlog, revs) <= targetsize: | 229 startdata = revlog.start(revs[0]) |
230 enddata = revlog.end(revs[-1]) | |
231 fullspan = enddata - startdata | |
232 if targetsize is None or fullspan <= targetsize: | |
184 yield revs | 233 yield revs |
185 return | 234 return |
186 | 235 |
187 startrevidx = 0 | 236 startrevidx = 0 |
188 startdata = revlog.start(revs[0]) | |
189 endrevidx = 0 | 237 endrevidx = 0 |
190 iterrevs = enumerate(revs) | 238 iterrevs = enumerate(revs) |
191 next(iterrevs) # skip first rev. | 239 next(iterrevs) # skip first rev. |
240 # first step: get snapshots out of the way | |
192 for idx, r in iterrevs: | 241 for idx, r in iterrevs: |
193 span = revlog.end(r) - startdata | 242 span = revlog.end(r) - startdata |
194 if span <= targetsize: | 243 snapshot = revlog.issnapshot(r) |
244 if span <= targetsize and snapshot: | |
195 endrevidx = idx | 245 endrevidx = idx |
196 else: | 246 else: |
197 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1) | 247 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1) |
198 if chunk: | 248 if chunk: |
199 yield chunk | 249 yield chunk |
200 startrevidx = idx | 250 startrevidx = idx |
201 startdata = revlog.start(r) | 251 startdata = revlog.start(r) |
202 endrevidx = idx | 252 endrevidx = idx |
203 yield _trimchunk(revlog, revs, startrevidx) | 253 if not snapshot: |
254 break | |
255 | |
256 # for the others, we use binary slicing to quickly converge toward valid | |
257 # chunks (otherwise, we might end up looking for start/end of many | |
258 # revisions). This logic is not looking for the perfect slicing point, it | |
259 # focuses on quickly converging toward valid chunks. | |
260 nbitem = len(revs) | |
261 while (enddata - startdata) > targetsize: | |
262 endrevidx = nbitem | |
263 if nbitem - startrevidx <= 1: | |
264 break # protect against individual chunk larger than limit | |
265 localenddata = revlog.end(revs[endrevidx - 1]) | |
266 span = localenddata - startdata | |
267 while (localenddata - startdata) > targetsize: | |
268 if endrevidx - startrevidx <= 1: | |
269 break # protect against individual chunk larger than limit | |
270 endrevidx -= (endrevidx - startrevidx) // 2 | |
271 localenddata = revlog.end(revs[endrevidx - 1]) | |
272 span = localenddata - startdata | |
273 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx) | |
274 if chunk: | |
275 yield chunk | |
276 startrevidx = endrevidx | |
277 startdata = revlog.start(revs[startrevidx]) | |
278 | |
279 chunk = _trimchunk(revlog, revs, startrevidx) | |
280 if chunk: | |
281 yield chunk | |
204 | 282 |
205 def _slicechunktodensity(revlog, revs, targetdensity=0.5, | 283 def _slicechunktodensity(revlog, revs, targetdensity=0.5, |
206 mingapsize=0): | 284 mingapsize=0): |
207 """slice revs to reduce the amount of unrelated data to be read from disk. | 285 """slice revs to reduce the amount of unrelated data to be read from disk. |
208 | 286 |