Mercurial > public > mercurial-scm > hg
comparison mercurial/revlog.py @ 39330:655b5b465953
revlog: split functionality related to deltas computation in a new module
The revlog module is getting big and this logic is getting more and more
advanced. Moving it to `mercurial.revlogutils.deltas` splits a lot off
revlog.py and will help this logic become less interleaved with revlog.
The code is simply moved without modification (except for namespace changes).
Refactoring and improvement will be made in later changesets.
author | Boris Feld <boris.feld@octobus.net> |
---|---|
date | Thu, 16 Aug 2018 02:53:42 +0200 |
parents | 729082bb9938 |
children | 6f4b8f607a31 |
comparison
equal
deleted
inserted
replaced
39329:729082bb9938 | 39330:655b5b465953 |
---|---|
15 | 15 |
16 import collections | 16 import collections |
17 import contextlib | 17 import contextlib |
18 import errno | 18 import errno |
19 import hashlib | 19 import hashlib |
20 import heapq | |
21 import os | 20 import os |
22 import re | 21 import re |
23 import struct | 22 import struct |
24 import zlib | 23 import zlib |
25 | 24 |
37 ) | 36 ) |
38 from .i18n import _ | 37 from .i18n import _ |
39 from .revlogutils.constants import ( | 38 from .revlogutils.constants import ( |
40 FLAG_GENERALDELTA, | 39 FLAG_GENERALDELTA, |
41 FLAG_INLINE_DATA, | 40 FLAG_INLINE_DATA, |
42 LIMIT_DELTA2TEXT, | |
43 REVIDX_DEFAULT_FLAGS, | 41 REVIDX_DEFAULT_FLAGS, |
44 REVIDX_ELLIPSIS, | 42 REVIDX_ELLIPSIS, |
45 REVIDX_EXTSTORED, | 43 REVIDX_EXTSTORED, |
46 REVIDX_FLAGS_ORDER, | 44 REVIDX_FLAGS_ORDER, |
47 REVIDX_ISCENSORED, | 45 REVIDX_ISCENSORED, |
67 pycompat, | 65 pycompat, |
68 repository, | 66 repository, |
69 templatefilters, | 67 templatefilters, |
70 util, | 68 util, |
71 ) | 69 ) |
70 from .revlogutils import ( | |
71 deltas as deltautil, | |
72 ) | |
72 from .utils import ( | 73 from .utils import ( |
73 interfaceutil, | 74 interfaceutil, |
74 stringutil, | 75 stringutil, |
75 ) | 76 ) |
76 | 77 |
208 b = p1 | 209 b = p1 |
209 s = hashlib.sha1(a) | 210 s = hashlib.sha1(a) |
210 s.update(b) | 211 s.update(b) |
211 s.update(text) | 212 s.update(text) |
212 return s.digest() | 213 return s.digest() |
213 | |
214 class _testrevlog(object): | |
215 """minimalist fake revlog to use in doctests""" | |
216 | |
217 def __init__(self, data, density=0.5, mingap=0): | |
218 """data is an list of revision payload boundaries""" | |
219 self._data = data | |
220 self._srdensitythreshold = density | |
221 self._srmingapsize = mingap | |
222 | |
223 def start(self, rev): | |
224 if rev == 0: | |
225 return 0 | |
226 return self._data[rev - 1] | |
227 | |
228 def end(self, rev): | |
229 return self._data[rev] | |
230 | |
231 def length(self, rev): | |
232 return self.end(rev) - self.start(rev) | |
233 | |
234 def __len__(self): | |
235 return len(self._data) | |
236 | |
237 def _trimchunk(revlog, revs, startidx, endidx=None): | |
238 """returns revs[startidx:endidx] without empty trailing revs | |
239 | |
240 Doctest Setup | |
241 >>> revlog = _testrevlog([ | |
242 ... 5, #0 | |
243 ... 10, #1 | |
244 ... 12, #2 | |
245 ... 12, #3 (empty) | |
246 ... 17, #4 | |
247 ... 21, #5 | |
248 ... 21, #6 (empty) | |
249 ... ]) | |
250 | |
251 Contiguous cases: | |
252 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0) | |
253 [0, 1, 2, 3, 4, 5] | |
254 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5) | |
255 [0, 1, 2, 3, 4] | |
256 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4) | |
257 [0, 1, 2] | |
258 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4) | |
259 [2] | |
260 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3) | |
261 [3, 4, 5] | |
262 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5) | |
263 [3, 4] | |
264 | |
265 Discontiguous cases: | |
266 >>> _trimchunk(revlog, [1, 3, 5, 6], 0) | |
267 [1, 3, 5] | |
268 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2) | |
269 [1] | |
270 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3) | |
271 [3, 5] | |
272 >>> _trimchunk(revlog, [1, 3, 5, 6], 1) | |
273 [3, 5] | |
274 """ | |
275 length = revlog.length | |
276 | |
277 if endidx is None: | |
278 endidx = len(revs) | |
279 | |
280 # If we have a non-emtpy delta candidate, there are nothing to trim | |
281 if revs[endidx - 1] < len(revlog): | |
282 # Trim empty revs at the end, except the very first revision of a chain | |
283 while (endidx > 1 | |
284 and endidx > startidx | |
285 and length(revs[endidx - 1]) == 0): | |
286 endidx -= 1 | |
287 | |
288 return revs[startidx:endidx] | |
289 | |
290 def _segmentspan(revlog, revs, deltainfo=None): | |
291 """Get the byte span of a segment of revisions | |
292 | |
293 revs is a sorted array of revision numbers | |
294 | |
295 >>> revlog = _testrevlog([ | |
296 ... 5, #0 | |
297 ... 10, #1 | |
298 ... 12, #2 | |
299 ... 12, #3 (empty) | |
300 ... 17, #4 | |
301 ... ]) | |
302 | |
303 >>> _segmentspan(revlog, [0, 1, 2, 3, 4]) | |
304 17 | |
305 >>> _segmentspan(revlog, [0, 4]) | |
306 17 | |
307 >>> _segmentspan(revlog, [3, 4]) | |
308 5 | |
309 >>> _segmentspan(revlog, [1, 2, 3,]) | |
310 7 | |
311 >>> _segmentspan(revlog, [1, 3]) | |
312 7 | |
313 """ | |
314 if not revs: | |
315 return 0 | |
316 if deltainfo is not None and len(revlog) <= revs[-1]: | |
317 if len(revs) == 1: | |
318 return deltainfo.deltalen | |
319 offset = revlog.end(len(revlog) - 1) | |
320 end = deltainfo.deltalen + offset | |
321 else: | |
322 end = revlog.end(revs[-1]) | |
323 return end - revlog.start(revs[0]) | |
324 | |
325 def _slicechunk(revlog, revs, deltainfo=None, targetsize=None): | |
326 """slice revs to reduce the amount of unrelated data to be read from disk. | |
327 | |
328 ``revs`` is sliced into groups that should be read in one time. | |
329 Assume that revs are sorted. | |
330 | |
331 The initial chunk is sliced until the overall density (payload/chunks-span | |
332 ratio) is above `revlog._srdensitythreshold`. No gap smaller than | |
333 `revlog._srmingapsize` is skipped. | |
334 | |
335 If `targetsize` is set, no chunk larger than `targetsize` will be yield. | |
336 For consistency with other slicing choice, this limit won't go lower than | |
337 `revlog._srmingapsize`. | |
338 | |
339 If individual revisions chunk are larger than this limit, they will still | |
340 be raised individually. | |
341 | |
342 >>> revlog = _testrevlog([ | |
343 ... 5, #00 (5) | |
344 ... 10, #01 (5) | |
345 ... 12, #02 (2) | |
346 ... 12, #03 (empty) | |
347 ... 27, #04 (15) | |
348 ... 31, #05 (4) | |
349 ... 31, #06 (empty) | |
350 ... 42, #07 (11) | |
351 ... 47, #08 (5) | |
352 ... 47, #09 (empty) | |
353 ... 48, #10 (1) | |
354 ... 51, #11 (3) | |
355 ... 74, #12 (23) | |
356 ... 85, #13 (11) | |
357 ... 86, #14 (1) | |
358 ... 91, #15 (5) | |
359 ... ]) | |
360 | |
361 >>> list(_slicechunk(revlog, list(range(16)))) | |
362 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] | |
363 >>> list(_slicechunk(revlog, [0, 15])) | |
364 [[0], [15]] | |
365 >>> list(_slicechunk(revlog, [0, 11, 15])) | |
366 [[0], [11], [15]] | |
367 >>> list(_slicechunk(revlog, [0, 11, 13, 15])) | |
368 [[0], [11, 13, 15]] | |
369 >>> list(_slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14])) | |
370 [[1, 2], [5, 8, 10, 11], [14]] | |
371 | |
372 Slicing with a maximum chunk size | |
373 >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=15)) | |
374 [[0], [11], [13], [15]] | |
375 >>> list(_slicechunk(revlog, [0, 11, 13, 15], targetsize=20)) | |
376 [[0], [11], [13, 15]] | |
377 """ | |
378 if targetsize is not None: | |
379 targetsize = max(targetsize, revlog._srmingapsize) | |
380 # targetsize should not be specified when evaluating delta candidates: | |
381 # * targetsize is used to ensure we stay within specification when reading, | |
382 # * deltainfo is used to pick are good delta chain when writing. | |
383 if not (deltainfo is None or targetsize is None): | |
384 msg = 'cannot use `targetsize` with a `deltainfo`' | |
385 raise error.ProgrammingError(msg) | |
386 for chunk in _slicechunktodensity(revlog, revs, | |
387 deltainfo, | |
388 revlog._srdensitythreshold, | |
389 revlog._srmingapsize): | |
390 for subchunk in _slicechunktosize(revlog, chunk, targetsize): | |
391 yield subchunk | |
392 | |
393 def _slicechunktosize(revlog, revs, targetsize=None): | |
394 """slice revs to match the target size | |
395 | |
396 This is intended to be used on chunk that density slicing selected by that | |
397 are still too large compared to the read garantee of revlog. This might | |
398 happens when "minimal gap size" interrupted the slicing or when chain are | |
399 built in a way that create large blocks next to each other. | |
400 | |
401 >>> revlog = _testrevlog([ | |
402 ... 3, #0 (3) | |
403 ... 5, #1 (2) | |
404 ... 6, #2 (1) | |
405 ... 8, #3 (2) | |
406 ... 8, #4 (empty) | |
407 ... 11, #5 (3) | |
408 ... 12, #6 (1) | |
409 ... 13, #7 (1) | |
410 ... 14, #8 (1) | |
411 ... ]) | |
412 | |
413 Cases where chunk is already small enough | |
414 >>> list(_slicechunktosize(revlog, [0], 3)) | |
415 [[0]] | |
416 >>> list(_slicechunktosize(revlog, [6, 7], 3)) | |
417 [[6, 7]] | |
418 >>> list(_slicechunktosize(revlog, [0], None)) | |
419 [[0]] | |
420 >>> list(_slicechunktosize(revlog, [6, 7], None)) | |
421 [[6, 7]] | |
422 | |
423 cases where we need actual slicing | |
424 >>> list(_slicechunktosize(revlog, [0, 1], 3)) | |
425 [[0], [1]] | |
426 >>> list(_slicechunktosize(revlog, [1, 3], 3)) | |
427 [[1], [3]] | |
428 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3)) | |
429 [[1, 2], [3]] | |
430 >>> list(_slicechunktosize(revlog, [3, 5], 3)) | |
431 [[3], [5]] | |
432 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3)) | |
433 [[3], [5]] | |
434 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3)) | |
435 [[5], [6, 7, 8]] | |
436 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3)) | |
437 [[0], [1, 2], [3], [5], [6, 7, 8]] | |
438 | |
439 Case with too large individual chunk (must return valid chunk) | |
440 >>> list(_slicechunktosize(revlog, [0, 1], 2)) | |
441 [[0], [1]] | |
442 >>> list(_slicechunktosize(revlog, [1, 3], 1)) | |
443 [[1], [3]] | |
444 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2)) | |
445 [[3], [5]] | |
446 """ | |
447 assert targetsize is None or 0 <= targetsize | |
448 if targetsize is None or _segmentspan(revlog, revs) <= targetsize: | |
449 yield revs | |
450 return | |
451 | |
452 startrevidx = 0 | |
453 startdata = revlog.start(revs[0]) | |
454 endrevidx = 0 | |
455 iterrevs = enumerate(revs) | |
456 next(iterrevs) # skip first rev. | |
457 for idx, r in iterrevs: | |
458 span = revlog.end(r) - startdata | |
459 if span <= targetsize: | |
460 endrevidx = idx | |
461 else: | |
462 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1) | |
463 if chunk: | |
464 yield chunk | |
465 startrevidx = idx | |
466 startdata = revlog.start(r) | |
467 endrevidx = idx | |
468 yield _trimchunk(revlog, revs, startrevidx) | |
469 | |
470 def _slicechunktodensity(revlog, revs, deltainfo=None, targetdensity=0.5, | |
471 mingapsize=0): | |
472 """slice revs to reduce the amount of unrelated data to be read from disk. | |
473 | |
474 ``revs`` is sliced into groups that should be read in one time. | |
475 Assume that revs are sorted. | |
476 | |
477 ``deltainfo`` is a _deltainfo instance of a revision that we would append | |
478 to the top of the revlog. | |
479 | |
480 The initial chunk is sliced until the overall density (payload/chunks-span | |
481 ratio) is above `targetdensity`. No gap smaller than `mingapsize` is | |
482 skipped. | |
483 | |
484 >>> revlog = _testrevlog([ | |
485 ... 5, #00 (5) | |
486 ... 10, #01 (5) | |
487 ... 12, #02 (2) | |
488 ... 12, #03 (empty) | |
489 ... 27, #04 (15) | |
490 ... 31, #05 (4) | |
491 ... 31, #06 (empty) | |
492 ... 42, #07 (11) | |
493 ... 47, #08 (5) | |
494 ... 47, #09 (empty) | |
495 ... 48, #10 (1) | |
496 ... 51, #11 (3) | |
497 ... 74, #12 (23) | |
498 ... 85, #13 (11) | |
499 ... 86, #14 (1) | |
500 ... 91, #15 (5) | |
501 ... ]) | |
502 | |
503 >>> list(_slicechunktodensity(revlog, list(range(16)))) | |
504 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] | |
505 >>> list(_slicechunktodensity(revlog, [0, 15])) | |
506 [[0], [15]] | |
507 >>> list(_slicechunktodensity(revlog, [0, 11, 15])) | |
508 [[0], [11], [15]] | |
509 >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15])) | |
510 [[0], [11, 13, 15]] | |
511 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14])) | |
512 [[1, 2], [5, 8, 10, 11], [14]] | |
513 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14], | |
514 ... mingapsize=20)) | |
515 [[1, 2, 3, 5, 8, 10, 11], [14]] | |
516 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14], | |
517 ... targetdensity=0.95)) | |
518 [[1, 2], [5], [8, 10, 11], [14]] | |
519 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14], | |
520 ... targetdensity=0.95, mingapsize=12)) | |
521 [[1, 2], [5, 8, 10, 11], [14]] | |
522 """ | |
523 start = revlog.start | |
524 length = revlog.length | |
525 | |
526 if len(revs) <= 1: | |
527 yield revs | |
528 return | |
529 | |
530 nextrev = len(revlog) | |
531 nextoffset = revlog.end(nextrev - 1) | |
532 | |
533 if deltainfo is None: | |
534 deltachainspan = _segmentspan(revlog, revs) | |
535 chainpayload = sum(length(r) for r in revs) | |
536 else: | |
537 deltachainspan = deltainfo.distance | |
538 chainpayload = deltainfo.compresseddeltalen | |
539 | |
540 if deltachainspan < mingapsize: | |
541 yield revs | |
542 return | |
543 | |
544 readdata = deltachainspan | |
545 | |
546 if deltachainspan: | |
547 density = chainpayload / float(deltachainspan) | |
548 else: | |
549 density = 1.0 | |
550 | |
551 if density >= targetdensity: | |
552 yield revs | |
553 return | |
554 | |
555 if deltainfo is not None and deltainfo.deltalen: | |
556 revs = list(revs) | |
557 revs.append(nextrev) | |
558 | |
559 # Store the gaps in a heap to have them sorted by decreasing size | |
560 gapsheap = [] | |
561 heapq.heapify(gapsheap) | |
562 prevend = None | |
563 for i, rev in enumerate(revs): | |
564 if rev < nextrev: | |
565 revstart = start(rev) | |
566 revlen = length(rev) | |
567 else: | |
568 revstart = nextoffset | |
569 revlen = deltainfo.deltalen | |
570 | |
571 # Skip empty revisions to form larger holes | |
572 if revlen == 0: | |
573 continue | |
574 | |
575 if prevend is not None: | |
576 gapsize = revstart - prevend | |
577 # only consider holes that are large enough | |
578 if gapsize > mingapsize: | |
579 heapq.heappush(gapsheap, (-gapsize, i)) | |
580 | |
581 prevend = revstart + revlen | |
582 | |
583 # Collect the indices of the largest holes until the density is acceptable | |
584 indicesheap = [] | |
585 heapq.heapify(indicesheap) | |
586 while gapsheap and density < targetdensity: | |
587 oppgapsize, gapidx = heapq.heappop(gapsheap) | |
588 | |
589 heapq.heappush(indicesheap, gapidx) | |
590 | |
591 # the gap sizes are stored as negatives to be sorted decreasingly | |
592 # by the heap | |
593 readdata -= (-oppgapsize) | |
594 if readdata > 0: | |
595 density = chainpayload / float(readdata) | |
596 else: | |
597 density = 1.0 | |
598 | |
599 # Cut the revs at collected indices | |
600 previdx = 0 | |
601 while indicesheap: | |
602 idx = heapq.heappop(indicesheap) | |
603 | |
604 chunk = _trimchunk(revlog, revs, previdx, idx) | |
605 if chunk: | |
606 yield chunk | |
607 | |
608 previdx = idx | |
609 | |
610 chunk = _trimchunk(revlog, revs, previdx) | |
611 if chunk: | |
612 yield chunk | |
613 | |
614 @attr.s(slots=True, frozen=True) | |
615 class _deltainfo(object): | |
616 distance = attr.ib() | |
617 deltalen = attr.ib() | |
618 data = attr.ib() | |
619 base = attr.ib() | |
620 chainbase = attr.ib() | |
621 chainlen = attr.ib() | |
622 compresseddeltalen = attr.ib() | |
623 snapshotdepth = attr.ib() | |
624 | |
625 class _deltacomputer(object): | |
626 def __init__(self, revlog): | |
627 self.revlog = revlog | |
628 | |
629 def _getcandidaterevs(self, p1, p2, cachedelta): | |
630 """ | |
631 Provides revisions that present an interest to be diffed against, | |
632 grouped by level of easiness. | |
633 """ | |
634 revlog = self.revlog | |
635 gdelta = revlog._generaldelta | |
636 curr = len(revlog) | |
637 prev = curr - 1 | |
638 p1r, p2r = revlog.rev(p1), revlog.rev(p2) | |
639 | |
640 # should we try to build a delta? | |
641 if prev != nullrev and revlog._storedeltachains: | |
642 tested = set() | |
643 # This condition is true most of the time when processing | |
644 # changegroup data into a generaldelta repo. The only time it | |
645 # isn't true is if this is the first revision in a delta chain | |
646 # or if ``format.generaldelta=true`` disabled ``lazydeltabase``. | |
647 if cachedelta and gdelta and revlog._lazydeltabase: | |
648 # Assume what we received from the server is a good choice | |
649 # build delta will reuse the cache | |
650 yield (cachedelta[0],) | |
651 tested.add(cachedelta[0]) | |
652 | |
653 if gdelta: | |
654 # exclude already lazy tested base if any | |
655 parents = [p for p in (p1r, p2r) | |
656 if p != nullrev and p not in tested] | |
657 | |
658 if not revlog._deltabothparents and len(parents) == 2: | |
659 parents.sort() | |
660 # To minimize the chance of having to build a fulltext, | |
661 # pick first whichever parent is closest to us (max rev) | |
662 yield (parents[1],) | |
663 # then the other one (min rev) if the first did not fit | |
664 yield (parents[0],) | |
665 tested.update(parents) | |
666 elif len(parents) > 0: | |
667 # Test all parents (1 or 2), and keep the best candidate | |
668 yield parents | |
669 tested.update(parents) | |
670 | |
671 if prev not in tested: | |
672 # other approach failed try against prev to hopefully save us a | |
673 # fulltext. | |
674 yield (prev,) | |
675 tested.add(prev) | |
676 | |
677 def buildtext(self, revinfo, fh): | |
678 """Builds a fulltext version of a revision | |
679 | |
680 revinfo: _revisioninfo instance that contains all needed info | |
681 fh: file handle to either the .i or the .d revlog file, | |
682 depending on whether it is inlined or not | |
683 """ | |
684 btext = revinfo.btext | |
685 if btext[0] is not None: | |
686 return btext[0] | |
687 | |
688 revlog = self.revlog | |
689 cachedelta = revinfo.cachedelta | |
690 flags = revinfo.flags | |
691 node = revinfo.node | |
692 | |
693 baserev = cachedelta[0] | |
694 delta = cachedelta[1] | |
695 # special case deltas which replace entire base; no need to decode | |
696 # base revision. this neatly avoids censored bases, which throw when | |
697 # they're decoded. | |
698 hlen = struct.calcsize(">lll") | |
699 if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev), | |
700 len(delta) - hlen): | |
701 btext[0] = delta[hlen:] | |
702 else: | |
703 # deltabase is rawtext before changed by flag processors, which is | |
704 # equivalent to non-raw text | |
705 basetext = revlog.revision(baserev, _df=fh, raw=False) | |
706 btext[0] = mdiff.patch(basetext, delta) | |
707 | |
708 try: | |
709 res = revlog._processflags(btext[0], flags, 'read', raw=True) | |
710 btext[0], validatehash = res | |
711 if validatehash: | |
712 revlog.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2) | |
713 if flags & REVIDX_ISCENSORED: | |
714 raise RevlogError(_('node %s is not censored') % node) | |
715 except CensoredNodeError: | |
716 # must pass the censored index flag to add censored revisions | |
717 if not flags & REVIDX_ISCENSORED: | |
718 raise | |
719 return btext[0] | |
720 | |
721 def _builddeltadiff(self, base, revinfo, fh): | |
722 revlog = self.revlog | |
723 t = self.buildtext(revinfo, fh) | |
724 if revlog.iscensored(base): | |
725 # deltas based on a censored revision must replace the | |
726 # full content in one patch, so delta works everywhere | |
727 header = mdiff.replacediffheader(revlog.rawsize(base), len(t)) | |
728 delta = header + t | |
729 else: | |
730 ptext = revlog.revision(base, _df=fh, raw=True) | |
731 delta = mdiff.textdiff(ptext, t) | |
732 | |
733 return delta | |
734 | |
735 def _builddeltainfo(self, revinfo, base, fh): | |
736 # can we use the cached delta? | |
737 if revinfo.cachedelta and revinfo.cachedelta[0] == base: | |
738 delta = revinfo.cachedelta[1] | |
739 else: | |
740 delta = self._builddeltadiff(base, revinfo, fh) | |
741 revlog = self.revlog | |
742 header, data = revlog.compress(delta) | |
743 deltalen = len(header) + len(data) | |
744 chainbase = revlog.chainbase(base) | |
745 offset = revlog.end(len(revlog) - 1) | |
746 dist = deltalen + offset - revlog.start(chainbase) | |
747 if revlog._generaldelta: | |
748 deltabase = base | |
749 else: | |
750 deltabase = chainbase | |
751 chainlen, compresseddeltalen = revlog._chaininfo(base) | |
752 chainlen += 1 | |
753 compresseddeltalen += deltalen | |
754 | |
755 revlog = self.revlog | |
756 snapshotdepth = None | |
757 if deltabase == nullrev: | |
758 snapshotdepth = 0 | |
759 elif revlog._sparserevlog and revlog.issnapshot(deltabase): | |
760 # A delta chain should always be one full snapshot, | |
761 # zero or more semi-snapshots, and zero or more deltas | |
762 p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2) | |
763 if deltabase not in (p1, p2) and revlog.issnapshot(deltabase): | |
764 snapshotdepth = len(revlog._deltachain(deltabase)[0]) | |
765 | |
766 return _deltainfo(dist, deltalen, (header, data), deltabase, | |
767 chainbase, chainlen, compresseddeltalen, | |
768 snapshotdepth) | |
769 | |
770 def finddeltainfo(self, revinfo, fh): | |
771 """Find an acceptable delta against a candidate revision | |
772 | |
773 revinfo: information about the revision (instance of _revisioninfo) | |
774 fh: file handle to either the .i or the .d revlog file, | |
775 depending on whether it is inlined or not | |
776 | |
777 Returns the first acceptable candidate revision, as ordered by | |
778 _getcandidaterevs | |
779 """ | |
780 if not revinfo.textlen: | |
781 return None # empty file do not need delta | |
782 | |
783 cachedelta = revinfo.cachedelta | |
784 p1 = revinfo.p1 | |
785 p2 = revinfo.p2 | |
786 revlog = self.revlog | |
787 | |
788 deltalength = self.revlog.length | |
789 deltaparent = self.revlog.deltaparent | |
790 | |
791 deltainfo = None | |
792 deltas_limit = revinfo.textlen * LIMIT_DELTA2TEXT | |
793 for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta): | |
794 # filter out delta base that will never produce good delta | |
795 candidaterevs = [r for r in candidaterevs | |
796 if self.revlog.length(r) <= deltas_limit] | |
797 nominateddeltas = [] | |
798 for candidaterev in candidaterevs: | |
799 # skip over empty delta (no need to include them in a chain) | |
800 while candidaterev != nullrev and not deltalength(candidaterev): | |
801 candidaterev = deltaparent(candidaterev) | |
802 # no need to try a delta against nullid, this will be handled | |
803 # by fulltext later. | |
804 if candidaterev == nullrev: | |
805 continue | |
806 # no delta for rawtext-changing revs (see "candelta" for why) | |
807 if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS: | |
808 continue | |
809 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh) | |
810 if revlog._isgooddeltainfo(candidatedelta, revinfo): | |
811 nominateddeltas.append(candidatedelta) | |
812 if nominateddeltas: | |
813 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen) | |
814 break | |
815 | |
816 return deltainfo | |
817 | 214 |
818 @attr.s(slots=True, frozen=True) | 215 @attr.s(slots=True, frozen=True) |
819 class _revisioninfo(object): | 216 class _revisioninfo(object): |
820 """Information about a revision that allows building its fulltext | 217 """Information about a revision that allows building its fulltext |
821 node: expected hash of the revision | 218 node: expected hash of the revision |
2093 ladd = l.append | 1490 ladd = l.append |
2094 | 1491 |
2095 if not self._withsparseread: | 1492 if not self._withsparseread: |
2096 slicedchunks = (revs,) | 1493 slicedchunks = (revs,) |
2097 else: | 1494 else: |
2098 slicedchunks = _slicechunk(self, revs, targetsize=targetsize) | 1495 slicedchunks = deltautil.slicechunk(self, revs, |
1496 targetsize=targetsize) | |
2099 | 1497 |
2100 for revschunk in slicedchunks: | 1498 for revschunk in slicedchunks: |
2101 firstrev = revschunk[0] | 1499 firstrev = revschunk[0] |
2102 # Skip trailing revisions with empty diff | 1500 # Skip trailing revisions with empty diff |
2103 for lastrev in revschunk[::-1]: | 1501 for lastrev in revschunk[::-1]: |
2391 cachedelta - an optional precomputed delta | 1789 cachedelta - an optional precomputed delta |
2392 node - nodeid of revision; typically node is not specified, and it is | 1790 node - nodeid of revision; typically node is not specified, and it is |
2393 computed by default as hash(text, p1, p2), however subclasses might | 1791 computed by default as hash(text, p1, p2), however subclasses might |
2394 use different hashing method (and override checkhash() in such case) | 1792 use different hashing method (and override checkhash() in such case) |
2395 flags - the known flags to set on the revision | 1793 flags - the known flags to set on the revision |
2396 deltacomputer - an optional _deltacomputer instance shared between | 1794 deltacomputer - an optional deltacomputer instance shared between |
2397 multiple calls | 1795 multiple calls |
2398 """ | 1796 """ |
2399 if link == nullrev: | 1797 if link == nullrev: |
2400 raise RevlogError(_("attempted to add linkrev -1 to %s") | 1798 raise RevlogError(_("attempted to add linkrev -1 to %s") |
2401 % self.indexfile) | 1799 % self.indexfile) |
2514 except KeyError: | 1912 except KeyError: |
2515 raise RevlogError(_('unknown compression type %r') % t) | 1913 raise RevlogError(_('unknown compression type %r') % t) |
2516 | 1914 |
2517 return compressor.decompress(data) | 1915 return compressor.decompress(data) |
2518 | 1916 |
2519 def _isgooddeltainfo(self, deltainfo, revinfo): | |
2520 """Returns True if the given delta is good. Good means that it is within | |
2521 the disk span, disk size, and chain length bounds that we know to be | |
2522 performant.""" | |
2523 if deltainfo is None: | |
2524 return False | |
2525 | |
2526 # - 'deltainfo.distance' is the distance from the base revision -- | |
2527 # bounding it limits the amount of I/O we need to do. | |
2528 # - 'deltainfo.compresseddeltalen' is the sum of the total size of | |
2529 # deltas we need to apply -- bounding it limits the amount of CPU | |
2530 # we consume. | |
2531 | |
2532 if self._sparserevlog: | |
2533 # As sparse-read will be used, we can consider that the distance, | |
2534 # instead of being the span of the whole chunk, | |
2535 # is the span of the largest read chunk | |
2536 base = deltainfo.base | |
2537 | |
2538 if base != nullrev: | |
2539 deltachain = self._deltachain(base)[0] | |
2540 else: | |
2541 deltachain = [] | |
2542 | |
2543 # search for the first non-snapshot revision | |
2544 for idx, r in enumerate(deltachain): | |
2545 if not self.issnapshot(r): | |
2546 break | |
2547 deltachain = deltachain[idx:] | |
2548 chunks = _slicechunk(self, deltachain, deltainfo) | |
2549 all_span = [_segmentspan(self, revs, deltainfo) for revs in chunks] | |
2550 distance = max(all_span) | |
2551 else: | |
2552 distance = deltainfo.distance | |
2553 | |
2554 textlen = revinfo.textlen | |
2555 defaultmax = textlen * 4 | |
2556 maxdist = self._maxdeltachainspan | |
2557 if not maxdist: | |
2558 maxdist = distance # ensure the conditional pass | |
2559 maxdist = max(maxdist, defaultmax) | |
2560 if self._sparserevlog and maxdist < self._srmingapsize: | |
2561 # In multiple place, we are ignoring irrelevant data range below a | |
2562 # certain size. Be also apply this tradeoff here and relax span | |
2563 # constraint for small enought content. | |
2564 maxdist = self._srmingapsize | |
2565 | |
2566 # Bad delta from read span: | |
2567 # | |
2568 # If the span of data read is larger than the maximum allowed. | |
2569 if maxdist < distance: | |
2570 return False | |
2571 | |
2572 # Bad delta from new delta size: | |
2573 # | |
2574 # If the delta size is larger than the target text, storing the | |
2575 # delta will be inefficient. | |
2576 if textlen < deltainfo.deltalen: | |
2577 return False | |
2578 | |
2579 # Bad delta from cumulated payload size: | |
2580 # | |
2581 # If the sum of delta get larger than K * target text length. | |
2582 if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen: | |
2583 return False | |
2584 | |
2585 # Bad delta from chain length: | |
2586 # | |
2587 # If the number of delta in the chain gets too high. | |
2588 if self._maxchainlen and self._maxchainlen < deltainfo.chainlen: | |
2589 return False | |
2590 | |
2591 # bad delta from intermediate snapshot size limit | |
2592 # | |
2593 # If an intermediate snapshot size is higher than the limit. The | |
2594 # limit exist to prevent endless chain of intermediate delta to be | |
2595 # created. | |
2596 if (deltainfo.snapshotdepth is not None and | |
2597 (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen): | |
2598 return False | |
2599 | |
2600 # bad delta if new intermediate snapshot is larger than the previous | |
2601 # snapshot | |
2602 if (deltainfo.snapshotdepth | |
2603 and self.length(deltainfo.base) < deltainfo.deltalen): | |
2604 return False | |
2605 | |
2606 return True | |
2607 | |
2608 def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags, | 1917 def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags, |
2609 cachedelta, ifh, dfh, alwayscache=False, | 1918 cachedelta, ifh, dfh, alwayscache=False, |
2610 deltacomputer=None): | 1919 deltacomputer=None): |
2611 """internal function to add revisions to the log | 1920 """internal function to add revisions to the log |
2612 | 1921 |
2650 cachedelta[1]) | 1959 cachedelta[1]) |
2651 else: | 1960 else: |
2652 textlen = len(rawtext) | 1961 textlen = len(rawtext) |
2653 | 1962 |
2654 if deltacomputer is None: | 1963 if deltacomputer is None: |
2655 deltacomputer = _deltacomputer(self) | 1964 deltacomputer = deltautil.deltacomputer(self) |
2656 | 1965 |
2657 revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags) | 1966 revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags) |
2658 | 1967 |
2659 # no delta for flag processor revision (see "candelta" for why) | 1968 # no delta for flag processor revision (see "candelta" for why) |
2660 # not calling candelta since only one revision needs test, also to | 1969 # not calling candelta since only one revision needs test, also to |
2752 def flush(): | 2061 def flush(): |
2753 if dfh: | 2062 if dfh: |
2754 dfh.flush() | 2063 dfh.flush() |
2755 ifh.flush() | 2064 ifh.flush() |
2756 try: | 2065 try: |
2757 deltacomputer = _deltacomputer(self) | 2066 deltacomputer = deltautil.deltacomputer(self) |
2758 # loop through our set of deltas | 2067 # loop through our set of deltas |
2759 for data in deltas: | 2068 for data in deltas: |
2760 node, p1, p2, linknode, deltabase, delta, flags = data | 2069 node, p1, p2, linknode, deltabase, delta, flags = data |
2761 link = linkmapper(linknode) | 2070 link = linkmapper(linknode) |
2762 flags = flags or REVIDX_DEFAULT_FLAGS | 2071 flags = flags or REVIDX_DEFAULT_FLAGS |
3125 destrevlog._deltabothparents = deltabothparents or oldamd | 2434 destrevlog._deltabothparents = deltabothparents or oldamd |
3126 | 2435 |
3127 populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS, | 2436 populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS, |
3128 self.DELTAREUSESAMEREVS) | 2437 self.DELTAREUSESAMEREVS) |
3129 | 2438 |
3130 deltacomputer = _deltacomputer(destrevlog) | 2439 deltacomputer = deltautil.deltacomputer(destrevlog) |
3131 index = self.index | 2440 index = self.index |
3132 for rev in self: | 2441 for rev in self: |
3133 entry = index[rev] | 2442 entry = index[rev] |
3134 | 2443 |
3135 # Some classes override linkrev to take filtered revs into | 2444 # Some classes override linkrev to take filtered revs into |