comparison mercurial/branchmap.py @ 51372:02e7d79edf62

branchmap: use mmap for faster revbranchcache loading A typical revbranchmap usage is: - load the entire revbranchmap file into memory - maybe do a few lookups - add a few bytes to it - write the addition to disk There's no reason to load the entire revbranchmap into memory. We can split it into a large immutable prefix and a mutable suffix, and then memorymap the prefix, thus saving all the useless loading. Benchmarking on some real-world pushes suggests that out of ~100s server-side push handling revbranchcache handling is responsible for: * ~7s with no change * ~1.3s with the change, without mmap * 0.04s with the change, with mmap
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Wed, 10 Jan 2024 18:58:42 +0000
parents f4a0806081f2
children 40943970b7ae 7f7086a42b2b
comparison
equal deleted inserted replaced
51371:54a75576287a 51372:02e7d79edf62
619 _rbcnodelen = 4 619 _rbcnodelen = 4
620 _rbcbranchidxmask = 0x7FFFFFFF 620 _rbcbranchidxmask = 0x7FFFFFFF
621 _rbccloseflag = 0x80000000 621 _rbccloseflag = 0x80000000
622 622
623 623
624 class rbcrevs:
625 """a byte string consisting of an immutable prefix followed by a mutable suffix"""
626
627 def __init__(self, revs):
628 self._prefix = revs
629 self._rest = bytearray()
630
631 def __len__(self):
632 return len(self._prefix) + len(self._rest)
633
634 def unpack_record(self, rbcrevidx):
635 if rbcrevidx < len(self._prefix):
636 return unpack_from(_rbcrecfmt, util.buffer(self._prefix), rbcrevidx)
637 else:
638 return unpack_from(
639 _rbcrecfmt,
640 util.buffer(self._rest),
641 rbcrevidx - len(self._prefix),
642 )
643
644 def make_mutable(self):
645 if len(self._prefix) > 0:
646 entirety = bytearray()
647 entirety[:] = self._prefix
648 entirety.extend(self._rest)
649 self._rest = entirety
650 self._prefix = bytearray()
651
652 def truncate(self, pos):
653 self.make_mutable()
654 del self._rest[pos:]
655
656 def pack_into(self, rbcrevidx, node, branchidx):
657 if rbcrevidx < len(self._prefix):
658 self.make_mutable()
659 buf = self._rest
660 start_offset = rbcrevidx - len(self._prefix)
661 end_offset = start_offset + _rbcrecsize
662
663 if len(self._rest) < end_offset:
664 # bytearray doesn't allocate extra space at least in Python 3.7.
665 # When multiple changesets are added in a row, precise resize would
666 # result in quadratic complexity. Overallocate to compensate by
667 # using the classic doubling technique for dynamic arrays instead.
668 # If there was a gap in the map before, less space will be reserved.
669 self._rest.extend(b'\0' * end_offset)
670 return pack_into(
671 _rbcrecfmt,
672 buf,
673 start_offset,
674 node,
675 branchidx,
676 )
677
678 def extend(self, extension):
679 return self._rest.extend(extension)
680
681 def slice(self, begin, end):
682 if begin < len(self._prefix):
683 acc = bytearray()
684 acc[:] = self._prefix[begin:end]
685 acc.extend(
686 self._rest[begin - len(self._prefix) : end - len(self._prefix)]
687 )
688 return acc
689 return self._rest[begin - len(self._prefix) : end - len(self._prefix)]
690
691
624 class revbranchcache: 692 class revbranchcache:
625 """Persistent cache, mapping from revision number to branch name and close. 693 """Persistent cache, mapping from revision number to branch name and close.
626 This is a low level cache, independent of filtering. 694 This is a low level cache, independent of filtering.
627 695
628 Branch names are stored in rbc-names in internal encoding separated by 0. 696 Branch names are stored in rbc-names in internal encoding separated by 0.
646 714
647 def __init__(self, repo, readonly=True): 715 def __init__(self, repo, readonly=True):
648 assert repo.filtername is None 716 assert repo.filtername is None
649 self._repo = repo 717 self._repo = repo
650 self._names = [] # branch names in local encoding with static index 718 self._names = [] # branch names in local encoding with static index
651 self._rbcrevs = bytearray() 719 self._rbcrevs = rbcrevs(bytearray())
652 self._rbcsnameslen = 0 # length of names read at _rbcsnameslen 720 self._rbcsnameslen = 0 # length of names read at _rbcsnameslen
653 try: 721 try:
654 bndata = repo.cachevfs.read(_rbcnames) 722 bndata = repo.cachevfs.read(_rbcnames)
655 self._rbcsnameslen = len(bndata) # for verification before writing 723 self._rbcsnameslen = len(bndata) # for verification before writing
656 if bndata: 724 if bndata:
662 # don't try to use cache - fall back to the slow path 730 # don't try to use cache - fall back to the slow path
663 self.branchinfo = self._branchinfo 731 self.branchinfo = self._branchinfo
664 732
665 if self._names: 733 if self._names:
666 try: 734 try:
667 data = repo.cachevfs.read(_rbcrevs) 735 if repo.ui.configbool(b'format', b'mmap-revbranchcache'):
668 self._rbcrevs[:] = data 736 with repo.cachevfs(_rbcrevs) as fp:
737 data = util.buffer(util.mmapread(fp))
738 else:
739 data = repo.cachevfs.read(_rbcrevs)
740 self._rbcrevs = rbcrevs(data)
669 except (IOError, OSError) as inst: 741 except (IOError, OSError) as inst:
670 repo.ui.debug( 742 repo.ui.debug(
671 b"couldn't read revision branch cache: %s\n" 743 b"couldn't read revision branch cache: %s\n"
672 % stringutil.forcebytestr(inst) 744 % stringutil.forcebytestr(inst)
673 ) 745 )
683 def _clear(self): 755 def _clear(self):
684 self._rbcsnameslen = 0 756 self._rbcsnameslen = 0
685 del self._names[:] 757 del self._names[:]
686 self._rbcnamescount = 0 758 self._rbcnamescount = 0
687 self._rbcrevslen = len(self._repo.changelog) 759 self._rbcrevslen = len(self._repo.changelog)
688 self._rbcrevs = bytearray(self._rbcrevslen * _rbcrecsize) 760 self._rbcrevs = rbcrevs(bytearray(self._rbcrevslen * _rbcrecsize))
689 util.clearcachedproperty(self, b'_namesreverse') 761 util.clearcachedproperty(self, b'_namesreverse')
690 762
691 @util.propertycache 763 @util.propertycache
692 def _namesreverse(self): 764 def _namesreverse(self):
693 return {b: r for r, b in enumerate(self._names)} 765 return {b: r for r, b in enumerate(self._names)}
706 if len(self._rbcrevs) < rbcrevidx + _rbcrecsize: 778 if len(self._rbcrevs) < rbcrevidx + _rbcrecsize:
707 return self._branchinfo(rev) 779 return self._branchinfo(rev)
708 780
709 # fast path: extract data from cache, use it if node is matching 781 # fast path: extract data from cache, use it if node is matching
710 reponode = changelog.node(rev)[:_rbcnodelen] 782 reponode = changelog.node(rev)[:_rbcnodelen]
711 cachenode, branchidx = unpack_from( 783 cachenode, branchidx = self._rbcrevs.unpack_record(rbcrevidx)
712 _rbcrecfmt, util.buffer(self._rbcrevs), rbcrevidx
713 )
714 close = bool(branchidx & _rbccloseflag) 784 close = bool(branchidx & _rbccloseflag)
715 if close: 785 if close:
716 branchidx &= _rbcbranchidxmask 786 branchidx &= _rbcbranchidxmask
717 if cachenode == b'\0\0\0\0': 787 if cachenode == b'\0\0\0\0':
718 pass 788 pass
731 self._repo.ui.debug( 801 self._repo.ui.debug(
732 b"history modification detected - truncating " 802 b"history modification detected - truncating "
733 b"revision branch cache to revision %d\n" % rev 803 b"revision branch cache to revision %d\n" % rev
734 ) 804 )
735 truncate = rbcrevidx + _rbcrecsize 805 truncate = rbcrevidx + _rbcrecsize
736 del self._rbcrevs[truncate:] 806 self._rbcrevs.truncate(truncate)
737 self._rbcrevslen = min(self._rbcrevslen, truncate) 807 self._rbcrevslen = min(self._rbcrevslen, truncate)
738 808
739 # fall back to slow path and make sure it will be written to disk 809 # fall back to slow path and make sure it will be written to disk
740 return self._branchinfo(rev) 810 return self._branchinfo(rev)
741 811
780 def _setcachedata(self, rev, node, branchidx): 850 def _setcachedata(self, rev, node, branchidx):
781 """Writes the node's branch data to the in-memory cache data.""" 851 """Writes the node's branch data to the in-memory cache data."""
782 if rev == nullrev: 852 if rev == nullrev:
783 return 853 return
784 rbcrevidx = rev * _rbcrecsize 854 rbcrevidx = rev * _rbcrecsize
785 requiredsize = rbcrevidx + _rbcrecsize 855 self._rbcrevs.pack_into(rbcrevidx, node, branchidx)
786 rbccur = len(self._rbcrevs)
787 if rbccur < requiredsize:
788 # bytearray doesn't allocate extra space at least in Python 3.7.
789 # When multiple changesets are added in a row, precise resize would
790 # result in quadratic complexity. Overallocate to compensate by
791 # use the classic doubling technique for dynamic arrays instead.
792 # If there was a gap in the map before, less space will be reserved.
793 self._rbcrevs.extend(b'\0' * max(_rbcmininc, requiredsize))
794 pack_into(_rbcrecfmt, self._rbcrevs, rbcrevidx, node, branchidx)
795 self._rbcrevslen = min(self._rbcrevslen, rev) 856 self._rbcrevslen = min(self._rbcrevslen, rev)
796 857
797 tr = self._repo.currenttransaction() 858 tr = self._repo.currenttransaction()
798 if tr: 859 if tr:
799 tr.addfinalize(b'write-revbranchcache', self.write) 860 tr.addfinalize(b'write-revbranchcache', self.write)
864 if f.tell() != start: 925 if f.tell() != start:
865 start = 0 926 start = 0
866 f.seek(start) 927 f.seek(start)
867 f.truncate() 928 f.truncate()
868 end = revs * _rbcrecsize 929 end = revs * _rbcrecsize
869 f.write(self._rbcrevs[start:end]) 930 f.write(self._rbcrevs.slice(start, end))
870 self._rbcrevslen = revs 931 self._rbcrevslen = revs