comparison mercurial/revlog.py @ 51105:0250e45040f1

revlog: add a small cache of unfiltered chunks. This can provide a massive boost to the reading of multiple revisions and the computation of a valid delta chain. This greatly helps operations like `hg log --patch`, delta computation (helping pull/unbundle), and linkrev adjustment (helping copy tracing). A first round of benchmarks for `hg log --patch --limit 1000` shows improvements in the 10-20% range on "small" repositories like pypy or mercurial, and large improvements (about 33%) for more complex ones like netbeans and mozilla's. These speed-ups are consistent with the improvement to `hg pull` (from a server sending poor deltas) that I saw when benchmarking this last year. Further benchmarks will be run during the freeze. I added some configuration in the experimental space to be able to further test the effect of various tunings for now. This feature should fit well in the "usage/resource profile" configuration that we should land next cycle. When it does not provide a benefit, the overhead of the cache seems to be around 2%, a small price for the big improvement. In addition, I believe we could shave most of this overhead with a more efficient lru implementation.
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Fri, 27 Oct 2023 08:54:41 +0200
parents c2d2e5b65def
children 98910135a3bc
comparison
equal deleted inserted replaced
51104:c2d2e5b65def 51105:0250e45040f1
293 # how much data is large 293 # how much data is large
294 mmap_index_threshold = attr.ib(default=None) 294 mmap_index_threshold = attr.ib(default=None)
295 # How much data to read and cache into the raw revlog data cache. 295 # How much data to read and cache into the raw revlog data cache.
296 chunk_cache_size = attr.ib(default=65536) 296 chunk_cache_size = attr.ib(default=65536)
297 297
298 # The size of the uncompressed cache compared to the largest revision seen.
299 uncompressed_cache_factor = attr.ib(default=None)
300
301 # The number of chunk cached
302 uncompressed_cache_count = attr.ib(default=None)
303
298 # Allow sparse reading of the revlog data 304 # Allow sparse reading of the revlog data
299 with_sparse_read = attr.ib(default=False) 305 with_sparse_read = attr.ib(default=False)
300 # minimal density of a sparse read chunk 306 # minimal density of a sparse read chunk
301 sr_density_threshold = attr.ib(default=0.50) 307 sr_density_threshold = attr.ib(default=0.50)
302 # minimal size of data we skip when performing sparse read 308 # minimal size of data we skip when performing sparse read
394 # revlog header -> revlog compressor 400 # revlog header -> revlog compressor
395 self._decompressors = {} 401 self._decompressors = {}
396 # 3-tuple of (node, rev, text) for a raw revision. 402 # 3-tuple of (node, rev, text) for a raw revision.
397 self._revisioncache = None 403 self._revisioncache = None
398 404
405 # cache some uncompressed chunks
406 # rev → uncompressed_chunk
407 #
408 # the max cost is dynamically updated to be proportionnal to the
409 # size of revision we actually encounter.
410 self._uncompressed_chunk_cache = None
411 if self.data_config.uncompressed_cache_factor is not None:
412 self._uncompressed_chunk_cache = util.lrucachedict(
413 self.data_config.uncompressed_cache_count,
414 maxcost=65536, # some arbitrary initial value
415 )
416
399 self._delay_buffer = None 417 self._delay_buffer = None
400 418
401 @property 419 @property
402 def index_file(self): 420 def index_file(self):
403 return self.__index_file 421 return self.__index_file
412 return len(self.index) 430 return len(self.index)
413 431
414 def clear_cache(self): 432 def clear_cache(self):
415 assert not self.is_delaying 433 assert not self.is_delaying
416 self._revisioncache = None 434 self._revisioncache = None
435 if self._uncompressed_chunk_cache is not None:
436 self._uncompressed_chunk_cache.clear()
417 self._segmentfile.clear_cache() 437 self._segmentfile.clear_cache()
418 self._segmentfile_sidedata.clear_cache() 438 self._segmentfile_sidedata.clear_cache()
419 439
420 @property 440 @property
421 def canonical_index_file(self): 441 def canonical_index_file(self):
863 to be used for reading. If used, the seek position of the file will not 883 to be used for reading. If used, the seek position of the file will not
864 be preserved. 884 be preserved.
865 885
866 Returns a str holding uncompressed data for the requested revision. 886 Returns a str holding uncompressed data for the requested revision.
867 """ 887 """
888 if self._uncompressed_chunk_cache is not None:
889 uncomp = self._uncompressed_chunk_cache.get(rev)
890 if uncomp is not None:
891 return uncomp
892
868 compression_mode = self.index[rev][10] 893 compression_mode = self.index[rev][10]
869 data = self.get_segment_for_revs(rev, rev)[1] 894 data = self.get_segment_for_revs(rev, rev)[1]
870 if compression_mode == COMP_MODE_PLAIN: 895 if compression_mode == COMP_MODE_PLAIN:
871 return data 896 uncomp = data
872 elif compression_mode == COMP_MODE_DEFAULT: 897 elif compression_mode == COMP_MODE_DEFAULT:
873 return self._decompressor(data) 898 uncomp = self._decompressor(data)
874 elif compression_mode == COMP_MODE_INLINE: 899 elif compression_mode == COMP_MODE_INLINE:
875 return self.decompress(data) 900 uncomp = self.decompress(data)
876 else: 901 else:
877 msg = b'unknown compression mode %d' 902 msg = b'unknown compression mode %d'
878 msg %= compression_mode 903 msg %= compression_mode
879 raise error.RevlogError(msg) 904 raise error.RevlogError(msg)
905 if self._uncompressed_chunk_cache is not None:
906 self._uncompressed_chunk_cache.insert(rev, uncomp, cost=len(uncomp))
907 return uncomp
880 908
881 def _chunks(self, revs, targetsize=None): 909 def _chunks(self, revs, targetsize=None):
882 """Obtain decompressed chunks for the specified revisions. 910 """Obtain decompressed chunks for the specified revisions.
883 911
884 Accepts an iterable of numeric revisions that are assumed to be in 912 Accepts an iterable of numeric revisions that are assumed to be in
897 length = self.length 925 length = self.length
898 inline = self.inline 926 inline = self.inline
899 iosize = self.index.entry_size 927 iosize = self.index.entry_size
900 buffer = util.buffer 928 buffer = util.buffer
901 929
902 l = [] 930 fetched_revs = []
903 ladd = l.append 931 fadd = fetched_revs.append
932
904 chunks = [] 933 chunks = []
905 ladd = chunks.append 934 ladd = chunks.append
906 935
907 if not self.data_config.with_sparse_read: 936 if self._uncompressed_chunk_cache is None:
908 slicedchunks = (revs,) 937 fetched_revs = revs
938 else:
939 for rev in revs:
940 cached_value = self._uncompressed_chunk_cache.get(rev)
941 if cached_value is None:
942 fadd(rev)
943 else:
944 ladd((rev, cached_value))
945
946 if not fetched_revs:
947 slicedchunks = ()
948 elif not self.data_config.with_sparse_read:
949 slicedchunks = (fetched_revs,)
909 else: 950 else:
910 slicedchunks = deltautil.slicechunk( 951 slicedchunks = deltautil.slicechunk(
911 self, 952 self,
912 revs, 953 fetched_revs,
913 targetsize=targetsize, 954 targetsize=targetsize,
914 ) 955 )
915 956
916 for revschunk in slicedchunks: 957 for revschunk in slicedchunks:
917 firstrev = revschunk[0] 958 firstrev = revschunk[0]
947 else: 988 else:
948 msg = b'unknown compression mode %d' 989 msg = b'unknown compression mode %d'
949 msg %= comp_mode 990 msg %= comp_mode
950 raise error.RevlogError(msg) 991 raise error.RevlogError(msg)
951 ladd((rev, c)) 992 ladd((rev, c))
952 993 if self._uncompressed_chunk_cache is not None:
994 self._uncompressed_chunk_cache.insert(rev, c, len(c))
995
996 chunks.sort()
953 return [x[1] for x in chunks] 997 return [x[1] for x in chunks]
954 998
955 def raw_text(self, node, rev): 999 def raw_text(self, node, rev):
956 """return the possibly unvalidated rawtext for a revision 1000 """return the possibly unvalidated rawtext for a revision
957 1001
978 1022
979 targetsize = None 1023 targetsize = None
980 rawsize = self.index[rev][2] 1024 rawsize = self.index[rev][2]
981 if 0 <= rawsize: 1025 if 0 <= rawsize:
982 targetsize = 4 * rawsize 1026 targetsize = 4 * rawsize
1027
1028 if self._uncompressed_chunk_cache is not None:
1029 # dynamically update the uncompressed_chunk_cache size to the
1030 # largest revision we saw in this revlog.
1031 factor = self.data_config.uncompressed_cache_factor
1032 candidate_size = rawsize * factor
1033 if candidate_size > self._uncompressed_chunk_cache.maxcost:
1034 self._uncompressed_chunk_cache.maxcost = candidate_size
983 1035
984 bins = self._chunks(chain, targetsize=targetsize) 1036 bins = self._chunks(chain, targetsize=targetsize)
985 if basetext is None: 1037 if basetext is None:
986 basetext = bytes(bins[0]) 1038 basetext = bytes(bins[0])
987 bins = bins[1:] 1039 bins = bins[1:]