Mercurial > public > mercurial-scm > hg
comparison mercurial/revlog.py @ 51105:0250e45040f1
revlog: add a small cache of unfiltered chunk
This can provide a massive boost to the reading of multiple revisions and the
computation of a valid delta chain.
This greatly helps operations like `hg log --patch`, delta computation (helping
pull/unbundle), and linkrev adjustment (helping copy tracing).
A first round of benchmarks for `hg log --patch --limit 1000` shows improvements
in the 10-20% range on "small" repositories like pypy or mercurial, and large
improvements (about 33%) for more complex ones like netbeans and mozilla's.
These speed-ups are consistent with the improvement to `hg pull` (from a server
sending poor deltas) I saw benchmarking this last year. Further benchmarks will
be run during the freeze.
I added some configuration in the experimental space to be able to further test
the effect of various tuning for now. This feature should fit well in the
"usage/resource profile" configuration that we should land next cycle.
When it does not provide a benefit, the overhead of the cache seems to be around
2%, a small price for the big improvement. In addition I believe we could shave
most of this overhead with a more efficient LRU implementation.
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Fri, 27 Oct 2023 08:54:41 +0200 |
parents | c2d2e5b65def |
children | 98910135a3bc |
comparison
equal
deleted
inserted
replaced
51104:c2d2e5b65def | 51105:0250e45040f1 |
---|---|
293 # how much data is large | 293 # how much data is large |
294 mmap_index_threshold = attr.ib(default=None) | 294 mmap_index_threshold = attr.ib(default=None) |
295 # How much data to read and cache into the raw revlog data cache. | 295 # How much data to read and cache into the raw revlog data cache. |
296 chunk_cache_size = attr.ib(default=65536) | 296 chunk_cache_size = attr.ib(default=65536) |
297 | 297 |
298 # The size of the uncompressed cache compared to the largest revision seen. | |
299 uncompressed_cache_factor = attr.ib(default=None) | |
300 | |
301 # The number of chunk cached | |
302 uncompressed_cache_count = attr.ib(default=None) | |
303 | |
298 # Allow sparse reading of the revlog data | 304 # Allow sparse reading of the revlog data |
299 with_sparse_read = attr.ib(default=False) | 305 with_sparse_read = attr.ib(default=False) |
300 # minimal density of a sparse read chunk | 306 # minimal density of a sparse read chunk |
301 sr_density_threshold = attr.ib(default=0.50) | 307 sr_density_threshold = attr.ib(default=0.50) |
302 # minimal size of data we skip when performing sparse read | 308 # minimal size of data we skip when performing sparse read |
394 # revlog header -> revlog compressor | 400 # revlog header -> revlog compressor |
395 self._decompressors = {} | 401 self._decompressors = {} |
396 # 3-tuple of (node, rev, text) for a raw revision. | 402 # 3-tuple of (node, rev, text) for a raw revision. |
397 self._revisioncache = None | 403 self._revisioncache = None |
398 | 404 |
405 # cache some uncompressed chunks | |
406 # rev → uncompressed_chunk | |
407 # | |
408 # the max cost is dynamically updated to be proportionnal to the | |
409 # size of revision we actually encounter. | |
410 self._uncompressed_chunk_cache = None | |
411 if self.data_config.uncompressed_cache_factor is not None: | |
412 self._uncompressed_chunk_cache = util.lrucachedict( | |
413 self.data_config.uncompressed_cache_count, | |
414 maxcost=65536, # some arbitrary initial value | |
415 ) | |
416 | |
399 self._delay_buffer = None | 417 self._delay_buffer = None |
400 | 418 |
401 @property | 419 @property |
402 def index_file(self): | 420 def index_file(self): |
403 return self.__index_file | 421 return self.__index_file |
412 return len(self.index) | 430 return len(self.index) |
413 | 431 |
414 def clear_cache(self): | 432 def clear_cache(self): |
415 assert not self.is_delaying | 433 assert not self.is_delaying |
416 self._revisioncache = None | 434 self._revisioncache = None |
435 if self._uncompressed_chunk_cache is not None: | |
436 self._uncompressed_chunk_cache.clear() | |
417 self._segmentfile.clear_cache() | 437 self._segmentfile.clear_cache() |
418 self._segmentfile_sidedata.clear_cache() | 438 self._segmentfile_sidedata.clear_cache() |
419 | 439 |
420 @property | 440 @property |
421 def canonical_index_file(self): | 441 def canonical_index_file(self): |
863 to be used for reading. If used, the seek position of the file will not | 883 to be used for reading. If used, the seek position of the file will not |
864 be preserved. | 884 be preserved. |
865 | 885 |
866 Returns a str holding uncompressed data for the requested revision. | 886 Returns a str holding uncompressed data for the requested revision. |
867 """ | 887 """ |
888 if self._uncompressed_chunk_cache is not None: | |
889 uncomp = self._uncompressed_chunk_cache.get(rev) | |
890 if uncomp is not None: | |
891 return uncomp | |
892 | |
868 compression_mode = self.index[rev][10] | 893 compression_mode = self.index[rev][10] |
869 data = self.get_segment_for_revs(rev, rev)[1] | 894 data = self.get_segment_for_revs(rev, rev)[1] |
870 if compression_mode == COMP_MODE_PLAIN: | 895 if compression_mode == COMP_MODE_PLAIN: |
871 return data | 896 uncomp = data |
872 elif compression_mode == COMP_MODE_DEFAULT: | 897 elif compression_mode == COMP_MODE_DEFAULT: |
873 return self._decompressor(data) | 898 uncomp = self._decompressor(data) |
874 elif compression_mode == COMP_MODE_INLINE: | 899 elif compression_mode == COMP_MODE_INLINE: |
875 return self.decompress(data) | 900 uncomp = self.decompress(data) |
876 else: | 901 else: |
877 msg = b'unknown compression mode %d' | 902 msg = b'unknown compression mode %d' |
878 msg %= compression_mode | 903 msg %= compression_mode |
879 raise error.RevlogError(msg) | 904 raise error.RevlogError(msg) |
905 if self._uncompressed_chunk_cache is not None: | |
906 self._uncompressed_chunk_cache.insert(rev, uncomp, cost=len(uncomp)) | |
907 return uncomp | |
880 | 908 |
881 def _chunks(self, revs, targetsize=None): | 909 def _chunks(self, revs, targetsize=None): |
882 """Obtain decompressed chunks for the specified revisions. | 910 """Obtain decompressed chunks for the specified revisions. |
883 | 911 |
884 Accepts an iterable of numeric revisions that are assumed to be in | 912 Accepts an iterable of numeric revisions that are assumed to be in |
897 length = self.length | 925 length = self.length |
898 inline = self.inline | 926 inline = self.inline |
899 iosize = self.index.entry_size | 927 iosize = self.index.entry_size |
900 buffer = util.buffer | 928 buffer = util.buffer |
901 | 929 |
902 l = [] | 930 fetched_revs = [] |
903 ladd = l.append | 931 fadd = fetched_revs.append |
932 | |
904 chunks = [] | 933 chunks = [] |
905 ladd = chunks.append | 934 ladd = chunks.append |
906 | 935 |
907 if not self.data_config.with_sparse_read: | 936 if self._uncompressed_chunk_cache is None: |
908 slicedchunks = (revs,) | 937 fetched_revs = revs |
938 else: | |
939 for rev in revs: | |
940 cached_value = self._uncompressed_chunk_cache.get(rev) | |
941 if cached_value is None: | |
942 fadd(rev) | |
943 else: | |
944 ladd((rev, cached_value)) | |
945 | |
946 if not fetched_revs: | |
947 slicedchunks = () | |
948 elif not self.data_config.with_sparse_read: | |
949 slicedchunks = (fetched_revs,) | |
909 else: | 950 else: |
910 slicedchunks = deltautil.slicechunk( | 951 slicedchunks = deltautil.slicechunk( |
911 self, | 952 self, |
912 revs, | 953 fetched_revs, |
913 targetsize=targetsize, | 954 targetsize=targetsize, |
914 ) | 955 ) |
915 | 956 |
916 for revschunk in slicedchunks: | 957 for revschunk in slicedchunks: |
917 firstrev = revschunk[0] | 958 firstrev = revschunk[0] |
947 else: | 988 else: |
948 msg = b'unknown compression mode %d' | 989 msg = b'unknown compression mode %d' |
949 msg %= comp_mode | 990 msg %= comp_mode |
950 raise error.RevlogError(msg) | 991 raise error.RevlogError(msg) |
951 ladd((rev, c)) | 992 ladd((rev, c)) |
952 | 993 if self._uncompressed_chunk_cache is not None: |
994 self._uncompressed_chunk_cache.insert(rev, c, len(c)) | |
995 | |
996 chunks.sort() | |
953 return [x[1] for x in chunks] | 997 return [x[1] for x in chunks] |
954 | 998 |
955 def raw_text(self, node, rev): | 999 def raw_text(self, node, rev): |
956 """return the possibly unvalidated rawtext for a revision | 1000 """return the possibly unvalidated rawtext for a revision |
957 | 1001 |
978 | 1022 |
979 targetsize = None | 1023 targetsize = None |
980 rawsize = self.index[rev][2] | 1024 rawsize = self.index[rev][2] |
981 if 0 <= rawsize: | 1025 if 0 <= rawsize: |
982 targetsize = 4 * rawsize | 1026 targetsize = 4 * rawsize |
1027 | |
1028 if self._uncompressed_chunk_cache is not None: | |
1029 # dynamically update the uncompressed_chunk_cache size to the | |
1030 # largest revision we saw in this revlog. | |
1031 factor = self.data_config.uncompressed_cache_factor | |
1032 candidate_size = rawsize * factor | |
1033 if candidate_size > self._uncompressed_chunk_cache.maxcost: | |
1034 self._uncompressed_chunk_cache.maxcost = candidate_size | |
983 | 1035 |
984 bins = self._chunks(chain, targetsize=targetsize) | 1036 bins = self._chunks(chain, targetsize=targetsize) |
985 if basetext is None: | 1037 if basetext is None: |
986 basetext = bytes(bins[0]) | 1038 basetext = bytes(bins[0]) |
987 bins = bins[1:] | 1039 bins = bins[1:] |