comparison mercurial/revlog.py @ 51105:0250e45040f1

revlog: add a small cache of unfiltered chunks. This can provide a massive boost to the reading of multiple revisions and the computation of a valid delta chain. This greatly helps operations like `hg log --patch`, delta computation (helping pull/unbundle), and linkrev adjustment (helping copy tracing). A first round of benchmarks for `hg log --patch --limit 1000` shows improvements in the 10-20% range on "small" repositories like pypy or mercurial, and large improvements (about 33%) for more complex ones like netbeans and mozilla's. These speed-ups are consistent with the improvement to `hg pull` (from a server sending poor deltas) that I saw when benchmarking this last year. Further benchmarks will be run during the freeze. I added some configuration in the experimental space to be able to further test the effect of various tunings for now. This feature should fit well in the "usage/resource profile" configuration that we should land next cycle. When it does not provide a benefit, the overhead of the cache seems to be around 2%, a small price for the big improvement. In addition, I believe we could shave most of this overhead with a more efficient lru implementation.
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Fri, 27 Oct 2023 08:54:41 +0200
parents c2d2e5b65def
children 98910135a3bc
comparison
equal deleted inserted replaced
51104:c2d2e5b65def 51105:0250e45040f1
293 # how much data is large 293 # how much data is large
294 mmap_index_threshold = attr.ib(default=None) 294 mmap_index_threshold = attr.ib(default=None)
295 # How much data to read and cache into the raw revlog data cache. 295 # How much data to read and cache into the raw revlog data cache.
296 chunk_cache_size = attr.ib(default=65536) 296 chunk_cache_size = attr.ib(default=65536)
297 297
298 # The size of the uncompressed cache compared to the largest revision seen.
299 uncompressed_cache_factor = attr.ib(default=None)
300
301 # The number of chunk cached
302 uncompressed_cache_count = attr.ib(default=None)
303
298 # Allow sparse reading of the revlog data 304 # Allow sparse reading of the revlog data
299 with_sparse_read = attr.ib(default=False) 305 with_sparse_read = attr.ib(default=False)
300 # minimal density of a sparse read chunk 306 # minimal density of a sparse read chunk
301 sr_density_threshold = attr.ib(default=0.50) 307 sr_density_threshold = attr.ib(default=0.50)
302 # minimal size of data we skip when performing sparse read 308 # minimal size of data we skip when performing sparse read
394 # revlog header -> revlog compressor 400 # revlog header -> revlog compressor
395 self._decompressors = {} 401 self._decompressors = {}
396 # 3-tuple of (node, rev, text) for a raw revision. 402 # 3-tuple of (node, rev, text) for a raw revision.
397 self._revisioncache = None 403 self._revisioncache = None
398 404
405 # cache some uncompressed chunks
406 # rev → uncompressed_chunk
407 #
408 # the max cost is dynamically updated to be proportionnal to the
409 # size of revision we actually encounter.
410 self._uncompressed_chunk_cache = None
411 if self.data_config.uncompressed_cache_factor is not None:
412 self._uncompressed_chunk_cache = util.lrucachedict(
413 self.data_config.uncompressed_cache_count,
414 maxcost=65536, # some arbitrary initial value
415 )
416
399 self._delay_buffer = None 417 self._delay_buffer = None
400 418
401 @property 419 @property
402 def index_file(self): 420 def index_file(self):
403 return self.__index_file 421 return self.__index_file
412 return len(self.index) 430 return len(self.index)
413 431
414 def clear_cache(self): 432 def clear_cache(self):
415 assert not self.is_delaying 433 assert not self.is_delaying
416 self._revisioncache = None 434 self._revisioncache = None
435 if self._uncompressed_chunk_cache is not None:
436 self._uncompressed_chunk_cache.clear()
417 self._segmentfile.clear_cache() 437 self._segmentfile.clear_cache()
418 self._segmentfile_sidedata.clear_cache() 438 self._segmentfile_sidedata.clear_cache()
419 439
420 @property 440 @property
421 def canonical_index_file(self): 441 def canonical_index_file(self):
863 to be used for reading. If used, the seek position of the file will not 883 to be used for reading. If used, the seek position of the file will not
864 be preserved. 884 be preserved.
865 885
866 Returns a str holding uncompressed data for the requested revision. 886 Returns a str holding uncompressed data for the requested revision.
867 """ 887 """
888 if self._uncompressed_chunk_cache is not None:
889 uncomp = self._uncompressed_chunk_cache.get(rev)
890 if uncomp is not None:
891 return uncomp
892
868 compression_mode = self.index[rev][10] 893 compression_mode = self.index[rev][10]
869 data = self.get_segment_for_revs(rev, rev)[1] 894 data = self.get_segment_for_revs(rev, rev)[1]
870 if compression_mode == COMP_MODE_PLAIN: 895 if compression_mode == COMP_MODE_PLAIN:
871 return data 896 uncomp = data
872 elif compression_mode == COMP_MODE_DEFAULT: 897 elif compression_mode == COMP_MODE_DEFAULT:
873 return self._decompressor(data) 898 uncomp = self._decompressor(data)
874 elif compression_mode == COMP_MODE_INLINE: 899 elif compression_mode == COMP_MODE_INLINE:
875 return self.decompress(data) 900 uncomp = self.decompress(data)
876 else: 901 else:
877 msg = b'unknown compression mode %d' 902 msg = b'unknown compression mode %d'
878 msg %= compression_mode 903 msg %= compression_mode
879 raise error.RevlogError(msg) 904 raise error.RevlogError(msg)
905 if self._uncompressed_chunk_cache is not None:
906 self._uncompressed_chunk_cache.insert(rev, uncomp, cost=len(uncomp))
907 return uncomp
880 908
881 def _chunks(self, revs, targetsize=None): 909 def _chunks(self, revs, targetsize=None):
882 """Obtain decompressed chunks for the specified revisions. 910 """Obtain decompressed chunks for the specified revisions.
883 911
884 Accepts an iterable of numeric revisions that are assumed to be in 912 Accepts an iterable of numeric revisions that are assumed to be in
897 length = self.length 925 length = self.length
898 inline = self.inline 926 inline = self.inline
899 iosize = self.index.entry_size 927 iosize = self.index.entry_size
900 buffer = util.buffer 928 buffer = util.buffer
901 929
902 l = [] 930 fetched_revs = []
903 ladd = l.append 931 fadd = fetched_revs.append
932
904 chunks = [] 933 chunks = []
905 ladd = chunks.append 934 ladd = chunks.append
906 935
907 if not self.data_config.with_sparse_read: 936 if self._uncompressed_chunk_cache is None:
908 slicedchunks = (revs,) 937 fetched_revs = revs
938 else:
939 for rev in revs:
940 cached_value = self._uncompressed_chunk_cache.get(rev)
941 if cached_value is None:
942 fadd(rev)
943 else:
944 ladd((rev, cached_value))
945
946 if not fetched_revs:
947 slicedchunks = ()
948 elif not self.data_config.with_sparse_read:
949 slicedchunks = (fetched_revs,)
909 else: 950 else:
910 slicedchunks = deltautil.slicechunk( 951 slicedchunks = deltautil.slicechunk(
911 self, 952 self,
912 revs, 953 fetched_revs,
913 targetsize=targetsize, 954 targetsize=targetsize,
914 ) 955 )
915 956
916 for revschunk in slicedchunks: 957 for revschunk in slicedchunks:
917 firstrev = revschunk[0] 958 firstrev = revschunk[0]
947 else: 988 else:
948 msg = b'unknown compression mode %d' 989 msg = b'unknown compression mode %d'
949 msg %= comp_mode 990 msg %= comp_mode
950 raise error.RevlogError(msg) 991 raise error.RevlogError(msg)
951 ladd((rev, c)) 992 ladd((rev, c))
952 993 if self._uncompressed_chunk_cache is not None:
994 self._uncompressed_chunk_cache.insert(rev, c, len(c))
995
996 chunks.sort()
953 return [x[1] for x in chunks] 997 return [x[1] for x in chunks]
954 998
955 def raw_text(self, node, rev): 999 def raw_text(self, node, rev):
956 """return the possibly unvalidated rawtext for a revision 1000 """return the possibly unvalidated rawtext for a revision
957 1001
978 1022
979 targetsize = None 1023 targetsize = None
980 rawsize = self.index[rev][2] 1024 rawsize = self.index[rev][2]
981 if 0 <= rawsize: 1025 if 0 <= rawsize:
982 targetsize = 4 * rawsize 1026 targetsize = 4 * rawsize
1027
1028 if self._uncompressed_chunk_cache is not None:
1029 # dynamically update the uncompressed_chunk_cache size to the
1030 # largest revision we saw in this revlog.
1031 factor = self.data_config.uncompressed_cache_factor
1032 candidate_size = rawsize * factor
1033 if candidate_size > self._uncompressed_chunk_cache.maxcost:
1034 self._uncompressed_chunk_cache.maxcost = candidate_size
983 1035
984 bins = self._chunks(chain, targetsize=targetsize) 1036 bins = self._chunks(chain, targetsize=targetsize)
985 if basetext is None: 1037 if basetext is None:
986 basetext = bytes(bins[0]) 1038 basetext = bytes(bins[0])
987 bins = bins[1:] 1039 bins = bins[1:]