comparison mercurial/revlog.py @ 47425:e0a314bcbc9d

revlog: Extract low-level random-access file read caching logic The `revlog` class does many things, among which fulfilling requests for arbitrary byte slices from the revlog "data file" by reading a larger chunk and caching it in memory, in order to reduce the number of system calls. This extracts that logic into a new class, so that it may later also be used for the side-data file (with another instance of that class). The copyright notice of the new file does not include a date or author name since such information tends not to be kept up-to-date: https://www.linuxfoundation.org/en/blog/copyright-notices-in-open-source-software-projects/ Differential Revision: https://phab.mercurial-scm.org/D10878
author Simon Sapin <simon.sapin@octobus.net>
date Tue, 08 Jun 2021 19:55:00 +0200
parents 5fbac82a8780
children cac0e0621ceb
comparison
equal deleted inserted replaced
47424:f77404040776 47425:e0a314bcbc9d
84 censor, 84 censor,
85 deltas as deltautil, 85 deltas as deltautil,
86 docket as docketutil, 86 docket as docketutil,
87 flagutil, 87 flagutil,
88 nodemap as nodemaputil, 88 nodemap as nodemaputil,
89 randomaccessfile,
89 revlogv0, 90 revlogv0,
90 sidedata as sidedatautil, 91 sidedata as sidedatautil,
91 ) 92 )
92 from .utils import ( 93 from .utils import (
93 storageutil, 94 storageutil,
123 # Aliased for performance. 124 # Aliased for performance.
124 _zlibdecompress = zlib.decompress 125 _zlibdecompress = zlib.decompress
125 126
126 # max size of revlog with inline data 127 # max size of revlog with inline data
127 _maxinline = 131072 128 _maxinline = 131072
128 _chunksize = 1048576
129 129
130 # Flag processors for REVIDX_ELLIPSIS. 130 # Flag processors for REVIDX_ELLIPSIS.
131 def ellipsisreadprocessor(rl, text): 131 def ellipsisreadprocessor(rl, text):
132 return text, False 132 return text, False
133 133
229 229
230 230
231 # corresponds to uncompressed length of indexformatng (2 gigs, 4-byte 231 # corresponds to uncompressed length of indexformatng (2 gigs, 4-byte
232 # signed integer) 232 # signed integer)
233 _maxentrysize = 0x7FFFFFFF 233 _maxentrysize = 0x7FFFFFFF
234
235 PARTIAL_READ_MSG = _(
236 b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
237 )
238 234
239 FILE_TOO_SHORT_MSG = _( 235 FILE_TOO_SHORT_MSG = _(
240 b'cannot read from revlog %s;' 236 b'cannot read from revlog %s;'
241 b' expected %d bytes from offset %d, data size is %d' 237 b' expected %d bytes from offset %d, data size is %d'
242 ) 238 )
603 self._parse_index = parse_index_v1_nodemap 599 self._parse_index = parse_index_v1_nodemap
604 elif use_rust_index: 600 elif use_rust_index:
605 self._parse_index = parse_index_v1_mixed 601 self._parse_index = parse_index_v1_mixed
606 try: 602 try:
607 d = self._parse_index(index_data, self._inline) 603 d = self._parse_index(index_data, self._inline)
608 index, _chunkcache = d 604 index, chunkcache = d
609 use_nodemap = ( 605 use_nodemap = (
610 not self._inline 606 not self._inline
611 and self._nodemap_file is not None 607 and self._nodemap_file is not None
612 and util.safehasattr(index, 'update_nodemap_data') 608 and util.safehasattr(index, 'update_nodemap_data')
613 ) 609 )
624 index.update_nodemap_data(*nodemap_data) 620 index.update_nodemap_data(*nodemap_data)
625 except (ValueError, IndexError): 621 except (ValueError, IndexError):
626 raise error.RevlogError( 622 raise error.RevlogError(
627 _(b"index %s is corrupted") % self.display_id 623 _(b"index %s is corrupted") % self.display_id
628 ) 624 )
629 self.index, self._chunkcache = d 625 self.index = index
630 if not self._chunkcache: 626 self._segmentfile = randomaccessfile.randomaccessfile(
631 self._chunkclear() 627 self.opener,
628 (self._indexfile if self._inline else self._datafile),
629 self._chunkcachesize,
630 chunkcache,
631 )
632 # revnum -> (chain-length, sum-delta-length) 632 # revnum -> (chain-length, sum-delta-length)
633 self._chaininfocache = util.lrucachedict(500) 633 self._chaininfocache = util.lrucachedict(500)
634 # revlog header -> revlog compressor 634 # revlog header -> revlog compressor
635 self._decompressors = {} 635 self._decompressors = {}
636 636
707 def _datafp(self, mode=b'r'): 707 def _datafp(self, mode=b'r'):
708 """file object for the revlog's data file""" 708 """file object for the revlog's data file"""
709 return self.opener(self._datafile, mode=mode) 709 return self.opener(self._datafile, mode=mode)
710 710
711 @contextlib.contextmanager 711 @contextlib.contextmanager
712 def _datareadfp(self, existingfp=None):
713 """file object suitable to read data"""
714 # Use explicit file handle, if given.
715 if existingfp is not None:
716 yield existingfp
717
718 # Use a file handle being actively used for writes, if available.
719 # There is some danger to doing this because reads will seek the
720 # file. However, _writeentry() performs a SEEK_END before all writes,
721 # so we should be safe.
722 elif self._writinghandles:
723 if self._inline:
724 yield self._writinghandles[0]
725 else:
726 yield self._writinghandles[1]
727
728 # Otherwise open a new file handle.
729 else:
730 if self._inline:
731 func = self._indexfp
732 else:
733 func = self._datafp
734 with func() as fp:
735 yield fp
736
737 @contextlib.contextmanager
738 def _sidedatareadfp(self): 712 def _sidedatareadfp(self):
739 """file object suitable to read sidedata""" 713 """file object suitable to read sidedata"""
740 if self._writinghandles: 714 if self._writinghandles:
741 yield self._writinghandles[2] 715 yield self._writinghandles[2]
742 else: 716 else:
805 nodemaputil.setup_persistent_nodemap(transaction, self) 779 nodemaputil.setup_persistent_nodemap(transaction, self)
806 780
807 def clearcaches(self): 781 def clearcaches(self):
808 self._revisioncache = None 782 self._revisioncache = None
809 self._chainbasecache.clear() 783 self._chainbasecache.clear()
810 self._chunkcache = (0, b'') 784 self._segmentfile.clear_cache()
811 self._pcache = {} 785 self._pcache = {}
812 self._nodemap_docket = None 786 self._nodemap_docket = None
813 self.index.clearcaches() 787 self.index.clearcaches()
814 # The python code is the one responsible for validating the docket, we 788 # The python code is the one responsible for validating the docket, we
815 # end up having to refresh it here. 789 # end up having to refresh it here.
1627 returns True if text is different than what is stored. 1601 returns True if text is different than what is stored.
1628 """ 1602 """
1629 p1, p2 = self.parents(node) 1603 p1, p2 = self.parents(node)
1630 return storageutil.hashrevisionsha1(text, p1, p2) != node 1604 return storageutil.hashrevisionsha1(text, p1, p2) != node
1631 1605
1632 def _cachesegment(self, offset, data):
1633 """Add a segment to the revlog cache.
1634
1635 Accepts an absolute offset and the data that is at that location.
1636 """
1637 o, d = self._chunkcache
1638 # try to add to existing cache
1639 if o + len(d) == offset and len(d) + len(data) < _chunksize:
1640 self._chunkcache = o, d + data
1641 else:
1642 self._chunkcache = offset, data
1643
1644 def _readsegment(self, offset, length, df=None):
1645 """Load a segment of raw data from the revlog.
1646
1647 Accepts an absolute offset, length to read, and an optional existing
1648 file handle to read from.
1649
1650 If an existing file handle is passed, it will be seeked and the
1651 original seek position will NOT be restored.
1652
1653 Returns a str or buffer of raw byte data.
1654
1655 Raises if the requested number of bytes could not be read.
1656 """
1657 # Cache data both forward and backward around the requested
1658 # data, in a fixed size window. This helps speed up operations
1659 # involving reading the revlog backwards.
1660 cachesize = self._chunkcachesize
1661 realoffset = offset & ~(cachesize - 1)
1662 reallength = (
1663 (offset + length + cachesize) & ~(cachesize - 1)
1664 ) - realoffset
1665 with self._datareadfp(df) as df:
1666 df.seek(realoffset)
1667 d = df.read(reallength)
1668
1669 self._cachesegment(realoffset, d)
1670 if offset != realoffset or reallength != length:
1671 startoffset = offset - realoffset
1672 if len(d) - startoffset < length:
1673 filename = self._indexfile if self._inline else self._datafile
1674 got = len(d) - startoffset
1675 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1676 raise error.RevlogError(m)
1677 return util.buffer(d, startoffset, length)
1678
1679 if len(d) < length:
1680 filename = self._indexfile if self._inline else self._datafile
1681 got = len(d) - startoffset
1682 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1683 raise error.RevlogError(m)
1684
1685 return d
1686
1687 def _getsegment(self, offset, length, df=None):
1688 """Obtain a segment of raw data from the revlog.
1689
1690 Accepts an absolute offset, length of bytes to obtain, and an
1691 optional file handle to the already-opened revlog. If the file
1692 handle is used, it's original seek position will not be preserved.
1693
1694 Requests for data may be returned from a cache.
1695
1696 Returns a str or a buffer instance of raw byte data.
1697 """
1698 o, d = self._chunkcache
1699 l = len(d)
1700
1701 # is it in the cache?
1702 cachestart = offset - o
1703 cacheend = cachestart + length
1704 if cachestart >= 0 and cacheend <= l:
1705 if cachestart == 0 and cacheend == l:
1706 return d # avoid a copy
1707 return util.buffer(d, cachestart, cacheend - cachestart)
1708
1709 return self._readsegment(offset, length, df=df)
1710
1711 def _getsegmentforrevs(self, startrev, endrev, df=None): 1606 def _getsegmentforrevs(self, startrev, endrev, df=None):
1712 """Obtain a segment of raw data corresponding to a range of revisions. 1607 """Obtain a segment of raw data corresponding to a range of revisions.
1713 1608
1714 Accepts the start and end revisions and an optional already-open 1609 Accepts the start and end revisions and an optional already-open
1715 file handle to be used for reading. If the file handle is read, its 1610 file handle to be used for reading. If the file handle is read, its
1738 if self._inline: 1633 if self._inline:
1739 start += (startrev + 1) * self.index.entry_size 1634 start += (startrev + 1) * self.index.entry_size
1740 end += (endrev + 1) * self.index.entry_size 1635 end += (endrev + 1) * self.index.entry_size
1741 length = end - start 1636 length = end - start
1742 1637
1743 return start, self._getsegment(start, length, df=df) 1638 return start, self._segmentfile.read_chunk(start, length, df)
1744 1639
1745 def _chunk(self, rev, df=None): 1640 def _chunk(self, rev, df=None):
1746 """Obtain a single decompressed chunk for a revision. 1641 """Obtain a single decompressed chunk for a revision.
1747 1642
1748 Accepts an integer revision and an optional already-open file handle 1643 Accepts an integer revision and an optional already-open file handle
1829 msg = b'unknown compression mode %d' 1724 msg = b'unknown compression mode %d'
1830 msg %= comp_mode 1725 msg %= comp_mode
1831 raise error.RevlogError(msg) 1726 raise error.RevlogError(msg)
1832 1727
1833 return l 1728 return l
1834
1835 def _chunkclear(self):
1836 """Clear the raw chunk cache."""
1837 self._chunkcache = (0, b'')
1838 1729
1839 def deltaparent(self, rev): 1730 def deltaparent(self, rev):
1840 """return deltaparent of the given revision""" 1731 """return deltaparent of the given revision"""
1841 base = self.index[rev][3] 1732 base = self.index[rev][3]
1842 if base == rev: 1733 if base == rev:
2041 if len(comp_segment) < sidedata_size: 1932 if len(comp_segment) < sidedata_size:
2042 filename = self._sidedatafile 1933 filename = self._sidedatafile
2043 length = sidedata_size 1934 length = sidedata_size
2044 offset = sidedata_offset 1935 offset = sidedata_offset
2045 got = len(comp_segment) 1936 got = len(comp_segment)
2046 m = PARTIAL_READ_MSG % (filename, length, offset, got) 1937 m = randomaccessfile.PARTIAL_READ_MSG % (
1938 filename,
1939 length,
1940 offset,
1941 got,
1942 )
2047 raise error.RevlogError(m) 1943 raise error.RevlogError(m)
2048 1944
2049 comp = self.index[rev][11] 1945 comp = self.index[rev][11]
2050 if comp == COMP_MODE_PLAIN: 1946 if comp == COMP_MODE_PLAIN:
2051 segment = comp_segment 1947 segment = comp_segment
2134 fp.flush() 2030 fp.flush()
2135 fp.close() 2031 fp.close()
2136 # We can't use the cached file handle after close(). So prevent 2032 # We can't use the cached file handle after close(). So prevent
2137 # its usage. 2033 # its usage.
2138 self._writinghandles = None 2034 self._writinghandles = None
2035 self._segmentfile.writing_handle = None
2139 2036
2140 new_dfh = self._datafp(b'w+') 2037 new_dfh = self._datafp(b'w+')
2141 new_dfh.truncate(0) # drop any potentially existing data 2038 new_dfh.truncate(0) # drop any potentially existing data
2142 try: 2039 try:
2143 with self._indexfp() as read_ifh: 2040 with self._indexfp() as read_ifh:
2169 # the temp file replace the real index when we exit the context 2066 # the temp file replace the real index when we exit the context
2170 # manager 2067 # manager
2171 2068
2172 tr.replace(self._indexfile, trindex * self.index.entry_size) 2069 tr.replace(self._indexfile, trindex * self.index.entry_size)
2173 nodemaputil.setup_persistent_nodemap(tr, self) 2070 nodemaputil.setup_persistent_nodemap(tr, self)
2174 self._chunkclear() 2071 self._segmentfile = randomaccessfile.randomaccessfile(
2072 self.opener,
2073 self._datafile,
2074 self._chunkcachesize,
2075 )
2175 2076
2176 if existing_handles: 2077 if existing_handles:
2177 # switched from inline to conventional reopen the index 2078 # switched from inline to conventional reopen the index
2178 ifh = self.__index_write_fp() 2079 ifh = self.__index_write_fp()
2179 self._writinghandles = (ifh, new_dfh, None) 2080 self._writinghandles = (ifh, new_dfh, None)
2081 self._segmentfile.writing_handle = new_dfh
2180 new_dfh = None 2082 new_dfh = None
2181 finally: 2083 finally:
2182 if new_dfh is not None: 2084 if new_dfh is not None:
2183 new_dfh.close() 2085 new_dfh.close()
2184 2086
2233 transaction.add(self._indexfile, dsize + isize) 2135 transaction.add(self._indexfile, dsize + isize)
2234 else: 2136 else:
2235 transaction.add(self._indexfile, isize) 2137 transaction.add(self._indexfile, isize)
2236 # exposing all file handle for writing. 2138 # exposing all file handle for writing.
2237 self._writinghandles = (ifh, dfh, sdfh) 2139 self._writinghandles = (ifh, dfh, sdfh)
2140 self._segmentfile.writing_handle = ifh if self._inline else dfh
2238 yield 2141 yield
2239 if self._docket is not None: 2142 if self._docket is not None:
2240 self._write_docket(transaction) 2143 self._write_docket(transaction)
2241 finally: 2144 finally:
2242 self._writinghandles = None 2145 self._writinghandles = None
2146 self._segmentfile.writing_handle = None
2243 if dfh is not None: 2147 if dfh is not None:
2244 dfh.close() 2148 dfh.close()
2245 if sdfh is not None: 2149 if sdfh is not None:
2246 sdfh.close() 2150 sdfh.close()
2247 # closing the index file last to avoid exposing referent to 2151 # closing the index file last to avoid exposing referent to
2871 self._docket.write(transaction, stripping=True) 2775 self._docket.write(transaction, stripping=True)
2872 2776
2873 # then reset internal state in memory to forget those revisions 2777 # then reset internal state in memory to forget those revisions
2874 self._revisioncache = None 2778 self._revisioncache = None
2875 self._chaininfocache = util.lrucachedict(500) 2779 self._chaininfocache = util.lrucachedict(500)
2876 self._chunkclear() 2780 self._segmentfile.clear_cache()
2877 2781
2878 del self.index[rev:-1] 2782 del self.index[rev:-1]
2879 2783
2880 def checksize(self): 2784 def checksize(self):
2881 """Check size of index and data files 2785 """Check size of index and data files