Mercurial > public > mercurial-scm > hg-stable
diff mercurial/revlogutils/randomaccessfile.py @ 47431:e0a314bcbc9d
revlog: Extract low-level random-access file read caching logic
The `revlog` class does many things, among which fulfilling requests for
arbitrary byte slices from the revlog "data file" by reading a larger chunk
and caching it in memory, in order to reduce the number of system calls.
This extracts that logic into a new class, so that it may later also be used
for the side-data file (with another instance of that class).
The copyright notice of the new file does not include a date or author name
since such information tend not to be kept up-to-date:
https://www.linuxfoundation.org/en/blog/copyright-notices-in-open-source-software-projects/
Differential Revision: https://phab.mercurial-scm.org/D10878
author | Simon Sapin <simon.sapin@octobus.net> |
---|---|
date | Tue, 08 Jun 2021 19:55:00 +0200 |
parents | |
children | 5fa083a5ff04 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/revlogutils/randomaccessfile.py Tue Jun 08 19:55:00 2021 +0200 @@ -0,0 +1,138 @@ +# Copyright Mercurial Contributors +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +import contextlib + +from ..i18n import _ +from .. import ( + error, + util, +) + + +_MAX_CACHED_CHUNK_SIZE = 1048576 # 1 MiB + +PARTIAL_READ_MSG = _( + b'partial read of revlog %s; expected %d bytes from offset %d, got %d' +) + + +def _is_power_of_two(n): + return (n & (n - 1) == 0) and n != 0 + + +class randomaccessfile(object): + """Accessing arbitrary chuncks of data within a file, with some caching""" + + def __init__( + self, + opener, + filename, + default_cached_chunk_size, + initial_cache=None, + ): + # Required by bitwise manipulation below + assert _is_power_of_two(default_cached_chunk_size) + + self.opener = opener + self.filename = filename + self.default_cached_chunk_size = default_cached_chunk_size + self.writing_handle = None # This is set from revlog.py + self._cached_chunk = b'' + self._cached_chunk_position = 0 # Offset from the start of the file + if initial_cache: + self._cached_chunk_position, self._cached_chunk = initial_cache + + def clear_cache(self): + self._cached_chunk = b'' + self._cached_chunk_position = 0 + + def _open(self, mode=b'r'): + """Return a file object""" + return self.opener(self.filename, mode=mode) + + @contextlib.contextmanager + def _open_read(self, existing_file_obj=None): + """File object suitable for reading data""" + # Use explicit file handle, if given. + if existing_file_obj is not None: + yield existing_file_obj + + # Use a file handle being actively used for writes, if available. + # There is some danger to doing this because reads will seek the + # file. However, revlog._writeentry performs a SEEK_END before all + # writes, so we should be safe. + elif self.writing_handle: + yield self.writing_handle + + # Otherwise open a new file handle. + else: + with self._open() as fp: + yield fp + + def read_chunk(self, offset, length, existing_file_obj=None): + """Read a chunk of bytes from the file. + + Accepts an absolute offset, length to read, and an optional existing + file handle to read from. + + If an existing file handle is passed, it will be seeked and the + original seek position will NOT be restored. + + Returns a str or buffer of raw byte data. + + Raises if the requested number of bytes could not be read. + """ + end = offset + length + cache_start = self._cached_chunk_position + cache_end = cache_start + len(self._cached_chunk) + # Is the requested chunk within the cache? + if cache_start <= offset and end <= cache_end: + if cache_start == offset and end == cache_end: + return self._cached_chunk # avoid a copy + relative_start = offset - cache_start + return util.buffer(self._cached_chunk, relative_start, length) + + return self._read_and_update_cache(offset, length, existing_file_obj) + + def _read_and_update_cache(self, offset, length, existing_file_obj=None): + # Cache data both forward and backward around the requested + # data, in a fixed size window. This helps speed up operations + # involving reading the revlog backwards. + real_offset = offset & ~(self.default_cached_chunk_size - 1) + real_length = ( + (offset + length + self.default_cached_chunk_size) + & ~(self.default_cached_chunk_size - 1) + ) - real_offset + with self._open_read(existing_file_obj) as file_obj: + file_obj.seek(real_offset) + data = file_obj.read(real_length) + + self._add_cached_chunk(real_offset, data) + + relative_offset = offset - real_offset + got = len(data) - relative_offset + if got < length: + message = PARTIAL_READ_MSG % (self.filename, length, offset, got) + raise error.RevlogError(message) + + if offset != real_offset or real_length != length: + return util.buffer(data, relative_offset, length) + return data + + def _add_cached_chunk(self, offset, data): + """Add to or replace the cached data chunk. + + Accepts an absolute offset and the data that is at that location. + """ + if ( + self._cached_chunk_position + len(self._cached_chunk) == offset + and len(self._cached_chunk) + len(data) < _MAX_CACHED_CHUNK_SIZE + ): + # add to existing cache + self._cached_chunk += data + else: + self._cached_chunk = data + self._cached_chunk_position = offset