diff mercurial/revlogutils/randomaccessfile.py @ 47431:e0a314bcbc9d

revlog: Extract low-level random-access file read caching logic The `revlog` class does many things, among which is fulfilling requests for arbitrary byte slices from the revlog "data file" by reading a larger chunk and caching it in memory, in order to reduce the number of system calls. This extracts that logic into a new class, so that it may later also be used for the side-data file (with another instance of that class). The copyright notice of the new file does not include a date or author name since such information tends not to be kept up-to-date: https://www.linuxfoundation.org/en/blog/copyright-notices-in-open-source-software-projects/ Differential Revision: https://phab.mercurial-scm.org/D10878
author Simon Sapin <simon.sapin@octobus.net>
date Tue, 08 Jun 2021 19:55:00 +0200
parents
children 5fa083a5ff04
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/revlogutils/randomaccessfile.py	Tue Jun 08 19:55:00 2021 +0200
@@ -0,0 +1,138 @@
+# Copyright Mercurial Contributors
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+import contextlib
+
+from ..i18n import _
+from .. import (
+    error,
+    util,
+)
+
+
+# Size limit, in bytes, checked by `randomaccessfile._add_cached_chunk`: newly
+# read data is appended to the cached chunk only while the combined length
+# stays strictly below this value; otherwise the cached chunk is replaced.
+_MAX_CACHED_CHUNK_SIZE = 1048576  # 1 MiB
+
+# Message template for a read that returned fewer bytes than requested.
+# It is formatted with (file name, expected length, offset, length obtained).
+PARTIAL_READ_MSG = _(
+    b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
+)
+
+
+def _is_power_of_two(n):
+    return (n & (n - 1) == 0) and n != 0
+
+
+class randomaccessfile(object):
+    """Accessing arbitrary chuncks of data within a file, with some caching"""
+
+    def __init__(
+        self,
+        opener,
+        filename,
+        default_cached_chunk_size,
+        initial_cache=None,
+    ):
+        # Required by bitwise manipulation below
+        assert _is_power_of_two(default_cached_chunk_size)
+
+        self.opener = opener
+        self.filename = filename
+        self.default_cached_chunk_size = default_cached_chunk_size
+        self.writing_handle = None  # This is set from revlog.py
+        self._cached_chunk = b''
+        self._cached_chunk_position = 0  # Offset from the start of the file
+        if initial_cache:
+            self._cached_chunk_position, self._cached_chunk = initial_cache
+
+    def clear_cache(self):
+        self._cached_chunk = b''
+        self._cached_chunk_position = 0
+
+    def _open(self, mode=b'r'):
+        """Return a file object"""
+        return self.opener(self.filename, mode=mode)
+
+    @contextlib.contextmanager
+    def _open_read(self, existing_file_obj=None):
+        """File object suitable for reading data"""
+        # Use explicit file handle, if given.
+        if existing_file_obj is not None:
+            yield existing_file_obj
+
+        # Use a file handle being actively used for writes, if available.
+        # There is some danger to doing this because reads will seek the
+        # file. However, revlog._writeentry performs a SEEK_END before all
+        # writes, so we should be safe.
+        elif self.writing_handle:
+            yield self.writing_handle
+
+        # Otherwise open a new file handle.
+        else:
+            with self._open() as fp:
+                yield fp
+
+    def read_chunk(self, offset, length, existing_file_obj=None):
+        """Read a chunk of bytes from the file.
+
+        Accepts an absolute offset, length to read, and an optional existing
+        file handle to read from.
+
+        If an existing file handle is passed, it will be seeked and the
+        original seek position will NOT be restored.
+
+        Returns a str or buffer of raw byte data.
+
+        Raises if the requested number of bytes could not be read.
+        """
+        end = offset + length
+        cache_start = self._cached_chunk_position
+        cache_end = cache_start + len(self._cached_chunk)
+        # Is the requested chunk within the cache?
+        if cache_start <= offset and end <= cache_end:
+            if cache_start == offset and end == cache_end:
+                return self._cached_chunk  # avoid a copy
+            relative_start = offset - cache_start
+            return util.buffer(self._cached_chunk, relative_start, length)
+
+        return self._read_and_update_cache(offset, length, existing_file_obj)
+
+    def _read_and_update_cache(self, offset, length, existing_file_obj=None):
+        # Cache data both forward and backward around the requested
+        # data, in a fixed size window. This helps speed up operations
+        # involving reading the revlog backwards.
+        real_offset = offset & ~(self.default_cached_chunk_size - 1)
+        real_length = (
+            (offset + length + self.default_cached_chunk_size)
+            & ~(self.default_cached_chunk_size - 1)
+        ) - real_offset
+        with self._open_read(existing_file_obj) as file_obj:
+            file_obj.seek(real_offset)
+            data = file_obj.read(real_length)
+
+        self._add_cached_chunk(real_offset, data)
+
+        relative_offset = offset - real_offset
+        got = len(data) - relative_offset
+        if got < length:
+            message = PARTIAL_READ_MSG % (self.filename, length, offset, got)
+            raise error.RevlogError(message)
+
+        if offset != real_offset or real_length != length:
+            return util.buffer(data, relative_offset, length)
+        return data
+
+    def _add_cached_chunk(self, offset, data):
+        """Add to or replace the cached data chunk.
+
+        Accepts an absolute offset and the data that is at that location.
+        """
+        if (
+            self._cached_chunk_position + len(self._cached_chunk) == offset
+            and len(self._cached_chunk) + len(data) < _MAX_CACHED_CHUNK_SIZE
+        ):
+            # add to existing cache
+            self._cached_chunk += data
+        else:
+            self._cached_chunk = data
+            self._cached_chunk_position = offset