changeset 52971:469b9a628b51

dirstatemap: update, document and type the identity tracking This new form should hopefully be clearer and less error prone.
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Tue, 18 Feb 2025 22:24:08 +0100
parents 42f78c859dd1
children e189a555865c
files mercurial/dirstatemap.py
diffstat 1 files changed, 38 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/dirstatemap.py	Tue Feb 18 22:49:43 2025 +0100
+++ b/mercurial/dirstatemap.py	Tue Feb 18 22:24:08 2025 +0100
@@ -10,6 +10,7 @@
 from typing import (
     Optional,
     TYPE_CHECKING,
+    Tuple,
 )
 
 from .i18n import _
@@ -109,9 +110,6 @@
         # for consistent view between _pl() and _read() invocations
         self._pendingmode = None
 
-    def _set_identity(self) -> None:
-        self.identity = self._get_current_identity()
-
     def _get_current_identity(self) -> Optional[typelib.CacheStat]:
         # TODO have a cleaner approach on httpstaticrepo side
         path = self._opener.join(self._filename)
@@ -172,14 +170,34 @@
         self._pendingmode = mode
         return fp
 
-    def _readdirstatefile(self, size: int = -1) -> bytes:
+    def _readdirstatefile(
+        self,
+        size: int = -1,
+    ) -> Tuple[Optional[typelib.CacheStat], bytes]:
+        """read the content of the file used as "entry point" for the dirstate
+
+        Return a (identity, data) tuple. The identity can be used for cache
+        validation and concurrent changes detection and must be set as
+        `self.identity` if `data` is preserved.
+        """
+        identity = self._get_current_identity()
+        # There is a race condition between fetching the identity and reading
+        # the file content.  Another process might update the file after we get
+        # the identity information.  However this is fine for our purpose as
+        # this will only create false-positive for "data changed" and no
+        # false-negative.
+        #
+        # in addition for the case that matter the most (updating the dirstate
+        # semantic content and not just some cache information), this will not
+        # happens as the lock should be held when changes to the dirstate
+        # content are made.
         testing.wait_on_cfg(self._ui, b'dirstate.pre-read-file')
         try:
             with self._opendirstatefile() as fp:
-                return fp.read(size)
+                return identity, fp.read(size)
         except FileNotFoundError:
             # File doesn't exist, so the current state is empty
-            return b''
+            return identity, b''
 
     @property
     def docket(self) -> docketmod.DirstateDocket:
@@ -189,8 +207,7 @@
                 raise error.ProgrammingError(
                     b'dirstate only has a docket in v2 format'
                 )
-            self._set_identity()
-            data = self._readdirstatefile()
+            self.identity, data = self._readdirstatefile()
             if data == b'' or data.startswith(docketmod.V2_FORMAT_MARKER):
                 self._docket = docketmod.DirstateDocket.parse(
                     data, self._nodeconstants
@@ -281,7 +298,7 @@
 
     def _v1_parents(self, from_v2_exception=None):
         read_len = self._nodelen * 2
-        st = self._readdirstatefile(read_len)
+        _identity, st = self._readdirstatefile(read_len)
         l = len(st)
         if l == read_len:
             self._parents = (
@@ -402,16 +419,14 @@
                 self.docket
             except error.CorruptedDirstate:
                 # fall back to dirstate-v1 if we fail to read v2
-                self._set_identity()
-                st = self._readdirstatefile()
+                self.identity, st = self._readdirstatefile()
             else:
                 if not self.docket.uuid:
                     return
                 testing.wait_on_cfg(self._ui, b'dirstate.post-docket-read-file')
                 st = self._read_v2_data()
         else:
-            self._set_identity()
-            st = self._readdirstatefile()
+            self.identity, st = self._readdirstatefile()
 
         if not st:
             return
@@ -474,7 +489,14 @@
         self._dirtyparents = False
 
     @propertycache
-    def identity(self):
+    def identity(self) -> Optional[typelib.CacheStat]:
+        """A cache identifier for the state of the file as data were read
+
+        This must always be set with the object returned from
+        `self._readdirstatefile()`. assigning another value later will break
+        some security mechanism and can lead to misbehavior when concurrent
+        operation are run.
+        """
         self._map
         return self.identity
 
@@ -733,11 +755,9 @@
 
         def _v1_map(self, from_v2_exception=None):
             try:
-                self._set_identity()
+                self.identity, data = self._readdirstatefile()
                 identity = self._get_rust_identity()
-                self._map, parents = rustmod.DirstateMap.new_v1(
-                    self._readdirstatefile(), identity
-                )
+                self._map, parents = rustmod.DirstateMap.new_v1(data, identity)
             except OSError as e:
                 if from_v2_exception is not None:
                     raise e from from_v2_exception