diff mercurial/revlog.py @ 47395:e6292eb33384

revlog: store sidedata in their own file This makes sidedata manipulation simpler and results in more compact data when traversing either data or sidedata. Differential Revision: https://phab.mercurial-scm.org/D10787
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Fri, 28 May 2021 23:41:17 +0200
parents 75e1104f23a2
children 33d626910374
line wrap: on
line diff
--- a/mercurial/revlog.py	Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/revlog.py	Fri May 28 23:41:17 2021 +0200
@@ -1,4 +1,5 @@
 # revlog.py - storage back-end for mercurial
+# coding: utf8
 #
 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
 #
@@ -260,6 +261,11 @@
     b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
 )
 
+FILE_TOO_SHORT_MSG = _(
+    b'cannot read from revlog %s;  '
+    b'expected %d bytes from offset %d, data size is %d'
+)
+
 
 class revlog(object):
     """
@@ -401,6 +407,7 @@
         self._docket_file = None
         self._indexfile = None
         self._datafile = None
+        self._sidedatafile = None
         self._nodemap_file = None
         self.postfix = postfix
         self._trypending = trypending
@@ -445,7 +452,7 @@
         # custom flags.
         self._flagprocessors = dict(flagutil.flagprocessors)
 
-        # 2-tuple of file handles being used for active writing.
+        # 3-tuple of file handles being used for active writing.
         self._writinghandles = None
         # prevent nesting of addgroup
         self._adding_group = None
@@ -634,6 +641,7 @@
 
         if self._docket is not None:
             self._datafile = self._docket.data_filepath()
+            self._sidedatafile = self._docket.sidedata_filepath()
         elif self.postfix is None:
             self._datafile = b'%s.d' % self.radix
         else:
@@ -803,9 +811,14 @@
             with func() as fp:
                 yield fp
 
+    @contextlib.contextmanager
     def _sidedatareadfp(self):
         """file object suitable to read sidedata"""
-        return self._datareadfp()
+        if not self._writinghandles:
+            with self.opener(self._sidedatafile) as sidedata_fp:
+                yield sidedata_fp
+        else:
+            yield self._writinghandles[2]
 
     def tiprev(self):
         return len(self.index) - 1
@@ -909,6 +922,23 @@
     def start(self, rev):
         return int(self.index[rev][0] >> 16)
 
+    def sidedata_cut_off(self, rev):
+        """offset where the sidedata file is cut when stripping from `rev`"""
+        sd_cut_off = self.index[rev][8]
+        if sd_cut_off != 0:
+            return sd_cut_off
+        # Entries without sidedata store 0 as their offset (instead of
+        # previous-offset + previous-size), and so does the first chunk of
+        # the file. The cut-off is thus the end of the closest earlier
+        # revision with sidedata; `rev` itself must not be counted.
+        #
+        # We should reconsider this sidedata → 0 sidedata_offset policy.
+        for prev in range(rev - 1, -1, -1):
+            e = self.index[prev]
+            if e[9] != 0:
+                return e[8] + e[9]
+        return 0
+
     def flags(self, rev):
         return self.index[rev][0] & 0xFFFF
 
@@ -2074,11 +2104,19 @@
 
         # XXX this need caching, as we do for data
         with self._sidedatareadfp() as sdf:
-            sdf.seek(sidedata_offset)
+            if self._docket.sidedata_end < sidedata_offset + sidedata_size:
+                filename = self._sidedatafile
+                end = self._docket.sidedata_end
+                offset = sidedata_offset
+                length = sidedata_size
+                m = FILE_TOO_SHORT_MSG % (filename, length, offset, end)
+                raise error.RevlogError(m)
+
+            sdf.seek(sidedata_offset, os.SEEK_SET)
             comp_segment = sdf.read(sidedata_size)
 
             if len(comp_segment) < sidedata_size:
-                filename = self._datafile
+                filename = self._sidedatafile
                 length = sidedata_size
                 offset = sidedata_offset
                 got = len(comp_segment)
@@ -2215,7 +2253,7 @@
             if existing_handles:
                 # switched from inline to conventional reopen the index
                 ifh = self.__index_write_fp()
-                self._writinghandles = (ifh, new_dfh)
+                self._writinghandles = (ifh, new_dfh, None)
                 new_dfh = None
         finally:
             if new_dfh is not None:
@@ -2233,7 +2271,7 @@
         if self._writinghandles is not None:
             yield
         else:
-            ifh = dfh = None
+            ifh = dfh = sdfh = None
             try:
                 r = len(self)
                 # opening the data file.
@@ -2253,6 +2291,17 @@
                             raise
                         dfh = self._datafp(b"w+")
                     transaction.add(self._datafile, dsize)
+                if self._sidedatafile is not None:
+                    sd_end = self._docket.sidedata_end
+                    try:
+                        sdfh = self.opener(self._sidedatafile, mode=b"r+")
+                        # position the sidedata handle at its logical end
+                        sdfh.seek(sd_end, os.SEEK_SET)
+                    except IOError as inst:
+                        if inst.errno != errno.ENOENT:
+                            raise
+                        sdfh = self.opener(self._sidedatafile, mode=b"w+")
+                    transaction.add(self._sidedatafile, sd_end)
 
                 # opening the index file.
                 isize = r * self.index.entry_size
@@ -2262,7 +2311,7 @@
                 else:
                     transaction.add(self._indexfile, isize)
                 # exposing all file handle for writing.
-                self._writinghandles = (ifh, dfh)
+                self._writinghandles = (ifh, dfh, sdfh)
                 yield
                 if self._docket is not None:
                     self._write_docket(transaction)
@@ -2270,6 +2319,8 @@
                 self._writinghandles = None
                 if dfh is not None:
                     dfh.close()
+                if sdfh is not None:
+                    sdfh.close()
                 # closing the index file last to avoid exposing referent to
                 # potential unflushed data content.
                 if ifh is not None:
@@ -2513,7 +2564,8 @@
         offset = self._get_data_offset(prev)
 
         if self._concurrencychecker:
-            ifh, dfh = self._writinghandles
+            ifh, dfh, sdfh = self._writinghandles
+            # XXX no checking for the sidedata file
             if self._inline:
                 # offset is "as if" it were in the .d file, so we need to add on
                 # the size of the entry metadata.
@@ -2570,7 +2622,7 @@
         if sidedata and self.hassidedata:
             sidedata_compression_mode = COMP_MODE_PLAIN
             serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
-            sidedata_offset = offset + deltainfo.deltalen
+            sidedata_offset = self._docket.sidedata_end
             h, comp_sidedata = self.compress(serialized_sidedata)
             if (
                 h != b'u'
@@ -2622,6 +2674,7 @@
             link,
             offset,
             serialized_sidedata,
+            sidedata_offset,
         )
 
         rawtext = btext[0]
@@ -2648,7 +2701,9 @@
         else:
             return self._docket.data_end
 
-    def _writeentry(self, transaction, entry, data, link, offset, sidedata):
+    def _writeentry(
+        self, transaction, entry, data, link, offset, sidedata, sidedata_offset
+    ):
         # Files opened in a+ mode have inconsistent behavior on various
         # platforms. Windows requires that a file positioning call be made
         # when the file handle transitions between reads and writes. See
@@ -2664,7 +2719,7 @@
         if self._writinghandles is None:
             msg = b'adding revision outside `revlog._writing` context'
             raise error.ProgrammingError(msg)
-        ifh, dfh = self._writinghandles
+        ifh, dfh, sdfh = self._writinghandles
         if self._docket is None:
             ifh.seek(0, os.SEEK_END)
         else:
@@ -2674,16 +2729,20 @@
                 dfh.seek(0, os.SEEK_END)
             else:
                 dfh.seek(self._docket.data_end, os.SEEK_SET)
+        if sdfh:
+            sdfh.seek(self._docket.sidedata_end, os.SEEK_SET)
 
         curr = len(self) - 1
         if not self._inline:
             transaction.add(self._datafile, offset)
+            if self._sidedatafile:
+                transaction.add(self._sidedatafile, sidedata_offset)
             transaction.add(self._indexfile, curr * len(entry))
             if data[0]:
                 dfh.write(data[0])
             dfh.write(data[1])
             if sidedata:
-                dfh.write(sidedata)
+                sdfh.write(sidedata)
             ifh.write(entry)
         else:
             offset += curr * self.index.entry_size
@@ -2691,12 +2750,12 @@
             ifh.write(entry)
             ifh.write(data[0])
             ifh.write(data[1])
-            if sidedata:
-                ifh.write(sidedata)
+            assert not sidedata
             self._enforceinlinesize(transaction)
         if self._docket is not None:
             self._docket.index_end = self._writinghandles[0].tell()
             self._docket.data_end = self._writinghandles[1].tell()
+            self._docket.sidedata_end = self._writinghandles[2].tell()
 
         nodemaputil.setup_persistent_nodemap(transaction, self)
 
@@ -2866,12 +2925,17 @@
         else:
             end = data_end + (rev * self.index.entry_size)
 
+        if self._sidedatafile:
+            sidedata_end = self.sidedata_cut_off(rev)
+            transaction.add(self._sidedatafile, sidedata_end)
+
         transaction.add(self._indexfile, end)
         if self._docket is not None:
             # XXX we could, leverage the docket while stripping. However it is
             # not powerfull enough at the time of this comment
             self._docket.index_end = end
             self._docket.data_end = data_end
+            self._docket.sidedata_end = sidedata_end
             self._docket.write(transaction, stripping=True)
 
         # then reset internal state in memory to forget those revisions
@@ -3398,13 +3462,10 @@
         new_entries = []
         # append the new sidedata
         with self._writing(transaction):
-            ifh, dfh = self._writinghandles
-            if self._docket is not None:
-                dfh.seek(self._docket.data_end, os.SEEK_SET)
-            else:
-                dfh.seek(0, os.SEEK_END)
-
-            current_offset = dfh.tell()
+            ifh, dfh, sdfh = self._writinghandles
+            # new sidedata is appended at the end of the sidedata file
+            sdfh.seek(self._docket.sidedata_end, os.SEEK_SET)
+            current_offset = sdfh.tell()
             for rev in range(startrev, endrev + 1):
                 entry = self.index[rev]
                 new_sidedata, flags = sidedatautil.run_sidedata_helpers(
@@ -3455,12 +3516,11 @@
                 )
 
                 # the sidedata computation might have move the file cursors around
-                dfh.seek(current_offset, os.SEEK_SET)
-                dfh.write(serialized_sidedata)
+                sdfh.seek(current_offset, os.SEEK_SET)
+                sdfh.write(serialized_sidedata)
                 new_entries.append(entry_update)
                 current_offset += len(serialized_sidedata)
-                if self._docket is not None:
-                    self._docket.data_end = dfh.tell()
+                self._docket.sidedata_end = sdfh.tell()
 
             # rewrite the new index entries
             ifh.seek(startrev * self.index.entry_size)