comparison mercurial/revlog.py @ 47395:e6292eb33384

revlog: store sidedata in their own file. This makes sidedata manipulation simpler and results in more compact data when traversing either data or sidedata. Differential Revision: https://phab.mercurial-scm.org/D10787
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Fri, 28 May 2021 23:41:17 +0200
parents 75e1104f23a2
children 33d626910374
comparison
equal deleted inserted replaced
47394:bcf92bdc2bca 47395:e6292eb33384
1 # revlog.py - storage back-end for mercurial 1 # revlog.py - storage back-end for mercurial
2 # coding: utf8
2 # 3 #
3 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com> 4 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
4 # 5 #
5 # This software may be used and distributed according to the terms of the 6 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version. 7 # GNU General Public License version 2 or any later version.
258 259
259 PARTIAL_READ_MSG = _( 260 PARTIAL_READ_MSG = _(
260 b'partial read of revlog %s; expected %d bytes from offset %d, got %d' 261 b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
261 ) 262 )
262 263
264 FILE_TOO_SHORT_MSG = _(
265 b'cannot read from revlog %s;'
266 b' expected %d bytes from offset %d, data size is %d'
267 )
268
263 269
264 class revlog(object): 270 class revlog(object):
265 """ 271 """
266 the underlying revision storage object 272 the underlying revision storage object
267 273
399 self.radix = radix 405 self.radix = radix
400 406
401 self._docket_file = None 407 self._docket_file = None
402 self._indexfile = None 408 self._indexfile = None
403 self._datafile = None 409 self._datafile = None
410 self._sidedatafile = None
404 self._nodemap_file = None 411 self._nodemap_file = None
405 self.postfix = postfix 412 self.postfix = postfix
406 self._trypending = trypending 413 self._trypending = trypending
407 self.opener = opener 414 self.opener = opener
408 if persistentnodemap: 415 if persistentnodemap:
443 450
444 # Make copy of flag processors so each revlog instance can support 451 # Make copy of flag processors so each revlog instance can support
445 # custom flags. 452 # custom flags.
446 self._flagprocessors = dict(flagutil.flagprocessors) 453 self._flagprocessors = dict(flagutil.flagprocessors)
447 454
448 # 2-tuple of file handles being used for active writing. 455 # 3-tuple of file handles being used for active writing.
449 self._writinghandles = None 456 self._writinghandles = None
450 # prevent nesting of addgroup 457 # prevent nesting of addgroup
451 self._adding_group = None 458 self._adding_group = None
452 459
453 self._loadindex() 460 self._loadindex()
632 # main docket, so disable it for now. 639 # main docket, so disable it for now.
633 self._nodemap_file = None 640 self._nodemap_file = None
634 641
635 if self._docket is not None: 642 if self._docket is not None:
636 self._datafile = self._docket.data_filepath() 643 self._datafile = self._docket.data_filepath()
644 self._sidedatafile = self._docket.sidedata_filepath()
637 elif self.postfix is None: 645 elif self.postfix is None:
638 self._datafile = b'%s.d' % self.radix 646 self._datafile = b'%s.d' % self.radix
639 else: 647 else:
640 self._datafile = b'%s.d.%s' % (self.radix, self.postfix) 648 self._datafile = b'%s.d.%s' % (self.radix, self.postfix)
641 649
801 else: 809 else:
802 func = self._datafp 810 func = self._datafp
803 with func() as fp: 811 with func() as fp:
804 yield fp 812 yield fp
805 813
814 @contextlib.contextmanager
806 def _sidedatareadfp(self): 815 def _sidedatareadfp(self):
807 """file object suitable to read sidedata""" 816 """file object suitable to read sidedata"""
808 return self._datareadfp() 817 if self._writinghandles:
818 yield self._writinghandles[2]
819 else:
820 with self.opener(self._sidedatafile) as fp:
821 yield fp
809 822
def tiprev(self):
    """Return the revision number of the last entry in the index.

    This is ``-1`` when the index is empty.
    """
    entry_count = len(self.index)
    return entry_count - 1
812 825
813 def tip(self): 826 def tip(self):
906 919
907 # First tuple entry is 8 bytes. First 6 bytes are offset. Last 2 bytes 920 # First tuple entry is 8 bytes. First 6 bytes are offset. Last 2 bytes
908 # are flags. 921 # are flags.
def start(self, rev):
    """Return the data offset recorded for ``rev``.

    The offset is the first index field with its low 16 bits (the flags)
    shifted out.
    """
    offset_flags = self.index[rev][0]
    return int(offset_flags >> 16)
924
def sidedata_cut_off(self, rev):
    """Return the sidedata-file offset where data of ``rev`` or later begins.

    This is the size the sidedata file has to be cut down to when ``rev``
    and every revision after it are removed: the end of the last sidedata
    chunk stored by a revision before ``rev``, or 0 when no earlier
    revision carries sidedata.
    """
    sd_cut_off = self.index[rev][8]
    if sd_cut_off != 0:
        return sd_cut_off
    # Entries without sidedata currently use 0 as their offset (instead of
    # previous-offset + previous-size), so a zero offset tells us nothing on
    # its own: we have to walk back to the closest earlier revision that
    # does carry sidedata.
    #
    # We should reconsider this "no sidedata -> 0 sidedata_offset" policy.
    # In the meantime, we need this.
    #
    # The walk starts *before* `rev`: a revision whose own sidedata sits at
    # offset 0 must yield 0, not the end of its own chunk.
    for earlier in range(rev - 1, -1, -1):
        entry = self.index[earlier]
        if entry[9] != 0:
            return entry[8] + entry[9]
    return 0
911 941
def flags(self, rev):
    """Return the flag bits of ``rev``.

    The flags are the low 16 bits of the first index field.
    """
    offset_flags = self.index[rev][0]
    return offset_flags & 0xFFFF
914 944
915 def length(self, rev): 945 def length(self, rev):
2072 if sidedata_size == 0: 2102 if sidedata_size == 0:
2073 return {} 2103 return {}
2074 2104
2075 # XXX this needs caching, as we do for data 2105 # XXX this needs caching, as we do for data
2076 with self._sidedatareadfp() as sdf: 2106 with self._sidedatareadfp() as sdf:
2077 sdf.seek(sidedata_offset) 2107 if self._docket.sidedata_end < sidedata_offset + sidedata_size:
2108 filename = self._sidedatafile
2109 end = self._docket.sidedata_end
2110 offset = sidedata_offset
2111 length = sidedata_size
2112 m = FILE_TOO_SHORT_MSG % (filename, length, offset, end)
2113 raise error.RevlogError(m)
2114
2115 sdf.seek(sidedata_offset, os.SEEK_SET)
2078 comp_segment = sdf.read(sidedata_size) 2116 comp_segment = sdf.read(sidedata_size)
2079 2117
2080 if len(comp_segment) < sidedata_size: 2118 if len(comp_segment) < sidedata_size:
2081 filename = self._datafile 2119 filename = self._sidedatafile
2082 length = sidedata_size 2120 length = sidedata_size
2083 offset = sidedata_offset 2121 offset = sidedata_offset
2084 got = len(comp_segment) 2122 got = len(comp_segment)
2085 m = PARTIAL_READ_MSG % (filename, length, offset, got) 2123 m = PARTIAL_READ_MSG % (filename, length, offset, got)
2086 raise error.RevlogError(m) 2124 raise error.RevlogError(m)
2213 self._chunkclear() 2251 self._chunkclear()
2214 2252
2215 if existing_handles: 2253 if existing_handles:
2216 # switched from inline to conventional reopen the index 2254 # switched from inline to conventional reopen the index
2217 ifh = self.__index_write_fp() 2255 ifh = self.__index_write_fp()
2218 self._writinghandles = (ifh, new_dfh) 2256 self._writinghandles = (ifh, new_dfh, None)
2219 new_dfh = None 2257 new_dfh = None
2220 finally: 2258 finally:
2221 if new_dfh is not None: 2259 if new_dfh is not None:
2222 new_dfh.close() 2260 new_dfh.close()
2223 2261
2231 msg %= self.display_id 2269 msg %= self.display_id
2232 raise error.ProgrammingError(msg) 2270 raise error.ProgrammingError(msg)
2233 if self._writinghandles is not None: 2271 if self._writinghandles is not None:
2234 yield 2272 yield
2235 else: 2273 else:
2236 ifh = dfh = None 2274 ifh = dfh = sdfh = None
2237 try: 2275 try:
2238 r = len(self) 2276 r = len(self)
2239 # opening the data file. 2277 # opening the data file.
2240 dsize = 0 2278 dsize = 0
2241 if r: 2279 if r:
2251 except IOError as inst: 2289 except IOError as inst:
2252 if inst.errno != errno.ENOENT: 2290 if inst.errno != errno.ENOENT:
2253 raise 2291 raise
2254 dfh = self._datafp(b"w+") 2292 dfh = self._datafp(b"w+")
2255 transaction.add(self._datafile, dsize) 2293 transaction.add(self._datafile, dsize)
2294 if self._sidedatafile is not None:
2295 try:
2296 sdfh = self.opener(self._sidedatafile, mode=b"r+")
2297 dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
2298 except IOError as inst:
2299 if inst.errno != errno.ENOENT:
2300 raise
2301 sdfh = self.opener(self._sidedatafile, mode=b"w+")
2302 transaction.add(
2303 self._sidedatafile, self._docket.sidedata_end
2304 )
2256 2305
2257 # opening the index file. 2306 # opening the index file.
2258 isize = r * self.index.entry_size 2307 isize = r * self.index.entry_size
2259 ifh = self.__index_write_fp() 2308 ifh = self.__index_write_fp()
2260 if self._inline: 2309 if self._inline:
2261 transaction.add(self._indexfile, dsize + isize) 2310 transaction.add(self._indexfile, dsize + isize)
2262 else: 2311 else:
2263 transaction.add(self._indexfile, isize) 2312 transaction.add(self._indexfile, isize)
2264 # exposing all file handle for writing. 2313 # exposing all file handle for writing.
2265 self._writinghandles = (ifh, dfh) 2314 self._writinghandles = (ifh, dfh, sdfh)
2266 yield 2315 yield
2267 if self._docket is not None: 2316 if self._docket is not None:
2268 self._write_docket(transaction) 2317 self._write_docket(transaction)
2269 finally: 2318 finally:
2270 self._writinghandles = None 2319 self._writinghandles = None
2271 if dfh is not None: 2320 if dfh is not None:
2321 dfh.close()
2322 if sdfh is not None:
2272 dfh.close() 2323 dfh.close()
2273 # closing the index file last to avoid exposing referent to 2324 # closing the index file last to avoid exposing referent to
2274 # potential unflushed data content. 2325 # potential unflushed data content.
2275 if ifh is not None: 2326 if ifh is not None:
2276 ifh.close() 2327 ifh.close()
2511 prev = curr - 1 2562 prev = curr - 1
2512 2563
2513 offset = self._get_data_offset(prev) 2564 offset = self._get_data_offset(prev)
2514 2565
2515 if self._concurrencychecker: 2566 if self._concurrencychecker:
2516 ifh, dfh = self._writinghandles 2567 ifh, dfh, sdfh = self._writinghandles
2568 # XXX no checking for the sidedata file
2517 if self._inline: 2569 if self._inline:
2518 # offset is "as if" it were in the .d file, so we need to add on 2570 # offset is "as if" it were in the .d file, so we need to add on
2519 # the size of the entry metadata. 2571 # the size of the entry metadata.
2520 self._concurrencychecker( 2572 self._concurrencychecker(
2521 ifh, self._indexfile, offset + curr * self.index.entry_size 2573 ifh, self._indexfile, offset + curr * self.index.entry_size
2568 2620
2569 sidedata_compression_mode = COMP_MODE_INLINE 2621 sidedata_compression_mode = COMP_MODE_INLINE
2570 if sidedata and self.hassidedata: 2622 if sidedata and self.hassidedata:
2571 sidedata_compression_mode = COMP_MODE_PLAIN 2623 sidedata_compression_mode = COMP_MODE_PLAIN
2572 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) 2624 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
2573 sidedata_offset = offset + deltainfo.deltalen 2625 sidedata_offset = self._docket.sidedata_end
2574 h, comp_sidedata = self.compress(serialized_sidedata) 2626 h, comp_sidedata = self.compress(serialized_sidedata)
2575 if ( 2627 if (
2576 h != b'u' 2628 h != b'u'
2577 and comp_sidedata[0:1] != b'\0' 2629 and comp_sidedata[0:1] != b'\0'
2578 and len(comp_sidedata) < len(serialized_sidedata) 2630 and len(comp_sidedata) < len(serialized_sidedata)
2620 entry, 2672 entry,
2621 deltainfo.data, 2673 deltainfo.data,
2622 link, 2674 link,
2623 offset, 2675 offset,
2624 serialized_sidedata, 2676 serialized_sidedata,
2677 sidedata_offset,
2625 ) 2678 )
2626 2679
2627 rawtext = btext[0] 2680 rawtext = btext[0]
2628 2681
2629 if alwayscache and rawtext is None: 2682 if alwayscache and rawtext is None:
2646 if self._docket is None: 2699 if self._docket is None:
2647 return self.end(prev) 2700 return self.end(prev)
2648 else: 2701 else:
2649 return self._docket.data_end 2702 return self._docket.data_end
2650 2703
2651 def _writeentry(self, transaction, entry, data, link, offset, sidedata): 2704 def _writeentry(
2705 self, transaction, entry, data, link, offset, sidedata, sidedata_offset
2706 ):
2652 # Files opened in a+ mode have inconsistent behavior on various 2707 # Files opened in a+ mode have inconsistent behavior on various
2653 # platforms. Windows requires that a file positioning call be made 2708 # platforms. Windows requires that a file positioning call be made
2654 # when the file handle transitions between reads and writes. See 2709 # when the file handle transitions between reads and writes. See
2655 # 3686fa2b8eee and the mixedfilemodewrapper in windows.py. On other 2710 # 3686fa2b8eee and the mixedfilemodewrapper in windows.py. On other
2656 # platforms, Python or the platform itself can be buggy. Some versions 2711 # platforms, Python or the platform itself can be buggy. Some versions
2662 # the file handle is reused for reads and may be seeked there, we need 2717 # the file handle is reused for reads and may be seeked there, we need
2663 # to be careful before changing this. 2718 # to be careful before changing this.
2664 if self._writinghandles is None: 2719 if self._writinghandles is None:
2665 msg = b'adding revision outside `revlog._writing` context' 2720 msg = b'adding revision outside `revlog._writing` context'
2666 raise error.ProgrammingError(msg) 2721 raise error.ProgrammingError(msg)
2667 ifh, dfh = self._writinghandles 2722 ifh, dfh, sdfh = self._writinghandles
2668 if self._docket is None: 2723 if self._docket is None:
2669 ifh.seek(0, os.SEEK_END) 2724 ifh.seek(0, os.SEEK_END)
2670 else: 2725 else:
2671 ifh.seek(self._docket.index_end, os.SEEK_SET) 2726 ifh.seek(self._docket.index_end, os.SEEK_SET)
2672 if dfh: 2727 if dfh:
2673 if self._docket is None: 2728 if self._docket is None:
2674 dfh.seek(0, os.SEEK_END) 2729 dfh.seek(0, os.SEEK_END)
2675 else: 2730 else:
2676 dfh.seek(self._docket.data_end, os.SEEK_SET) 2731 dfh.seek(self._docket.data_end, os.SEEK_SET)
2732 if sdfh:
2733 sdfh.seek(self._docket.sidedata_end, os.SEEK_SET)
2677 2734
2678 curr = len(self) - 1 2735 curr = len(self) - 1
2679 if not self._inline: 2736 if not self._inline:
2680 transaction.add(self._datafile, offset) 2737 transaction.add(self._datafile, offset)
2738 if self._sidedatafile:
2739 transaction.add(self._sidedatafile, sidedata_offset)
2681 transaction.add(self._indexfile, curr * len(entry)) 2740 transaction.add(self._indexfile, curr * len(entry))
2682 if data[0]: 2741 if data[0]:
2683 dfh.write(data[0]) 2742 dfh.write(data[0])
2684 dfh.write(data[1]) 2743 dfh.write(data[1])
2685 if sidedata: 2744 if sidedata:
2686 dfh.write(sidedata) 2745 sdfh.write(sidedata)
2687 ifh.write(entry) 2746 ifh.write(entry)
2688 else: 2747 else:
2689 offset += curr * self.index.entry_size 2748 offset += curr * self.index.entry_size
2690 transaction.add(self._indexfile, offset) 2749 transaction.add(self._indexfile, offset)
2691 ifh.write(entry) 2750 ifh.write(entry)
2692 ifh.write(data[0]) 2751 ifh.write(data[0])
2693 ifh.write(data[1]) 2752 ifh.write(data[1])
2694 if sidedata: 2753 assert not sidedata
2695 ifh.write(sidedata)
2696 self._enforceinlinesize(transaction) 2754 self._enforceinlinesize(transaction)
2697 if self._docket is not None: 2755 if self._docket is not None:
2698 self._docket.index_end = self._writinghandles[0].tell() 2756 self._docket.index_end = self._writinghandles[0].tell()
2699 self._docket.data_end = self._writinghandles[1].tell() 2757 self._docket.data_end = self._writinghandles[1].tell()
2758 self._docket.sidedata_end = self._writinghandles[2].tell()
2700 2759
2701 nodemaputil.setup_persistent_nodemap(transaction, self) 2760 nodemaputil.setup_persistent_nodemap(transaction, self)
2702 2761
2703 def addgroup( 2762 def addgroup(
2704 self, 2763 self,
2864 transaction.add(self._datafile, data_end) 2923 transaction.add(self._datafile, data_end)
2865 end = rev * self.index.entry_size 2924 end = rev * self.index.entry_size
2866 else: 2925 else:
2867 end = data_end + (rev * self.index.entry_size) 2926 end = data_end + (rev * self.index.entry_size)
2868 2927
2928 if self._sidedatafile:
2929 sidedata_end = self.sidedata_cut_off(rev)
2930 transaction.add(self._sidedatafile, sidedata_end)
2931
2869 transaction.add(self._indexfile, end) 2932 transaction.add(self._indexfile, end)
2870 if self._docket is not None: 2933 if self._docket is not None:
2871 # XXX we could leverage the docket while stripping. However, it is 2934 # XXX we could leverage the docket while stripping. However, it is
2872 # not powerful enough at the time of this comment 2935 # not powerful enough at the time of this comment
2873 self._docket.index_end = end 2936 self._docket.index_end = end
2874 self._docket.data_end = data_end 2937 self._docket.data_end = data_end
2938 self._docket.sidedata_end = sidedata_end
2875 self._docket.write(transaction, stripping=True) 2939 self._docket.write(transaction, stripping=True)
2876 2940
2877 # then reset internal state in memory to forget those revisions 2941 # then reset internal state in memory to forget those revisions
2878 self._revisioncache = None 2942 self._revisioncache = None
2879 self._chaininfocache = util.lrucachedict(500) 2943 self._chaininfocache = util.lrucachedict(500)
3396 return 3460 return
3397 3461
3398 new_entries = [] 3462 new_entries = []
3399 # append the new sidedata 3463 # append the new sidedata
3400 with self._writing(transaction): 3464 with self._writing(transaction):
3401 ifh, dfh = self._writinghandles 3465 ifh, dfh, sdfh = self._writinghandles
3402 if self._docket is not None: 3466 dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
3403 dfh.seek(self._docket.data_end, os.SEEK_SET) 3467
3404 else: 3468 current_offset = sdfh.tell()
3405 dfh.seek(0, os.SEEK_END)
3406
3407 current_offset = dfh.tell()
3408 for rev in range(startrev, endrev + 1): 3469 for rev in range(startrev, endrev + 1):
3409 entry = self.index[rev] 3470 entry = self.index[rev]
3410 new_sidedata, flags = sidedatautil.run_sidedata_helpers( 3471 new_sidedata, flags = sidedatautil.run_sidedata_helpers(
3411 store=self, 3472 store=self,
3412 sidedata_helpers=helpers, 3473 sidedata_helpers=helpers,
3453 new_offset_flags, 3514 new_offset_flags,
3454 sidedata_compression_mode, 3515 sidedata_compression_mode,
3455 ) 3516 )
3456 3517
3457 # the sidedata computation might have moved the file cursors around 3518 # the sidedata computation might have moved the file cursors around
3458 dfh.seek(current_offset, os.SEEK_SET) 3519 sdfh.seek(current_offset, os.SEEK_SET)
3459 dfh.write(serialized_sidedata) 3520 sdfh.write(serialized_sidedata)
3460 new_entries.append(entry_update) 3521 new_entries.append(entry_update)
3461 current_offset += len(serialized_sidedata) 3522 current_offset += len(serialized_sidedata)
3462 if self._docket is not None: 3523 self._docket.sidedata_end = sdfh.tell()
3463 self._docket.data_end = dfh.tell()
3464 3524
3465 # rewrite the new index entries 3525 # rewrite the new index entries
3466 ifh.seek(startrev * self.index.entry_size) 3526 ifh.seek(startrev * self.index.entry_size)
3467 for i, e in enumerate(new_entries): 3527 for i, e in enumerate(new_entries):
3468 rev = startrev + i 3528 rev = startrev + i