comparison mercurial/revlog.py @ 47395:e6292eb33384
revlog: store sidedata in their own file
This makes sidedata manipulation simpler and results in more compact data to
traverse, for both data and sidedata, since the two are no longer interleaved in the same file.
Differential Revision: https://phab.mercurial-scm.org/D10787
author   | Pierre-Yves David <pierre-yves.david@octobus.net>
date     | Fri, 28 May 2021 23:41:17 +0200
parents  | 75e1104f23a2
children | 33d626910374
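Before the hunks, here is a minimal, self-contained sketch of the read path this layout enables (illustrative code, not part of the patch): a revision's sidedata is now located purely by the offset and size recorded in its index entry and read from a dedicated sidedata file, rather than being interleaved with deltas in the `.d` data file. The function name `read_sidedata_segment` and the use of a plain `open()` are assumptions made for the example; the real implementation goes through `_sidedatareadfp()` and the revlog opener, bounds-checks against the docket's `sidedata_end`, and may decompress the segment, as the hunks below show.

```python
import os


def read_sidedata_segment(sidedata_path, offset, size):
    """Read one revision's raw sidedata from the dedicated sidedata file.

    ``offset`` and ``size`` come from the revision's index entry; with
    sidedata stored in its own file, that pair is enough to locate the bytes.
    """
    if size == 0:
        return b''  # revisions without sidedata store a zero-length segment
    with open(sidedata_path, 'rb') as fp:
        fp.seek(offset, os.SEEK_SET)
        data = fp.read(size)
    if len(data) < size:
        # mirrors the PARTIAL_READ_MSG check performed by the patch
        raise ValueError(
            'partial read of %s: expected %d bytes from offset %d, got %d'
            % (sidedata_path, size, offset, len(data))
        )
    return data
```

Keeping data and sidedata in separate append-only files also means the docket only has to track three end offsets (index, data, sidedata), which is what the `_writinghandles` 3-tuple and the new `sidedata_end` bookkeeping in the hunks below maintain.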
47394:bcf92bdc2bca | 47395:e6292eb33384
1 # revlog.py - storage back-end for mercurial | 1 # revlog.py - storage back-end for mercurial |
2 # coding: utf8 | |
2 # | 3 # |
3 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com> | 4 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com> |
4 # | 5 # |
5 # This software may be used and distributed according to the terms of the | 6 # This software may be used and distributed according to the terms of the |
6 # GNU General Public License version 2 or any later version. | 7 # GNU General Public License version 2 or any later version. |
258 | 259 |
259 PARTIAL_READ_MSG = _( | 260 PARTIAL_READ_MSG = _( |
260 b'partial read of revlog %s; expected %d bytes from offset %d, got %d' | 261 b'partial read of revlog %s; expected %d bytes from offset %d, got %d' |
261 ) | 262 ) |
262 | 263 |
264 FILE_TOO_SHORT_MSG = _( | |
265 b'cannot read from revlog %s;' | |
266 b' expected %d bytes from offset %d, data size is %d' | |
267 ) | |
268 | |
263 | 269 |
264 class revlog(object): | 270 class revlog(object): |
265 """ | 271 """ |
266 the underlying revision storage object | 272 the underlying revision storage object |
267 | 273 |
399 self.radix = radix | 405 self.radix = radix |
400 | 406 |
401 self._docket_file = None | 407 self._docket_file = None |
402 self._indexfile = None | 408 self._indexfile = None |
403 self._datafile = None | 409 self._datafile = None |
410 self._sidedatafile = None | |
404 self._nodemap_file = None | 411 self._nodemap_file = None |
405 self.postfix = postfix | 412 self.postfix = postfix |
406 self._trypending = trypending | 413 self._trypending = trypending |
407 self.opener = opener | 414 self.opener = opener |
408 if persistentnodemap: | 415 if persistentnodemap: |
443 | 450 |
444 # Make copy of flag processors so each revlog instance can support | 451 # Make copy of flag processors so each revlog instance can support |
445 # custom flags. | 452 # custom flags. |
446 self._flagprocessors = dict(flagutil.flagprocessors) | 453 self._flagprocessors = dict(flagutil.flagprocessors) |
447 | 454 |
448 # 2-tuple of file handles being used for active writing. | 455 # 3-tuple of file handles being used for active writing. |
449 self._writinghandles = None | 456 self._writinghandles = None |
450 # prevent nesting of addgroup | 457 # prevent nesting of addgroup |
451 self._adding_group = None | 458 self._adding_group = None |
452 | 459 |
453 self._loadindex() | 460 self._loadindex() |
632 # main docket, so disable it for now. | 639 # main docket, so disable it for now. |
633 self._nodemap_file = None | 640 self._nodemap_file = None |
634 | 641 |
635 if self._docket is not None: | 642 if self._docket is not None: |
636 self._datafile = self._docket.data_filepath() | 643 self._datafile = self._docket.data_filepath() |
644 self._sidedatafile = self._docket.sidedata_filepath() | |
637 elif self.postfix is None: | 645 elif self.postfix is None: |
638 self._datafile = b'%s.d' % self.radix | 646 self._datafile = b'%s.d' % self.radix |
639 else: | 647 else: |
640 self._datafile = b'%s.d.%s' % (self.radix, self.postfix) | 648 self._datafile = b'%s.d.%s' % (self.radix, self.postfix) |
641 | 649 |
801 else: | 809 else: |
802 func = self._datafp | 810 func = self._datafp |
803 with func() as fp: | 811 with func() as fp: |
804 yield fp | 812 yield fp |
805 | 813 |
814 @contextlib.contextmanager | |
806 def _sidedatareadfp(self): | 815 def _sidedatareadfp(self): |
807 """file object suitable to read sidedata""" | 816 """file object suitable to read sidedata""" |
808 return self._datareadfp() | 817 if self._writinghandles: |
818 yield self._writinghandles[2] | |
819 else: | |
820 with self.opener(self._sidedatafile) as fp: | |
821 yield fp | |
809 | 822 |
810 def tiprev(self): | 823 def tiprev(self): |
811 return len(self.index) - 1 | 824 return len(self.index) - 1 |
812 | 825 |
813 def tip(self): | 826 def tip(self): |
906 | 919 |
907 # First tuple entry is 8 bytes. First 6 bytes are offset. Last 2 bytes | 920 # First tuple entry is 8 bytes. First 6 bytes are offset. Last 2 bytes |
908 # are flags. | 921 # are flags. |
909 def start(self, rev): | 922 def start(self, rev): |
910 return int(self.index[rev][0] >> 16) | 923 return int(self.index[rev][0] >> 16) |
924 | |
925 def sidedata_cut_off(self, rev): | |
926 sd_cut_off = self.index[rev][8] | |
927 if sd_cut_off != 0: | |
928 return sd_cut_off | |
929 # This is some annoying dance, because entries without sidedata | |
930 # currently use 0 as their offset (instead of previous-offset + | 
931 # previous-size) | |
932 # | |
933 # We should reconsider this sidedata → 0 sidedata_offset policy. | 
934 # In the meantime, we need this. | |
935 while 0 <= rev: | |
936 e = self.index[rev] | |
937 if e[9] != 0: | |
938 return e[8] + e[9] | |
939 rev -= 1 | |
940 return 0 | |
911 | 941 |
912 def flags(self, rev): | 942 def flags(self, rev): |
913 return self.index[rev][0] & 0xFFFF | 943 return self.index[rev][0] & 0xFFFF |
914 | 944 |
915 def length(self, rev): | 945 def length(self, rev): |
2072 if sidedata_size == 0: | 2102 if sidedata_size == 0: |
2073 return {} | 2103 return {} |
2074 | 2104 |
2075 # XXX this needs caching, as we do for data | 2105 # XXX this needs caching, as we do for data |
2076 with self._sidedatareadfp() as sdf: | 2106 with self._sidedatareadfp() as sdf: |
2077 sdf.seek(sidedata_offset) | 2107 if self._docket.sidedata_end < sidedata_offset + sidedata_size: |
2108 filename = self._sidedatafile | |
2109 end = self._docket.sidedata_end | |
2110 offset = sidedata_offset | |
2111 length = sidedata_size | |
2112 m = FILE_TOO_SHORT_MSG % (filename, length, offset, end) | |
2113 raise error.RevlogError(m) | |
2114 | |
2115 sdf.seek(sidedata_offset, os.SEEK_SET) | |
2078 comp_segment = sdf.read(sidedata_size) | 2116 comp_segment = sdf.read(sidedata_size) |
2079 | 2117 |
2080 if len(comp_segment) < sidedata_size: | 2118 if len(comp_segment) < sidedata_size: |
2081 filename = self._datafile | 2119 filename = self._sidedatafile |
2082 length = sidedata_size | 2120 length = sidedata_size |
2083 offset = sidedata_offset | 2121 offset = sidedata_offset |
2084 got = len(comp_segment) | 2122 got = len(comp_segment) |
2085 m = PARTIAL_READ_MSG % (filename, length, offset, got) | 2123 m = PARTIAL_READ_MSG % (filename, length, offset, got) |
2086 raise error.RevlogError(m) | 2124 raise error.RevlogError(m) |
2213 self._chunkclear() | 2251 self._chunkclear() |
2214 | 2252 |
2215 if existing_handles: | 2253 if existing_handles: |
2216 # switched from inline to conventional; reopen the index | 2254 # switched from inline to conventional; reopen the index |
2217 ifh = self.__index_write_fp() | 2255 ifh = self.__index_write_fp() |
2218 self._writinghandles = (ifh, new_dfh) | 2256 self._writinghandles = (ifh, new_dfh, None) |
2219 new_dfh = None | 2257 new_dfh = None |
2220 finally: | 2258 finally: |
2221 if new_dfh is not None: | 2259 if new_dfh is not None: |
2222 new_dfh.close() | 2260 new_dfh.close() |
2223 | 2261 |
2231 msg %= self.display_id | 2269 msg %= self.display_id |
2232 raise error.ProgrammingError(msg) | 2270 raise error.ProgrammingError(msg) |
2233 if self._writinghandles is not None: | 2271 if self._writinghandles is not None: |
2234 yield | 2272 yield |
2235 else: | 2273 else: |
2236 ifh = dfh = None | 2274 ifh = dfh = sdfh = None |
2237 try: | 2275 try: |
2238 r = len(self) | 2276 r = len(self) |
2239 # opening the data file. | 2277 # opening the data file. |
2240 dsize = 0 | 2278 dsize = 0 |
2241 if r: | 2279 if r: |
2251 except IOError as inst: | 2289 except IOError as inst: |
2252 if inst.errno != errno.ENOENT: | 2290 if inst.errno != errno.ENOENT: |
2253 raise | 2291 raise |
2254 dfh = self._datafp(b"w+") | 2292 dfh = self._datafp(b"w+") |
2255 transaction.add(self._datafile, dsize) | 2293 transaction.add(self._datafile, dsize) |
2294 if self._sidedatafile is not None: | |
2295 try: | |
2296 sdfh = self.opener(self._sidedatafile, mode=b"r+") | |
2297 sdfh.seek(self._docket.sidedata_end, os.SEEK_SET) | 
2298 except IOError as inst: | |
2299 if inst.errno != errno.ENOENT: | |
2300 raise | |
2301 sdfh = self.opener(self._sidedatafile, mode=b"w+") | |
2302 transaction.add( | |
2303 self._sidedatafile, self._docket.sidedata_end | |
2304 ) | |
2256 | 2305 |
2257 # opening the index file. | 2306 # opening the index file. |
2258 isize = r * self.index.entry_size | 2307 isize = r * self.index.entry_size |
2259 ifh = self.__index_write_fp() | 2308 ifh = self.__index_write_fp() |
2260 if self._inline: | 2309 if self._inline: |
2261 transaction.add(self._indexfile, dsize + isize) | 2310 transaction.add(self._indexfile, dsize + isize) |
2262 else: | 2311 else: |
2263 transaction.add(self._indexfile, isize) | 2312 transaction.add(self._indexfile, isize) |
2264 # exposing all file handles for writing. | 2313 # exposing all file handles for writing. |
2265 self._writinghandles = (ifh, dfh) | 2314 self._writinghandles = (ifh, dfh, sdfh) |
2266 yield | 2315 yield |
2267 if self._docket is not None: | 2316 if self._docket is not None: |
2268 self._write_docket(transaction) | 2317 self._write_docket(transaction) |
2269 finally: | 2318 finally: |
2270 self._writinghandles = None | 2319 self._writinghandles = None |
2271 if dfh is not None: | 2320 if dfh is not None: |
2321 dfh.close() | |
2322 if sdfh is not None: | |
2272 dfh.close() | 2323 sdfh.close() |
2273 # closing the index file last to avoid exposing references to | 2324 # closing the index file last to avoid exposing references to |
2274 # potentially unflushed data content. | 2325 # potentially unflushed data content. |
2275 if ifh is not None: | 2326 if ifh is not None: |
2276 ifh.close() | 2327 ifh.close() |
2511 prev = curr - 1 | 2562 prev = curr - 1 |
2512 | 2563 |
2513 offset = self._get_data_offset(prev) | 2564 offset = self._get_data_offset(prev) |
2514 | 2565 |
2515 if self._concurrencychecker: | 2566 if self._concurrencychecker: |
2516 ifh, dfh = self._writinghandles | 2567 ifh, dfh, sdfh = self._writinghandles |
2568 # XXX no checking for the sidedata file | |
2517 if self._inline: | 2569 if self._inline: |
2518 # offset is "as if" it were in the .d file, so we need to add on | 2570 # offset is "as if" it were in the .d file, so we need to add on |
2519 # the size of the entry metadata. | 2571 # the size of the entry metadata. |
2520 self._concurrencychecker( | 2572 self._concurrencychecker( |
2521 ifh, self._indexfile, offset + curr * self.index.entry_size | 2573 ifh, self._indexfile, offset + curr * self.index.entry_size |
2568 | 2620 |
2569 sidedata_compression_mode = COMP_MODE_INLINE | 2621 sidedata_compression_mode = COMP_MODE_INLINE |
2570 if sidedata and self.hassidedata: | 2622 if sidedata and self.hassidedata: |
2571 sidedata_compression_mode = COMP_MODE_PLAIN | 2623 sidedata_compression_mode = COMP_MODE_PLAIN |
2572 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) | 2624 serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) |
2573 sidedata_offset = offset + deltainfo.deltalen | 2625 sidedata_offset = self._docket.sidedata_end |
2574 h, comp_sidedata = self.compress(serialized_sidedata) | 2626 h, comp_sidedata = self.compress(serialized_sidedata) |
2575 if ( | 2627 if ( |
2576 h != b'u' | 2628 h != b'u' |
2577 and comp_sidedata[0:1] != b'\0' | 2629 and comp_sidedata[0:1] != b'\0' |
2578 and len(comp_sidedata) < len(serialized_sidedata) | 2630 and len(comp_sidedata) < len(serialized_sidedata) |
2620 entry, | 2672 entry, |
2621 deltainfo.data, | 2673 deltainfo.data, |
2622 link, | 2674 link, |
2623 offset, | 2675 offset, |
2624 serialized_sidedata, | 2676 serialized_sidedata, |
2677 sidedata_offset, | |
2625 ) | 2678 ) |
2626 | 2679 |
2627 rawtext = btext[0] | 2680 rawtext = btext[0] |
2628 | 2681 |
2629 if alwayscache and rawtext is None: | 2682 if alwayscache and rawtext is None: |
2646 if self._docket is None: | 2699 if self._docket is None: |
2647 return self.end(prev) | 2700 return self.end(prev) |
2648 else: | 2701 else: |
2649 return self._docket.data_end | 2702 return self._docket.data_end |
2650 | 2703 |
2651 def _writeentry(self, transaction, entry, data, link, offset, sidedata): | 2704 def _writeentry( |
2705 self, transaction, entry, data, link, offset, sidedata, sidedata_offset | |
2706 ): | |
2652 # Files opened in a+ mode have inconsistent behavior on various | 2707 # Files opened in a+ mode have inconsistent behavior on various |
2653 # platforms. Windows requires that a file positioning call be made | 2708 # platforms. Windows requires that a file positioning call be made |
2654 # when the file handle transitions between reads and writes. See | 2709 # when the file handle transitions between reads and writes. See |
2655 # 3686fa2b8eee and the mixedfilemodewrapper in windows.py. On other | 2710 # 3686fa2b8eee and the mixedfilemodewrapper in windows.py. On other |
2656 # platforms, Python or the platform itself can be buggy. Some versions | 2711 # platforms, Python or the platform itself can be buggy. Some versions |
2662 # the file handle is reused for reads and may be seeked there, we need | 2717 # the file handle is reused for reads and may be seeked there, we need |
2663 # to be careful before changing this. | 2718 # to be careful before changing this. |
2664 if self._writinghandles is None: | 2719 if self._writinghandles is None: |
2665 msg = b'adding revision outside `revlog._writing` context' | 2720 msg = b'adding revision outside `revlog._writing` context' |
2666 raise error.ProgrammingError(msg) | 2721 raise error.ProgrammingError(msg) |
2667 ifh, dfh = self._writinghandles | 2722 ifh, dfh, sdfh = self._writinghandles |
2668 if self._docket is None: | 2723 if self._docket is None: |
2669 ifh.seek(0, os.SEEK_END) | 2724 ifh.seek(0, os.SEEK_END) |
2670 else: | 2725 else: |
2671 ifh.seek(self._docket.index_end, os.SEEK_SET) | 2726 ifh.seek(self._docket.index_end, os.SEEK_SET) |
2672 if dfh: | 2727 if dfh: |
2673 if self._docket is None: | 2728 if self._docket is None: |
2674 dfh.seek(0, os.SEEK_END) | 2729 dfh.seek(0, os.SEEK_END) |
2675 else: | 2730 else: |
2676 dfh.seek(self._docket.data_end, os.SEEK_SET) | 2731 dfh.seek(self._docket.data_end, os.SEEK_SET) |
2732 if sdfh: | |
2733 sdfh.seek(self._docket.sidedata_end, os.SEEK_SET) | |
2677 | 2734 |
2678 curr = len(self) - 1 | 2735 curr = len(self) - 1 |
2679 if not self._inline: | 2736 if not self._inline: |
2680 transaction.add(self._datafile, offset) | 2737 transaction.add(self._datafile, offset) |
2738 if self._sidedatafile: | |
2739 transaction.add(self._sidedatafile, sidedata_offset) | |
2681 transaction.add(self._indexfile, curr * len(entry)) | 2740 transaction.add(self._indexfile, curr * len(entry)) |
2682 if data[0]: | 2741 if data[0]: |
2683 dfh.write(data[0]) | 2742 dfh.write(data[0]) |
2684 dfh.write(data[1]) | 2743 dfh.write(data[1]) |
2685 if sidedata: | 2744 if sidedata: |
2686 dfh.write(sidedata) | 2745 sdfh.write(sidedata) |
2687 ifh.write(entry) | 2746 ifh.write(entry) |
2688 else: | 2747 else: |
2689 offset += curr * self.index.entry_size | 2748 offset += curr * self.index.entry_size |
2690 transaction.add(self._indexfile, offset) | 2749 transaction.add(self._indexfile, offset) |
2691 ifh.write(entry) | 2750 ifh.write(entry) |
2692 ifh.write(data[0]) | 2751 ifh.write(data[0]) |
2693 ifh.write(data[1]) | 2752 ifh.write(data[1]) |
2694 if sidedata: | 2753 assert not sidedata |
2695 ifh.write(sidedata) | |
2696 self._enforceinlinesize(transaction) | 2754 self._enforceinlinesize(transaction) |
2697 if self._docket is not None: | 2755 if self._docket is not None: |
2698 self._docket.index_end = self._writinghandles[0].tell() | 2756 self._docket.index_end = self._writinghandles[0].tell() |
2699 self._docket.data_end = self._writinghandles[1].tell() | 2757 self._docket.data_end = self._writinghandles[1].tell() |
2758 self._docket.sidedata_end = self._writinghandles[2].tell() | |
2700 | 2759 |
2701 nodemaputil.setup_persistent_nodemap(transaction, self) | 2760 nodemaputil.setup_persistent_nodemap(transaction, self) |
2702 | 2761 |
2703 def addgroup( | 2762 def addgroup( |
2704 self, | 2763 self, |
2864 transaction.add(self._datafile, data_end) | 2923 transaction.add(self._datafile, data_end) |
2865 end = rev * self.index.entry_size | 2924 end = rev * self.index.entry_size |
2866 else: | 2925 else: |
2867 end = data_end + (rev * self.index.entry_size) | 2926 end = data_end + (rev * self.index.entry_size) |
2868 | 2927 |
2928 if self._sidedatafile: | |
2929 sidedata_end = self.sidedata_cut_off(rev) | |
2930 transaction.add(self._sidedatafile, sidedata_end) | |
2931 | |
2869 transaction.add(self._indexfile, end) | 2932 transaction.add(self._indexfile, end) |
2870 if self._docket is not None: | 2933 if self._docket is not None: |
2871 # XXX we could leverage the docket while stripping. However it is | 2934 # XXX we could leverage the docket while stripping. However it is |
2872 # not powerful enough at the time of this comment | 2935 # not powerful enough at the time of this comment |
2873 self._docket.index_end = end | 2936 self._docket.index_end = end |
2874 self._docket.data_end = data_end | 2937 self._docket.data_end = data_end |
2938 self._docket.sidedata_end = sidedata_end | |
2875 self._docket.write(transaction, stripping=True) | 2939 self._docket.write(transaction, stripping=True) |
2876 | 2940 |
2877 # then reset internal state in memory to forget those revisions | 2941 # then reset internal state in memory to forget those revisions |
2878 self._revisioncache = None | 2942 self._revisioncache = None |
2879 self._chaininfocache = util.lrucachedict(500) | 2943 self._chaininfocache = util.lrucachedict(500) |
3396 return | 3460 return |
3397 | 3461 |
3398 new_entries = [] | 3462 new_entries = [] |
3399 # append the new sidedata | 3463 # append the new sidedata |
3400 with self._writing(transaction): | 3464 with self._writing(transaction): |
3401 ifh, dfh = self._writinghandles | 3465 ifh, dfh, sdfh = self._writinghandles |
3402 if self._docket is not None: | 3466 sdfh.seek(self._docket.sidedata_end, os.SEEK_SET) |
3403 dfh.seek(self._docket.data_end, os.SEEK_SET) | 3467 |
3404 else: | 3468 current_offset = sdfh.tell() |
3405 dfh.seek(0, os.SEEK_END) | |
3406 | |
3407 current_offset = dfh.tell() | |
3408 for rev in range(startrev, endrev + 1): | 3469 for rev in range(startrev, endrev + 1): |
3409 entry = self.index[rev] | 3470 entry = self.index[rev] |
3410 new_sidedata, flags = sidedatautil.run_sidedata_helpers( | 3471 new_sidedata, flags = sidedatautil.run_sidedata_helpers( |
3411 store=self, | 3472 store=self, |
3412 sidedata_helpers=helpers, | 3473 sidedata_helpers=helpers, |
3453 new_offset_flags, | 3514 new_offset_flags, |
3454 sidedata_compression_mode, | 3515 sidedata_compression_mode, |
3455 ) | 3516 ) |
3456 | 3517 |
3457 # the sidedata computation might have moved the file cursors around | 3518 # the sidedata computation might have moved the file cursors around |
3458 dfh.seek(current_offset, os.SEEK_SET) | 3519 sdfh.seek(current_offset, os.SEEK_SET) |
3459 dfh.write(serialized_sidedata) | 3520 sdfh.write(serialized_sidedata) |
3460 new_entries.append(entry_update) | 3521 new_entries.append(entry_update) |
3461 current_offset += len(serialized_sidedata) | 3522 current_offset += len(serialized_sidedata) |
3462 if self._docket is not None: | 3523 self._docket.sidedata_end = sdfh.tell() |
3463 self._docket.data_end = dfh.tell() | |
3464 | 3524 |
3465 # rewrite the new index entries | 3525 # rewrite the new index entries |
3466 ifh.seek(startrev * self.index.entry_size) | 3526 ifh.seek(startrev * self.index.entry_size) |
3467 for i, e in enumerate(new_entries): | 3527 for i, e in enumerate(new_entries): |
3468 rev = startrev + i | 3528 rev = startrev + i |