comparison mercurial/revlog.py @ 47425:e0a314bcbc9d

revlog: Extract low-level random-access file read caching logic The `revlog` class does many things, among which fulfilling requests for arbitrary byte slices from the revlog "data file" by reading a larger chunk and caching it in memory, in order to reduce the number of system calls. This extracts that logic into a new class, so that it may later also be used for the side-data file (with another instance of that class). The copyright notice of the new file does not include a date or author name since such information tends not to be kept up-to-date: https://www.linuxfoundation.org/en/blog/copyright-notices-in-open-source-software-projects/ Differential Revision: https://phab.mercurial-scm.org/D10878
author Simon Sapin <simon.sapin@octobus.net>
date Tue, 08 Jun 2021 19:55:00 +0200
parents 5fbac82a8780
children cac0e0621ceb
comparison
equal deleted inserted replaced
47424:f77404040776 47425:e0a314bcbc9d
84 censor, 84 censor,
85 deltas as deltautil, 85 deltas as deltautil,
86 docket as docketutil, 86 docket as docketutil,
87 flagutil, 87 flagutil,
88 nodemap as nodemaputil, 88 nodemap as nodemaputil,
89 randomaccessfile,
89 revlogv0, 90 revlogv0,
90 sidedata as sidedatautil, 91 sidedata as sidedatautil,
91 ) 92 )
92 from .utils import ( 93 from .utils import (
93 storageutil, 94 storageutil,
123 # Aliased for performance. 124 # Aliased for performance.
124 _zlibdecompress = zlib.decompress 125 _zlibdecompress = zlib.decompress
125 126
126 # max size of revlog with inline data 127 # max size of revlog with inline data
127 _maxinline = 131072 128 _maxinline = 131072
128 _chunksize = 1048576
129 129
130 # Flag processors for REVIDX_ELLIPSIS. 130 # Flag processors for REVIDX_ELLIPSIS.
131 def ellipsisreadprocessor(rl, text): 131 def ellipsisreadprocessor(rl, text):
132 return text, False 132 return text, False
133 133
229 229
230 230
231 # corresponds to uncompressed length of indexformatng (2 gigs, 4-byte 231 # corresponds to uncompressed length of indexformatng (2 gigs, 4-byte
232 # signed integer) 232 # signed integer)
233 _maxentrysize = 0x7FFFFFFF 233 _maxentrysize = 0x7FFFFFFF
234
235 PARTIAL_READ_MSG = _(
236 b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
237 )
238 234
239 FILE_TOO_SHORT_MSG = _( 235 FILE_TOO_SHORT_MSG = _(
240 b'cannot read from revlog %s;' 236 b'cannot read from revlog %s;'
241 b' expected %d bytes from offset %d, data size is %d' 237 b' expected %d bytes from offset %d, data size is %d'
242 ) 238 )
603 self._parse_index = parse_index_v1_nodemap 599 self._parse_index = parse_index_v1_nodemap
604 elif use_rust_index: 600 elif use_rust_index:
605 self._parse_index = parse_index_v1_mixed 601 self._parse_index = parse_index_v1_mixed
606 try: 602 try:
607 d = self._parse_index(index_data, self._inline) 603 d = self._parse_index(index_data, self._inline)
608 index, _chunkcache = d 604 index, chunkcache = d
609 use_nodemap = ( 605 use_nodemap = (
610 not self._inline 606 not self._inline
611 and self._nodemap_file is not None 607 and self._nodemap_file is not None
612 and util.safehasattr(index, 'update_nodemap_data') 608 and util.safehasattr(index, 'update_nodemap_data')
613 ) 609 )
624 index.update_nodemap_data(*nodemap_data) 620 index.update_nodemap_data(*nodemap_data)
625 except (ValueError, IndexError): 621 except (ValueError, IndexError):
626 raise error.RevlogError( 622 raise error.RevlogError(
627 _(b"index %s is corrupted") % self.display_id 623 _(b"index %s is corrupted") % self.display_id
628 ) 624 )
629 self.index, self._chunkcache = d 625 self.index = index
630 if not self._chunkcache: 626 self._segmentfile = randomaccessfile.randomaccessfile(
631 self._chunkclear() 627 self.opener,
628 (self._indexfile if self._inline else self._datafile),
629 self._chunkcachesize,
630 chunkcache,
631 )
632 # revnum -> (chain-length, sum-delta-length) 632 # revnum -> (chain-length, sum-delta-length)
633 self._chaininfocache = util.lrucachedict(500) 633 self._chaininfocache = util.lrucachedict(500)
634 # revlog header -> revlog compressor 634 # revlog header -> revlog compressor
635 self._decompressors = {} 635 self._decompressors = {}
636 636
707 def _datafp(self, mode=b'r'): 707 def _datafp(self, mode=b'r'):
708 """file object for the revlog's data file""" 708 """file object for the revlog's data file"""
709 return self.opener(self._datafile, mode=mode) 709 return self.opener(self._datafile, mode=mode)
710 710
711 @contextlib.contextmanager 711 @contextlib.contextmanager
712 def _datareadfp(self, existingfp=None):
713 """file object suitable to read data"""
714 # Use explicit file handle, if given.
715 if existingfp is not None:
716 yield existingfp
717
718 # Use a file handle being actively used for writes, if available.
719 # There is some danger to doing this because reads will seek the
720 # file. However, _writeentry() performs a SEEK_END before all writes,
721 # so we should be safe.
722 elif self._writinghandles:
723 if self._inline:
724 yield self._writinghandles[0]
725 else:
726 yield self._writinghandles[1]
727
728 # Otherwise open a new file handle.
729 else:
730 if self._inline:
731 func = self._indexfp
732 else:
733 func = self._datafp
734 with func() as fp:
735 yield fp
736
737 @contextlib.contextmanager
738 def _sidedatareadfp(self): 712 def _sidedatareadfp(self):
739 """file object suitable to read sidedata""" 713 """file object suitable to read sidedata"""
740 if self._writinghandles: 714 if self._writinghandles:
741 yield self._writinghandles[2] 715 yield self._writinghandles[2]
742 else: 716 else:
805 nodemaputil.setup_persistent_nodemap(transaction, self) 779 nodemaputil.setup_persistent_nodemap(transaction, self)
806 780
807 def clearcaches(self): 781 def clearcaches(self):
808 self._revisioncache = None 782 self._revisioncache = None
809 self._chainbasecache.clear() 783 self._chainbasecache.clear()
810 self._chunkcache = (0, b'') 784 self._segmentfile.clear_cache()
811 self._pcache = {} 785 self._pcache = {}
812 self._nodemap_docket = None 786 self._nodemap_docket = None
813 self.index.clearcaches() 787 self.index.clearcaches()
814 # The python code is the one responsible for validating the docket, we 788 # The python code is the one responsible for validating the docket, we
815 # end up having to refresh it here. 789 # end up having to refresh it here.
1627 returns True if text is different than what is stored. 1601 returns True if text is different than what is stored.
1628 """ 1602 """
1629 p1, p2 = self.parents(node) 1603 p1, p2 = self.parents(node)
1630 return storageutil.hashrevisionsha1(text, p1, p2) != node 1604 return storageutil.hashrevisionsha1(text, p1, p2) != node
1631 1605
1632 def _cachesegment(self, offset, data):
1633 """Add a segment to the revlog cache.
1634
1635 Accepts an absolute offset and the data that is at that location.
1636 """
1637 o, d = self._chunkcache
1638 # try to add to existing cache
1639 if o + len(d) == offset and len(d) + len(data) < _chunksize:
1640 self._chunkcache = o, d + data
1641 else:
1642 self._chunkcache = offset, data
1643
1644 def _readsegment(self, offset, length, df=None):
1645 """Load a segment of raw data from the revlog.
1646
1647 Accepts an absolute offset, length to read, and an optional existing
1648 file handle to read from.
1649
1650 If an existing file handle is passed, it will be seeked and the
1651 original seek position will NOT be restored.
1652
1653 Returns a str or buffer of raw byte data.
1654
1655 Raises if the requested number of bytes could not be read.
1656 """
1657 # Cache data both forward and backward around the requested
1658 # data, in a fixed size window. This helps speed up operations
1659 # involving reading the revlog backwards.
1660 cachesize = self._chunkcachesize
1661 realoffset = offset & ~(cachesize - 1)
1662 reallength = (
1663 (offset + length + cachesize) & ~(cachesize - 1)
1664 ) - realoffset
1665 with self._datareadfp(df) as df:
1666 df.seek(realoffset)
1667 d = df.read(reallength)
1668
1669 self._cachesegment(realoffset, d)
1670 if offset != realoffset or reallength != length:
1671 startoffset = offset - realoffset
1672 if len(d) - startoffset < length:
1673 filename = self._indexfile if self._inline else self._datafile
1674 got = len(d) - startoffset
1675 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1676 raise error.RevlogError(m)
1677 return util.buffer(d, startoffset, length)
1678
1679 if len(d) < length:
1680 filename = self._indexfile if self._inline else self._datafile
1681 got = len(d) - startoffset
1682 m = PARTIAL_READ_MSG % (filename, length, offset, got)
1683 raise error.RevlogError(m)
1684
1685 return d
1686
1687 def _getsegment(self, offset, length, df=None):
1688 """Obtain a segment of raw data from the revlog.
1689
1690 Accepts an absolute offset, length of bytes to obtain, and an
1691 optional file handle to the already-opened revlog. If the file
1692 handle is used, it's original seek position will not be preserved.
1693
1694 Requests for data may be returned from a cache.
1695
1696 Returns a str or a buffer instance of raw byte data.
1697 """
1698 o, d = self._chunkcache
1699 l = len(d)
1700
1701 # is it in the cache?
1702 cachestart = offset - o
1703 cacheend = cachestart + length
1704 if cachestart >= 0 and cacheend <= l:
1705 if cachestart == 0 and cacheend == l:
1706 return d # avoid a copy
1707 return util.buffer(d, cachestart, cacheend - cachestart)
1708
1709 return self._readsegment(offset, length, df=df)
1710
1711 def _getsegmentforrevs(self, startrev, endrev, df=None): 1606 def _getsegmentforrevs(self, startrev, endrev, df=None):
1712 """Obtain a segment of raw data corresponding to a range of revisions. 1607 """Obtain a segment of raw data corresponding to a range of revisions.
1713 1608
1714 Accepts the start and end revisions and an optional already-open 1609 Accepts the start and end revisions and an optional already-open
1715 file handle to be used for reading. If the file handle is read, its 1610 file handle to be used for reading. If the file handle is read, its
1738 if self._inline: 1633 if self._inline:
1739 start += (startrev + 1) * self.index.entry_size 1634 start += (startrev + 1) * self.index.entry_size
1740 end += (endrev + 1) * self.index.entry_size 1635 end += (endrev + 1) * self.index.entry_size
1741 length = end - start 1636 length = end - start
1742 1637
1743 return start, self._getsegment(start, length, df=df) 1638 return start, self._segmentfile.read_chunk(start, length, df)
1744 1639
1745 def _chunk(self, rev, df=None): 1640 def _chunk(self, rev, df=None):
1746 """Obtain a single decompressed chunk for a revision. 1641 """Obtain a single decompressed chunk for a revision.
1747 1642
1748 Accepts an integer revision and an optional already-open file handle 1643 Accepts an integer revision and an optional already-open file handle
1829 msg = b'unknown compression mode %d' 1724 msg = b'unknown compression mode %d'
1830 msg %= comp_mode 1725 msg %= comp_mode
1831 raise error.RevlogError(msg) 1726 raise error.RevlogError(msg)
1832 1727
1833 return l 1728 return l
1834
1835 def _chunkclear(self):
1836 """Clear the raw chunk cache."""
1837 self._chunkcache = (0, b'')
1838 1729
1839 def deltaparent(self, rev): 1730 def deltaparent(self, rev):
1840 """return deltaparent of the given revision""" 1731 """return deltaparent of the given revision"""
1841 base = self.index[rev][3] 1732 base = self.index[rev][3]
1842 if base == rev: 1733 if base == rev:
2041 if len(comp_segment) < sidedata_size: 1932 if len(comp_segment) < sidedata_size:
2042 filename = self._sidedatafile 1933 filename = self._sidedatafile
2043 length = sidedata_size 1934 length = sidedata_size
2044 offset = sidedata_offset 1935 offset = sidedata_offset
2045 got = len(comp_segment) 1936 got = len(comp_segment)
2046 m = PARTIAL_READ_MSG % (filename, length, offset, got) 1937 m = randomaccessfile.PARTIAL_READ_MSG % (
1938 filename,
1939 length,
1940 offset,
1941 got,
1942 )
2047 raise error.RevlogError(m) 1943 raise error.RevlogError(m)
2048 1944
2049 comp = self.index[rev][11] 1945 comp = self.index[rev][11]
2050 if comp == COMP_MODE_PLAIN: 1946 if comp == COMP_MODE_PLAIN:
2051 segment = comp_segment 1947 segment = comp_segment
2134 fp.flush() 2030 fp.flush()
2135 fp.close() 2031 fp.close()
2136 # We can't use the cached file handle after close(). So prevent 2032 # We can't use the cached file handle after close(). So prevent
2137 # its usage. 2033 # its usage.
2138 self._writinghandles = None 2034 self._writinghandles = None
2035 self._segmentfile.writing_handle = None
2139 2036
2140 new_dfh = self._datafp(b'w+') 2037 new_dfh = self._datafp(b'w+')
2141 new_dfh.truncate(0) # drop any potentially existing data 2038 new_dfh.truncate(0) # drop any potentially existing data
2142 try: 2039 try:
2143 with self._indexfp() as read_ifh: 2040 with self._indexfp() as read_ifh:
2169 # the temp file replace the real index when we exit the context 2066 # the temp file replace the real index when we exit the context
2170 # manager 2067 # manager
2171 2068
2172 tr.replace(self._indexfile, trindex * self.index.entry_size) 2069 tr.replace(self._indexfile, trindex * self.index.entry_size)
2173 nodemaputil.setup_persistent_nodemap(tr, self) 2070 nodemaputil.setup_persistent_nodemap(tr, self)
2174 self._chunkclear() 2071 self._segmentfile = randomaccessfile.randomaccessfile(
2072 self.opener,
2073 self._datafile,
2074 self._chunkcachesize,
2075 )
2175 2076
2176 if existing_handles: 2077 if existing_handles:
2177 # switched from inline to conventional reopen the index 2078 # switched from inline to conventional reopen the index
2178 ifh = self.__index_write_fp() 2079 ifh = self.__index_write_fp()
2179 self._writinghandles = (ifh, new_dfh, None) 2080 self._writinghandles = (ifh, new_dfh, None)
2081 self._segmentfile.writing_handle = new_dfh
2180 new_dfh = None 2082 new_dfh = None
2181 finally: 2083 finally:
2182 if new_dfh is not None: 2084 if new_dfh is not None:
2183 new_dfh.close() 2085 new_dfh.close()
2184 2086
2233 transaction.add(self._indexfile, dsize + isize) 2135 transaction.add(self._indexfile, dsize + isize)
2234 else: 2136 else:
2235 transaction.add(self._indexfile, isize) 2137 transaction.add(self._indexfile, isize)
2236 # exposing all file handle for writing. 2138 # exposing all file handle for writing.
2237 self._writinghandles = (ifh, dfh, sdfh) 2139 self._writinghandles = (ifh, dfh, sdfh)
2140 self._segmentfile.writing_handle = ifh if self._inline else dfh
2238 yield 2141 yield
2239 if self._docket is not None: 2142 if self._docket is not None:
2240 self._write_docket(transaction) 2143 self._write_docket(transaction)
2241 finally: 2144 finally:
2242 self._writinghandles = None 2145 self._writinghandles = None
2146 self._segmentfile.writing_handle = None
2243 if dfh is not None: 2147 if dfh is not None:
2244 dfh.close() 2148 dfh.close()
2245 if sdfh is not None: 2149 if sdfh is not None:
2246 sdfh.close() 2150 sdfh.close()
2247 # closing the index file last to avoid exposing referent to 2151 # closing the index file last to avoid exposing referent to
2871 self._docket.write(transaction, stripping=True) 2775 self._docket.write(transaction, stripping=True)
2872 2776
2873 # then reset internal state in memory to forget those revisions 2777 # then reset internal state in memory to forget those revisions
2874 self._revisioncache = None 2778 self._revisioncache = None
2875 self._chaininfocache = util.lrucachedict(500) 2779 self._chaininfocache = util.lrucachedict(500)
2876 self._chunkclear() 2780 self._segmentfile.clear_cache()
2877 2781
2878 del self.index[rev:-1] 2782 del self.index[rev:-1]
2879 2783
2880 def checksize(self): 2784 def checksize(self):
2881 """Check size of index and data files 2785 """Check size of index and data files