comparison mercurial/revlog.py @ 47425:e0a314bcbc9d
revlog: Extract low-level random-access file read caching logic
The `revlog` class does many things, among them fulfilling requests for
arbitrary byte slices from the revlog "data file" by reading a larger chunk
and caching it in memory, in order to reduce the number of system calls.
This extracts that logic into a new class, so that it may later also be used
for the side-data file (with another instance of that class).
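For orientation, here is a minimal sketch of the extracted class, reconstructed from the call sites in this diff (the constructor arguments, `read_chunk`, `clear_cache`, and the `writing_handle` attribute) and from the caching logic removed below. It is a sketch only; error handling for partial reads is omitted, and the actual `mercurial/randomaccessfile.py` may differ in detail:

```python
# A sketch only: names beyond those visible in this diff are assumptions.
class randomaccessfile:
    """Serve arbitrary byte-range reads from a file, caching one larger
    aligned chunk in memory to reduce the number of system calls."""

    def __init__(self, opener, filename, chunk_size, initial_cache=None):
        self.opener = opener          # callable returning an open file object
        self.filename = filename
        self.chunk_size = chunk_size  # must be a power of two
        self.writing_handle = None    # kept in sync by the revlog write paths
        self._cache_offset, self._cache = initial_cache or (0, b'')

    def clear_cache(self):
        self._cache_offset, self._cache = 0, b''

    def read_chunk(self, offset, length, existing_fp=None):
        # Serve from the cache when it fully covers the requested range.
        start = offset - self._cache_offset
        if 0 <= start and start + length <= len(self._cache):
            return self._cache[start:start + length]
        # Otherwise read a window aligned to chunk_size around the request
        # and cache it for subsequent nearby reads.
        real_offset = offset & ~(self.chunk_size - 1)
        real_length = (
            (offset + length + self.chunk_size) & ~(self.chunk_size - 1)
        ) - real_offset
        fp = existing_fp if existing_fp is not None else self.writing_handle
        if fp is not None:
            fp.seek(real_offset)
            data = fp.read(real_length)
        else:
            with self.opener(self.filename) as new_fp:
                new_fp.seek(real_offset)
                data = new_fp.read(real_length)
        self._cache_offset, self._cache = real_offset, data
        return data[offset - real_offset : offset - real_offset + length]
```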
The copyright notice of the new file does not include a date or author name
since such information tends not to be kept up to date:
https://www.linuxfoundation.org/en/blog/copyright-notices-in-open-source-software-projects/
Differential Revision: https://phab.mercurial-scm.org/D10878
author | Simon Sapin <simon.sapin@octobus.net> |
---|---|
date | Tue, 08 Jun 2021 19:55:00 +0200 |
parents | 5fbac82a8780 |
children | cac0e0621ceb |
47424:f77404040776 | 47425:e0a314bcbc9d |
---|---|
84 censor, | 84 censor, |
85 deltas as deltautil, | 85 deltas as deltautil, |
86 docket as docketutil, | 86 docket as docketutil, |
87 flagutil, | 87 flagutil, |
88 nodemap as nodemaputil, | 88 nodemap as nodemaputil, |
89 randomaccessfile, | |
89 revlogv0, | 90 revlogv0, |
90 sidedata as sidedatautil, | 91 sidedata as sidedatautil, |
91 ) | 92 ) |
92 from .utils import ( | 93 from .utils import ( |
93 storageutil, | 94 storageutil, |
123 # Aliased for performance. | 124 # Aliased for performance. |
124 _zlibdecompress = zlib.decompress | 125 _zlibdecompress = zlib.decompress |
125 | 126 |
126 # max size of revlog with inline data | 127 # max size of revlog with inline data |
127 _maxinline = 131072 | 128 _maxinline = 131072 |
128 _chunksize = 1048576 | |
129 | 129 |
130 # Flag processors for REVIDX_ELLIPSIS. | 130 # Flag processors for REVIDX_ELLIPSIS. |
131 def ellipsisreadprocessor(rl, text): | 131 def ellipsisreadprocessor(rl, text): |
132 return text, False | 132 return text, False |
133 | 133 |
229 | 229 |
230 | 230 |
231 # corresponds to uncompressed length of indexformatng (2 gigs, 4-byte | 231 # corresponds to uncompressed length of indexformatng (2 gigs, 4-byte |
232 # signed integer) | 232 # signed integer) |
233 _maxentrysize = 0x7FFFFFFF | 233 _maxentrysize = 0x7FFFFFFF |
234 | |
235 PARTIAL_READ_MSG = _( | |
236 b'partial read of revlog %s; expected %d bytes from offset %d, got %d' | |
237 ) | |
238 | 234 |
239 FILE_TOO_SHORT_MSG = _( | 235 FILE_TOO_SHORT_MSG = _( |
240 b'cannot read from revlog %s;' | 236 b'cannot read from revlog %s;' |
241 b' expected %d bytes from offset %d, data size is %d' | 237 b' expected %d bytes from offset %d, data size is %d' |
242 ) | 238 ) |
603 self._parse_index = parse_index_v1_nodemap | 599 self._parse_index = parse_index_v1_nodemap |
604 elif use_rust_index: | 600 elif use_rust_index: |
605 self._parse_index = parse_index_v1_mixed | 601 self._parse_index = parse_index_v1_mixed |
606 try: | 602 try: |
607 d = self._parse_index(index_data, self._inline) | 603 d = self._parse_index(index_data, self._inline) |
608 index, _chunkcache = d | 604 index, chunkcache = d |
609 use_nodemap = ( | 605 use_nodemap = ( |
610 not self._inline | 606 not self._inline |
611 and self._nodemap_file is not None | 607 and self._nodemap_file is not None |
612 and util.safehasattr(index, 'update_nodemap_data') | 608 and util.safehasattr(index, 'update_nodemap_data') |
613 ) | 609 ) |
624 index.update_nodemap_data(*nodemap_data) | 620 index.update_nodemap_data(*nodemap_data) |
625 except (ValueError, IndexError): | 621 except (ValueError, IndexError): |
626 raise error.RevlogError( | 622 raise error.RevlogError( |
627 _(b"index %s is corrupted") % self.display_id | 623 _(b"index %s is corrupted") % self.display_id |
628 ) | 624 ) |
629 self.index, self._chunkcache = d | 625 self.index = index |
630 if not self._chunkcache: | 626 self._segmentfile = randomaccessfile.randomaccessfile( |
631 self._chunkclear() | 627 self.opener, |
628 (self._indexfile if self._inline else self._datafile), | |
629 self._chunkcachesize, | |
630 chunkcache, | |
631 ) | |
632 # revnum -> (chain-length, sum-delta-length) | 632 # revnum -> (chain-length, sum-delta-length) |
633 self._chaininfocache = util.lrucachedict(500) | 633 self._chaininfocache = util.lrucachedict(500) |
634 # revlog header -> revlog compressor | 634 # revlog header -> revlog compressor |
635 self._decompressors = {} | 635 self._decompressors = {} |
636 | 636 |
707 def _datafp(self, mode=b'r'): | 707 def _datafp(self, mode=b'r'): |
708 """file object for the revlog's data file""" | 708 """file object for the revlog's data file""" |
709 return self.opener(self._datafile, mode=mode) | 709 return self.opener(self._datafile, mode=mode) |
710 | 710 |
711 @contextlib.contextmanager | 711 @contextlib.contextmanager |
712 def _datareadfp(self, existingfp=None): | |
713 """file object suitable to read data""" | |
714 # Use explicit file handle, if given. | |
715 if existingfp is not None: | |
716 yield existingfp | |
717 | |
718 # Use a file handle being actively used for writes, if available. | |
719 # There is some danger to doing this because reads will seek the | |
720 # file. However, _writeentry() performs a SEEK_END before all writes, | |
721 # so we should be safe. | |
722 elif self._writinghandles: | |
723 if self._inline: | |
724 yield self._writinghandles[0] | |
725 else: | |
726 yield self._writinghandles[1] | |
727 | |
728 # Otherwise open a new file handle. | |
729 else: | |
730 if self._inline: | |
731 func = self._indexfp | |
732 else: | |
733 func = self._datafp | |
734 with func() as fp: | |
735 yield fp | |
736 | |
737 @contextlib.contextmanager | |
738 def _sidedatareadfp(self): | 712 def _sidedatareadfp(self): |
739 """file object suitable to read sidedata""" | 713 """file object suitable to read sidedata""" |
740 if self._writinghandles: | 714 if self._writinghandles: |
741 yield self._writinghandles[2] | 715 yield self._writinghandles[2] |
742 else: | 716 else: |
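The removed `_datareadfp` above encodes a subtle handle-selection rule the extraction has to preserve: an explicitly passed handle wins, then an already-open write handle may be reused (safe only because `_writeentry()` seeks to the end before every write), and only then is a fresh handle opened. A hedged sketch of how that order might look inside the new class (the method name and its placement are assumptions):

```python
import contextlib

# Sketch of the selection order from the removed _datareadfp; the method
# name and its placement in the new class are assumptions.
@contextlib.contextmanager
def _open_for_read(self, existing_fp=None):
    if existing_fp is not None:
        # 1. Use the explicitly provided handle; its seek position is
        # not restored, matching the documented behavior above.
        yield existing_fp
    elif self.writing_handle is not None:
        # 2. Reuse the active write handle. Reads will seek it, which is
        # safe only because every write seeks to EOF first.
        yield self.writing_handle
    else:
        # 3. Otherwise open (and close) a fresh handle.
        with self.opener(self.filename) as fp:
            yield fp
```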
805 nodemaputil.setup_persistent_nodemap(transaction, self) | 779 nodemaputil.setup_persistent_nodemap(transaction, self) |
806 | 780 |
807 def clearcaches(self): | 781 def clearcaches(self): |
808 self._revisioncache = None | 782 self._revisioncache = None |
809 self._chainbasecache.clear() | 783 self._chainbasecache.clear() |
810 self._chunkcache = (0, b'') | 784 self._segmentfile.clear_cache() |
811 self._pcache = {} | 785 self._pcache = {} |
812 self._nodemap_docket = None | 786 self._nodemap_docket = None |
813 self.index.clearcaches() | 787 self.index.clearcaches() |
814 # The python code is the one responsible for validating the docket, we | 788 # The python code is the one responsible for validating the docket, we |
815 # end up having to refresh it here. | 789 # end up having to refresh it here. |
1627 returns True if text is different than what is stored. | 1601 returns True if text is different than what is stored. |
1628 """ | 1602 """ |
1629 p1, p2 = self.parents(node) | 1603 p1, p2 = self.parents(node) |
1630 return storageutil.hashrevisionsha1(text, p1, p2) != node | 1604 return storageutil.hashrevisionsha1(text, p1, p2) != node |
1631 | 1605 |
1632 def _cachesegment(self, offset, data): | |
1633 """Add a segment to the revlog cache. | |
1634 | |
1635 Accepts an absolute offset and the data that is at that location. | |
1636 """ | |
1637 o, d = self._chunkcache | |
1638 # try to add to existing cache | |
1639 if o + len(d) == offset and len(d) + len(data) < _chunksize: | |
1640 self._chunkcache = o, d + data | |
1641 else: | |
1642 self._chunkcache = offset, data | |
1643 | |
1644 def _readsegment(self, offset, length, df=None): | |
1645 """Load a segment of raw data from the revlog. | |
1646 | |
1647 Accepts an absolute offset, length to read, and an optional existing | |
1648 file handle to read from. | |
1649 | |
1650 If an existing file handle is passed, it will be seeked and the | |
1651 original seek position will NOT be restored. | |
1652 | |
1653 Returns a str or buffer of raw byte data. | |
1654 | |
1655 Raises if the requested number of bytes could not be read. | |
1656 """ | |
1657 # Cache data both forward and backward around the requested | |
1658 # data, in a fixed size window. This helps speed up operations | |
1659 # involving reading the revlog backwards. | |
1660 cachesize = self._chunkcachesize | |
1661 realoffset = offset & ~(cachesize - 1) | |
1662 reallength = ( | |
1663 (offset + length + cachesize) & ~(cachesize - 1) | |
1664 ) - realoffset | |
1665 with self._datareadfp(df) as df: | |
1666 df.seek(realoffset) | |
1667 d = df.read(reallength) | |
1668 | |
1669 self._cachesegment(realoffset, d) | |
1670 if offset != realoffset or reallength != length: | |
1671 startoffset = offset - realoffset | |
1672 if len(d) - startoffset < length: | |
1673 filename = self._indexfile if self._inline else self._datafile | |
1674 got = len(d) - startoffset | |
1675 m = PARTIAL_READ_MSG % (filename, length, offset, got) | |
1676 raise error.RevlogError(m) | |
1677 return util.buffer(d, startoffset, length) | |
1678 | |
1679 if len(d) < length: | |
1680 filename = self._indexfile if self._inline else self._datafile | |
1681 got = len(d) - startoffset | |
1682 m = PARTIAL_READ_MSG % (filename, length, offset, got) | |
1683 raise error.RevlogError(m) | |
1684 | |
1685 return d | |
1686 | |
1687 def _getsegment(self, offset, length, df=None): | |
1688 """Obtain a segment of raw data from the revlog. | |
1689 | |
1690 Accepts an absolute offset, length of bytes to obtain, and an | |
1691 optional file handle to the already-opened revlog. If the file | |
1692 handle is used, it's original seek position will not be preserved. | |
1693 | |
1694 Requests for data may be returned from a cache. | |
1695 | |
1696 Returns a str or a buffer instance of raw byte data. | |
1697 """ | |
1698 o, d = self._chunkcache | |
1699 l = len(d) | |
1700 | |
1701 # is it in the cache? | |
1702 cachestart = offset - o | |
1703 cacheend = cachestart + length | |
1704 if cachestart >= 0 and cacheend <= l: | |
1705 if cachestart == 0 and cacheend == l: | |
1706 return d # avoid a copy | |
1707 return util.buffer(d, cachestart, cacheend - cachestart) | |
1708 | |
1709 return self._readsegment(offset, length, df=df) | |
1710 | |
1711 def _getsegmentforrevs(self, startrev, endrev, df=None): | 1606 def _getsegmentforrevs(self, startrev, endrev, df=None): |
1712 """Obtain a segment of raw data corresponding to a range of revisions. | 1607 """Obtain a segment of raw data corresponding to a range of revisions. |
1713 | 1608 |
1714 Accepts the start and end revisions and an optional already-open | 1609 Accepts the start and end revisions and an optional already-open |
1715 file handle to be used for reading. If the file handle is read, its | 1610 file handle to be used for reading. If the file handle is read, its |
1738 if self._inline: | 1633 if self._inline: |
1739 start += (startrev + 1) * self.index.entry_size | 1634 start += (startrev + 1) * self.index.entry_size |
1740 end += (endrev + 1) * self.index.entry_size | 1635 end += (endrev + 1) * self.index.entry_size |
1741 length = end - start | 1636 length = end - start |
1742 | 1637 |
1743 return start, self._getsegment(start, length, df=df) | 1638 return start, self._segmentfile.read_chunk(start, length, df) |
1744 | 1639 |
1745 def _chunk(self, rev, df=None): | 1640 def _chunk(self, rev, df=None): |
1746 """Obtain a single decompressed chunk for a revision. | 1641 """Obtain a single decompressed chunk for a revision. |
1747 | 1642 |
1748 Accepts an integer revision and an optional already-open file handle | 1643 Accepts an integer revision and an optional already-open file handle |
1829 msg = b'unknown compression mode %d' | 1724 msg = b'unknown compression mode %d' |
1830 msg %= comp_mode | 1725 msg %= comp_mode |
1831 raise error.RevlogError(msg) | 1726 raise error.RevlogError(msg) |
1832 | 1727 |
1833 return l | 1728 return l |
1834 | |
1835 def _chunkclear(self): | |
1836 """Clear the raw chunk cache.""" | |
1837 self._chunkcache = (0, b'') | |
1838 | 1729 |
1839 def deltaparent(self, rev): | 1730 def deltaparent(self, rev): |
1840 """return deltaparent of the given revision""" | 1731 """return deltaparent of the given revision""" |
1841 base = self.index[rev][3] | 1732 base = self.index[rev][3] |
1842 if base == rev: | 1733 if base == rev: |
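The window arithmetic in the removed `_readsegment` above rounds the requested range outward to `_chunkcachesize` boundaries, so nearby requests, including backward scans, hit the cache. A quick worked example, assuming the 64 KiB default chunk cache size:

```python
cachesize = 65536  # assumed default _chunkcachesize; must be a power of two

offset, length = 100_000, 2_000
realoffset = offset & ~(cachesize - 1)  # 65536: rounded down to a boundary
reallength = (
    (offset + length + cachesize) & ~(cachesize - 1)
) - realoffset                          # 65536: one full aligned chunk

assert (realoffset, reallength) == (65536, 65536)
# The cached window [65536, 131072) also serves a later read at, say,
# offset 70_000 without another system call, which is what speeds up
# operations that walk the revlog backwards.
```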
2041 if len(comp_segment) < sidedata_size: | 1932 if len(comp_segment) < sidedata_size: |
2042 filename = self._sidedatafile | 1933 filename = self._sidedatafile |
2043 length = sidedata_size | 1934 length = sidedata_size |
2044 offset = sidedata_offset | 1935 offset = sidedata_offset |
2045 got = len(comp_segment) | 1936 got = len(comp_segment) |
2046 m = PARTIAL_READ_MSG % (filename, length, offset, got) | 1937 m = randomaccessfile.PARTIAL_READ_MSG % ( |
1938 filename, | |
1939 length, | |
1940 offset, | |
1941 got, | |
1942 ) | |
2047 raise error.RevlogError(m) | 1943 raise error.RevlogError(m) |
2048 | 1944 |
2049 comp = self.index[rev][11] | 1945 comp = self.index[rev][11] |
2050 if comp == COMP_MODE_PLAIN: | 1946 if comp == COMP_MODE_PLAIN: |
2051 segment = comp_segment | 1947 segment = comp_segment |
2134 fp.flush() | 2030 fp.flush() |
2135 fp.close() | 2031 fp.close() |
2136 # We can't use the cached file handle after close(). So prevent | 2032 # We can't use the cached file handle after close(). So prevent |
2137 # its usage. | 2033 # its usage. |
2138 self._writinghandles = None | 2034 self._writinghandles = None |
2035 self._segmentfile.writing_handle = None | |
2139 | 2036 |
2140 new_dfh = self._datafp(b'w+') | 2037 new_dfh = self._datafp(b'w+') |
2141 new_dfh.truncate(0) # drop any potentially existing data | 2038 new_dfh.truncate(0) # drop any potentially existing data |
2142 try: | 2039 try: |
2143 with self._indexfp() as read_ifh: | 2040 with self._indexfp() as read_ifh: |
2169 # the temp file replace the real index when we exit the context | 2066 # the temp file replace the real index when we exit the context |
2170 # manager | 2067 # manager |
2171 | 2068 |
2172 tr.replace(self._indexfile, trindex * self.index.entry_size) | 2069 tr.replace(self._indexfile, trindex * self.index.entry_size) |
2173 nodemaputil.setup_persistent_nodemap(tr, self) | 2070 nodemaputil.setup_persistent_nodemap(tr, self) |
2174 self._chunkclear() | 2071 self._segmentfile = randomaccessfile.randomaccessfile( |
2072 self.opener, | |
2073 self._datafile, | |
2074 self._chunkcachesize, | |
2075 ) | |
2175 | 2076 |
2176 if existing_handles: | 2077 if existing_handles: |
2177 # switched from inline to conventional reopen the index | 2078 # switched from inline to conventional reopen the index |
2178 ifh = self.__index_write_fp() | 2079 ifh = self.__index_write_fp() |
2179 self._writinghandles = (ifh, new_dfh, None) | 2080 self._writinghandles = (ifh, new_dfh, None) |
2081 self._segmentfile.writing_handle = new_dfh | |
2180 new_dfh = None | 2082 new_dfh = None |
2181 finally: | 2083 finally: |
2182 if new_dfh is not None: | 2084 if new_dfh is not None: |
2183 new_dfh.close() | 2085 new_dfh.close() |
2184 | 2086 |
2233 transaction.add(self._indexfile, dsize + isize) | 2135 transaction.add(self._indexfile, dsize + isize) |
2234 else: | 2136 else: |
2235 transaction.add(self._indexfile, isize) | 2137 transaction.add(self._indexfile, isize) |
2236 # exposing all file handle for writing. | 2138 # exposing all file handle for writing. |
2237 self._writinghandles = (ifh, dfh, sdfh) | 2139 self._writinghandles = (ifh, dfh, sdfh) |
2140 self._segmentfile.writing_handle = ifh if self._inline else dfh | |
2238 yield | 2141 yield |
2239 if self._docket is not None: | 2142 if self._docket is not None: |
2240 self._write_docket(transaction) | 2143 self._write_docket(transaction) |
2241 finally: | 2144 finally: |
2242 self._writinghandles = None | 2145 self._writinghandles = None |
2146 self._segmentfile.writing_handle = None | |
2243 if dfh is not None: | 2147 if dfh is not None: |
2244 dfh.close() | 2148 dfh.close() |
2245 if sdfh is not None: | 2149 if sdfh is not None: |
2246 sdfh.close() | 2150 sdfh.close() |
2247 # closing the index file last to avoid exposing referent to | 2151 # closing the index file last to avoid exposing referent to |
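One consequence of the extraction, visible across the hunks above: every place that assigns `self._writinghandles` must now also keep `self._segmentfile.writing_handle` in step, setting it on entry to a write context and resetting it in the `finally` block. A condensed sketch of that hand-off, with the handle setup elided behind a hypothetical helper:

```python
import contextlib

# Condensed sketch of the hand-off in _writing() above; opening the
# three handles is elided behind a hypothetical helper.
@contextlib.contextmanager
def _writing(self, transaction):
    ifh, dfh, sdfh = self._open_write_handles()  # hypothetical helper
    try:
        self._writinghandles = (ifh, dfh, sdfh)
        # Inline revlogs store data in the index file, so reads during a
        # write reuse the index handle; non-inline ones use the data handle.
        self._segmentfile.writing_handle = ifh if self._inline else dfh
        yield
    finally:
        self._writinghandles = None
        self._segmentfile.writing_handle = None
```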
2871 self._docket.write(transaction, stripping=True) | 2775 self._docket.write(transaction, stripping=True) |
2872 | 2776 |
2873 # then reset internal state in memory to forget those revisions | 2777 # then reset internal state in memory to forget those revisions |
2874 self._revisioncache = None | 2778 self._revisioncache = None |
2875 self._chaininfocache = util.lrucachedict(500) | 2779 self._chaininfocache = util.lrucachedict(500) |
2876 self._chunkclear() | 2780 self._segmentfile.clear_cache() |
2877 | 2781 |
2878 del self.index[rev:-1] | 2782 del self.index[rev:-1] |
2879 | 2783 |
2880 def checksize(self): | 2784 def checksize(self): |
2881 """Check size of index and data files | 2785 """Check size of index and data files |