comparison mercurial/revlog.py @ 35738:f90f6fd130c1

revlog: group delta computation methods under _deltacomputer object Extracting these methods from revlog will allow changing the implementation of the deltacomputer, by providing this interface: __init__(self, revlog) - constructor that initialize the object from a given revlog buildtext(self, revinfo, fh) - builds the fulltext version of a revision from a _revisioninfo object and the file handle to the .d (or .i for inline mode) file. finddeltainfo(self, revinfo, fh) - find a revision in the revlog against which it is acceptable to build a delta, and build the corresponding _deltainfo. It should now be easier to write an experimental feature that would replace _deltacomputer by another object, for example one that would know how to parallelize the delta computation in order to quicken the storage of multiple revisions.
author Paul Morelle <paul.morelle@octobus.net>
date Sun, 14 Jan 2018 21:28:12 +0100
parents d99b07bc69fb
children be923ce44d6a d031609b3cb7
comparison
equal deleted inserted replaced
35737:d99b07bc69fb 35738:f90f6fd130c1
261 data = attr.ib() 261 data = attr.ib()
262 base = attr.ib() 262 base = attr.ib()
263 chainbase = attr.ib() 263 chainbase = attr.ib()
264 chainlen = attr.ib() 264 chainlen = attr.ib()
265 compresseddeltalen = attr.ib() 265 compresseddeltalen = attr.ib()
266
267 class _deltacomputer(object):
268 def __init__(self, revlog):
269 self.revlog = revlog
270
271 def _getcandidaterevs(self, p1, p2, cachedelta):
272 """
273 Provides revisions that present an interest to be diffed against,
274 grouped by level of easiness.
275 """
276 revlog = self.revlog
277 curr = len(revlog)
278 prev = curr - 1
279 p1r, p2r = revlog.rev(p1), revlog.rev(p2)
280
281 # should we try to build a delta?
282 if prev != nullrev and revlog.storedeltachains:
283 tested = set()
284 # This condition is true most of the time when processing
285 # changegroup data into a generaldelta repo. The only time it
286 # isn't true is if this is the first revision in a delta chain
287 # or if ``format.generaldelta=true`` disabled ``lazydeltabase``.
288 if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
289 # Assume what we received from the server is a good choice
290 # build delta will reuse the cache
291 yield (cachedelta[0],)
292 tested.add(cachedelta[0])
293
294 if revlog._generaldelta:
295 # exclude already lazy tested base if any
296 parents = [p for p in (p1r, p2r)
297 if p != nullrev and p not in tested]
298 if parents and not revlog._aggressivemergedeltas:
299 # Pick whichever parent is closer to us (to minimize the
300 # chance of having to build a fulltext).
301 parents = [max(parents)]
302 tested.update(parents)
303 yield parents
304
305 if prev not in tested:
306 # other approach failed try against prev to hopefully save us a
307 # fulltext.
308 yield (prev,)
309
310 def buildtext(self, revinfo, fh):
311 """Builds a fulltext version of a revision
312
313 revinfo: _revisioninfo instance that contains all needed info
314 fh: file handle to either the .i or the .d revlog file,
315 depending on whether it is inlined or not
316 """
317 btext = revinfo.btext
318 if btext[0] is not None:
319 return btext[0]
320
321 revlog = self.revlog
322 cachedelta = revinfo.cachedelta
323 flags = revinfo.flags
324 node = revinfo.node
325
326 baserev = cachedelta[0]
327 delta = cachedelta[1]
328 # special case deltas which replace entire base; no need to decode
329 # base revision. this neatly avoids censored bases, which throw when
330 # they're decoded.
331 hlen = struct.calcsize(">lll")
332 if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
333 len(delta) - hlen):
334 btext[0] = delta[hlen:]
335 else:
336 basetext = revlog.revision(baserev, _df=fh, raw=True)
337 btext[0] = mdiff.patch(basetext, delta)
338
339 try:
340 res = revlog._processflags(btext[0], flags, 'read', raw=True)
341 btext[0], validatehash = res
342 if validatehash:
343 revlog.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2)
344 if flags & REVIDX_ISCENSORED:
345 raise RevlogError(_('node %s is not censored') % node)
346 except CensoredNodeError:
347 # must pass the censored index flag to add censored revisions
348 if not flags & REVIDX_ISCENSORED:
349 raise
350 return btext[0]
351
352 def _builddeltadiff(self, base, revinfo, fh):
353 revlog = self.revlog
354 t = self.buildtext(revinfo, fh)
355 if revlog.iscensored(base):
356 # deltas based on a censored revision must replace the
357 # full content in one patch, so delta works everywhere
358 header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
359 delta = header + t
360 else:
361 ptext = revlog.revision(base, _df=fh, raw=True)
362 delta = mdiff.textdiff(ptext, t)
363
364 return delta
365
366 def _builddeltainfo(self, revinfo, base, fh):
367 # can we use the cached delta?
368 if revinfo.cachedelta and revinfo.cachedelta[0] == base:
369 delta = revinfo.cachedelta[1]
370 else:
371 delta = self._builddeltadiff(base, revinfo, fh)
372 revlog = self.revlog
373 header, data = revlog.compress(delta)
374 deltalen = len(header) + len(data)
375 chainbase = revlog.chainbase(base)
376 offset = revlog.end(len(revlog) - 1)
377 dist = deltalen + offset - revlog.start(chainbase)
378 if revlog._generaldelta:
379 deltabase = base
380 else:
381 deltabase = chainbase
382 chainlen, compresseddeltalen = revlog._chaininfo(base)
383 chainlen += 1
384 compresseddeltalen += deltalen
385 return _deltainfo(dist, deltalen, (header, data), deltabase,
386 chainbase, chainlen, compresseddeltalen)
387
388 def finddeltainfo(self, revinfo, fh):
389 """Find an acceptable delta against a candidate revision
390
391 revinfo: information about the revision (instance of _revisioninfo)
392 fh: file handle to either the .i or the .d revlog file,
393 depending on whether it is inlined or not
394
395 Returns the first acceptable candidate revision, as ordered by
396 _getcandidaterevs
397 """
398 cachedelta = revinfo.cachedelta
399 p1 = revinfo.p1
400 p2 = revinfo.p2
401 revlog = self.revlog
402
403 deltainfo = None
404 for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
405 nominateddeltas = []
406 for candidaterev in candidaterevs:
407 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
408 if revlog._isgooddeltainfo(candidatedelta, revinfo.textlen):
409 nominateddeltas.append(candidatedelta)
410 if nominateddeltas:
411 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
412 break
413
414 return deltainfo
266 415
267 @attr.s(slots=True, frozen=True) 416 @attr.s(slots=True, frozen=True)
268 class _revisioninfo(object): 417 class _revisioninfo(object):
269 """Information about a revision that allows building its fulltext 418 """Information about a revision that allows building its fulltext
270 node: expected hash of the revision 419 node: expected hash of the revision
1719 1868
1720 tr.replace(self.indexfile, trindex * self._io.size) 1869 tr.replace(self.indexfile, trindex * self._io.size)
1721 self._chunkclear() 1870 self._chunkclear()
1722 1871
1723 def addrevision(self, text, transaction, link, p1, p2, cachedelta=None, 1872 def addrevision(self, text, transaction, link, p1, p2, cachedelta=None,
1724 node=None, flags=REVIDX_DEFAULT_FLAGS): 1873 node=None, flags=REVIDX_DEFAULT_FLAGS, deltacomputer=None):
1725 """add a revision to the log 1874 """add a revision to the log
1726 1875
1727 text - the revision data to add 1876 text - the revision data to add
1728 transaction - the transaction object used for rollback 1877 transaction - the transaction object used for rollback
1729 link - the linkrev data to add 1878 link - the linkrev data to add
1731 cachedelta - an optional precomputed delta 1880 cachedelta - an optional precomputed delta
1732 node - nodeid of revision; typically node is not specified, and it is 1881 node - nodeid of revision; typically node is not specified, and it is
1733 computed by default as hash(text, p1, p2), however subclasses might 1882 computed by default as hash(text, p1, p2), however subclasses might
1734 use different hashing method (and override checkhash() in such case) 1883 use different hashing method (and override checkhash() in such case)
1735 flags - the known flags to set on the revision 1884 flags - the known flags to set on the revision
1885 deltacomputer - an optional _deltacomputer instance shared between
1886 multiple calls
1736 """ 1887 """
1737 if link == nullrev: 1888 if link == nullrev:
1738 raise RevlogError(_("attempted to add linkrev -1 to %s") 1889 raise RevlogError(_("attempted to add linkrev -1 to %s")
1739 % self.indexfile) 1890 % self.indexfile)
1740 1891
1759 1910
1760 if validatehash: 1911 if validatehash:
1761 self.checkhash(rawtext, node, p1=p1, p2=p2) 1912 self.checkhash(rawtext, node, p1=p1, p2=p2)
1762 1913
1763 return self.addrawrevision(rawtext, transaction, link, p1, p2, node, 1914 return self.addrawrevision(rawtext, transaction, link, p1, p2, node,
1764 flags, cachedelta=cachedelta) 1915 flags, cachedelta=cachedelta,
1916 deltacomputer=deltacomputer)
1765 1917
1766 def addrawrevision(self, rawtext, transaction, link, p1, p2, node, flags, 1918 def addrawrevision(self, rawtext, transaction, link, p1, p2, node, flags,
1767 cachedelta=None): 1919 cachedelta=None, deltacomputer=None):
1768 """add a raw revision with known flags, node and parents 1920 """add a raw revision with known flags, node and parents
1769 useful when reusing a revision not stored in this revlog (ex: received 1921 useful when reusing a revision not stored in this revlog (ex: received
1770 over wire, or read from an external bundle). 1922 over wire, or read from an external bundle).
1771 """ 1923 """
1772 dfh = None 1924 dfh = None
1773 if not self._inline: 1925 if not self._inline:
1774 dfh = self.opener(self.datafile, "a+") 1926 dfh = self.opener(self.datafile, "a+")
1775 ifh = self.opener(self.indexfile, "a+", checkambig=self._checkambig) 1927 ifh = self.opener(self.indexfile, "a+", checkambig=self._checkambig)
1776 try: 1928 try:
1777 return self._addrevision(node, rawtext, transaction, link, p1, p2, 1929 return self._addrevision(node, rawtext, transaction, link, p1, p2,
1778 flags, cachedelta, ifh, dfh) 1930 flags, cachedelta, ifh, dfh,
1931 deltacomputer=deltacomputer)
1779 finally: 1932 finally:
1780 if dfh: 1933 if dfh:
1781 dfh.close() 1934 dfh.close()
1782 ifh.close() 1935 ifh.close()
1783 1936
1873 (self._maxchainlen and d.chainlen > self._maxchainlen)): 2026 (self._maxchainlen and d.chainlen > self._maxchainlen)):
1874 return False 2027 return False
1875 2028
1876 return True 2029 return True
1877 2030
1878 def _getcandidaterevs(self, p1, p2, cachedelta):
1879 """
1880 Provides revisions that present an interest to be diffed against,
1881 grouped by level of easiness.
1882 """
1883 curr = len(self)
1884 prev = curr - 1
1885 p1r, p2r = self.rev(p1), self.rev(p2)
1886
1887 # should we try to build a delta?
1888 if prev != nullrev and self.storedeltachains:
1889 tested = set()
1890 # This condition is true most of the time when processing
1891 # changegroup data into a generaldelta repo. The only time it
1892 # isn't true is if this is the first revision in a delta chain
1893 # or if ``format.generaldelta=true`` disabled ``lazydeltabase``.
1894 if cachedelta and self._generaldelta and self._lazydeltabase:
1895 # Assume what we received from the server is a good choice
1896 # build delta will reuse the cache
1897 yield (cachedelta[0],)
1898 tested.add(cachedelta[0])
1899
1900 if self._generaldelta:
1901 # exclude already lazy tested base if any
1902 parents = [p for p in (p1r, p2r)
1903 if p != nullrev and p not in tested]
1904 if parents and not self._aggressivemergedeltas:
1905 # Pick whichever parent is closer to us (to minimize the
1906 # chance of having to build a fulltext).
1907 parents = [max(parents)]
1908 tested.update(parents)
1909 yield parents
1910
1911 if prev not in tested:
1912 # other approach failed try against prev to hopefully save us a
1913 # fulltext.
1914 yield (prev,)
1915
1916 def _buildtext(self, revinfo, fh):
1917 """Builds a fulltext version of a revision
1918
1919 revinfo: _revisioninfo instance that contains all needed info
1920 fh: file handle to either the .i or the .d revlog file,
1921 depending on whether it is inlined or not
1922 """
1923 btext = revinfo.btext
1924 if btext[0] is not None:
1925 return btext[0]
1926
1927 cachedelta = revinfo.cachedelta
1928 flags = revinfo.flags
1929 node = revinfo.node
1930
1931 baserev = cachedelta[0]
1932 delta = cachedelta[1]
1933 # special case deltas which replace entire base; no need to decode
1934 # base revision. this neatly avoids censored bases, which throw when
1935 # they're decoded.
1936 hlen = struct.calcsize(">lll")
1937 if delta[:hlen] == mdiff.replacediffheader(self.rawsize(baserev),
1938 len(delta) - hlen):
1939 btext[0] = delta[hlen:]
1940 else:
1941 basetext = self.revision(baserev, _df=fh, raw=True)
1942 btext[0] = mdiff.patch(basetext, delta)
1943
1944 try:
1945 res = self._processflags(btext[0], flags, 'read', raw=True)
1946 btext[0], validatehash = res
1947 if validatehash:
1948 self.checkhash(btext[0], node, p1=revinfo.p1, p2=revinfo.p2)
1949 if flags & REVIDX_ISCENSORED:
1950 raise RevlogError(_('node %s is not censored') % node)
1951 except CensoredNodeError:
1952 # must pass the censored index flag to add censored revisions
1953 if not flags & REVIDX_ISCENSORED:
1954 raise
1955 return btext[0]
1956
1957 def _builddeltadiff(self, base, revinfo, fh):
1958 t = self._buildtext(revinfo, fh)
1959 if self.iscensored(base):
1960 # deltas based on a censored revision must replace the
1961 # full content in one patch, so delta works everywhere
1962 header = mdiff.replacediffheader(self.rawsize(base), len(t))
1963 delta = header + t
1964 else:
1965 ptext = self.revision(base, _df=fh, raw=True)
1966 delta = mdiff.textdiff(ptext, t)
1967
1968 return delta
1969
1970 def _builddeltainfo(self, revinfo, base, fh):
1971 # can we use the cached delta?
1972 if revinfo.cachedelta and revinfo.cachedelta[0] == base:
1973 delta = revinfo.cachedelta[1]
1974 else:
1975 delta = self._builddeltadiff(base, revinfo, fh)
1976 header, data = self.compress(delta)
1977 deltalen = len(header) + len(data)
1978 chainbase = self.chainbase(base)
1979 offset = self.end(len(self) - 1)
1980 dist = deltalen + offset - self.start(chainbase)
1981 if self._generaldelta:
1982 deltabase = base
1983 else:
1984 deltabase = chainbase
1985 chainlen, compresseddeltalen = self._chaininfo(base)
1986 chainlen += 1
1987 compresseddeltalen += deltalen
1988 return _deltainfo(dist, deltalen, (header, data), deltabase,
1989 chainbase, chainlen, compresseddeltalen)
1990
1991 def _finddeltainfo(self, revinfo, fh):
1992 """Find an acceptable delta against a candidate revision
1993
1994 revinfo: information about the revision (instance of _revisioninfo)
1995 fh: file handle to either the .i or the .d revlog file,
1996 depending on whether it is inlined or not
1997
1998 Returns the first acceptable candidate revision, as ordered by
1999 _getcandidaterevs
2000 """
2001 cachedelta = revinfo.cachedelta
2002 p1 = revinfo.p1
2003 p2 = revinfo.p2
2004
2005 deltainfo = None
2006 for candidaterevs in self._getcandidaterevs(p1, p2, cachedelta):
2007 nominateddeltas = []
2008 for candidaterev in candidaterevs:
2009 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
2010 if self._isgooddeltainfo(candidatedelta, revinfo.textlen):
2011 nominateddeltas.append(candidatedelta)
2012 if nominateddeltas:
2013 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
2014 break
2015
2016 return deltainfo
2017
2018 def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags, 2031 def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags,
2019 cachedelta, ifh, dfh, alwayscache=False): 2032 cachedelta, ifh, dfh, alwayscache=False,
2033 deltacomputer=None):
2020 """internal function to add revisions to the log 2034 """internal function to add revisions to the log
2021 2035
2022 see addrevision for argument descriptions. 2036 see addrevision for argument descriptions.
2023 2037
2024 note: "addrevision" takes non-raw text, "_addrevision" takes raw text. 2038 note: "addrevision" takes non-raw text, "_addrevision" takes raw text.
2039
2040 if "deltacomputer" is not provided or None, a defaultdeltacomputer will
2041 be used.
2025 2042
2026 invariants: 2043 invariants:
2027 - rawtext is optional (can be None); if not set, cachedelta must be set. 2044 - rawtext is optional (can be None); if not set, cachedelta must be set.
2028 if both are set, they must correspond to each other. 2045 if both are set, they must correspond to each other.
2029 """ 2046 """
2052 textlen = mdiff.patchedsize(self.rawsize(cachedelta[0]), 2069 textlen = mdiff.patchedsize(self.rawsize(cachedelta[0]),
2053 cachedelta[1]) 2070 cachedelta[1])
2054 else: 2071 else:
2055 textlen = len(rawtext) 2072 textlen = len(rawtext)
2056 2073
2074 if deltacomputer is None:
2075 deltacomputer = _deltacomputer(self)
2076
2057 revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags) 2077 revinfo = _revisioninfo(node, p1, p2, btext, textlen, cachedelta, flags)
2058 deltainfo = self._finddeltainfo(revinfo, fh) 2078 deltainfo = deltacomputer.finddeltainfo(revinfo, fh)
2059 2079
2060 if deltainfo is not None: 2080 if deltainfo is not None:
2061 base = deltainfo.base 2081 base = deltainfo.base
2062 chainbase = deltainfo.chainbase 2082 chainbase = deltainfo.chainbase
2063 data = deltainfo.data 2083 data = deltainfo.data
2064 l = deltainfo.deltalen 2084 l = deltainfo.deltalen
2065 else: 2085 else:
2066 rawtext = self._buildtext(revinfo, fh) 2086 rawtext = deltacomputer.buildtext(revinfo, fh)
2067 data = self.compress(rawtext) 2087 data = self.compress(rawtext)
2068 l = len(data[1]) + len(data[0]) 2088 l = len(data[1]) + len(data[0])
2069 base = chainbase = curr 2089 base = chainbase = curr
2070 2090
2071 e = (offset_type(offset, flags), l, textlen, 2091 e = (offset_type(offset, flags), l, textlen,
2075 2095
2076 entry = self._io.packentry(e, self.node, self.version, curr) 2096 entry = self._io.packentry(e, self.node, self.version, curr)
2077 self._writeentry(transaction, ifh, dfh, entry, data, link, offset) 2097 self._writeentry(transaction, ifh, dfh, entry, data, link, offset)
2078 2098
2079 if alwayscache and rawtext is None: 2099 if alwayscache and rawtext is None:
2080 rawtext = self._buildtext(revinfo, fh) 2100 rawtext = deltacomputer._buildtext(revinfo, fh)
2081 2101
2082 if type(rawtext) == str: # only accept immutable objects 2102 if type(rawtext) == str: # only accept immutable objects
2083 self._cache = (node, curr, rawtext) 2103 self._cache = (node, curr, rawtext)
2084 self._chainbasecache[curr] = chainbase 2104 self._chainbasecache[curr] = chainbase
2085 return node 2105 return node
2145 def flush(): 2165 def flush():
2146 if dfh: 2166 if dfh:
2147 dfh.flush() 2167 dfh.flush()
2148 ifh.flush() 2168 ifh.flush()
2149 try: 2169 try:
2170 deltacomputer = _deltacomputer(self)
2150 # loop through our set of deltas 2171 # loop through our set of deltas
2151 for data in deltas: 2172 for data in deltas:
2152 node, p1, p2, linknode, deltabase, delta, flags = data 2173 node, p1, p2, linknode, deltabase, delta, flags = data
2153 link = linkmapper(linknode) 2174 link = linkmapper(linknode)
2154 flags = flags or REVIDX_DEFAULT_FLAGS 2175 flags = flags or REVIDX_DEFAULT_FLAGS
2191 # generation so the revision data can always be handled as raw 2212 # generation so the revision data can always be handled as raw
2192 # by the flagprocessor. 2213 # by the flagprocessor.
2193 self._addrevision(node, None, transaction, link, 2214 self._addrevision(node, None, transaction, link,
2194 p1, p2, flags, (baserev, delta), 2215 p1, p2, flags, (baserev, delta),
2195 ifh, dfh, 2216 ifh, dfh,
2196 alwayscache=bool(addrevisioncb)) 2217 alwayscache=bool(addrevisioncb),
2218 deltacomputer=deltacomputer)
2197 2219
2198 if addrevisioncb: 2220 if addrevisioncb:
2199 addrevisioncb(self, node) 2221 addrevisioncb(self, node)
2200 2222
2201 if not dfh and not self._inline: 2223 if not dfh and not self._inline:
2414 destrevlog._aggressivemergedeltas = aggressivemergedeltas or oldamd 2436 destrevlog._aggressivemergedeltas = aggressivemergedeltas or oldamd
2415 2437
2416 populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS, 2438 populatecachedelta = deltareuse in (self.DELTAREUSEALWAYS,
2417 self.DELTAREUSESAMEREVS) 2439 self.DELTAREUSESAMEREVS)
2418 2440
2441 deltacomputer = _deltacomputer(destrevlog)
2419 index = self.index 2442 index = self.index
2420 for rev in self: 2443 for rev in self:
2421 entry = index[rev] 2444 entry = index[rev]
2422 2445
2423 # Some classes override linkrev to take filtered revs into 2446 # Some classes override linkrev to take filtered revs into
2442 2465
2443 2466
2444 if deltareuse == self.DELTAREUSEFULLADD: 2467 if deltareuse == self.DELTAREUSEFULLADD:
2445 destrevlog.addrevision(rawtext, tr, linkrev, p1, p2, 2468 destrevlog.addrevision(rawtext, tr, linkrev, p1, p2,
2446 cachedelta=cachedelta, 2469 cachedelta=cachedelta,
2447 node=node, flags=flags) 2470 node=node, flags=flags,
2471 deltacomputer=deltacomputer)
2448 else: 2472 else:
2449 ifh = destrevlog.opener(destrevlog.indexfile, 'a+', 2473 ifh = destrevlog.opener(destrevlog.indexfile, 'a+',
2450 checkambig=False) 2474 checkambig=False)
2451 dfh = None 2475 dfh = None
2452 if not destrevlog._inline: 2476 if not destrevlog._inline:
2453 dfh = destrevlog.opener(destrevlog.datafile, 'a+') 2477 dfh = destrevlog.opener(destrevlog.datafile, 'a+')
2454 try: 2478 try:
2455 destrevlog._addrevision(node, rawtext, tr, linkrev, p1, 2479 destrevlog._addrevision(node, rawtext, tr, linkrev, p1,
2456 p2, flags, cachedelta, ifh, dfh) 2480 p2, flags, cachedelta, ifh, dfh,
2481 deltacomputer=deltacomputer)
2457 finally: 2482 finally:
2458 if dfh: 2483 if dfh:
2459 dfh.close() 2484 dfh.close()
2460 ifh.close() 2485 ifh.close()
2461 2486