Mercurial > public > mercurial-scm > hg
diff hgext/fastannotate/revmap.py @ 39210:1ddb296e0dee
fastannotate: initial import from Facebook's hg-experimental
I made as few changes as I could to get the tests to pass, but this
was a bit involved due to some churn in the blame code since someone
last gave fastannotate any TLC.
There's still follow-up work here to rip out support for old versions
of hg and to integrate the protocol with modern standards.
Some performance numbers (all on my 2016 MacBook Pro with a 2.6Ghz i7):
Mercurial mercurial/manifest.py
traditional blame
time: real 1.050 secs (user 0.990+0.000 sys 0.060+0.000)
build cache
time: real 5.900 secs (user 5.720+0.000 sys 0.110+0.000)
fastannotate
time: real 0.120 secs (user 0.100+0.000 sys 0.020+0.000)
Mercurial mercurial/localrepo.py
traditional blame
time: real 3.330 secs (user 3.220+0.000 sys 0.070+0.000)
build cache
time: real 30.610 secs (user 30.190+0.000 sys 0.230+0.000)
fastannotate
time: real 0.180 secs (user 0.160+0.000 sys 0.020+0.000)
mozilla-central dom/ipc/ContentParent.cpp
traditional blame
time: real 7.640 secs (user 7.210+0.000 sys 0.380+0.000)
build cache
time: real 98.650 secs (user 97.000+0.000 sys 0.950+0.000)
fastannotate
time: real 1.580 secs (user 1.340+0.000 sys 0.240+0.000)
mozilla-central dom/base/nsDocument.cpp
traditional blame
time: real 17.110 secs (user 16.490+0.000 sys 0.500+0.000)
build cache
time: real 399.750 secs (user 394.520+0.000 sys 2.610+0.000)
fastannotate
time: real 1.780 secs (user 1.530+0.000 sys 0.240+0.000)
So building the cache is expensive (but might be faster with xdiff
enabled), but the blame results are *way* faster.
Differential Revision: https://phab.mercurial-scm.org/D3994
author | Augie Fackler <augie@google.com> |
---|---|
date | Mon, 30 Jul 2018 22:50:00 -0400 |
parents | |
children | 1205ba8f11ac |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/fastannotate/revmap.py Mon Jul 30 22:50:00 2018 -0400 @@ -0,0 +1,254 @@ +# Copyright 2016-present Facebook. All Rights Reserved. +# +# revmap: trivial hg hash - linelog rev bidirectional map +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import bisect +import os +import struct + +from mercurial.node import hex +from mercurial import ( + error as hgerror, + pycompat, +) +from . import error + +# the revmap file format is straightforward: +# +# 8 bytes: header +# 1 byte : flag for linelog revision 1 +# ? bytes: (optional) '\0'-terminated path string +# only exists if (flag & renameflag) != 0 +# 20 bytes: hg hash for linelog revision 1 +# 1 byte : flag for linelog revision 2 +# ? bytes: (optional) '\0'-terminated path string +# 20 bytes: hg hash for linelog revision 2 +# .... +# +# the implementation is kinda stupid: __init__ loads the whole revmap. +# no laziness. benchmark shows loading 10000 revisions is about 0.015 +# seconds, which looks enough for our use-case. if this implementation +# becomes a bottleneck, we can change it to lazily read the file +# from the end. + +# whether the changeset is in the side branch. i.e. not in the linear main +# branch but only got referenced by lines in merge changesets. +sidebranchflag = 1 + +# whether the changeset changes the file path (ie. is a rename) +renameflag = 2 + +# len(mercurial.node.nullid) +_hshlen = 20 + +class revmap(object): + """trivial hg bin hash - linelog rev bidirectional map + + also stores a flag (uint8) for each revision, and track renames. + """ + + HEADER = b'REVMAP1\0' + + def __init__(self, path=None): + """create or load the revmap, optionally associate to a file + + if path is None, the revmap is entirely in-memory. the caller is + responsible for locking. concurrent writes to a same file is unsafe. + the caller needs to make sure one file is associated to at most one + revmap object at a time.""" + self.path = path + self._rev2hsh = [None] + self._rev2flag = [None] + self._hsh2rev = {} + # since rename does not happen frequently, do not store path for every + # revision. self._renamerevs can be used for bisecting. + self._renamerevs = [0] + self._renamepaths = [''] + self._lastmaxrev = -1 + if path: + if os.path.exists(path): + self._load() + else: + # write the header so "append" can do incremental updates + self.flush() + + def copyfrom(self, rhs): + """copy the map data from another revmap. do not affect self.path""" + self._rev2hsh = rhs._rev2hsh[:] + self._rev2flag = rhs._rev2flag[:] + self._hsh2rev = rhs._hsh2rev.copy() + self._renamerevs = rhs._renamerevs[:] + self._renamepaths = rhs._renamepaths[:] + self._lastmaxrev = -1 + + @property + def maxrev(self): + """return max linelog revision number""" + return len(self._rev2hsh) - 1 + + def append(self, hsh, sidebranch=False, path=None, flush=False): + """add a binary hg hash and return the mapped linelog revision. + if flush is True, incrementally update the file. + """ + if hsh in self._hsh2rev: + raise error.CorruptedFileError('%r is in revmap already' % hex(hsh)) + if len(hsh) != _hshlen: + raise hgerror.ProgrammingError('hsh must be %d-char long' % _hshlen) + idx = len(self._rev2hsh) + flag = 0 + if sidebranch: + flag |= sidebranchflag + if path is not None and path != self._renamepaths[-1]: + flag |= renameflag + self._renamerevs.append(idx) + self._renamepaths.append(path) + self._rev2hsh.append(hsh) + self._rev2flag.append(flag) + self._hsh2rev[hsh] = idx + if flush: + self.flush() + return idx + + def rev2hsh(self, rev): + """convert linelog revision to hg hash. return None if not found.""" + if rev > self.maxrev or rev < 0: + return None + return self._rev2hsh[rev] + + def rev2flag(self, rev): + """get the flag (uint8) for a given linelog revision. + return None if revision does not exist. + """ + if rev > self.maxrev or rev < 0: + return None + return self._rev2flag[rev] + + def rev2path(self, rev): + """get the path for a given linelog revision. + return None if revision does not exist. + """ + if rev > self.maxrev or rev < 0: + return None + idx = bisect.bisect_right(self._renamerevs, rev) - 1 + return self._renamepaths[idx] + + def hsh2rev(self, hsh): + """convert hg hash to linelog revision. return None if not found.""" + return self._hsh2rev.get(hsh) + + def clear(self, flush=False): + """make the map empty. if flush is True, write to disk""" + # rev 0 is reserved, real rev starts from 1 + self._rev2hsh = [None] + self._rev2flag = [None] + self._hsh2rev = {} + self._rev2path = [''] + self._lastmaxrev = -1 + if flush: + self.flush() + + def flush(self): + """write the state down to the file""" + if not self.path: + return + if self._lastmaxrev == -1: # write the entire file + with open(self.path, 'wb') as f: + f.write(self.HEADER) + for i in pycompat.xrange(1, len(self._rev2hsh)): + self._writerev(i, f) + else: # append incrementally + with open(self.path, 'ab') as f: + for i in pycompat.xrange(self._lastmaxrev + 1, + len(self._rev2hsh)): + self._writerev(i, f) + self._lastmaxrev = self.maxrev + + def _load(self): + """load state from file""" + if not self.path: + return + # use local variables in a loop. CPython uses LOAD_FAST for them, + # which is faster than both LOAD_CONST and LOAD_GLOBAL. + flaglen = 1 + hshlen = _hshlen + with open(self.path, 'rb') as f: + if f.read(len(self.HEADER)) != self.HEADER: + raise error.CorruptedFileError() + self.clear(flush=False) + while True: + buf = f.read(flaglen) + if not buf: + break + flag = ord(buf) + rev = len(self._rev2hsh) + if flag & renameflag: + path = self._readcstr(f) + self._renamerevs.append(rev) + self._renamepaths.append(path) + hsh = f.read(hshlen) + if len(hsh) != hshlen: + raise error.CorruptedFileError() + self._hsh2rev[hsh] = rev + self._rev2flag.append(flag) + self._rev2hsh.append(hsh) + self._lastmaxrev = self.maxrev + + def _writerev(self, rev, f): + """append a revision data to file""" + flag = self._rev2flag[rev] + hsh = self._rev2hsh[rev] + f.write(struct.pack('B', flag)) + if flag & renameflag: + path = self.rev2path(rev) + if path is None: + raise error.CorruptedFileError('cannot find path for %s' % rev) + f.write(path + '\0') + f.write(hsh) + + @staticmethod + def _readcstr(f): + """read a C-language-like '\0'-terminated string""" + buf = '' + while True: + ch = f.read(1) + if not ch: # unexpected eof + raise error.CorruptedFileError() + if ch == '\0': + break + buf += ch + return buf + + def __contains__(self, f): + """(fctx or (node, path)) -> bool. + test if (node, path) is in the map, and is not in a side branch. + f can be either a tuple of (node, path), or a fctx. + """ + if isinstance(f, tuple): # f: (node, path) + hsh, path = f + else: # f: fctx + hsh, path = f.node(), f.path() + rev = self.hsh2rev(hsh) + if rev is None: + return False + if path is not None and path != self.rev2path(rev): + return False + return (self.rev2flag(rev) & sidebranchflag) == 0 + +def getlastnode(path): + """return the last hash in a revmap, without loading its full content. + this is equivalent to `m = revmap(path); m.rev2hsh(m.maxrev)`, but faster. + """ + hsh = None + try: + with open(path, 'rb') as f: + f.seek(-_hshlen, 2) + if f.tell() > len(revmap.HEADER): + hsh = f.read(_hshlen) + except IOError: + pass + return hsh