hgext/git/index.py
changeset 44477 ad718271a9eb
child 44484 ec54b3d2af0b
equal deleted inserted replaced
44470:a08bbdf839ae 44477:ad718271a9eb
       
     1 from __future__ import absolute_import
       
     2 
       
     3 import collections
       
     4 import os
       
     5 import sqlite3
       
     6 
       
     7 import pygit2
       
     8 
       
     9 from mercurial.i18n import _
       
    10 
       
    11 from mercurial import (
       
    12     encoding,
       
    13     error,
       
    14     node as nodemod,
       
    15     pycompat,
       
    16 )
       
    17 
       
    18 from . import gitutil
       
    19 
       
    20 
       
    21 _CURRENT_SCHEMA_VERSION = 1
       
    22 _SCHEMA = (
       
    23     """
       
    24 CREATE TABLE refs (
       
    25   -- node and name are unique together. There may be more than one name for
       
    26   -- a given node, and there may be no name at all for a given node (in the
       
    27   -- case of an anonymous hg head).
       
    28   node TEXT NOT NULL,
       
    29   name TEXT
       
    30 );
       
    31 
       
    32 -- The "possible heads" of the repository, which we use to figure out
       
    33 -- if we need to re-walk the changelog.
       
    34 CREATE TABLE possible_heads (
       
    35   node TEXT NOT NULL
       
    36 );
       
    37 
       
    38 -- The topological heads of the changelog, which hg depends on.
       
    39 CREATE TABLE heads (
       
    40   node TEXT NOT NULL
       
    41 );
       
    42 
       
    43 -- A total ordering of the changelog
       
    44 CREATE TABLE changelog (
       
    45   rev INTEGER NOT NULL PRIMARY KEY,
       
    46   node TEXT NOT NULL,
       
    47   p1 TEXT,
       
    48   p2 TEXT
       
    49 );
       
    50 
       
    51 CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
       
    52 CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);
       
    53 
       
    54 -- Changed files for each commit, which lets us dynamically build
       
    55 -- filelogs.
       
    56 CREATE TABLE changedfiles (
       
    57   node TEXT NOT NULL,
       
    58   filename TEXT NOT NULL,
       
    59   -- 40 zeroes for deletions
       
    60   filenode TEXT NOT NULL,
       
    61 -- to handle filelog parentage:
       
    62   p1node TEXT,
       
    63   p1filenode TEXT,
       
    64   p2node TEXT,
       
    65   p2filenode TEXT
       
    66 );
       
    67 
       
    68 CREATE INDEX changedfiles_nodes_idx
       
    69   ON changedfiles(node);
       
    70 
       
    71 PRAGMA user_version=%d
       
    72 """
       
    73     % _CURRENT_SCHEMA_VERSION
       
    74 )
       
    75 
       
    76 
       
    77 def _createdb(path):
       
    78     # print('open db', path)
       
    79     # import traceback
       
    80     # traceback.print_stack()
       
    81     db = sqlite3.connect(encoding.strfromlocal(path))
       
    82     db.text_factory = bytes
       
    83 
       
    84     res = db.execute('PRAGMA user_version').fetchone()[0]
       
    85 
       
    86     # New database.
       
    87     if res == 0:
       
    88         for statement in _SCHEMA.split(';'):
       
    89             db.execute(statement.strip())
       
    90 
       
    91         db.commit()
       
    92 
       
    93     elif res == _CURRENT_SCHEMA_VERSION:
       
    94         pass
       
    95 
       
    96     else:
       
    97         raise error.Abort(_(b'sqlite database has unrecognized version'))
       
    98 
       
    99     db.execute('PRAGMA journal_mode=WAL')
       
   100 
       
   101     return db
       
   102 
       
   103 
       
   104 _OUR_ORDER = (
       
   105     pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_REVERSE
       
   106 )
       
   107 
       
   108 _DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2
       
   109 
       
   110 
       
   111 def _find_nearest_ancestor_introducing_node(
       
   112     db, gitrepo, file_path, walk_start, filenode
       
   113 ):
       
   114     """Find the nearest ancestor that introduces a file node.
       
   115 
       
   116     Args:
       
   117       db: a handle to our sqlite database.
       
   118       gitrepo: A pygit2.Repository instance.
       
   119       file_path: the path of a file in the repo
       
   120       walk_start: a pygit2.Oid that is a commit where we should start walking
       
   121                   for our nearest ancestor.
       
   122 
       
   123     Returns:
       
   124       A hexlified SHA that is the commit ID of the next-nearest parent.
       
   125     """
       
   126     assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
       
   127         file_path
       
   128     )
       
   129     assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
       
   130         filenode
       
   131     )
       
   132     parent_options = {
       
   133         row[0].decode('ascii')
       
   134         for row in db.execute(
       
   135             'SELECT node FROM changedfiles '
       
   136             'WHERE filename = ? AND filenode = ?',
       
   137             (file_path, filenode),
       
   138         )
       
   139     }
       
   140     inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
       
   141     for w in inner_walker:
       
   142         if w.id.hex in parent_options:
       
   143             return w.id.hex
       
   144     raise error.ProgrammingError(
       
   145         'Unable to find introducing commit for %s node %s from %s',
       
   146         (file_path, filenode, walk_start),
       
   147     )
       
   148 
       
   149 
       
   150 def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
       
   151     """Given a starting commit and path, fill in a filelog's parent pointers.
       
   152 
       
   153     Args:
       
   154       gitrepo: a pygit2.Repository
       
   155       db: a handle to our sqlite database
       
   156       startcommit: a hexlified node id for the commit to start at
       
   157       path: the path of the file whose parent pointers we should fill in.
       
   158       filenode: the hexlified node id of the file at startcommit
       
   159 
       
   160     TODO: make filenode optional
       
   161     """
       
   162     assert isinstance(
       
   163         startcommit, str
       
   164     ), 'startcommit must be str, got %r' % type(startcommit)
       
   165     assert isinstance(
       
   166         startfilenode, str
       
   167     ), 'startfilenode must be str, got %r' % type(startfilenode)
       
   168     visit = collections.deque([(startcommit, startfilenode)])
       
   169     while visit:
       
   170         cnode, filenode = visit.popleft()
       
   171         commit = gitrepo[cnode]
       
   172         parents = []
       
   173         for parent in commit.parents:
       
   174             t = parent.tree
       
   175             for comp in path.split('/'):
       
   176                 try:
       
   177                     t = gitrepo[t[comp].id]
       
   178                 except KeyError:
       
   179                     break
       
   180             else:
       
   181                 introducer = _find_nearest_ancestor_introducing_node(
       
   182                     db, gitrepo, path, parent.id, t.id.hex
       
   183                 )
       
   184                 parents.append((introducer, t.id.hex))
       
   185         p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
       
   186         for par, parfnode in parents:
       
   187             found = int(
       
   188                 db.execute(
       
   189                     'SELECT COUNT(*) FROM changedfiles WHERE '
       
   190                     'node = ? AND filename = ? AND filenode = ? AND '
       
   191                     'p1node NOT NULL',
       
   192                     (par, path, parfnode),
       
   193                 ).fetchone()[0]
       
   194             )
       
   195             if found == 0:
       
   196                 assert par is not None
       
   197                 visit.append((par, parfnode))
       
   198         if parents:
       
   199             p1node, p1fnode = parents[0]
       
   200         if len(parents) == 2:
       
   201             p2node, p2fnode = parents[1]
       
   202         if len(parents) > 2:
       
   203             raise error.ProgrammingError(
       
   204                 b"git support can't handle octopus merges"
       
   205             )
       
   206         db.execute(
       
   207             'UPDATE changedfiles SET '
       
   208             'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
       
   209             'WHERE node = ? AND filename = ? AND filenode = ?',
       
   210             (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
       
   211         )
       
   212     db.commit()
       
   213 
       
   214 
       
   215 def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
       
   216     # Identify all references so we can tell the walker to visit all of them.
       
   217     all_refs = gitrepo.listall_references()
       
   218     possible_heads = set()
       
   219     prog = progress_factory(b'refs')
       
   220     for pos, ref in enumerate(all_refs):
       
   221         if prog is not None:
       
   222             prog.update(pos)
       
   223         if not (
       
   224             ref.startswith('refs/heads/')  # local branch
       
   225             or ref.startswith('refs/tags/')  # tag
       
   226             or ref.startswith('refs/remotes/')  # remote branch
       
   227             or ref.startswith('refs/hg/')  # from this extension
       
   228         ):
       
   229             continue
       
   230         try:
       
   231             start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
       
   232         except ValueError:
       
   233             # No commit to be found, so we don't care for hg's purposes.
       
   234             continue
       
   235         possible_heads.add(start.id)
       
   236     # Optimization: if the list of heads hasn't changed, don't
       
   237     # reindex, the changelog. This doesn't matter on small
       
   238     # repositories, but on even moderately deep histories (eg cpython)
       
   239     # this is a very important performance win.
       
   240     #
       
   241     # TODO: we should figure out how to incrementally index history
       
   242     # (preferably by detecting rewinds!) so that we don't have to do a
       
   243     # full changelog walk every time a new commit is created.
       
   244     cache_heads = {x[0] for x in db.execute('SELECT node FROM possible_heads')}
       
   245     walker = None
       
   246     cur_cache_heads = {h.hex for h in possible_heads}
       
   247     if cur_cache_heads == cache_heads:
       
   248         return
       
   249     for start in possible_heads:
       
   250         if walker is None:
       
   251             walker = gitrepo.walk(start, _OUR_ORDER)
       
   252         else:
       
   253             walker.push(start)
       
   254 
       
   255     # Empty out the existing changelog. Even for large-ish histories
       
   256     # we can do the top-level "walk all the commits" dance very
       
   257     # quickly as long as we don't need to figure out the changed files
       
   258     # list.
       
   259     db.execute('DELETE FROM changelog')
       
   260     if prog is not None:
       
   261         prog.complete()
       
   262     prog = progress_factory(b'commits')
       
   263     # This walker is sure to visit all the revisions in history, but
       
   264     # only once.
       
   265     for pos, commit in enumerate(walker):
       
   266         if prog is not None:
       
   267             prog.update(pos)
       
   268         p1 = p2 = nodemod.nullhex
       
   269         if len(commit.parents) > 2:
       
   270             raise error.ProgrammingError(
       
   271                 (
       
   272                     b"git support can't handle octopus merges, "
       
   273                     b"found a commit with %d parents :("
       
   274                 )
       
   275                 % len(commit.parents)
       
   276             )
       
   277         if commit.parents:
       
   278             p1 = commit.parents[0].id.hex
       
   279         if len(commit.parents) == 2:
       
   280             p2 = commit.parents[1].id.hex
       
   281         db.execute(
       
   282             'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
       
   283             (pos, commit.id.hex, p1, p2),
       
   284         )
       
   285 
       
   286         num_changedfiles = db.execute(
       
   287             "SELECT COUNT(*) from changedfiles WHERE node = ?",
       
   288             (commit.id.hex,),
       
   289         ).fetchone()[0]
       
   290         if not num_changedfiles:
       
   291             files = {}
       
   292             # I *think* we only need to check p1 for changed files
       
   293             # (and therefore linkrevs), because any node that would
       
   294             # actually have this commit as a linkrev would be
       
   295             # completely new in this rev.
       
   296             p1 = commit.parents[0].id.hex if commit.parents else None
       
   297             if p1 is not None:
       
   298                 patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
       
   299             else:
       
   300                 patchgen = commit.tree.diff_to_tree(
       
   301                     swap=True, flags=_DIFF_FLAGS
       
   302                 )
       
   303             new_files = (p.delta.new_file for p in patchgen)
       
   304             files = {
       
   305                 nf.path: nf.id.hex
       
   306                 for nf in new_files
       
   307                 if nf.id.raw != nodemod.nullid
       
   308             }
       
   309             for p, n in files.items():
       
   310                 # We intentionally set NULLs for any file parentage
       
   311                 # information so it'll get demand-computed later. We
       
   312                 # used to do it right here, and it was _very_ slow.
       
   313                 db.execute(
       
   314                     'INSERT INTO changedfiles ('
       
   315                     'node, filename, filenode, p1node, p1filenode, p2node, '
       
   316                     'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
       
   317                     (commit.id.hex, p, n, None, None, None, None),
       
   318                 )
       
   319     db.execute('DELETE FROM heads')
       
   320     db.execute('DELETE FROM possible_heads')
       
   321     for hid in possible_heads:
       
   322         h = hid.hex
       
   323         db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
       
   324         haschild = db.execute(
       
   325             'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
       
   326         ).fetchone()[0]
       
   327         if not haschild:
       
   328             db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
       
   329 
       
   330     db.commit()
       
   331     if prog is not None:
       
   332         prog.complete()
       
   333 
       
   334 
       
   335 def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
       
   336     cachepath = os.path.join(
       
   337         pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
       
   338     )
       
   339     if not os.path.exists(cachepath):
       
   340         os.makedirs(cachepath)
       
   341     dbpath = os.path.join(cachepath, b'git-commits.sqlite')
       
   342     db = _createdb(dbpath)
       
   343     # TODO check against gitrepo heads before doing a full index
       
   344     # TODO thread a ui.progress call into this layer
       
   345     _index_repo(gitrepo, db, progress_factory)
       
   346     return db