mercurial-scm/hg: comparison hgext/git/index.py

equal deleted inserted replaced

-:a08bbdf839ae
+:ad718271a9eb
+from __future__ import absolute_import
+import collections
+import os
+import sqlite3
+import pygit2
+from mercurial.i18n import _
+from mercurial import (
+encoding,
+error,
+node as nodemod,
+pycompat,
+)
+from . import gitutil
+_CURRENT_SCHEMA_VERSION = 1
+_SCHEMA = (
+"""
+CREATE TABLE refs (
+-- node and name are unique together. There may be more than one name for
+-- a given node, and there may be no name at all for a given node (in the
+-- case of an anonymous hg head).
+node TEXT NOT NULL,
+name TEXT
+);
+-- The "possible heads" of the repository, which we use to figure out
+-- if we need to re-walk the changelog.
+CREATE TABLE possible_heads (
+node TEXT NOT NULL
+);
+-- The topological heads of the changelog, which hg depends on.
+CREATE TABLE heads (
+node TEXT NOT NULL
+);
+-- A total ordering of the changelog
+CREATE TABLE changelog (
+rev INTEGER NOT NULL PRIMARY KEY,
+node TEXT NOT NULL,
+p1 TEXT,
+p2 TEXT
+);
+CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
+CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);
+-- Changed files for each commit, which lets us dynamically build
+-- filelogs.
+CREATE TABLE changedfiles (
+node TEXT NOT NULL,
+filename TEXT NOT NULL,
+-- 40 zeroes for deletions
+filenode TEXT NOT NULL,
+-- to handle filelog parentage:
+p1node TEXT,
+p1filenode TEXT,
+p2node TEXT,
+p2filenode TEXT
+);
+CREATE INDEX changedfiles_nodes_idx
+ON changedfiles(node);
+PRAGMA user_version=%d
+"""
+% _CURRENT_SCHEMA_VERSION
+)
+def _createdb(path):
+# print('open db', path)
+# import traceback
+# traceback.print_stack()
+db = sqlite3.connect(encoding.strfromlocal(path))
+db.text_factory = bytes
+res = db.execute('PRAGMA user_version').fetchone()[0]
+# New database.
+if res == 0:
+for statement in _SCHEMA.split(';'):
+db.execute(statement.strip())
+db.commit()
+elif res == _CURRENT_SCHEMA_VERSION:
+pass
+else:
+raise error.Abort(_(b'sqlite database has unrecognized version'))
+db.execute('PRAGMA journal_mode=WAL')
+return db
+_OUR_ORDER = (
+pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_REVERSE
+)
+_DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2
+def _find_nearest_ancestor_introducing_node(
+db, gitrepo, file_path, walk_start, filenode
+):
+"""Find the nearest ancestor that introduces a file node.
+Args:
+db: a handle to our sqlite database.
+gitrepo: A pygit2.Repository instance.
+file_path: the path of a file in the repo
+walk_start: a pygit2.Oid that is a commit where we should start walking
+for our nearest ancestor.
+Returns:
+A hexlified SHA that is the commit ID of the next-nearest parent.
+"""
+assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
+file_path
+)
+assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
+filenode
+)
+parent_options = {
+row[0].decode('ascii')
+for row in db.execute(
+'SELECT node FROM changedfiles '
+'WHERE filename = ? AND filenode = ?',
+(file_path, filenode),
+)
+}
+inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
+for w in inner_walker:
+if w.id.hex in parent_options:
+return w.id.hex
+raise error.ProgrammingError(
+'Unable to find introducing commit for %s node %s from %s',
+(file_path, filenode, walk_start),
+)
+def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
+"""Given a starting commit and path, fill in a filelog's parent pointers.
+Args:
+gitrepo: a pygit2.Repository
+db: a handle to our sqlite database
+startcommit: a hexlified node id for the commit to start at
+path: the path of the file whose parent pointers we should fill in.
+filenode: the hexlified node id of the file at startcommit
+TODO: make filenode optional
+"""
+assert isinstance(
+startcommit, str
+), 'startcommit must be str, got %r' % type(startcommit)
+assert isinstance(
+startfilenode, str
+), 'startfilenode must be str, got %r' % type(startfilenode)
+visit = collections.deque([(startcommit, startfilenode)])
+while visit:
+cnode, filenode = visit.popleft()
+commit = gitrepo[cnode]
+parents = []
+for parent in commit.parents:
+t = parent.tree
+for comp in path.split('/'):
+try:
+t = gitrepo[t[comp].id]
+except KeyError:
+break
+else:
+introducer = _find_nearest_ancestor_introducing_node(
+db, gitrepo, path, parent.id, t.id.hex
+)
+parents.append((introducer, t.id.hex))
+p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
+for par, parfnode in parents:
+found = int(
+db.execute(
+'SELECT COUNT(*) FROM changedfiles WHERE '
+'node = ? AND filename = ? AND filenode = ? AND '
+'p1node NOT NULL',
+(par, path, parfnode),
+).fetchone()[0]
+)
+if found == 0:
+assert par is not None
+visit.append((par, parfnode))
+if parents:
+p1node, p1fnode = parents[0]
+if len(parents) == 2:
+p2node, p2fnode = parents[1]
+if len(parents) > 2:
+raise error.ProgrammingError(
+b"git support can't handle octopus merges"
+)
+db.execute(
+'UPDATE changedfiles SET '
+'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
+'WHERE node = ? AND filename = ? AND filenode = ?',
+(p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
+)
+db.commit()
+def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
+# Identify all references so we can tell the walker to visit all of them.
+all_refs = gitrepo.listall_references()
+possible_heads = set()
+prog = progress_factory(b'refs')
+for pos, ref in enumerate(all_refs):
+if prog is not None:
+prog.update(pos)
+if not (
+ref.startswith('refs/heads/')  # local branch
+or ref.startswith('refs/tags/')  # tag
+or ref.startswith('refs/remotes/')  # remote branch
+or ref.startswith('refs/hg/')  # from this extension
+):
+continue
+try:
+start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
+except ValueError:
+# No commit to be found, so we don't care for hg's purposes.
+continue
+possible_heads.add(start.id)
+# Optimization: if the list of heads hasn't changed, don't
+# reindex, the changelog. This doesn't matter on small
+# repositories, but on even moderately deep histories (eg cpython)
+# this is a very important performance win.
+#
+# TODO: we should figure out how to incrementally index history
+# (preferably by detecting rewinds!) so that we don't have to do a
+# full changelog walk every time a new commit is created.
+cache_heads = {x[0] for x in db.execute('SELECT node FROM possible_heads')}
+walker = None
+cur_cache_heads = {h.hex for h in possible_heads}
+if cur_cache_heads == cache_heads:
+return
+for start in possible_heads:
+if walker is None:
+walker = gitrepo.walk(start, _OUR_ORDER)
+else:
+walker.push(start)
+# Empty out the existing changelog. Even for large-ish histories
+# we can do the top-level "walk all the commits" dance very
+# quickly as long as we don't need to figure out the changed files
+# list.
+db.execute('DELETE FROM changelog')
+if prog is not None:
+prog.complete()
+prog = progress_factory(b'commits')
+# This walker is sure to visit all the revisions in history, but
+# only once.
+for pos, commit in enumerate(walker):
+if prog is not None:
+prog.update(pos)
+p1 = p2 = nodemod.nullhex
+if len(commit.parents) > 2:
+raise error.ProgrammingError(
+(
+b"git support can't handle octopus merges, "
+b"found a commit with %d parents :("
+)
+% len(commit.parents)
+)
+if commit.parents:
+p1 = commit.parents[0].id.hex
+if len(commit.parents) == 2:
+p2 = commit.parents[1].id.hex
+db.execute(
+'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
+(pos, commit.id.hex, p1, p2),
+)
+num_changedfiles = db.execute(
+"SELECT COUNT(*) from changedfiles WHERE node = ?",
+(commit.id.hex,),
+).fetchone()[0]
+if not num_changedfiles:
+files = {}
+# I *think* we only need to check p1 for changed files
+# (and therefore linkrevs), because any node that would
+# actually have this commit as a linkrev would be
+# completely new in this rev.
+p1 = commit.parents[0].id.hex if commit.parents else None
+if p1 is not None:
+patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
+else:
+patchgen = commit.tree.diff_to_tree(
+swap=True, flags=_DIFF_FLAGS
+)
+new_files = (p.delta.new_file for p in patchgen)
+files = {
+nf.path: nf.id.hex
+for nf in new_files
+if nf.id.raw != nodemod.nullid
+}
+for p, n in files.items():
+# We intentionally set NULLs for any file parentage
+# information so it'll get demand-computed later. We
+# used to do it right here, and it was _very_ slow.
+db.execute(
+'INSERT INTO changedfiles ('
+'node, filename, filenode, p1node, p1filenode, p2node, '
+'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
+(commit.id.hex, p, n, None, None, None, None),
+)
+db.execute('DELETE FROM heads')
+db.execute('DELETE FROM possible_heads')
+for hid in possible_heads:
+h = hid.hex
+db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
+haschild = db.execute(
+'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
+).fetchone()[0]
+if not haschild:
+db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
+db.commit()
+if prog is not None:
+prog.complete()
+def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
+cachepath = os.path.join(
+pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
+)
+if not os.path.exists(cachepath):
+os.makedirs(cachepath)
+dbpath = os.path.join(cachepath, b'git-commits.sqlite')
+db = _createdb(dbpath)
+# TODO check against gitrepo heads before doing a full index
+# TODO thread a ui.progress call into this layer
+_index_repo(gitrepo, db, progress_factory)
+return db

changeset 44477	ad718271a9eb
child 44484	ec54b3d2af0b