hgext/git/index.py @ 52652:4dadaf300fe0
git: index changed files on-demand
Instead of indexing the changed files for every commit immediately, we can
index...
1. heads' changed files immediately
2. other commits' changed files on-demand
This helps a lot on repositories with large histories since the initial
mercurial invocation doesn't have to wait for the complete repo history to
be indexed.
author:   Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
date:     Fri, 04 Oct 2024 10:51:44 -0400
parents:  42f00965e50b
children: 3865451a5fab
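The strategy described in the commit message, eagerly indexing changed files only for head commits and filling in the rest on demand, is implemented by _index_repo, _index_repo_commit and get_index in the source below. As a rough illustration of how a caller might drive it, here is a hypothetical sketch; only get_index() and _index_repo_commit() come from this module, while the driver code, the repository path and the placeholder commit id are assumptions (the real callers live elsewhere in the git extension and are not shown here):

import pygit2

from hgext.git import index as gitindex

# Open the git repository and its sqlite commit index.  get_index() walks the
# changelog and eagerly indexes changed files only for the head commits.
gitrepo = pygit2.Repository('/path/to/repo/.git')
db = gitindex.get_index(gitrepo)

# Later, when a non-head commit's file list is actually needed, it gets
# indexed on demand.  _index_repo_commit() returns early for commits that
# were already indexed.  'somenode' stands in for a real hex commit id.
somenode = '0123456789abcdef0123456789abcdef01234567'
gitindex._index_repo_commit(gitrepo, db, somenode, commit=True)

changed = db.execute(
    'SELECT filename, filenode FROM changedfiles WHERE node = ?',
    (somenode,),
).fetchall()

The full source of the module follows.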
from __future__ import annotations

import collections
import os
import sqlite3

from mercurial.i18n import _
from mercurial.node import sha1nodeconstants

from mercurial import (
    encoding,
    error,
    pycompat,
)

from . import gitutil


pygit2 = gitutil.get_pygit2()

_CURRENT_SCHEMA_VERSION = 4
_SCHEMA = (
    """
CREATE TABLE refs (
  -- node and name are unique together. There may be more than one name for
  -- a given node, and there may be no name at all for a given node (in the
  -- case of an anonymous hg head).
  node TEXT NOT NULL,
  name TEXT
);

-- The "possible heads" of the repository, which we use to figure out
-- if we need to re-walk the changelog.
CREATE TABLE possible_heads (
  node TEXT NOT NULL
);

CREATE UNIQUE INDEX possible_heads_idx ON possible_heads(node);

-- The topological heads of the changelog, which hg depends on.
CREATE TABLE heads (
  node TEXT NOT NULL
);

-- A total ordering of the changelog
CREATE TABLE changelog (
  rev INTEGER NOT NULL PRIMARY KEY,
  node TEXT NOT NULL,
  p1 TEXT,
  p2 TEXT,
  synthetic TEXT,
  changedfiles BOOLEAN
);

CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);

-- Changed files for each commit, which lets us dynamically build
-- filelogs.
CREATE TABLE changedfiles (
  node TEXT NOT NULL,
  filename TEXT NOT NULL,
  -- 40 zeroes for deletions
  filenode TEXT NOT NULL,
  -- to handle filelog parentage:
  p1node TEXT,
  p1filenode TEXT,
  p2node TEXT,
  p2filenode TEXT
);

CREATE INDEX changedfiles_nodes_idx ON changedfiles(node);

PRAGMA user_version=%d
"""
    % _CURRENT_SCHEMA_VERSION
)


def _createdb(path):
    # print('open db', path)
    # import traceback
    # traceback.print_stack()
    db = sqlite3.connect(encoding.strfromlocal(path))
    db.text_factory = bytes

    res = db.execute('PRAGMA user_version').fetchone()[0]

    # New database.
    if res == 0:
        for statement in _SCHEMA.split(';'):
            db.execute(statement.strip())
        db.commit()

    elif res == _CURRENT_SCHEMA_VERSION:
        pass

    else:
        raise error.Abort(_(b'sqlite database has unrecognized version'))

    db.execute('PRAGMA journal_mode=WAL')

    return db


_OUR_ORDER = ()
if pygit2:
    _OUR_ORDER = (
        pygit2.GIT_SORT_TOPOLOGICAL
        | pygit2.GIT_SORT_TIME
        | pygit2.GIT_SORT_REVERSE
    )

_DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2


def _find_nearest_ancestor_introducing_node(
    db, gitrepo, file_path, walk_start, filenode
):
    """Find the nearest ancestor that introduces a file node.

    Args:
      db: a handle to our sqlite database.
      gitrepo: A pygit2.Repository instance.
      file_path: the path of a file in the repo
      walk_start: a pygit2.Oid that is a commit where we should start walking
        for our nearest ancestor.
      filenode: the hexlified node id of the file we are looking for.

    Returns:
      A hexlified SHA that is the commit ID of the next-nearest parent.
    """
    assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
        file_path
    )
    assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
        filenode
    )
    parent_options = {
        row[0].decode('ascii')
        for row in db.execute(
            'SELECT node FROM changedfiles '
            'WHERE filename = ? AND filenode = ?',
            (file_path, filenode),
        )
    }
    inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
    for w in inner_walker:
        if w.id.hex in parent_options:
            return w.id.hex
    raise error.ProgrammingError(
        'Unable to find introducing commit for %s node %s from %s',
        (file_path, filenode, walk_start),
    )


def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
    """Given a starting commit and path, fill in a filelog's parent pointers.

    Args:
      gitrepo: a pygit2.Repository
      db: a handle to our sqlite database
      startcommit: a hexlified node id for the commit to start at
      path: the path of the file whose parent pointers we should fill in.
      filenode: the hexlified node id of the file at startcommit

    TODO: make filenode optional
    """
    assert isinstance(
        startcommit, str
    ), 'startcommit must be str, got %r' % type(startcommit)
    assert isinstance(
        startfilenode, str
    ), 'startfilenode must be str, got %r' % type(startfilenode)
    visit = collections.deque([(startcommit, startfilenode)])
    while visit:
        cnode, filenode = visit.popleft()
        commit = gitrepo[cnode]
        parents = []
        for parent in commit.parents:
            t = parent.tree
            for comp in path.split('/'):
                try:
                    t = gitrepo[t[comp].id]
                except KeyError:
                    break
            else:
                introducer = _find_nearest_ancestor_introducing_node(
                    db, gitrepo, path, parent.id, t.id.hex
                )
                parents.append((introducer, t.id.hex))
        p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
        for par, parfnode in parents:
            found = int(
                db.execute(
                    'SELECT COUNT(*) FROM changedfiles WHERE '
                    'node = ? AND filename = ? AND filenode = ? AND '
                    'p1node NOT NULL',
                    (par, path, parfnode),
                ).fetchone()[0]
            )
            if found == 0:
                assert par is not None
                visit.append((par, parfnode))
        if parents:
            p1node, p1fnode = parents[0]
        if len(parents) == 2:
            p2node, p2fnode = parents[1]
        if len(parents) > 2:
            raise error.ProgrammingError(
                b"git support can't handle octopus merges"
            )
        db.execute(
            'UPDATE changedfiles SET '
            'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
            'WHERE node = ? AND filename = ? AND filenode = ?',
            (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
        )
    db.commit()


def _index_repo_commit(gitrepo, db, node, commit=False):
    already_done = db.execute(
        "SELECT changedfiles FROM changelog WHERE node=?", (node,)
    ).fetchone()[0]
    if already_done:
        return  # This commit has already been indexed

    gitcommit = gitrepo[node]

    files = {}
    # I *think* we only need to check p1 for changed files
    # (and therefore linkrevs), because any node that would
    # actually have this commit as a linkrev would be
    # completely new in this rev.
    p1 = gitcommit.parents[0].id.hex if gitcommit.parents else None
    if p1 is not None:
        patchgen = gitrepo.diff(p1, gitcommit.id.hex, flags=_DIFF_FLAGS)
    else:
        patchgen = gitcommit.tree.diff_to_tree(swap=True, flags=_DIFF_FLAGS)
    new_files = (p.delta.new_file for p in patchgen)
    files = {
        nf.path: nf.id.hex
        for nf in new_files
        if nf.id.raw != sha1nodeconstants.nullid
    }
    for p, n in files.items():
        # We intentionally set NULLs for any file parentage
        # information so it'll get demand-computed later. We
        # used to do it right here, and it was _very_ slow.
        db.execute(
            'INSERT INTO changedfiles ('
            'node, filename, filenode, p1node, p1filenode, p2node, '
            'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
            (gitcommit.id.hex, p, n, None, None, None, None),
        )

    # Mark the commit as loaded
    db.execute(
        "UPDATE changelog SET changedfiles=TRUE WHERE node=?",
        (gitcommit.id.hex,),
    )

    # Only flush the transaction if the caller asked for it.
    if commit:
        db.commit()


def _index_repo(
    gitrepo,
    db,
    logfn=lambda x: None,
    progress_factory=lambda *args, **kwargs: None,
):
    # Identify all references so we can tell the walker to visit all of them.
    all_refs = gitrepo.listall_references()
    possible_heads = set()
    prog = progress_factory(b'refs')
    for pos, ref in enumerate(all_refs):
        if prog is not None:
            prog.update(pos)
        if not (
            ref.startswith('refs/heads/')  # local branch
            or ref.startswith('refs/tags/')  # tag
            or ref.startswith('refs/remotes/')  # remote branch
            or ref.startswith('refs/hg/')  # from this extension
        ):
            continue
        try:
            start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
        except ValueError:
            # No commit to be found, so we don't care for hg's purposes.
            continue
        possible_heads.add(start.id)

    # Optimization: if the list of refs hasn't changed, don't
    # reindex the changelog. This doesn't matter on small
    # repositories, but on even moderately deep histories (e.g., cpython)
    # this is a very important performance win.
    #
    # TODO: we should figure out how to incrementally index history
    # (preferably by detecting rewinds!) so that we don't have to do a
    # full changelog walk every time a new commit is created.
    cache_heads = {
        pycompat.sysstr(x[0])
        for x in db.execute('SELECT node FROM possible_heads')
    }
    walker = None
    cur_cache_heads = {h.hex for h in possible_heads}
    if cur_cache_heads == cache_heads:
        return
    logfn(b'heads mismatch, rebuilding dagcache\n')
    for start in possible_heads:
        if walker is None:
            walker = gitrepo.walk(start, _OUR_ORDER)
        else:
            walker.push(start)

    # Empty out the existing changelog. Even for large-ish histories
    # we can do the top-level "walk all the commits" dance very
    # quickly as long as we don't need to figure out the changed files
    # list.
    db.execute('DELETE FROM changelog')

    if prog is not None:
        prog.complete()
    prog = progress_factory(b'commits')
    # This walker is sure to visit all the revisions in history, but
    # only once.
    pos = -1
    for commit in walker:
        if prog is not None:
            prog.update(pos)
        p1 = p2 = gitutil.nullgit
        if len(commit.parents) <= 2:
            if commit.parents:
                p1 = commit.parents[0].id.hex
            if len(commit.parents) == 2:
                p2 = commit.parents[1].id.hex
            pos += 1
            db.execute(
                'INSERT INTO changelog (rev, node, p1, p2, synthetic, changedfiles) '
                'VALUES(?, ?, ?, ?, NULL, FALSE)',
                (pos, commit.id.hex, p1, p2),
            )
        else:
            # Octopus merge: fold the extra parents into a chain of
            # synthetic two-parent changelog entries.
            parents = list(commit.parents)
            p1 = parents.pop(0).id.hex
            while parents:
                pos += 1
                if len(parents) == 1:
                    this = commit.id.hex
                    synth = None
                else:
                    this = "%040x" % pos
                    synth = commit.id.hex
                p2 = parents.pop(0).id.hex
                db.execute(
                    'INSERT INTO changelog (rev, node, p1, p2, synthetic, changedfiles) '
                    'VALUES(?, ?, ?, ?, ?, FALSE)',
                    (pos, this, p1, p2, synth),
                )
                p1 = this

    # Determine heads from the list of possible heads.
    db.execute('DELETE FROM heads')
    db.execute('DELETE FROM possible_heads')
    db.executemany(
        'INSERT INTO possible_heads (node) VALUES(?)',
        [(hid.hex,) for hid in possible_heads],
    )
    db.execute(
        '''
    INSERT INTO heads (node)
    SELECT node FROM possible_heads WHERE node NOT IN (
        SELECT DISTINCT possible_heads.node
        FROM changelog, possible_heads
        WHERE changelog.p1 = possible_heads.node
           OR changelog.p2 = possible_heads.node
    )
    '''
    )

    # Mark all commits with already-loaded changedfiles info
    db.execute(
        '''
    UPDATE changelog SET changedfiles=TRUE WHERE node IN (
        SELECT DISTINCT node FROM changedfiles
    )
    '''
    )

    if prog is not None:
        prog.complete()

    # Index the changed files for head commits
    prog = progress_factory(b'indexing head files')
    heads = [
        row[0].decode('ascii') for row in db.execute("SELECT * FROM heads")
    ]
    for pos, h in enumerate(heads):
        if prog is not None:
            prog.update(pos)
        _index_repo_commit(gitrepo, db, h)

    db.commit()
    if prog is not None:
        prog.complete()


def get_index(
    gitrepo, logfn=lambda x: None, progress_factory=lambda *args, **kwargs: None
):
    cachepath = os.path.join(
        pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
    )
    if not os.path.exists(cachepath):
        os.makedirs(cachepath)
    dbpath = os.path.join(cachepath, b'git-commits.sqlite')
    db = _createdb(dbpath)
    # TODO check against gitrepo heads before doing a full index
    # TODO thread a ui.progress call into this layer
    _index_repo(gitrepo, db, logfn, progress_factory)
    return db
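One more demand-computed piece is worth illustrating: _index_repo_commit() deliberately leaves the parentage columns of changedfiles as NULL, and fill_in_filelog() fills them in the first time a filelog is actually read. The following is a minimal, hypothetical sketch of such a read path; fill_in_filelog() comes from this module, while the helper name file_parents and its queries are illustrative assumptions, not part of the extension:

from hgext.git import index as gitindex


def file_parents(gitrepo, db, commitnode, path, filenode):
    # Illustrative helper: return the (p1filenode, p2filenode) pair for a
    # file at a given commit, computing filelog parentage lazily on first
    # use.  commitnode, path and filenode are str/hexlified values, as the
    # module's asserts require.
    query = (
        'SELECT p1filenode, p2filenode FROM changedfiles '
        'WHERE node = ? AND filename = ? AND filenode = ?'
    )
    row = db.execute(query, (commitnode, path, filenode)).fetchone()
    if row is not None and row[0] is None:
        # Parent pointers were left NULL at indexing time; walk the git
        # history now and persist them in the database.
        gitindex.fill_in_filelog(gitrepo, db, commitnode, path, filenode)
        row = db.execute(query, (commitnode, path, filenode)).fetchone()
    return row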