comparison hgext/git/index.py @ 52622:aa5844ade247

git: speed up possible head processing during indexing by ~100x

Benchmarking of 50 iterations of indexing (see below) shows that there is
essentially no difference for small repos (<1k commits). Similarly, medium
repos (~12k commits) see some benefit, but other overheads completely
overwhelm it. For large repos (~122k commits), however, the 80-100x speedup
is clearly visible to the user.

All of the numbers are in seconds and were measured with time.time() calls
placed in _index_repo(). The times exclude the time taken by changedfiles
processing.

Small repo (guilt, 553 commits, 1 head):

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
0.0008781 0.0009274 0.0009800 0.0012285 0.0014637 0.0024107  (before)
0.0003092 0.0003281 0.0003519 0.0003777 0.0003927 0.0006843  (after)

Medium repo (hamlib, 12k commits, 53 heads):

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
 0.04881  0.05135  0.07632  0.06672  0.08042  0.09415  (before)
0.004249 0.004420 0.004799 0.004809 0.005051 0.006416  (after)

Large repo (qemu, 122k commits, 50 heads):

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  4.274   4.595   4.832   6.578   8.397   9.721  (before)
0.05180 0.05643 0.05865 0.06130 0.06712 0.06872  (after)
author Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
date Wed, 02 Oct 2024 15:01:26 -0400
parents f4733654f144
children 4e2ea270ba6a
comparison
equal deleted inserted replaced
52621:ab4fb2d15bc9 52622:aa5844ade247
16 from . import gitutil 16 from . import gitutil
17 17
18 18
19 pygit2 = gitutil.get_pygit2() 19 pygit2 = gitutil.get_pygit2()
20 20
21 _CURRENT_SCHEMA_VERSION = 1 21 _CURRENT_SCHEMA_VERSION = 2
22 _SCHEMA = ( 22 _SCHEMA = (
23 """ 23 """
24 CREATE TABLE refs ( 24 CREATE TABLE refs (
25 -- node and name are unique together. There may be more than one name for 25 -- node and name are unique together. There may be more than one name for
26 -- a given node, and there may be no name at all for a given node (in the 26 -- a given node, and there may be no name at all for a given node (in the
32 -- The "possible heads" of the repository, which we use to figure out 32 -- The "possible heads" of the repository, which we use to figure out
33 -- if we need to re-walk the changelog. 33 -- if we need to re-walk the changelog.
34 CREATE TABLE possible_heads ( 34 CREATE TABLE possible_heads (
35 node TEXT NOT NULL 35 node TEXT NOT NULL
36 ); 36 );
37
38 CREATE UNIQUE INDEX possible_heads_idx ON possible_heads(node);
37 39
38 -- The topological heads of the changelog, which hg depends on. 40 -- The topological heads of the changelog, which hg depends on.
39 CREATE TABLE heads ( 41 CREATE TABLE heads (
40 node TEXT NOT NULL 42 node TEXT NOT NULL
41 ); 43 );
329 'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)', 331 'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
330 (commit.id.hex, p, n, None, None, None, None), 332 (commit.id.hex, p, n, None, None, None, None),
331 ) 333 )
332 db.execute('DELETE FROM heads') 334 db.execute('DELETE FROM heads')
333 db.execute('DELETE FROM possible_heads') 335 db.execute('DELETE FROM possible_heads')
334 for hid in possible_heads: 336 db.executemany(
335 h = hid.hex 337 'INSERT INTO possible_heads (node) VALUES(?)',
336 db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,)) 338 [(hid.hex,) for hid in possible_heads],
337 haschild = db.execute( 339 )
338 'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h) 340 db.execute(
339 ).fetchone()[0] 341 '''
340 if not haschild: 342 INSERT INTO heads (node)
341 db.execute('INSERT INTO heads (node) VALUES(?)', (h,)) 343 SELECT node FROM possible_heads WHERE
344 node NOT IN (
345 SELECT DISTINCT possible_heads.node FROM changelog, possible_heads WHERE
346 changelog.p1 = possible_heads.node OR
347 changelog.p2 = possible_heads.node
348 )
349 '''
350 )
342 351
343 db.commit() 352 db.commit()
344 if prog is not None: 353 if prog is not None:
345 prog.complete() 354 prog.complete()
346 355