diff mercurial/filesetlang.py @ 38904:899b4c74209c

fileset: combine union of basic patterns into single matcher This appears to improve query performance in a big repository more than I thought. The less Python we run in a hot loop, the faster the computation becomes. $ hg files --cwd mozilla-central --time 'set:a* + b* + c* + d* + e*' (orig) time: real 0.670 secs (user 0.640+0.000 sys 0.030+0.000) (new) time: real 0.210 secs (user 0.180+0.000 sys 0.020+0.000)
author Yuya Nishihara <yuya@tcha.org>
date Sat, 21 Jul 2018 17:19:12 +0900
parents 73731fa8d1bd
children e79a69af1593
line wrap: on
line diff
--- a/mercurial/filesetlang.py	Sat Jul 21 17:13:34 2018 +0900
+++ b/mercurial/filesetlang.py	Sat Jul 21 17:19:12 2018 +0900
@@ -185,6 +185,21 @@
         return ('minus', ta, tb[1])
     return (op, ta, tb)
 
+def _optimizeunion(xs):
+    # collect string patterns so they can be compiled into a single regexp
+    ws, ts, ss = [], [], []  # weights, optimized trees, held-back patterns
+    for x in xs:
+        w, t = _optimize(x)
+        if t is not None and t[0] in {'string', 'symbol', 'kindpat'}:
+            ss.append(t)  # held back; merged into one 'patterns' node below
+            continue
+        ws.append(w)
+        ts.append(t)
+    if ss:
+        ws.append(WEIGHT_CHECK_FILENAME)  # weight of the merged node
+        ts.append(('patterns',) + tuple(ss))  # merged node goes last
+    return ws, ts  # parallel lists: ws[i] is the weight of ts[i]
+
 def _optimize(x):
     if x is None:
         return 0, x
@@ -206,7 +221,9 @@
         else:
             return wb, _optimizeandops(op, tb, ta)
     if op == 'or':
-        ws, ts = zip(*(_optimize(y) for y in x[1:]))
+        ws, ts = _optimizeunion(x[1:])
+        if len(ts) == 1:
+            return ws[0], ts[0] # 'or' operation is fully optimized out
         ts = tuple(it[1] for it in sorted(enumerate(ts),
                                           key=lambda it: ws[it[0]]))
         return max(ws), (op,) + ts