diff rust/hg-core/src/matchers.rs @ 44593:496868f1030c

rust-matchers: use the `regex` crate. Instead of falling back to Python when a code path with "ignore" functionality is reached and `Re2` is not installed, the default compilation (i.e. without the `with-re2` feature) will use the `regex` crate for all regular-expression handling. As with the introduction of `Re2` in a previous series, this yields a big performance boost compared to the Python + C code in `status`, `diff`, `commit`, `update`, and maybe others. For now `Re2` looks to be faster at compiling the DFA (1.5ms vs 5ms for Netbeans' `.hgignore`) and a bit faster in actual use (123ms vs 137ms for the parallel traversal of Netbeans' clean repo). I am in talks with the author of `regex` to see whether that performance difference is a bug, a "won't fix", or a tuning issue. The `regex` crate is already one of our dependencies, and using this code does not require any additional work from the end-user other than using the Rust extensions. Differential Revision: https://phab.mercurial-scm.org/D8323
author Raphaël Gomès <rgomes@octobus.net>
date Tue, 24 Mar 2020 17:55:59 +0100
parents d880805d5442
children e62052d0f377
line wrap: on
line diff
--- a/rust/hg-core/src/matchers.rs	Sun Mar 15 16:11:58 2020 +0900
+++ b/rust/hg-core/src/matchers.rs	Tue Mar 24 17:55:59 2020 +0100
@@ -331,8 +331,37 @@
 }
 
 #[cfg(not(feature = "with-re2"))]
-fn re_matcher(_: &[u8]) -> PatternResult<Box<dyn Fn(&HgPath) -> bool + Sync>> {
-    Err(PatternError::Re2NotInstalled)
+/// Returns a function that matches an `HgPath` against the given regex
+/// pattern, after escaping bytes above 127 in `pattern` as `\xNN`.
+///
+/// Fails with `PatternError::UnsupportedSyntax` when the pattern is invalid
+/// or not supported by the underlying engine (the `regex` crate), for
+/// instance anything with back-references.
+fn re_matcher(
+    pattern: &[u8],
+) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
+    use std::io::Write;
+
+    let mut escaped_bytes = vec![];
+    for byte in pattern {
+        if *byte > 127 {
+            write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
+        } else {
+            escaped_bytes.push(*byte);
+        }
+    }
+
+    // Skip UTF-8 validation of the buffer built by the loop above.
+    //
+    // SAFETY: every byte above 127 was replaced by an ASCII `\xNN` escape,
+    // so `escaped_bytes` only holds ASCII, which is always valid UTF-8.
+    let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
+    let re = regex::bytes::RegexBuilder::new(&pattern_string)
+        .unicode(false)
+        .build()
+        .map_err(|e| PatternError::UnsupportedSyntax(e.to_string()))?;
+
+    Ok(move |path: &HgPath| re.is_match(path.as_bytes()))
 
 /// Returns the regex pattern and a function that matches an `HgPath` against