mercurial-scm/hg: rust/hg-core/src/filepatterns.rs comparison

comparison rust/hg-core/src/filepatterns.rs @ 52556:1866119cbad7

rust-ignore: construct regex Hir object directly, avoiding large regex string Rework how we convert patterns to regexes in rust. Instead of going patterns -> string -> Regex, which is slow and causes some correctness issues, build a structured regex_syntax::hir::Hir value, which is faster and it also prevents surprising regex escape. This change makes the time of `build_regex_match` go from ~70-80ms to ~40ms in my testing (for a large hgignore). The bug I mentioned involves regex patterns that "escape" their intended scope. For example, a sequence of hgignore regexp patterns like this would previously lead to surprising behavior: foo(?: bar baz ) this matches foobar and foobaz, and doesn't match bar and baz. The new behavior is to report a pattern parse error The Python hg also has this bug, so this bugfix not really helping much, but it's probably better to fall back to real Python bugs than to simulate them.

author	Arseniy Alekseyev <aalekseyev@janestreet.com>
date	Fri, 06 Dec 2024 20:27:59 +0000
parents	e2e49069eeb6
children	94e2547e6f3d

comparison

equal deleted inserted replaced

-:85c095c1f8bc
+:1866119cbad7
 // GNU General Public License version 2 or any later version.
 //! Handling of Mercurial-specific patterns.
 use crate::{
+pre_regex::PreRegex,
 utils::{
 files::{canonical_path, get_bytes_from_path, get_path_from_bytes},
 hg_path::{path_to_hg_path_buf, HgPathBuf, HgPathError},
 SliceExt,
 },
 FastHashMap,
 };
 use lazy_static::lazy_static;
 use regex::bytes::{NoExpand, Regex};
+use std::mem;
 use std::path::{Path, PathBuf};
 use std::vec::Vec;
 use std::{fmt, ops::Deref};
 #[derive(Debug, derive_more::From)]
 v[*byte as usize].insert(0, b'\\');
 }
 v
 };
 }
-/// These are matched in order
-const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
-&[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum PatternSyntax {
 /// A regular expression
 Regexp,
 ///
 /// Note: `Box` is used to minimize size impact on other enum variants
 ExpandedSubInclude(Box<SubInclude>),
 }
-/// Transforms a glob pattern into a regex
+/// A wildcard parsed from a glob
-pub fn glob_to_re(pat: &[u8]) -> Vec<u8> {
+#[derive(Debug, Clone, Copy)]
+enum GlobWildcard {
+/// `**/` matches any sequence of characters ending at a path
+/// component boundary
+AnyComponents,
+/// `*`: matches any sequence of characters within one path component
+AnyNonSlash,
+/// `**`: matches any sequence of characters including slashes
+Anything,
+}
+impl GlobWildcard {
+/// Optimization to simplify the regex prefixes for unrooted globs.
+/// It's unclear if this is worth it for performance, but it also has
+/// some cosmetic effect by making these regexes easier to understand.
+fn make_unrooted(wildcard: Option<GlobWildcard>) -> GlobWildcard {
+match wildcard {
+None => Self::AnyComponents,
+Some(Self::AnyComponents) => Self::AnyComponents,
+Some(Self::AnyNonSlash) => Self::Anything,
+Some(Self::Anything) => Self::Anything,
+}
+}
+fn to_re(self) -> PreRegex {
+match self {
+Self::AnyComponents => PreRegex::preceding_dir_components(),
+Self::AnyNonSlash => PreRegex::NonslashStar,
+Self::Anything => PreRegex::DotStar,
+}
+}
+}
+fn glob_parse_after_star(input: &mut &[u8]) -> GlobWildcard {
+if let Some((b'*', rest)) = input.split_first() {
+if let Some((b'/', rest)) = rest.split_first() {
+*input = rest;
+GlobWildcard::AnyComponents
+} else {
+*input = rest;
+GlobWildcard::Anything
+}
+} else {
+GlobWildcard::AnyNonSlash
+}
+}
+/// The result of glob to re conversion.
+/// The start of the regular expression `start` is tracked
+/// separately for a pattern simplification opportunity
+/// (see `GlobWildcard::make_unrooted`)
+pub struct GlobToRe {
+start: Option<GlobWildcard>,
+rest: Vec<PreRegex>,
+}
+impl GlobToRe {
+/// Convert to a regex. `rooted` specifies if the glob should match
+/// at the root of the repo (true), or anywhere in the repo (false)
+fn into_re(self, rooted: bool) -> PreRegex {
+let wildcard = if !rooted {
+Some(GlobWildcard::make_unrooted(self.start))
+} else {
+self.start
+};
+let mut res: Vec<_> =
+wildcard.into_iter().map(|x| x.to_re()).collect();
+res.extend(self.rest);
+PreRegex::Sequence(res)
+}
+}
+/// Transforms a glob pattern into a regex.
+pub fn glob_to_re(pat: &[u8]) -> PatternResult<GlobToRe> {
+let mut start = None;
+let mut res: Vec<PreRegex> = vec![];
 let mut input = pat;
-let mut res: Vec<u8> = vec![];
-let mut group_depth = 0;
+let mut group_stack = vec![];
+let add_byte = |out: &mut Vec<PreRegex>, b: u8| match out.last_mut() {
+Some(PreRegex::Bytes(v)) => {
+v.push(b);
+}
+_ => out.push(PreRegex::Bytes(vec![b])),
+};
 while let Some((c, rest)) = input.split_first() {
 input = rest;
 match c {
 b'*' => {
-for (source, repl) in GLOB_REPLACEMENTS {
+let wildcard = glob_parse_after_star(&mut input);
-if let Some(rest) = input.drop_prefix(source) {
+if res.is_empty() && start.is_none() {
-input = rest;
+start = Some(wildcard)
-res.extend(*repl);
+} else {
-break;
+res.push(wildcard.to_re())
-}
 }
 }
-b'?' => res.extend(b"."),
+b'?' => res.push(PreRegex::Dot),
 b'[' => {
 match input.iter().skip(1).position(|b| *b == b']') {
-None => res.extend(b"\\["),
+None => res.push(PreRegex::Byte(b'[')),
 Some(end) => {
 // Account for the one we skipped
 let end = end + 1;
-res.extend(b"[");
+// TODO: parse charsets ourselves?
+let mut class = vec![];
+class.extend(b"[");
 for (i, b) in input[..end].iter().enumerate() {
 if *b == b'!' && i == 0 {
-res.extend(b"^")
+class.extend(b"^")
 } else if *b == b'^' && i == 0 {
-res.extend(b"\\^")
+class.extend(b"\\^")
 } else if *b == b'\\' {
-res.extend(b"\\\\")
+class.extend(b"\\\\")
 } else {
-res.push(*b)
+class.push(*b)
 }
 }
-res.extend(b"]");
+class.extend(b"]");
+res.push(PreRegex::parse(&class)?);
 input = &input[end + 1..];
 }
 }
 }
 b'{' => {
-group_depth += 1;
+group_stack.push((mem::take(&mut res), vec![]));
-res.extend(b"(?:")
+}
-}
+b'}' if !group_stack.is_empty() => {
-b'}' if group_depth > 0 => {
+let hir = PreRegex::Sequence(mem::take(&mut res));
-group_depth -= 1;
+let (old_res, mut alt) = group_stack.pop().unwrap();
-res.extend(b")");
+alt.push(hir);
-}
+res = old_res;
-b',' if group_depth > 0 => res.extend(b"|"),
+res.push(PreRegex::Alternation(alt));
+}
+b',' if !group_stack.is_empty() => {
+let frame = group_stack.last_mut().unwrap();
+frame.1.push(PreRegex::Sequence(mem::take(&mut res)));
+}
 b'\\' => {
 let c = {
 if let Some((c, rest)) = input.split_first() {
 input = rest;
 c
 } else {
 c
 }
 };
-res.extend(&RE_ESCAPE[*c as usize])
+add_byte(&mut res, *c)
 }
-_ => res.extend(&RE_ESCAPE[*c as usize]),
+_ => add_byte(&mut res, *c),
 }
 }
-res
+if !group_stack.is_empty() {
-}
+return Err(PatternError::UnsupportedSyntax(
+"error: invalid glob, has unclosed alternation ('{')".to_string(),
-fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
+));
-pattern
+}
-.iter()
+Ok(GlobToRe { start, rest: res })
-.flat_map(|c| RE_ESCAPE[*c as usize].clone())
-.collect()
 }
 pub fn parse_pattern_syntax_kind(
 kind: &[u8],
 ) -> Result<PatternSyntax, PatternError> {
 /// so any path that has the pattern as a prefix, should match.
 MoreComponents,
 }
 impl GlobSuffix {
-fn to_re(self) -> &'static [u8] {
+pub fn to_re(self) -> PreRegex {
 match self {
-Self::Empty => b"$",
+Self::Empty => PreRegex::Eof,
-Self::MoreComponents => b"(?:/|$)",
+Self::MoreComponents => PreRegex::SlashOrEof,
 }
 }
 }
 /// Builds the regex that corresponds to the given pattern.
 /// If within a `syntax: regexp` context, returns the pattern,
 /// otherwise, returns the corresponding regex.
 fn _build_single_regex(
 entry: &IgnorePattern,
 glob_suffix: GlobSuffix,
-) -> Vec<u8> {
+) -> PatternResult<PreRegex> {
 let IgnorePattern {
 syntax, pattern, ..
 } = entry;
 if pattern.is_empty() {
-return vec![];
+return Ok(PreRegex::Empty);
 }
 match syntax {
-PatternSyntax::Regexp => pattern.to_owned(),
+PatternSyntax::Regexp => PreRegex::parse(pattern),
 PatternSyntax::RelRegexp => {
 // The `regex` crate accepts `**` while `re2` and Python's `re`
 // do not. Checking for `*` correctly triggers the same error all
 // engines.
 if pattern[0] == b'^'
 || pattern[0] == b'*'
 || pattern.starts_with(b".*")
 {
-return pattern.to_owned();
+return PreRegex::parse(pattern);
 }
-match FLAG_RE.find(pattern) {
+let re = match FLAG_RE.find(pattern) {
 Some(mat) => {
 let s = mat.start();
 let e = mat.end();
 [
 &b"(?"[..],
 &b")"[..],
 ]
 .concat()
 }
 None => [&b".*"[..], pattern].concat(),
-}
+};
+PreRegex::parse(&re)
 }
 PatternSyntax::Path | PatternSyntax::RelPath => {
 if pattern == b"." {
-return vec![];
+return Ok(PreRegex::Empty);
 }
-[
+Ok(PreRegex::Sequence(vec![
-escape_pattern(pattern).as_slice(),
+PreRegex::literal(pattern),
 GlobSuffix::MoreComponents.to_re(),
-]
+]))
-.concat()
 }
 PatternSyntax::RootFilesIn => {
-let mut res = if pattern == b"." {
+let re = if pattern == b"." {
-vec![]
+PreRegex::Empty
 } else {
 // Pattern is a directory name.
-[escape_pattern(pattern).as_slice(), b"/"].concat()
+let mut pattern = pattern.clone();
+pattern.push(b'/');
+PreRegex::Bytes(pattern)
 };
 // Anything after the pattern must be a non-directory.
-res.extend(b"[^/]+$");
+Ok(PreRegex::Sequence(vec![re, PreRegex::parse(b"[^/]+$")?]))
-res
 }
 PatternSyntax::RelGlob => {
-let glob_re = glob_to_re(pattern);
+let glob_re = glob_to_re(pattern)?;
-if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
+Ok(PreRegex::Sequence(vec![
-[b".*", rest, glob_suffix.to_re()].concat()
+glob_re.into_re(false),
-} else {
+glob_suffix.to_re(),
-[b"(?:.*/)?", glob_re.as_slice(), glob_suffix.to_re()].concat()
+]))
-}
 }
 PatternSyntax::Glob | PatternSyntax::RootGlob => {
-[glob_to_re(pattern).as_slice(), glob_suffix.to_re()].concat()
+let glob_re = glob_to_re(pattern)?;
+Ok(PreRegex::Sequence(vec![
+glob_re.into_re(true),
+glob_suffix.to_re(),
+]))
 }
 PatternSyntax::Include
 | PatternSyntax::SubInclude
 | PatternSyntax::ExpandedSubInclude(_)
 | PatternSyntax::FilePath => unreachable!(),
 /// that don't need to be transformed into a regex.
 pub fn build_single_regex(
 entry: &IgnorePattern,
 glob_suffix: GlobSuffix,
 regex_config: RegexCompleteness,
-) -> Result<Option<Vec<u8>>, PatternError> {
+) -> Result<Option<PreRegex>, PatternError> {
 let IgnorePattern {
 pattern, syntax, ..
 } = entry;
 let pattern = match syntax {
 PatternSyntax::RootGlob
 {
 Ok(None)
 } else {
 let mut entry = entry.clone();
 entry.pattern = pattern;
-Ok(Some(_build_single_regex(&entry, glob_suffix)))
+Ok(Some(_build_single_regex(&entry, glob_suffix)?))
 }
 }
 lazy_static! {
 static ref SYNTAXES: FastHashMap<&'static [u8], PatternSyntax> = {
 #[cfg(test)]
 mod tests {
 use super::*;
 use pretty_assertions::assert_eq;
+fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
+pattern
+.iter()
+.flat_map(|c| RE_ESCAPE[*c as usize].clone())
+.collect()
+}
 #[test]
 fn escape_pattern_test() {
 let untouched =
 br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
 assert_eq!(escape_pattern(untouched), untouched.to_vec());
 // All escape codes
 assert_eq!(
 escape_pattern(br"()[]{}?*+-|^$\\.&~#\t\n\r\v\f"),
 br"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\\t\\n\\r\\v\\f".to_vec()
 );
+}
+fn glob_to_re(pat: &[u8]) -> Vec<u8> {
+super::glob_to_re(pat).unwrap().into_re(true).to_bytes()
 }
 #[test]
 fn glob_test() {
 assert_eq!(glob_to_re(br"?"), br".");
 super::build_single_regex(
 entry,
 glob_suffix,
 RegexCompleteness::ExcludeExactFiles,
 )
+.map(|re_opt| re_opt.map(|re| re.to_bytes()))
 }
 #[test]
 fn test_build_single_regex() {
 assert_eq!(
 );
 assert_eq!(
 build_single_regex(
 &IgnorePattern::new(
 PatternSyntax::RelRegexp,
-b"(?ia)ba{2}r",
+b"(?i)ba{2}r",
 Path::new("")
 ),
 GlobSuffix::MoreComponents
 )
 .unwrap(),
-Some(b"(?ia:.*ba{2}r)".to_vec()),
+Some(b"(?i:.*ba{2}r)".to_vec()),
 );
 assert_eq!(
 build_single_regex(
 &IgnorePattern::new(
 PatternSyntax::RelRegexp,
-b"(?ia)^ba{2}r",
+b"(?i)^ba{2}r",
 Path::new("")
 ),
 GlobSuffix::MoreComponents
 )
 .unwrap(),
-Some(b"(?ia:^ba{2}r)".to_vec()),
+Some(b"(?i:^ba{2}r)".to_vec()),
 );
 }
 }

Mercurial > public > mercurial-scm > hg

comparison rust/hg-core/src/filepatterns.rs @ 52556:1866119cbad7