Mercurial > public > mercurial-scm > hg
diff rust/hg-core/src/filepatterns.rs @ 52556:1866119cbad7
rust-ignore: construct regex Hir object directly, avoiding large regex string
Rework how we convert patterns to regexes in rust.
Instead of going patterns -> string -> Regex, which is slow and causes
some correctness issues, build a structured regex_syntax::hir::Hir value,
which is faster and it also prevents surprising regex escape.
This change makes the time of `build_regex_match` go from ~70-80ms
to ~40ms in my testing (for a large hgignore).
The bug I mentioned involves regex patterns that "escape" their
intended scope. For example, a sequence of hgignore regexp patterns like
this would previously lead to surprising behavior:
foo(?:
bar
baz
)
this matches foobar and foobaz, and doesn't match bar and baz.
The new behavior is to report a pattern parse error
The Python hg also has this bug, so this bugfix
not really helping much, but it's probably better to
fall back to real Python bugs than to simulate them.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Fri, 06 Dec 2024 20:27:59 +0000 |
parents | e2e49069eeb6 |
children | 94e2547e6f3d |
line wrap: on
line diff
--- a/rust/hg-core/src/filepatterns.rs Sun Dec 22 08:17:53 2024 -0300 +++ b/rust/hg-core/src/filepatterns.rs Fri Dec 06 20:27:59 2024 +0000 @@ -8,6 +8,7 @@ //! Handling of Mercurial-specific patterns. use crate::{ + pre_regex::PreRegex, utils::{ files::{canonical_path, get_bytes_from_path, get_path_from_bytes}, hg_path::{path_to_hg_path_buf, HgPathBuf, HgPathError}, @@ -17,6 +18,7 @@ }; use lazy_static::lazy_static; use regex::bytes::{NoExpand, Regex}; +use std::mem; use std::path::{Path, PathBuf}; use std::vec::Vec; use std::{fmt, ops::Deref}; @@ -71,10 +73,6 @@ }; } -/// These are matched in order -const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = - &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")]; - #[derive(Debug, Clone, PartialEq, Eq)] pub enum PatternSyntax { /// A regular expression @@ -111,60 +109,151 @@ ExpandedSubInclude(Box<SubInclude>), } -/// Transforms a glob pattern into a regex -pub fn glob_to_re(pat: &[u8]) -> Vec<u8> { +/// A wildcard parsed from a glob +#[derive(Debug, Clone, Copy)] +enum GlobWildcard { + /// `**/` matches any sequence of characters ending at a path + /// component boundary + AnyComponents, + /// `*`: matches any sequence of characters within one path component + AnyNonSlash, + /// `**`: matches any sequence of characters including slashes + Anything, +} +impl GlobWildcard { + /// Optimization to simplify the regex prefixes for unrooted globs. + /// It's unclear if this is worth it for performance, but it also has + /// some cosmetic effect by making these regexes easier to understand. + fn make_unrooted(wildcard: Option<GlobWildcard>) -> GlobWildcard { + match wildcard { + None => Self::AnyComponents, + Some(Self::AnyComponents) => Self::AnyComponents, + Some(Self::AnyNonSlash) => Self::Anything, + Some(Self::Anything) => Self::Anything, + } + } + + fn to_re(self) -> PreRegex { + match self { + Self::AnyComponents => PreRegex::preceding_dir_components(), + Self::AnyNonSlash => PreRegex::NonslashStar, + Self::Anything => PreRegex::DotStar, + } + } +} + +fn glob_parse_after_star(input: &mut &[u8]) -> GlobWildcard { + if let Some((b'*', rest)) = input.split_first() { + if let Some((b'/', rest)) = rest.split_first() { + *input = rest; + GlobWildcard::AnyComponents + } else { + *input = rest; + GlobWildcard::Anything + } + } else { + GlobWildcard::AnyNonSlash + } +} + +/// The result of glob to re conversion. +/// The start of the regular expression `start` is tracked +/// separately for a pattern simplification opportunity +/// (see `GlobWildcard::make_unrooted`) +pub struct GlobToRe { + start: Option<GlobWildcard>, + rest: Vec<PreRegex>, +} + +impl GlobToRe { + /// Convert to a regex. `rooted` specifies if the glob should match + /// at the root of the repo (true), or anywhere in the repo (false) + fn into_re(self, rooted: bool) -> PreRegex { + let wildcard = if !rooted { + Some(GlobWildcard::make_unrooted(self.start)) + } else { + self.start + }; + + let mut res: Vec<_> = + wildcard.into_iter().map(|x| x.to_re()).collect(); + res.extend(self.rest); + PreRegex::Sequence(res) + } +} + +/// Transforms a glob pattern into a regex. +pub fn glob_to_re(pat: &[u8]) -> PatternResult<GlobToRe> { + let mut start = None; + let mut res: Vec<PreRegex> = vec![]; let mut input = pat; - let mut res: Vec<u8> = vec![]; - let mut group_depth = 0; + + let mut group_stack = vec![]; + + let add_byte = |out: &mut Vec<PreRegex>, b: u8| match out.last_mut() { + Some(PreRegex::Bytes(v)) => { + v.push(b); + } + _ => out.push(PreRegex::Bytes(vec![b])), + }; while let Some((c, rest)) = input.split_first() { input = rest; match c { b'*' => { - for (source, repl) in GLOB_REPLACEMENTS { - if let Some(rest) = input.drop_prefix(source) { - input = rest; - res.extend(*repl); - break; - } + let wildcard = glob_parse_after_star(&mut input); + if res.is_empty() && start.is_none() { + start = Some(wildcard) + } else { + res.push(wildcard.to_re()) } } - b'?' => res.extend(b"."), + b'?' => res.push(PreRegex::Dot), b'[' => { match input.iter().skip(1).position(|b| *b == b']') { - None => res.extend(b"\\["), + None => res.push(PreRegex::Byte(b'[')), Some(end) => { // Account for the one we skipped let end = end + 1; - res.extend(b"["); + // TODO: parse charsets ourselves? + let mut class = vec![]; + class.extend(b"["); for (i, b) in input[..end].iter().enumerate() { if *b == b'!' && i == 0 { - res.extend(b"^") + class.extend(b"^") } else if *b == b'^' && i == 0 { - res.extend(b"\\^") + class.extend(b"\\^") } else if *b == b'\\' { - res.extend(b"\\\\") + class.extend(b"\\\\") } else { - res.push(*b) + class.push(*b) } } - res.extend(b"]"); + class.extend(b"]"); + + res.push(PreRegex::parse(&class)?); + input = &input[end + 1..]; } } } b'{' => { - group_depth += 1; - res.extend(b"(?:") + group_stack.push((mem::take(&mut res), vec![])); } - b'}' if group_depth > 0 => { - group_depth -= 1; - res.extend(b")"); + b'}' if !group_stack.is_empty() => { + let hir = PreRegex::Sequence(mem::take(&mut res)); + let (old_res, mut alt) = group_stack.pop().unwrap(); + alt.push(hir); + res = old_res; + res.push(PreRegex::Alternation(alt)); } - b',' if group_depth > 0 => res.extend(b"|"), + b',' if !group_stack.is_empty() => { + let frame = group_stack.last_mut().unwrap(); + frame.1.push(PreRegex::Sequence(mem::take(&mut res))); + } b'\\' => { let c = { if let Some((c, rest)) = input.split_first() { @@ -174,19 +263,17 @@ c } }; - res.extend(&RE_ESCAPE[*c as usize]) + add_byte(&mut res, *c) } - _ => res.extend(&RE_ESCAPE[*c as usize]), + _ => add_byte(&mut res, *c), } } - res -} - -fn escape_pattern(pattern: &[u8]) -> Vec<u8> { - pattern - .iter() - .flat_map(|c| RE_ESCAPE[*c as usize].clone()) - .collect() + if !group_stack.is_empty() { + return Err(PatternError::UnsupportedSyntax( + "error: invalid glob, has unclosed alternation ('{')".to_string(), + )); + } + Ok(GlobToRe { start, rest: res }) } pub fn parse_pattern_syntax_kind( @@ -226,10 +313,10 @@ } impl GlobSuffix { - fn to_re(self) -> &'static [u8] { + pub fn to_re(self) -> PreRegex { match self { - Self::Empty => b"$", - Self::MoreComponents => b"(?:/|$)", + Self::Empty => PreRegex::Eof, + Self::MoreComponents => PreRegex::SlashOrEof, } } } @@ -240,15 +327,15 @@ fn _build_single_regex( entry: &IgnorePattern, glob_suffix: GlobSuffix, -) -> Vec<u8> { +) -> PatternResult<PreRegex> { let IgnorePattern { syntax, pattern, .. } = entry; if pattern.is_empty() { - return vec![]; + return Ok(PreRegex::Empty); } match syntax { - PatternSyntax::Regexp => pattern.to_owned(), + PatternSyntax::Regexp => PreRegex::parse(pattern), PatternSyntax::RelRegexp => { // The `regex` crate accepts `**` while `re2` and Python's `re` // do not. Checking for `*` correctly triggers the same error all @@ -257,9 +344,9 @@ || pattern[0] == b'*' || pattern.starts_with(b".*") { - return pattern.to_owned(); + return PreRegex::parse(pattern); } - match FLAG_RE.find(pattern) { + let re = match FLAG_RE.find(pattern) { Some(mat) => { let s = mat.start(); let e = mat.end(); @@ -281,40 +368,44 @@ .concat() } None => [&b".*"[..], pattern].concat(), - } + }; + PreRegex::parse(&re) } PatternSyntax::Path | PatternSyntax::RelPath => { if pattern == b"." { - return vec![]; + return Ok(PreRegex::Empty); } - [ - escape_pattern(pattern).as_slice(), + Ok(PreRegex::Sequence(vec![ + PreRegex::literal(pattern), GlobSuffix::MoreComponents.to_re(), - ] - .concat() + ])) } PatternSyntax::RootFilesIn => { - let mut res = if pattern == b"." { - vec![] + let re = if pattern == b"." { + PreRegex::Empty } else { // Pattern is a directory name. - [escape_pattern(pattern).as_slice(), b"/"].concat() + let mut pattern = pattern.clone(); + pattern.push(b'/'); + PreRegex::Bytes(pattern) }; // Anything after the pattern must be a non-directory. - res.extend(b"[^/]+$"); - res + Ok(PreRegex::Sequence(vec![re, PreRegex::parse(b"[^/]+$")?])) } PatternSyntax::RelGlob => { - let glob_re = glob_to_re(pattern); - if let Some(rest) = glob_re.drop_prefix(b"[^/]*") { - [b".*", rest, glob_suffix.to_re()].concat() - } else { - [b"(?:.*/)?", glob_re.as_slice(), glob_suffix.to_re()].concat() - } + let glob_re = glob_to_re(pattern)?; + Ok(PreRegex::Sequence(vec![ + glob_re.into_re(false), + glob_suffix.to_re(), + ])) } PatternSyntax::Glob | PatternSyntax::RootGlob => { - [glob_to_re(pattern).as_slice(), glob_suffix.to_re()].concat() + let glob_re = glob_to_re(pattern)?; + Ok(PreRegex::Sequence(vec![ + glob_re.into_re(true), + glob_suffix.to_re(), + ])) } PatternSyntax::Include | PatternSyntax::SubInclude @@ -397,7 +488,7 @@ entry: &IgnorePattern, glob_suffix: GlobSuffix, regex_config: RegexCompleteness, -) -> Result<Option<Vec<u8>>, PatternError> { +) -> Result<Option<PreRegex>, PatternError> { let IgnorePattern { pattern, syntax, .. } = entry; @@ -421,7 +512,7 @@ } else { let mut entry = entry.clone(); entry.pattern = pattern; - Ok(Some(_build_single_regex(&entry, glob_suffix))) + Ok(Some(_build_single_regex(&entry, glob_suffix)?)) } } @@ -766,6 +857,13 @@ use super::*; use pretty_assertions::assert_eq; + fn escape_pattern(pattern: &[u8]) -> Vec<u8> { + pattern + .iter() + .flat_map(|c| RE_ESCAPE[*c as usize].clone()) + .collect() + } + #[test] fn escape_pattern_test() { let untouched = @@ -778,6 +876,10 @@ ); } + fn glob_to_re(pat: &[u8]) -> Vec<u8> { + super::glob_to_re(pat).unwrap().into_re(true).to_bytes() + } + #[test] fn glob_test() { assert_eq!(glob_to_re(br"?"), br"."); @@ -853,6 +955,7 @@ glob_suffix, RegexCompleteness::ExcludeExactFiles, ) + .map(|re_opt| re_opt.map(|re| re.to_bytes())) } #[test] @@ -953,25 +1056,25 @@ build_single_regex( &IgnorePattern::new( PatternSyntax::RelRegexp, - b"(?ia)ba{2}r", + b"(?i)ba{2}r", Path::new("") ), GlobSuffix::MoreComponents ) .unwrap(), - Some(b"(?ia:.*ba{2}r)".to_vec()), + Some(b"(?i:.*ba{2}r)".to_vec()), ); assert_eq!( build_single_regex( &IgnorePattern::new( PatternSyntax::RelRegexp, - b"(?ia)^ba{2}r", + b"(?i)^ba{2}r", Path::new("") ), GlobSuffix::MoreComponents ) .unwrap(), - Some(b"(?ia:^ba{2}r)".to_vec()), + Some(b"(?i:^ba{2}r)".to_vec()), ); } }