comparison rust/hg-core/src/filepatterns.rs @ 52556:1866119cbad7

rust-ignore: construct regex Hir object directly, avoiding large regex string Rework how we convert patterns to regexes in rust. Instead of going patterns -> string -> Regex, which is slow and causes some correctness issues, build a structured regex_syntax::hir::Hir value, which is faster and it also prevents surprising regex escape. This change makes the time of `build_regex_match` go from ~70-80ms to ~40ms in my testing (for a large hgignore). The bug I mentioned involves regex patterns that "escape" their intended scope. For example, a sequence of hgignore regexp patterns like this would previously lead to surprising behavior: foo(?: bar baz ) this matches foobar and foobaz, and doesn't match bar and baz. The new behavior is to report a pattern parse error The Python hg also has this bug, so this bugfix not really helping much, but it's probably better to fall back to real Python bugs than to simulate them.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Fri, 06 Dec 2024 20:27:59 +0000
parents e2e49069eeb6
children 94e2547e6f3d
comparison
equal deleted inserted replaced
52555:85c095c1f8bc 52556:1866119cbad7
6 // GNU General Public License version 2 or any later version. 6 // GNU General Public License version 2 or any later version.
7 7
8 //! Handling of Mercurial-specific patterns. 8 //! Handling of Mercurial-specific patterns.
9 9
10 use crate::{ 10 use crate::{
11 pre_regex::PreRegex,
11 utils::{ 12 utils::{
12 files::{canonical_path, get_bytes_from_path, get_path_from_bytes}, 13 files::{canonical_path, get_bytes_from_path, get_path_from_bytes},
13 hg_path::{path_to_hg_path_buf, HgPathBuf, HgPathError}, 14 hg_path::{path_to_hg_path_buf, HgPathBuf, HgPathError},
14 SliceExt, 15 SliceExt,
15 }, 16 },
16 FastHashMap, 17 FastHashMap,
17 }; 18 };
18 use lazy_static::lazy_static; 19 use lazy_static::lazy_static;
19 use regex::bytes::{NoExpand, Regex}; 20 use regex::bytes::{NoExpand, Regex};
21 use std::mem;
20 use std::path::{Path, PathBuf}; 22 use std::path::{Path, PathBuf};
21 use std::vec::Vec; 23 use std::vec::Vec;
22 use std::{fmt, ops::Deref}; 24 use std::{fmt, ops::Deref};
23 25
24 #[derive(Debug, derive_more::From)] 26 #[derive(Debug, derive_more::From)]
68 v[*byte as usize].insert(0, b'\\'); 70 v[*byte as usize].insert(0, b'\\');
69 } 71 }
70 v 72 v
71 }; 73 };
72 } 74 }
73
74 /// These are matched in order
75 const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
76 &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
77 75
78 #[derive(Debug, Clone, PartialEq, Eq)] 76 #[derive(Debug, Clone, PartialEq, Eq)]
79 pub enum PatternSyntax { 77 pub enum PatternSyntax {
80 /// A regular expression 78 /// A regular expression
81 Regexp, 79 Regexp,
109 /// 107 ///
110 /// Note: `Box` is used to minimize size impact on other enum variants 108 /// Note: `Box` is used to minimize size impact on other enum variants
111 ExpandedSubInclude(Box<SubInclude>), 109 ExpandedSubInclude(Box<SubInclude>),
112 } 110 }
113 111
114 /// Transforms a glob pattern into a regex 112 /// A wildcard parsed from a glob
115 pub fn glob_to_re(pat: &[u8]) -> Vec<u8> { 113 #[derive(Debug, Clone, Copy)]
114 enum GlobWildcard {
115 /// `**/` matches any sequence of characters ending at a path
116 /// component boundary
117 AnyComponents,
118 /// `*`: matches any sequence of characters within one path component
119 AnyNonSlash,
120 /// `**`: matches any sequence of characters including slashes
121 Anything,
122 }
123 impl GlobWildcard {
124 /// Optimization to simplify the regex prefixes for unrooted globs.
125 /// It's unclear if this is worth it for performance, but it also has
126 /// some cosmetic effect by making these regexes easier to understand.
127 fn make_unrooted(wildcard: Option<GlobWildcard>) -> GlobWildcard {
128 match wildcard {
129 None => Self::AnyComponents,
130 Some(Self::AnyComponents) => Self::AnyComponents,
131 Some(Self::AnyNonSlash) => Self::Anything,
132 Some(Self::Anything) => Self::Anything,
133 }
134 }
135
136 fn to_re(self) -> PreRegex {
137 match self {
138 Self::AnyComponents => PreRegex::preceding_dir_components(),
139 Self::AnyNonSlash => PreRegex::NonslashStar,
140 Self::Anything => PreRegex::DotStar,
141 }
142 }
143 }
144
145 fn glob_parse_after_star(input: &mut &[u8]) -> GlobWildcard {
146 if let Some((b'*', rest)) = input.split_first() {
147 if let Some((b'/', rest)) = rest.split_first() {
148 *input = rest;
149 GlobWildcard::AnyComponents
150 } else {
151 *input = rest;
152 GlobWildcard::Anything
153 }
154 } else {
155 GlobWildcard::AnyNonSlash
156 }
157 }
158
159 /// The result of glob to re conversion.
160 /// The start of the regular expression `start` is tracked
161 /// separately for a pattern simplification opportunity
162 /// (see `GlobWildcard::make_unrooted`)
163 pub struct GlobToRe {
164 start: Option<GlobWildcard>,
165 rest: Vec<PreRegex>,
166 }
167
168 impl GlobToRe {
169 /// Convert to a regex. `rooted` specifies if the glob should match
170 /// at the root of the repo (true), or anywhere in the repo (false)
171 fn into_re(self, rooted: bool) -> PreRegex {
172 let wildcard = if !rooted {
173 Some(GlobWildcard::make_unrooted(self.start))
174 } else {
175 self.start
176 };
177
178 let mut res: Vec<_> =
179 wildcard.into_iter().map(|x| x.to_re()).collect();
180 res.extend(self.rest);
181 PreRegex::Sequence(res)
182 }
183 }
184
185 /// Transforms a glob pattern into a regex.
186 pub fn glob_to_re(pat: &[u8]) -> PatternResult<GlobToRe> {
187 let mut start = None;
188 let mut res: Vec<PreRegex> = vec![];
116 let mut input = pat; 189 let mut input = pat;
117 let mut res: Vec<u8> = vec![]; 190
118 let mut group_depth = 0; 191 let mut group_stack = vec![];
192
193 let add_byte = |out: &mut Vec<PreRegex>, b: u8| match out.last_mut() {
194 Some(PreRegex::Bytes(v)) => {
195 v.push(b);
196 }
197 _ => out.push(PreRegex::Bytes(vec![b])),
198 };
119 199
120 while let Some((c, rest)) = input.split_first() { 200 while let Some((c, rest)) = input.split_first() {
121 input = rest; 201 input = rest;
122 202
123 match c { 203 match c {
124 b'*' => { 204 b'*' => {
125 for (source, repl) in GLOB_REPLACEMENTS { 205 let wildcard = glob_parse_after_star(&mut input);
126 if let Some(rest) = input.drop_prefix(source) { 206 if res.is_empty() && start.is_none() {
127 input = rest; 207 start = Some(wildcard)
128 res.extend(*repl); 208 } else {
129 break; 209 res.push(wildcard.to_re())
130 }
131 } 210 }
132 } 211 }
133 b'?' => res.extend(b"."), 212 b'?' => res.push(PreRegex::Dot),
134 b'[' => { 213 b'[' => {
135 match input.iter().skip(1).position(|b| *b == b']') { 214 match input.iter().skip(1).position(|b| *b == b']') {
136 None => res.extend(b"\\["), 215 None => res.push(PreRegex::Byte(b'[')),
137 Some(end) => { 216 Some(end) => {
138 // Account for the one we skipped 217 // Account for the one we skipped
139 let end = end + 1; 218 let end = end + 1;
140 219
141 res.extend(b"["); 220 // TODO: parse charsets ourselves?
221 let mut class = vec![];
222 class.extend(b"[");
142 223
143 for (i, b) in input[..end].iter().enumerate() { 224 for (i, b) in input[..end].iter().enumerate() {
144 if *b == b'!' && i == 0 { 225 if *b == b'!' && i == 0 {
145 res.extend(b"^") 226 class.extend(b"^")
146 } else if *b == b'^' && i == 0 { 227 } else if *b == b'^' && i == 0 {
147 res.extend(b"\\^") 228 class.extend(b"\\^")
148 } else if *b == b'\\' { 229 } else if *b == b'\\' {
149 res.extend(b"\\\\") 230 class.extend(b"\\\\")
150 } else { 231 } else {
151 res.push(*b) 232 class.push(*b)
152 } 233 }
153 } 234 }
154 res.extend(b"]"); 235 class.extend(b"]");
236
237 res.push(PreRegex::parse(&class)?);
238
155 input = &input[end + 1..]; 239 input = &input[end + 1..];
156 } 240 }
157 } 241 }
158 } 242 }
159 b'{' => { 243 b'{' => {
160 group_depth += 1; 244 group_stack.push((mem::take(&mut res), vec![]));
161 res.extend(b"(?:") 245 }
162 } 246 b'}' if !group_stack.is_empty() => {
163 b'}' if group_depth > 0 => { 247 let hir = PreRegex::Sequence(mem::take(&mut res));
164 group_depth -= 1; 248 let (old_res, mut alt) = group_stack.pop().unwrap();
165 res.extend(b")"); 249 alt.push(hir);
166 } 250 res = old_res;
167 b',' if group_depth > 0 => res.extend(b"|"), 251 res.push(PreRegex::Alternation(alt));
252 }
253 b',' if !group_stack.is_empty() => {
254 let frame = group_stack.last_mut().unwrap();
255 frame.1.push(PreRegex::Sequence(mem::take(&mut res)));
256 }
168 b'\\' => { 257 b'\\' => {
169 let c = { 258 let c = {
170 if let Some((c, rest)) = input.split_first() { 259 if let Some((c, rest)) = input.split_first() {
171 input = rest; 260 input = rest;
172 c 261 c
173 } else { 262 } else {
174 c 263 c
175 } 264 }
176 }; 265 };
177 res.extend(&RE_ESCAPE[*c as usize]) 266 add_byte(&mut res, *c)
178 } 267 }
179 _ => res.extend(&RE_ESCAPE[*c as usize]), 268 _ => add_byte(&mut res, *c),
180 } 269 }
181 } 270 }
182 res 271 if !group_stack.is_empty() {
183 } 272 return Err(PatternError::UnsupportedSyntax(
184 273 "error: invalid glob, has unclosed alternation ('{')".to_string(),
185 fn escape_pattern(pattern: &[u8]) -> Vec<u8> { 274 ));
186 pattern 275 }
187 .iter() 276 Ok(GlobToRe { start, rest: res })
188 .flat_map(|c| RE_ESCAPE[*c as usize].clone())
189 .collect()
190 } 277 }
191 278
192 pub fn parse_pattern_syntax_kind( 279 pub fn parse_pattern_syntax_kind(
193 kind: &[u8], 280 kind: &[u8],
194 ) -> Result<PatternSyntax, PatternError> { 281 ) -> Result<PatternSyntax, PatternError> {
224 /// so any path that has the pattern as a prefix, should match. 311 /// so any path that has the pattern as a prefix, should match.
225 MoreComponents, 312 MoreComponents,
226 } 313 }
227 314
228 impl GlobSuffix { 315 impl GlobSuffix {
229 fn to_re(self) -> &'static [u8] { 316 pub fn to_re(self) -> PreRegex {
230 match self { 317 match self {
231 Self::Empty => b"$", 318 Self::Empty => PreRegex::Eof,
232 Self::MoreComponents => b"(?:/|$)", 319 Self::MoreComponents => PreRegex::SlashOrEof,
233 } 320 }
234 } 321 }
235 } 322 }
236 323
237 /// Builds the regex that corresponds to the given pattern. 324 /// Builds the regex that corresponds to the given pattern.
238 /// If within a `syntax: regexp` context, returns the pattern, 325 /// If within a `syntax: regexp` context, returns the pattern,
239 /// otherwise, returns the corresponding regex. 326 /// otherwise, returns the corresponding regex.
240 fn _build_single_regex( 327 fn _build_single_regex(
241 entry: &IgnorePattern, 328 entry: &IgnorePattern,
242 glob_suffix: GlobSuffix, 329 glob_suffix: GlobSuffix,
243 ) -> Vec<u8> { 330 ) -> PatternResult<PreRegex> {
244 let IgnorePattern { 331 let IgnorePattern {
245 syntax, pattern, .. 332 syntax, pattern, ..
246 } = entry; 333 } = entry;
247 if pattern.is_empty() { 334 if pattern.is_empty() {
248 return vec![]; 335 return Ok(PreRegex::Empty);
249 } 336 }
250 match syntax { 337 match syntax {
251 PatternSyntax::Regexp => pattern.to_owned(), 338 PatternSyntax::Regexp => PreRegex::parse(pattern),
252 PatternSyntax::RelRegexp => { 339 PatternSyntax::RelRegexp => {
253 // The `regex` crate accepts `**` while `re2` and Python's `re` 340 // The `regex` crate accepts `**` while `re2` and Python's `re`
254 // do not. Checking for `*` correctly triggers the same error all 341 // do not. Checking for `*` correctly triggers the same error all
255 // engines. 342 // engines.
256 if pattern[0] == b'^' 343 if pattern[0] == b'^'
257 || pattern[0] == b'*' 344 || pattern[0] == b'*'
258 || pattern.starts_with(b".*") 345 || pattern.starts_with(b".*")
259 { 346 {
260 return pattern.to_owned(); 347 return PreRegex::parse(pattern);
261 } 348 }
262 match FLAG_RE.find(pattern) { 349 let re = match FLAG_RE.find(pattern) {
263 Some(mat) => { 350 Some(mat) => {
264 let s = mat.start(); 351 let s = mat.start();
265 let e = mat.end(); 352 let e = mat.end();
266 [ 353 [
267 &b"(?"[..], 354 &b"(?"[..],
279 &b")"[..], 366 &b")"[..],
280 ] 367 ]
281 .concat() 368 .concat()
282 } 369 }
283 None => [&b".*"[..], pattern].concat(), 370 None => [&b".*"[..], pattern].concat(),
284 } 371 };
372 PreRegex::parse(&re)
285 } 373 }
286 PatternSyntax::Path | PatternSyntax::RelPath => { 374 PatternSyntax::Path | PatternSyntax::RelPath => {
287 if pattern == b"." { 375 if pattern == b"." {
288 return vec![]; 376 return Ok(PreRegex::Empty);
289 } 377 }
290 [ 378 Ok(PreRegex::Sequence(vec![
291 escape_pattern(pattern).as_slice(), 379 PreRegex::literal(pattern),
292 GlobSuffix::MoreComponents.to_re(), 380 GlobSuffix::MoreComponents.to_re(),
293 ] 381 ]))
294 .concat()
295 } 382 }
296 PatternSyntax::RootFilesIn => { 383 PatternSyntax::RootFilesIn => {
297 let mut res = if pattern == b"." { 384 let re = if pattern == b"." {
298 vec![] 385 PreRegex::Empty
299 } else { 386 } else {
300 // Pattern is a directory name. 387 // Pattern is a directory name.
301 [escape_pattern(pattern).as_slice(), b"/"].concat() 388 let mut pattern = pattern.clone();
389 pattern.push(b'/');
390 PreRegex::Bytes(pattern)
302 }; 391 };
303 392
304 // Anything after the pattern must be a non-directory. 393 // Anything after the pattern must be a non-directory.
305 res.extend(b"[^/]+$"); 394 Ok(PreRegex::Sequence(vec![re, PreRegex::parse(b"[^/]+$")?]))
306 res
307 } 395 }
308 PatternSyntax::RelGlob => { 396 PatternSyntax::RelGlob => {
309 let glob_re = glob_to_re(pattern); 397 let glob_re = glob_to_re(pattern)?;
310 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") { 398 Ok(PreRegex::Sequence(vec![
311 [b".*", rest, glob_suffix.to_re()].concat() 399 glob_re.into_re(false),
312 } else { 400 glob_suffix.to_re(),
313 [b"(?:.*/)?", glob_re.as_slice(), glob_suffix.to_re()].concat() 401 ]))
314 }
315 } 402 }
316 PatternSyntax::Glob | PatternSyntax::RootGlob => { 403 PatternSyntax::Glob | PatternSyntax::RootGlob => {
317 [glob_to_re(pattern).as_slice(), glob_suffix.to_re()].concat() 404 let glob_re = glob_to_re(pattern)?;
405 Ok(PreRegex::Sequence(vec![
406 glob_re.into_re(true),
407 glob_suffix.to_re(),
408 ]))
318 } 409 }
319 PatternSyntax::Include 410 PatternSyntax::Include
320 | PatternSyntax::SubInclude 411 | PatternSyntax::SubInclude
321 | PatternSyntax::ExpandedSubInclude(_) 412 | PatternSyntax::ExpandedSubInclude(_)
322 | PatternSyntax::FilePath => unreachable!(), 413 | PatternSyntax::FilePath => unreachable!(),
395 /// that don't need to be transformed into a regex. 486 /// that don't need to be transformed into a regex.
396 pub fn build_single_regex( 487 pub fn build_single_regex(
397 entry: &IgnorePattern, 488 entry: &IgnorePattern,
398 glob_suffix: GlobSuffix, 489 glob_suffix: GlobSuffix,
399 regex_config: RegexCompleteness, 490 regex_config: RegexCompleteness,
400 ) -> Result<Option<Vec<u8>>, PatternError> { 491 ) -> Result<Option<PreRegex>, PatternError> {
401 let IgnorePattern { 492 let IgnorePattern {
402 pattern, syntax, .. 493 pattern, syntax, ..
403 } = entry; 494 } = entry;
404 let pattern = match syntax { 495 let pattern = match syntax {
405 PatternSyntax::RootGlob 496 PatternSyntax::RootGlob
419 { 510 {
420 Ok(None) 511 Ok(None)
421 } else { 512 } else {
422 let mut entry = entry.clone(); 513 let mut entry = entry.clone();
423 entry.pattern = pattern; 514 entry.pattern = pattern;
424 Ok(Some(_build_single_regex(&entry, glob_suffix))) 515 Ok(Some(_build_single_regex(&entry, glob_suffix)?))
425 } 516 }
426 } 517 }
427 518
428 lazy_static! { 519 lazy_static! {
429 static ref SYNTAXES: FastHashMap<&'static [u8], PatternSyntax> = { 520 static ref SYNTAXES: FastHashMap<&'static [u8], PatternSyntax> = {
764 #[cfg(test)] 855 #[cfg(test)]
765 mod tests { 856 mod tests {
766 use super::*; 857 use super::*;
767 use pretty_assertions::assert_eq; 858 use pretty_assertions::assert_eq;
768 859
860 fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
861 pattern
862 .iter()
863 .flat_map(|c| RE_ESCAPE[*c as usize].clone())
864 .collect()
865 }
866
769 #[test] 867 #[test]
770 fn escape_pattern_test() { 868 fn escape_pattern_test() {
771 let untouched = 869 let untouched =
772 br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#; 870 br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
773 assert_eq!(escape_pattern(untouched), untouched.to_vec()); 871 assert_eq!(escape_pattern(untouched), untouched.to_vec());
774 // All escape codes 872 // All escape codes
775 assert_eq!( 873 assert_eq!(
776 escape_pattern(br"()[]{}?*+-|^$\\.&~#\t\n\r\v\f"), 874 escape_pattern(br"()[]{}?*+-|^$\\.&~#\t\n\r\v\f"),
777 br"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\\t\\n\\r\\v\\f".to_vec() 875 br"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\\t\\n\\r\\v\\f".to_vec()
778 ); 876 );
877 }
878
879 fn glob_to_re(pat: &[u8]) -> Vec<u8> {
880 super::glob_to_re(pat).unwrap().into_re(true).to_bytes()
779 } 881 }
780 882
781 #[test] 883 #[test]
782 fn glob_test() { 884 fn glob_test() {
783 assert_eq!(glob_to_re(br"?"), br"."); 885 assert_eq!(glob_to_re(br"?"), br".");
851 super::build_single_regex( 953 super::build_single_regex(
852 entry, 954 entry,
853 glob_suffix, 955 glob_suffix,
854 RegexCompleteness::ExcludeExactFiles, 956 RegexCompleteness::ExcludeExactFiles,
855 ) 957 )
958 .map(|re_opt| re_opt.map(|re| re.to_bytes()))
856 } 959 }
857 960
858 #[test] 961 #[test]
859 fn test_build_single_regex() { 962 fn test_build_single_regex() {
860 assert_eq!( 963 assert_eq!(
951 ); 1054 );
952 assert_eq!( 1055 assert_eq!(
953 build_single_regex( 1056 build_single_regex(
954 &IgnorePattern::new( 1057 &IgnorePattern::new(
955 PatternSyntax::RelRegexp, 1058 PatternSyntax::RelRegexp,
956 b"(?ia)ba{2}r", 1059 b"(?i)ba{2}r",
957 Path::new("") 1060 Path::new("")
958 ), 1061 ),
959 GlobSuffix::MoreComponents 1062 GlobSuffix::MoreComponents
960 ) 1063 )
961 .unwrap(), 1064 .unwrap(),
962 Some(b"(?ia:.*ba{2}r)".to_vec()), 1065 Some(b"(?i:.*ba{2}r)".to_vec()),
963 ); 1066 );
964 assert_eq!( 1067 assert_eq!(
965 build_single_regex( 1068 build_single_regex(
966 &IgnorePattern::new( 1069 &IgnorePattern::new(
967 PatternSyntax::RelRegexp, 1070 PatternSyntax::RelRegexp,
968 b"(?ia)^ba{2}r", 1071 b"(?i)^ba{2}r",
969 Path::new("") 1072 Path::new("")
970 ), 1073 ),
971 GlobSuffix::MoreComponents 1074 GlobSuffix::MoreComponents
972 ) 1075 )
973 .unwrap(), 1076 .unwrap(),
974 Some(b"(?ia:^ba{2}r)".to_vec()), 1077 Some(b"(?i:^ba{2}r)".to_vec()),
975 ); 1078 );
976 } 1079 }
977 } 1080 }