Mercurial > public > mercurial-scm > hg
diff rust/hg-core/src/filepatterns.rs @ 42327:e8f3740cc067
rust-filepatterns: add a Rust implementation of pattern-related utils
This change introduces Rust implementations of two functions related to
pattern handling, all located in `match.py`:
- `_regex`
- `readpatternfile`
These utils are useful in the long-term effort to improve `hg status`'s
performance using Rust. Experimental work done by Valentin Gatien-Baron
shows very promising improvements, but is too different from the current
Mercurial core code structure to be used "as-is".
This is the first - albeit very small - step towards the code revamp
needed down the line.
Two dependencies were added: `regex` and `lazy_static`. Both of them
will be useful for a majority of the Rust code that will be written,
are well known and maintained either by the Rust core team, or by
very frequent contributors.
Differential Revision: https://phab.mercurial-scm.org/D6271
author | Rapha?l Gom?s <rgomes@octobus.net> |
---|---|
date | Wed, 24 Apr 2019 11:34:09 +0200 |
parents | |
children | 9609430d3625 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/filepatterns.rs Wed Apr 24 11:34:09 2019 +0200 @@ -0,0 +1,345 @@ +use crate::{LineNumber, PatternError, PatternFileError}; +use regex::Regex; +use std::collections::HashMap; +use std::fs::File; +use std::io::Read; +use std::vec::Vec; + +lazy_static! { + static ref reescape: Vec<Vec<u8>> = { + let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect(); + let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c"; + for byte in to_escape { + v[*byte as usize].insert(0, b'\\'); + } + v + }; +} + +/// These are matched in order +const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = + &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")]; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum PatternSyntax { + Regexp, + /// Glob that matches at the front of the path + RootGlob, + /// Glob that matches at any suffix of the path (still anchored at slashes) + Glob, + Path, + RelPath, + RelGlob, + RelRegexp, + RootFiles, +} + +/// Transforms a glob pattern into a regex +fn glob_to_re(pat: &[u8]) -> Vec<u8> { + let mut input = pat; + let mut res: Vec<u8> = vec![]; + let mut group_depth = 0; + + while let Some((c, rest)) = input.split_first() { + input = rest; + + match c { + b'*' => { + for (source, repl) in GLOB_REPLACEMENTS { + if input.starts_with(source) { + input = &input[source.len()..]; + res.extend(*repl); + break; + } + } + } + b'?' => res.extend(b"."), + b'[' => { + match input.iter().skip(1).position(|b| *b == b']') { + None => res.extend(b"\\["), + Some(end) => { + // Account for the one we skipped + let end = end + 1; + + res.extend(b"["); + + for (i, b) in input[..end].iter().enumerate() { + if *b == b'!' && i == 0 { + res.extend(b"^") + } else if *b == b'^' && i == 0 { + res.extend(b"\\^") + } else if *b == b'\\' { + res.extend(b"\\\\") + } else { + res.push(*b) + } + } + res.extend(b"]"); + input = &input[end + 1..]; + } + } + } + b'{' => { + group_depth += 1; + res.extend(b"(?:") + } + b'}' if group_depth > 0 => { + group_depth -= 1; + res.extend(b")"); + } + b',' if group_depth > 0 => res.extend(b"|"), + b'\\' => { + let c = { + if let Some((c, rest)) = input.split_first() { + input = rest; + c + } else { + c + } + }; + res.extend(&reescape[*c as usize]) + } + _ => res.extend(&reescape[*c as usize]), + } + } + res +} + +fn escape_pattern(pattern: &[u8]) -> Vec<u8> { + pattern + .iter() + .flat_map(|c| reescape[*c as usize].clone()) + .collect() +} + +fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> { + match kind { + b"re" => Ok(PatternSyntax::Regexp), + b"path" => Ok(PatternSyntax::Path), + b"relpath" => Ok(PatternSyntax::RelPath), + b"rootfilesin" => Ok(PatternSyntax::RootFiles), + b"relglob" => Ok(PatternSyntax::RelGlob), + b"relre" => Ok(PatternSyntax::RelRegexp), + b"glob" => Ok(PatternSyntax::Glob), + b"rootglob" => Ok(PatternSyntax::RootGlob), + _ => Err(PatternError::UnsupportedSyntax( + String::from_utf8_lossy(kind).to_string(), + )), + } +} + +/// Builds the regex that corresponds to the given pattern. +/// If within a `syntax: regexp` context, returns the pattern, +/// otherwise, returns the corresponding regex. +fn _build_single_regex( + syntax: PatternSyntax, + pattern: &[u8], + globsuffix: &[u8], +) -> Vec<u8> { + if pattern.is_empty() { + return vec![]; + } + match syntax { + PatternSyntax::Regexp => pattern.to_owned(), + PatternSyntax::RelRegexp => { + if pattern[0] == b'^' { + return pattern.to_owned(); + } + let mut res = b".*".to_vec(); + res.extend(pattern); + res + } + PatternSyntax::Path | PatternSyntax::RelPath => { + if pattern == b"." { + return vec![]; + } + let mut pattern = escape_pattern(pattern); + pattern.extend(b"(?:/|$)"); + pattern + } + PatternSyntax::RootFiles => { + let mut res = if pattern == b"." { + vec![] + } else { + // Pattern is a directory name. + let mut as_vec: Vec<u8> = escape_pattern(pattern); + as_vec.push(b'/'); + as_vec + }; + + // Anything after the pattern must be a non-directory. + res.extend(b"[^/]+$"); + res + } + PatternSyntax::Glob + | PatternSyntax::RelGlob + | PatternSyntax::RootGlob => { + let mut res: Vec<u8> = vec![]; + if syntax == PatternSyntax::RelGlob { + res.extend(b"(?:|.*/)"); + } + + res.extend(glob_to_re(pattern)); + res.extend(globsuffix.iter()); + res + } + } +} + +const GLOB_SPECIAL_CHARACTERS: [u8; 7] = + [b'*', b'?', b'[', b']', b'{', b'}', b'\\']; + +/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs +/// that don't need to be transformed into a regex. +pub fn build_single_regex( + kind: &str, + pat: &[u8], + globsuffix: &[u8], +) -> Result<Vec<u8>, PatternError> { + let enum_kind = parse_pattern_syntax(kind.as_bytes())?; + if enum_kind == PatternSyntax::RootGlob + && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) + { + Ok(pat.to_vec()) + } else { + Ok(_build_single_regex(enum_kind, pat, globsuffix)) + } +} + +lazy_static! { + static ref SYNTAXES: HashMap<&'static str, &'static str> = { + let mut m = HashMap::new(); + + m.insert("re", "relre:"); + m.insert("regexp", "relre:"); + m.insert("glob", "relglob:"); + m.insert("rootglob", "rootglob:"); + m.insert("include", "include"); + m.insert("subinclude", "subinclude"); + m + }; +} + +pub type PatternTuple = (String, LineNumber, String); +type WarningTuple = (String, String); + +pub fn parse_pattern_file_contents( + lines: &str, + file_path: &str, + warn: bool, +) -> (Vec<PatternTuple>, Vec<WarningTuple>) { + let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap(); + let mut inputs: Vec<PatternTuple> = vec![]; + let mut warnings: Vec<WarningTuple> = vec![]; + + let mut current_syntax = "relre:"; + + let mut line = String::new(); + for (line_number, line_str) in lines.split('\n').enumerate() { + let line_number = line_number + 1; + line.replace_range(.., line_str); + + if line.contains('#') { + if let Some(cap) = comment_regex.captures(line.clone().as_ref()) { + line = line[..cap.get(1).unwrap().end()].to_string() + } + line = line.replace(r"\#", "#"); + } + + let mut line = line.trim_end(); + + if line.is_empty() { + continue; + } + + if line.starts_with("syntax:") { + let syntax = line["syntax:".len()..].trim(); + + if let Some(rel_syntax) = SYNTAXES.get(syntax) { + current_syntax = rel_syntax; + } else if warn { + warnings.push((file_path.to_string(), syntax.to_string())); + } + continue; + } + + let mut line_syntax: &str = ¤t_syntax; + + for (s, rels) in SYNTAXES.iter() { + if line.starts_with(rels) { + line_syntax = rels; + line = &line[rels.len()..]; + break; + } else if line.starts_with(&format!("{}:", s)) { + line_syntax = rels; + line = &line[s.len() + 1..]; + break; + } + } + + inputs.push(( + format!("{}{}", line_syntax, line), + line_number, + line.to_string(), + )); + } + (inputs, warnings) +} + +pub fn read_pattern_file( + file_path: String, + warn: bool, +) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> { + let mut f = File::open(&file_path)?; + let mut contents = String::new(); + + f.read_to_string(&mut contents)?; + + Ok(parse_pattern_file_contents(&contents, &file_path, warn)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn escape_pattern_test() { + let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#; + assert_eq!(escape_pattern(untouched), untouched.to_vec()); + // All escape codes + assert_eq!( + escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#), + br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"# + .to_vec() + ); + } + + #[test] + fn glob_test() { + assert_eq!(glob_to_re(br#"?"#), br#"."#); + assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#); + assert_eq!(glob_to_re(br#"**"#), br#".*"#); + assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#); + assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#); + assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#); + assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#); + assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#); + } + + #[test] + fn test_parse_pattern_file_contents() { + let lines = "syntax: glob\n*.elc"; + + assert_eq!( + vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())], + parse_pattern_file_contents(lines, "file_path", false).0, + ); + + let lines = "syntax: include\nsyntax: glob"; + + assert_eq!( + parse_pattern_file_contents(lines, "file_path", false).0, + vec![] + ); + } +}