Mercurial > public > mercurial-scm > hg-stable
changeset 52583:b89c934e6269
rust-hgignore: add a scripting command to print the hgignore regexp
Add a command `script::hgignore --print-re` to print the
hgignore regexp.
One complication is that the `rootfilesin`-only matcher doesn't use a
regular expression, and the existing code converts it to something
that's not a regular expression.
We add code to handle that case.
Since this command is now sufficient to generate a tidy-looking
regexp for scripting, this frees up the "debug" command to report
the internal regexp used by the regex engine, so we make that
change too.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Fri, 13 Dec 2024 15:05:37 +0000 |
parents | 1866119cbad7 |
children | 2d52ae3c5f76 |
files | rust/hg-core/src/matchers.rs rust/hg-core/src/pre_regex.rs rust/rhg/src/commands/debugignorerhg.rs rust/rhg/src/commands/script_hgignore.rs rust/rhg/src/main.rs tests/test-hgignore.t |
diffstat | 6 files changed, 220 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/matchers.rs Fri Dec 06 20:27:59 2024 +0000 +++ b/rust/hg-core/src/matchers.rs Fri Dec 13 15:05:37 2024 +0000 @@ -8,6 +8,7 @@ //! Structs and types for matching files and directories. use format_bytes::format_bytes; +use itertools::Itertools; use once_cell::sync::OnceCell; use regex_automata::meta::Regex; use regex_syntax::hir::Hir; @@ -30,10 +31,10 @@ use crate::dirstate::status::IgnoreFnType; use crate::filepatterns::normalize_path_bytes; -use std::collections::HashSet; use std::fmt::{Display, Error, Formatter}; use std::path::{Path, PathBuf}; use std::{borrow::ToOwned, collections::BTreeSet}; +use std::{collections::HashSet, str::FromStr}; #[derive(Debug, PartialEq)] pub enum VisitChildrenSet { @@ -297,7 +298,7 @@ /// assert_eq!(matcher.exact_match(HgPath::new(b"lib.h")), false); // exact matches are for (rel)path kinds /// ``` pub struct PatternMatcher<'a> { - patterns: Vec<u8>, + patterns: PatternsDesc, match_fn: IgnoreFnType<'a>, /// Whether all the patterns match a prefix (i.e. 
recursively) prefix: bool, @@ -306,10 +307,63 @@ dirs: DirsMultiset, } +enum PatternsDesc { + Re(PreRegex), + RootFilesIn(Vec<Vec<u8>>, GlobSuffix), +} + +pub enum ReSyntax { + Tidy, + Internal, +} + +impl PatternsDesc { + fn to_re(&self) -> PreRegex { + match self { + Self::Re(re) => re.clone(), + Self::RootFilesIn(patterns, glob_suffix) => { + let patterns = patterns + .clone() + .into_iter() + .map(|pattern: Vec<u8>| IgnorePattern { + syntax: PatternSyntax::RootFilesIn, + source: PathBuf::from_str("<rootfilesin-matcher>") + .unwrap(), + pattern, + }) + .collect_vec(); + build_regex_match_for_debug(&patterns, *glob_suffix).unwrap() + } + } + } + + fn to_pattern_bytes(&self, syntax: ReSyntax) -> Vec<u8> { + match syntax { + ReSyntax::Tidy => self.to_re().to_bytes(), + ReSyntax::Internal => match self { + PatternsDesc::Re(re) => re.to_hir().to_string().into_bytes(), + PatternsDesc::RootFilesIn(dirs, _) => { + let mut patterns = vec![]; + patterns.extend(b"rootfilesin: "); + let mut dirs_vec = dirs.clone(); + dirs_vec.sort(); + patterns.extend(dirs_vec.escaped_bytes()); + patterns + } + }, + } + } +} + impl core::fmt::Debug for PatternMatcher<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("PatternMatcher") - .field("patterns", &String::from_utf8_lossy(&self.patterns)) + .field( + "patterns", + &String::from_utf8_lossy( + &self.patterns.to_pattern_bytes(ReSyntax::Internal), + ), + ) .field("prefix", &self.prefix) .field("files", &self.files) .field("dirs", &self.dirs) @@ -441,7 +495,7 @@ /// assert!(!matcher.matches(HgPath::new(b"dir/subdir/subsubdir/file"))); /// ``` pub struct IncludeMatcher<'a> { - patterns: Vec<u8>, + patterns: PatternsDesc, match_fn: IgnoreFnType<'a>, /// Whether all the patterns match a prefix (i.e. 
recursively) prefix: bool, @@ -453,7 +507,10 @@ impl core::fmt::Debug for IncludeMatcher<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("IncludeMatcher") - .field("patterns", &String::from_utf8_lossy(&self.patterns)) + .field( + "patterns", + &String::from_utf8_lossy(&self.patterns.to_re().to_bytes()), + ) .field("prefix", &self.prefix) .field("roots", &self.roots) .field("dirs", &self.dirs) @@ -815,7 +872,7 @@ ignore_patterns: &[IgnorePattern], glob_suffix: GlobSuffix, regex_config: RegexCompleteness, -) -> PatternResult<(Vec<u8>, IgnoreFnType<'a>)> { +) -> PatternResult<(PreRegex, IgnoreFnType<'a>)> { let mut regexps = vec![]; let mut exact_set = HashSet::new(); @@ -850,7 +907,32 @@ Box::new(func) as IgnoreFnType }; - Ok((full_regex.to_bytes(), func)) + Ok((full_regex, func)) +} + +#[logging_timer::time("trace")] +fn build_regex_match_for_debug<'a>( + ignore_patterns: &[IgnorePattern], + glob_suffix: GlobSuffix, +) -> PatternResult<PreRegex> { + let mut regexps = vec![]; + + for pattern in ignore_patterns { + if let Some(re) = build_single_regex( + pattern, + glob_suffix, + RegexCompleteness::Complete, + )? { + regexps.push(re); + } else { + panic!("RegexCompleteness::Complete should prevent this branch"); + } + } + + Ok(PreRegex::Sequence(vec![ + PreRegex::parse(&b"^"[..])?, + PreRegex::Alternation(regexps), + ])) } /// Returns roots and directories corresponding to each pattern. 
@@ -943,10 +1025,10 @@ ignore_patterns: Vec<IgnorePattern>, glob_suffix: GlobSuffix, regex_config: RegexCompleteness, -) -> PatternResult<(Vec<u8>, IgnoreFnType<'a>)> { +) -> PatternResult<(PatternsDesc, IgnoreFnType<'a>)> { let mut match_funcs: Vec<IgnoreFnType<'a>> = vec![]; // For debugging and printing - let mut patterns = vec![]; + let patterns; let (subincludes, ignore_patterns) = filter_subincludes(ignore_patterns)?; @@ -998,18 +1080,19 @@ }; match_funcs.push(Box::new(match_func)); - patterns.extend(b"rootfilesin: "); dirs_vec.sort(); - patterns.extend(dirs_vec.escaped_bytes()); + patterns = PatternsDesc::RootFilesIn(dirs_vec, glob_suffix); } else { let (new_re, match_func) = build_regex_match( &ignore_patterns, glob_suffix, regex_config, )?; - patterns = new_re; + patterns = PatternsDesc::Re(new_re); match_funcs.push(match_func) } + } else { + patterns = PatternsDesc::Re(PreRegex::Empty) } Ok(if match_funcs.len() == 1 { @@ -1131,8 +1214,8 @@ DirsChildrenMultiset::new(thing, Some(self.parents.iter())) } - pub fn debug_get_patterns(&self) -> &[u8] { - self.patterns.as_ref() + pub fn debug_get_patterns(&self, syntax: ReSyntax) -> Vec<u8> { + self.patterns.to_pattern_bytes(syntax) } } @@ -1147,7 +1230,9 @@ write!( f, "IncludeMatcher(includes='{}')", - String::from_utf8_lossy(&self.patterns.escaped_bytes()) + &String::from_utf8_lossy( + &self.patterns.to_pattern_bytes(ReSyntax::Internal) + ) ) } }
--- a/rust/hg-core/src/pre_regex.rs Fri Dec 06 20:27:59 2024 +0000 +++ b/rust/hg-core/src/pre_regex.rs Fri Dec 13 15:05:37 2024 +0000 @@ -19,6 +19,9 @@ &RE_ESCAPE[c as usize] } +/// An intermediate regular expression representation, that can be used +/// both to compile down to a `Regex` for matching, or converted to +/// a string directly for diagnostics. #[derive(Debug, Clone)] pub enum PreRegex { Empty,
--- a/rust/rhg/src/commands/debugignorerhg.rs Fri Dec 06 20:27:59 2024 +0000 +++ b/rust/rhg/src/commands/debugignorerhg.rs Fri Dec 13 15:05:37 2024 +0000 @@ -1,8 +1,10 @@ use crate::error::CommandError; +use crate::ui::Ui; use clap::Arg; use hg::dirstate::status::StatusError; use hg::filepatterns::RegexCompleteness; -use hg::matchers::get_ignore_matcher_pre; +use hg::matchers::{get_ignore_matcher_pre, ReSyntax}; +use hg::repo::Repo; use log::warn; pub const HELP_TEXT: &str = " @@ -24,12 +26,19 @@ ).about(HELP_TEXT) } -pub fn run(invocation: &crate::CliInvocation) -> Result<(), CommandError> { - let repo = invocation.repo?; - let args = invocation.subcommand_args; +pub enum WhichPatterns { + All, + SlowPathOnly, +} +pub fn work( + repo: &Repo, + ui: &Ui, + which: WhichPatterns, + syntax: ReSyntax, +) -> Result<(), CommandError> { let ignore_file = repo.working_directory_vfs().join(".hgignore"); // TODO hardcoded - let all_patterns = args.get_flag("all-patterns"); + let all_patterns = matches!(which, WhichPatterns::All); let (ignore_matcher, warnings) = get_ignore_matcher_pre( vec![ignore_file], @@ -51,8 +60,22 @@ warn!("Pattern warnings: {:?}", &warnings); } - let patterns = ignore_matcher.debug_get_patterns(); - invocation.ui.write_stdout(patterns)?; - invocation.ui.write_stdout(b"\n")?; + let patterns = ignore_matcher.debug_get_patterns(syntax); + ui.write_stdout(&patterns)?; + ui.write_stdout(b"\n")?; Ok(()) } + +pub fn run(invocation: &crate::CliInvocation) -> Result<(), CommandError> { + let repo = invocation.repo?; + let args = invocation.subcommand_args; + + let all_patterns = args.get_flag("all-patterns"); + let patterns = if all_patterns { + WhichPatterns::All + } else { + WhichPatterns::SlowPathOnly + }; + + work(repo, invocation.ui, patterns, ReSyntax::Internal) +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/rhg/src/commands/script_hgignore.rs Fri Dec 13 15:05:37 2024 +0000 @@ -0,0 +1,42 @@ +use clap::Arg; +use hg::matchers::ReSyntax; + +use crate::error::CommandError; + +use super::debugignorerhg::WhichPatterns; + +pub const HELP_TEXT: &str = " +Show effective hgignore patterns used by rhg. + +This is a pure Rust version of `hg debugignore`. + +Some options might be missing, check the list below. +"; + +pub fn args() -> clap::Command { + clap::command!("script::hgignore") + .arg( + Arg::new("print-re") + .help("Print the regular expression that matches all ignored files.") + .action(clap::ArgAction::SetTrue) + .long("print-re"), + ).about(HELP_TEXT) +} + +pub fn run(invocation: &crate::CliInvocation) -> Result<(), CommandError> { + let repo = invocation.repo?; + let args = invocation.subcommand_args; + let print_re = args.get_flag("print-re"); + if !print_re { + return Err(CommandError::unsupported( + "Unsupported invocation: flag --print-re is required", + )); + } + + crate::commands::debugignorerhg::work( + repo, + invocation.ui, + WhichPatterns::All, + ReSyntax::Tidy, + ) +}
--- a/rust/rhg/src/main.rs Fri Dec 06 20:27:59 2024 +0000 +++ b/rust/rhg/src/main.rs Fri Dec 13 15:05:37 2024 +0000 @@ -534,6 +534,7 @@ pub mod debugrhgsparse; pub mod files; pub mod root; + pub mod script_hgignore; pub mod status; } @@ -616,6 +617,7 @@ subcommand!(root), subcommand!(config), subcommand!(status), + subcommand!(script_hgignore), ]; let mut commands = Subcommands::new(); for cmd in subcommands {
--- a/tests/test-hgignore.t Fri Dec 06 20:27:59 2024 +0000 +++ b/tests/test-hgignore.t Fri Dec 13 15:05:37 2024 +0000 @@ -17,7 +17,16 @@ <nevermatcher> #if rhg - $ hg debugignorerhg + $ rhg_debugignore() { + > echo debugignorerhg: + > hg debugignorerhg + > echo script::hgignore --print-re: + > hg script::hgignore --print-re + > } + $ rhg_debugignore + debugignorerhg: + (?:) + script::hgignore --print-re: #endif @@ -83,7 +92,10 @@ <includematcher includes='(?i:.*\\.O$)|.*.hgignore'> #if rhg - $ hg debugignorerhg + $ rhg_debugignore + debugignorerhg: + (?:\\A[\x00- \x0b-\xf4\x8f\xbf\xbf]*(?:(?:\\.[Oo]\\z)|(?:[\x00- \x0b-\xf4\x8f\xbf\xbf](?:hgignore)))) (esc) + script::hgignore --print-re: ^(?:(?i:.*\.O$)|.*.hgignore) #endif @@ -99,7 +111,10 @@ <includematcher includes='.*.hgignore|(?i:.*\\.O$)'> #if rhg - $ hg debugignorerhg + $ rhg_debugignore + debugignorerhg: + (?:\\A[\x00- \x0b-\xf4\x8f\xbf\xbf]*(?:(?:[\x00- \x0b-\xf4\x8f\xbf\xbf](?:hgignore))|(?:\\.[Oo]\\z))) (esc) + script::hgignore --print-re: ^(?:.*.hgignore|(?i:.*\.O$)) #endif @@ -116,7 +131,10 @@ <includematcher includes='(?i:.*\\.O$)|.*.HGIGNORE'> #if rhg - $ hg debugignorerhg + $ rhg_debugignore + debugignorerhg: + (?:\\A[\x00- \x0b-\xf4\x8f\xbf\xbf]*(?:(?:\\.[Oo]\\z)|(?:[\x00- \x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE)))) (esc) + script::hgignore --print-re: ^(?:(?i:.*\.O$)|.*.HGIGNORE) #endif @@ -131,7 +149,10 @@ <includematcher includes='.*.HGIGNORE|(?i:.*\\.O$)'> #if rhg - $ hg debugignorerhg + $ rhg_debugignore + debugignorerhg: + (?:\\A[\x00- \x0b-\xf4\x8f\xbf\xbf]*(?:(?:[\x00- \x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE))|(?:\\.[Oo]\\z))) (esc) + script::hgignore --print-re: ^(?:.*.HGIGNORE|(?i:.*\.O$)) #endif @@ -149,7 +170,10 @@ <includematcher includes='(?i:^[^a].*\\.O$)|.*.HGIGNORE'> #if rhg - $ hg debugignorerhg + $ rhg_debugignore + debugignorerhg: + (?:\\A(?:(?:\\A[\x00-@B-`b-\xf4\x8f\xbf\xbf][\x00- \x0b-\xf4\x8f\xbf\xbf]*\\.[Oo]\\z)|(?:[\x00- \x0b-\xf4\x8f\xbf\xbf]*[\x00- 
\x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE)))) (esc) + script::hgignore --print-re: ^(?:(?i:^[^a].*\.O$)|.*.HGIGNORE) #endif @@ -165,7 +189,10 @@ <includematcher includes='.*.HGIGNORE|(?i:^[^a].*\\.O$)'> #if rhg - $ hg debugignorerhg + $ rhg_debugignore + debugignorerhg: + (?:\\A(?:(?:[\x00- \x0b-\xf4\x8f\xbf\xbf]*[\x00- \x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE))|(?:\\A[\x00-@B-`b-\xf4\x8f\xbf\xbf][\x00- \x0b-\xf4\x8f\xbf\xbf]*\\.[Oo]\\z))) (esc) + script::hgignore --print-re: ^(?:.*.HGIGNORE|(?i:^[^a].*\.O$)) #endif @@ -360,11 +387,11 @@ ? dir/c.o ? syntax #if rhg - $ hg debugignorerhg -a + $ rhg_debugignore + debugignorerhg: + (?:\A[a&&b]) + script::hgignore --print-re: ^(?:dir/b\.o(?:/|$)) - - $ hg debugignorerhg - ^ ^ #endif $ echo "relglob:*" > .hgignore @@ -379,7 +406,10 @@ <includematcher includes='.*(?:/|$)'> #if rhg - $ hg debugignorerhg + $ rhg_debugignore + debugignorerhg: + (?:\A(?-u:[\x00-\xFF])*?(?:/|\z)) + script::hgignore --print-re: ^(?:.*(?:/|$)) #endif