changeset 52583:b89c934e6269

rust-hgignore: add a scripting command to print the hgignore regexp

Add a command `script::hgignore --print-re` to print the hgignore regexp. One complication is that the `rootfilesin`-only matcher doesn't use a regular expression, and the existing code converts it to something that's not a regular expression. We add code to handle that case. Since this command is now sufficient to generate a tidy-looking regexp for scripting, this frees up the "debug" command to report the internal regexp used by the regex engine, so we make that change too.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Fri, 13 Dec 2024 15:05:37 +0000
parents 1866119cbad7
children 2d52ae3c5f76
files rust/hg-core/src/matchers.rs rust/hg-core/src/pre_regex.rs rust/rhg/src/commands/debugignorerhg.rs rust/rhg/src/commands/script_hgignore.rs rust/rhg/src/main.rs tests/test-hgignore.t
diffstat 6 files changed, 220 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/rust/hg-core/src/matchers.rs	Fri Dec 06 20:27:59 2024 +0000
+++ b/rust/hg-core/src/matchers.rs	Fri Dec 13 15:05:37 2024 +0000
@@ -8,6 +8,7 @@
 //! Structs and types for matching files and directories.
 
 use format_bytes::format_bytes;
+use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use regex_automata::meta::Regex;
 use regex_syntax::hir::Hir;
@@ -30,10 +31,10 @@
 
 use crate::dirstate::status::IgnoreFnType;
 use crate::filepatterns::normalize_path_bytes;
-use std::collections::HashSet;
 use std::fmt::{Display, Error, Formatter};
 use std::path::{Path, PathBuf};
 use std::{borrow::ToOwned, collections::BTreeSet};
+use std::{collections::HashSet, str::FromStr};
 
 #[derive(Debug, PartialEq)]
 pub enum VisitChildrenSet {
@@ -297,7 +298,7 @@
 /// assert_eq!(matcher.exact_match(HgPath::new(b"lib.h")), false); // exact matches are for (rel)path kinds
 /// ```
 pub struct PatternMatcher<'a> {
-    patterns: Vec<u8>,
+    patterns: PatternsDesc,
     match_fn: IgnoreFnType<'a>,
     /// Whether all the patterns match a prefix (i.e. recursively)
     prefix: bool,
@@ -306,10 +307,63 @@
     dirs: DirsMultiset,
 }
 
+/// Describes the patterns a matcher was built from.
+///
+/// This is kept for debugging and printing, not for matching.
+enum PatternsDesc {
+    /// The patterns were combined into a single regular expression.
+    Re(PreRegex),
+    /// The matcher only has `rootfilesin` patterns and does not use a
+    /// regular expression: the directories it lists, plus the glob
+    /// suffix the matcher was built with.
+    RootFilesIn(Vec<Vec<u8>>, GlobSuffix),
+}
+
+/// Selects how a pattern description is rendered as bytes.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ReSyntax {
+    /// A tidy-looking regular expression, suitable for scripting.
+    Tidy,
+    /// The internal form: the regex engine's own rendering of the
+    /// expression, or a `rootfilesin: ...` listing when the matcher
+    /// does not use a regular expression.
+    Internal,
+}
+
+impl PatternsDesc {
+    /// Returns a `PreRegex` equivalent to this description.
+    ///
+    /// A `Re` description is simply cloned. A `RootFilesIn` description
+    /// has no regex of its own, so one `rootfilesin` pattern is
+    /// synthesized per directory and handed to
+    /// `build_regex_match_for_debug`; this panics if that build fails.
+    fn to_re(&self) -> PreRegex {
+        let (dirs, glob_suffix) = match self {
+            Self::Re(re) => return re.clone(),
+            Self::RootFilesIn(dirs, glob_suffix) => (dirs, *glob_suffix),
+        };
+
+        // Placeholder source: these patterns do not come from a file.
+        let source = PathBuf::from_str("<rootfilesin-matcher>").unwrap();
+        let ignore_patterns = dirs
+            .iter()
+            .map(|dir| IgnorePattern {
+                syntax: PatternSyntax::RootFilesIn,
+                source: source.clone(),
+                pattern: dir.clone(),
+            })
+            .collect_vec();
+        build_regex_match_for_debug(&ignore_patterns, glob_suffix).unwrap()
+    }
+
+    /// Renders this description as bytes in the requested `syntax`.
+    ///
+    /// `Tidy` always goes through [`Self::to_re`]. `Internal` prints the
+    /// HIR of the regex, or `rootfilesin: ` followed by the sorted,
+    /// escaped directory list when there is no regex.
+    fn to_pattern_bytes(&self, syntax: ReSyntax) -> Vec<u8> {
+        match (syntax, self) {
+            (ReSyntax::Tidy, _) => self.to_re().to_bytes(),
+            (ReSyntax::Internal, Self::Re(re)) => {
+                re.to_hir().to_string().into_bytes()
+            }
+            (ReSyntax::Internal, Self::RootFilesIn(dirs, _)) => {
+                let mut sorted_dirs = dirs.clone();
+                sorted_dirs.sort();
+                let mut out = b"rootfilesin: ".to_vec();
+                out.extend(sorted_dirs.escaped_bytes());
+                out
+            }
+        }
+    }
+}
+
 impl core::fmt::Debug for PatternMatcher<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("PatternMatcher")
-            .field("patterns", &String::from_utf8_lossy(&self.patterns))
+            .field(
+                "patterns",
+                &String::from_utf8_lossy(
+                    &self.patterns.to_pattern_bytes(ReSyntax::Internal),
+                ),
+            )
             .field("prefix", &self.prefix)
             .field("files", &self.files)
             .field("dirs", &self.dirs)
@@ -441,7 +495,7 @@
 /// assert!(!matcher.matches(HgPath::new(b"dir/subdir/subsubdir/file")));
 /// ```
 pub struct IncludeMatcher<'a> {
-    patterns: Vec<u8>,
+    patterns: PatternsDesc,
     match_fn: IgnoreFnType<'a>,
     /// Whether all the patterns match a prefix (i.e. recursively)
     prefix: bool,
@@ -453,7 +507,10 @@
 impl core::fmt::Debug for IncludeMatcher<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("IncludeMatcher")
-            .field("patterns", &String::from_utf8_lossy(&self.patterns))
+            .field(
+                "patterns",
+                &String::from_utf8_lossy(&self.patterns.to_re().to_bytes()),
+            )
             .field("prefix", &self.prefix)
             .field("roots", &self.roots)
             .field("dirs", &self.dirs)
@@ -815,7 +872,7 @@
     ignore_patterns: &[IgnorePattern],
     glob_suffix: GlobSuffix,
     regex_config: RegexCompleteness,
-) -> PatternResult<(Vec<u8>, IgnoreFnType<'a>)> {
+) -> PatternResult<(PreRegex, IgnoreFnType<'a>)> {
     let mut regexps = vec![];
     let mut exact_set = HashSet::new();
 
@@ -850,7 +907,32 @@
         Box::new(func) as IgnoreFnType
     };
 
-    Ok((full_regex.to_bytes(), func))
+    Ok((full_regex, func))
+}
+
+/// Builds a single `PreRegex` covering all of `ignore_patterns`, for
+/// display purposes only (no match function is produced).
+///
+/// Each pattern is converted with `RegexCompleteness::Complete`; the
+/// result is `^` followed by the alternation of the per-pattern
+/// expressions.
+///
+/// # Errors
+///
+/// Propagates any error from `build_single_regex` or from parsing `^`.
+///
+/// # Panics
+///
+/// Panics if a pattern yields no regex, which
+/// `RegexCompleteness::Complete` is expected to rule out.
+#[logging_timer::time("trace")]
+fn build_regex_match_for_debug(
+    ignore_patterns: &[IgnorePattern],
+    glob_suffix: GlobSuffix,
+) -> PatternResult<PreRegex> {
+    // Stops at the first failing pattern, like the equivalent loop.
+    let regexps = ignore_patterns
+        .iter()
+        .map(|pattern| -> PatternResult<_> {
+            let re = build_single_regex(
+                pattern,
+                glob_suffix,
+                RegexCompleteness::Complete,
+            )?;
+            Ok(re.expect(
+                "RegexCompleteness::Complete should prevent this branch",
+            ))
+        })
+        .collect::<PatternResult<Vec<_>>>()?;
+
+    Ok(PreRegex::Sequence(vec![
+        PreRegex::parse(&b"^"[..])?,
+        PreRegex::Alternation(regexps),
+    ]))
 }
 
 /// Returns roots and directories corresponding to each pattern.
@@ -943,10 +1025,10 @@
     ignore_patterns: Vec<IgnorePattern>,
     glob_suffix: GlobSuffix,
     regex_config: RegexCompleteness,
-) -> PatternResult<(Vec<u8>, IgnoreFnType<'a>)> {
+) -> PatternResult<(PatternsDesc, IgnoreFnType<'a>)> {
     let mut match_funcs: Vec<IgnoreFnType<'a>> = vec![];
     // For debugging and printing
-    let mut patterns = vec![];
+    let patterns;
 
     let (subincludes, ignore_patterns) = filter_subincludes(ignore_patterns)?;
 
@@ -998,18 +1080,19 @@
             };
             match_funcs.push(Box::new(match_func));
 
-            patterns.extend(b"rootfilesin: ");
             dirs_vec.sort();
-            patterns.extend(dirs_vec.escaped_bytes());
+            patterns = PatternsDesc::RootFilesIn(dirs_vec, glob_suffix);
         } else {
             let (new_re, match_func) = build_regex_match(
                 &ignore_patterns,
                 glob_suffix,
                 regex_config,
             )?;
-            patterns = new_re;
+            patterns = PatternsDesc::Re(new_re);
             match_funcs.push(match_func)
         }
+    } else {
+        patterns = PatternsDesc::Re(PreRegex::Empty)
     }
 
     Ok(if match_funcs.len() == 1 {
@@ -1131,8 +1214,8 @@
         DirsChildrenMultiset::new(thing, Some(self.parents.iter()))
     }
 
-    pub fn debug_get_patterns(&self) -> &[u8] {
-        self.patterns.as_ref()
+    pub fn debug_get_patterns(&self, syntax: ReSyntax) -> Vec<u8> {
+        self.patterns.to_pattern_bytes(syntax)
     }
 }
 
@@ -1147,7 +1230,9 @@
         write!(
             f,
             "IncludeMatcher(includes='{}')",
-            String::from_utf8_lossy(&self.patterns.escaped_bytes())
+            &String::from_utf8_lossy(
+                &self.patterns.to_pattern_bytes(ReSyntax::Internal)
+            )
         )
     }
 }
--- a/rust/hg-core/src/pre_regex.rs	Fri Dec 06 20:27:59 2024 +0000
+++ b/rust/hg-core/src/pre_regex.rs	Fri Dec 13 15:05:37 2024 +0000
@@ -19,6 +19,9 @@
     &RE_ESCAPE[c as usize]
 }
 
+/// An intermediate regular expression representation, that can be used
+/// both to compile down to a `Regex` for matching, or converted to
+/// a string directly for diagnostics.
 #[derive(Debug, Clone)]
 pub enum PreRegex {
     Empty,
--- a/rust/rhg/src/commands/debugignorerhg.rs	Fri Dec 06 20:27:59 2024 +0000
+++ b/rust/rhg/src/commands/debugignorerhg.rs	Fri Dec 13 15:05:37 2024 +0000
@@ -1,8 +1,10 @@
 use crate::error::CommandError;
+use crate::ui::Ui;
 use clap::Arg;
 use hg::dirstate::status::StatusError;
 use hg::filepatterns::RegexCompleteness;
-use hg::matchers::get_ignore_matcher_pre;
+use hg::matchers::{get_ignore_matcher_pre, ReSyntax};
+use hg::repo::Repo;
 use log::warn;
 
 pub const HELP_TEXT: &str = "
@@ -24,12 +26,19 @@
     ).about(HELP_TEXT)
 }
 
-pub fn run(invocation: &crate::CliInvocation) -> Result<(), CommandError> {
-    let repo = invocation.repo?;
-    let args = invocation.subcommand_args;
+/// Which hgignore patterns the printed description should cover.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum WhichPatterns {
+    /// Cover all patterns; this is what the `all-patterns` flag of
+    /// `debugignorerhg` selects.
+    All,
+    /// The default when the `all-patterns` flag is not given.
+    // NOTE(review): presumably limits the output to patterns that need
+    // the regex slow path -- confirm against `get_ignore_matcher_pre`.
+    SlowPathOnly,
+}
 
+pub fn work(
+    repo: &Repo,
+    ui: &Ui,
+    which: WhichPatterns,
+    syntax: ReSyntax,
+) -> Result<(), CommandError> {
     let ignore_file = repo.working_directory_vfs().join(".hgignore"); // TODO hardcoded
-    let all_patterns = args.get_flag("all-patterns");
+    let all_patterns = matches!(which, WhichPatterns::All);
 
     let (ignore_matcher, warnings) = get_ignore_matcher_pre(
         vec![ignore_file],
@@ -51,8 +60,22 @@
         warn!("Pattern warnings: {:?}", &warnings);
     }
 
-    let patterns = ignore_matcher.debug_get_patterns();
-    invocation.ui.write_stdout(patterns)?;
-    invocation.ui.write_stdout(b"\n")?;
+    let patterns = ignore_matcher.debug_get_patterns(syntax);
+    ui.write_stdout(&patterns)?;
+    ui.write_stdout(b"\n")?;
     Ok(())
 }
+
+/// Entry point of `debugignorerhg`: translates the `all-patterns` flag
+/// into a [`WhichPatterns`] and prints the patterns in internal syntax.
+pub fn run(invocation: &crate::CliInvocation) -> Result<(), CommandError> {
+    let repo = invocation.repo?;
+    let which = if invocation.subcommand_args.get_flag("all-patterns") {
+        WhichPatterns::All
+    } else {
+        WhichPatterns::SlowPathOnly
+    };
+    work(repo, invocation.ui, which, ReSyntax::Internal)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/rhg/src/commands/script_hgignore.rs	Fri Dec 13 15:05:37 2024 +0000
@@ -0,0 +1,42 @@
+use clap::Arg;
+use hg::matchers::ReSyntax;
+
+use crate::error::CommandError;
+
+use super::debugignorerhg::WhichPatterns;
+
+/// Help text shown for the `script::hgignore` command.
+pub const HELP_TEXT: &str = "
+Print hgignore information in a form suitable for scripting.
+
+Currently `--print-re` is the only supported mode and must be given:
+it prints a regular expression that matches all ignored files.
+";
+
+/// Builds the clap definition of `script::hgignore`, whose only
+/// argument is the boolean `--print-re` flag.
+pub fn args() -> clap::Command {
+    let print_re = Arg::new("print-re")
+        .long("print-re")
+        .action(clap::ArgAction::SetTrue)
+        .help("Print the regular expression that matches all ignored files.");
+    clap::command!("script::hgignore")
+        .arg(print_re)
+        .about(HELP_TEXT)
+}
+
+/// Entry point of `script::hgignore`.
+///
+/// With `--print-re`, prints the tidy regular expression covering all
+/// hgignore patterns; without it, the invocation is reported as
+/// unsupported.
+pub fn run(invocation: &crate::CliInvocation) -> Result<(), CommandError> {
+    let repo = invocation.repo?;
+    if invocation.subcommand_args.get_flag("print-re") {
+        super::debugignorerhg::work(
+            repo,
+            invocation.ui,
+            WhichPatterns::All,
+            ReSyntax::Tidy,
+        )
+    } else {
+        Err(CommandError::unsupported(
+            "Unsupported invocation: flag --print-re is required",
+        ))
+    }
+}
--- a/rust/rhg/src/main.rs	Fri Dec 06 20:27:59 2024 +0000
+++ b/rust/rhg/src/main.rs	Fri Dec 13 15:05:37 2024 +0000
@@ -534,6 +534,7 @@
     pub mod debugrhgsparse;
     pub mod files;
     pub mod root;
+    pub mod script_hgignore;
     pub mod status;
 }
 
@@ -616,6 +617,7 @@
         subcommand!(root),
         subcommand!(config),
         subcommand!(status),
+        subcommand!(script_hgignore),
     ];
     let mut commands = Subcommands::new();
     for cmd in subcommands {
--- a/tests/test-hgignore.t	Fri Dec 06 20:27:59 2024 +0000
+++ b/tests/test-hgignore.t	Fri Dec 13 15:05:37 2024 +0000
@@ -17,7 +17,16 @@
   <nevermatcher>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore() {
+  >   echo debugignorerhg:
+  >   hg debugignorerhg
+  >   echo script::hgignore --print-re:
+  >   hg script::hgignore --print-re
+  > }
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:)
+  script::hgignore --print-re:
   
 #endif
 
@@ -83,7 +92,10 @@
   <includematcher includes='(?i:.*\\.O$)|.*.hgignore'>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\\A[\x00-	\x0b-\xf4\x8f\xbf\xbf]*(?:(?:\\.[Oo]\\z)|(?:[\x00-	\x0b-\xf4\x8f\xbf\xbf](?:hgignore)))) (esc)
+  script::hgignore --print-re:
   ^(?:(?i:.*\.O$)|.*.hgignore)
 #endif
 
@@ -99,7 +111,10 @@
   <includematcher includes='.*.hgignore|(?i:.*\\.O$)'>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\\A[\x00-	\x0b-\xf4\x8f\xbf\xbf]*(?:(?:[\x00-	\x0b-\xf4\x8f\xbf\xbf](?:hgignore))|(?:\\.[Oo]\\z))) (esc)
+  script::hgignore --print-re:
   ^(?:.*.hgignore|(?i:.*\.O$))
 #endif
 
@@ -116,7 +131,10 @@
   <includematcher includes='(?i:.*\\.O$)|.*.HGIGNORE'>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\\A[\x00-	\x0b-\xf4\x8f\xbf\xbf]*(?:(?:\\.[Oo]\\z)|(?:[\x00-	\x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE)))) (esc)
+  script::hgignore --print-re:
   ^(?:(?i:.*\.O$)|.*.HGIGNORE)
 #endif
 
@@ -131,7 +149,10 @@
   <includematcher includes='.*.HGIGNORE|(?i:.*\\.O$)'>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\\A[\x00-	\x0b-\xf4\x8f\xbf\xbf]*(?:(?:[\x00-	\x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE))|(?:\\.[Oo]\\z))) (esc)
+  script::hgignore --print-re:
   ^(?:.*.HGIGNORE|(?i:.*\.O$))
 #endif
 
@@ -149,7 +170,10 @@
   <includematcher includes='(?i:^[^a].*\\.O$)|.*.HGIGNORE'>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\\A(?:(?:\\A[\x00-@B-`b-\xf4\x8f\xbf\xbf][\x00-	\x0b-\xf4\x8f\xbf\xbf]*\\.[Oo]\\z)|(?:[\x00-	\x0b-\xf4\x8f\xbf\xbf]*[\x00-	\x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE)))) (esc)
+  script::hgignore --print-re:
   ^(?:(?i:^[^a].*\.O$)|.*.HGIGNORE)
 #endif
 
@@ -165,7 +189,10 @@
   <includematcher includes='.*.HGIGNORE|(?i:^[^a].*\\.O$)'>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\\A(?:(?:[\x00-	\x0b-\xf4\x8f\xbf\xbf]*[\x00-	\x0b-\xf4\x8f\xbf\xbf](?:HGIGNORE))|(?:\\A[\x00-@B-`b-\xf4\x8f\xbf\xbf][\x00-	\x0b-\xf4\x8f\xbf\xbf]*\\.[Oo]\\z))) (esc)
+  script::hgignore --print-re:
   ^(?:.*.HGIGNORE|(?i:^[^a].*\.O$))
 #endif
 
@@ -360,11 +387,11 @@
   ? dir/c.o
   ? syntax
 #if rhg
-  $ hg debugignorerhg -a
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\A[a&&b])
+  script::hgignore --print-re:
   ^(?:dir/b\.o(?:/|$))
-
-  $ hg debugignorerhg
-  ^ ^
 #endif
 
   $ echo "relglob:*" > .hgignore
@@ -379,7 +406,10 @@
   <includematcher includes='.*(?:/|$)'>
 
 #if rhg
-  $ hg debugignorerhg
+  $ rhg_debugignore
+  debugignorerhg:
+  (?:\A(?-u:[\x00-\xFF])*?(?:/|\z))
+  script::hgignore --print-re:
   ^(?:.*(?:/|$))
 #endif