view rust/hg-core/src/pre_regex.rs @ 52556:1866119cbad7

rust-ignore: construct regex Hir object directly, avoiding large regex string Rework how we convert patterns to regexes in rust. Instead of going patterns -> string -> Regex, which is slow and causes some correctness issues, build a structured regex_syntax::hir::Hir value, which is faster and it also prevents surprising regex escape. This change makes the time of `build_regex_match` go from ~70-80ms to ~40ms in my testing (for a large hgignore). The bug I mentioned involves regex patterns that "escape" their intended scope. For example, a sequence of hgignore regexp patterns like this would previously lead to surprising behavior: foo(?: bar baz ) this matches foobar and foobaz, and doesn't match bar and baz. The new behavior is to report a pattern parse error The Python hg also has this bug, so this bugfix not really helping much, but it's probably better to fall back to real Python bugs than to simulate them.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Fri, 06 Dec 2024 20:27:59 +0000
parents
children b89c934e6269
line wrap: on
line source

use core::str;

use lazy_static::lazy_static;

use crate::filepatterns::PatternError;

lazy_static! {
    static ref RE_ESCAPE: Vec<Vec<u8>> = {
        let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
        let to_escape = b"()[]{}?*+-|^$\\.&~#\t\n\r\x0b\x0c";
        for byte in to_escape {
            v[*byte as usize].insert(0, b'\\');
        }
        v
    };
}

pub fn escape_char_for_re(c: u8) -> &'static [u8] {
    &RE_ESCAPE[c as usize]
}

#[derive(Debug, Clone)]
pub enum PreRegex {
    Empty,
    Dot,
    DotStar,
    Eof,
    NonslashStar,
    Byte(u8),
    Bytes(Vec<u8>),
    SlashOrEof,
    Re((regex_syntax::hir::Hir, Vec<u8>)),
    Maybe(Box<Self>),
    Alternation(Vec<Self>),
    Sequence(Vec<Self>),
}

mod to_hir {
    use itertools::Itertools;
    use regex_syntax::hir::{
        Class, ClassBytes, ClassBytesRange, Dot, Hir, Look, Repetition,
    };

    use super::PreRegex;

    fn hir_star(hir: Hir) -> Hir {
        Hir::repetition(Repetition {
            min: 0,
            max: None,
            greedy: false,
            sub: Box::new(hir),
        })
    }

    fn hir_eof() -> Hir {
        Hir::look(Look::End)
    }

    fn hir_nonslash() -> Hir {
        let mut class =
            Class::Bytes(ClassBytes::new([ClassBytesRange::new(b'/', b'/')]));
        Class::negate(&mut class);
        Hir::class(class)
    }

    fn hir_byte(b: u8) -> Hir {
        let class =
            Class::Bytes(ClassBytes::new([ClassBytesRange::new(b, b)]));
        Hir::class(class)
    }

    fn hir_literal(text: &[u8]) -> Hir {
        let b: Box<[u8]> = Box::from(text);
        Hir::literal(b)
    }

    pub(crate) fn to_hir(re: &PreRegex) -> regex_syntax::hir::Hir {
        match re {
            PreRegex::Empty => Hir::empty(),
            PreRegex::Dot => Hir::dot(Dot::AnyByte),
            PreRegex::DotStar => hir_star(Hir::dot(Dot::AnyByte)),
            PreRegex::Eof => hir_eof(),
            PreRegex::NonslashStar => hir_star(hir_nonslash()),
            PreRegex::Byte(b) => hir_byte(*b),
            PreRegex::Bytes(bs) => hir_literal(bs),
            PreRegex::SlashOrEof => {
                Hir::alternation(vec![hir_byte(b'/'), hir_eof()])
            }
            PreRegex::Re((hir, _)) => hir.clone(),
            PreRegex::Maybe(s) => {
                Hir::alternation(vec![Hir::empty(), s.to_hir()])
            }
            PreRegex::Alternation(alt) => {
                let alt = alt.iter().map(|r| r.to_hir()).collect_vec();
                Hir::alternation(alt)
            }
            PreRegex::Sequence(seq) => {
                let seq = seq.iter().map(|r| r.to_hir()).collect_vec();
                Hir::concat(seq)
            }
        }
    }
}

impl PreRegex {
    pub fn to_hir(&self) -> regex_syntax::hir::Hir {
        to_hir::to_hir(self)
    }

    fn to_bytes_rec(&self, out: &mut Vec<u8>) {
        match self {
            PreRegex::Empty => (),
            PreRegex::Dot => out.push(b'.'),
            PreRegex::DotStar => out.extend_from_slice(&b".*"[..]),
            PreRegex::Eof => out.push(b'$'),
            PreRegex::NonslashStar => out.extend_from_slice(&b"[^/]*"[..]),
            PreRegex::Byte(b) => out.extend_from_slice(escape_char_for_re(*b)),
            PreRegex::Bytes(bytes) => {
                for b in bytes {
                    out.extend_from_slice(escape_char_for_re(*b))
                }
            }
            PreRegex::SlashOrEof => out.extend_from_slice(&b"(?:/|$)"[..]),
            PreRegex::Re((_hir, src)) => out.extend_from_slice(src),
            PreRegex::Alternation(alt) => {
                if alt.is_empty() {
                    // something that can never match
                    out.extend_from_slice(&b" ^"[..])
                } else {
                    out.extend_from_slice(&b"(?:"[..]);
                    let mut first = true;
                    for r in alt {
                        if first {
                            first = false
                        } else {
                            out.extend_from_slice(&b"|"[..]);
                        }
                        r.to_bytes_rec(out)
                    }
                    out.extend_from_slice(&b")"[..]);
                }
            }
            PreRegex::Sequence(seq) => {
                for r in seq {
                    r.to_bytes_rec(out)
                }
            }
            PreRegex::Maybe(r) => {
                out.extend_from_slice(&b"(?:"[..]);
                r.to_bytes_rec(out);
                out.extend_from_slice(&b")?"[..]);
            }
        }
    }

    pub fn parse(re: &[u8]) -> Result<Self, PatternError> {
        let re_str = str::from_utf8(re)
            .map_err(|err| PatternError::UnsupportedSyntax(err.to_string()))?;
        Ok(Self::Re((
            regex_syntax::parse(re_str).map_err(|err| {
                PatternError::UnsupportedSyntax(err.to_string())
            })?,
            re.to_vec(),
        )))
    }

    pub fn to_bytes(&self) -> Vec<u8> {
        let mut out = vec![];
        self.to_bytes_rec(&mut out);
        out
    }

    pub fn literal(prefix: &[u8]) -> PreRegex {
        Self::Bytes(prefix.to_vec())
    }

    pub fn preceding_dir_components() -> Self {
        Self::Maybe(Box::new(Self::Sequence(vec![
            Self::DotStar,
            Self::Byte(b'/'),
        ])))
    }
}