Mercurial > public > mercurial-scm > hg
view rust/hg-core/src/pre_regex.rs @ 52556:1866119cbad7
rust-ignore: construct regex Hir object directly, avoiding large regex string
Rework how we convert patterns to regexes in rust.
Instead of going patterns -> string -> Regex, which is slow and causes
some correctness issues, build a structured regex_syntax::hir::Hir value,
which is faster and it also prevents surprising regex escape.
This change makes the time of `build_regex_match` go from ~70-80ms
to ~40ms in my testing (for a large hgignore).
The bug I mentioned involves regex patterns that "escape" their
intended scope. For example, a sequence of hgignore regexp patterns like
this would previously lead to surprising behavior:
foo(?:
bar
baz
)
this matches foobar and foobaz, and doesn't match bar and baz.
The new behavior is to report a pattern parse error
The Python hg also has this bug, so this bugfix
not really helping much, but it's probably better to
fall back to real Python bugs than to simulate them.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Fri, 06 Dec 2024 20:27:59 +0000 |
parents | |
children | b89c934e6269 |
line wrap: on
line source
use core::str; use lazy_static::lazy_static; use crate::filepatterns::PatternError; lazy_static! { static ref RE_ESCAPE: Vec<Vec<u8>> = { let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect(); let to_escape = b"()[]{}?*+-|^$\\.&~#\t\n\r\x0b\x0c"; for byte in to_escape { v[*byte as usize].insert(0, b'\\'); } v }; } pub fn escape_char_for_re(c: u8) -> &'static [u8] { &RE_ESCAPE[c as usize] } #[derive(Debug, Clone)] pub enum PreRegex { Empty, Dot, DotStar, Eof, NonslashStar, Byte(u8), Bytes(Vec<u8>), SlashOrEof, Re((regex_syntax::hir::Hir, Vec<u8>)), Maybe(Box<Self>), Alternation(Vec<Self>), Sequence(Vec<Self>), } mod to_hir { use itertools::Itertools; use regex_syntax::hir::{ Class, ClassBytes, ClassBytesRange, Dot, Hir, Look, Repetition, }; use super::PreRegex; fn hir_star(hir: Hir) -> Hir { Hir::repetition(Repetition { min: 0, max: None, greedy: false, sub: Box::new(hir), }) } fn hir_eof() -> Hir { Hir::look(Look::End) } fn hir_nonslash() -> Hir { let mut class = Class::Bytes(ClassBytes::new([ClassBytesRange::new(b'/', b'/')])); Class::negate(&mut class); Hir::class(class) } fn hir_byte(b: u8) -> Hir { let class = Class::Bytes(ClassBytes::new([ClassBytesRange::new(b, b)])); Hir::class(class) } fn hir_literal(text: &[u8]) -> Hir { let b: Box<[u8]> = Box::from(text); Hir::literal(b) } pub(crate) fn to_hir(re: &PreRegex) -> regex_syntax::hir::Hir { match re { PreRegex::Empty => Hir::empty(), PreRegex::Dot => Hir::dot(Dot::AnyByte), PreRegex::DotStar => hir_star(Hir::dot(Dot::AnyByte)), PreRegex::Eof => hir_eof(), PreRegex::NonslashStar => hir_star(hir_nonslash()), PreRegex::Byte(b) => hir_byte(*b), PreRegex::Bytes(bs) => hir_literal(bs), PreRegex::SlashOrEof => { Hir::alternation(vec![hir_byte(b'/'), hir_eof()]) } PreRegex::Re((hir, _)) => hir.clone(), PreRegex::Maybe(s) => { Hir::alternation(vec![Hir::empty(), s.to_hir()]) } PreRegex::Alternation(alt) => { let alt = alt.iter().map(|r| r.to_hir()).collect_vec(); Hir::alternation(alt) } PreRegex::Sequence(seq) => { let seq = seq.iter().map(|r| r.to_hir()).collect_vec(); Hir::concat(seq) } } } } impl PreRegex { pub fn to_hir(&self) -> regex_syntax::hir::Hir { to_hir::to_hir(self) } fn to_bytes_rec(&self, out: &mut Vec<u8>) { match self { PreRegex::Empty => (), PreRegex::Dot => out.push(b'.'), PreRegex::DotStar => out.extend_from_slice(&b".*"[..]), PreRegex::Eof => out.push(b'$'), PreRegex::NonslashStar => out.extend_from_slice(&b"[^/]*"[..]), PreRegex::Byte(b) => out.extend_from_slice(escape_char_for_re(*b)), PreRegex::Bytes(bytes) => { for b in bytes { out.extend_from_slice(escape_char_for_re(*b)) } } PreRegex::SlashOrEof => out.extend_from_slice(&b"(?:/|$)"[..]), PreRegex::Re((_hir, src)) => out.extend_from_slice(src), PreRegex::Alternation(alt) => { if alt.is_empty() { // something that can never match out.extend_from_slice(&b" ^"[..]) } else { out.extend_from_slice(&b"(?:"[..]); let mut first = true; for r in alt { if first { first = false } else { out.extend_from_slice(&b"|"[..]); } r.to_bytes_rec(out) } out.extend_from_slice(&b")"[..]); } } PreRegex::Sequence(seq) => { for r in seq { r.to_bytes_rec(out) } } PreRegex::Maybe(r) => { out.extend_from_slice(&b"(?:"[..]); r.to_bytes_rec(out); out.extend_from_slice(&b")?"[..]); } } } pub fn parse(re: &[u8]) -> Result<Self, PatternError> { let re_str = str::from_utf8(re) .map_err(|err| PatternError::UnsupportedSyntax(err.to_string()))?; Ok(Self::Re(( regex_syntax::parse(re_str).map_err(|err| { PatternError::UnsupportedSyntax(err.to_string()) })?, re.to_vec(), ))) } pub fn to_bytes(&self) -> Vec<u8> { let mut out = vec![]; self.to_bytes_rec(&mut out); out } pub fn literal(prefix: &[u8]) -> PreRegex { Self::Bytes(prefix.to_vec()) } pub fn preceding_dir_components() -> Self { Self::Maybe(Box::new(Self::Sequence(vec![ Self::DotStar, Self::Byte(b'/'), ]))) } }