comparison rust/hg-core/src/pre_regex.rs @ 52556:1866119cbad7

rust-ignore: construct regex Hir object directly, avoiding large regex string Rework how we convert patterns to regexes in rust. Instead of going patterns -> string -> Regex, which is slow and causes some correctness issues, build a structured regex_syntax::hir::Hir value, which is faster and it also prevents surprising regex escape. This change makes the time of `build_regex_match` go from ~70-80ms to ~40ms in my testing (for a large hgignore). The bug I mentioned involves regex patterns that "escape" their intended scope. For example, a sequence of hgignore regexp patterns like this would previously lead to surprising behavior: foo(?: bar baz ) this matches foobar and foobaz, and doesn't match bar and baz. The new behavior is to report a pattern parse error The Python hg also has this bug, so this bugfix not really helping much, but it's probably better to fall back to real Python bugs than to simulate them.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Fri, 06 Dec 2024 20:27:59 +0000
parents
children b89c934e6269
comparison
equal deleted inserted replaced
52555:85c095c1f8bc 52556:1866119cbad7
1 use core::str;
2
3 use lazy_static::lazy_static;
4
5 use crate::filepatterns::PatternError;
6
7 lazy_static! {
8 static ref RE_ESCAPE: Vec<Vec<u8>> = {
9 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
10 let to_escape = b"()[]{}?*+-|^$\\.&~#\t\n\r\x0b\x0c";
11 for byte in to_escape {
12 v[*byte as usize].insert(0, b'\\');
13 }
14 v
15 };
16 }
17
18 pub fn escape_char_for_re(c: u8) -> &'static [u8] {
19 &RE_ESCAPE[c as usize]
20 }
21
22 #[derive(Debug, Clone)]
23 pub enum PreRegex {
24 Empty,
25 Dot,
26 DotStar,
27 Eof,
28 NonslashStar,
29 Byte(u8),
30 Bytes(Vec<u8>),
31 SlashOrEof,
32 Re((regex_syntax::hir::Hir, Vec<u8>)),
33 Maybe(Box<Self>),
34 Alternation(Vec<Self>),
35 Sequence(Vec<Self>),
36 }
37
38 mod to_hir {
39 use itertools::Itertools;
40 use regex_syntax::hir::{
41 Class, ClassBytes, ClassBytesRange, Dot, Hir, Look, Repetition,
42 };
43
44 use super::PreRegex;
45
46 fn hir_star(hir: Hir) -> Hir {
47 Hir::repetition(Repetition {
48 min: 0,
49 max: None,
50 greedy: false,
51 sub: Box::new(hir),
52 })
53 }
54
55 fn hir_eof() -> Hir {
56 Hir::look(Look::End)
57 }
58
59 fn hir_nonslash() -> Hir {
60 let mut class =
61 Class::Bytes(ClassBytes::new([ClassBytesRange::new(b'/', b'/')]));
62 Class::negate(&mut class);
63 Hir::class(class)
64 }
65
66 fn hir_byte(b: u8) -> Hir {
67 let class =
68 Class::Bytes(ClassBytes::new([ClassBytesRange::new(b, b)]));
69 Hir::class(class)
70 }
71
72 fn hir_literal(text: &[u8]) -> Hir {
73 let b: Box<[u8]> = Box::from(text);
74 Hir::literal(b)
75 }
76
77 pub(crate) fn to_hir(re: &PreRegex) -> regex_syntax::hir::Hir {
78 match re {
79 PreRegex::Empty => Hir::empty(),
80 PreRegex::Dot => Hir::dot(Dot::AnyByte),
81 PreRegex::DotStar => hir_star(Hir::dot(Dot::AnyByte)),
82 PreRegex::Eof => hir_eof(),
83 PreRegex::NonslashStar => hir_star(hir_nonslash()),
84 PreRegex::Byte(b) => hir_byte(*b),
85 PreRegex::Bytes(bs) => hir_literal(bs),
86 PreRegex::SlashOrEof => {
87 Hir::alternation(vec![hir_byte(b'/'), hir_eof()])
88 }
89 PreRegex::Re((hir, _)) => hir.clone(),
90 PreRegex::Maybe(s) => {
91 Hir::alternation(vec![Hir::empty(), s.to_hir()])
92 }
93 PreRegex::Alternation(alt) => {
94 let alt = alt.iter().map(|r| r.to_hir()).collect_vec();
95 Hir::alternation(alt)
96 }
97 PreRegex::Sequence(seq) => {
98 let seq = seq.iter().map(|r| r.to_hir()).collect_vec();
99 Hir::concat(seq)
100 }
101 }
102 }
103 }
104
105 impl PreRegex {
106 pub fn to_hir(&self) -> regex_syntax::hir::Hir {
107 to_hir::to_hir(self)
108 }
109
110 fn to_bytes_rec(&self, out: &mut Vec<u8>) {
111 match self {
112 PreRegex::Empty => (),
113 PreRegex::Dot => out.push(b'.'),
114 PreRegex::DotStar => out.extend_from_slice(&b".*"[..]),
115 PreRegex::Eof => out.push(b'$'),
116 PreRegex::NonslashStar => out.extend_from_slice(&b"[^/]*"[..]),
117 PreRegex::Byte(b) => out.extend_from_slice(escape_char_for_re(*b)),
118 PreRegex::Bytes(bytes) => {
119 for b in bytes {
120 out.extend_from_slice(escape_char_for_re(*b))
121 }
122 }
123 PreRegex::SlashOrEof => out.extend_from_slice(&b"(?:/|$)"[..]),
124 PreRegex::Re((_hir, src)) => out.extend_from_slice(src),
125 PreRegex::Alternation(alt) => {
126 if alt.is_empty() {
127 // something that can never match
128 out.extend_from_slice(&b" ^"[..])
129 } else {
130 out.extend_from_slice(&b"(?:"[..]);
131 let mut first = true;
132 for r in alt {
133 if first {
134 first = false
135 } else {
136 out.extend_from_slice(&b"|"[..]);
137 }
138 r.to_bytes_rec(out)
139 }
140 out.extend_from_slice(&b")"[..]);
141 }
142 }
143 PreRegex::Sequence(seq) => {
144 for r in seq {
145 r.to_bytes_rec(out)
146 }
147 }
148 PreRegex::Maybe(r) => {
149 out.extend_from_slice(&b"(?:"[..]);
150 r.to_bytes_rec(out);
151 out.extend_from_slice(&b")?"[..]);
152 }
153 }
154 }
155
156 pub fn parse(re: &[u8]) -> Result<Self, PatternError> {
157 let re_str = str::from_utf8(re)
158 .map_err(|err| PatternError::UnsupportedSyntax(err.to_string()))?;
159 Ok(Self::Re((
160 regex_syntax::parse(re_str).map_err(|err| {
161 PatternError::UnsupportedSyntax(err.to_string())
162 })?,
163 re.to_vec(),
164 )))
165 }
166
167 pub fn to_bytes(&self) -> Vec<u8> {
168 let mut out = vec![];
169 self.to_bytes_rec(&mut out);
170 out
171 }
172
173 pub fn literal(prefix: &[u8]) -> PreRegex {
174 Self::Bytes(prefix.to_vec())
175 }
176
177 pub fn preceding_dir_components() -> Self {
178 Self::Maybe(Box::new(Self::Sequence(vec![
179 Self::DotStar,
180 Self::Byte(b'/'),
181 ])))
182 }
183 }