Mercurial > public > mercurial-scm > hg
comparison rust/hg-core/src/filepatterns.rs @ 42437:9609430d3625
rust-filepatterns: use bytes instead of String
In my initial patch, I introduced an unnecessary hard constraint on UTF-8
filenames and patterns which I forgot to remove. Although the performance
penalty for using String might be negligible, we don't want to break
compatibility with non-UTF-8 encodings for no reason.
Moreover, this change allows for a cleaner Rust core API.
This patch introduces a new utils module that is used with this fix.
Finally, PatternError was not put inside the Python module generated by
Rust, which would have raised a NameError.
Differential Revision: https://phab.mercurial-scm.org/D6485
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Thu, 06 Jun 2019 15:30:56 +0200 |
parents | e8f3740cc067 |
children | 48f1f864d928 |
comparison
equal
deleted
inserted
replaced
42436:dc5bd66a8270 | 42437:9609430d3625 |
---|---|
1 use crate::{LineNumber, PatternError, PatternFileError}; | 1 use crate::{LineNumber, PatternError, PatternFileError}; |
2 use regex::Regex; | 2 use regex::bytes::Regex; |
3 use std::collections::HashMap; | 3 use std::collections::HashMap; |
4 use std::fs::File; | 4 use std::fs::File; |
5 use std::io::Read; | 5 use std::io::Read; |
6 use std::vec::Vec; | 6 use std::vec::Vec; |
7 use utils::files::get_path_from_bytes; | |
8 use utils::{replace_slice, SliceExt}; | |
7 | 9 |
8 lazy_static! { | 10 lazy_static! { |
9 static ref reescape: Vec<Vec<u8>> = { | 11 static ref reescape: Vec<Vec<u8>> = { |
10 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect(); | 12 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect(); |
11 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c"; | 13 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c"; |
190 [b'*', b'?', b'[', b']', b'{', b'}', b'\\']; | 192 [b'*', b'?', b'[', b']', b'{', b'}', b'\\']; |
191 | 193 |
192 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs | 194 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs |
193 /// that don't need to be transformed into a regex. | 195 /// that don't need to be transformed into a regex. |
194 pub fn build_single_regex( | 196 pub fn build_single_regex( |
195 kind: &str, | 197 kind: &[u8], |
196 pat: &[u8], | 198 pat: &[u8], |
197 globsuffix: &[u8], | 199 globsuffix: &[u8], |
198 ) -> Result<Vec<u8>, PatternError> { | 200 ) -> Result<Vec<u8>, PatternError> { |
199 let enum_kind = parse_pattern_syntax(kind.as_bytes())?; | 201 let enum_kind = parse_pattern_syntax(kind)?; |
200 if enum_kind == PatternSyntax::RootGlob | 202 if enum_kind == PatternSyntax::RootGlob |
201 && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) | 203 && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) |
202 { | 204 { |
203 Ok(pat.to_vec()) | 205 Ok(pat.to_vec()) |
204 } else { | 206 } else { |
205 Ok(_build_single_regex(enum_kind, pat, globsuffix)) | 207 Ok(_build_single_regex(enum_kind, pat, globsuffix)) |
206 } | 208 } |
207 } | 209 } |
208 | 210 |
209 lazy_static! { | 211 lazy_static! { |
210 static ref SYNTAXES: HashMap<&'static str, &'static str> = { | 212 static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = { |
211 let mut m = HashMap::new(); | 213 let mut m = HashMap::new(); |
212 | 214 |
213 m.insert("re", "relre:"); | 215 m.insert(b"re".as_ref(), b"relre:".as_ref()); |
214 m.insert("regexp", "relre:"); | 216 m.insert(b"regexp".as_ref(), b"relre:".as_ref()); |
215 m.insert("glob", "relglob:"); | 217 m.insert(b"glob".as_ref(), b"relglob:".as_ref()); |
216 m.insert("rootglob", "rootglob:"); | 218 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref()); |
217 m.insert("include", "include"); | 219 m.insert(b"include".as_ref(), b"include".as_ref()); |
218 m.insert("subinclude", "subinclude"); | 220 m.insert(b"subinclude".as_ref(), b"subinclude".as_ref()); |
219 m | 221 m |
220 }; | 222 }; |
221 } | 223 } |
222 | 224 |
223 pub type PatternTuple = (String, LineNumber, String); | 225 pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>); |
224 type WarningTuple = (String, String); | 226 type WarningTuple = (String, String); |
225 | 227 |
226 pub fn parse_pattern_file_contents( | 228 pub fn parse_pattern_file_contents( |
227 lines: &str, | 229 lines: &[u8], |
228 file_path: &str, | 230 file_path: &[u8], |
229 warn: bool, | 231 warn: bool, |
230 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) { | 232 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) { |
231 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap(); | 233 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap(); |
232 let mut inputs: Vec<PatternTuple> = vec![]; | 234 let mut inputs: Vec<PatternTuple> = vec![]; |
233 let mut warnings: Vec<WarningTuple> = vec![]; | 235 let mut warnings: Vec<WarningTuple> = vec![]; |
234 | 236 |
235 let mut current_syntax = "relre:"; | 237 let mut current_syntax = b"relre:".as_ref(); |
236 | 238 |
237 let mut line = String::new(); | 239 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() { |
238 for (line_number, line_str) in lines.split('\n').enumerate() { | |
239 let line_number = line_number + 1; | 240 let line_number = line_number + 1; |
240 line.replace_range(.., line_str); | 241 |
241 | 242 if line.contains(&('#' as u8)) { |
242 if line.contains('#') { | 243 if let Some(cap) = comment_regex.captures(line) { |
243 if let Some(cap) = comment_regex.captures(line.clone().as_ref()) { | 244 line = &line[..cap.get(1).unwrap().end()] |
244 line = line[..cap.get(1).unwrap().end()].to_string() | 245 } |
245 } | 246 let mut line = line.to_owned(); |
246 line = line.replace(r"\#", "#"); | 247 replace_slice(&mut line, br"\#", b"#"); |
247 } | 248 } |
248 | 249 |
249 let mut line = line.trim_end(); | 250 let mut line = line.trim_end(); |
250 | 251 |
251 if line.is_empty() { | 252 if line.is_empty() { |
252 continue; | 253 continue; |
253 } | 254 } |
254 | 255 |
255 if line.starts_with("syntax:") { | 256 if line.starts_with(b"syntax:") { |
256 let syntax = line["syntax:".len()..].trim(); | 257 let syntax = line[b"syntax:".len()..].trim(); |
257 | 258 |
258 if let Some(rel_syntax) = SYNTAXES.get(syntax) { | 259 if let Some(rel_syntax) = SYNTAXES.get(syntax) { |
259 current_syntax = rel_syntax; | 260 current_syntax = rel_syntax; |
260 } else if warn { | 261 } else if warn { |
261 warnings.push((file_path.to_string(), syntax.to_string())); | 262 warnings.push(( |
263 String::from_utf8_lossy(file_path).to_string(), | |
264 String::from_utf8_lossy(syntax).to_string(), | |
265 )); | |
262 } | 266 } |
263 continue; | 267 continue; |
264 } | 268 } |
265 | 269 |
266 let mut line_syntax: &str = &current_syntax; | 270 let mut line_syntax: &[u8] = &current_syntax; |
267 | 271 |
268 for (s, rels) in SYNTAXES.iter() { | 272 for (s, rels) in SYNTAXES.iter() { |
269 if line.starts_with(rels) { | 273 if line.starts_with(rels) { |
270 line_syntax = rels; | 274 line_syntax = rels; |
271 line = &line[rels.len()..]; | 275 line = &line[rels.len()..]; |
272 break; | 276 break; |
273 } else if line.starts_with(&format!("{}:", s)) { | 277 } else if line.starts_with(&[s, b":".as_ref()].concat()) { |
274 line_syntax = rels; | 278 line_syntax = rels; |
275 line = &line[s.len() + 1..]; | 279 line = &line[s.len() + 1..]; |
276 break; | 280 break; |
277 } | 281 } |
278 } | 282 } |
279 | 283 |
280 inputs.push(( | 284 inputs.push(( |
281 format!("{}{}", line_syntax, line), | 285 [line_syntax, line].concat(), |
282 line_number, | 286 line_number, |
283 line.to_string(), | 287 line.to_owned(), |
284 )); | 288 )); |
285 } | 289 } |
286 (inputs, warnings) | 290 (inputs, warnings) |
287 } | 291 } |
288 | 292 |
289 pub fn read_pattern_file( | 293 pub fn read_pattern_file( |
290 file_path: String, | 294 file_path: &[u8], |
291 warn: bool, | 295 warn: bool, |
292 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> { | 296 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> { |
293 let mut f = File::open(&file_path)?; | 297 let mut f = File::open(get_path_from_bytes(file_path))?; |
294 let mut contents = String::new(); | 298 let mut contents = Vec::new(); |
295 | 299 |
296 f.read_to_string(&mut contents)?; | 300 f.read_to_end(&mut contents)?; |
297 | 301 |
298 Ok(parse_pattern_file_contents(&contents, &file_path, warn)) | 302 Ok(parse_pattern_file_contents(&contents, file_path, warn)) |
299 } | 303 } |
300 | 304 |
301 #[cfg(test)] | 305 #[cfg(test)] |
302 mod tests { | 306 mod tests { |
303 use super::*; | 307 use super::*; |
326 assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#); | 330 assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#); |
327 } | 331 } |
328 | 332 |
329 #[test] | 333 #[test] |
330 fn test_parse_pattern_file_contents() { | 334 fn test_parse_pattern_file_contents() { |
331 let lines = "syntax: glob\n*.elc"; | 335 let lines = b"syntax: glob\n*.elc"; |
332 | 336 |
333 assert_eq!( | 337 assert_eq!( |
334 vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())], | 338 vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())], |
335 parse_pattern_file_contents(lines, "file_path", false).0, | 339 parse_pattern_file_contents(lines, b"file_path", false).0, |
336 ); | 340 ); |
337 | 341 |
338 let lines = "syntax: include\nsyntax: glob"; | 342 let lines = b"syntax: include\nsyntax: glob"; |
339 | 343 |
340 assert_eq!( | 344 assert_eq!( |
341 parse_pattern_file_contents(lines, "file_path", false).0, | 345 parse_pattern_file_contents(lines, b"file_path", false).0, |
342 vec![] | 346 vec![] |
343 ); | 347 ); |
344 } | 348 let lines = b"glob:**.o"; |
345 } | 349 assert_eq!( |
350 parse_pattern_file_contents(lines, b"file_path", false).0, | |
351 vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())] | |
352 ); | |
353 } | |
354 } |