comparison rust/hg-core/src/filepatterns.rs @ 42437:9609430d3625

rust-filepatterns: use bytes instead of String. In my initial patch, I introduced an unnecessary hard constraint on UTF-8 filenames and patterns, which I forgot to remove. Although the performance penalty for using String might be negligible, we don't want to break compatibility with non-UTF-8 encodings for no reason. Moreover, this change allows for a cleaner Rust core API. This patch introduces a new utils module that is used with this fix. Finally, PatternError was not put inside the Python module generated by Rust, which would have raised a NameError. Differential Revision: https://phab.mercurial-scm.org/D6485
author Raphaël Gomès <rgomes@octobus.net>
date Thu, 06 Jun 2019 15:30:56 +0200
parents e8f3740cc067
children 48f1f864d928
comparison
equal deleted inserted replaced
42436:dc5bd66a8270 42437:9609430d3625
1 use crate::{LineNumber, PatternError, PatternFileError}; 1 use crate::{LineNumber, PatternError, PatternFileError};
2 use regex::Regex; 2 use regex::bytes::Regex;
3 use std::collections::HashMap; 3 use std::collections::HashMap;
4 use std::fs::File; 4 use std::fs::File;
5 use std::io::Read; 5 use std::io::Read;
6 use std::vec::Vec; 6 use std::vec::Vec;
7 use utils::files::get_path_from_bytes;
8 use utils::{replace_slice, SliceExt};
7 9
8 lazy_static! { 10 lazy_static! {
9 static ref reescape: Vec<Vec<u8>> = { 11 static ref reescape: Vec<Vec<u8>> = {
10 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect(); 12 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
11 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c"; 13 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
190 [b'*', b'?', b'[', b']', b'{', b'}', b'\\']; 192 [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
191 193
192 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs 194 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
193 /// that don't need to be transformed into a regex. 195 /// that don't need to be transformed into a regex.
194 pub fn build_single_regex( 196 pub fn build_single_regex(
195 kind: &str, 197 kind: &[u8],
196 pat: &[u8], 198 pat: &[u8],
197 globsuffix: &[u8], 199 globsuffix: &[u8],
198 ) -> Result<Vec<u8>, PatternError> { 200 ) -> Result<Vec<u8>, PatternError> {
199 let enum_kind = parse_pattern_syntax(kind.as_bytes())?; 201 let enum_kind = parse_pattern_syntax(kind)?;
200 if enum_kind == PatternSyntax::RootGlob 202 if enum_kind == PatternSyntax::RootGlob
201 && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) 203 && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
202 { 204 {
203 Ok(pat.to_vec()) 205 Ok(pat.to_vec())
204 } else { 206 } else {
205 Ok(_build_single_regex(enum_kind, pat, globsuffix)) 207 Ok(_build_single_regex(enum_kind, pat, globsuffix))
206 } 208 }
207 } 209 }
208 210
209 lazy_static! { 211 lazy_static! {
210 static ref SYNTAXES: HashMap<&'static str, &'static str> = { 212 static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = {
211 let mut m = HashMap::new(); 213 let mut m = HashMap::new();
212 214
213 m.insert("re", "relre:"); 215 m.insert(b"re".as_ref(), b"relre:".as_ref());
214 m.insert("regexp", "relre:"); 216 m.insert(b"regexp".as_ref(), b"relre:".as_ref());
215 m.insert("glob", "relglob:"); 217 m.insert(b"glob".as_ref(), b"relglob:".as_ref());
216 m.insert("rootglob", "rootglob:"); 218 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
217 m.insert("include", "include"); 219 m.insert(b"include".as_ref(), b"include".as_ref());
218 m.insert("subinclude", "subinclude"); 220 m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
219 m 221 m
220 }; 222 };
221 } 223 }
222 224
223 pub type PatternTuple = (String, LineNumber, String); 225 pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
224 type WarningTuple = (String, String); 226 type WarningTuple = (String, String);
225 227
226 pub fn parse_pattern_file_contents( 228 pub fn parse_pattern_file_contents(
227 lines: &str, 229 lines: &[u8],
228 file_path: &str, 230 file_path: &[u8],
229 warn: bool, 231 warn: bool,
230 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) { 232 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
231 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap(); 233 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
232 let mut inputs: Vec<PatternTuple> = vec![]; 234 let mut inputs: Vec<PatternTuple> = vec![];
233 let mut warnings: Vec<WarningTuple> = vec![]; 235 let mut warnings: Vec<WarningTuple> = vec![];
234 236
235 let mut current_syntax = "relre:"; 237 let mut current_syntax = b"relre:".as_ref();
236 238
237 let mut line = String::new(); 239 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
238 for (line_number, line_str) in lines.split('\n').enumerate() {
239 let line_number = line_number + 1; 240 let line_number = line_number + 1;
240 line.replace_range(.., line_str); 241
241 242 if line.contains(&('#' as u8)) {
242 if line.contains('#') { 243 if let Some(cap) = comment_regex.captures(line) {
243 if let Some(cap) = comment_regex.captures(line.clone().as_ref()) { 244 line = &line[..cap.get(1).unwrap().end()]
244 line = line[..cap.get(1).unwrap().end()].to_string() 245 }
245 } 246 let mut line = line.to_owned();
246 line = line.replace(r"\#", "#"); 247 replace_slice(&mut line, br"\#", b"#");
247 } 248 }
248 249
249 let mut line = line.trim_end(); 250 let mut line = line.trim_end();
250 251
251 if line.is_empty() { 252 if line.is_empty() {
252 continue; 253 continue;
253 } 254 }
254 255
255 if line.starts_with("syntax:") { 256 if line.starts_with(b"syntax:") {
256 let syntax = line["syntax:".len()..].trim(); 257 let syntax = line[b"syntax:".len()..].trim();
257 258
258 if let Some(rel_syntax) = SYNTAXES.get(syntax) { 259 if let Some(rel_syntax) = SYNTAXES.get(syntax) {
259 current_syntax = rel_syntax; 260 current_syntax = rel_syntax;
260 } else if warn { 261 } else if warn {
261 warnings.push((file_path.to_string(), syntax.to_string())); 262 warnings.push((
263 String::from_utf8_lossy(file_path).to_string(),
264 String::from_utf8_lossy(syntax).to_string(),
265 ));
262 } 266 }
263 continue; 267 continue;
264 } 268 }
265 269
266 let mut line_syntax: &str = &current_syntax; 270 let mut line_syntax: &[u8] = &current_syntax;
267 271
268 for (s, rels) in SYNTAXES.iter() { 272 for (s, rels) in SYNTAXES.iter() {
269 if line.starts_with(rels) { 273 if line.starts_with(rels) {
270 line_syntax = rels; 274 line_syntax = rels;
271 line = &line[rels.len()..]; 275 line = &line[rels.len()..];
272 break; 276 break;
273 } else if line.starts_with(&format!("{}:", s)) { 277 } else if line.starts_with(&[s, b":".as_ref()].concat()) {
274 line_syntax = rels; 278 line_syntax = rels;
275 line = &line[s.len() + 1..]; 279 line = &line[s.len() + 1..];
276 break; 280 break;
277 } 281 }
278 } 282 }
279 283
280 inputs.push(( 284 inputs.push((
281 format!("{}{}", line_syntax, line), 285 [line_syntax, line].concat(),
282 line_number, 286 line_number,
283 line.to_string(), 287 line.to_owned(),
284 )); 288 ));
285 } 289 }
286 (inputs, warnings) 290 (inputs, warnings)
287 } 291 }
288 292
289 pub fn read_pattern_file( 293 pub fn read_pattern_file(
290 file_path: String, 294 file_path: &[u8],
291 warn: bool, 295 warn: bool,
292 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> { 296 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
293 let mut f = File::open(&file_path)?; 297 let mut f = File::open(get_path_from_bytes(file_path))?;
294 let mut contents = String::new(); 298 let mut contents = Vec::new();
295 299
296 f.read_to_string(&mut contents)?; 300 f.read_to_end(&mut contents)?;
297 301
298 Ok(parse_pattern_file_contents(&contents, &file_path, warn)) 302 Ok(parse_pattern_file_contents(&contents, file_path, warn))
299 } 303 }
300 304
301 #[cfg(test)] 305 #[cfg(test)]
302 mod tests { 306 mod tests {
303 use super::*; 307 use super::*;
326 assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#); 330 assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#);
327 } 331 }
328 332
329 #[test] 333 #[test]
330 fn test_parse_pattern_file_contents() { 334 fn test_parse_pattern_file_contents() {
331 let lines = "syntax: glob\n*.elc"; 335 let lines = b"syntax: glob\n*.elc";
332 336
333 assert_eq!( 337 assert_eq!(
334 vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())], 338 vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
335 parse_pattern_file_contents(lines, "file_path", false).0, 339 parse_pattern_file_contents(lines, b"file_path", false).0,
336 ); 340 );
337 341
338 let lines = "syntax: include\nsyntax: glob"; 342 let lines = b"syntax: include\nsyntax: glob";
339 343
340 assert_eq!( 344 assert_eq!(
341 parse_pattern_file_contents(lines, "file_path", false).0, 345 parse_pattern_file_contents(lines, b"file_path", false).0,
342 vec![] 346 vec![]
343 ); 347 );
344 } 348 let lines = b"glob:**.o";
345 } 349 assert_eq!(
350 parse_pattern_file_contents(lines, b"file_path", false).0,
351 vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
352 );
353 }
354 }