Mercurial > public > mercurial-scm > hg-stable
annotate rust/hg-core/src/encoding.rs @ 53003:155e1e8dc055
rust-nodemap: don't compute the error string unless needed
This is... really dumb and costs a ton of performance in a hot loop. It was
75% of a profile for a tip to null p1 node traversal in pure Rust.
I'm at fault, done in 652149ed64f08ee73e8fd2f76aa480ea8820fe08.
I thought clippy had a lint for this, but apparently not?
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Fri, 21 Feb 2025 13:56:11 -0500 |
parents | 94e2547e6f3d |
children |
rev | line source |
---|---|
52770 | 1 //! Character transcoding support. |
2 | |
3 use core::str; | |
4 use std::borrow::Cow; | |
5 | |
52774
94e2547e6f3d
rust: move code from utils to utils::strings
Mitchell Kember <mkember@janestreet.com>
parents:
52770
diff
changeset
|
6 use crate::{errors::HgError, utils::strings::Escaped}; |
52770 | 7 use unicode_width::UnicodeWidthStr as _; |
8 | |
9 /// String encoder and decoder. | |
10 #[derive(Copy, Clone, Debug)] | |
11 pub struct Encoder { | |
12 /// The user's local encoding. | |
13 local_encoding: Encoding, | |
14 /// What to do when decoding fails. (Encoding always uses | |
15 /// `Mode::Replace`). | |
16 decoding_mode: Mode, | |
17 /// Width to use for characters that can be interpreted either as narrow | |
18 /// or wide depending on the context. | |
19 pub ambiguous_width: Width, | |
20 } | |
21 | |
/// Character encoding.
///
/// Implements the common comparison traits so that values can be compared
/// directly (for instance in tests).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// ASCII.
    Ascii,
}
28 | |
/// Character decoding mode.
///
/// Implements the common comparison traits so that values can be compared
/// directly (for instance in tests).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
37 | |
/// The width of a Unicode character.
///
/// Implements the common comparison traits so that values can be compared
/// directly (for instance in tests).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
46 | |
47 impl Default for Encoder { | |
48 fn default() -> Self { | |
49 Self { | |
50 local_encoding: Encoding::Utf8, | |
51 decoding_mode: Mode::Strict, | |
52 ambiguous_width: Width::Narrow, | |
53 } | |
54 } | |
55 } | |
56 | |
57 impl Encoder { | |
58 /// Creates an encoder from environment variables. | |
59 pub fn from_env() -> Result<Self, HgError> { | |
60 let default = Encoder::default(); | |
61 let local_encoding = match std::env::var_os("HGENCODING") { | |
62 None => default.local_encoding, | |
63 Some(s) | |
64 if s.eq_ignore_ascii_case("utf-8") | |
65 || s.eq_ignore_ascii_case("utf8") => | |
66 { | |
67 Encoding::Utf8 | |
68 } | |
69 Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii, | |
70 Some(s) => { | |
71 return Err(HgError::unsupported(format!( | |
72 "HGENCODING value '{}' is not supported", | |
73 s.to_string_lossy() | |
74 ))) | |
75 } | |
76 }; | |
77 let decoding_mode = match std::env::var_os("HGENCODINGMODE") { | |
78 None => default.decoding_mode, | |
79 Some(s) if s == "strict" => Mode::Strict, | |
80 Some(s) if s == "replace" => Mode::Replace, | |
81 Some(s) => { | |
82 return Err(HgError::abort_simple(format!( | |
83 "HGENCODINGMODE value '{}' is not supported", | |
84 s.to_string_lossy() | |
85 ))) | |
86 } | |
87 }; | |
88 let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") { | |
89 None => default.ambiguous_width, | |
90 Some(s) if s == "narrow" => Width::Narrow, | |
91 Some(s) if s == "wide" => Width::Wide, | |
92 Some(s) => { | |
93 return Err(HgError::abort_simple(format!( | |
94 "HGENCODINGAMBIGUOUS value '{}' is not supported", | |
95 s.to_string_lossy() | |
96 ))) | |
97 } | |
98 }; | |
99 Ok(Self { | |
100 local_encoding, | |
101 decoding_mode, | |
102 ambiguous_width, | |
103 }) | |
104 } | |
105 | |
106 /// Decodes an internal UTF-8 string from bytes. | |
107 pub fn decode_internal<'a>( | |
108 &self, | |
109 bytes: &'a [u8], | |
110 ) -> Result<&'a str, HgError> { | |
111 decode_utf8(bytes).map_err(HgError::corrupted) | |
112 } | |
113 | |
114 /// Converts a string from internal UTF-8 to the local character encoding. | |
115 pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> { | |
116 match self.local_encoding { | |
117 Encoding::Utf8 => Cow::Borrowed(str.as_bytes()), | |
118 Encoding::Ascii => { | |
119 if str.is_ascii() { | |
120 Cow::Borrowed(str.as_bytes()) | |
121 } else { | |
122 Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes()) | |
123 } | |
124 } | |
125 } | |
126 } | |
127 | |
128 /// Converts a string from the local character encoding to UTF-8. | |
129 pub fn from_local<'a>( | |
130 &self, | |
131 bytes: &'a [u8], | |
132 ) -> Result<Cow<'a, str>, HgError> { | |
133 match (self.local_encoding, self.decoding_mode) { | |
134 (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed( | |
135 decode_utf8(bytes).map_err(HgError::abort_simple)?, | |
136 )), | |
137 (Encoding::Utf8, Mode::Replace) => { | |
138 Ok(String::from_utf8_lossy(bytes)) | |
139 } | |
140 (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed( | |
141 decode_ascii(bytes).map_err(HgError::abort_simple)?, | |
142 )), | |
143 (Encoding::Ascii, Mode::Replace) => { | |
144 Ok(Cow::Owned(bytes_to_ascii_lossy(bytes))) | |
145 } | |
146 } | |
147 } | |
148 | |
149 /// Returns the column width of a string for display. | |
150 pub fn column_width(&self, str: &str) -> usize { | |
151 match self.ambiguous_width { | |
152 Width::Narrow => str.width(), | |
153 Width::Wide => str.width_cjk(), | |
154 } | |
155 } | |
156 | |
157 /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise | |
158 /// just returns the length in bytes. | |
159 pub fn column_width_bytes(&self, bytes: &[u8]) -> usize { | |
160 match str::from_utf8(bytes) { | |
161 Ok(str) => self.column_width(str), | |
162 Err(_) => bytes.len(), | |
163 } | |
164 } | |
165 } | |
166 | |
167 /// Decodes bytes as UTF-8 or returns a detailed error message. | |
168 fn decode_utf8(bytes: &[u8]) -> Result<&str, String> { | |
169 str::from_utf8(bytes).map_err(|err| { | |
170 format!( | |
171 "invalid UTF-8 at offset {}: \"{}\"", | |
172 err.valid_up_to(), | |
173 str::from_utf8(&bytes.escaped_bytes()).unwrap() | |
174 ) | |
175 }) | |
176 } | |
177 | |
178 /// Decodes bytes as ASCII or returns a detailed error message. | |
179 fn decode_ascii(bytes: &[u8]) -> Result<&str, String> { | |
180 // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998 | |
181 if bytes.is_ascii() { | |
182 // Safety: Just checked that it's ASCII. | |
183 let str = unsafe { str::from_utf8_unchecked(bytes) }; | |
184 Ok(str) | |
185 } else { | |
186 Err(format!( | |
187 "invalid ASCII: \"{}\"", | |
188 str::from_utf8(&bytes.escaped_bytes()).unwrap() | |
189 )) | |
190 } | |
191 } | |
192 | |
/// Returns a copy of `str` in which each non-ASCII codepoint is replaced by
/// a single '?', whatever its encoded length.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    str.chars()
        .map(|c| if c.is_ascii() { c } else { '?' })
        .collect()
}
201 | |
/// Builds a string from `bytes`, keeping ASCII bytes as they are and
/// replacing every other byte by '?' (one '?' per byte).
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&b| if b.is_ascii() { char::from(b) } else { '?' })
        .collect()
}
210 | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Internal strings must be valid UTF-8; failures are reported as
    /// repository corruption.
    #[test]
    fn test_decode_internal() {
        let encoder = Encoder::default();
        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
        match encoder.decode_internal(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::CorruptedRepository(message)) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected a CorruptedRepository error"),
        }
    }

    /// With the default UTF-8 local encoding, strings pass through as-is.
    #[test]
    fn test_to_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
    }

    /// With an ASCII local encoding, each non-ASCII codepoint becomes '?'.
    #[test]
    fn test_to_local_ascii() {
        let encoder = Encoder {
            local_encoding: Encoding::Ascii,
            ..Default::default()
        };
        assert_eq!(encoder.to_local("abc").as_ref(), b"abc");
        assert_eq!(encoder.to_local("é").as_ref(), b"?");
    }

    /// Strict decoding of local bytes aborts on invalid UTF-8.
    #[test]
    fn test_from_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            // This branch is reached when the error is not the `Abort`
            // variant matched above, so the message must name `Abort`.
            Err(_) => panic!("expected an Abort error"),
        }
    }

    /// Replace mode substitutes U+FFFD for invalid UTF-8.
    #[test]
    fn test_from_local_replace() {
        let encoder = Encoder {
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
    }

    /// Replace mode with an ASCII local encoding substitutes '?' for each
    /// non-ASCII byte.
    #[test]
    fn test_from_local_ascii_replace() {
        let encoder = Encoder {
            local_encoding: Encoding::Ascii,
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"abc").unwrap(), "abc");
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A?");
    }

    #[test]
    fn test_column_width() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width(""), 0);
        assert_eq!(encoder.column_width("a"), 1);
        assert_eq!(encoder.column_width("ab"), 2);
        assert_eq!(encoder.column_width("été"), 3);
        assert_eq!(encoder.column_width("\u{1f496}"), 2);
    }

    /// U+2606 has ambiguous width: its column count follows the
    /// encoder's `ambiguous_width` setting.
    #[test]
    fn test_column_width_ambiguous() {
        let narrow_encoder = Encoder {
            ambiguous_width: Width::Narrow,
            ..Default::default()
        };
        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);

        let wide_encoder = Encoder {
            ambiguous_width: Width::Wide,
            ..Default::default()
        };
        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
    }

    /// Invalid UTF-8 falls back to the byte length.
    #[test]
    fn test_column_width_bytes() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width_bytes(b""), 0);
        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
    }
}