Mercurial > public > mercurial-scm > hg
comparison rust/hg-core/src/encoding.rs @ 52756:bbf1c52252ae
rust: add encoding.rs
This is based on encoding.py. It reads the environment variables HGENCODING,
HGENCODINGMODE, and HGENCODINGAMBIGUOUS. Currently it only supports UTF-8 and
ascii, but it could be extended to support other local encodings.
Unlike Python, it assumes all internal strings are UTF-8 and does not attempt to
fall back to latin-1 (or ui.fallbackencoding).
Nothing is using this now, but in the future command output and error messages
should transition to using it.
I replaced existing calls to `utf8_to_local` and `local_to_utf8` with direct
String/bytes methods since they were not logically converting between internal
and local encodings. Instead, they were used (for example) when an error message
happened to be stored as String but needed to be passed somewhere as bytes. The
proper fix for this will be to avoid String in the first place.
author | Mitchell Kember <mkember@janestreet.com> |
---|---|
date | Wed, 05 Feb 2025 17:35:52 -0500 |
parents | |
children | 94e2547e6f3d |
comparison
equal
deleted
inserted
replaced
52755:1b7a57a5b47a | 52756:bbf1c52252ae |
---|---|
1 //! Character transcoding support. | |
2 | |
3 use core::str; | |
4 use std::borrow::Cow; | |
5 | |
6 use crate::{errors::HgError, utils::Escaped}; | |
7 use unicode_width::UnicodeWidthStr as _; | |
8 | |
9 /// String encoder and decoder. | |
10 #[derive(Copy, Clone, Debug)] | |
11 pub struct Encoder { | |
12 /// The user's local encoding. | |
13 local_encoding: Encoding, | |
14 /// What to do when decoding fails. (Encoding always uses | |
15 /// `Mode::Replace`). | |
16 decoding_mode: Mode, | |
17 /// Width to use for characters that can be interpreted either as narrow | |
18 /// or wide depending on the context. | |
19 pub ambiguous_width: Width, | |
20 } | |
21 | |
/// Character encoding.
///
/// Only the encodings listed here are supported as a local encoding.
/// Values can be compared with `==`, which is convenient for callers and
/// tests that need to check which encoding is in use.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// 7-bit ASCII.
    Ascii,
}
28 | |
/// Character decoding mode.
///
/// Values can be compared with `==`, which is convenient for callers and
/// tests that need to check which mode is in use.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
37 | |
/// The width of a Unicode character.
///
/// Values can be compared with `==`, which is convenient for callers and
/// tests that need to check which width is configured.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
46 | |
47 impl Default for Encoder { | |
48 fn default() -> Self { | |
49 Self { | |
50 local_encoding: Encoding::Utf8, | |
51 decoding_mode: Mode::Strict, | |
52 ambiguous_width: Width::Narrow, | |
53 } | |
54 } | |
55 } | |
56 | |
57 impl Encoder { | |
58 /// Creates an encoder from environment variables. | |
59 pub fn from_env() -> Result<Self, HgError> { | |
60 let default = Encoder::default(); | |
61 let local_encoding = match std::env::var_os("HGENCODING") { | |
62 None => default.local_encoding, | |
63 Some(s) | |
64 if s.eq_ignore_ascii_case("utf-8") | |
65 || s.eq_ignore_ascii_case("utf8") => | |
66 { | |
67 Encoding::Utf8 | |
68 } | |
69 Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii, | |
70 Some(s) => { | |
71 return Err(HgError::unsupported(format!( | |
72 "HGENCODING value '{}' is not supported", | |
73 s.to_string_lossy() | |
74 ))) | |
75 } | |
76 }; | |
77 let decoding_mode = match std::env::var_os("HGENCODINGMODE") { | |
78 None => default.decoding_mode, | |
79 Some(s) if s == "strict" => Mode::Strict, | |
80 Some(s) if s == "replace" => Mode::Replace, | |
81 Some(s) => { | |
82 return Err(HgError::abort_simple(format!( | |
83 "HGENCODINGMODE value '{}' is not supported", | |
84 s.to_string_lossy() | |
85 ))) | |
86 } | |
87 }; | |
88 let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") { | |
89 None => default.ambiguous_width, | |
90 Some(s) if s == "narrow" => Width::Narrow, | |
91 Some(s) if s == "wide" => Width::Wide, | |
92 Some(s) => { | |
93 return Err(HgError::abort_simple(format!( | |
94 "HGENCODINGAMBIGUOUS value '{}' is not supported", | |
95 s.to_string_lossy() | |
96 ))) | |
97 } | |
98 }; | |
99 Ok(Self { | |
100 local_encoding, | |
101 decoding_mode, | |
102 ambiguous_width, | |
103 }) | |
104 } | |
105 | |
106 /// Decodes an internal UTF-8 string from bytes. | |
107 pub fn decode_internal<'a>( | |
108 &self, | |
109 bytes: &'a [u8], | |
110 ) -> Result<&'a str, HgError> { | |
111 decode_utf8(bytes).map_err(HgError::corrupted) | |
112 } | |
113 | |
114 /// Converts a string from internal UTF-8 to the local character encoding. | |
115 pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> { | |
116 match self.local_encoding { | |
117 Encoding::Utf8 => Cow::Borrowed(str.as_bytes()), | |
118 Encoding::Ascii => { | |
119 if str.is_ascii() { | |
120 Cow::Borrowed(str.as_bytes()) | |
121 } else { | |
122 Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes()) | |
123 } | |
124 } | |
125 } | |
126 } | |
127 | |
128 /// Converts a string from the local character encoding to UTF-8. | |
129 pub fn from_local<'a>( | |
130 &self, | |
131 bytes: &'a [u8], | |
132 ) -> Result<Cow<'a, str>, HgError> { | |
133 match (self.local_encoding, self.decoding_mode) { | |
134 (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed( | |
135 decode_utf8(bytes).map_err(HgError::abort_simple)?, | |
136 )), | |
137 (Encoding::Utf8, Mode::Replace) => { | |
138 Ok(String::from_utf8_lossy(bytes)) | |
139 } | |
140 (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed( | |
141 decode_ascii(bytes).map_err(HgError::abort_simple)?, | |
142 )), | |
143 (Encoding::Ascii, Mode::Replace) => { | |
144 Ok(Cow::Owned(bytes_to_ascii_lossy(bytes))) | |
145 } | |
146 } | |
147 } | |
148 | |
149 /// Returns the column width of a string for display. | |
150 pub fn column_width(&self, str: &str) -> usize { | |
151 match self.ambiguous_width { | |
152 Width::Narrow => str.width(), | |
153 Width::Wide => str.width_cjk(), | |
154 } | |
155 } | |
156 | |
157 /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise | |
158 /// just returns the length in bytes. | |
159 pub fn column_width_bytes(&self, bytes: &[u8]) -> usize { | |
160 match str::from_utf8(bytes) { | |
161 Ok(str) => self.column_width(str), | |
162 Err(_) => bytes.len(), | |
163 } | |
164 } | |
165 } | |
166 | |
167 /// Decodes bytes as UTF-8 or returns a detailed error message. | |
168 fn decode_utf8(bytes: &[u8]) -> Result<&str, String> { | |
169 str::from_utf8(bytes).map_err(|err| { | |
170 format!( | |
171 "invalid UTF-8 at offset {}: \"{}\"", | |
172 err.valid_up_to(), | |
173 str::from_utf8(&bytes.escaped_bytes()).unwrap() | |
174 ) | |
175 }) | |
176 } | |
177 | |
178 /// Decodes bytes as ASCII or returns a detailed error message. | |
179 fn decode_ascii(bytes: &[u8]) -> Result<&str, String> { | |
180 // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998 | |
181 if bytes.is_ascii() { | |
182 // Safety: Just checked that it's ASCII. | |
183 let str = unsafe { str::from_utf8_unchecked(bytes) }; | |
184 Ok(str) | |
185 } else { | |
186 Err(format!( | |
187 "invalid ASCII: \"{}\"", | |
188 str::from_utf8(&bytes.escaped_bytes()).unwrap() | |
189 )) | |
190 } | |
191 } | |
192 | |
/// Returns a copy of `str` in which every non-ASCII codepoint is replaced
/// by a single '?', regardless of how many bytes it occupied.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    str.chars()
        .map(|codepoint| if codepoint.is_ascii() { codepoint } else { '?' })
        .collect()
}
201 | |
/// Builds a string from `bytes`, keeping ASCII bytes as they are and
/// substituting one '?' for each non-ASCII byte.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&byte| if byte.is_ascii() { char::from(byte) } else { '?' })
        .collect()
}
210 | |
#[cfg(test)]
mod tests {
    //! Unit tests for [`Encoder`]. They construct encoders directly rather
    //! than through `Encoder::from_env`, so they do not depend on the
    //! process environment.

    use super::*;

    /// Internal strings must be valid UTF-8; failures are reported as
    /// repository corruption.
    #[test]
    fn test_decode_internal() {
        let encoder = Encoder::default();
        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
        match encoder.decode_internal(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::CorruptedRepository(message)) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected a CorruptedRepository error"),
        }
    }

    /// With a UTF-8 local encoding, encoding is the identity on bytes.
    #[test]
    fn test_to_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
    }

    /// With an ASCII local encoding, each non-ASCII codepoint becomes '?'.
    #[test]
    fn test_to_local_ascii() {
        let encoder = Encoder {
            local_encoding: Encoding::Ascii,
            ..Default::default()
        };
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("abc").as_ref(), b"abc");
        assert_eq!(encoder.to_local("été").as_ref(), b"?t?");
    }

    /// Strict decoding of invalid local input is a user-facing abort, not
    /// repository corruption.
    #[test]
    fn test_from_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected an Abort error"),
        }
    }

    /// Replace mode substitutes U+FFFD for invalid UTF-8 instead of failing.
    #[test]
    fn test_from_local_replace() {
        let encoder = Encoder {
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
    }

    /// Strict ASCII decoding accepts ASCII and aborts on anything else.
    #[test]
    fn test_from_local_ascii() {
        let encoder = Encoder {
            local_encoding: Encoding::Ascii,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"abc").unwrap(), "abc");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid ASCII: \"A\\xc3\"")
            }
            Err(_) => panic!("expected an Abort error"),
        }
    }

    /// Replace mode with ASCII substitutes one '?' per non-ASCII byte.
    #[test]
    fn test_from_local_ascii_replace() {
        let encoder = Encoder {
            local_encoding: Encoding::Ascii,
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3\xa9").unwrap(), "A??");
    }

    #[test]
    fn test_column_width() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width(""), 0);
        assert_eq!(encoder.column_width("a"), 1);
        assert_eq!(encoder.column_width("ab"), 2);
        assert_eq!(encoder.column_width("été"), 3);
        assert_eq!(encoder.column_width("\u{1f496}"), 2);
    }

    /// U+2606 has ambiguous width, so its column count follows the
    /// encoder's `ambiguous_width` setting.
    #[test]
    fn test_column_width_ambiguous() {
        let narrow_encoder = Encoder {
            ambiguous_width: Width::Narrow,
            ..Default::default()
        };
        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);

        let wide_encoder = Encoder {
            ambiguous_width: Width::Wide,
            ..Default::default()
        };
        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
    }

    /// Invalid UTF-8 falls back to the byte length.
    #[test]
    fn test_column_width_bytes() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width_bytes(b""), 0);
        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
    }
}