// author: Mitchell Kember <mkember@janestreet.com>
// date: Thu, 16 Jan 2025 13:15:02 -0500
// changeset: 52760:94e2547e6f3d
// parent: 52756:bbf1c52252ae
// permissions: -rw-r--r--
52756 | 1 |
//! Character transcoding support.
2 |
||
3 |
use core::str; |
|
4 |
use std::borrow::Cow; |
|
5 |
||
// Annotated at changeset 52760:94e2547e6f3d (parents: 52756) by
// Mitchell Kember <mkember@janestreet.com>:
// "rust: move code from utils to utils::strings"
6 |
use crate::{errors::HgError, utils::strings::Escaped}; |
52756 | 7 |
use unicode_width::UnicodeWidthStr as _; |
8 |
||
9 |
/// String encoder and decoder. |
|
10 |
#[derive(Copy, Clone, Debug)] |
|
11 |
pub struct Encoder { |
|
12 |
/// The user's local encoding. |
|
13 |
local_encoding: Encoding, |
|
14 |
/// What to do when decoding fails. (Encoding always uses |
|
15 |
/// `Mode::Replace`). |
|
16 |
decoding_mode: Mode, |
|
17 |
/// Width to use for characters that can be interpreted either as narrow |
|
18 |
/// or wide depending on the context. |
|
19 |
pub ambiguous_width: Width, |
|
20 |
} |
|
21 |
||
22 |
/// Character encoding.
///
/// Values can be compared with `==`, which is convenient in callers and
/// tests.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// 7-bit ASCII.
    Ascii,
}
|
28 |
||
29 |
/// Character decoding mode.
///
/// Values can be compared with `==`, which is convenient in callers and
/// tests.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
|
37 |
||
38 |
/// The width of a Unicode character.
///
/// Values can be compared with `==`, which is convenient in callers and
/// tests.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
|
46 |
||
47 |
impl Default for Encoder { |
|
48 |
fn default() -> Self { |
|
49 |
Self { |
|
50 |
local_encoding: Encoding::Utf8, |
|
51 |
decoding_mode: Mode::Strict, |
|
52 |
ambiguous_width: Width::Narrow, |
|
53 |
} |
|
54 |
} |
|
55 |
} |
|
56 |
||
57 |
impl Encoder { |
|
58 |
/// Creates an encoder from environment variables. |
|
59 |
pub fn from_env() -> Result<Self, HgError> { |
|
60 |
let default = Encoder::default(); |
|
61 |
let local_encoding = match std::env::var_os("HGENCODING") { |
|
62 |
None => default.local_encoding, |
|
63 |
Some(s) |
|
64 |
if s.eq_ignore_ascii_case("utf-8") |
|
65 |
|| s.eq_ignore_ascii_case("utf8") => |
|
66 |
{ |
|
67 |
Encoding::Utf8 |
|
68 |
} |
|
69 |
Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii, |
|
70 |
Some(s) => { |
|
71 |
return Err(HgError::unsupported(format!( |
|
72 |
"HGENCODING value '{}' is not supported", |
|
73 |
s.to_string_lossy() |
|
74 |
))) |
|
75 |
} |
|
76 |
}; |
|
77 |
let decoding_mode = match std::env::var_os("HGENCODINGMODE") { |
|
78 |
None => default.decoding_mode, |
|
79 |
Some(s) if s == "strict" => Mode::Strict, |
|
80 |
Some(s) if s == "replace" => Mode::Replace, |
|
81 |
Some(s) => { |
|
82 |
return Err(HgError::abort_simple(format!( |
|
83 |
"HGENCODINGMODE value '{}' is not supported", |
|
84 |
s.to_string_lossy() |
|
85 |
))) |
|
86 |
} |
|
87 |
}; |
|
88 |
let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") { |
|
89 |
None => default.ambiguous_width, |
|
90 |
Some(s) if s == "narrow" => Width::Narrow, |
|
91 |
Some(s) if s == "wide" => Width::Wide, |
|
92 |
Some(s) => { |
|
93 |
return Err(HgError::abort_simple(format!( |
|
94 |
"HGENCODINGAMBIGUOUS value '{}' is not supported", |
|
95 |
s.to_string_lossy() |
|
96 |
))) |
|
97 |
} |
|
98 |
}; |
|
99 |
Ok(Self { |
|
100 |
local_encoding, |
|
101 |
decoding_mode, |
|
102 |
ambiguous_width, |
|
103 |
}) |
|
104 |
} |
|
105 |
||
106 |
/// Decodes an internal UTF-8 string from bytes. |
|
107 |
pub fn decode_internal<'a>( |
|
108 |
&self, |
|
109 |
bytes: &'a [u8], |
|
110 |
) -> Result<&'a str, HgError> { |
|
111 |
decode_utf8(bytes).map_err(HgError::corrupted) |
|
112 |
} |
|
113 |
||
114 |
/// Converts a string from internal UTF-8 to the local character encoding. |
|
115 |
pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> { |
|
116 |
match self.local_encoding { |
|
117 |
Encoding::Utf8 => Cow::Borrowed(str.as_bytes()), |
|
118 |
Encoding::Ascii => { |
|
119 |
if str.is_ascii() { |
|
120 |
Cow::Borrowed(str.as_bytes()) |
|
121 |
} else { |
|
122 |
Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes()) |
|
123 |
} |
|
124 |
} |
|
125 |
} |
|
126 |
} |
|
127 |
||
128 |
/// Converts a string from the local character encoding to UTF-8. |
|
129 |
pub fn from_local<'a>( |
|
130 |
&self, |
|
131 |
bytes: &'a [u8], |
|
132 |
) -> Result<Cow<'a, str>, HgError> { |
|
133 |
match (self.local_encoding, self.decoding_mode) { |
|
134 |
(Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed( |
|
135 |
decode_utf8(bytes).map_err(HgError::abort_simple)?, |
|
136 |
)), |
|
137 |
(Encoding::Utf8, Mode::Replace) => { |
|
138 |
Ok(String::from_utf8_lossy(bytes)) |
|
139 |
} |
|
140 |
(Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed( |
|
141 |
decode_ascii(bytes).map_err(HgError::abort_simple)?, |
|
142 |
)), |
|
143 |
(Encoding::Ascii, Mode::Replace) => { |
|
144 |
Ok(Cow::Owned(bytes_to_ascii_lossy(bytes))) |
|
145 |
} |
|
146 |
} |
|
147 |
} |
|
148 |
||
149 |
/// Returns the column width of a string for display. |
|
150 |
pub fn column_width(&self, str: &str) -> usize { |
|
151 |
match self.ambiguous_width { |
|
152 |
Width::Narrow => str.width(), |
|
153 |
Width::Wide => str.width_cjk(), |
|
154 |
} |
|
155 |
} |
|
156 |
||
157 |
/// Returns the column width if `bytes` can be decoded as UTF-8, otherwise |
|
158 |
/// just returns the length in bytes. |
|
159 |
pub fn column_width_bytes(&self, bytes: &[u8]) -> usize { |
|
160 |
match str::from_utf8(bytes) { |
|
161 |
Ok(str) => self.column_width(str), |
|
162 |
Err(_) => bytes.len(), |
|
163 |
} |
|
164 |
} |
|
165 |
} |
|
166 |
||
167 |
/// Decodes bytes as UTF-8 or returns a detailed error message. |
|
168 |
fn decode_utf8(bytes: &[u8]) -> Result<&str, String> { |
|
169 |
str::from_utf8(bytes).map_err(|err| { |
|
170 |
format!( |
|
171 |
"invalid UTF-8 at offset {}: \"{}\"", |
|
172 |
err.valid_up_to(), |
|
173 |
str::from_utf8(&bytes.escaped_bytes()).unwrap() |
|
174 |
) |
|
175 |
}) |
|
176 |
} |
|
177 |
||
178 |
/// Decodes bytes as ASCII or returns a detailed error message. |
|
179 |
fn decode_ascii(bytes: &[u8]) -> Result<&str, String> { |
|
180 |
// TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998 |
|
181 |
if bytes.is_ascii() { |
|
182 |
// Safety: Just checked that it's ASCII. |
|
183 |
let str = unsafe { str::from_utf8_unchecked(bytes) }; |
|
184 |
Ok(str) |
|
185 |
} else { |
|
186 |
Err(format!( |
|
187 |
"invalid ASCII: \"{}\"", |
|
188 |
str::from_utf8(&bytes.escaped_bytes()).unwrap() |
|
189 |
)) |
|
190 |
} |
|
191 |
} |
|
192 |
||
193 |
/// Replaces all non-ASCII codepoints with '?'.
///
/// Each codepoint maps to exactly one output character, so a multi-byte
/// codepoint produces a single `?`.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    str.chars()
        .map(|c| if c.is_ascii() { c } else { '?' })
        .collect()
}
|
201 |
||
202 |
/// Replaces all non-ASCII bytes with '?'.
///
/// Each input byte maps to exactly one output character, so the result has
/// the same length as `bytes`.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&b| if b.is_ascii() { char::from(b) } else { '?' })
        .collect()
}
|
210 |
||
211 |
/// Unit tests for the encoder, using the default (UTF-8, strict, narrow)
/// configuration unless a test overrides a field.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_decode_internal() {
        let encoder = Encoder::default();
        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
        match encoder.decode_internal(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::CorruptedRepository(message)) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected a CorruptedRepository error"),
        }
    }

    #[test]
    fn test_to_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
    }

    #[test]
    fn test_from_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            // Local decoding failures are reported as aborts, not as
            // repository corruption.
            Err(_) => panic!("expected an Abort error"),
        }
    }

    #[test]
    fn test_from_local_replace() {
        let encoder = Encoder {
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
    }

    #[test]
    fn test_column_width() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width(""), 0);
        assert_eq!(encoder.column_width("a"), 1);
        assert_eq!(encoder.column_width("ab"), 2);
        assert_eq!(encoder.column_width("été"), 3);
        assert_eq!(encoder.column_width("\u{1f496}"), 2);
    }

    #[test]
    fn test_column_width_ambiguous() {
        let narrow_encoder = Encoder {
            ambiguous_width: Width::Narrow,
            ..Default::default()
        };
        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);

        let wide_encoder = Encoder {
            ambiguous_width: Width::Wide,
            ..Default::default()
        };
        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
    }

    #[test]
    fn test_column_width_bytes() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width_bytes(b""), 0);
        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
        // Invalid UTF-8 falls back to the byte length.
        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
    }
}