52756
|
1 //! Character transcoding support.
|
|
2
|
|
3 use core::str;
|
|
4 use std::borrow::Cow;
|
|
5
|
|
6 use crate::{errors::HgError, utils::Escaped};
|
|
7 use unicode_width::UnicodeWidthStr as _;
|
|
8
|
|
/// String encoder and decoder.
///
/// Converts between internal UTF-8 strings and the user's local character
/// encoding, and measures how many terminal columns a string occupies.
#[derive(Copy, Clone, Debug)]
pub struct Encoder {
    /// The user's local encoding.
    local_encoding: Encoding,
    /// What to do when decoding bytes from the local encoding fails.
    /// (Encoding to the local encoding never fails: it always behaves like
    /// `Mode::Replace`.)
    decoding_mode: Mode,
    /// Width to use for characters that can be interpreted either as narrow
    /// or wide depending on the context.
    pub ambiguous_width: Width,
}
|
|
21
|
|
/// Character encoding.
///
/// Values can be compared with `==`, so callers can test for a particular
/// encoding without writing a `match`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// ASCII: only code points below 0x80 are representable.
    Ascii,
}
|
|
28
|
|
/// Character decoding mode.
///
/// Values can be compared with `==`, so callers can test for a particular
/// mode without writing a `match`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
|
|
37
|
|
/// The width of a Unicode character.
///
/// Values can be compared with `==`, so callers can test for a particular
/// width without writing a `match`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
|
|
46
|
|
47 impl Default for Encoder {
|
|
48 fn default() -> Self {
|
|
49 Self {
|
|
50 local_encoding: Encoding::Utf8,
|
|
51 decoding_mode: Mode::Strict,
|
|
52 ambiguous_width: Width::Narrow,
|
|
53 }
|
|
54 }
|
|
55 }
|
|
56
|
|
57 impl Encoder {
|
|
58 /// Creates an encoder from environment variables.
|
|
59 pub fn from_env() -> Result<Self, HgError> {
|
|
60 let default = Encoder::default();
|
|
61 let local_encoding = match std::env::var_os("HGENCODING") {
|
|
62 None => default.local_encoding,
|
|
63 Some(s)
|
|
64 if s.eq_ignore_ascii_case("utf-8")
|
|
65 || s.eq_ignore_ascii_case("utf8") =>
|
|
66 {
|
|
67 Encoding::Utf8
|
|
68 }
|
|
69 Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii,
|
|
70 Some(s) => {
|
|
71 return Err(HgError::unsupported(format!(
|
|
72 "HGENCODING value '{}' is not supported",
|
|
73 s.to_string_lossy()
|
|
74 )))
|
|
75 }
|
|
76 };
|
|
77 let decoding_mode = match std::env::var_os("HGENCODINGMODE") {
|
|
78 None => default.decoding_mode,
|
|
79 Some(s) if s == "strict" => Mode::Strict,
|
|
80 Some(s) if s == "replace" => Mode::Replace,
|
|
81 Some(s) => {
|
|
82 return Err(HgError::abort_simple(format!(
|
|
83 "HGENCODINGMODE value '{}' is not supported",
|
|
84 s.to_string_lossy()
|
|
85 )))
|
|
86 }
|
|
87 };
|
|
88 let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") {
|
|
89 None => default.ambiguous_width,
|
|
90 Some(s) if s == "narrow" => Width::Narrow,
|
|
91 Some(s) if s == "wide" => Width::Wide,
|
|
92 Some(s) => {
|
|
93 return Err(HgError::abort_simple(format!(
|
|
94 "HGENCODINGAMBIGUOUS value '{}' is not supported",
|
|
95 s.to_string_lossy()
|
|
96 )))
|
|
97 }
|
|
98 };
|
|
99 Ok(Self {
|
|
100 local_encoding,
|
|
101 decoding_mode,
|
|
102 ambiguous_width,
|
|
103 })
|
|
104 }
|
|
105
|
|
106 /// Decodes an internal UTF-8 string from bytes.
|
|
107 pub fn decode_internal<'a>(
|
|
108 &self,
|
|
109 bytes: &'a [u8],
|
|
110 ) -> Result<&'a str, HgError> {
|
|
111 decode_utf8(bytes).map_err(HgError::corrupted)
|
|
112 }
|
|
113
|
|
114 /// Converts a string from internal UTF-8 to the local character encoding.
|
|
115 pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> {
|
|
116 match self.local_encoding {
|
|
117 Encoding::Utf8 => Cow::Borrowed(str.as_bytes()),
|
|
118 Encoding::Ascii => {
|
|
119 if str.is_ascii() {
|
|
120 Cow::Borrowed(str.as_bytes())
|
|
121 } else {
|
|
122 Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes())
|
|
123 }
|
|
124 }
|
|
125 }
|
|
126 }
|
|
127
|
|
128 /// Converts a string from the local character encoding to UTF-8.
|
|
129 pub fn from_local<'a>(
|
|
130 &self,
|
|
131 bytes: &'a [u8],
|
|
132 ) -> Result<Cow<'a, str>, HgError> {
|
|
133 match (self.local_encoding, self.decoding_mode) {
|
|
134 (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed(
|
|
135 decode_utf8(bytes).map_err(HgError::abort_simple)?,
|
|
136 )),
|
|
137 (Encoding::Utf8, Mode::Replace) => {
|
|
138 Ok(String::from_utf8_lossy(bytes))
|
|
139 }
|
|
140 (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed(
|
|
141 decode_ascii(bytes).map_err(HgError::abort_simple)?,
|
|
142 )),
|
|
143 (Encoding::Ascii, Mode::Replace) => {
|
|
144 Ok(Cow::Owned(bytes_to_ascii_lossy(bytes)))
|
|
145 }
|
|
146 }
|
|
147 }
|
|
148
|
|
149 /// Returns the column width of a string for display.
|
|
150 pub fn column_width(&self, str: &str) -> usize {
|
|
151 match self.ambiguous_width {
|
|
152 Width::Narrow => str.width(),
|
|
153 Width::Wide => str.width_cjk(),
|
|
154 }
|
|
155 }
|
|
156
|
|
157 /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise
|
|
158 /// just returns the length in bytes.
|
|
159 pub fn column_width_bytes(&self, bytes: &[u8]) -> usize {
|
|
160 match str::from_utf8(bytes) {
|
|
161 Ok(str) => self.column_width(str),
|
|
162 Err(_) => bytes.len(),
|
|
163 }
|
|
164 }
|
|
165 }
|
|
166
|
|
167 /// Decodes bytes as UTF-8 or returns a detailed error message.
|
|
168 fn decode_utf8(bytes: &[u8]) -> Result<&str, String> {
|
|
169 str::from_utf8(bytes).map_err(|err| {
|
|
170 format!(
|
|
171 "invalid UTF-8 at offset {}: \"{}\"",
|
|
172 err.valid_up_to(),
|
|
173 str::from_utf8(&bytes.escaped_bytes()).unwrap()
|
|
174 )
|
|
175 })
|
|
176 }
|
|
177
|
|
178 /// Decodes bytes as ASCII or returns a detailed error message.
|
|
179 fn decode_ascii(bytes: &[u8]) -> Result<&str, String> {
|
|
180 // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998
|
|
181 if bytes.is_ascii() {
|
|
182 // Safety: Just checked that it's ASCII.
|
|
183 let str = unsafe { str::from_utf8_unchecked(bytes) };
|
|
184 Ok(str)
|
|
185 } else {
|
|
186 Err(format!(
|
|
187 "invalid ASCII: \"{}\"",
|
|
188 str::from_utf8(&bytes.escaped_bytes()).unwrap()
|
|
189 ))
|
|
190 }
|
|
191 }
|
|
192
|
|
/// Replaces all non-ASCII codepoints with '?'.
///
/// Each non-ASCII codepoint becomes exactly one '?', regardless of how many
/// bytes it occupies in UTF-8. ASCII characters are copied unchanged.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    // Collecting lets `String` reserve space from the iterator's size hint
    // instead of growing from an empty buffer one push at a time.
    str.chars()
        .map(|c| if c.is_ascii() { c } else { '?' })
        .collect()
}
|
|
201
|
|
/// Replaces all non-ASCII bytes with '?'.
///
/// The result has exactly one character per input byte. ASCII bytes are
/// copied unchanged.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    // The slice iterator reports its exact length, so collecting sizes the
    // string up front instead of growing it one push at a time.
    bytes
        .iter()
        .map(|&b| if b.is_ascii() { char::from(b) } else { '?' })
        .collect()
}
|
|
210
|
|
/// Unit tests for `Encoder`, covering both the UTF-8 and ASCII local
/// encodings.
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns an encoder whose local encoding is ASCII and whose other
    /// settings are the defaults, except for the given decoding mode.
    fn ascii_encoder(decoding_mode: Mode) -> Encoder {
        Encoder {
            local_encoding: Encoding::Ascii,
            decoding_mode,
            ..Default::default()
        }
    }

    #[test]
    fn test_decode_internal() {
        let encoder = Encoder::default();
        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
        match encoder.decode_internal(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::CorruptedRepository(message)) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected a CorruptedRepository error"),
        }
    }

    #[test]
    fn test_to_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
    }

    #[test]
    fn test_to_local_ascii() {
        let encoder = ascii_encoder(Mode::Strict);
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("abc").as_ref(), b"abc");
        // One '?' per codepoint, not per UTF-8 byte.
        assert_eq!(encoder.to_local("été").as_ref(), b"?t?");
    }

    #[test]
    fn test_from_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected an Abort error"),
        }
    }

    #[test]
    fn test_from_local_replace() {
        let encoder = Encoder {
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
    }

    #[test]
    fn test_from_local_ascii() {
        let encoder = ascii_encoder(Mode::Strict);
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"abc").unwrap(), "abc");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid ASCII: \"A\\xc3\"")
            }
            Err(_) => panic!("expected an Abort error"),
        }
    }

    #[test]
    fn test_from_local_ascii_replace() {
        let encoder = ascii_encoder(Mode::Replace);
        assert_eq!(encoder.from_local(b"abc").unwrap(), "abc");
        // One '?' per invalid byte.
        assert_eq!(encoder.from_local(b"A\xc3\xa9").unwrap(), "A??");
    }

    #[test]
    fn test_column_width() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width(""), 0);
        assert_eq!(encoder.column_width("a"), 1);
        assert_eq!(encoder.column_width("ab"), 2);
        assert_eq!(encoder.column_width("été"), 3);
        assert_eq!(encoder.column_width("\u{1f496}"), 2);
    }

    #[test]
    fn test_column_width_ambiguous() {
        let narrow_encoder = Encoder {
            ambiguous_width: Width::Narrow,
            ..Default::default()
        };
        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);

        let wide_encoder = Encoder {
            ambiguous_width: Width::Wide,
            ..Default::default()
        };
        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
    }

    #[test]
    fn test_column_width_bytes() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width_bytes(b""), 0);
        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
        // Invalid UTF-8 falls back to the length in bytes.
        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
    }
}
|