annotate rust/hg-core/src/encoding.rs @ 52756:bbf1c52252ae

rust: add encoding.rs This is based on encoding.py. It reads the environment variables HGENCODING, HGENCODINGMODE, and HGENCODINGAMBIGUOUS. Currently it only supports UTF-8 and ASCII, but it could be extended to support other local encodings. Unlike Python, it assumes all internal strings are UTF-8 and does not attempt to fall back to latin-1 (or ui.fallbackencoding). Nothing is using this now, but in the future command output and error messages should transition to using it. I replaced existing calls to `utf8_to_local` and `local_to_utf8` with direct String/bytes methods since they were not logically converting between internal and local encodings. Instead, they were used (for example) when an error message happened to be stored as String but needed to be passed somewhere as bytes. The proper fix for this will be to avoid String in the first place.
author Mitchell Kember <mkember@janestreet.com>
date Wed, 05 Feb 2025 17:35:52 -0500
parents
children 94e2547e6f3d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
52756
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
1 //! Character transcoding support.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
2
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
3 use core::str;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
4 use std::borrow::Cow;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
5
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
6 use crate::{errors::HgError, utils::Escaped};
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
7 use unicode_width::UnicodeWidthStr as _;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
9 /// String encoder and decoder.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
10 #[derive(Copy, Clone, Debug)]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
11 pub struct Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
12 /// The user's local encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
13 local_encoding: Encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
14 /// What to do when decoding fails. (Encoding always uses
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
15 /// `Mode::Replace`).
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
16 decoding_mode: Mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
17 /// Width to use for characters that can be interpreted either as narrow
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
18 /// or wide depending on the context.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
19 pub ambiguous_width: Width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
20 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
21
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Character encoding.
///
/// `PartialEq`/`Eq` are derived so that encodings can be compared directly
/// (for example in `assert_eq!`).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// ASCII.
    Ascii,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
28
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Character decoding mode.
///
/// `PartialEq`/`Eq` are derived so that modes can be compared directly
/// (for example in `assert_eq!`).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
37
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// The width of a Unicode character.
///
/// `PartialEq`/`Eq` are derived so that widths can be compared directly
/// (for example in `assert_eq!`).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
46
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
47 impl Default for Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
48 fn default() -> Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
49 Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
50 local_encoding: Encoding::Utf8,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
51 decoding_mode: Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
52 ambiguous_width: Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
53 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
54 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
55 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
56
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
57 impl Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
58 /// Creates an encoder from environment variables.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
59 pub fn from_env() -> Result<Self, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
60 let default = Encoder::default();
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
61 let local_encoding = match std::env::var_os("HGENCODING") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
62 None => default.local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
63 Some(s)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
64 if s.eq_ignore_ascii_case("utf-8")
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
65 || s.eq_ignore_ascii_case("utf8") =>
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
66 {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
67 Encoding::Utf8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
68 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
69 Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
70 Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
71 return Err(HgError::unsupported(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
72 "HGENCODING value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
73 s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
74 )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
75 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
76 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
77 let decoding_mode = match std::env::var_os("HGENCODINGMODE") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
78 None => default.decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
79 Some(s) if s == "strict" => Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
80 Some(s) if s == "replace" => Mode::Replace,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
81 Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
82 return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
83 "HGENCODINGMODE value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
84 s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
85 )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
86 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
87 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
88 let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
89 None => default.ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
90 Some(s) if s == "narrow" => Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
91 Some(s) if s == "wide" => Width::Wide,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
92 Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
93 return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
94 "HGENCODINGAMBIGUOUS value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
95 s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
96 )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
97 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
98 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
99 Ok(Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
100 local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
101 decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
102 ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
103 })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
104 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
105
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
106 /// Decodes an internal UTF-8 string from bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
107 pub fn decode_internal<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
108 &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
109 bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
110 ) -> Result<&'a str, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
111 decode_utf8(bytes).map_err(HgError::corrupted)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
112 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
113
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
114 /// Converts a string from internal UTF-8 to the local character encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
115 pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
116 match self.local_encoding {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
117 Encoding::Utf8 => Cow::Borrowed(str.as_bytes()),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
118 Encoding::Ascii => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
119 if str.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
120 Cow::Borrowed(str.as_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
121 } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
122 Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
123 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
124 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
125 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
126 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
127
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
128 /// Converts a string from the local character encoding to UTF-8.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
129 pub fn from_local<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
130 &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
131 bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
132 ) -> Result<Cow<'a, str>, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
133 match (self.local_encoding, self.decoding_mode) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
134 (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
135 decode_utf8(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
136 )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
137 (Encoding::Utf8, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
138 Ok(String::from_utf8_lossy(bytes))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
139 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
140 (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
141 decode_ascii(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
142 )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
143 (Encoding::Ascii, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
144 Ok(Cow::Owned(bytes_to_ascii_lossy(bytes)))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
145 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
146 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
147 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
148
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
149 /// Returns the column width of a string for display.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
150 pub fn column_width(&self, str: &str) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
151 match self.ambiguous_width {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
152 Width::Narrow => str.width(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
153 Width::Wide => str.width_cjk(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
154 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
155 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
156
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
157 /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
158 /// just returns the length in bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
159 pub fn column_width_bytes(&self, bytes: &[u8]) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
160 match str::from_utf8(bytes) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
161 Ok(str) => self.column_width(str),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
162 Err(_) => bytes.len(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
163 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
164 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
165 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
166
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
167 /// Decodes bytes as UTF-8 or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
168 fn decode_utf8(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
169 str::from_utf8(bytes).map_err(|err| {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
170 format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
171 "invalid UTF-8 at offset {}: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
172 err.valid_up_to(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
173 str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
174 )
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
175 })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
176 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
177
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
178 /// Decodes bytes as ASCII or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
179 fn decode_ascii(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
180 // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
181 if bytes.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
182 // Safety: Just checked that it's ASCII.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
183 let str = unsafe { str::from_utf8_unchecked(bytes) };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
184 Ok(str)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
185 } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
186 Err(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
187 "invalid ASCII: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
188 str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
189 ))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
190 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
191 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
192
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Returns a copy of `str` in which every non-ASCII codepoint is replaced
/// with a single '?'; ASCII codepoints are kept unchanged.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    str.chars()
        .map(|c| if c.is_ascii() { c } else { '?' })
        .collect()
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
201
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Builds a string from `bytes` in which every non-ASCII byte is replaced
/// with '?'; ASCII bytes are kept unchanged.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&byte| match byte {
            0x00..=0x7f => char::from(byte),
            _ => '?',
        })
        .collect()
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
210
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
211 #[cfg(test)]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
212 mod tests {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
213 use super::*;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
214
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
#[test]
fn test_decode_internal() {
    let encoder = Encoder::default();

    // Valid UTF-8, including the empty string, decodes unchanged.
    let empty = encoder.decode_internal(b"");
    assert_eq!(empty.unwrap(), "");
    let accented = encoder.decode_internal(b"\xc3\xa9");
    assert_eq!(accented.unwrap(), "é");

    // A lone lead byte is invalid and must be reported as corruption.
    let invalid = encoder.decode_internal(b"A\xc3");
    match invalid {
        Err(HgError::CorruptedRepository(message)) => {
            assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
        }
        Err(_) => panic!("expected a CorruptedRepository error"),
        Ok(_) => panic!("expected an error"),
    }
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
228
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
#[test]
fn test_to_local() {
    let encoder = Encoder::default();

    // With the default (UTF-8) local encoding the bytes pass through as-is.
    let empty = encoder.to_local("");
    assert_eq!(empty.as_ref(), b"");
    let accented = encoder.to_local("é");
    assert_eq!(accented.as_ref(), b"\xc3\xa9");
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
235
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
#[test]
fn test_from_local() {
    let encoder = Encoder::default();
    // Valid UTF-8, including the empty string, decodes unchanged.
    assert_eq!(encoder.from_local(b"").unwrap(), "");
    assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
    // In strict mode, invalid local input is reported as an abort (not as
    // repository corruption, unlike `decode_internal`).
    match encoder.from_local(b"A\xc3") {
        Ok(_) => panic!("expected an error"),
        Err(HgError::Abort { message, .. }) => {
            assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
        }
        Err(_) => panic!("expected an Abort error"),
    }
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
249
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// With `decoding_mode` set to `Mode::Replace`, undecodable input is not an
/// error: the dangling `0xC3` byte is expected to become U+FFFD.
#[test]
fn test_from_local_replace() {
    let lenient = Encoder {
        decoding_mode: Mode::Replace,
        ..Encoder::default()
    };

    let decoded = lenient.from_local(b"A\xc3").unwrap();
    assert_eq!(decoded, "A\u{fffd}");
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
258
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Column widths reported by the default `Encoder` for a handful of strings:
/// empty, plain ASCII, accented Latin letters, and one emoji (U+1F496) that
/// is expected to occupy two columns.
#[test]
fn test_column_width() {
    let enc = Encoder::default();

    // (input, expected number of terminal columns)
    let cases = [
        ("", 0),
        ("a", 1),
        ("ab", 2),
        ("été", 3),
        ("\u{1f496}", 2),
    ];
    for (text, columns) in cases {
        assert_eq!(enc.column_width(text), columns);
    }
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
268
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// The `ambiguous_width` setting decides how many columns U+2606 takes:
/// one with `Width::Narrow`, two with `Width::Wide`.
#[test]
fn test_column_width_ambiguous() {
    // U+2606 serves as the ambiguous-width sample for both settings.
    let star = "\u{2606}";

    for (setting, columns) in [(Width::Narrow, 1), (Width::Wide, 2)] {
        let enc = Encoder {
            ambiguous_width: setting,
            ..Encoder::default()
        };
        assert_eq!(enc.column_width(star), columns);
    }
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
283
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Column widths computed directly from bytes with the default `Encoder`,
/// including input that ends in an incomplete multi-byte sequence.
#[test]
fn test_column_width_bytes() {
    let enc = Encoder::default();

    let empty = b"";
    let accented = "été".as_bytes();
    // `A` followed by a lone 0xC3 lead byte; the expected total is 2 columns.
    let truncated = b"A\xc3";

    assert_eq!(enc.column_width_bytes(empty), 0);
    assert_eq!(enc.column_width_bytes(accented), 3);
    assert_eq!(enc.column_width_bytes(truncated), 2);
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
291 }