annotate rust/hg-core/src/encoding.rs @ 53003:155e1e8dc055

rust-nodemap: don't compute the error string unless needed This is... really dumb and costs a ton of performance in a hot loop. It was 75% of a profile for a tip to null p1 node traversal in pure Rust. I'm at fault, done in 652149ed64f08ee73e8fd2f76aa480ea8820fe08. I thought clippy had a lint for this, but apparently not?
author Raphaël Gomès <rgomes@octobus.net>
date Fri, 21 Feb 2025 13:56:11 -0500
parents 94e2547e6f3d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
52770
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
1 //! Character transcoding support.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
2
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
3 use core::str;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
4 use std::borrow::Cow;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
5
52774
94e2547e6f3d rust: move code from utils to utils::strings
Mitchell Kember <mkember@janestreet.com>
parents: 52770
diff changeset
6 use crate::{errors::HgError, utils::strings::Escaped};
52770
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
7 use unicode_width::UnicodeWidthStr as _;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
9 /// String encoder and decoder.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
10 #[derive(Copy, Clone, Debug)]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
11 pub struct Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
12 /// The user's local encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
13 local_encoding: Encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
14 /// What to do when decoding fails. (Encoding always uses
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
15 /// `Mode::Replace`).
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
16 decoding_mode: Mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
17 /// Width to use for characters that can be interpreted either as narrow
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
18 /// or wide depending on the context.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
19 pub ambiguous_width: Width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
20 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
21
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Character encoding.
///
/// Derives `PartialEq`/`Eq` so that values can be compared directly with
/// `==` and used in `assert_eq!`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// 7-bit ASCII.
    Ascii,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
28
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Character decoding mode.
///
/// Derives `PartialEq`/`Eq` so that values can be compared directly with
/// `==` and used in `assert_eq!`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
37
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// The width of a Unicode character.
///
/// Derives `PartialEq`/`Eq` so that values can be compared directly with
/// `==` and used in `assert_eq!`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
46
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
47 impl Default for Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
48 fn default() -> Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
49 Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
50 local_encoding: Encoding::Utf8,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
51 decoding_mode: Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
52 ambiguous_width: Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
53 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
54 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
55 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
56
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
57 impl Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
58 /// Creates an encoder from environment variables.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
59 pub fn from_env() -> Result<Self, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
60 let default = Encoder::default();
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
61 let local_encoding = match std::env::var_os("HGENCODING") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
62 None => default.local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
63 Some(s)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
64 if s.eq_ignore_ascii_case("utf-8")
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
65 || s.eq_ignore_ascii_case("utf8") =>
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
66 {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
67 Encoding::Utf8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
68 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
69 Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
70 Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
71 return Err(HgError::unsupported(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
72 "HGENCODING value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
73 s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
74 )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
75 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
76 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
77 let decoding_mode = match std::env::var_os("HGENCODINGMODE") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
78 None => default.decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
79 Some(s) if s == "strict" => Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
80 Some(s) if s == "replace" => Mode::Replace,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
81 Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
82 return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
83 "HGENCODINGMODE value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
84 s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
85 )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
86 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
87 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
88 let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
89 None => default.ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
90 Some(s) if s == "narrow" => Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
91 Some(s) if s == "wide" => Width::Wide,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
92 Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
93 return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
94 "HGENCODINGAMBIGUOUS value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
95 s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
96 )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
97 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
98 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
99 Ok(Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
100 local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
101 decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
102 ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
103 })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
104 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
105
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
106 /// Decodes an internal UTF-8 string from bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
107 pub fn decode_internal<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
108 &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
109 bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
110 ) -> Result<&'a str, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
111 decode_utf8(bytes).map_err(HgError::corrupted)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
112 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
113
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
114 /// Converts a string from internal UTF-8 to the local character encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
115 pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
116 match self.local_encoding {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
117 Encoding::Utf8 => Cow::Borrowed(str.as_bytes()),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
118 Encoding::Ascii => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
119 if str.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
120 Cow::Borrowed(str.as_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
121 } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
122 Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
123 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
124 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
125 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
126 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
127
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
128 /// Converts a string from the local character encoding to UTF-8.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
129 pub fn from_local<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
130 &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
131 bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
132 ) -> Result<Cow<'a, str>, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
133 match (self.local_encoding, self.decoding_mode) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
134 (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
135 decode_utf8(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
136 )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
137 (Encoding::Utf8, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
138 Ok(String::from_utf8_lossy(bytes))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
139 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
140 (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
141 decode_ascii(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
142 )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
143 (Encoding::Ascii, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
144 Ok(Cow::Owned(bytes_to_ascii_lossy(bytes)))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
145 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
146 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
147 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
148
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
149 /// Returns the column width of a string for display.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
150 pub fn column_width(&self, str: &str) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
151 match self.ambiguous_width {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
152 Width::Narrow => str.width(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
153 Width::Wide => str.width_cjk(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
154 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
155 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
156
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
157 /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
158 /// just returns the length in bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
159 pub fn column_width_bytes(&self, bytes: &[u8]) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
160 match str::from_utf8(bytes) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
161 Ok(str) => self.column_width(str),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
162 Err(_) => bytes.len(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
163 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
164 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
165 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
166
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
167 /// Decodes bytes as UTF-8 or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
168 fn decode_utf8(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
169 str::from_utf8(bytes).map_err(|err| {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
170 format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
171 "invalid UTF-8 at offset {}: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
172 err.valid_up_to(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
173 str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
174 )
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
175 })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
176 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
177
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
178 /// Decodes bytes as ASCII or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
179 fn decode_ascii(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
180 // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
181 if bytes.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
182 // Safety: Just checked that it's ASCII.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
183 let str = unsafe { str::from_utf8_unchecked(bytes) };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
184 Ok(str)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
185 } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
186 Err(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
187 "invalid ASCII: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
188 str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
189 ))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
190 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
191 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
192
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Returns a copy of `str` in which every non-ASCII codepoint is replaced by
/// a single `'?'`; ASCII characters are kept unchanged.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    str.chars()
        .map(|codepoint| match codepoint.is_ascii() {
            true => codepoint,
            false => '?',
        })
        .collect()
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
201
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
/// Builds a string from `bytes` in which every non-ASCII byte is replaced by
/// `'?'`; ASCII bytes are kept unchanged.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&byte| match byte.is_ascii() {
            true => byte as char,
            false => '?',
        })
        .collect()
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
210
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
211 #[cfg(test)]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
212 mod tests {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
213 use super::*;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
214
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
#[test]
fn test_decode_internal() {
    let encoder = Encoder::default();

    // Valid UTF-8 is returned unchanged.
    assert_eq!(encoder.decode_internal(b"").unwrap(), "");
    assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");

    // A truncated multi-byte sequence is reported as repository corruption.
    let message = match encoder.decode_internal(b"A\xc3") {
        Err(HgError::CorruptedRepository(message)) => message,
        Err(_) => panic!("expected a CorruptedRepository error"),
        Ok(_) => panic!("expected an error"),
    };
    assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"");
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
228
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
#[test]
fn test_to_local() {
    // The default encoder uses UTF-8 locally, so the bytes pass through.
    let encoder = Encoder::default();
    let empty = encoder.to_local("");
    assert_eq!(empty.as_ref(), b"");
    let accented = encoder.to_local("é");
    assert_eq!(accented.as_ref(), b"\xc3\xa9");
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
235
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
236 #[test]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
237 fn test_from_local() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
238 let encoder = Encoder::default();
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
239 assert_eq!(encoder.from_local(b"").unwrap(), "");
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
240 assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
241 match encoder.from_local(b"A\xc3") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
242 Ok(_) => panic!("expected an error"),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
243 Err(HgError::Abort { message, .. }) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
244 assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
245 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
246 Err(_) => panic!("expected an Abort error"),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
247 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
248 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
249
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
250 #[test]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
251 fn test_from_local_replace() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
252 let encoder = Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
253 decoding_mode: Mode::Replace,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
254 ..Default::default()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
255 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
256 assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
257 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
258
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
259 #[test]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
260 fn test_column_width() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
261 let encoder = Encoder::default();
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
262 assert_eq!(encoder.column_width(""), 0);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
263 assert_eq!(encoder.column_width("a"), 1);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
264 assert_eq!(encoder.column_width("ab"), 2);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
265 assert_eq!(encoder.column_width("été"), 3);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
266 assert_eq!(encoder.column_width("\u{1f496}"), 2);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
267 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
268
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
269 #[test]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
270 fn test_column_width_ambiguous() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
271 let narrow_encoder = Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
272 ambiguous_width: Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
273 ..Default::default()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
274 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
275 assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
276
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
277 let wide_encoder = Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
278 ambiguous_width: Width::Wide,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
279 ..Default::default()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
280 };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
281 assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
282 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
283
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
284 #[test]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
285 fn test_column_width_bytes() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
286 let encoder = Encoder::default();
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
287 assert_eq!(encoder.column_width_bytes(b""), 0);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
288 assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
289 assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
290 }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
291 }