Mercurial > public > mercurial-scm > hg-stable
diff rust/hg-core/src/encoding.rs @ 52770:bbf1c52252ae
rust: add encoding.rs
This is based on encoding.py. It reads the environment variables HGENCODING,
HGENCODINGMODE, and HGENCODINGAMBIGUOUS. Currently it only supports UTF-8 and
ascii, but it could be extended to support other local encodings.
Unlike Python, it assumes all internal strings are UTF-8 and does not attempt to
fall back to latin-1 (or ui.fallbackencoding).
Nothing is using this now, but in the future command output and error messages
should transition to using it.
I replaced existing calls to `utf8_to_local` and `local_to_utf8` with direct
String/bytes methods since they were not logically converting between internal
and local encodings. Instead, they were used (for example) when an error message
happened to be stored as String but needed to be passed somewhere as bytes. The
proper fix for this will be to avoid String in the first place.
author | Mitchell Kember <mkember@janestreet.com> |
---|---|
date | Wed, 05 Feb 2025 17:35:52 -0500 |
parents | |
children | 94e2547e6f3d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/encoding.rs Wed Feb 05 17:35:52 2025 -0500 @@ -0,0 +1,291 @@ +//! Character transcoding support. + +use core::str; +use std::borrow::Cow; + +use crate::{errors::HgError, utils::Escaped}; +use unicode_width::UnicodeWidthStr as _; + +/// String encoder and decoder. +#[derive(Copy, Clone, Debug)] +pub struct Encoder { + /// The user's local encoding. + local_encoding: Encoding, + /// What to do when decoding fails. (Encoding always uses + /// `Mode::Replace`). + decoding_mode: Mode, + /// Width to use for characters that can be interpreted either as narrow + /// or wide depending on the context. + pub ambiguous_width: Width, +} + +/// Character encoding. +#[derive(Copy, Clone, Debug)] +pub enum Encoding { + Utf8, + Ascii, +} + +/// Character decoding mode. +#[derive(Copy, Clone, Debug)] +pub enum Mode { + /// Produce an error message for invalid characters. + Strict, + /// Replace invalid characters with a special character. + Replace, +} + +/// The width of a Unicode character. +#[derive(Copy, Clone, Debug)] +pub enum Width { + /// Narrow, taking up 1 terminal column. + Narrow, + /// Wide, taking up 2 terminal columns. + Wide, +} + +impl Default for Encoder { + fn default() -> Self { + Self { + local_encoding: Encoding::Utf8, + decoding_mode: Mode::Strict, + ambiguous_width: Width::Narrow, + } + } +} + +impl Encoder { + /// Creates an encoder from environment variables. 
+ pub fn from_env() -> Result<Self, HgError> { + let default = Encoder::default(); + let local_encoding = match std::env::var_os("HGENCODING") { + None => default.local_encoding, + Some(s) + if s.eq_ignore_ascii_case("utf-8") + || s.eq_ignore_ascii_case("utf8") => + { + Encoding::Utf8 + } + Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii, + Some(s) => { + return Err(HgError::unsupported(format!( + "HGENCODING value '{}' is not supported", + s.to_string_lossy() + ))) + } + }; + let decoding_mode = match std::env::var_os("HGENCODINGMODE") { + None => default.decoding_mode, + Some(s) if s == "strict" => Mode::Strict, + Some(s) if s == "replace" => Mode::Replace, + Some(s) => { + return Err(HgError::abort_simple(format!( + "HGENCODINGMODE value '{}' is not supported", + s.to_string_lossy() + ))) + } + }; + let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") { + None => default.ambiguous_width, + Some(s) if s == "narrow" => Width::Narrow, + Some(s) if s == "wide" => Width::Wide, + Some(s) => { + return Err(HgError::abort_simple(format!( + "HGENCODINGAMBIGUOUS value '{}' is not supported", + s.to_string_lossy() + ))) + } + }; + Ok(Self { + local_encoding, + decoding_mode, + ambiguous_width, + }) + } + + /// Decodes an internal UTF-8 string from bytes. + pub fn decode_internal<'a>( + &self, + bytes: &'a [u8], + ) -> Result<&'a str, HgError> { + decode_utf8(bytes).map_err(HgError::corrupted) + } + + /// Converts a string from internal UTF-8 to the local character encoding. + pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> { + match self.local_encoding { + Encoding::Utf8 => Cow::Borrowed(str.as_bytes()), + Encoding::Ascii => { + if str.is_ascii() { + Cow::Borrowed(str.as_bytes()) + } else { + Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes()) + } + } + } + } + + /// Converts a string from the local character encoding to UTF-8. 
+ pub fn from_local<'a>( + &self, + bytes: &'a [u8], + ) -> Result<Cow<'a, str>, HgError> { + match (self.local_encoding, self.decoding_mode) { + (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed( + decode_utf8(bytes).map_err(HgError::abort_simple)?, + )), + (Encoding::Utf8, Mode::Replace) => { + Ok(String::from_utf8_lossy(bytes)) + } + (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed( + decode_ascii(bytes).map_err(HgError::abort_simple)?, + )), + (Encoding::Ascii, Mode::Replace) => { + Ok(Cow::Owned(bytes_to_ascii_lossy(bytes))) + } + } + } + + /// Returns the column width of a string for display. + pub fn column_width(&self, str: &str) -> usize { + match self.ambiguous_width { + Width::Narrow => str.width(), + Width::Wide => str.width_cjk(), + } + } + + /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise + /// just returns the length in bytes. + pub fn column_width_bytes(&self, bytes: &[u8]) -> usize { + match str::from_utf8(bytes) { + Ok(str) => self.column_width(str), + Err(_) => bytes.len(), + } + } +} + +/// Decodes bytes as UTF-8 or returns a detailed error message. +fn decode_utf8(bytes: &[u8]) -> Result<&str, String> { + str::from_utf8(bytes).map_err(|err| { + format!( + "invalid UTF-8 at offset {}: \"{}\"", + err.valid_up_to(), + str::from_utf8(&bytes.escaped_bytes()).unwrap() + ) + }) +} + +/// Decodes bytes as ASCII or returns a detailed error message. +fn decode_ascii(bytes: &[u8]) -> Result<&str, String> { + // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998 + if bytes.is_ascii() { + // Safety: Just checked that it's ASCII. + let str = unsafe { str::from_utf8_unchecked(bytes) }; + Ok(str) + } else { + Err(format!( + "invalid ASCII: \"{}\"", + str::from_utf8(&bytes.escaped_bytes()).unwrap() + )) + } +} + +/// Replaces all non-ASCII codepoints with '?'. 
///
/// Each non-ASCII `char` becomes exactly one '?', however many bytes it
/// occupies in UTF-8; ASCII characters are copied unchanged.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    str.chars()
        .map(|c| if c.is_ascii() { c } else { '?' })
        .collect()
}

/// Replaces all non-ASCII bytes with '?'.
///
/// The result has one character per input byte, so a multi-byte sequence
/// turns into several '?' characters.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&b| if b.is_ascii() { char::from(b) } else { '?' })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_decode_internal() {
        let encoder = Encoder::default();
        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
        match encoder.decode_internal(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::CorruptedRepository(message)) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected a CorruptedRepository error"),
        }
    }

    #[test]
    fn test_to_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
    }

    #[test]
    fn test_to_local_ascii() {
        let encoder = Encoder {
            local_encoding: Encoding::Ascii,
            ..Default::default()
        };
        assert_eq!(encoder.to_local("abc").as_ref(), b"abc");
        // One '?' per codepoint, not per UTF-8 byte.
        assert_eq!(encoder.to_local("é").as_ref(), b"?");
    }

    #[test]
    fn test_from_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            // `from_local` reports failures as `Abort`, so that is the
            // variant this message must name.
            Err(_) => panic!("expected an Abort error"),
        }
    }

    #[test]
    fn test_from_local_replace() {
        let encoder = Encoder {
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
    }

    #[test]
    fn test_from_local_ascii() {
        let strict = Encoder {
            local_encoding: Encoding::Ascii,
            ..Default::default()
        };
        assert_eq!(strict.from_local(b"abc").unwrap(), "abc");
        match strict.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid ASCII: \"A\\xc3\"")
            }
            Err(_) => panic!("expected an Abort error"),
        }

        let replace = Encoder {
            local_encoding: Encoding::Ascii,
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(replace.from_local(b"A\xc3").unwrap(), "A?");
    }

    #[test]
    fn test_codepoints_to_ascii_lossy() {
        assert_eq!(codepoints_to_ascii_lossy(""), "");
        assert_eq!(codepoints_to_ascii_lossy("abc"), "abc");
        assert_eq!(codepoints_to_ascii_lossy("été"), "?t?");
        assert_eq!(codepoints_to_ascii_lossy("\u{1f496}"), "?");
    }

    #[test]
    fn test_bytes_to_ascii_lossy() {
        assert_eq!(bytes_to_ascii_lossy(b""), "");
        assert_eq!(bytes_to_ascii_lossy(b"abc"), "abc");
        assert_eq!(bytes_to_ascii_lossy(b"A\xc3\xa9"), "A??");
    }

    #[test]
    fn test_column_width() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width(""), 0);
        assert_eq!(encoder.column_width("a"), 1);
        assert_eq!(encoder.column_width("ab"), 2);
        assert_eq!(encoder.column_width("été"), 3);
        assert_eq!(encoder.column_width("\u{1f496}"), 2);
    }

    #[test]
    fn test_column_width_ambiguous() {
        let narrow_encoder = Encoder {
            ambiguous_width: Width::Narrow,
            ..Default::default()
        };
        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);

        let wide_encoder = Encoder {
            ambiguous_width: Width::Wide,
            ..Default::default()
        };
        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
    }

    #[test]
    fn test_column_width_bytes() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width_bytes(b""), 0);
        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
        // Invalid UTF-8 falls back to the byte count.
        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
    }
}