rust/hg-core/src/encoding.rs
changeset 52756 bbf1c52252ae
child 52760 94e2547e6f3d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/encoding.rs	Wed Feb 05 17:35:52 2025 -0500
@@ -0,0 +1,291 @@
+//! Character transcoding support.
+
+use core::str;
+use std::borrow::Cow;
+
+use crate::{errors::HgError, utils::Escaped};
+use unicode_width::UnicodeWidthStr as _;
+
+/// String encoder and decoder.
+#[derive(Copy, Clone, Debug)]
+pub struct Encoder {
+    /// The user's local encoding.
+    local_encoding: Encoding,
+    /// What to do when decoding fails. (Encoding always uses
+    /// `Mode::Replace`).
+    decoding_mode: Mode,
+    /// Width to use for characters that can be interpreted either as narrow
+    /// or wide depending on the context.
+    pub ambiguous_width: Width,
+}
+
+/// Character encoding.
+#[derive(Copy, Clone, Debug)]
+pub enum Encoding {
+    Utf8,
+    Ascii,
+}
+
+/// Character decoding mode.
+#[derive(Copy, Clone, Debug)]
+pub enum Mode {
+    /// Produce an error message for invalid characters.
+    Strict,
+    /// Replace invalid characters with a special character.
+    Replace,
+}
+
+/// The width of a Unicode character.
+#[derive(Copy, Clone, Debug)]
+pub enum Width {
+    /// Narrow, taking up 1 terminal column.
+    Narrow,
+    /// Wide, taking up 2 terminal columns.
+    Wide,
+}
+
+impl Default for Encoder {
+    fn default() -> Self {
+        Self {
+            local_encoding: Encoding::Utf8,
+            decoding_mode: Mode::Strict,
+            ambiguous_width: Width::Narrow,
+        }
+    }
+}
+
+impl Encoder {
+    /// Creates an encoder from environment variables.
+    pub fn from_env() -> Result<Self, HgError> {
+        let default = Encoder::default();
+        let local_encoding = match std::env::var_os("HGENCODING") {
+            None => default.local_encoding,
+            Some(s)
+                if s.eq_ignore_ascii_case("utf-8")
+                    || s.eq_ignore_ascii_case("utf8") =>
+            {
+                Encoding::Utf8
+            }
+            Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii,
+            Some(s) => {
+                return Err(HgError::unsupported(format!(
+                    "HGENCODING value '{}' is not supported",
+                    s.to_string_lossy()
+                )))
+            }
+        };
+        let decoding_mode = match std::env::var_os("HGENCODINGMODE") {
+            None => default.decoding_mode,
+            Some(s) if s == "strict" => Mode::Strict,
+            Some(s) if s == "replace" => Mode::Replace,
+            Some(s) => {
+                return Err(HgError::abort_simple(format!(
+                    "HGENCODINGMODE value '{}' is not supported",
+                    s.to_string_lossy()
+                )))
+            }
+        };
+        let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") {
+            None => default.ambiguous_width,
+            Some(s) if s == "narrow" => Width::Narrow,
+            Some(s) if s == "wide" => Width::Wide,
+            Some(s) => {
+                return Err(HgError::abort_simple(format!(
+                    "HGENCODINGAMBIGUOUS value '{}' is not supported",
+                    s.to_string_lossy()
+                )))
+            }
+        };
+        Ok(Self {
+            local_encoding,
+            decoding_mode,
+            ambiguous_width,
+        })
+    }
+
+    /// Decodes an internal UTF-8 string from bytes.
+    pub fn decode_internal<'a>(
+        &self,
+        bytes: &'a [u8],
+    ) -> Result<&'a str, HgError> {
+        decode_utf8(bytes).map_err(HgError::corrupted)
+    }
+
+    /// Converts a string from internal UTF-8 to the local character encoding.
+    pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> {
+        match self.local_encoding {
+            Encoding::Utf8 => Cow::Borrowed(str.as_bytes()),
+            Encoding::Ascii => {
+                if str.is_ascii() {
+                    Cow::Borrowed(str.as_bytes())
+                } else {
+                    Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes())
+                }
+            }
+        }
+    }
+
+    /// Converts a string from the local character encoding to UTF-8.
+    pub fn from_local<'a>(
+        &self,
+        bytes: &'a [u8],
+    ) -> Result<Cow<'a, str>, HgError> {
+        match (self.local_encoding, self.decoding_mode) {
+            (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed(
+                decode_utf8(bytes).map_err(HgError::abort_simple)?,
+            )),
+            (Encoding::Utf8, Mode::Replace) => {
+                Ok(String::from_utf8_lossy(bytes))
+            }
+            (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed(
+                decode_ascii(bytes).map_err(HgError::abort_simple)?,
+            )),
+            (Encoding::Ascii, Mode::Replace) => {
+                Ok(Cow::Owned(bytes_to_ascii_lossy(bytes)))
+            }
+        }
+    }
+
+    /// Returns the column width of a string for display.
+    pub fn column_width(&self, str: &str) -> usize {
+        match self.ambiguous_width {
+            Width::Narrow => str.width(),
+            Width::Wide => str.width_cjk(),
+        }
+    }
+
+    /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise
+    /// just returns the length in bytes.
+    pub fn column_width_bytes(&self, bytes: &[u8]) -> usize {
+        match str::from_utf8(bytes) {
+            Ok(str) => self.column_width(str),
+            Err(_) => bytes.len(),
+        }
+    }
+}
+
+/// Decodes bytes as UTF-8 or returns a detailed error message.
+fn decode_utf8(bytes: &[u8]) -> Result<&str, String> {
+    str::from_utf8(bytes).map_err(|err| {
+        format!(
+            "invalid UTF-8 at offset {}: \"{}\"",
+            err.valid_up_to(),
+            str::from_utf8(&bytes.escaped_bytes()).unwrap()
+        )
+    })
+}
+
+/// Decodes bytes as ASCII or returns a detailed error message.
+fn decode_ascii(bytes: &[u8]) -> Result<&str, String> {
+    // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998
+    if bytes.is_ascii() {
+        // Safety: Just checked that it's ASCII.
+        let str = unsafe { str::from_utf8_unchecked(bytes) };
+        Ok(str)
+    } else {
+        Err(format!(
+            "invalid ASCII: \"{}\"",
+            str::from_utf8(&bytes.escaped_bytes()).unwrap()
+        ))
+    }
+}
+
+/// Replaces all non-ASCII codepoints with '?'.
+fn codepoints_to_ascii_lossy(str: &str) -> String {
+    let mut ascii = String::new();
+    for char in str.chars() {
+        ascii.push(if char.is_ascii() { char } else { '?' });
+    }
+    ascii
+}
+
+/// Replaces all non-ASCII bytes with '?'.
+fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
+    let mut ascii = String::new();
+    for &b in bytes {
+        ascii.push(if b.is_ascii() { b as char } else { '?' });
+    }
+    ascii
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_decode_internal() {
+        let encoder = Encoder::default();
+        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
+        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
+        match encoder.decode_internal(b"A\xc3") {
+            Ok(_) => panic!("expected an error"),
+            Err(HgError::CorruptedRepository(message)) => {
+                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
+            }
+            Err(_) => panic!("expected a CorruptedRepository error"),
+        }
+    }
+
+    #[test]
+    fn test_to_local() {
+        let encoder = Encoder::default();
+        assert_eq!(encoder.to_local("").as_ref(), b"");
+        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
+    }
+
+    #[test]
+    fn test_from_local() {
+        let encoder = Encoder::default();
+        assert_eq!(encoder.from_local(b"").unwrap(), "");
+        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
+        match encoder.from_local(b"A\xc3") {
+            Ok(_) => panic!("expected an error"),
+            Err(HgError::Abort { message, .. }) => {
+                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
+            }
+            Err(_) => panic!("expected a CorruptedRepository error"),
+        }
+    }
+
+    #[test]
+    fn test_from_local_replace() {
+        let encoder = Encoder {
+            decoding_mode: Mode::Replace,
+            ..Default::default()
+        };
+        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
+    }
+
+    #[test]
+    fn test_column_width() {
+        let encoder = Encoder::default();
+        assert_eq!(encoder.column_width(""), 0);
+        assert_eq!(encoder.column_width("a"), 1);
+        assert_eq!(encoder.column_width("ab"), 2);
+        assert_eq!(encoder.column_width("été"), 3);
+        assert_eq!(encoder.column_width("\u{1f496}"), 2);
+    }
+
+    #[test]
+    fn test_column_width_ambiguous() {
+        let narrow_encoder = Encoder {
+            ambiguous_width: Width::Narrow,
+            ..Default::default()
+        };
+        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);
+
+        let wide_encoder = Encoder {
+            ambiguous_width: Width::Wide,
+            ..Default::default()
+        };
+        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
+    }
+
+    #[test]
+    fn test_column_width_bytes() {
+        let encoder = Encoder::default();
+        assert_eq!(encoder.column_width_bytes(b""), 0);
+        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
+        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
+    }
+}