rust/hg-core/src/encoding.rs
author Mitchell Kember <mkember@janestreet.com>
Wed, 05 Feb 2025 17:35:52 -0500
changeset 52756 bbf1c52252ae
child 52760 94e2547e6f3d
permissions -rw-r--r--
rust: add encoding.rs This is based on encoding.py. It reads the environment variables HGENCODING, HGENCODINGMODE, and HGENCODINGAMBIGUOUS. Currently it only supports UTF-8 and ASCII, but it could be extended to support other local encodings. Unlike Python, it assumes all internal strings are UTF-8 and does not attempt to fall back to latin-1 (or ui.fallbackencoding). Nothing is using this now, but in the future command output and error messages should transition to using it. I replaced existing calls to `utf8_to_local` and `local_to_utf8` with direct String/bytes methods since they were not logically converting between internal and local encodings. Instead, they were used (for example) when an error message happened to be stored as String but needed to be passed somewhere as bytes. The proper fix for this will be to avoid String in the first place.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
52756
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     1
//! Character transcoding support.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     2
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     3
use core::str;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     4
use std::borrow::Cow;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     5
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     6
use crate::{errors::HgError, utils::Escaped};
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     7
use unicode_width::UnicodeWidthStr as _;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     9
/// String encoder and decoder.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    10
#[derive(Copy, Clone, Debug)]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    11
pub struct Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    12
    /// The user's local encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    13
    local_encoding: Encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    14
    /// What to do when decoding fails. (Encoding always uses
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    15
    /// `Mode::Replace`).
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    16
    decoding_mode: Mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    17
    /// Width to use for characters that can be interpreted either as narrow
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    18
    /// or wide depending on the context.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    19
    pub ambiguous_width: Width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    20
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    21
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    22
/// Character encoding supported as the user's local encoding.
///
/// Derives `PartialEq`/`Eq` so that callers and tests can compare values
/// directly instead of having to pattern-match.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// 7-bit ASCII.
    Ascii,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    28
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    29
/// Character decoding mode: what to do with input that is not valid in the
/// expected encoding.
///
/// Derives `PartialEq`/`Eq` so that callers and tests can compare values
/// directly instead of having to pattern-match.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    37
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    38
/// The display width of a Unicode character, in terminal columns.
///
/// Derives `PartialEq`/`Eq` so that callers and tests can compare values
/// directly instead of having to pattern-match.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    46
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    47
impl Default for Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    48
    fn default() -> Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    49
        Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    50
            local_encoding: Encoding::Utf8,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    51
            decoding_mode: Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    52
            ambiguous_width: Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    53
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    54
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    55
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    56
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    57
impl Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    58
    /// Creates an encoder from environment variables.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    59
    pub fn from_env() -> Result<Self, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    60
        let default = Encoder::default();
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    61
        let local_encoding = match std::env::var_os("HGENCODING") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    62
            None => default.local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    63
            Some(s)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    64
                if s.eq_ignore_ascii_case("utf-8")
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    65
                    || s.eq_ignore_ascii_case("utf8") =>
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    66
            {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    67
                Encoding::Utf8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    68
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    69
            Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    70
            Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    71
                return Err(HgError::unsupported(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    72
                    "HGENCODING value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    73
                    s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    74
                )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    75
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    76
        };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    77
        let decoding_mode = match std::env::var_os("HGENCODINGMODE") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    78
            None => default.decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    79
            Some(s) if s == "strict" => Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    80
            Some(s) if s == "replace" => Mode::Replace,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    81
            Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    82
                return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    83
                    "HGENCODINGMODE value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    84
                    s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    85
                )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    86
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    87
        };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    88
        let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    89
            None => default.ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    90
            Some(s) if s == "narrow" => Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    91
            Some(s) if s == "wide" => Width::Wide,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    92
            Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    93
                return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    94
                    "HGENCODINGAMBIGUOUS value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    95
                    s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    96
                )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    97
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    98
        };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    99
        Ok(Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   100
            local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   101
            decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   102
            ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   103
        })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   104
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   105
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   106
    /// Decodes an internal UTF-8 string from bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   107
    pub fn decode_internal<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   108
        &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   109
        bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   110
    ) -> Result<&'a str, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   111
        decode_utf8(bytes).map_err(HgError::corrupted)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   112
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   113
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   114
    /// Converts a string from internal UTF-8 to the local character encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   115
    pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   116
        match self.local_encoding {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   117
            Encoding::Utf8 => Cow::Borrowed(str.as_bytes()),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   118
            Encoding::Ascii => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   119
                if str.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   120
                    Cow::Borrowed(str.as_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   121
                } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   122
                    Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   123
                }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   124
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   125
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   126
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   127
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   128
    /// Converts a string from the local character encoding to UTF-8.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   129
    pub fn from_local<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   130
        &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   131
        bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   132
    ) -> Result<Cow<'a, str>, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   133
        match (self.local_encoding, self.decoding_mode) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   134
            (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   135
                decode_utf8(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   136
            )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   137
            (Encoding::Utf8, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   138
                Ok(String::from_utf8_lossy(bytes))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   139
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   140
            (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   141
                decode_ascii(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   142
            )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   143
            (Encoding::Ascii, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   144
                Ok(Cow::Owned(bytes_to_ascii_lossy(bytes)))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   145
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   146
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   147
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   148
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   149
    /// Returns the column width of a string for display.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   150
    pub fn column_width(&self, str: &str) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   151
        match self.ambiguous_width {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   152
            Width::Narrow => str.width(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   153
            Width::Wide => str.width_cjk(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   154
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   155
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   156
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   157
    /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   158
    /// just returns the length in bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   159
    pub fn column_width_bytes(&self, bytes: &[u8]) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   160
        match str::from_utf8(bytes) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   161
            Ok(str) => self.column_width(str),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   162
            Err(_) => bytes.len(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   163
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   164
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   165
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   166
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   167
/// Decodes bytes as UTF-8 or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   168
fn decode_utf8(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   169
    str::from_utf8(bytes).map_err(|err| {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   170
        format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   171
            "invalid UTF-8 at offset {}: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   172
            err.valid_up_to(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   173
            str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   174
        )
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   175
    })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   176
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   177
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   178
/// Decodes bytes as ASCII or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   179
fn decode_ascii(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   180
    // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   181
    if bytes.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   182
        // Safety: Just checked that it's ASCII.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   183
        let str = unsafe { str::from_utf8_unchecked(bytes) };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   184
        Ok(str)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   185
    } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   186
        Err(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   187
            "invalid ASCII: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   188
            str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   189
        ))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   190
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   191
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   192
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   193
/// Replaces all non-ASCII codepoints with '?'.
///
/// ASCII codepoints are copied through unchanged, so the result contains
/// exactly one ASCII character per input codepoint.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    // Every codepoint produces a single output byte, so the input's byte
    // length is an upper bound on the output size. Reserving it up front
    // avoids repeated reallocation while pushing.
    let mut ascii = String::with_capacity(str.len());
    ascii.extend(
        str.chars()
            .map(|char| if char.is_ascii() { char } else { '?' }),
    );
    ascii
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   201
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   202
/// Replaces all non-ASCII bytes with '?'.
///
/// ASCII bytes are copied through unchanged, so the result has exactly as
/// many characters (and bytes) as the input has bytes.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    // The output length equals the input length, so allocate once.
    let mut ascii = String::with_capacity(bytes.len());
    ascii.extend(
        bytes
            .iter()
            .map(|&b| if b.is_ascii() { char::from(b) } else { '?' }),
    );
    ascii
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   210
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   211
#[cfg(test)]
mod tests {
    use super::*;

    /// `decode_internal` accepts valid UTF-8 and reports a truncated
    /// sequence as a `CorruptedRepository` error whose message carries the
    /// failing offset and the escaped input.
    #[test]
    fn test_decode_internal() {
        let encoder = Encoder::default();
        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
        match encoder.decode_internal(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::CorruptedRepository(message)) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected a CorruptedRepository error"),
        }
    }

    /// With the default encoder, `to_local` hands back the UTF-8 bytes of
    /// the input string.
    #[test]
    fn test_to_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
    }

    /// With the default encoder, `from_local` accepts valid UTF-8 and
    /// reports a truncated sequence as an `Abort` error (not
    /// `CorruptedRepository`, unlike `decode_internal`).
    #[test]
    fn test_from_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            // The arm above expects `Abort`, so the failure message must
            // say so.
            Err(_) => panic!("expected an Abort error"),
        }
    }

    /// In `Mode::Replace`, undecodable input becomes U+FFFD instead of
    /// producing an error.
    #[test]
    fn test_from_local_replace() {
        let encoder = Encoder {
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
    }

    /// Column width counts displayed columns rather than bytes: accented
    /// letters take one column each and the U+1F496 emoji takes two.
    #[test]
    fn test_column_width() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width(""), 0);
        assert_eq!(encoder.column_width("a"), 1);
        assert_eq!(encoder.column_width("ab"), 2);
        assert_eq!(encoder.column_width("été"), 3);
        assert_eq!(encoder.column_width("\u{1f496}"), 2);
    }

    /// The width reported for U+2606 follows the encoder's
    /// `ambiguous_width` setting: one column when narrow, two when wide.
    #[test]
    fn test_column_width_ambiguous() {
        let narrow_encoder = Encoder {
            ambiguous_width: Width::Narrow,
            ..Default::default()
        };
        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);

        let wide_encoder = Encoder {
            ambiguous_width: Width::Wide,
            ..Default::default()
        };
        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
    }

    /// `column_width_bytes` measures raw bytes, and still returns a width
    /// (2 here) for input that is not valid UTF-8.
    #[test]
    fn test_column_width_bytes() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width_bytes(b""), 0);
        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
    }
}