rust/hg-core/src/encoding.rs
author Mitchell Kember <mkember@janestreet.com>
Thu, 16 Jan 2025 13:15:02 -0500
changeset 52760 94e2547e6f3d
parent 52756 bbf1c52252ae
permissions -rw-r--r--
rust: move code from utils to utils::strings This moves string-related functions in hg::utils into the recently added hg::utils::strings module.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
52756
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     1
//! Character transcoding support.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     2
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     3
use core::str;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     4
use std::borrow::Cow;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     5
52760
94e2547e6f3d rust: move code from utils to utils::strings
Mitchell Kember <mkember@janestreet.com>
parents: 52756
diff changeset
     6
use crate::{errors::HgError, utils::strings::Escaped};
52756
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     7
use unicode_width::UnicodeWidthStr as _;
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
     9
/// String encoder and decoder.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    10
#[derive(Copy, Clone, Debug)]
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    11
pub struct Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    12
    /// The user's local encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    13
    local_encoding: Encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    14
    /// What to do when decoding fails. (Encoding always uses
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    15
    /// `Mode::Replace`).
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    16
    decoding_mode: Mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    17
    /// Width to use for characters that can be interpreted either as narrow
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    18
    /// or wide depending on the context.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    19
    pub ambiguous_width: Width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    20
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    21
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    22
/// Character encoding.
///
/// Derives `PartialEq`/`Eq` so that values can be compared with `==` and
/// used in `assert_eq!`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// ASCII.
    Ascii,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    28
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    29
/// Character decoding mode.
///
/// Derives `PartialEq`/`Eq` so that values can be compared with `==` and
/// used in `assert_eq!`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Mode {
    /// Produce an error message for invalid characters.
    Strict,
    /// Replace invalid characters with a special character.
    Replace,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    37
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    38
/// The width of a Unicode character.
///
/// Derives `PartialEq`/`Eq` so that values (for example the public
/// `Encoder::ambiguous_width` field) can be compared with `==` and used in
/// `assert_eq!`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Width {
    /// Narrow, taking up 1 terminal column.
    Narrow,
    /// Wide, taking up 2 terminal columns.
    Wide,
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    46
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    47
impl Default for Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    48
    fn default() -> Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    49
        Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    50
            local_encoding: Encoding::Utf8,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    51
            decoding_mode: Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    52
            ambiguous_width: Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    53
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    54
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    55
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    56
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    57
impl Encoder {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    58
    /// Creates an encoder from environment variables.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    59
    pub fn from_env() -> Result<Self, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    60
        let default = Encoder::default();
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    61
        let local_encoding = match std::env::var_os("HGENCODING") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    62
            None => default.local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    63
            Some(s)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    64
                if s.eq_ignore_ascii_case("utf-8")
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    65
                    || s.eq_ignore_ascii_case("utf8") =>
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    66
            {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    67
                Encoding::Utf8
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    68
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    69
            Some(s) if s.eq_ignore_ascii_case("ascii") => Encoding::Ascii,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    70
            Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    71
                return Err(HgError::unsupported(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    72
                    "HGENCODING value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    73
                    s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    74
                )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    75
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    76
        };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    77
        let decoding_mode = match std::env::var_os("HGENCODINGMODE") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    78
            None => default.decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    79
            Some(s) if s == "strict" => Mode::Strict,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    80
            Some(s) if s == "replace" => Mode::Replace,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    81
            Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    82
                return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    83
                    "HGENCODINGMODE value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    84
                    s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    85
                )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    86
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    87
        };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    88
        let ambiguous_width = match std::env::var_os("HGENCODINGAMBIGUOUS") {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    89
            None => default.ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    90
            Some(s) if s == "narrow" => Width::Narrow,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    91
            Some(s) if s == "wide" => Width::Wide,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    92
            Some(s) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    93
                return Err(HgError::abort_simple(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    94
                    "HGENCODINGAMBIGUOUS value '{}' is not supported",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    95
                    s.to_string_lossy()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    96
                )))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    97
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    98
        };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
    99
        Ok(Self {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   100
            local_encoding,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   101
            decoding_mode,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   102
            ambiguous_width,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   103
        })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   104
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   105
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   106
    /// Decodes an internal UTF-8 string from bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   107
    pub fn decode_internal<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   108
        &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   109
        bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   110
    ) -> Result<&'a str, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   111
        decode_utf8(bytes).map_err(HgError::corrupted)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   112
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   113
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   114
    /// Converts a string from internal UTF-8 to the local character encoding.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   115
    pub fn to_local<'a>(&self, str: &'a str) -> Cow<'a, [u8]> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   116
        match self.local_encoding {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   117
            Encoding::Utf8 => Cow::Borrowed(str.as_bytes()),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   118
            Encoding::Ascii => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   119
                if str.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   120
                    Cow::Borrowed(str.as_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   121
                } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   122
                    Cow::Owned(codepoints_to_ascii_lossy(str).into_bytes())
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   123
                }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   124
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   125
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   126
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   127
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   128
    /// Converts a string from the local character encoding to UTF-8.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   129
    pub fn from_local<'a>(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   130
        &self,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   131
        bytes: &'a [u8],
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   132
    ) -> Result<Cow<'a, str>, HgError> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   133
        match (self.local_encoding, self.decoding_mode) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   134
            (Encoding::Utf8, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   135
                decode_utf8(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   136
            )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   137
            (Encoding::Utf8, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   138
                Ok(String::from_utf8_lossy(bytes))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   139
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   140
            (Encoding::Ascii, Mode::Strict) => Ok(Cow::Borrowed(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   141
                decode_ascii(bytes).map_err(HgError::abort_simple)?,
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   142
            )),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   143
            (Encoding::Ascii, Mode::Replace) => {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   144
                Ok(Cow::Owned(bytes_to_ascii_lossy(bytes)))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   145
            }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   146
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   147
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   148
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   149
    /// Returns the column width of a string for display.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   150
    pub fn column_width(&self, str: &str) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   151
        match self.ambiguous_width {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   152
            Width::Narrow => str.width(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   153
            Width::Wide => str.width_cjk(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   154
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   155
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   156
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   157
    /// Returns the column width if `bytes` can be decoded as UTF-8, otherwise
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   158
    /// just returns the length in bytes.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   159
    pub fn column_width_bytes(&self, bytes: &[u8]) -> usize {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   160
        match str::from_utf8(bytes) {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   161
            Ok(str) => self.column_width(str),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   162
            Err(_) => bytes.len(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   163
        }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   164
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   165
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   166
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   167
/// Decodes bytes as UTF-8 or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   168
fn decode_utf8(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   169
    str::from_utf8(bytes).map_err(|err| {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   170
        format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   171
            "invalid UTF-8 at offset {}: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   172
            err.valid_up_to(),
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   173
            str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   174
        )
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   175
    })
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   176
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   177
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   178
/// Decodes bytes as ASCII or returns a detailed error message.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   179
fn decode_ascii(bytes: &[u8]) -> Result<&str, String> {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   180
    // TODO: Use `as_ascii` https://github.com/rust-lang/rust/issues/110998
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   181
    if bytes.is_ascii() {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   182
        // Safety: Just checked that it's ASCII.
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   183
        let str = unsafe { str::from_utf8_unchecked(bytes) };
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   184
        Ok(str)
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   185
    } else {
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   186
        Err(format!(
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   187
            "invalid ASCII: \"{}\"",
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   188
            str::from_utf8(&bytes.escaped_bytes()).unwrap()
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   189
        ))
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   190
    }
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   191
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   192
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   193
/// Returns a copy of `str` in which every non-ASCII codepoint is replaced
/// by a single `?`.
fn codepoints_to_ascii_lossy(str: &str) -> String {
    str.chars()
        .map(|c| if c.is_ascii() { c } else { '?' })
        .collect()
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   201
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   202
/// Builds a string from `bytes` in which every non-ASCII byte is replaced
/// by `?`. The result has exactly one character per input byte.
fn bytes_to_ascii_lossy(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&byte| if byte.is_ascii() { char::from(byte) } else { '?' })
        .collect()
}
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   210
bbf1c52252ae rust: add encoding.rs
Mitchell Kember <mkember@janestreet.com>
parents:
diff changeset
   211
#[cfg(test)]
mod tests {
    use super::*;

    /// `decode_internal` accepts valid UTF-8 and reports invalid UTF-8 as a
    /// `CorruptedRepository` error carrying the offset and escaped bytes.
    #[test]
    fn test_decode_internal() {
        let encoder = Encoder::default();
        assert_eq!(encoder.decode_internal(b"").unwrap(), "");
        assert_eq!(encoder.decode_internal(b"\xc3\xa9").unwrap(), "é");
        match encoder.decode_internal(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::CorruptedRepository(message)) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            Err(_) => panic!("expected a CorruptedRepository error"),
        }
    }

    /// With the default encoder, `to_local` yields the UTF-8 bytes of the
    /// input string.
    #[test]
    fn test_to_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.to_local("").as_ref(), b"");
        assert_eq!(encoder.to_local("é").as_ref(), b"\xc3\xa9");
    }

    /// With the default encoder, `from_local` accepts valid UTF-8 and reports
    /// invalid input as an `Abort` error (not `CorruptedRepository`, unlike
    /// `decode_internal`).
    #[test]
    fn test_from_local() {
        let encoder = Encoder::default();
        assert_eq!(encoder.from_local(b"").unwrap(), "");
        assert_eq!(encoder.from_local(b"\xc3\xa9").unwrap(), "é");
        match encoder.from_local(b"A\xc3") {
            Ok(_) => panic!("expected an error"),
            Err(HgError::Abort { message, .. }) => {
                assert_eq!(message, "invalid UTF-8 at offset 1: \"A\\xc3\"")
            }
            // The arm above matches `Abort`, so that is the variant to name
            // when some other error kind shows up.
            Err(_) => panic!("expected an Abort error"),
        }
    }

    /// In `Mode::Replace`, `from_local` substitutes U+FFFD for the invalid
    /// byte instead of returning an error.
    #[test]
    fn test_from_local_replace() {
        let encoder = Encoder {
            decoding_mode: Mode::Replace,
            ..Default::default()
        };
        assert_eq!(encoder.from_local(b"A\xc3").unwrap(), "A\u{fffd}");
    }

    /// `column_width` counts display columns, not bytes: accented letters
    /// take one column and U+1F496 takes two.
    #[test]
    fn test_column_width() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width(""), 0);
        assert_eq!(encoder.column_width("a"), 1);
        assert_eq!(encoder.column_width("ab"), 2);
        assert_eq!(encoder.column_width("été"), 3);
        assert_eq!(encoder.column_width("\u{1f496}"), 2);
    }

    /// The `ambiguous_width` setting decides whether U+2606 is measured as
    /// one column (`Width::Narrow`) or two (`Width::Wide`).
    #[test]
    fn test_column_width_ambiguous() {
        let narrow_encoder = Encoder {
            ambiguous_width: Width::Narrow,
            ..Default::default()
        };
        assert_eq!(narrow_encoder.column_width("\u{2606}"), 1);

        let wide_encoder = Encoder {
            ambiguous_width: Width::Wide,
            ..Default::default()
        };
        assert_eq!(wide_encoder.column_width("\u{2606}"), 2);
    }

    /// `column_width_bytes` measures byte strings; the invalid sequence
    /// `A\xc3` is expected to occupy two columns rather than fail.
    #[test]
    fn test_column_width_bytes() {
        let encoder = Encoder::default();
        assert_eq!(encoder.column_width_bytes(b""), 0);
        assert_eq!(encoder.column_width_bytes("été".as_bytes()), 3);
        assert_eq!(encoder.column_width_bytes(b"A\xc3"), 2);
    }
}