diff -r 443029011990 -r 7acec9408e1c mercurial/encoding.py
--- a/mercurial/encoding.py	Sun Jul 01 23:36:53 2018 +0900
+++ b/mercurial/encoding.py	Thu Jul 19 13:55:54 2018 -0400
@@ -98,6 +98,16 @@
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space
 
+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
@@ -145,7 +155,7 @@
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
@@ -154,7 +164,7 @@
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
@@ -407,7 +417,7 @@
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:
 
-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
@@ -500,6 +510,7 @@
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help
 
@@ -509,11 +520,17 @@
     internal surrogate encoding as a UTF-8 string.)
     '''
 
-    if not isinstance(s, localstr) and isasciistr(s):
+    if isinstance(s, localstr):
+        # assume that the original UTF-8 sequence would never contain
+        # invalid characters in U+DCxx range
+        return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
+    elif isasciistr(s):
         return s
     if "\xed" not in s:
-        if isinstance(s, localstr):
-            return s._utf8
         try:
             s.decode('utf-8', _utf8strict)
             return s
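
For readers unfamiliar with the tagging scheme, the standalone sketch below (not part of the patch, and not Mercurial's actual module) mimics what tolocal() and toutf8b() do after this change: tolocal() tags its result as safelocalstr when the round-trip through the local encoding is lossless, and toutf8b() uses that tag to re-encode without touching the UTF-8b surrogate path. The encoding = 'latin-1' setting and the simplified function bodies are assumptions for illustration; the real functions also handle the ASCII fast path, the fallback encoding, and the surrogate-escape branch omitted here.

    # Minimal illustration only; names mirror the patch but bodies are simplified.
    encoding = 'latin-1'          # assumed local encoding, for illustration only

    class localstr(bytes):
        '''Local bytes carrying a cached UTF-8 original (lossy conversion).'''
        def __new__(cls, u, l):
            s = bytes.__new__(cls, l)
            s._utf8 = u
            return s
        def __hash__(self):
            return hash(self._utf8)  # avoid collisions in local string space

    class safelocalstr(bytes):
        '''Local bytes known to round-trip to UTF-8 without loss.'''

    def tolocal(s):
        '''Convert internal UTF-8 bytes to the local encoding, tagging the result.'''
        u = s.decode('utf-8')
        r = u.encode(encoding, 'replace')
        if u == r.decode(encoding):
            return safelocalstr(r)   # non-lossy: safe to re-encode later
        return localstr(s, r)        # lossy: keep the UTF-8 original alongside

    def toutf8b(s):
        '''Recover UTF-8 from a local string, using the tag when available.'''
        if isinstance(s, localstr):
            return s._utf8                             # cached UTF-8 original
        if isinstance(s, safelocalstr):
            return s.decode(encoding).encode('utf-8')  # verified lossless
        return s  # the real toutf8b falls through to the UTF-8b surrogate path

    safe = tolocal(u'caf\u00e9'.encode('utf-8'))            # representable in latin-1
    lossy = tolocal(u'\u30c6\u30b9\u30c8'.encode('utf-8'))  # not representable
    assert type(safe) is safelocalstr
    assert type(lossy) is localstr
    assert toutf8b(safe) == u'caf\u00e9'.encode('utf-8')
    assert toutf8b(lossy) == u'\u30c6\u30b9\u30c8'.encode('utf-8')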