mercurial-scm/hg: comparison mercurial/encoding.py

equal deleted inserted replaced

-:57b0c7221dba
+:3ea3c96ada54
 s = bytes.__new__(cls, l)
 s._utf8 = u
 return s
 def __hash__(self):
 return hash(self._utf8) # avoid collisions in local string space
+class safelocalstr(bytes):
+"""Tagged string denoting it was previously an internal UTF-8 string,
+and can be converted back to UTF-8 losslessly
+>>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+>>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+>>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+>>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+"""
 def tolocal(s):
 """
 Convert a string from internal UTF-8 to local encoding
 # fast path
 return s
 r = u.encode(_sysstr(encoding), u"replace")
 if u == r.decode(_sysstr(encoding)):
 # r is a safe, non-lossy encoding of s
-return r
+return safelocalstr(r)
 return localstr(s, r)
 except UnicodeDecodeError:
 # we should only get here if we're looking at an ancient changeset
 try:
 u = s.decode(_sysstr(fallbackencoding))
 r = u.encode(_sysstr(encoding), u"replace")
 if u == r.decode(_sysstr(encoding)):
 # r is a safe, non-lossy encoding of s
-return r
+return safelocalstr(r)
 return localstr(u.encode('UTF-8'), r)
 except UnicodeDecodeError:
 u = s.decode("utf-8", "replace") # last ditch
 # can't round-trip
 return u.encode(_sysstr(encoding), u"replace")
 '''returns a string suitable for JSON
 JSON is problematic for us because it doesn't support non-Unicode
 bytes. To deal with this, we take the following approach:
-- localstr objects are converted back to UTF-8
+- localstr/safelocalstr objects are converted back to UTF-8
 - valid UTF-8/ASCII strings are passed as-is
 - other strings are converted to UTF-8b surrogate encoding
 - apply JSON-specified string escaping
 (escapes are doubled in these tests)
 - filenames and file contents in arbitrary other encodings can
 be round-tripped or recovered by clueful clients
 - local strings that have a cached known UTF-8 encoding (aka
 localstr) get sent as UTF-8 so Unicode-oriented clients get the
 Unicode data they want
+- non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
 - because we must preserve UTF-8 bytestring in places such as
 filenames, metadata can't be roundtripped without help
 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
 arbitrary bytes into an internal Unicode format that can be
 if isinstance(s, localstr):
 # assume that the original UTF-8 sequence would never contain
 # invalid characters in U+DCxx range
 return s._utf8
+elif isinstance(s, safelocalstr):
+# already verified that s is non-lossy in legacy encoding, which
+# shouldn't contain characters in U+DCxx range
+return fromlocal(s)
 elif isasciistr(s):
 return s
 if "\xed" not in s:
 try:
 s.decode('utf-8', _utf8strict)

changeset 37947	3ea3c96ada54
parent 37946	57b0c7221dba
child 38739	7acec9408e1c