comparison mercurial/encoding.py @ 37991:3ea3c96ada54

encoding: introduce tagging type for non-lossy non-ASCII string. This fixes the weird behavior of toutf8b(), which would convert a local string back to UTF-8 *only if* it was lossy in the system encoding. Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded losslessly (issue2763)", all local strings were wrapped by the localstr class. I think this would justify the round-trip behavior of toutf8b(). ASCII strings are special-cased, so the cost of wrapping with safelocalstr is negligible. (with mercurial repo) $ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1 $ hg log --time --config experimental.evolution=all > /dev/null (original) time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000) time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000) time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000) (this patch) time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000) time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000) time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
author Yuya Nishihara <yuya@tcha.org>
date Sun, 23 Apr 2017 13:15:30 +0900
parents 57b0c7221dba
children 7acec9408e1c
comparison
equal deleted inserted replaced
37990:57b0c7221dba 37991:3ea3c96ada54
90 s = bytes.__new__(cls, l) 90 s = bytes.__new__(cls, l)
91 s._utf8 = u 91 s._utf8 = u
92 return s 92 return s
93 def __hash__(self): 93 def __hash__(self):
94 return hash(self._utf8) # avoid collisions in local string space 94 return hash(self._utf8) # avoid collisions in local string space
95
96 class safelocalstr(bytes):
97 """Tagged string denoting it was previously an internal UTF-8 string,
98 and can be converted back to UTF-8 losslessly
99
100 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
101 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
102 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
103 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
104 """
95 105
96 def tolocal(s): 106 def tolocal(s):
97 """ 107 """
98 Convert a string from internal UTF-8 to local encoding 108 Convert a string from internal UTF-8 to local encoding
99 109
138 # fast path 148 # fast path
139 return s 149 return s
140 r = u.encode(_sysstr(encoding), u"replace") 150 r = u.encode(_sysstr(encoding), u"replace")
141 if u == r.decode(_sysstr(encoding)): 151 if u == r.decode(_sysstr(encoding)):
142 # r is a safe, non-lossy encoding of s 152 # r is a safe, non-lossy encoding of s
143 return r 153 return safelocalstr(r)
144 return localstr(s, r) 154 return localstr(s, r)
145 except UnicodeDecodeError: 155 except UnicodeDecodeError:
146 # we should only get here if we're looking at an ancient changeset 156 # we should only get here if we're looking at an ancient changeset
147 try: 157 try:
148 u = s.decode(_sysstr(fallbackencoding)) 158 u = s.decode(_sysstr(fallbackencoding))
149 r = u.encode(_sysstr(encoding), u"replace") 159 r = u.encode(_sysstr(encoding), u"replace")
150 if u == r.decode(_sysstr(encoding)): 160 if u == r.decode(_sysstr(encoding)):
151 # r is a safe, non-lossy encoding of s 161 # r is a safe, non-lossy encoding of s
152 return r 162 return safelocalstr(r)
153 return localstr(u.encode('UTF-8'), r) 163 return localstr(u.encode('UTF-8'), r)
154 except UnicodeDecodeError: 164 except UnicodeDecodeError:
155 u = s.decode("utf-8", "replace") # last ditch 165 u = s.decode("utf-8", "replace") # last ditch
156 # can't round-trip 166 # can't round-trip
157 return u.encode(_sysstr(encoding), u"replace") 167 return u.encode(_sysstr(encoding), u"replace")
400 '''returns a string suitable for JSON 410 '''returns a string suitable for JSON
401 411
402 JSON is problematic for us because it doesn't support non-Unicode 412 JSON is problematic for us because it doesn't support non-Unicode
403 bytes. To deal with this, we take the following approach: 413 bytes. To deal with this, we take the following approach:
404 414
405 - localstr objects are converted back to UTF-8 415 - localstr/safelocalstr objects are converted back to UTF-8
406 - valid UTF-8/ASCII strings are passed as-is 416 - valid UTF-8/ASCII strings are passed as-is
407 - other strings are converted to UTF-8b surrogate encoding 417 - other strings are converted to UTF-8b surrogate encoding
408 - apply JSON-specified string escaping 418 - apply JSON-specified string escaping
409 419
410 (escapes are doubled in these tests) 420 (escapes are doubled in these tests)
493 - filenames and file contents in arbitrary other encodings can 503 - filenames and file contents in arbitrary other encodings can
494 be round-tripped or recovered by clueful clients 504 be round-tripped or recovered by clueful clients
495 - local strings that have a cached known UTF-8 encoding (aka 505 - local strings that have a cached known UTF-8 encoding (aka
496 localstr) get sent as UTF-8 so Unicode-oriented clients get the 506 localstr) get sent as UTF-8 so Unicode-oriented clients get the
497 Unicode data they want 507 Unicode data they want
508 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
498 - because we must preserve UTF-8 bytestring in places such as 509 - because we must preserve UTF-8 bytestring in places such as
499 filenames, metadata can't be roundtripped without help 510 filenames, metadata can't be roundtripped without help
500 511
501 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and 512 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
502 arbitrary bytes into an internal Unicode format that can be 513 arbitrary bytes into an internal Unicode format that can be
506 517
507 if isinstance(s, localstr): 518 if isinstance(s, localstr):
508 # assume that the original UTF-8 sequence would never contain 519 # assume that the original UTF-8 sequence would never contain
509 # invalid characters in U+DCxx range 520 # invalid characters in U+DCxx range
510 return s._utf8 521 return s._utf8
522 elif isinstance(s, safelocalstr):
523 # already verified that s is non-lossy in legacy encoding, which
524 # shouldn't contain characters in U+DCxx range
525 return fromlocal(s)
511 elif isasciistr(s): 526 elif isasciistr(s):
512 return s 527 return s
513 if "\xed" not in s: 528 if "\xed" not in s:
514 try: 529 try:
515 s.decode('utf-8', _utf8strict) 530 s.decode('utf-8', _utf8strict)