Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/encoding.py @ 37991:3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
This fixes the weird behavior of toutf8b(), which would convert a local
string back to UTF-8 *only if* it was lossy in the system encoding.
Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded
losslessly (issue2763)", all local strings were wrapped by the localstr
class. I think this would justify the round-trip behavior of toutf8b().
ASCII strings are special-cased, so the cost of wrapping with safelocalstr
is negligible.
(with mercurial repo)
$ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
$ hg log --time --config experimental.evolution=all > /dev/null
(original)
time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)
(this patch)
time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 13:15:30 +0900 |
parents | 57b0c7221dba |
children | 7acec9408e1c |
comparison
equal
deleted
inserted
replaced
37990:57b0c7221dba | 37991:3ea3c96ada54 |
---|---|
90 s = bytes.__new__(cls, l) | 90 s = bytes.__new__(cls, l) |
91 s._utf8 = u | 91 s._utf8 = u |
92 return s | 92 return s |
93 def __hash__(self): | 93 def __hash__(self): |
94 return hash(self._utf8) # avoid collisions in local string space | 94 return hash(self._utf8) # avoid collisions in local string space |
95 | |
96 class safelocalstr(bytes): | |
97 """Tagged string denoting it was previously an internal UTF-8 string, | |
98 and can be converted back to UTF-8 losslessly | |
99 | |
100 >>> assert safelocalstr(b'\\xc3') == b'\\xc3' | |
101 >>> assert b'\\xc3' == safelocalstr(b'\\xc3') | |
102 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} | |
103 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} | |
104 """ | |
95 | 105 |
96 def tolocal(s): | 106 def tolocal(s): |
97 """ | 107 """ |
98 Convert a string from internal UTF-8 to local encoding | 108 Convert a string from internal UTF-8 to local encoding |
99 | 109 |
138 # fast path | 148 # fast path |
139 return s | 149 return s |
140 r = u.encode(_sysstr(encoding), u"replace") | 150 r = u.encode(_sysstr(encoding), u"replace") |
141 if u == r.decode(_sysstr(encoding)): | 151 if u == r.decode(_sysstr(encoding)): |
142 # r is a safe, non-lossy encoding of s | 152 # r is a safe, non-lossy encoding of s |
143 return r | 153 return safelocalstr(r) |
144 return localstr(s, r) | 154 return localstr(s, r) |
145 except UnicodeDecodeError: | 155 except UnicodeDecodeError: |
146 # we should only get here if we're looking at an ancient changeset | 156 # we should only get here if we're looking at an ancient changeset |
147 try: | 157 try: |
148 u = s.decode(_sysstr(fallbackencoding)) | 158 u = s.decode(_sysstr(fallbackencoding)) |
149 r = u.encode(_sysstr(encoding), u"replace") | 159 r = u.encode(_sysstr(encoding), u"replace") |
150 if u == r.decode(_sysstr(encoding)): | 160 if u == r.decode(_sysstr(encoding)): |
151 # r is a safe, non-lossy encoding of s | 161 # r is a safe, non-lossy encoding of s |
152 return r | 162 return safelocalstr(r) |
153 return localstr(u.encode('UTF-8'), r) | 163 return localstr(u.encode('UTF-8'), r) |
154 except UnicodeDecodeError: | 164 except UnicodeDecodeError: |
155 u = s.decode("utf-8", "replace") # last ditch | 165 u = s.decode("utf-8", "replace") # last ditch |
156 # can't round-trip | 166 # can't round-trip |
157 return u.encode(_sysstr(encoding), u"replace") | 167 return u.encode(_sysstr(encoding), u"replace") |
400 '''returns a string suitable for JSON | 410 '''returns a string suitable for JSON |
401 | 411 |
402 JSON is problematic for us because it doesn't support non-Unicode | 412 JSON is problematic for us because it doesn't support non-Unicode |
403 bytes. To deal with this, we take the following approach: | 413 bytes. To deal with this, we take the following approach: |
404 | 414 |
405 - localstr objects are converted back to UTF-8 | 415 - localstr/safelocalstr objects are converted back to UTF-8 |
406 - valid UTF-8/ASCII strings are passed as-is | 416 - valid UTF-8/ASCII strings are passed as-is |
407 - other strings are converted to UTF-8b surrogate encoding | 417 - other strings are converted to UTF-8b surrogate encoding |
408 - apply JSON-specified string escaping | 418 - apply JSON-specified string escaping |
409 | 419 |
410 (escapes are doubled in these tests) | 420 (escapes are doubled in these tests) |
493 - filenames and file contents in arbitrary other encodings can | 503 - filenames and file contents in arbitrary other encodings can |
494 be round-tripped or recovered by clueful clients | 504 be round-tripped or recovered by clueful clients |
495 - local strings that have a cached known UTF-8 encoding (aka | 505 - local strings that have a cached known UTF-8 encoding (aka |
496 localstr) get sent as UTF-8 so Unicode-oriented clients get the | 506 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
497 Unicode data they want | 507 Unicode data they want |
508 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well | |
498 - because we must preserve UTF-8 bytestring in places such as | 509 - because we must preserve UTF-8 bytestring in places such as |
499 filenames, metadata can't be roundtripped without help | 510 filenames, metadata can't be roundtripped without help |
500 | 511 |
501 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and | 512 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
502 arbitrary bytes into an internal Unicode format that can be | 513 arbitrary bytes into an internal Unicode format that can be |
506 | 517 |
507 if isinstance(s, localstr): | 518 if isinstance(s, localstr): |
508 # assume that the original UTF-8 sequence would never contain | 519 # assume that the original UTF-8 sequence would never contain |
509 # invalid characters in U+DCxx range | 520 # invalid characters in U+DCxx range |
510 return s._utf8 | 521 return s._utf8 |
522 elif isinstance(s, safelocalstr): | |
523 # already verified that s is non-lossy in legacy encoding, which | |
524 # shouldn't contain characters in U+DCxx range | |
525 return fromlocal(s) | |
511 elif isasciistr(s): | 526 elif isasciistr(s): |
512 return s | 527 return s |
513 if "\xed" not in s: | 528 if "\xed" not in s: |
514 try: | 529 try: |
515 s.decode('utf-8', _utf8strict) | 530 s.decode('utf-8', _utf8strict) |