@@ -90,10 +90,20 @@
         s = bytes.__new__(cls, l)
         s._utf8 = u
         return s
     def __hash__(self):
         return hash(self._utf8)  # avoid collisions in local string space
+
+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """

 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding

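Note on the new class: unlike localstr, which overrides __hash__ to hash its cached UTF-8 form, safelocalstr adds no state and no overrides, so it behaves exactly like the bytes it wraps (that is what the doctests above assert). A quick illustration of the difference, with the two classes in scope, using an illustrative latin-1 pair and barring an unlikely hash collision:

    >>> u, l = b'caf\xc3\xa9', b'caf\xe9'   # UTF-8 form, local (latin-1) form
    >>> l in {localstr(u, l): 0}            # hashed by the UTF-8 form, so no hit
    False
    >>> l in {safelocalstr(l): 0}           # hashes and compares like plain bytes
    True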
@@ -138,20 +148,20 @@
                 # fast path
                 return s
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
             try:
                 u = s.decode(_sysstr(fallbackencoding))
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
                 # can't round-trip
                 return u.encode(_sysstr(encoding), u"replace")
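The effect of this hunk is that tolocal() now distinguishes three return shapes: plain bytes on the ASCII/UTF-8 fast paths, safelocalstr when the local encoding round-trips losslessly, and localstr (carrying the cached UTF-8) when it does not. Illustration only; it assumes the detected local encoding is latin-1, which in practice depends on HGENCODING and the locale:

    >>> from mercurial import encoding
    >>> type(encoding.tolocal(b'ascii only')).__name__     # fast path
    'bytes'
    >>> type(encoding.tolocal(b'caf\xc3\xa9')).__name__    # u'café' fits in latin-1
    'safelocalstr'
    >>> type(encoding.tolocal(b'\xe2\x82\xac')).__name__   # u'€' does not fit
    'localstr'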
@@ -400,11 +410,11 @@
     '''returns a string suitable for JSON

     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:

-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping

     (escapes are doubled in these tests)
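The bullet list above is effectively a decision ladder. A minimal sketch of that ladder for reference only; the real code funnels everything through toutf8b() (which handles the tagged types itself, see the hunk further down) before escaping, and _escape_json_bytes here is a hypothetical stand-in for that final escaping step:

    def jsonish(s):
        # recover UTF-8 where it is known to be safe, otherwise fall back
        # to UTF-8b so arbitrary bytes survive the trip
        if isinstance(s, (localstr, safelocalstr)):
            u8 = fromlocal(s)    # cached/verified UTF-8, lossless by construction
        else:
            u8 = toutf8b(s)      # valid UTF-8/ASCII passes through,
                                 # other bytes land in the U+DCxx surrogate range
        return _escape_json_bytes(u8)  # hypothetical: JSON-specified escaping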
@@ -493,10 +503,11 @@
     - filenames and file contents in arbitrary other encodings can
       be round-tripped or recovered by clueful clients
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help

     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
     arbitrary bytes into an internal Unicode format that can be
@@ -506,10 +517,14 @@

     if isinstance(s, localstr):
         # assume that the original UTF-8 sequence would never contain
         # invalid characters in U+DCxx range
         return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
     elif isasciistr(s):
         return s
     if "\xed" not in s:
         try:
             s.decode('utf-8', _utf8strict)
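For what the surrogate mapping buys: bytes that cannot be decoded are parked in U+DC00..U+DCFF, so the original bytes remain recoverable later. A small check using the existing pair toutf8b()/fromutf8b(), independent of the local encoding since the input contains no decodable UTF-8:

    >>> from mercurial import encoding
    >>> raw = b'\x80\xfe'                      # not valid UTF-8
    >>> encoding.fromutf8b(encoding.toutf8b(raw)) == raw
    True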