Mercurial > public > mercurial-scm > hg-stable
diff mercurial/encoding.py @ 34225:aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Encoding or decoding surrogate code points such as U+DCxx as UTF-8 is disallowed by the default 'strict' error handler on Python 3.
https://docs.python.org/3/library/codecs.html#error-handlers
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sat, 16 Sep 2017 22:55:48 +0900 |
parents | 1c601df9894c |
children | 3696efeab66f |
line wrap: on
line diff
--- a/mercurial/encoding.py Sat Sep 16 22:42:19 2017 +0900
+++ b/mercurial/encoding.py Sat Sep 16 22:55:48 2017 +0900
@@ -448,6 +448,13 @@
         pass
     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
 
+# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+# bytes are mapped to that range.
+if pycompat.ispy3:
+    _utf8strict = r'surrogatepass'
+else:
+    _utf8strict = r'strict'
+
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
 def getutf8char(s, pos):
@@ -464,7 +471,7 @@
 
     c = s[pos:pos + l]
     # validate with attempted decode
-    c.decode("utf-8")
+    c.decode("utf-8", _utf8strict)
     return c
 
 def toutf8b(s):
@@ -503,7 +510,7 @@
     if isinstance(s, localstr):
         return s._utf8
     try:
-        s.decode('utf-8')
+        s.decode('utf-8', _utf8strict)
         return s
     except UnicodeDecodeError:
         pass
@@ -517,12 +524,12 @@
             c = getutf8char(s, pos)
             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                 # have to re-escape existing U+DCxx characters
-                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                 pos += 1
             else:
                 pos += len(c)
         except UnicodeDecodeError:
-            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
             pos += 1
         r += c
     return r
@@ -570,7 +577,7 @@
         pos += len(c)
         # unescape U+DCxx characters
         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
-            c = chr(ord(c.decode("utf-8")) & 0xff)
+            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
         r += c
     return r
 