mercurial-scm/hg: comparison mercurial/encoding.py

equal deleted inserted replaced

-:5307cc57f271
+:aa877860d4d7
 return _jsonescapeu8fast(u8chars, paranoid)
 except ValueError:
 pass
 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
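+# UTF-8b maps each byte that is not valid UTF-8 to a lone surrogate code
+# point U+DCxx, so those code points must survive decode/encode unchanged.
+# Python 3's 'strict' error handler rejects surrogates in UTF-8, hence
+# 'surrogatepass' there; Python 2's 'strict' already lets them through.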
+if pycompat.ispy3:
+_utf8strict = r'surrogatepass'
+else:
+_utf8strict = r'strict'
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 def getutf8char(s, pos):
 '''get the next full utf-8 character in the given string, starting at pos
 if not l: # ascii
 return s[pos:pos + 1]
 c = s[pos:pos + l]
 # validate with attempted decode
-c.decode("utf-8")
+c.decode("utf-8", _utf8strict)
 return c
 def toutf8b(s):
 '''convert a local, possibly-binary string into UTF-8b
 return s
 if "\xed" not in s:
 if isinstance(s, localstr):
 return s._utf8
 try:
-s.decode('utf-8')
+s.decode('utf-8', _utf8strict)
 return s
 except UnicodeDecodeError:
 pass
 s = pycompat.bytestr(s)
 while pos < l:
 try:
 c = getutf8char(s, pos)
 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
 # have to re-escape existing U+DCxx characters
-c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
 pos += 1
 else:
 pos += len(c)
 except UnicodeDecodeError:
-c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
 pos += 1
 r += c
 return r
 def fromutf8b(s):
 while pos < l:
 c = getutf8char(s, pos)
 pos += len(c)
 # unescape U+DCxx characters
 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
-c = chr(ord(c.decode("utf-8")) & 0xff)
+c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
 r += c
 return r
 if pycompat.ispy3:
 class strio(io.TextIOWrapper):

changeset 34218	aa877860d4d7
parent 34216	1c601df9894c
child 36549	3696efeab66f