mercurial/encoding.py
changeset 34218 aa877860d4d7
parent 34216 1c601df9894c
child 36549 3696efeab66f
equal deleted inserted replaced
34217:5307cc57f271 34218:aa877860d4d7
   446         return _jsonescapeu8fast(u8chars, paranoid)
   446         return _jsonescapeu8fast(u8chars, paranoid)
   447     except ValueError:
   447     except ValueError:
   448         pass
   448         pass
   449     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
   449     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
   450 
   450 
       
   451 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
       
   452 # bytes are mapped to that range.
       
   453 if pycompat.ispy3:
       
   454     _utf8strict = r'surrogatepass'
       
   455 else:
       
   456     _utf8strict = r'strict'
       
   457 
   451 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   458 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   452 
   459 
   453 def getutf8char(s, pos):
   460 def getutf8char(s, pos):
   454     '''get the next full utf-8 character in the given string, starting at pos
   461     '''get the next full utf-8 character in the given string, starting at pos
   455 
   462 
   462     if not l: # ascii
   469     if not l: # ascii
   463         return s[pos:pos + 1]
   470         return s[pos:pos + 1]
   464 
   471 
   465     c = s[pos:pos + l]
   472     c = s[pos:pos + l]
   466     # validate with attempted decode
   473     # validate with attempted decode
   467     c.decode("utf-8")
   474     c.decode("utf-8", _utf8strict)
   468     return c
   475     return c
   469 
   476 
   470 def toutf8b(s):
   477 def toutf8b(s):
   471     '''convert a local, possibly-binary string into UTF-8b
   478     '''convert a local, possibly-binary string into UTF-8b
   472 
   479 
   501         return s
   508         return s
   502     if "\xed" not in s:
   509     if "\xed" not in s:
   503         if isinstance(s, localstr):
   510         if isinstance(s, localstr):
   504             return s._utf8
   511             return s._utf8
   505         try:
   512         try:
   506             s.decode('utf-8')
   513             s.decode('utf-8', _utf8strict)
   507             return s
   514             return s
   508         except UnicodeDecodeError:
   515         except UnicodeDecodeError:
   509             pass
   516             pass
   510 
   517 
   511     s = pycompat.bytestr(s)
   518     s = pycompat.bytestr(s)
   515     while pos < l:
   522     while pos < l:
   516         try:
   523         try:
   517             c = getutf8char(s, pos)
   524             c = getutf8char(s, pos)
   518             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   525             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   519                 # have to re-escape existing U+DCxx characters
   526                 # have to re-escape existing U+DCxx characters
   520                 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
   527                 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
   521                 pos += 1
   528                 pos += 1
   522             else:
   529             else:
   523                 pos += len(c)
   530                 pos += len(c)
   524         except UnicodeDecodeError:
   531         except UnicodeDecodeError:
   525             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
   532             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
   526             pos += 1
   533             pos += 1
   527         r += c
   534         r += c
   528     return r
   535     return r
   529 
   536 
   530 def fromutf8b(s):
   537 def fromutf8b(s):
   568     while pos < l:
   575     while pos < l:
   569         c = getutf8char(s, pos)
   576         c = getutf8char(s, pos)
   570         pos += len(c)
   577         pos += len(c)
   571         # unescape U+DCxx characters
   578         # unescape U+DCxx characters
   572         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   579         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   573             c = chr(ord(c.decode("utf-8")) & 0xff)
   580             c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
   574         r += c
   581         r += c
   575     return r
   582     return r
   576 
   583 
   577 if pycompat.ispy3:
   584 if pycompat.ispy3:
   578     class strio(io.TextIOWrapper):
   585     class strio(io.TextIOWrapper):