comparison mercurial/encoding.py @ 26878:d7e83f106459

encoding: use getutf8char in toutf8b. This correctly avoids the ambiguity of U+FFFD already present in the input, and similar confusion, by working one character at a time.
author Matt Mackall <mpm@selenic.com>
date Thu, 05 Nov 2015 17:21:43 -0600
parents cb467a9d7593
children a24b98f4e03c
comparison
equal deleted inserted replaced
26877:cb467a9d7593 26878:d7e83f106459
468 468
469 try: 469 try:
470 s.decode('utf-8') 470 s.decode('utf-8')
471 return s 471 return s
472 except UnicodeDecodeError: 472 except UnicodeDecodeError:
473 # surrogate-encode any characters that don't round-trip 473 pass
474 s2 = s.decode('utf-8', 'ignore').encode('utf-8') 474
475 r = "" 475 r = ""
476 pos = 0 476 pos = 0
477 for c in s: 477 l = len(s)
478 if s2[pos:pos + 1] == c: 478 while pos < l:
479 r += c 479 try:
480 pos += 1 480 c = getutf8char(s, pos)
481 else: 481 pos += len(c)
482 r += unichr(0xdc00 + ord(c)).encode('utf-8') 482 except UnicodeDecodeError:
483 return r 483 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
484 pos += 1
485 r += c
486 return r
484 487
485 def fromutf8b(s): 488 def fromutf8b(s):
486 '''Given a UTF-8b string, return a local, possibly-binary string. 489 '''Given a UTF-8b string, return a local, possibly-binary string.
487 490
488 return the original binary string. This 491 return the original binary string. This