Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/encoding.py @ 26878:d7e83f106459
encoding: use getutf8char in toutf8b
Working one character at a time correctly avoids the ambiguity that arises
when U+FFFD is already present in the input, as well as similar sources of confusion.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Thu, 05 Nov 2015 17:21:43 -0600 |
parents | cb467a9d7593 |
children | a24b98f4e03c |
comparison
equal
deleted
inserted
replaced
26877:cb467a9d7593 | 26878:d7e83f106459 |
---|---|
468 | 468 |
469 try: | 469 try: |
470 s.decode('utf-8') | 470 s.decode('utf-8') |
471 return s | 471 return s |
472 except UnicodeDecodeError: | 472 except UnicodeDecodeError: |
473 # surrogate-encode any characters that don't round-trip | 473 pass |
474 s2 = s.decode('utf-8', 'ignore').encode('utf-8') | 474 |
475 r = "" | 475 r = "" |
476 pos = 0 | 476 pos = 0 |
477 for c in s: | 477 l = len(s) |
478 if s2[pos:pos + 1] == c: | 478 while pos < l: |
479 r += c | 479 try: |
480 pos += 1 | 480 c = getutf8char(s, pos) |
481 else: | 481 pos += len(c) |
482 r += unichr(0xdc00 + ord(c)).encode('utf-8') | 482 except UnicodeDecodeError: |
483 return r | 483 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
484 pos += 1 | |
485 r += c | |
486 return r | |
484 | 487 |
485 def fromutf8b(s): | 488 def fromutf8b(s): |
486 '''Given a UTF-8b string, return a local, possibly-binary string. | 489 '''Given a UTF-8b string, return a local, possibly-binary string. |
487 | 490 |
488 return the original binary string. This | 491 return the original binary string. This |