Mercurial > public > mercurial-scm > hg
comparison mercurial/encoding.py @ 26879:a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
This is the final missing piece in fully round-tripping arbitrary byte
strings through UTF-8b. Because existing U+DCxx characters in the input
must be re-escaped, UTF-8 <-> UTF-8b isn't fully bijective, but we don't
expect to ever see U+DCxx codepoints in "real" UTF-8 data, so it should
remain bijective in practice.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Thu, 05 Nov 2015 17:30:10 -0600 |
parents | d7e83f106459 |
children | de5ae97ce9f4 |
comparison
equal
deleted
inserted
replaced
26878:d7e83f106459 | 26879:a24b98f4e03c |
---|---|
461 arbitrary bytes into an internal Unicode format that can be | 461 arbitrary bytes into an internal Unicode format that can be |
462 re-encoded back into the original. Here we are exposing the | 462 re-encoded back into the original. Here we are exposing the |
463 internal surrogate encoding as a UTF-8 string.) | 463 internal surrogate encoding as a UTF-8 string.) |
464 ''' | 464 ''' |
465 | 465 |
466 if isinstance(s, localstr): | 466 if "\xed" not in s: |
467 return s._utf8 | 467 if isinstance(s, localstr): |
468 | 468 return s._utf8 |
469 try: | 469 try: |
470 s.decode('utf-8') | 470 s.decode('utf-8') |
471 return s | 471 return s |
472 except UnicodeDecodeError: | 472 except UnicodeDecodeError: |
473 pass | 473 pass |
474 | 474 |
475 r = "" | 475 r = "" |
476 pos = 0 | 476 pos = 0 |
477 l = len(s) | 477 l = len(s) |
478 while pos < l: | 478 while pos < l: |
479 try: | 479 try: |
480 c = getutf8char(s, pos) | 480 c = getutf8char(s, pos) |
481 pos += len(c) | 481 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
482 # have to re-escape existing U+DCxx characters | |
483 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') | |
484 pos += 1 | |
485 else: | |
486 pos += len(c) | |
482 except UnicodeDecodeError: | 487 except UnicodeDecodeError: |
483 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') | 488 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
484 pos += 1 | 489 pos += 1 |
485 r += c | 490 r += c |
486 return r | 491 return r |