Mercurial > public > mercurial-scm > hg
comparison mercurial/encoding.py @ 45942:89a2afe31e82
formatting: upgrade to black 20.8b1
This required a couple of small tweaks to un-confuse black, but now it
works. Big formatting changes come from:
* Dramatically improved collection-splitting logic upstream
* Black having a strong (correct IMO) opinion that """ is better than '''
Differential Revision: https://phab.mercurial-scm.org/D9430
author | Augie Fackler <raf@durin42.com> |
---|---|
date | Fri, 27 Nov 2020 17:03:29 -0500 |
parents | a736ab681b78 |
children | 3dfebba99ef6 |
comparison
equal
deleted
inserted
replaced
45941:346af7687c6f | 45942:89a2afe31e82 |
---|---|
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") | 111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") |
112 fallbackencoding = b'ISO-8859-1' | 112 fallbackencoding = b'ISO-8859-1' |
113 | 113 |
114 | 114 |
115 class localstr(bytes): | 115 class localstr(bytes): |
116 '''This class allows strings that are unmodified to be | 116 """This class allows strings that are unmodified to be |
117 round-tripped to the local encoding and back''' | 117 round-tripped to the local encoding and back""" |
118 | 118 |
119 def __new__(cls, u, l): | 119 def __new__(cls, u, l): |
120 s = bytes.__new__(cls, l) | 120 s = bytes.__new__(cls, l) |
121 s._utf8 = u | 121 s._utf8 = u |
122 return s | 122 return s |
327 return len(d) | 327 return len(d) |
328 | 328 |
329 | 329 |
330 def getcols(s, start, c): | 330 def getcols(s, start, c): |
331 # type: (bytes, int, int) -> bytes | 331 # type: (bytes, int, int) -> bytes |
332 '''Use colwidth to find a c-column substring of s starting at byte | 332 """Use colwidth to find a c-column substring of s starting at byte |
333 index start''' | 333 index start""" |
334 for x in pycompat.xrange(start + c, len(s)): | 334 for x in pycompat.xrange(start + c, len(s)): |
335 t = s[start:x] | 335 t = s[start:x] |
336 if colwidth(t) == c: | 336 if colwidth(t) == c: |
337 return t | 337 return t |
338 raise ValueError('substring not found') | 338 raise ValueError('substring not found') |
485 except LookupError as k: | 485 except LookupError as k: |
486 raise error.Abort(k, hint=b"please check your locale settings") | 486 raise error.Abort(k, hint=b"please check your locale settings") |
487 | 487 |
488 | 488 |
489 class normcasespecs(object): | 489 class normcasespecs(object): |
490 '''what a platform's normcase does to ASCII strings | 490 """what a platform's normcase does to ASCII strings |
491 | 491 |
492 This is specified per platform, and should be consistent with what normcase | 492 This is specified per platform, and should be consistent with what normcase |
493 on that platform actually does. | 493 on that platform actually does. |
494 | 494 |
495 lower: normcase lowercases ASCII strings | 495 lower: normcase lowercases ASCII strings |
496 upper: normcase uppercases ASCII strings | 496 upper: normcase uppercases ASCII strings |
497 other: the fallback function should always be called | 497 other: the fallback function should always be called |
498 | 498 |
499 This should be kept in sync with normcase_spec in util.h.''' | 499 This should be kept in sync with normcase_spec in util.h.""" |
500 | 500 |
501 lower = -1 | 501 lower = -1 |
502 upper = 1 | 502 upper = 1 |
503 other = 0 | 503 other = 0 |
504 | 504 |
505 | 505 |
506 def jsonescape(s, paranoid=False): | 506 def jsonescape(s, paranoid=False): |
507 # type: (Any, Any) -> Any | 507 # type: (Any, Any) -> Any |
508 '''returns a string suitable for JSON | 508 """returns a string suitable for JSON |
509 | 509 |
510 JSON is problematic for us because it doesn't support non-Unicode | 510 JSON is problematic for us because it doesn't support non-Unicode |
511 bytes. To deal with this, we take the following approach: | 511 bytes. To deal with this, we take the following approach: |
512 | 512 |
513 - localstr/safelocalstr objects are converted back to UTF-8 | 513 - localstr/safelocalstr objects are converted back to UTF-8 |
545 'utf-8: caf\\\\u00e9' | 545 'utf-8: caf\\\\u00e9' |
546 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) | 546 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
547 'non-BMP: \\\\ud834\\\\udd1e' | 547 'non-BMP: \\\\ud834\\\\udd1e' |
548 >>> jsonescape(b'<foo@example.org>', paranoid=True) | 548 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
549 '\\\\u003cfoo@example.org\\\\u003e' | 549 '\\\\u003cfoo@example.org\\\\u003e' |
550 ''' | 550 """ |
551 | 551 |
552 u8chars = toutf8b(s) | 552 u8chars = toutf8b(s) |
553 try: | 553 try: |
554 return _jsonescapeu8fast(u8chars, paranoid) | 554 return _jsonescapeu8fast(u8chars, paranoid) |
555 except ValueError: | 555 except ValueError: |
567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] | 567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
568 | 568 |
569 | 569 |
570 def getutf8char(s, pos): | 570 def getutf8char(s, pos): |
571 # type: (bytes, int) -> bytes | 571 # type: (bytes, int) -> bytes |
572 '''get the next full utf-8 character in the given string, starting at pos | 572 """get the next full utf-8 character in the given string, starting at pos |
573 | 573 |
574 Raises a UnicodeError if the given location does not start a valid | 574 Raises a UnicodeError if the given location does not start a valid |
575 utf-8 character. | 575 utf-8 character. |
576 ''' | 576 """ |
577 | 577 |
578 # find how many bytes to attempt decoding from first nibble | 578 # find how many bytes to attempt decoding from first nibble |
579 l = _utf8len[ord(s[pos : pos + 1]) >> 4] | 579 l = _utf8len[ord(s[pos : pos + 1]) >> 4] |
580 if not l: # ascii | 580 if not l: # ascii |
581 return s[pos : pos + 1] | 581 return s[pos : pos + 1] |
586 return c | 586 return c |
587 | 587 |
588 | 588 |
589 def toutf8b(s): | 589 def toutf8b(s): |
590 # type: (bytes) -> bytes | 590 # type: (bytes) -> bytes |
591 '''convert a local, possibly-binary string into UTF-8b | 591 """convert a local, possibly-binary string into UTF-8b |
592 | 592 |
593 This is intended as a generic method to preserve data when working | 593 This is intended as a generic method to preserve data when working |
594 with schemes like JSON and XML that have no provision for | 594 with schemes like JSON and XML that have no provision for |
595 arbitrary byte strings. As Mercurial often doesn't know | 595 arbitrary byte strings. As Mercurial often doesn't know |
596 what encoding data is in, we use so-called UTF-8b. | 596 what encoding data is in, we use so-called UTF-8b. |
614 | 614 |
615 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and | 615 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
616 arbitrary bytes into an internal Unicode format that can be | 616 arbitrary bytes into an internal Unicode format that can be |
617 re-encoded back into the original. Here we are exposing the | 617 re-encoded back into the original. Here we are exposing the |
618 internal surrogate encoding as a UTF-8 string.) | 618 internal surrogate encoding as a UTF-8 string.) |
619 ''' | 619 """ |
620 | 620 |
621 if isinstance(s, localstr): | 621 if isinstance(s, localstr): |
622 # assume that the original UTF-8 sequence would never contain | 622 # assume that the original UTF-8 sequence would never contain |
623 # invalid characters in U+DCxx range | 623 # invalid characters in U+DCxx range |
624 return s._utf8 | 624 return s._utf8 |
655 return r | 655 return r |
656 | 656 |
657 | 657 |
658 def fromutf8b(s): | 658 def fromutf8b(s): |
659 # type: (bytes) -> bytes | 659 # type: (bytes) -> bytes |
660 '''Given a UTF-8b string, return a local, possibly-binary string. | 660 """Given a UTF-8b string, return a local, possibly-binary string. |
661 | 661 |
662 return the original binary string. This | 662 return the original binary string. This |
663 is a round-trip process for strings like filenames, but metadata | 663 is a round-trip process for strings like filenames, but metadata |
664 that was passed through tolocal will remain in UTF-8. | 664 that was passed through tolocal will remain in UTF-8. |
665 | 665 |
675 True | 675 True |
676 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") | 676 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") |
677 True | 677 True |
678 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") | 678 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") |
679 True | 679 True |
680 ''' | 680 """ |
681 | 681 |
682 if isasciistr(s): | 682 if isasciistr(s): |
683 return s | 683 return s |
684 # fast path - look for uDxxx prefixes in s | 684 # fast path - look for uDxxx prefixes in s |
685 if b"\xed" not in s: | 685 if b"\xed" not in s: |