Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/encoding.py @ 50434:95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Apparently the code uses "+=" to append to a bytes object. Each such append
copies the whole buffer and is therefore linear-time, so the whole encoding is
quadratic-time. This patch makes us use a bytearray object instead, which has
an amortized constant-time append operation.
The encoding is still not particularly fast, but at least a 10MB file
takes tens of seconds, not many hours, to encode.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Mon, 06 Mar 2023 11:27:57 +0000 |
parents | d44e3c45f0e4 |
children | 18c8c18993f0 |
comparison
equal
deleted
inserted
replaced
50433:bcf54837241d | 50434:95acba2c29f6 |
---|---|
655 return s | 655 return s |
656 except UnicodeDecodeError: | 656 except UnicodeDecodeError: |
657 pass | 657 pass |
658 | 658 |
659 s = pycompat.bytestr(s) | 659 s = pycompat.bytestr(s) |
660 r = b"" | 660 r = bytearray() |
661 pos = 0 | 661 pos = 0 |
662 l = len(s) | 662 l = len(s) |
663 while pos < l: | 663 while pos < l: |
664 try: | 664 try: |
665 c = getutf8char(s, pos) | 665 c = getutf8char(s, pos) |
671 pos += len(c) | 671 pos += len(c) |
672 except UnicodeDecodeError: | 672 except UnicodeDecodeError: |
673 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) | 673 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
674 pos += 1 | 674 pos += 1 |
675 r += c | 675 r += c |
676 return r | 676 return bytes(r) |
677 | 677 |
678 | 678 |
679 def fromutf8b(s): | 679 def fromutf8b(s): |
680 # type: (bytes) -> bytes | 680 # type: (bytes) -> bytes |
681 """Given a UTF-8b string, return a local, possibly-binary string. | 681 """Given a UTF-8b string, return a local, possibly-binary string. |
710 # use UTF-16 internally (issue5031) which causes non-BMP code | 710 # use UTF-16 internally (issue5031) which causes non-BMP code |
711 # points to be escaped. Instead, we use our handy getutf8char | 711 # points to be escaped. Instead, we use our handy getutf8char |
712 # helper again to walk the string without "decoding" it. | 712 # helper again to walk the string without "decoding" it. |
713 | 713 |
714 s = pycompat.bytestr(s) | 714 s = pycompat.bytestr(s) |
715 r = b"" | 715 r = bytearray() |
716 pos = 0 | 716 pos = 0 |
717 l = len(s) | 717 l = len(s) |
718 while pos < l: | 718 while pos < l: |
719 c = getutf8char(s, pos) | 719 c = getutf8char(s, pos) |
720 pos += len(c) | 720 pos += len(c) |
721 # unescape U+DCxx characters | 721 # unescape U+DCxx characters |
722 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": | 722 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
723 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) | 723 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) |
724 r += c | 724 r += c |
725 return r | 725 return bytes(r) |