comparison mercurial/encoding.py @ 50434:95acba2c29f6

encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings. Apparently the code uses "+=" with a bytes object, which is linear-time for each append, so the whole encoding is quadratic-time. This patch makes us use a bytearray object instead, which has an amortized constant-time append operation. The encoding is still not particularly fast, but at least a 10MB file takes tens of seconds, not many hours, to encode.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Mon, 06 Mar 2023 11:27:57 +0000
parents d44e3c45f0e4
children 18c8c18993f0
comparison
equal deleted inserted replaced
50433:bcf54837241d 50434:95acba2c29f6
655 return s 655 return s
656 except UnicodeDecodeError: 656 except UnicodeDecodeError:
657 pass 657 pass
658 658
659 s = pycompat.bytestr(s) 659 s = pycompat.bytestr(s)
660 r = b"" 660 r = bytearray()
661 pos = 0 661 pos = 0
662 l = len(s) 662 l = len(s)
663 while pos < l: 663 while pos < l:
664 try: 664 try:
665 c = getutf8char(s, pos) 665 c = getutf8char(s, pos)
671 pos += len(c) 671 pos += len(c)
672 except UnicodeDecodeError: 672 except UnicodeDecodeError:
673 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) 673 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
674 pos += 1 674 pos += 1
675 r += c 675 r += c
676 return r 676 return bytes(r)
677 677
678 678
679 def fromutf8b(s): 679 def fromutf8b(s):
680 # type: (bytes) -> bytes 680 # type: (bytes) -> bytes
681 """Given a UTF-8b string, return a local, possibly-binary string. 681 """Given a UTF-8b string, return a local, possibly-binary string.
710 # use UTF-16 internally (issue5031) which causes non-BMP code 710 # use UTF-16 internally (issue5031) which causes non-BMP code
711 # points to be escaped. Instead, we use our handy getutf8char 711 # points to be escaped. Instead, we use our handy getutf8char
712 # helper again to walk the string without "decoding" it. 712 # helper again to walk the string without "decoding" it.
713 713
714 s = pycompat.bytestr(s) 714 s = pycompat.bytestr(s)
715 r = b"" 715 r = bytearray()
716 pos = 0 716 pos = 0
717 l = len(s) 717 l = len(s)
718 while pos < l: 718 while pos < l:
719 c = getutf8char(s, pos) 719 c = getutf8char(s, pos)
720 pos += len(c) 720 pos += len(c)
721 # unescape U+DCxx characters 721 # unescape U+DCxx characters
722 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": 722 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
723 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) 723 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
724 r += c 724 r += c
725 return r 725 return bytes(r)