Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/utils/stringutil.py @ 49724:bbbb5213d043
typing: add basic type hints to stringutil.py
author | Matt Harbison <matt_harbison@yahoo.com> |
---|---|
date | Fri, 04 Nov 2022 22:59:16 -0400 |
parents | d44e3c45f0e4 |
children | 9be765b82a90 |
comparison
equal
deleted
inserted
replaced
49723:2506c3ac73f4 | 49724:bbbb5213d043 |
---|---|
12 import codecs | 12 import codecs |
13 import re as remod | 13 import re as remod |
14 import textwrap | 14 import textwrap |
15 import types | 15 import types |
16 | 16 |
17 from typing import ( | |
18 Optional, | |
19 overload, | |
20 ) | |
21 | |
17 from ..i18n import _ | 22 from ..i18n import _ |
18 from ..thirdparty import attr | 23 from ..thirdparty import attr |
19 | 24 |
20 from .. import ( | 25 from .. import ( |
21 encoding, | 26 encoding, |
26 # regex special chars pulled from https://bugs.python.org/issue29995 | 31 # regex special chars pulled from https://bugs.python.org/issue29995 |
27 # which was part of Python 3.7. | 32 # which was part of Python 3.7. |
28 _respecial = pycompat.bytestr(b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f') | 33 _respecial = pycompat.bytestr(b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f') |
29 _regexescapemap = {ord(i): (b'\\' + i).decode('latin1') for i in _respecial} | 34 _regexescapemap = {ord(i): (b'\\' + i).decode('latin1') for i in _respecial} |
30 regexbytesescapemap = {i: (b'\\' + i) for i in _respecial} | 35 regexbytesescapemap = {i: (b'\\' + i) for i in _respecial} |
36 | |
37 | |
38 @overload | |
39 def reescape(pat: bytes) -> bytes: | |
40 ... | |
41 | |
42 | |
43 @overload | |
44 def reescape(pat: str) -> str: | |
45 ... | |
31 | 46 |
32 | 47 |
33 def reescape(pat): | 48 def reescape(pat): |
34 """Drop-in replacement for re.escape.""" | 49 """Drop-in replacement for re.escape.""" |
35 # NOTE: it is intentional that this works on unicodes and not | 50 # NOTE: it is intentional that this works on unicodes and not |
43 if wantuni: | 58 if wantuni: |
44 return pat | 59 return pat |
45 return pat.encode('latin1') | 60 return pat.encode('latin1') |
46 | 61 |
47 | 62 |
48 def pprint(o, bprefix=False, indent=0, level=0): | 63 def pprint(o, bprefix: bool = False, indent: int = 0, level: int = 0) -> bytes: |
49 """Pretty print an object.""" | 64 """Pretty print an object.""" |
50 return b''.join(pprintgen(o, bprefix=bprefix, indent=indent, level=level)) | 65 return b''.join(pprintgen(o, bprefix=bprefix, indent=indent, level=level)) |
51 | 66 |
52 | 67 |
53 def pprintgen(o, bprefix=False, indent=0, level=0): | 68 def pprintgen(o, bprefix: bool = False, indent: int = 0, level: int = 0): |
54 """Pretty print an object to a generator of atoms. | 69 """Pretty print an object to a generator of atoms. |
55 | 70 |
56 ``bprefix`` is a flag influencing whether bytestrings are preferred with | 71 ``bprefix`` is a flag influencing whether bytestrings are preferred with |
57 a ``b''`` prefix. | 72 a ``b''`` prefix. |
58 | 73 |
248 yield b']' | 263 yield b']' |
249 else: | 264 else: |
250 yield pycompat.byterepr(o) | 265 yield pycompat.byterepr(o) |
251 | 266 |
252 | 267 |
253 def prettyrepr(o): | 268 def prettyrepr(o) -> bytes: |
254 """Pretty print a representation of a possibly-nested object""" | 269 """Pretty print a representation of a possibly-nested object""" |
255 lines = [] | 270 lines = [] |
256 rs = pycompat.byterepr(o) | 271 rs = pycompat.byterepr(o) |
257 p0 = p1 = 0 | 272 p0 = p1 = 0 |
258 while p0 < len(rs): | 273 while p0 < len(rs): |
279 lines.append((l, rs[p0:q0].rstrip())) | 294 lines.append((l, rs[p0:q0].rstrip())) |
280 p0, p1 = q0, q1 | 295 p0, p1 = q0, q1 |
281 return b'\n'.join(b' ' * l + s for l, s in lines) | 296 return b'\n'.join(b' ' * l + s for l, s in lines) |
282 | 297 |
283 | 298 |
284 def buildrepr(r): | 299 def buildrepr(r) -> bytes: |
285 """Format an optional printable representation from unexpanded bits | 300 """Format an optional printable representation from unexpanded bits |
286 | 301 |
287 ======== ================================= | 302 ======== ================================= |
288 type(r) example | 303 type(r) example |
289 ======== ================================= | 304 ======== ================================= |
303 return r() | 318 return r() |
304 else: | 319 else: |
305 return pprint(r) | 320 return pprint(r) |
306 | 321 |
307 | 322 |
308 def binary(s): | 323 def binary(s: bytes) -> bool: |
309 """return true if a string is binary data""" | 324 """return true if a string is binary data""" |
310 return bool(s and b'\0' in s) | 325 return bool(s and b'\0' in s) |
311 | 326 |
312 | 327 |
313 def _splitpattern(pattern): | 328 def _splitpattern(pattern: bytes): |
314 if pattern.startswith(b're:'): | 329 if pattern.startswith(b're:'): |
315 return b're', pattern[3:] | 330 return b're', pattern[3:] |
316 elif pattern.startswith(b'literal:'): | 331 elif pattern.startswith(b'literal:'): |
317 return b'literal', pattern[8:] | 332 return b'literal', pattern[8:] |
318 return b'literal', pattern | 333 return b'literal', pattern |
319 | 334 |
320 | 335 |
321 def stringmatcher(pattern, casesensitive=True): | 336 def stringmatcher(pattern: bytes, casesensitive: bool = True): |
322 """ | 337 """ |
323 accepts a string, possibly starting with 're:' or 'literal:' prefix. | 338 accepts a string, possibly starting with 're:' or 'literal:' prefix. |
324 returns the matcher name, pattern, and matcher function. | 339 returns the matcher name, pattern, and matcher function. |
325 missing or unknown prefixes are treated as literal matches. | 340 missing or unknown prefixes are treated as literal matches. |
326 | 341 |
377 return kind, pattern, match | 392 return kind, pattern, match |
378 | 393 |
379 raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind) | 394 raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind) |
380 | 395 |
381 | 396 |
382 def substringregexp(pattern, flags=0): | 397 def substringregexp(pattern: bytes, flags: int = 0): |
383 """Build a regexp object from a string pattern possibly starting with | 398 """Build a regexp object from a string pattern possibly starting with |
384 're:' or 'literal:' prefix. | 399 're:' or 'literal:' prefix. |
385 | 400 |
386 helper for tests: | 401 helper for tests: |
387 >>> def test(pattern, *tests): | 402 >>> def test(pattern, *tests): |
429 return remod.compile(remod.escape(pattern), flags) | 444 return remod.compile(remod.escape(pattern), flags) |
430 | 445 |
431 raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind) | 446 raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind) |
432 | 447 |
433 | 448 |
434 def shortuser(user): | 449 def shortuser(user: bytes) -> bytes: |
435 """Return a short representation of a user name or email address.""" | 450 """Return a short representation of a user name or email address.""" |
436 f = user.find(b'@') | 451 f = user.find(b'@') |
437 if f >= 0: | 452 if f >= 0: |
438 user = user[:f] | 453 user = user[:f] |
439 f = user.find(b'<') | 454 f = user.find(b'<') |
446 if f >= 0: | 461 if f >= 0: |
447 user = user[:f] | 462 user = user[:f] |
448 return user | 463 return user |
449 | 464 |
450 | 465 |
451 def emailuser(user): | 466 def emailuser(user: bytes) -> bytes: |
452 """Return the user portion of an email address.""" | 467 """Return the user portion of an email address.""" |
453 f = user.find(b'@') | 468 f = user.find(b'@') |
454 if f >= 0: | 469 if f >= 0: |
455 user = user[:f] | 470 user = user[:f] |
456 f = user.find(b'<') | 471 f = user.find(b'<') |
457 if f >= 0: | 472 if f >= 0: |
458 user = user[f + 1 :] | 473 user = user[f + 1 :] |
459 return user | 474 return user |
460 | 475 |
461 | 476 |
462 def email(author): | 477 def email(author: bytes) -> bytes: |
463 '''get email of author.''' | 478 '''get email of author.''' |
464 r = author.find(b'>') | 479 r = author.find(b'>') |
465 if r == -1: | 480 if r == -1: |
466 r = None | 481 r = None |
467 return author[author.find(b'<') + 1 : r] | 482 return author[author.find(b'<') + 1 : r] |
468 | 483 |
469 | 484 |
470 def person(author): | 485 def person(author: bytes) -> bytes: |
471 """Returns the name before an email address, | 486 """Returns the name before an email address, |
472 interpreting it as per RFC 5322 | 487 interpreting it as per RFC 5322 |
473 | 488 |
474 >>> person(b'foo@bar') | 489 >>> person(b'foo@bar') |
475 'foo' | 490 'foo' |
610 ) | 625 ) |
611 | 626 |
612 return mailmap | 627 return mailmap |
613 | 628 |
614 | 629 |
615 def mapname(mailmap, author): | 630 def mapname(mailmap, author: bytes) -> bytes: |
616 """Returns the author field according to the mailmap cache, or | 631 """Returns the author field according to the mailmap cache, or |
617 the original author field. | 632 the original author field. |
618 | 633 |
619 >>> mmdata = b"\\n".join([ | 634 >>> mmdata = b"\\n".join([ |
620 ... b'# Comment', | 635 ... b'# Comment', |
661 | 676 |
662 | 677 |
663 _correctauthorformat = remod.compile(br'^[^<]+\s<[^<>]+@[^<>]+>$') | 678 _correctauthorformat = remod.compile(br'^[^<]+\s<[^<>]+@[^<>]+>$') |
664 | 679 |
665 | 680 |
666 def isauthorwellformed(author): | 681 def isauthorwellformed(author: bytes) -> bool: |
667 """Return True if the author field is well formed | 682 """Return True if the author field is well formed |
668 (ie "Contributor Name <contrib@email.dom>") | 683 (ie "Contributor Name <contrib@email.dom>") |
669 | 684 |
670 >>> isauthorwellformed(b'Good Author <good@author.com>') | 685 >>> isauthorwellformed(b'Good Author <good@author.com>') |
671 True | 686 True |
683 False | 698 False |
684 """ | 699 """ |
685 return _correctauthorformat.match(author) is not None | 700 return _correctauthorformat.match(author) is not None |
686 | 701 |
687 | 702 |
688 def firstline(text): | 703 def firstline(text: bytes) -> bytes: |
689 """Return the first line of the input""" | 704 """Return the first line of the input""" |
690 # Try to avoid running splitlines() on the whole string | 705 # Try to avoid running splitlines() on the whole string |
691 i = text.find(b'\n') | 706 i = text.find(b'\n') |
692 if i != -1: | 707 if i != -1: |
693 text = text[:i] | 708 text = text[:i] |
695 return text.splitlines()[0] | 710 return text.splitlines()[0] |
696 except IndexError: | 711 except IndexError: |
697 return b'' | 712 return b'' |
698 | 713 |
699 | 714 |
700 def ellipsis(text, maxlength=400): | 715 def ellipsis(text: bytes, maxlength: int = 400) -> bytes: |
701 """Trim string to at most maxlength (default: 400) columns in display.""" | 716 """Trim string to at most maxlength (default: 400) columns in display.""" |
702 return encoding.trim(text, maxlength, ellipsis=b'...') | 717 return encoding.trim(text, maxlength, ellipsis=b'...') |
703 | 718 |
704 | 719 |
705 def escapestr(s): | 720 def escapestr(s: bytes) -> bytes: |
721 # "bytes" is also a typing shortcut for bytes, bytearray, and memoryview | |
706 if isinstance(s, memoryview): | 722 if isinstance(s, memoryview): |
707 s = bytes(s) | 723 s = bytes(s) |
708 # call underlying function of s.encode('string_escape') directly for | 724 # call underlying function of s.encode('string_escape') directly for |
709 # Python 3 compatibility | 725 # Python 3 compatibility |
710 return codecs.escape_encode(s)[0] # pytype: disable=module-attr | 726 return codecs.escape_encode(s)[0] # pytype: disable=module-attr |
711 | 727 |
712 | 728 |
713 def unescapestr(s): | 729 def unescapestr(s: bytes) -> bytes: |
714 return codecs.escape_decode(s)[0] # pytype: disable=module-attr | 730 return codecs.escape_decode(s)[0] # pytype: disable=module-attr |
715 | 731 |
716 | 732 |
717 def forcebytestr(obj): | 733 def forcebytestr(obj): |
718 """Portably format an arbitrary object (e.g. exception) into a byte | 734 """Portably format an arbitrary object (e.g. exception) into a byte |
722 except UnicodeEncodeError: | 738 except UnicodeEncodeError: |
723 # non-ascii string, may be lossy | 739 # non-ascii string, may be lossy |
724 return pycompat.bytestr(encoding.strtolocal(str(obj))) | 740 return pycompat.bytestr(encoding.strtolocal(str(obj))) |
725 | 741 |
726 | 742 |
727 def uirepr(s): | 743 def uirepr(s: bytes) -> bytes: |
728 # Avoid double backslash in Windows path repr() | 744 # Avoid double backslash in Windows path repr() |
729 return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\') | 745 return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\') |
730 | 746 |
731 | 747 |
732 # delay import of textwrap | 748 # delay import of textwrap |
836 global _MBTextWrapper | 852 global _MBTextWrapper |
837 _MBTextWrapper = tw | 853 _MBTextWrapper = tw |
838 return tw(**kwargs) | 854 return tw(**kwargs) |
839 | 855 |
840 | 856 |
841 def wrap(line, width, initindent=b'', hangindent=b''): | 857 def wrap( |
858 line: bytes, width: int, initindent: bytes = b'', hangindent: bytes = b'' | |
859 ) -> bytes: | |
842 maxindent = max(len(hangindent), len(initindent)) | 860 maxindent = max(len(hangindent), len(initindent)) |
843 if width <= maxindent: | 861 if width <= maxindent: |
844 # adjust for weird terminal size | 862 # adjust for weird terminal size |
845 width = max(78, maxindent + 1) | 863 width = max(78, maxindent + 1) |
846 line = line.decode( | 864 line = line.decode( |
873 b'off': False, | 891 b'off': False, |
874 b'never': False, | 892 b'never': False, |
875 } | 893 } |
876 | 894 |
877 | 895 |
878 def parsebool(s): | 896 def parsebool(s: bytes) -> Optional[bool]: |
879 """Parse s into a boolean. | 897 """Parse s into a boolean. |
880 | 898 |
881 If s is not a valid boolean, returns None. | 899 If s is not a valid boolean, returns None. |
882 """ | 900 """ |
883 return _booleans.get(s.lower(), None) | 901 return _booleans.get(s.lower(), None) |
884 | 902 |
885 | 903 |
886 def parselist(value): | 904 # TODO: make arg mandatory (and fix code below?) |
905 def parselist(value: Optional[bytes]): | |
887 """parse a configuration value as a list of comma/space separated strings | 906 """parse a configuration value as a list of comma/space separated strings |
888 | 907 |
889 >>> parselist(b'this,is "a small" ,test') | 908 >>> parselist(b'this,is "a small" ,test') |
890 ['this', 'is', 'a small', 'test'] | 909 ['this', 'is', 'a small', 'test'] |
891 """ | 910 """ |
971 else: | 990 else: |
972 result = value | 991 result = value |
973 return result or [] | 992 return result or [] |
974 | 993 |
975 | 994 |
976 def evalpythonliteral(s): | 995 def evalpythonliteral(s: bytes): |
977 """Evaluate a string containing a Python literal expression""" | 996 """Evaluate a string containing a Python literal expression""" |
978 # We could backport our tokenizer hack to rewrite '' to u'' if we want | 997 # We could backport our tokenizer hack to rewrite '' to u'' if we want |
979 return ast.literal_eval(s.decode('latin1')) | 998 return ast.literal_eval(s.decode('latin1')) |