Mercurial > public > mercurial-scm > hg
diff mercurial/utils/stringutil.py @ 37083:f99d64e8a4e4
stringutil: move generic string helpers to new module
Per https://phab.mercurial-scm.org/D2903#46738
URL and file paths functions are left since they are big enough to make
separate modules.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Thu, 22 Mar 2018 21:19:31 +0900 |
parents | mercurial/util.py@1a1d1c44b570 |
children | f8e1f48de118 |
line wrap: on
line diff
# stringutil.py - utility for generic string formatting, parsing, etc.
#
#  Copyright 2005 K. Thananchayan <thananck@yahoo.com>
#  Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#  Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import codecs
import re as remod
import textwrap

from ..i18n import _

from .. import (
    encoding,
    error,
    pycompat,
)

# Every byte maps to its r'\xNN' form by default; backslash, CR and LF get
# the conventional short escapes instead.
_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)}
_DATA_ESCAPE_MAP.update({
    b'\\': b'\\\\',
    b'\r': br'\r',
    b'\n': br'\n',
})
# Bytes that get escaped: 0x00-0x08, 0x0a-0x1f, backslash and 0x7f-0xff.
# Tab (0x09) and the remaining printable ASCII bytes are left alone.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')

def escapedata(s):
    """Return s with control, backslash and non-ASCII bytes escaped.

    A bytearray argument is converted to bytes first. Each byte matched by
    _DATA_ESCAPE_RE is replaced by its entry in _DATA_ESCAPE_MAP.
    """
    if isinstance(s, bytearray):
        s = bytes(s)

    return _DATA_ESCAPE_RE.sub(lambda m: _DATA_ESCAPE_MAP[m.group(0)], s)

def binary(s):
    """return true if a string is binary data"""
    return bool(s and '\0' in s)

def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        flags = 0
        if not casesensitive:
            flags = remod.I
        # only the compilation itself can raise remod.error
        try:
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        return 're', pattern, regex.search
    elif pattern.startswith('literal:'):
        pattern = pattern[8:]

    match = pattern.__eq__

    if not casesensitive:
        # compare lower-cased forms; the returned pattern keeps its case
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match

def shortuser(user):
    """Return a short representation of a user name or email address."""
    # Successively strip: '@domain', anything up to '<', anything after the
    # first space, and anything after the first '.'.
    f = user.find('@')
    if f >= 0:
        user = user[:f]
    f = user.find('<')
    if f >= 0:
        user = user[f + 1:]
    f = user.find(' ')
    if f >= 0:
        user = user[:f]
    f = user.find('.')
    if f >= 0:
        user = user[:f]
    return user

def emailuser(user):
    """Return the user portion of an email address."""
    f = user.find('@')
    if f >= 0:
        user = user[:f]
    f = user.find('<')
    if f >= 0:
        user = user[f + 1:]
    return user

def email(author):
    '''get email of author.'''
    # Slice between '<' and '>'. A missing '>' means "to the end"; a missing
    # '<' makes find() return -1, so the slice starts at index 0.
    r = author.find('>')
    if r == -1:
        r = None
    return author[author.find('<') + 1:r]

def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display."""
    return encoding.trim(text, maxlength, ellipsis='...')

def escapestr(s):
    """Return s with the 'string_escape' encoding applied."""
    # call underlying function of s.encode('string_escape') directly for
    # Python 3 compatibility
    return codecs.escape_encode(s)[0]

def unescapestr(s):
    """Return s decoded with codecs.escape_decode (see escapestr())."""
    return codecs.escape_decode(s)[0]

def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string."""
    try:
        return pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        return pycompat.bytestr(encoding.strtolocal(str(obj)))

def uirepr(s):
    """Return the byte repr() of s with doubled backslashes collapsed."""
    # Avoid double backslash in Windows path repr()
    return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\')

def _MBTextWrapper(**kwargs):
    """Return a width-aware TextWrapper built from kwargs.

    The subclass is defined on the first call only: this function then
    rebinds the module-level name ``_MBTextWrapper`` to the class itself, so
    subsequent calls instantiate the class directly.
    """
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires a user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            """Split ucstr where its column width would exceed space_left.

            Returns a (head, tail) pair; head holds the leading characters
            whose accumulated width fits in space_left.
            """
            used = 0
            colwidth = encoding.ucolwidth
            for i, ch in enumerate(ucstr):
                used += colwidth(ch)
                if space_left < used:
                    return (ucstr[:i], ucstr[i:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                # put as much of the long word as fits on this line and
                # leave the remainder on the stack for the next one
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)

def wrap(line, width, initindent='', hangindent=''):
    """Wrap line to width display columns and return the encoded result.

    initindent prefixes the first output line and hangindent the following
    ones. All three strings are decoded using encoding.encoding and
    encoding.encodingmode, wrapped by _MBTextWrapper, and the filled text is
    encoded back with encoding.encoding.
    """
    maxindent = max(len(hangindent), len(initindent))
    if width <= maxindent:
        # adjust for weird terminal size
        width = max(78, maxindent + 1)
    # look the codec name and error mode up once instead of per call
    enc = pycompat.sysstr(encoding.encoding)
    mode = pycompat.sysstr(encoding.encodingmode)
    line = line.decode(enc, mode)
    initindent = initindent.decode(enc, mode)
    hangindent = hangindent.decode(enc, mode)
    wrapper = _MBTextWrapper(width=width,
                             initial_indent=initindent,
                             subsequent_indent=hangindent)
    return wrapper.fill(line).encode(enc)

_booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True,
             '0': False, 'no': False, 'false': False, 'off': False,
             'never': False}

def parsebool(s):
    """Parse s into a boolean.

    If s is not a valid boolean, returns None.
    """
    return _booleans.get(s.lower())