mercurial/utils/stringutil.py
changeset 37083 f99d64e8a4e4
parent 37082 1a1d1c44b570
child 37154 f8e1f48de118
equal deleted inserted replaced
37082:1a1d1c44b570 37083:f99d64e8a4e4
       
     1 # stringutil.py - utility for generic string formatting, parsing, etc.
       
     2 #
       
     3 #  Copyright 2005 K. Thananchayan <thananck@yahoo.com>
       
     4 #  Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
       
     5 #  Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
       
     6 #
       
     7 # This software may be used and distributed according to the terms of the
       
     8 # GNU General Public License version 2 or any later version.
       
     9 
       
    10 from __future__ import absolute_import
       
    11 
       
    12 import codecs
       
    13 import re as remod
       
    14 import textwrap
       
    15 
       
    16 from ..i18n import _
       
    17 
       
    18 from .. import (
       
    19     encoding,
       
    20     error,
       
    21     pycompat,
       
    22 )
       
    23 
       
    24 _DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)}
       
    25 _DATA_ESCAPE_MAP.update({
       
    26     b'\\': b'\\\\',
       
    27     b'\r': br'\r',
       
    28     b'\n': br'\n',
       
    29 })
       
    30 _DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')
       
    31 
       
    32 def escapedata(s):
       
    33     if isinstance(s, bytearray):
       
    34         s = bytes(s)
       
    35 
       
    36     return _DATA_ESCAPE_RE.sub(lambda m: _DATA_ESCAPE_MAP[m.group(0)], s)
       
    37 
       
    38 def binary(s):
       
    39     """return true if a string is binary data"""
       
    40     return bool(s and '\0' in s)
       
    41 
       
    42 def stringmatcher(pattern, casesensitive=True):
       
    43     """
       
    44     accepts a string, possibly starting with 're:' or 'literal:' prefix.
       
    45     returns the matcher name, pattern, and matcher function.
       
    46     missing or unknown prefixes are treated as literal matches.
       
    47 
       
    48     helper for tests:
       
    49     >>> def test(pattern, *tests):
       
    50     ...     kind, pattern, matcher = stringmatcher(pattern)
       
    51     ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
       
    52     >>> def itest(pattern, *tests):
       
    53     ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
       
    54     ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
       
    55 
       
    56     exact matching (no prefix):
       
    57     >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
       
    58     ('literal', 'abcdefg', [False, False, True])
       
    59 
       
    60     regex matching ('re:' prefix)
       
    61     >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
       
    62     ('re', 'a.+b', [False, False, True])
       
    63 
       
    64     force exact matches ('literal:' prefix)
       
    65     >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
       
    66     ('literal', 're:foobar', [False, True])
       
    67 
       
    68     unknown prefixes are ignored and treated as literals
       
    69     >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
       
    70     ('literal', 'foo:bar', [False, False, True])
       
    71 
       
    72     case insensitive regex matches
       
    73     >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
       
    74     ('re', 'A.+b', [False, False, True])
       
    75 
       
    76     case insensitive literal matches
       
    77     >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
       
    78     ('literal', 'ABCDEFG', [False, False, True])
       
    79     """
       
    80     if pattern.startswith('re:'):
       
    81         pattern = pattern[3:]
       
    82         try:
       
    83             flags = 0
       
    84             if not casesensitive:
       
    85                 flags = remod.I
       
    86             regex = remod.compile(pattern, flags)
       
    87         except remod.error as e:
       
    88             raise error.ParseError(_('invalid regular expression: %s')
       
    89                                    % e)
       
    90         return 're', pattern, regex.search
       
    91     elif pattern.startswith('literal:'):
       
    92         pattern = pattern[8:]
       
    93 
       
    94     match = pattern.__eq__
       
    95 
       
    96     if not casesensitive:
       
    97         ipat = encoding.lower(pattern)
       
    98         match = lambda s: ipat == encoding.lower(s)
       
    99     return 'literal', pattern, match
       
   100 
       
   101 def shortuser(user):
       
   102     """Return a short representation of a user name or email address."""
       
   103     f = user.find('@')
       
   104     if f >= 0:
       
   105         user = user[:f]
       
   106     f = user.find('<')
       
   107     if f >= 0:
       
   108         user = user[f + 1:]
       
   109     f = user.find(' ')
       
   110     if f >= 0:
       
   111         user = user[:f]
       
   112     f = user.find('.')
       
   113     if f >= 0:
       
   114         user = user[:f]
       
   115     return user
       
   116 
       
   117 def emailuser(user):
       
   118     """Return the user portion of an email address."""
       
   119     f = user.find('@')
       
   120     if f >= 0:
       
   121         user = user[:f]
       
   122     f = user.find('<')
       
   123     if f >= 0:
       
   124         user = user[f + 1:]
       
   125     return user
       
   126 
       
   127 def email(author):
       
   128     '''get email of author.'''
       
   129     r = author.find('>')
       
   130     if r == -1:
       
   131         r = None
       
   132     return author[author.find('<') + 1:r]
       
   133 
       
   134 def ellipsis(text, maxlength=400):
       
   135     """Trim string to at most maxlength (default: 400) columns in display."""
       
   136     return encoding.trim(text, maxlength, ellipsis='...')
       
   137 
       
   138 def escapestr(s):
       
   139     # call underlying function of s.encode('string_escape') directly for
       
   140     # Python 3 compatibility
       
   141     return codecs.escape_encode(s)[0]
       
   142 
       
   143 def unescapestr(s):
       
   144     return codecs.escape_decode(s)[0]
       
   145 
       
   146 def forcebytestr(obj):
       
   147     """Portably format an arbitrary object (e.g. exception) into a byte
       
   148     string."""
       
   149     try:
       
   150         return pycompat.bytestr(obj)
       
   151     except UnicodeEncodeError:
       
   152         # non-ascii string, may be lossy
       
   153         return pycompat.bytestr(encoding.strtolocal(str(obj)))
       
   154 
       
   155 def uirepr(s):
       
   156     # Avoid double backslash in Windows path repr()
       
   157     return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\')
       
   158 
       
   159 # delay import of textwrap
       
   160 def _MBTextWrapper(**kwargs):
       
   161     class tw(textwrap.TextWrapper):
       
   162         """
       
   163         Extend TextWrapper for width-awareness.
       
   164 
       
   165         Neither number of 'bytes' in any encoding nor 'characters' is
       
   166         appropriate to calculate terminal columns for specified string.
       
   167 
       
   168         Original TextWrapper implementation uses built-in 'len()' directly,
       
   169         so overriding is needed to use width information of each characters.
       
   170 
       
   171         In addition, characters classified into 'ambiguous' width are
       
   172         treated as wide in East Asian area, but as narrow in other.
       
   173 
       
   174         This requires use decision to determine width of such characters.
       
   175         """
       
   176         def _cutdown(self, ucstr, space_left):
       
   177             l = 0
       
   178             colwidth = encoding.ucolwidth
       
   179             for i in xrange(len(ucstr)):
       
   180                 l += colwidth(ucstr[i])
       
   181                 if space_left < l:
       
   182                     return (ucstr[:i], ucstr[i:])
       
   183             return ucstr, ''
       
   184 
       
   185         # overriding of base class
       
   186         def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
       
   187             space_left = max(width - cur_len, 1)
       
   188 
       
   189             if self.break_long_words:
       
   190                 cut, res = self._cutdown(reversed_chunks[-1], space_left)
       
   191                 cur_line.append(cut)
       
   192                 reversed_chunks[-1] = res
       
   193             elif not cur_line:
       
   194                 cur_line.append(reversed_chunks.pop())
       
   195 
       
   196         # this overriding code is imported from TextWrapper of Python 2.6
       
   197         # to calculate columns of string by 'encoding.ucolwidth()'
       
   198         def _wrap_chunks(self, chunks):
       
   199             colwidth = encoding.ucolwidth
       
   200 
       
   201             lines = []
       
   202             if self.width <= 0:
       
   203                 raise ValueError("invalid width %r (must be > 0)" % self.width)
       
   204 
       
   205             # Arrange in reverse order so items can be efficiently popped
       
   206             # from a stack of chucks.
       
   207             chunks.reverse()
       
   208 
       
   209             while chunks:
       
   210 
       
   211                 # Start the list of chunks that will make up the current line.
       
   212                 # cur_len is just the length of all the chunks in cur_line.
       
   213                 cur_line = []
       
   214                 cur_len = 0
       
   215 
       
   216                 # Figure out which static string will prefix this line.
       
   217                 if lines:
       
   218                     indent = self.subsequent_indent
       
   219                 else:
       
   220                     indent = self.initial_indent
       
   221 
       
   222                 # Maximum width for this line.
       
   223                 width = self.width - len(indent)
       
   224 
       
   225                 # First chunk on line is whitespace -- drop it, unless this
       
   226                 # is the very beginning of the text (i.e. no lines started yet).
       
   227                 if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
       
   228                     del chunks[-1]
       
   229 
       
   230                 while chunks:
       
   231                     l = colwidth(chunks[-1])
       
   232 
       
   233                     # Can at least squeeze this chunk onto the current line.
       
   234                     if cur_len + l <= width:
       
   235                         cur_line.append(chunks.pop())
       
   236                         cur_len += l
       
   237 
       
   238                     # Nope, this line is full.
       
   239                     else:
       
   240                         break
       
   241 
       
   242                 # The current line is full, and the next chunk is too big to
       
   243                 # fit on *any* line (not just this one).
       
   244                 if chunks and colwidth(chunks[-1]) > width:
       
   245                     self._handle_long_word(chunks, cur_line, cur_len, width)
       
   246 
       
   247                 # If the last chunk on this line is all whitespace, drop it.
       
   248                 if (self.drop_whitespace and
       
   249                     cur_line and cur_line[-1].strip() == r''):
       
   250                     del cur_line[-1]
       
   251 
       
   252                 # Convert current line back to a string and store it in list
       
   253                 # of all lines (return value).
       
   254                 if cur_line:
       
   255                     lines.append(indent + r''.join(cur_line))
       
   256 
       
   257             return lines
       
   258 
       
   259     global _MBTextWrapper
       
   260     _MBTextWrapper = tw
       
   261     return tw(**kwargs)
       
   262 
       
   263 def wrap(line, width, initindent='', hangindent=''):
       
   264     maxindent = max(len(hangindent), len(initindent))
       
   265     if width <= maxindent:
       
   266         # adjust for weird terminal size
       
   267         width = max(78, maxindent + 1)
       
   268     line = line.decode(pycompat.sysstr(encoding.encoding),
       
   269                        pycompat.sysstr(encoding.encodingmode))
       
   270     initindent = initindent.decode(pycompat.sysstr(encoding.encoding),
       
   271                                    pycompat.sysstr(encoding.encodingmode))
       
   272     hangindent = hangindent.decode(pycompat.sysstr(encoding.encoding),
       
   273                                    pycompat.sysstr(encoding.encodingmode))
       
   274     wrapper = _MBTextWrapper(width=width,
       
   275                              initial_indent=initindent,
       
   276                              subsequent_indent=hangindent)
       
   277     return wrapper.fill(line).encode(pycompat.sysstr(encoding.encoding))
       
   278 
       
   279 _booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True,
       
   280              '0': False, 'no': False, 'false': False, 'off': False,
       
   281              'never': False}
       
   282 
       
   283 def parsebool(s):
       
   284     """Parse s into a boolean.
       
   285 
       
   286     If s is not a valid boolean, returns None.
       
   287     """
       
   288     return _booleans.get(s.lower(), None)