view mercurial/utils/stringutil.py @ 37212:2a2ce93e12f4

templatefuncs: add mailmap template function This commit adds a template function to support the .mailmap file in Mercurial repositories. The .mailmap file comes from git, and can be used to map new emails and names for old commits. The general use case is that someone may change their name or author commits under different emails and aliases, which would make these commits appear as though they came from different persons. The file allows you to specify the correct name that should be used in place of the author field specified in the commit. The mailmap file has 4 possible formats used to map old "commit" names to new "proper" names: 1. <proper@email.com> <commit@email.com> 2. Proper Name <commit@email.com> 3. Proper Name <proper@email.com> <commit@email.com> 4. Proper Name <proper@email.com> Commit Name <commit@email.com> Essentially there is a commit email present in each mailmap entry, that maps to either an updated name, email, or both. The final possible format allows commits authored by a person who used both an old name and an old email to map to a new name and email. To parse the file, we split by spaces and build a name out of every element that does not start with "<". Once we find an element that does start with "<" we concatenate all the name elements that preceded and add that as a parsed name. We then add the email as the first parsed email. We repeat the process until the end of the line, or a comment is found. We will be left with all parsed names in a list, and all parsed emails in a list, with the 0 index being the proper values and the 1 index being the commit values (if they were specified in the entry). The commit values are added as the keys to a dict, and with the proper fields as the values. The mapname function takes the mapping object and the commit author field and attempts to look for a corresponding entry. To do so we try (commit name, commit email) first, and if no results are returned then (None, commit email) is also looked up. 
This is due to format 4 from above: someone may have a mailmap entry keyed on both the commit name and the commit email; if they don't, they may still have an entry keyed on only the commit email. Differential Revision: https://phab.mercurial-scm.org/D2904
author Connor Sheehan <sheehan@mozilla.com>
date Mon, 19 Mar 2018 11:16:21 -0400
parents fb7140f1d09d
children 54b896f195d1
line wrap: on
line source

# stringutil.py - utility for generic string formatting, parsing, etc.
#
#  Copyright 2005 K. Thananchayan <thananck@yahoo.com>
#  Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#  Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import codecs
import re as remod
import textwrap

from ..i18n import _
from ..thirdparty import attr

from .. import (
    encoding,
    error,
    pycompat,
)

# Replacement table used by escapedata(). Each of the 256 keys is
# pycompat.bytechr(i) (presumably the one-byte string for value i -- confirm
# against pycompat) and initially maps to the literal four-character text
# ``\xNN`` (backslash, 'x', two lowercase hex digits).
_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)}
# Backslash, carriage return and newline get their conventional two-character
# escapes instead of the generic hex form. This update must run after the
# table above has been built, since it overrides three of its entries.
_DATA_ESCAPE_MAP.update({
    b'\\': b'\\\\',
    b'\r': br'\r',
    b'\n': br'\n',
})
# Matches the bytes that escapedata() rewrites: 0x00-0x08, 0x0a-0x1f, the
# backslash itself, and 0x7f-0xff. Tab (0x09) and the remaining printable
# ASCII bytes are not matched and therefore pass through unchanged.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')

def escapedata(s):
    """Return ``s`` with every byte matched by ``_DATA_ESCAPE_RE`` replaced
    by its entry in ``_DATA_ESCAPE_MAP``.

    A ``bytearray`` argument is converted to ``bytes`` before substitution.
    """
    data = bytes(s) if isinstance(s, bytearray) else s

    def _replacement(found):
        # Look up the escape text for the single matched byte.
        return _DATA_ESCAPE_MAP[found.group(0)]

    return _DATA_ESCAPE_RE.sub(_replacement, data)

def binary(s):
    """Return True if ``s`` is non-empty and contains a NUL character."""
    if not s:
        # None and empty strings are never considered binary.
        return False
    return '\0' in s

def stringmatcher(pattern, casesensitive=True):
    """
    Build a matcher from a pattern with an optional 're:' or 'literal:'
    prefix.

    Returns a ``(kind, pattern, matcher)`` tuple: ``kind`` is 're' or
    'literal', ``pattern`` is the input with a recognized prefix removed,
    and ``matcher`` is a callable taking the string to test. Patterns with
    no prefix, or with an unrecognized one, are matched literally.

    Raises error.ParseError when a 're:' pattern fails to compile.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        regexsource = pattern[3:]
        reflags = 0 if casesensitive else remod.I
        try:
            compiled = remod.compile(regexsource, reflags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        # A regex matches anywhere in the tested string (search, not match).
        return 're', regexsource, compiled.search

    if pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        # Plain equality against the (prefix-stripped) pattern.
        return 'literal', pattern, pattern.__eq__

    # Case-insensitive: compare the lowered forms of both sides.
    ipat = encoding.lower(pattern)
    return 'literal', pattern, lambda s: ipat == encoding.lower(s)

def shortuser(user):
    """Return a short representation of a user name or email address.

    The result is what remains after dropping the domain (from the first
    '@'), any leading text up to and including the first '<', and then
    everything from the first space and the first '.' onwards.
    """
    # Drop the '@domain' part, if any.
    user = user.partition('@')[0]
    # Drop a leading 'Name <' part, if any.
    if '<' in user:
        user = user.partition('<')[2]
    # Keep the first word only, then cut that word at its first dot.
    for separator in (' ', '.'):
        user = user.partition(separator)[0]
    return user

def emailuser(user):
    """Return the user portion of an email address.

    That is the text before the first '@', with anything up to and
    including the first '<' removed.
    """
    localpart = user.partition('@')[0]
    if '<' in localpart:
        localpart = localpart.partition('<')[2]
    return localpart

def email(author):
    '''Get the email of author.

    Returns the text after the first '<' (or from the start when there is
    none) up to the first '>' (or to the end when there is none).
    '''
    start = author.find('<') + 1  # 0 when no '<' is present
    end = author.find('>')
    if end < 0:
        return author[start:]
    return author[start:end]

def person(author):
    """Return the name part of an author string, read as per RFC 5322.

    Strings without an '@' are returned untouched. For 'Name <email>'
    the text before '<' is returned, trimmed of surrounding spaces and
    double quotes and with escaped quotes unescaped. For a bare address
    the part before '@' is returned with dots turned into spaces.

    >>> person(b'foo@bar')
    'foo'
    >>> person(b'Foo Bar <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo Bar" <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo \"buz\" Bar" <foo@bar>')
    'Foo "buz" Bar'
    >>> # The following are invalid, but do exist in real-life
    ...
    >>> person(b'Foo "buz" Bar <foo@bar>')
    'Foo "buz" Bar'
    >>> person(b'"Foo Bar <foo@bar>')
    'Foo Bar'
    """
    if '@' not in author:
        return author

    bracket = author.find('<')
    if bracket == -1:
        # Bare address: derive a name from the local part.
        localpart = author[:author.find('@')]
        return localpart.replace('.', ' ')

    displayname = author[:bracket].strip(' "')
    return displayname.replace('\\"', '"')

@attr.s(hash=True)
class mailmapping(object):
    '''Represents a username/email key or value in
    a mailmap file

    Instances are used both as dictionary keys (the identity recorded in a
    commit) and as dictionary values (the identity to substitute) by
    parsemailmap(), which is why the class is declared hashable.
    '''
    # Email address. Declared first and without a default, so it is the
    # first positional constructor argument (mapname() relies on this when
    # it calls mailmapping(None, None)).
    email = attr.ib()
    # Person name; None when the mailmap entry carries no name for this side.
    name = attr.ib(default=None)

def parsemailmap(mailmapcontent):
    """Parses data in the .mailmap format

    Returns a dict that maps the identity found in commits (the key) to
    the identity that should be shown instead (the value). Keys and values
    are both ``mailmapping`` instances. ``None`` input yields an empty dict.

    Each line is split on whitespace. Words are collected into a name until
    a ``<email>`` token is met; at most two emails are read per line. The
    first name/email pair is the proper identity, the last email (and the
    second name, when there is one) is the commit identity.

    >>> mmdata = b"\\n".join([
    ... b'# Comment',
    ... b'Name <commit1@email.xx>',
    ... b'<name@email.xx> <commit2@email.xx>',
    ... b'Name <proper@email.xx> <commit3@email.xx>',
    ... b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> mm = parsemailmap(mmdata)
    >>> for key in sorted(mm.keys()):
    ...     print(key)
    mailmapping(email='commit1@email.xx', name=None)
    mailmapping(email='commit2@email.xx', name=None)
    mailmapping(email='commit3@email.xx', name=None)
    mailmapping(email='commit4@email.xx', name='Commit')
    >>> for val in sorted(mm.values()):
    ...     print(val)
    mailmapping(email='commit1@email.xx', name='Name')
    mailmapping(email='name@email.xx', name=None)
    mailmapping(email='proper@email.xx', name='Name')
    mailmapping(email='proper@email.xx', name='Name')

    Lines from which no ``<email>`` token can be parsed are skipped rather
    than aborting the whole parse:

    >>> parsemailmap(b'Name < spaced@email.xx >')
    {}
    """
    mailmap = {}

    if mailmapcontent is None:
        return mailmap

    for line in mailmapcontent.splitlines():

        # Don't bother checking the line if it is a comment or
        # is an improperly formed author field
        if line.lstrip().startswith('#') or any(c not in line for c in '<>@'):
            continue

        # names, emails hold the parsed names and emails for each line
        # namebuilder holds the words in a persons name
        names, emails = [], []
        namebuilder = []

        for element in line.split():
            if element.startswith('#'):
                # If we reach a comment in the mailmap file, move on
                break

            elif element.startswith('<') and element.endswith('>'):
                # We have found an email.
                # Parse it, and finalize any names from earlier
                emails.append(element[1:-1])  # Slice off the "<>"

                if namebuilder:
                    names.append(' '.join(namebuilder))
                    namebuilder = []

                # Break if we have found a second email, any other
                # data does not fit the spec for .mailmap
                if len(emails) > 1:
                    break

            else:
                # We have found another word in the committers name
                namebuilder.append(element)

        # The cheap '<>@' pre-check above does not guarantee that a
        # well-formed "<email>" token exists (e.g. "Name < a@b >" or an
        # email that only appears after a trailing "#"). Without an email
        # there is nothing to key the entry on, so skip the line.
        if not emails:
            continue

        # The commit identity is the last email on the line; it only has a
        # name of its own when two names were given (format 4).
        mailmapkey = mailmapping(
            email=emails[-1],
            name=names[-1] if len(names) == 2 else None,
        )

        mailmap[mailmapkey] = mailmapping(
            email=emails[0],
            name=names[0] if names else None,
        )

    return mailmap

def mapname(mailmap, author):
    """Return ``author`` rewritten according to ``mailmap``, or the
    original author field when it is malformed, the mailmap is empty, or
    no entry applies.

    The lookup first tries the (commit name, commit email) pair and then
    falls back to the commit email alone. Whichever of name/email the
    matching entry leaves unset is taken from the original author field.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> m = parsemailmap(mmdata)
    >>> mapname(m, b'Commit <commit1@email.xx>')
    'Name <commit1@email.xx>'
    >>> mapname(m, b'Name <commit2@email.xx>')
    'Name <name@email.xx>'
    >>> mapname(m, b'Commit <commit3@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Commit <commit4@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Unknown Name <unknown@email.com>')
    'Unknown Name <unknown@email.com>'
    """
    # Nothing to do for a malformed author field or an empty mailmap.
    if not isauthorwellformed(author) or not mailmap:
        return author

    # The identity exactly as recorded in the commit.
    commit = mailmapping(name=person(author), email=email(author))

    try:
        # Most specific entry first: keyed on both name and email.
        proper = mailmap[commit]
    except KeyError:
        # Fall back to an entry keyed on the email only. An all-None
        # placeholder makes the formatting below reuse the commit values.
        emailonly = mailmapping(email=commit.email)
        proper = mailmap.get(emailonly, mailmapping(None, None))

    # Prefer the mapped values, keeping the commit's own where unmapped.
    propername = proper.name or commit.name
    properemail = proper.email or commit.email
    return '%s <%s>' % (propername, properemail)

# "<one or more non-'<' characters><whitespace><" + "local@domain" + ">",
# where neither side of the '@' may contain '<' or '>'.
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')

def isauthorwellformed(author):
    '''Tell whether the author field has the well formed shape
    "Contributor Name <contrib@email.dom>".

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    '''
    # A match object is always truthy; a failed match yields None.
    found = _correctauthorformat.match(author)
    return bool(found)

def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display.

    This is a thin wrapper that forwards ``text`` and ``maxlength`` to
    ``encoding.trim`` with ``'...'`` as its ``ellipsis`` argument
    (presumably the marker that stands in for the trimmed-off part --
    confirm against ``encoding.trim``).
    """
    return encoding.trim(text, maxlength, ellipsis='...')

def escapestr(s):
    """Return ``s`` with special characters backslash-escaped."""
    # Use the function underlying s.encode('string_escape') directly so
    # the same code also works on Python 3. It returns a pair whose first
    # item is the escaped string.
    escaped, _consumed = codecs.escape_encode(s)
    return escaped

def unescapestr(s):
    """Return ``s`` with backslash escapes decoded (inverse of escapestr)."""
    # escape_decode returns a pair whose first item is the decoded string.
    decoded, _consumed = codecs.escape_decode(s)
    return decoded

def forcebytestr(obj):
    """Portably turn an arbitrary object (e.g. an exception) into a byte
    string.

    ``pycompat.bytestr`` is tried first. If that raises
    UnicodeEncodeError, the object's ``str()`` form is passed through
    ``encoding.strtolocal`` before conversion.
    """
    try:
        converted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        localized = encoding.strtolocal(str(obj))
        converted = pycompat.bytestr(localized)
    return converted

def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed."""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')

# Delay building the width-aware TextWrapper subclass until first use
# (textwrap itself is imported at the top of the file).
def _MBTextWrapper(**kwargs):
    """Return a display-width-aware ``textwrap.TextWrapper`` instance.

    ``kwargs`` are passed through to the wrapper's constructor.

    The subclass is defined on the first call only: this function then
    rebinds the module-level name ``_MBTextWrapper`` to the class itself,
    so every later call instantiates the class directly.
    """
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            """Split ``ucstr`` into ``(head, tail)``.

            ``head`` is the longest prefix whose summed
            ``encoding.ucolwidth`` does not exceed ``space_left``. It is
            empty when even the first character does not fit.
            """
            used = 0
            colwidth = encoding.ucolwidth
            for i, char in enumerate(ucstr):
                used += colwidth(char)
                if space_left < used:
                    return (ucstr[:i], ucstr[i:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                if not cut and not cur_line:
                    # Not even the first character fits (e.g. a
                    # double-width character when only one column is
                    # available) and the line is still empty. Leaving the
                    # chunk untouched would make _wrap_chunks() spin
                    # forever on it, so force one character onto the line,
                    # as the stdlib implementation always does.
                    cut, res = res[:1], res[1:]
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                # Long words may not be broken: put the whole word on a
                # line of its own.
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    # Replace this factory with the class so that the class body above is
    # only ever evaluated once.
    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)

def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` display columns and return the result.

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones. All three strings are decoded with
    ``encoding.encoding`` / ``encoding.encodingmode`` before wrapping and
    the filled text is encoded back with ``encoding.encoding``.
    """
    maxindent = max(len(hangindent), len(initindent))
    if width <= maxindent:
        # adjust for weird terminal size
        width = max(78, maxindent + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errormode = pycompat.sysstr(encoding.encodingmode)
    uline, uinitindent, uhangindent = [
        text.decode(codec, errormode)
        for text in (line, initindent, hangindent)
    ]

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=uinitindent,
                             subsequent_indent=uhangindent)
    filled = wrapper.fill(uline)
    return filled.encode(codec)

# Spellings accepted by parsebool(), keyed by their lowercase form.
_booleans = {
    '1': True,
    'yes': True,
    'true': True,
    'on': True,
    'always': True,
    '0': False,
    'no': False,
    'false': False,
    'off': False,
    'never': False,
}

def parsebool(s):
    """Parse s into a boolean, ignoring case.

    Returns None when s is not one of the recognized boolean spellings.
    """
    lowered = s.lower()
    return _booleans.get(lowered)