contrib/byteify-strings.py
changeset 38389 1d68fd5f614a
parent 38388 f701bc936e7f
child 38390 47dd23e6b116
equal deleted inserted replaced
38388:f701bc936e7f 38389:1d68fd5f614a
    25         Returns a generator of possibly rewritten tokens.
    25         Returns a generator of possibly rewritten tokens.
    26 
    26 
    27         The input token list may be mutated as part of processing. However,
    27         The input token list may be mutated as part of processing. However,
    28         its changes do not necessarily match the output token stream.
    28         its changes do not necessarily match the output token stream.
    29         """
    29         """
       
    30         sysstrtokens = set()
       
    31 
    30         # The following utility functions access the tokens list and i index of
    32         # The following utility functions access the tokens list and i index of
    31         # the ``for i, t in enumerate(tokens)`` loop below
    33         # the ``for i, t in enumerate(tokens)`` loop below
    32         def _isop(j, *o):
    34         def _isop(j, *o):
    33             """Assert that tokens[j] is an OP with one of the given values"""
    35             """Assert that tokens[j] is an OP with one of the given values"""
    34             try:
    36             try:
    60                 elif _isop(j, ',') and nested == 0:
    62                 elif _isop(j, ',') and nested == 0:
    61                     n -= 1
    63                     n -= 1
    62 
    64 
    63             return None
    65             return None
    64 
    66 
    65         def _ensureunicode(j):
    67         def _ensuresysstr(j):
    66             """Make sure the token at j is a unicode string
    68             """Make sure the token at j is a system string
    67 
    69 
    68             This rewrites a string token to include the unicode literal prefix
    70             Remember the given token so the string transformer won't add
    69             so the string transformer won't add the byte prefix.
    71             the byte prefix.
    70 
    72 
    71             Ignores tokens that are not strings. Assumes bounds checking has
    73             Ignores tokens that are not strings. Assumes bounds checking has
    72             already been done.
    74             already been done.
    73 
    75 
    74             """
    76             """
    75             st = tokens[j]
    77             st = tokens[j]
    76             if st.type == token.STRING and st.string.startswith(("'", '"')):
    78             if st.type == token.STRING and st.string.startswith(("'", '"')):
    77                 tokens[j] = st._replace(string='u%s' % st.string)
    79                 sysstrtokens.add(st)
    78 
    80 
    79         for i, t in enumerate(tokens):
    81         for i, t in enumerate(tokens):
    80             # Convert most string literals to byte literals. String literals
    82             # Convert most string literals to byte literals. String literals
    81             # in Python 2 are bytes. String literals in Python 3 are unicode.
    83             # in Python 2 are bytes. String literals in Python 3 are unicode.
    82             # Most strings in Mercurial are bytes and unicode strings are rare.
    84             # Most strings in Mercurial are bytes and unicode strings are rare.
    83             # Rather than rewrite all string literals to use ``b''`` to indicate
    85             # Rather than rewrite all string literals to use ``b''`` to indicate
    84             # byte strings, we apply this token transformer to insert the ``b``
    86             # byte strings, we apply this token transformer to insert the ``b``
    85             # prefix nearly everywhere.
    87             # prefix nearly everywhere.
    86             if t.type == token.STRING:
    88             if t.type == token.STRING and t not in sysstrtokens:
    87                 s = t.string
    89                 s = t.string
    88 
    90 
    89                 # Preserve docstrings as string literals. This is inconsistent
    91                 # Preserve docstrings as string literals. This is inconsistent
    90                 # with regular unprefixed strings. However, the
    92                 # with regular unprefixed strings. However, the
    91                 # "from __future__" parsing (which allows a module docstring to
    93                 # "from __future__" parsing (which allows a module docstring to
   115                 # *attr() builtins don't accept byte strings to 2nd argument.
   117                 # *attr() builtins don't accept byte strings to 2nd argument.
   116                 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
   118                 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
   117                         not _isop(i - 1, '.')):
   119                         not _isop(i - 1, '.')):
   118                     arg1idx = _findargnofcall(1)
   120                     arg1idx = _findargnofcall(1)
   119                     if arg1idx is not None:
   121                     if arg1idx is not None:
   120                         _ensureunicode(arg1idx)
   122                         _ensuresysstr(arg1idx)
   121 
   123 
   122                 # .encode() and .decode() on str/bytes/unicode don't accept
   124                 # .encode() and .decode() on str/bytes/unicode don't accept
   123                 # byte strings on Python 3.
   125                 # byte strings on Python 3.
   124                 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
   126                 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
   125                     for argn in range(2):
   127                     for argn in range(2):
   126                         argidx = _findargnofcall(argn)
   128                         argidx = _findargnofcall(argn)
   127                         if argidx is not None:
   129                         if argidx is not None:
   128                             _ensureunicode(argidx)
   130                             _ensuresysstr(argidx)
   129 
   131 
   130                 # Rewrite iteritems/itervalues to items/values, as the iter*
   132                 # Rewrite iteritems/itervalues to items/values, as the iter*
   131                 # dict methods are not present in Python 3.
   133                 # dict methods are not present in Python 3.
   132                 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
   134                 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
   133                     yield t._replace(string=fn[4:])
   135                     yield t._replace(string=fn[4:])