contrib/byteify-strings.py
changeset 38391 f77bbd34a1df
parent 38390 47dd23e6b116
child 39103 da130c5cef90
diff -r 47dd23e6b116 -r f77bbd34a1df contrib/byteify-strings.py
--- a/contrib/byteify-strings.py
+++ b/contrib/byteify-strings.py
@@ -21,159 +21,158 @@
 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))
 
-if True:
-    def replacetokens(tokens, opts):
-        """Transform a stream of tokens from raw to Python 3.
-
-        Returns a generator of possibly rewritten tokens.
-
-        The input token list may be mutated as part of processing. However,
-        its changes do not necessarily match the output token stream.
-        """
-        sysstrtokens = set()
-
-        # The following utility functions access the tokens list and i index of
-        # the for i, t enumerate(tokens) loop below
-        def _isop(j, *o):
-            """Assert that tokens[j] is an OP with one of the given values"""
-            try:
-                return tokens[j].type == token.OP and tokens[j].string in o
-            except IndexError:
-                return False
-
-        def _findargnofcall(n):
-            """Find arg n of a call expression (start at 0)
-
-            Returns index of the first token of that argument, or None if
-            there is not that many arguments.
-
-            Assumes that token[i + 1] is '('.
-
-            """
-            nested = 0
-            for j in range(i + 2, len(tokens)):
-                if _isop(j, ')', ']', '}'):
-                    # end of call, tuple, subscription or dict / set
-                    nested -= 1
-                    if nested < 0:
-                        return None
-                elif n == 0:
-                    # this is the starting position of arg
-                    return j
-                elif _isop(j, '(', '[', '{'):
-                    nested += 1
-                elif _isop(j, ',') and nested == 0:
-                    n -= 1
-
-            return None
-
-        def _ensuresysstr(j):
-            """Make sure the token at j is a system string
-
-            Remember the given token so the string transformer won't add
-            the byte prefix.
-
-            Ignores tokens that are not strings. Assumes bounds checking has
-            already been done.
-
-            """
-            st = tokens[j]
-            if st.type == token.STRING and st.string.startswith(("'", '"')):
-                sysstrtokens.add(st)
-
-        coldelta = 0  # column increment for new opening parens
-        coloffset = -1  # column offset for the current line (-1: TBD)
-        parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
-        for i, t in enumerate(tokens):
-            # Compute the column offset for the current line, such that
-            # the current line will be aligned to the last opening paren
-            # as before.
-            if coloffset < 0:
-                if t.start[1] == parens[-1][1]:
-                    coloffset = parens[-1][2]
-                elif t.start[1] + 1 == parens[-1][1]:
-                    # fix misaligned indent of s/util.Abort/error.Abort/
-                    coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
-                else:
-                    coloffset = 0
-
-            # Reset per-line attributes at EOL.
-            if t.type in (token.NEWLINE, tokenize.NL):
-                yield adjusttokenpos(t, coloffset)
-                coldelta = 0
-                coloffset = -1
-                continue
-
-            # Remember the last paren position.
-            if _isop(i, '(', '[', '{'):
-                parens.append(t.end + (coloffset + coldelta,))
-            elif _isop(i, ')', ']', '}'):
-                parens.pop()
-
-            # Convert most string literals to byte literals. String literals
-            # in Python 2 are bytes. String literals in Python 3 are unicode.
-            # Most strings in Mercurial are bytes and unicode strings are rare.
-            # Rather than rewrite all string literals to use ``b''`` to indicate
-            # byte strings, we apply this token transformer to insert the ``b``
-            # prefix nearly everywhere.
-            if t.type == token.STRING and t not in sysstrtokens:
-                s = t.string
-
-                # Preserve docstrings as string literals. This is inconsistent
-                # with regular unprefixed strings. However, the
-                # "from __future__" parsing (which allows a module docstring to
-                # exist before it) doesn't properly handle the docstring if it
-                # is b''' prefixed, leading to a SyntaxError. We leave all
-                # docstrings as unprefixed to avoid this. This means Mercurial
-                # components touching docstrings need to handle unicode,
-                # unfortunately.
-                if s[0:3] in ("'''", '"""'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # If the first character isn't a quote, it is likely a string
-                # prefixing character (such as 'b', 'u', or 'r'. Ignore.
-                if s[0] not in ("'", '"'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # String literal. Prefix to make a b'' string.
-                yield adjusttokenpos(t._replace(string='b%s' % t.string),
-                                     coloffset)
-                coldelta += 1
-                continue
-
-            # This looks like a function call.
-            if t.type == token.NAME and _isop(i + 1, '('):
-                fn = t.string
-
-                # *attr() builtins don't accept byte strings to 2nd argument.
-                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
-                        not _isop(i - 1, '.')):
-                    arg1idx = _findargnofcall(1)
-                    if arg1idx is not None:
-                        _ensuresysstr(arg1idx)
-
-                # .encode() and .decode() on str/bytes/unicode don't accept
-                # byte strings on Python 3.
-                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
-                    for argn in range(2):
-                        argidx = _findargnofcall(argn)
-                        if argidx is not None:
-                            _ensuresysstr(argidx)
-
-                # It changes iteritems/values to items/values as they are not
-                # present in Python 3 world.
-                elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
-                    yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
-                    continue
-
-            # Emit unmodified token.
-            yield adjusttokenpos(t, coloffset)
+def replacetokens(tokens, opts):
+    """Transform a stream of tokens from raw to Python 3.
+
+    Returns a generator of possibly rewritten tokens.
+
+    The input token list may be mutated as part of processing. However,
+    its changes do not necessarily match the output token stream.
+    """
+    sysstrtokens = set()
+
+    # The following utility functions access the tokens list and i index of
+    # the for i, t enumerate(tokens) loop below
+    def _isop(j, *o):
+        """Assert that tokens[j] is an OP with one of the given values"""
+        try:
+            return tokens[j].type == token.OP and tokens[j].string in o
+        except IndexError:
+            return False
+
+    def _findargnofcall(n):
+        """Find arg n of a call expression (start at 0)
+
+        Returns index of the first token of that argument, or None if
+        there is not that many arguments.
+
+        Assumes that token[i + 1] is '('.
+
+        """
+        nested = 0
+        for j in range(i + 2, len(tokens)):
+            if _isop(j, ')', ']', '}'):
+                # end of call, tuple, subscription or dict / set
+                nested -= 1
+                if nested < 0:
+                    return None
+            elif n == 0:
+                # this is the starting position of arg
+                return j
+            elif _isop(j, '(', '[', '{'):
+                nested += 1
+            elif _isop(j, ',') and nested == 0:
+                n -= 1
+
+        return None
+
+    def _ensuresysstr(j):
+        """Make sure the token at j is a system string
+
+        Remember the given token so the string transformer won't add
+        the byte prefix.
+
+        Ignores tokens that are not strings. Assumes bounds checking has
+        already been done.
+
+        """
+        st = tokens[j]
+        if st.type == token.STRING and st.string.startswith(("'", '"')):
+            sysstrtokens.add(st)
+
+    coldelta = 0  # column increment for new opening parens
+    coloffset = -1  # column offset for the current line (-1: TBD)
+    parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
+    for i, t in enumerate(tokens):
+        # Compute the column offset for the current line, such that
+        # the current line will be aligned to the last opening paren
+        # as before.
+        if coloffset < 0:
+            if t.start[1] == parens[-1][1]:
+                coloffset = parens[-1][2]
+            elif t.start[1] + 1 == parens[-1][1]:
+                # fix misaligned indent of s/util.Abort/error.Abort/
+                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
+            else:
+                coloffset = 0
+
+        # Reset per-line attributes at EOL.
+        if t.type in (token.NEWLINE, tokenize.NL):
+            yield adjusttokenpos(t, coloffset)
+            coldelta = 0
+            coloffset = -1
+            continue
+
+        # Remember the last paren position.
+        if _isop(i, '(', '[', '{'):
+            parens.append(t.end + (coloffset + coldelta,))
+        elif _isop(i, ')', ']', '}'):
+            parens.pop()
+
+        # Convert most string literals to byte literals. String literals
+        # in Python 2 are bytes. String literals in Python 3 are unicode.
+        # Most strings in Mercurial are bytes and unicode strings are rare.
+        # Rather than rewrite all string literals to use ``b''`` to indicate
+        # byte strings, we apply this token transformer to insert the ``b``
+        # prefix nearly everywhere.
+        if t.type == token.STRING and t not in sysstrtokens:
+            s = t.string
+
+            # Preserve docstrings as string literals. This is inconsistent
+            # with regular unprefixed strings. However, the
+            # "from __future__" parsing (which allows a module docstring to
+            # exist before it) doesn't properly handle the docstring if it
+            # is b''' prefixed, leading to a SyntaxError. We leave all
+            # docstrings as unprefixed to avoid this. This means Mercurial
+            # components touching docstrings need to handle unicode,
+            # unfortunately.
+            if s[0:3] in ("'''", '"""'):
+                yield adjusttokenpos(t, coloffset)
+                continue
+
+            # If the first character isn't a quote, it is likely a string
+            # prefixing character (such as 'b', 'u', or 'r'. Ignore.
+            if s[0] not in ("'", '"'):
+                yield adjusttokenpos(t, coloffset)
+                continue
+
+            # String literal. Prefix to make a b'' string.
+            yield adjusttokenpos(t._replace(string='b%s' % t.string),
+                                 coloffset)
+            coldelta += 1
+            continue
+
+        # This looks like a function call.
+        if t.type == token.NAME and _isop(i + 1, '('):
+            fn = t.string
+
+            # *attr() builtins don't accept byte strings to 2nd argument.
+            if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
+                    not _isop(i - 1, '.')):
+                arg1idx = _findargnofcall(1)
+                if arg1idx is not None:
+                    _ensuresysstr(arg1idx)
+
+            # .encode() and .decode() on str/bytes/unicode don't accept
+            # byte strings on Python 3.
+            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
+                for argn in range(2):
+                    argidx = _findargnofcall(argn)
+                    if argidx is not None:
+                        _ensuresysstr(argidx)
+
+            # It changes iteritems/values to items/values as they are not
+            # present in Python 3 world.
+            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
+                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
+                continue
+
+        # Emit unmodified token.
+        yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
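
For reference, a minimal driver for the process() function above. This is a sketch, not part of the changeset: it assumes the script's own imports (token, tokenize) are in scope, that both file objects are opened in binary mode (tokenize.tokenize() reads bytes from fin.readline, and tokenize.untokenize() returns bytes once it has consumed the ENCODING token), and that opts carries at least the 'dictiter' key that replacetokens() consults.

    import io

    # Hypothetical input, chosen to exercise both rewrites shown above.
    source = b"print('hello')\nfor k, v in d.iteritems():\n    pass\n"
    fin = io.BytesIO(source)
    fout = io.BytesIO()
    process(fin, fout, {'dictiter': True})
    # String literals gain the b'' prefix and iteritems() becomes items():
    #   print(b'hello')
    #   for k, v in d.items():
    #       pass
    print(fout.getvalue().decode('utf-8'))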