comparison contrib/byteify-strings.py @ 38390:1d9c97db465f

byteify-strings: fork py3 code transformer to make it a standalone command

I'm thinking of making a one-off s/''/b''/g change across the whole codebase to make the linter happy. We could do that without maintaining this script, but I think it will be somewhat useful for extension authors, so it lives in contrib.
author Yuya Nishihara <yuya@tcha.org>
date Thu, 31 May 2018 22:07:04 +0900
parents mercurial/__init__.py@5246f940a48e
children a2976c27dac4
comparing 38389:23dc901cdf13 with 38390:1d9c97db465f
# byteify-strings.py - transform string literals to be Python 3 safe
#
# Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import io
import token
import tokenize

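# Note: the "if True:" wrapper below is presumably kept so that the function
# body retains the indentation it had in mercurial/__init__.py, from which
# this code was forked, keeping the two copies easy to diff.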
if True:
    def replacetokens(tokens, fullname):
        """Transform a stream of tokens from raw to Python 3.

        Returns a generator of possibly rewritten tokens.

        The input token list may be mutated as part of processing. However,
        its changes do not necessarily match the output token stream.
        """
        futureimpline = False

        # The following utility functions access the tokens list and the i
        # index of the "for i, t in enumerate(tokens)" loop below
        def _isop(j, *o):
            """Check that tokens[j] is an OP with one of the given values"""
            try:
                return tokens[j].type == token.OP and tokens[j].string in o
            except IndexError:
                return False

        def _findargnofcall(n):
            """Find arg n of a call expression (start at 0)

            Returns the index of the first token of that argument, or None if
            there are not that many arguments.

            Assumes that tokens[i + 1] is '('.

            """
            nested = 0
            for j in range(i + 2, len(tokens)):
                if _isop(j, ')', ']', '}'):
                    # end of call, tuple, subscription or dict / set
                    nested -= 1
                    if nested < 0:
                        return None
                elif n == 0:
                    # this is the starting position of arg
                    return j
                elif _isop(j, '(', '[', '{'):
                    nested += 1
                elif _isop(j, ',') and nested == 0:
                    n -= 1

            return None

        def _ensureunicode(j):
            """Make sure the token at j is a unicode string

            This rewrites a string token to include the unicode literal prefix
            so the string transformer won't add the byte prefix.

            Ignores tokens that are not strings. Assumes bounds checking has
            already been done.

            """
            st = tokens[j]
            if st.type == token.STRING and st.string.startswith(("'", '"')):
                tokens[j] = st._replace(string='u%s' % st.string)

        for i, t in enumerate(tokens):
            # Convert most string literals to byte literals. String literals
            # in Python 2 are bytes. String literals in Python 3 are unicode.
            # Most strings in Mercurial are bytes and unicode strings are rare.
            # Rather than rewrite all string literals to use ``b''`` to
            # indicate byte strings, we apply this token transformer to insert
            # the ``b`` prefix nearly everywhere.
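            # For example, "'abc'" becomes "b'abc'", while already-prefixed
            # literals such as "u'abc'" or "r'abc'" fall through untouched
            # below (their first character is not a quote).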
            if t.type == token.STRING:
                s = t.string

                # Preserve docstrings as string literals. This is inconsistent
                # with regular unprefixed strings. However, the
                # "from __future__" parsing (which allows a module docstring to
                # exist before it) doesn't properly handle the docstring if it
                # is b''' prefixed, leading to a SyntaxError. We leave all
                # docstrings as unprefixed to avoid this. This means Mercurial
                # components touching docstrings need to handle unicode,
                # unfortunately.
                if s[0:3] in ("'''", '"""'):
                    yield t
                    continue

                # If the first character isn't a quote, it is likely a string
                # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                if s[0] not in ("'", '"'):
                    yield t
                    continue

                # String literal. Prefix to make a b'' string.
                yield t._replace(string='b%s' % t.string)
                continue

            # Insert compatibility imports at "from __future__ import" line.
            # No '\n' should be added to preserve line numbers.
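            # For example, "from __future__ import absolute_import" becomes
            # "from __future__ import absolute_import; from
            # mercurial.pycompat import ..." on the same physical line.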
            if (t.type == token.NAME and t.string == 'import' and
                all(u.type == token.NAME for u in tokens[i - 2:i]) and
                [u.string for u in tokens[i - 2:i]] == ['from', '__future__']):
                futureimpline = True
            if t.type == token.NEWLINE and futureimpline:
                futureimpline = False
                if fullname == 'mercurial.pycompat':
                    yield t
                    continue
                r, c = t.start
                l = (b'; from mercurial.pycompat import '
                     b'delattr, getattr, hasattr, setattr, xrange, '
                     b'open, unicode\n')
                for u in tokenize.tokenize(io.BytesIO(l).readline):
                    if u.type in (tokenize.ENCODING, token.ENDMARKER):
                        continue
                    yield u._replace(
                        start=(r, c + u.start[1]), end=(r, c + u.end[1]))
                continue

            # This looks like a function call.
            if t.type == token.NAME and _isop(i + 1, '('):
                fn = t.string

                # *attr() builtins don't accept byte strings as their 2nd
                # argument.
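                # For example, in "setattr(obj, 'name', value)" the "'name'"
                # argument is rewritten to u'name' rather than b'name'.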
                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                    not _isop(i - 1, '.')):
                    arg1idx = _findargnofcall(1)
                    if arg1idx is not None:
                        _ensureunicode(arg1idx)

                # .encode() and .decode() on str/bytes/unicode don't accept
                # byte strings on Python 3.
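                # For example, "s.encode('utf-8')" keeps 'utf-8' a unicode
                # string (u'utf-8') instead of byteifying it.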
                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                    for argn in range(2):
                        argidx = _findargnofcall(argn)
                        if argidx is not None:
                            _ensureunicode(argidx)

                # Rewrite iteritems/itervalues to items/values, since the
                # former are not present in Python 3.
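                # For example, "d.iteritems()" is emitted as "d.items()".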
                elif fn in ('iteritems', 'itervalues'):
                    yield t._replace(string=fn[4:])
                    continue

            # Emit unmodified token.
            yield t
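
For context, a minimal driver sketch showing one way to apply the transformer above with the stdlib tokenize module. It is not part of this changeset, and the byteify() name is an assumption for illustration; the script's actual command-line interface is not shown in this excerpt.

import tokenize

def byteify(path, fullname):
    # Hypothetical helper; replacetokens() is the generator defined above.
    with open(path, 'rb') as f:
        # tokenize.tokenize() wants a readline callable over bytes and
        # yields an ENCODING token first, which the transformer passes
        # through unchanged.
        tokens = list(tokenize.tokenize(f.readline))
    # Because the stream starts with an ENCODING token, untokenize()
    # returns encoded bytes ready to be written back out.
    return tokenize.untokenize(replacetokens(tokens, fullname))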