# byteify-strings.py - transform string literals to be Python 3 safe
#
# Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import io
import token
import tokenize


def replacetokens(tokens, fullname):
    """Transform a stream of tokens from raw to Python 3.

    Returns a generator of possibly rewritten tokens.

    The input token list may be mutated as part of processing. However,
    its changes do not necessarily match the output token stream.
    """
    futureimpline = False

    # The following utility functions access the ``tokens`` list and the
    # ``i`` index of the ``for i, t in enumerate(tokens)`` loop below.
    def _isop(j, *o):
        """Return True if tokens[j] is an OP with one of the given values"""
        try:
            return tokens[j].type == token.OP and tokens[j].string in o
        except IndexError:
            return False
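
    # For example, ``_isop(i + 1, '(')`` in the main loop below checks
    # whether the token after the current one is the opening parenthesis
    # of a call.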

    def _findargnofcall(n):
        """Find arg n of a call expression (start at 0)

        Returns the index of the first token of that argument, or None if
        there are not that many arguments.

        Assumes that tokens[i + 1] is '('.
        """
        nested = 0
        for j in range(i + 2, len(tokens)):
            if _isop(j, ')', ']', '}'):
                # end of call, tuple, subscription or dict / set
                nested -= 1
                if nested < 0:
                    return None
            elif n == 0:
                # this is the starting position of arg
                return j
            elif _isop(j, '(', '[', '{'):
                nested += 1
            elif _isop(j, ',') and nested == 0:
                n -= 1

        return None
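
    # For example, given tokens spelling ``getattr(obj, 'name', default)``
    # with ``i`` at the ``getattr`` token, ``_findargnofcall(1)`` returns
    # the index of the ``'name'`` string token.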

    def _ensureunicode(j):
        """Make sure the token at j is a unicode string.

        This rewrites a string token to include the unicode literal prefix
        so the string transformer won't add the byte prefix.

        Ignores tokens that are not strings. Assumes bounds checking has
        already been done.
        """
        st = tokens[j]
        if st.type == token.STRING and st.string.startswith(("'", '"')):
            tokens[j] = st._replace(string='u%s' % st.string)
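
    # For example, a ``'name'`` token becomes ``u'name'``; the STRING
    # branch of the loop below then leaves it unprefixed because it no
    # longer starts with a bare quote character.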

    for i, t in enumerate(tokens):
        # Convert most string literals to byte literals. String literals
        # in Python 2 are bytes. String literals in Python 3 are unicode.
        # Most strings in Mercurial are bytes and unicode strings are rare.
        # Rather than rewrite all string literals to use ``b''`` to indicate
        # byte strings, we apply this token transformer to insert the ``b``
        # prefix nearly everywhere.
        if t.type == token.STRING:
            s = t.string

            # Preserve docstrings as string literals. This is inconsistent
            # with regular unprefixed strings. However, the
            # "from __future__" parsing (which allows a module docstring to
            # exist before it) doesn't properly handle the docstring if it
            # is b''' prefixed, leading to a SyntaxError. We leave all
            # docstrings as unprefixed to avoid this. This means Mercurial
            # components touching docstrings need to handle unicode,
            # unfortunately.
            if s[0:3] in ("'''", '"""'):
                yield t
                continue

            # If the first character isn't a quote, it is likely a string
            # prefixing character (such as 'b', 'u', or 'r'). Ignore it.
            if s[0] not in ("'", '"'):
                yield t
                continue

            # String literal. Prefix to make a b'' string.
            yield t._replace(string='b%s' % t.string)
            continue
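
        # For example, ``'abc'`` is emitted as ``b'abc'``, while literals
        # that already carry a prefix (``u'abc'``, ``r'abc'``) and triple-
        # quoted docstrings pass through unchanged.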

        # Insert compatibility imports at the "from __future__ import"
        # line. No '\n' is added, so line numbers are preserved.
        if (t.type == token.NAME and t.string == 'import' and
            all(u.type == token.NAME for u in tokens[i - 2:i]) and
            [u.string for u in tokens[i - 2:i]] == ['from', '__future__']):
            futureimpline = True
        if t.type == token.NEWLINE and futureimpline:
            futureimpline = False
            if fullname == 'mercurial.pycompat':
                yield t
                continue
            r, c = t.start
            l = (b'; from mercurial.pycompat import '
                 b'delattr, getattr, hasattr, setattr, xrange, '
                 b'open, unicode\n')
            for u in tokenize.tokenize(io.BytesIO(l).readline):
                if u.type in (tokenize.ENCODING, token.ENDMARKER):
                    continue
                yield u._replace(
                    start=(r, c + u.start[1]), end=(r, c + u.end[1]))
            continue
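
        # For example, a line reading ``from __future__ import
        # absolute_import`` gains ``; from mercurial.pycompat import ...``
        # at its end, keeping everything on one physical line.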

        # This looks like a function call.
        if t.type == token.NAME and _isop(i + 1, '('):
            fn = t.string

            # *attr() builtins don't accept byte strings as the 2nd
            # argument.
            if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                not _isop(i - 1, '.')):
                arg1idx = _findargnofcall(1)
                if arg1idx is not None:
                    _ensureunicode(arg1idx)

            # .encode() and .decode() on str/bytes/unicode don't accept
            # byte strings on Python 3.
            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                for argn in range(2):
                    argidx = _findargnofcall(argn)
                    if argidx is not None:
                        _ensureunicode(argidx)

            # Rewrite iteritems/itervalues to items/values, since the
            # former do not exist in Python 3.
            elif fn in ('iteritems', 'itervalues'):
                yield t._replace(string=fn[4:])
                continue
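
        # For example, ``d.iteritems()`` is rewritten to ``d.items()`` and
        # ``d.itervalues()`` to ``d.values()``.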

        # Emit unmodified token.
        yield t
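

# A minimal driver sketch (an assumption for illustration; the original
# script does not show how it is invoked). It presumes Python 3, where
# tokenize.tokenize() yields TokenInfo named tuples supporting ._replace(),
# and a command line of: <source path> <dotted module name>.
if __name__ == '__main__':
    import sys

    with open(sys.argv[1], 'rb') as fh:
        # replacetokens() indexes into the token sequence, so materialize
        # the stream into a list first.
        tokens = list(tokenize.tokenize(fh.readline))
    rewritten = replacetokens(tokens, fullname=sys.argv[2])
    # untokenize() returns bytes here because the stream retains its
    # ENCODING token; emit them verbatim.
    sys.stdout.buffer.write(tokenize.untokenize(rewritten))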