Mercurial > public > mercurial-scm > hg-stable
comparison contrib/byteify-strings.py @ 38390:1d9c97db465f
byteify-strings: fork py3 code transformer to make it a standalone command
I'm thinking of making a one-off s/''/b''/g change for the overall codebase to
make the linter happy. We could do that without maintaining the script, but I
think it will be somewhat useful for extension authors, so it is in contrib.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Thu, 31 May 2018 22:07:04 +0900 |
parents | mercurial/__init__.py@5246f940a48e |
children | a2976c27dac4 |
comparison
equal
deleted
inserted
replaced
38389:23dc901cdf13 | 38390:1d9c97db465f |
---|---|
1 # byteify-strings.py - transform string literals to be Python 3 safe | |
2 # | |
3 # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com> | |
4 # | |
5 # This software may be used and distributed according to the terms of the | |
6 # GNU General Public License version 2 or any later version. | |
7 | |
8 from __future__ import absolute_import | |
9 | |
10 import io | |
11 import token | |
12 import tokenize | |
13 | |
# NOTE(review): the `if True:` wrapper has no runtime effect — presumably kept
# to preserve the indentation of the code this script was forked from, so the
# diff against the original transformer stays minimal. Confirm before removing.
if True:
    def replacetokens(tokens, fullname):
        """Transform a stream of tokens from raw to Python 3.

        ``tokens`` is a list of ``tokenize.TokenInfo`` namedtuples and
        ``fullname`` is the dotted module name of the file being processed
        (compared against 'mercurial.pycompat' below).

        Returns a generator of possibly rewritten tokens.

        The input token list may be mutated as part of processing. However,
        its changes do not necessarily match the output token stream.
        """
        # Set when the "import" keyword of a "from __future__ import" line is
        # seen; cleared at the NEWLINE that ends that line, which is where the
        # compatibility imports get injected.
        futureimpline = False

        # The following utility functions access the tokens list and i index of
        # the "for i, t in enumerate(tokens)" loop below
        def _isop(j, *o):
            """Assert that tokens[j] is an OP with one of the given values"""
            try:
                return tokens[j].type == token.OP and tokens[j].string in o
            except IndexError:
                # Lookahead (j == i + 1) may run past the end of the stream.
                return False

        def _findargnofcall(n):
            """Find arg n of a call expression (start at 0)

            Returns index of the first token of that argument, or None if
            there is not that many arguments.

            Assumes that token[i + 1] is '('.

            """
            nested = 0
            # Scan forward from the first token after the opening '('.
            for j in range(i + 2, len(tokens)):
                if _isop(j, ')', ']', '}'):
                    # end of call, tuple, subscription or dict / set
                    nested -= 1
                    if nested < 0:
                        # Closed the call's own '(' before reaching arg n.
                        return None
                elif n == 0:
                    # this is the starting position of arg
                    return j
                elif _isop(j, '(', '[', '{'):
                    nested += 1
                elif _isop(j, ',') and nested == 0:
                    # A top-level comma separates arguments; count down to n.
                    n -= 1

            return None

        def _ensureunicode(j):
            """Make sure the token at j is a unicode string

            This rewrites a string token to include the unicode literal prefix
            so the string transformer won't add the byte prefix.

            Ignores tokens that are not strings. Assumes bounds checking has
            already been done.

            """
            st = tokens[j]
            # Only unprefixed literals (starting directly with a quote) are
            # rewritten; already-prefixed strings (b'', r'', u'') are left alone.
            if st.type == token.STRING and st.string.startswith(("'", '"')):
                tokens[j] = st._replace(string='u%s' % st.string)

        for i, t in enumerate(tokens):
            # Convert most string literals to byte literals. String literals
            # in Python 2 are bytes. String literals in Python 3 are unicode.
            # Most strings in Mercurial are bytes and unicode strings are rare.
            # Rather than rewrite all string literals to use ``b''`` to indicate
            # byte strings, we apply this token transformer to insert the ``b``
            # prefix nearly everywhere.
            if t.type == token.STRING:
                s = t.string

                # Preserve docstrings as string literals. This is inconsistent
                # with regular unprefixed strings. However, the
                # "from __future__" parsing (which allows a module docstring to
                # exist before it) doesn't properly handle the docstring if it
                # is b''' prefixed, leading to a SyntaxError. We leave all
                # docstrings as unprefixed to avoid this. This means Mercurial
                # components touching docstrings need to handle unicode,
                # unfortunately.
                # NOTE(review): this check matches ANY triple-quoted string,
                # not just docstring positions — apparently a deliberate
                # conservative choice.
                if s[0:3] in ("'''", '"""'):
                    yield t
                    continue

                # If the first character isn't a quote, it is likely a string
                # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                if s[0] not in ("'", '"'):
                    yield t
                    continue

                # String literal. Prefix to make a b'' string.
                yield t._replace(string='b%s' % t.string)
                continue

            # Insert compatibility imports at "from __future__ import" line.
            # No '\n' should be added to preserve line numbers.
            # The two tokens preceding "import" must be the NAME tokens
            # 'from' and '__future__' for this to trigger.
            if (t.type == token.NAME and t.string == 'import' and
                all(u.type == token.NAME for u in tokens[i - 2:i]) and
                [u.string for u in tokens[i - 2:i]] == ['from', '__future__']):
                futureimpline = True
            if t.type == token.NEWLINE and futureimpline:
                futureimpline = False
                # pycompat itself defines these names; don't import them there.
                if fullname == 'mercurial.pycompat':
                    yield t
                    continue
                r, c = t.start
                # Appended on the same line (after ';') so that the line
                # numbers of everything below remain unchanged.
                l = (b'; from mercurial.pycompat import '
                     b'delattr, getattr, hasattr, setattr, xrange, '
                     b'open, unicode\n')
                for u in tokenize.tokenize(io.BytesIO(l).readline):
                    # Drop the synthetic ENCODING/ENDMARKER tokens produced by
                    # tokenizing the standalone snippet.
                    if u.type in (tokenize.ENCODING, token.ENDMARKER):
                        continue
                    # Re-anchor the injected tokens at the NEWLINE's position.
                    yield u._replace(
                        start=(r, c + u.start[1]), end=(r, c + u.end[1]))
                continue

            # This looks like a function call.
            if t.type == token.NAME and _isop(i + 1, '('):
                fn = t.string

                # *attr() builtins don't accept byte strings to 2nd argument.
                # (The "not _isop(i - 1, '.')" guard skips method calls that
                # merely share these names.)
                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
                        not _isop(i - 1, '.')):
                    arg1idx = _findargnofcall(1)
                    if arg1idx is not None:
                        # Mutates tokens[] in place; the rewritten string is
                        # emitted when the loop reaches it.
                        _ensureunicode(arg1idx)

                # .encode() and .decode() on str/bytes/unicode don't accept
                # byte strings on Python 3.
                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
                    for argn in range(2):
                        argidx = _findargnofcall(argn)
                        if argidx is not None:
                            _ensureunicode(argidx)

                # It changes iteritems/values to items/values as they are not
                # present in Python 3 world.
                elif fn in ('iteritems', 'itervalues'):
                    yield t._replace(string=fn[4:])
                    continue

            # Emit unmodified token.
            yield t