comparison contrib/byteify-strings.py @ 38397:f77bbd34a1df
byteify-strings: remove superfluous "if True" block
author   | Yuya Nishihara <yuya@tcha.org>
date     | Sun, 03 Jun 2018 18:19:54 +0900
parents  | 47dd23e6b116
children | da130c5cef90
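The change is purely structural: the superfluous `if True:` wrapper around `replacetokens()` is dropped and the function body is dedented one level. No logic changes; the unified diff below therefore consists of the removed wrapper line plus a re-indentation of every line in the function, with the surrounding `adjusttokenpos()` and `process()` shown as context. A usage sketch of the script follows the diff.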
--- a/contrib/byteify-strings.py	changeset 38396:47dd23e6b116
+++ b/contrib/byteify-strings.py	changeset 38397:f77bbd34a1df
@@ -21,159 +21,158 @@
 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))
 
-if True:
-    def replacetokens(tokens, opts):
-        """Transform a stream of tokens from raw to Python 3.
-
-        Returns a generator of possibly rewritten tokens.
-
-        The input token list may be mutated as part of processing. However,
-        its changes do not necessarily match the output token stream.
-        """
-        sysstrtokens = set()
-
-        # The following utility functions access the tokens list and i index of
-        # the for i, t in enumerate(tokens) loop below
-        def _isop(j, *o):
-            """Assert that tokens[j] is an OP with one of the given values"""
-            try:
-                return tokens[j].type == token.OP and tokens[j].string in o
-            except IndexError:
-                return False
-
-        def _findargnofcall(n):
-            """Find arg n of a call expression (start at 0)
-
-            Returns index of the first token of that argument, or None if
-            there are not that many arguments.
-
-            Assumes that tokens[i + 1] is '('.
-
-            """
-            nested = 0
-            for j in range(i + 2, len(tokens)):
-                if _isop(j, ')', ']', '}'):
-                    # end of call, tuple, subscription or dict / set
-                    nested -= 1
-                    if nested < 0:
-                        return None
-                elif n == 0:
-                    # this is the starting position of arg
-                    return j
-                elif _isop(j, '(', '[', '{'):
-                    nested += 1
-                elif _isop(j, ',') and nested == 0:
-                    n -= 1
-
-            return None
-
-        def _ensuresysstr(j):
-            """Make sure the token at j is a system string
-
-            Remember the given token so the string transformer won't add
-            the byte prefix.
-
-            Ignores tokens that are not strings. Assumes bounds checking has
-            already been done.
-
-            """
-            st = tokens[j]
-            if st.type == token.STRING and st.string.startswith(("'", '"')):
-                sysstrtokens.add(st)
-
-        coldelta = 0  # column increment for new opening parens
-        coloffset = -1  # column offset for the current line (-1: TBD)
-        parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
-        for i, t in enumerate(tokens):
-            # Compute the column offset for the current line, such that
-            # the current line will be aligned to the last opening paren
-            # as before.
-            if coloffset < 0:
-                if t.start[1] == parens[-1][1]:
-                    coloffset = parens[-1][2]
-                elif t.start[1] + 1 == parens[-1][1]:
-                    # fix misaligned indent of s/util.Abort/error.Abort/
-                    coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
-                else:
-                    coloffset = 0
-
-            # Reset per-line attributes at EOL.
-            if t.type in (token.NEWLINE, tokenize.NL):
-                yield adjusttokenpos(t, coloffset)
-                coldelta = 0
-                coloffset = -1
-                continue
-
-            # Remember the last paren position.
-            if _isop(i, '(', '[', '{'):
-                parens.append(t.end + (coloffset + coldelta,))
-            elif _isop(i, ')', ']', '}'):
-                parens.pop()
-
-            # Convert most string literals to byte literals. String literals
-            # in Python 2 are bytes. String literals in Python 3 are unicode.
-            # Most strings in Mercurial are bytes and unicode strings are rare.
-            # Rather than rewrite all string literals to use ``b''`` to indicate
-            # byte strings, we apply this token transformer to insert the ``b``
-            # prefix nearly everywhere.
-            if t.type == token.STRING and t not in sysstrtokens:
-                s = t.string
-
-                # Preserve docstrings as string literals. This is inconsistent
-                # with regular unprefixed strings. However, the
-                # "from __future__" parsing (which allows a module docstring to
-                # exist before it) doesn't properly handle the docstring if it
-                # is b''' prefixed, leading to a SyntaxError. We leave all
-                # docstrings as unprefixed to avoid this. This means Mercurial
-                # components touching docstrings need to handle unicode,
-                # unfortunately.
-                if s[0:3] in ("'''", '"""'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # If the first character isn't a quote, it is likely a string
-                # prefixing character (such as 'b', 'u', or 'r'). Ignore.
-                if s[0] not in ("'", '"'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # String literal. Prefix to make a b'' string.
-                yield adjusttokenpos(t._replace(string='b%s' % t.string),
                                      coloffset)
-                coldelta += 1
-                continue
-
-            # This looks like a function call.
-            if t.type == token.NAME and _isop(i + 1, '('):
-                fn = t.string
-
-                # *attr() builtins don't accept byte strings as their 2nd argument.
-                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
-                        not _isop(i - 1, '.')):
-                    arg1idx = _findargnofcall(1)
-                    if arg1idx is not None:
-                        _ensuresysstr(arg1idx)
-
-                # .encode() and .decode() on str/bytes/unicode don't accept
-                # byte strings on Python 3.
-                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
-                    for argn in range(2):
-                        argidx = _findargnofcall(argn)
-                        if argidx is not None:
-                            _ensuresysstr(argidx)
-
-                # Change iteritems/itervalues to items/values, as they are not
-                # present in the Python 3 world.
-                elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
-                    yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
-                    continue
-
-            # Emit unmodified token.
-            yield adjusttokenpos(t, coloffset)
+def replacetokens(tokens, opts):
+    """Transform a stream of tokens from raw to Python 3.
+
+    Returns a generator of possibly rewritten tokens.
+
+    The input token list may be mutated as part of processing. However,
+    its changes do not necessarily match the output token stream.
+    """
+    sysstrtokens = set()
+
+    # The following utility functions access the tokens list and i index of
+    # the for i, t in enumerate(tokens) loop below
+    def _isop(j, *o):
+        """Assert that tokens[j] is an OP with one of the given values"""
+        try:
+            return tokens[j].type == token.OP and tokens[j].string in o
+        except IndexError:
+            return False
+
+    def _findargnofcall(n):
+        """Find arg n of a call expression (start at 0)
+
+        Returns index of the first token of that argument, or None if
+        there are not that many arguments.
+
+        Assumes that tokens[i + 1] is '('.
+
+        """
+        nested = 0
+        for j in range(i + 2, len(tokens)):
+            if _isop(j, ')', ']', '}'):
+                # end of call, tuple, subscription or dict / set
+                nested -= 1
+                if nested < 0:
+                    return None
+            elif n == 0:
+                # this is the starting position of arg
+                return j
+            elif _isop(j, '(', '[', '{'):
+                nested += 1
+            elif _isop(j, ',') and nested == 0:
+                n -= 1
+
+        return None
+
+    def _ensuresysstr(j):
+        """Make sure the token at j is a system string
+
+        Remember the given token so the string transformer won't add
+        the byte prefix.
+
+        Ignores tokens that are not strings. Assumes bounds checking has
+        already been done.
+
+        """
+        st = tokens[j]
+        if st.type == token.STRING and st.string.startswith(("'", '"')):
+            sysstrtokens.add(st)
+
+    coldelta = 0  # column increment for new opening parens
+    coloffset = -1  # column offset for the current line (-1: TBD)
+    parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
+    for i, t in enumerate(tokens):
+        # Compute the column offset for the current line, such that
+        # the current line will be aligned to the last opening paren
+        # as before.
+        if coloffset < 0:
+            if t.start[1] == parens[-1][1]:
+                coloffset = parens[-1][2]
+            elif t.start[1] + 1 == parens[-1][1]:
+                # fix misaligned indent of s/util.Abort/error.Abort/
+                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
+            else:
+                coloffset = 0
+
+        # Reset per-line attributes at EOL.
+        if t.type in (token.NEWLINE, tokenize.NL):
+            yield adjusttokenpos(t, coloffset)
+            coldelta = 0
+            coloffset = -1
+            continue
+
+        # Remember the last paren position.
+        if _isop(i, '(', '[', '{'):
+            parens.append(t.end + (coloffset + coldelta,))
+        elif _isop(i, ')', ']', '}'):
+            parens.pop()
+
+        # Convert most string literals to byte literals. String literals
+        # in Python 2 are bytes. String literals in Python 3 are unicode.
+        # Most strings in Mercurial are bytes and unicode strings are rare.
+        # Rather than rewrite all string literals to use ``b''`` to indicate
+        # byte strings, we apply this token transformer to insert the ``b``
+        # prefix nearly everywhere.
+        if t.type == token.STRING and t not in sysstrtokens:
+            s = t.string
+
+            # Preserve docstrings as string literals. This is inconsistent
+            # with regular unprefixed strings. However, the
+            # "from __future__" parsing (which allows a module docstring to
+            # exist before it) doesn't properly handle the docstring if it
+            # is b''' prefixed, leading to a SyntaxError. We leave all
+            # docstrings as unprefixed to avoid this. This means Mercurial
+            # components touching docstrings need to handle unicode,
+            # unfortunately.
+            if s[0:3] in ("'''", '"""'):
+                yield adjusttokenpos(t, coloffset)
+                continue
+
+            # If the first character isn't a quote, it is likely a string
+            # prefixing character (such as 'b', 'u', or 'r'). Ignore.
+            if s[0] not in ("'", '"'):
+                yield adjusttokenpos(t, coloffset)
+                continue
+
+            # String literal. Prefix to make a b'' string.
+            yield adjusttokenpos(t._replace(string='b%s' % t.string),
+                                 coloffset)
+            coldelta += 1
+            continue
+
+        # This looks like a function call.
+        if t.type == token.NAME and _isop(i + 1, '('):
+            fn = t.string
+
+            # *attr() builtins don't accept byte strings as their 2nd argument.
+            if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
+                    not _isop(i - 1, '.')):
+                arg1idx = _findargnofcall(1)
+                if arg1idx is not None:
+                    _ensuresysstr(arg1idx)
+
+            # .encode() and .decode() on str/bytes/unicode don't accept
+            # byte strings on Python 3.
+            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
+                for argn in range(2):
+                    argidx = _findargnofcall(argn)
+                    if argidx is not None:
+                        _ensuresysstr(argidx)
+
+            # Change iteritems/itervalues to items/values, as they are not
+            # present in the Python 3 world.
+            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
+                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
+                continue
+
+        # Emit unmodified token.
+        yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
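For readers who want to try the transformer, here is a minimal sketch of driving `process()` in-memory. It is not part of the changeset; the script path and the sample input are assumptions for illustration, and the hyphenated file name forces a manual `importlib` load rather than a plain `import`:

import importlib.util
import io

# Load contrib/byteify-strings.py under an importable name (the real file
# name contains a hyphen, so a regular import statement cannot be used).
# The relative path assumes we run from the Mercurial source tree root.
spec = importlib.util.spec_from_file_location(
    'byteify_strings', 'contrib/byteify-strings.py')
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

# Hypothetical sample input: one plain string literal and one dict iterator.
src = b"name = 'hg'\nfor k, v in d.iteritems():\n    print(k)\n"

fin = io.BytesIO(src)   # tokenize.tokenize() expects a bytes readline
fout = io.BytesIO()     # tokenize.untokenize() returns encoded bytes
mod.process(fin, fout, {'dictiter': True})
print(fout.getvalue().decode('utf-8'))
# Expected output (roughly):
#   name = b'hg'
#   for k, v in d.items():
#       print(k)

In normal use the script is invoked from the command line on Mercurial source files; the in-memory harness above merely avoids touching files while exercising the two rewrites implemented by `replacetokens()`: the `b` prefix insertion and the `dictiter` renaming.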