Mercurial > public > mercurial-scm > hg-stable
comparison contrib/byteify-strings.py @ 38395:1d68fd5f614a
byteify-strings: do not rewrite system string literals to u''
It would make things worse on Python 2 because unicode processing is generally
slower than byte string processing. We should just leave system strings unmodified.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Thu, 31 May 2018 23:44:35 +0900 |
parents | f701bc936e7f |
children | 47dd23e6b116 |
comparison
equal
deleted
inserted
replaced
38394:f701bc936e7f | 38395:1d68fd5f614a |
---|---|
25 Returns a generator of possibly rewritten tokens. | 25 Returns a generator of possibly rewritten tokens. |
26 | 26 |
27 The input token list may be mutated as part of processing. However, | 27 The input token list may be mutated as part of processing. However, |
28 its changes do not necessarily match the output token stream. | 28 its changes do not necessarily match the output token stream. |
29 """ | 29 """ |
30 sysstrtokens = set() | |
31 | |
30 # The following utility functions access the tokens list and i index of | 32 # The following utility functions access the tokens list and i index of |
31 # the for i, t in enumerate(tokens) loop below | 33 # the for i, t in enumerate(tokens) loop below |
32 def _isop(j, *o): | 34 def _isop(j, *o): |
33 """Assert that tokens[j] is an OP with one of the given values""" | 35 """Assert that tokens[j] is an OP with one of the given values""" |
34 try: | 36 try: |
60 elif _isop(j, ',') and nested == 0: | 62 elif _isop(j, ',') and nested == 0: |
61 n -= 1 | 63 n -= 1 |
62 | 64 |
63 return None | 65 return None |
64 | 66 |
65 def _ensureunicode(j): | 67 def _ensuresysstr(j): |
66 """Make sure the token at j is a unicode string | 68 """Make sure the token at j is a system string |
67 | 69 |
68 This rewrites a string token to include the unicode literal prefix | 70 Remember the given token so the string transformer won't add |
69 so the string transformer won't add the byte prefix. | 71 the byte prefix. |
70 | 72 |
71 Ignores tokens that are not strings. Assumes bounds checking has | 73 Ignores tokens that are not strings. Assumes bounds checking has |
72 already been done. | 74 already been done. |
73 | 75 |
74 """ | 76 """ |
75 st = tokens[j] | 77 st = tokens[j] |
76 if st.type == token.STRING and st.string.startswith(("'", '"')): | 78 if st.type == token.STRING and st.string.startswith(("'", '"')): |
77 tokens[j] = st._replace(string='u%s' % st.string) | 79 sysstrtokens.add(st) |
78 | 80 |
79 for i, t in enumerate(tokens): | 81 for i, t in enumerate(tokens): |
80 # Convert most string literals to byte literals. String literals | 82 # Convert most string literals to byte literals. String literals |
81 # in Python 2 are bytes. String literals in Python 3 are unicode. | 83 # in Python 2 are bytes. String literals in Python 3 are unicode. |
82 # Most strings in Mercurial are bytes and unicode strings are rare. | 84 # Most strings in Mercurial are bytes and unicode strings are rare. |
83 # Rather than rewrite all string literals to use ``b''`` to indicate | 85 # Rather than rewrite all string literals to use ``b''`` to indicate |
84 # byte strings, we apply this token transformer to insert the ``b`` | 86 # byte strings, we apply this token transformer to insert the ``b`` |
85 # prefix nearly everywhere. | 87 # prefix nearly everywhere. |
86 if t.type == token.STRING: | 88 if t.type == token.STRING and t not in sysstrtokens: |
87 s = t.string | 89 s = t.string |
88 | 90 |
89 # Preserve docstrings as string literals. This is inconsistent | 91 # Preserve docstrings as string literals. This is inconsistent |
90 # with regular unprefixed strings. However, the | 92 # with regular unprefixed strings. However, the |
91 # "from __future__" parsing (which allows a module docstring to | 93 # "from __future__" parsing (which allows a module docstring to |
115 # *attr() builtins don't accept byte strings to 2nd argument. | 117 # *attr() builtins don't accept byte strings to 2nd argument. |
116 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and | 118 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and |
117 not _isop(i - 1, '.')): | 119 not _isop(i - 1, '.')): |
118 arg1idx = _findargnofcall(1) | 120 arg1idx = _findargnofcall(1) |
119 if arg1idx is not None: | 121 if arg1idx is not None: |
120 _ensureunicode(arg1idx) | 122 _ensuresysstr(arg1idx) |
121 | 123 |
122 # .encode() and .decode() on str/bytes/unicode don't accept | 124 # .encode() and .decode() on str/bytes/unicode don't accept |
123 # byte strings on Python 3. | 125 # byte strings on Python 3. |
124 elif fn in ('encode', 'decode') and _isop(i - 1, '.'): | 126 elif fn in ('encode', 'decode') and _isop(i - 1, '.'): |
125 for argn in range(2): | 127 for argn in range(2): |
126 argidx = _findargnofcall(argn) | 128 argidx = _findargnofcall(argn) |
127 if argidx is not None: | 129 if argidx is not None: |
128 _ensureunicode(argidx) | 130 _ensuresysstr(argidx) |
129 | 131 |
130 # It changes iteritems/values to items/values as they are not | 132 # It changes iteritems/values to items/values as they are not |
131 # present in Python 3 world. | 133 # present in Python 3 world. |
132 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'): | 134 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'): |
133 yield t._replace(string=fn[4:]) | 135 yield t._replace(string=fn[4:]) |