comparison contrib/byteify-strings.py @ 38395:1d68fd5f614a

byteify-strings: do not rewrite system string literals to u''. It would make things worse on Python 2, because unicode processing is generally slower than byte-string processing. We should just leave system strings unmodified.
author Yuya Nishihara <yuya@tcha.org>
date Thu, 31 May 2018 23:44:35 +0900
parents f701bc936e7f
children 47dd23e6b116
comparison
equal deleted inserted replaced
38394:f701bc936e7f 38395:1d68fd5f614a
25 Returns a generator of possibly rewritten tokens. 25 Returns a generator of possibly rewritten tokens.
26 26
27 The input token list may be mutated as part of processing. However, 27 The input token list may be mutated as part of processing. However,
28 its changes do not necessarily match the output token stream. 28 its changes do not necessarily match the output token stream.
29 """ 29 """
30 sysstrtokens = set()
31
30 # The following utility functions access the tokens list and i index of 32 # The following utility functions access the tokens list and i index of
31 # the for i, t enumerate(tokens) loop below 33 # the for i, t enumerate(tokens) loop below
32 def _isop(j, *o): 34 def _isop(j, *o):
33 """Assert that tokens[j] is an OP with one of the given values""" 35 """Assert that tokens[j] is an OP with one of the given values"""
34 try: 36 try:
60 elif _isop(j, ',') and nested == 0: 62 elif _isop(j, ',') and nested == 0:
61 n -= 1 63 n -= 1
62 64
63 return None 65 return None
64 66
65 def _ensureunicode(j): 67 def _ensuresysstr(j):
66 """Make sure the token at j is a unicode string 68 """Make sure the token at j is a system string
67 69
68 This rewrites a string token to include the unicode literal prefix 70 Remember the given token so the string transformer won't add
69 so the string transformer won't add the byte prefix. 71 the byte prefix.
70 72
71 Ignores tokens that are not strings. Assumes bounds checking has 73 Ignores tokens that are not strings. Assumes bounds checking has
72 already been done. 74 already been done.
73 75
74 """ 76 """
75 st = tokens[j] 77 st = tokens[j]
76 if st.type == token.STRING and st.string.startswith(("'", '"')): 78 if st.type == token.STRING and st.string.startswith(("'", '"')):
77 tokens[j] = st._replace(string='u%s' % st.string) 79 sysstrtokens.add(st)
78 80
79 for i, t in enumerate(tokens): 81 for i, t in enumerate(tokens):
80 # Convert most string literals to byte literals. String literals 82 # Convert most string literals to byte literals. String literals
81 # in Python 2 are bytes. String literals in Python 3 are unicode. 83 # in Python 2 are bytes. String literals in Python 3 are unicode.
82 # Most strings in Mercurial are bytes and unicode strings are rare. 84 # Most strings in Mercurial are bytes and unicode strings are rare.
83 # Rather than rewrite all string literals to use ``b''`` to indicate 85 # Rather than rewrite all string literals to use ``b''`` to indicate
84 # byte strings, we apply this token transformer to insert the ``b`` 86 # byte strings, we apply this token transformer to insert the ``b``
85 # prefix nearly everywhere. 87 # prefix nearly everywhere.
86 if t.type == token.STRING: 88 if t.type == token.STRING and t not in sysstrtokens:
87 s = t.string 89 s = t.string
88 90
89 # Preserve docstrings as string literals. This is inconsistent 91 # Preserve docstrings as string literals. This is inconsistent
90 # with regular unprefixed strings. However, the 92 # with regular unprefixed strings. However, the
91 # "from __future__" parsing (which allows a module docstring to 93 # "from __future__" parsing (which allows a module docstring to
115 # *attr() builtins don't accept byte strings to 2nd argument. 117 # *attr() builtins don't accept byte strings to 2nd argument.
116 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and 118 if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
117 not _isop(i - 1, '.')): 119 not _isop(i - 1, '.')):
118 arg1idx = _findargnofcall(1) 120 arg1idx = _findargnofcall(1)
119 if arg1idx is not None: 121 if arg1idx is not None:
120 _ensureunicode(arg1idx) 122 _ensuresysstr(arg1idx)
121 123
122 # .encode() and .decode() on str/bytes/unicode don't accept 124 # .encode() and .decode() on str/bytes/unicode don't accept
123 # byte strings on Python 3. 125 # byte strings on Python 3.
124 elif fn in ('encode', 'decode') and _isop(i - 1, '.'): 126 elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
125 for argn in range(2): 127 for argn in range(2):
126 argidx = _findargnofcall(argn) 128 argidx = _findargnofcall(argn)
127 if argidx is not None: 129 if argidx is not None:
128 _ensureunicode(argidx) 130 _ensuresysstr(argidx)
129 131
130 # It changes iteritems/values to items/values as they are not 132 # It changes iteritems/values to items/values as they are not
131 # present in Python 3 world. 133 # present in Python 3 world.
132 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'): 134 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
133 yield t._replace(string=fn[4:]) 135 yield t._replace(string=fn[4:])