contrib/byteify-strings.py
changeset 38391 f77bbd34a1df
parent 38390 47dd23e6b116
child 39103 da130c5cef90
diff -r 47dd23e6b116 -r f77bbd34a1df contrib/byteify-strings.py
--- a/contrib/byteify-strings.py
+++ b/contrib/byteify-strings.py
@@ -21,159 +21,158 @@
 def adjusttokenpos(t, ofs):
     """Adjust start/end column of the given token"""
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))
 
-if True:
-    def replacetokens(tokens, opts):
-        """Transform a stream of tokens from raw to Python 3.
-
-        Returns a generator of possibly rewritten tokens.
-
-        The input token list may be mutated as part of processing. However,
-        its changes do not necessarily match the output token stream.
-        """
-        sysstrtokens = set()
-
-        # The following utility functions access the tokens list and i index of
-        # the for i, t enumerate(tokens) loop below
-        def _isop(j, *o):
-            """Assert that tokens[j] is an OP with one of the given values"""
-            try:
-                return tokens[j].type == token.OP and tokens[j].string in o
-            except IndexError:
-                return False
-
-        def _findargnofcall(n):
-            """Find arg n of a call expression (start at 0)
-
-            Returns index of the first token of that argument, or None if
-            there is not that many arguments.
-
-            Assumes that token[i + 1] is '('.
-
-            """
-            nested = 0
-            for j in range(i + 2, len(tokens)):
-                if _isop(j, ')', ']', '}'):
-                    # end of call, tuple, subscription or dict / set
-                    nested -= 1
-                    if nested < 0:
-                        return None
-                elif n == 0:
-                    # this is the starting position of arg
-                    return j
-                elif _isop(j, '(', '[', '{'):
-                    nested += 1
-                elif _isop(j, ',') and nested == 0:
-                    n -= 1
-
-            return None
-
-        def _ensuresysstr(j):
-            """Make sure the token at j is a system string
-
-            Remember the given token so the string transformer won't add
-            the byte prefix.
-
-            Ignores tokens that are not strings. Assumes bounds checking has
-            already been done.
-
-            """
-            st = tokens[j]
-            if st.type == token.STRING and st.string.startswith(("'", '"')):
-                sysstrtokens.add(st)
-
-        coldelta = 0  # column increment for new opening parens
-        coloffset = -1  # column offset for the current line (-1: TBD)
-        parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
-        for i, t in enumerate(tokens):
-            # Compute the column offset for the current line, such that
-            # the current line will be aligned to the last opening paren
-            # as before.
-            if coloffset < 0:
-                if t.start[1] == parens[-1][1]:
-                    coloffset = parens[-1][2]
-                elif t.start[1] + 1 == parens[-1][1]:
-                    # fix misaligned indent of s/util.Abort/error.Abort/
-                    coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
-                else:
-                    coloffset = 0
-
-            # Reset per-line attributes at EOL.
-            if t.type in (token.NEWLINE, tokenize.NL):
-                yield adjusttokenpos(t, coloffset)
-                coldelta = 0
-                coloffset = -1
-                continue
-
-            # Remember the last paren position.
-            if _isop(i, '(', '[', '{'):
-                parens.append(t.end + (coloffset + coldelta,))
-            elif _isop(i, ')', ']', '}'):
-                parens.pop()
-
-            # Convert most string literals to byte literals. String literals
-            # in Python 2 are bytes. String literals in Python 3 are unicode.
-            # Most strings in Mercurial are bytes and unicode strings are rare.
-            # Rather than rewrite all string literals to use ``b''`` to indicate
-            # byte strings, we apply this token transformer to insert the ``b``
-            # prefix nearly everywhere.
-            if t.type == token.STRING and t not in sysstrtokens:
-                s = t.string
-
-                # Preserve docstrings as string literals. This is inconsistent
-                # with regular unprefixed strings. However, the
-                # "from __future__" parsing (which allows a module docstring to
-                # exist before it) doesn't properly handle the docstring if it
-                # is b''' prefixed, leading to a SyntaxError. We leave all
-                # docstrings as unprefixed to avoid this. This means Mercurial
-                # components touching docstrings need to handle unicode,
-                # unfortunately.
-                if s[0:3] in ("'''", '"""'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # If the first character isn't a quote, it is likely a string
-                # prefixing character (such as 'b', 'u', or 'r'. Ignore.
-                if s[0] not in ("'", '"'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # String literal. Prefix to make a b'' string.
-                yield adjusttokenpos(t._replace(string='b%s' % t.string),
-                                     coloffset)
-                coldelta += 1
-                continue
-
-            # This looks like a function call.
-            if t.type == token.NAME and _isop(i + 1, '('):
-                fn = t.string
-
-                # *attr() builtins don't accept byte strings to 2nd argument.
-                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
-                        not _isop(i - 1, '.')):
-                    arg1idx = _findargnofcall(1)
-                    if arg1idx is not None:
-                        _ensuresysstr(arg1idx)
-
-                # .encode() and .decode() on str/bytes/unicode don't accept
-                # byte strings on Python 3.
-                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
-                    for argn in range(2):
-                        argidx = _findargnofcall(argn)
-                        if argidx is not None:
-                            _ensuresysstr(argidx)
-
-                # It changes iteritems/values to items/values as they are not
-                # present in Python 3 world.
-                elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
-                    yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
-                    continue
-
-            # Emit unmodified token.
-            yield adjusttokenpos(t, coloffset)
+def replacetokens(tokens, opts):
+    """Transform a stream of tokens from raw to Python 3.
+
+    Returns a generator of possibly rewritten tokens.
+
+    The input token list may be mutated as part of processing. However,
+    its changes do not necessarily match the output token stream.
+    """
+    sysstrtokens = set()
+
+    # The following utility functions access the tokens list and i index of
+    # the for i, t enumerate(tokens) loop below
+    def _isop(j, *o):
+        """Assert that tokens[j] is an OP with one of the given values"""
+        try:
+            return tokens[j].type == token.OP and tokens[j].string in o
+        except IndexError:
+            return False
+
+    def _findargnofcall(n):
+        """Find arg n of a call expression (start at 0)
+
+        Returns index of the first token of that argument, or None if
+        there is not that many arguments.
+
+        Assumes that token[i + 1] is '('.
+
+        """
+        nested = 0
+        for j in range(i + 2, len(tokens)):
+            if _isop(j, ')', ']', '}'):
+                # end of call, tuple, subscription or dict / set
+                nested -= 1
+                if nested < 0:
+                    return None
+            elif n == 0:
+                # this is the starting position of arg
+                return j
+            elif _isop(j, '(', '[', '{'):
+                nested += 1
+            elif _isop(j, ',') and nested == 0:
+                n -= 1
+
+        return None
+
+    def _ensuresysstr(j):
+        """Make sure the token at j is a system string
+
+        Remember the given token so the string transformer won't add
+        the byte prefix.
+
+        Ignores tokens that are not strings. Assumes bounds checking has
+        already been done.
+
+        """
+        st = tokens[j]
+        if st.type == token.STRING and st.string.startswith(("'", '"')):
+            sysstrtokens.add(st)
+
+    coldelta = 0  # column increment for new opening parens
+    coloffset = -1  # column offset for the current line (-1: TBD)
+    parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
+    for i, t in enumerate(tokens):
+        # Compute the column offset for the current line, such that
+        # the current line will be aligned to the last opening paren
+        # as before.
+        if coloffset < 0:
+            if t.start[1] == parens[-1][1]:
+                coloffset = parens[-1][2]
+            elif t.start[1] + 1 == parens[-1][1]:
+                # fix misaligned indent of s/util.Abort/error.Abort/
+                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
+            else:
+                coloffset = 0
+
+        # Reset per-line attributes at EOL.
+        if t.type in (token.NEWLINE, tokenize.NL):
+            yield adjusttokenpos(t, coloffset)
+            coldelta = 0
+            coloffset = -1
+            continue
+
+        # Remember the last paren position.
+        if _isop(i, '(', '[', '{'):
+            parens.append(t.end + (coloffset + coldelta,))
+        elif _isop(i, ')', ']', '}'):
+            parens.pop()
+
+        # Convert most string literals to byte literals. String literals
+        # in Python 2 are bytes. String literals in Python 3 are unicode.
+        # Most strings in Mercurial are bytes and unicode strings are rare.
+        # Rather than rewrite all string literals to use ``b''`` to indicate
+        # byte strings, we apply this token transformer to insert the ``b``
+        # prefix nearly everywhere.
+        if t.type == token.STRING and t not in sysstrtokens:
+            s = t.string
+
+            # Preserve docstrings as string literals. This is inconsistent
+            # with regular unprefixed strings. However, the
+            # "from __future__" parsing (which allows a module docstring to
+            # exist before it) doesn't properly handle the docstring if it
+            # is b''' prefixed, leading to a SyntaxError. We leave all
+            # docstrings as unprefixed to avoid this. This means Mercurial
+            # components touching docstrings need to handle unicode,
+            # unfortunately.
+            if s[0:3] in ("'''", '"""'):
+                yield adjusttokenpos(t, coloffset)
+                continue
+
+            # If the first character isn't a quote, it is likely a string
+            # prefixing character (such as 'b', 'u', or 'r'. Ignore.
+            if s[0] not in ("'", '"'):
+                yield adjusttokenpos(t, coloffset)
+                continue
+
+            # String literal. Prefix to make a b'' string.
+            yield adjusttokenpos(t._replace(string='b%s' % t.string),
+                                 coloffset)
+            coldelta += 1
+            continue
+
+        # This looks like a function call.
+        if t.type == token.NAME and _isop(i + 1, '('):
+            fn = t.string
+
+            # *attr() builtins don't accept byte strings to 2nd argument.
+            if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
+                    not _isop(i - 1, '.')):
+                arg1idx = _findargnofcall(1)
+                if arg1idx is not None:
+                    _ensuresysstr(arg1idx)
+
+            # .encode() and .decode() on str/bytes/unicode don't accept
+            # byte strings on Python 3.
+            elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
+                for argn in range(2):
+                    argidx = _findargnofcall(argn)
+                    if argidx is not None:
+                        _ensuresysstr(argidx)
+
+            # It changes iteritems/values to items/values as they are not
+            # present in Python 3 world.
+            elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
+                yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
+                continue
+
+        # Emit unmodified token.
+        yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
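
For reference, a minimal driver for the process() function above. This is a sketch, not part of the changeset: it assumes the script's own imports (token, tokenize) are in scope, that both file objects are opened in binary mode (tokenize.tokenize() reads bytes from fin.readline, and tokenize.untokenize() returns bytes once it has consumed the ENCODING token), and that opts carries at least the 'dictiter' key that replacetokens() consults.

    import io

    # Hypothetical input, chosen to exercise both rewrites shown above.
    source = b"print('hello')\nfor k, v in d.iteritems():\n    pass\n"
    fin = io.BytesIO(source)
    fout = io.BytesIO()
    process(fin, fout, {'dictiter': True})
    # String literals gain the b'' prefix and iteritems() becomes items():
    #   print(b'hello')
    #   for k, v in d.items():
    #       pass
    print(fout.getvalue().decode('utf-8'))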