diff -r 7b22599dcb85 -r 1c22400db72d mercurial/__init__.py
--- a/mercurial/__init__.py	Fri Jul 15 23:54:56 2016 +0900
+++ b/mercurial/__init__.py	Mon Jul 04 11:18:03 2016 -0700
@@ -121,9 +121,260 @@
         sys.modules[name] = mod
         return mod
 
+# Python 3 uses a custom module loader that transforms source code between
+# source file reading and compilation. This is done by registering a custom
+# finder that changes the spec for Mercurial modules to use a custom loader.
+if sys.version_info[0] >= 3:
+    from . import pure
+    import importlib
+    import importlib.abc
+    import importlib.machinery
+    import io
+    import token
+    import tokenize
+
+    class hgpathentryfinder(importlib.abc.MetaPathFinder):
+        """A sys.meta_path finder that uses a custom module loader."""
+        def find_spec(self, fullname, path, target=None):
+            """Return a spec whose loader is ``hgloader``, or None.
+
+            Only submodules of the ``mercurial``, ``hgext`` and ``hgext3rd``
+            packages are handled, and only when another finder on
+            ``sys.meta_path`` can locate them. Names listed in
+            ``_dualmodules`` are resolved to their ``mercurial.pure``
+            implementation but keep their original name in the returned spec.
+            """
+            # Only handle Mercurial-related modules.
+            if not fullname.startswith(('mercurial.', 'hgext.', 'hgext3rd.')):
+                return None
+
+            # This assumes Python 3 doesn't support loading C modules.
+            if fullname in _dualmodules:
+                stem = fullname.split('.')[-1]
+                fullname = 'mercurial.pure.%s' % stem
+                target = pure
+                assert len(path) == 1
+                path = [os.path.join(path[0], 'pure')]
+
+            # Try to find the module using other registered finders.
+            spec = None
+            for finder in sys.meta_path:
+                if finder is self:
+                    continue
+
+                # Legacy (PEP 302) finders only implement ``find_module``
+                # and have no ``find_spec``. Skip them instead of failing
+                # the whole import with an AttributeError.
+                findspec = getattr(finder, 'find_spec', None)
+                if findspec is None:
+                    continue
+
+                spec = findspec(fullname, path, target=target)
+                if spec:
+                    break
+
+            # This is a Mercurial-related module but we couldn't find it
+            # using the previously-registered finders. This likely means
+            # the module doesn't exist.
+            if not spec:
+                return None
+
+            if fullname.startswith('mercurial.pure.'):
+                spec.name = spec.name.replace('.pure.', '.')
+
+            # TODO need to support loaders from alternate specs, like zip
+            # loaders.
+            spec.loader = hgloader(spec.name, spec.origin)
+            return spec
+
+    def replacetokens(tokens):
+        """Transform a stream of tokens from raw to Python 3.
+
+        It is called by the custom module loading machinery to rewrite
+        source/tokens between source decoding and compilation.
+
+        Returns a generator of possibly rewritten tokens.
+
+        The input token list may be mutated as part of processing. However,
+        its changes do not necessarily match the output token stream.
+
+        REMEMBER TO CHANGE ``BYTECODEHEADER`` WHEN CHANGING THIS FUNCTION
+        OR CACHED FILES WON'T GET INVALIDATED PROPERLY.
+        """
+        for i, t in enumerate(tokens):
+            # Convert most string literals to byte literals. String literals
+            # in Python 2 are bytes. String literals in Python 3 are unicode.
+            # Most strings in Mercurial are bytes and unicode strings are rare.
+            # Rather than rewrite all string literals to use ``b''`` to indicate
+            # byte strings, we apply this token transformer to insert the ``b``
+            # prefix nearly everywhere.
+            if t.type == token.STRING:
+                s = t.string
+
+                # Preserve docstrings as string literals. This is inconsistent
+                # with regular unprefixed strings. However, the
+                # "from __future__" parsing (which allows a module docstring to
+                # exist before it) doesn't properly handle the docstring if it
+                # is b''' prefixed, leading to a SyntaxError. We leave all
+                # docstrings as unprefixed to avoid this. This means Mercurial
+                # components touching docstrings need to handle unicode,
+                # unfortunately.
+                if s[0:3] in ("'''", '"""'):
+                    yield t
+                    continue
+
+                # If the first character isn't a quote, it is likely a string
+                # prefixing character (such as 'b', 'u', or 'r'). Ignore.
+                if s[0] not in ("'", '"'):
+                    yield t
+                    continue
+
+                # String literal. Prefix to make a b'' string.
+                yield tokenize.TokenInfo(t.type, 'b%s' % s, t.start, t.end,
+                                          t.line)
+                continue
+
+            try:
+                nexttoken = tokens[i + 1]
+            except IndexError:
+                nexttoken = None
+
+            # A negative index silently wraps around to the end of the list
+            # instead of raising IndexError, so the first token has to be
+            # special-cased rather than relying on an exception.
+            prevtoken = tokens[i - 1] if i > 0 else None
+
+            # This looks like a function call.
+            if (t.type == token.NAME and nexttoken and
+                nexttoken.type == token.OP and nexttoken.string == '('):
+                fn = t.string
+
+                # *attr() builtins don't accept byte strings to 2nd argument.
+                # Rewrite the token to include the unicode literal prefix so
+                # the string transformer above doesn't add the byte prefix.
+                if fn in ('getattr', 'setattr', 'hasattr', 'safehasattr'):
+                    try:
+                        # (NAME, 'getattr')
+                        # (OP, '(')
+                        # (NAME, 'foo')
+                        # (OP, ',')
+                        # (NAME|STRING, foo)
+                        st = tokens[i + 4]
+                        if (st.type == token.STRING and
+                            st.string[0] in ("'", '"')):
+                            rt = tokenize.TokenInfo(st.type, 'u%s' % st.string,
+                                                    st.start, st.end, st.line)
+                            tokens[i + 4] = rt
+                    except IndexError:
+                        pass
+
+                # .encode() and .decode() on str/bytes/unicode don't accept
+                # byte strings on Python 3. Rewrite the token to include the
+                # unicode literal prefix so the string transformer above doesn't
+                # add the byte prefix.
+                if (fn in ('encode', 'decode') and prevtoken is not None and
+                    prevtoken.type == token.OP and prevtoken.string == '.'):
+                    # (OP, '.')
+                    # (NAME, 'encode')
+                    # (OP, '(')
+                    # (STRING, 'utf-8')
+                    # (OP, ')')
+                    try:
+                        st = tokens[i + 2]
+                        if (st.type == token.STRING and
+                            st.string[0] in ("'", '"')):
+                            rt = tokenize.TokenInfo(st.type, 'u%s' % st.string,
+                                                    st.start, st.end, st.line)
+                            tokens[i + 2] = rt
+                    except IndexError:
+                        pass
+
+            # Emit unmodified token.
+            yield t
+
+    # Header to add to bytecode files. This MUST be changed when
+    # ``replacetokens`` or any mechanism that changes semantics of module
+    # loading is changed. Otherwise cached bytecode may get loaded without
+    # the new transformation mechanisms applied.
+    BYTECODEHEADER = b'HG\x00\x01'
+
+    class hgloader(importlib.machinery.SourceFileLoader):
+        """Custom module loader that transforms source code.
+
+        When the source code is converted to a code object, we transform
+        certain patterns to be Python 3 compatible. This allows us to write code
+        that is natively Python 2 and compatible with Python 3 without
+        making the code excessively ugly.
+
+        We do this by transforming the token stream between parse and compile.
+
+        Implementing transformations invalidates caching assumptions made
+        by the built-in importer. The built-in importer stores a header on
+        saved bytecode files indicating the Python/bytecode version. If the
+        version changes, the cached bytecode is ignored. The Mercurial
+        transformations could change at any time. This means we need to check
+        that cached bytecode was generated with the current transformation
+        code or there could be a mismatch between cached bytecode and what
+        would be generated from this class.
+
+        We supplement the bytecode caching layer by wrapping ``get_data``
+        and ``set_data``. These functions are called when the
+        ``SourceFileLoader`` retrieves and saves bytecode cache files,
+        respectively. We simply add an additional header on the file. As
+        long as the version in this file is changed when semantics change,
+        cached bytecode should be invalidated when transformations change.
+
+        The added header has the form ``HG<VERSION>``. That is a literal
+        ``HG`` with 2 binary bytes indicating the transformation version.
+        """
+        def get_data(self, path):
+            """Read ``path``, validating and stripping the header on bytecode.
+
+            Raises OSError if a bytecode file lacks the expected header.
+            """
+            data = super(hgloader, self).get_data(path)
+
+            if not path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
+                return data
+
+            # There should be a header indicating the Mercurial transformation
+            # version. If it doesn't exist or doesn't match the current version,
+            # we raise an OSError because that is what
+            # ``SourceFileLoader.get_code()`` expects when loading bytecode
+            # paths to indicate the cached file is "bad."
+            if data[0:2] != b'HG':
+                raise OSError('no hg header')
+            if data[0:4] != BYTECODEHEADER:
+                raise OSError('hg header version mismatch')
+
+            return data[4:]
+
+        def set_data(self, path, data, *args, **kwargs):
+            """Write ``data``, prefixing bytecode files with the header."""
+            if path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
+                data = BYTECODEHEADER + data
+
+            return super(hgloader, self).set_data(path, data, *args, **kwargs)
+
+        def source_to_code(self, data, path):
+            """Perform token transformation before compilation."""
+            buf = io.BytesIO(data)
+            tokens = tokenize.tokenize(buf.readline)
+            data = tokenize.untokenize(replacetokens(list(tokens)))
+            # Python's built-in importer strips frames from exceptions raised
+            # for this code. Unfortunately, that mechanism isn't extensible
+            # and our frame will be blamed for the import failure. There
+            # are extremely hacky ways to do frame stripping. We haven't
+            # implemented them because they are very ugly.
+            return super(hgloader, self).source_to_code(data, path)
+
 # We automagically register our custom importer as a side-effect of loading.
 # This is necessary to ensure that any entry points are able to import
 # mercurial.* modules without having to perform this registration themselves.
-if not any(isinstance(x, hgimporter) for x in sys.meta_path):
+if sys.version_info[0] >= 3:
+    _importercls = hgpathentryfinder
+else:
+    _importercls = hgimporter
+if not any(isinstance(x, _importercls) for x in sys.meta_path):
     # meta_path is used before any implicit finders and before sys.path.
-    sys.meta_path.insert(0, hgimporter())
+    sys.meta_path.insert(0, _importercls())