hgext/highlight/highlight.py
author Gregory Szorc <gregory.szorc@gmail.com>
Wed, 14 Oct 2015 18:22:16 -0700
changeset 26680 7a3f6490ef97
parent 25899 c35ee1bbbbdc
child 27637 b502138f5faa
permissions -rw-r--r--
highlight: add option to prevent content-only based fallback When Mozilla enabled Pygments on hg.mozilla.org, we got a lot of weirdly colorized files. Upon further investigation, the hightlight extension is first attempting a filename+content based match then falling back to a purely content-driven detection mode in Pygments. Sounds good in theory. Unfortunately, Pygments' content-driven detection establishes no minimum threshold for returning a lexer. Furthermore, the detection code for a number of languages is very liberal. For example, ActionScript 3 will return a confidence of 0.3 (out of 1.0) if the first 1k of the file we pass in matches the regex "\w+\s*:\s*\w"! Python matches on "import ". It's no coincidence that a number of our extension-less files were getting highlighted improperly. This patch adds an option to have the highlighter not fall back to purely content-based detection when filename+content detection failed. This can be enabled to render unlighted text instead of taking the risk that unknown file types are highlighted incorrectly. The old behavior is still the default.

# highlight.py - highlight extension implementation file
#
#  Copyright 2007-2009 Adam Hupp <adam@hupp.org> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
#
# The original module was split in an interface and an implementation
# file to defer pygments loading and speedup extension setup.

from mercurial import demandimport
demandimport.ignore.extend(['pkgutil', 'pkg_resources', '__main__'])
from mercurial import util, encoding

from pygments import highlight
from pygments.util import ClassNotFound
from pygments.lexers import guess_lexer, guess_lexer_for_filename, TextLexer
from pygments.formatters import HtmlFormatter

SYNTAX_CSS = ('\n<link rel="stylesheet" href="{url}highlightcss" '
              'type="text/css" />')

def pygmentize(field, fctx, style, tmpl, guessfilenameonly=False):

    # append a <link ...> to the syntax highlighting css
    old_header = tmpl.load('header')
    if SYNTAX_CSS not in old_header:
        new_header =  old_header + SYNTAX_CSS
        tmpl.cache['header'] = new_header

    text = fctx.data()
    if util.binary(text):
        return

    # str.splitlines() != unicode.splitlines() because "reasons"
    for c in "\x0c\x1c\x1d\x1e":
        if c in text:
            text = text.replace(c, '')

    # Pygments is best used with Unicode strings:
    # <http://pygments.org/docs/unicode/>
    text = text.decode(encoding.encoding, 'replace')

    # To get multi-line strings right, we can't format line-by-line
    try:
        lexer = guess_lexer_for_filename(fctx.path(), text[:1024],
                                         stripnl=False)
    except (ClassNotFound, ValueError):
        # guess_lexer will return a lexer if *any* lexer matches. There is
        # no way to specify a minimum match score. This can give a high rate of
        # false positives on files with an unknown filename pattern.
        if guessfilenameonly:
            return

        try:
            lexer = guess_lexer(text[:1024], stripnl=False)
        except (ClassNotFound, ValueError):
            # Don't highlight unknown files
            return

    # Don't highlight text files
    if isinstance(lexer, TextLexer):
        return

    formatter = HtmlFormatter(nowrap=True, style=style)

    colorized = highlight(text, lexer, formatter)
    coloriter = (s.encode(encoding.encoding, 'replace')
                 for s in colorized.splitlines())

    tmpl.filters['colorize'] = lambda x: coloriter.next()

    oldl = tmpl.cache[field]
    newl = oldl.replace('line|escape', 'line|colorize')
    tmpl.cache[field] = newl