Mercurial > public > mercurial-scm > hg
diff mercurial/pure/charencode.py @ 33924:b9101467d88b
encoding: extract stub for fast JSON escape
This moves JSON character maps to pure/charencode.py because they will be
used only when the fast-path fails.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 16:10:51 +0900 |
parents | f5fc54e7e467 |
children | f4433f2713d0 |
line wrap: on
line diff
--- a/mercurial/pure/charencode.py Thu Aug 24 21:43:54 2017 -0700
+++ b/mercurial/pure/charencode.py Sun Apr 23 16:10:51 2017 +0900
@@ -7,6 +7,12 @@
 
 from __future__ import absolute_import
 
+import array
+
+from .. import (
+    pycompat,
+)
+
 def asciilower(s):
     '''convert a string to lowercase if ASCII
 
@@ -20,3 +26,47 @@
     Raises UnicodeDecodeError if non-ASCII characters are found.'''
     s.decode('ascii')
     return s.upper()
+
+_jsonmap = []
+_jsonmap.extend("\\u%04x" % x for x in range(32))
+_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
+_jsonmap.append('\\u007f')
+_jsonmap[0x09] = '\\t'
+_jsonmap[0x0a] = '\\n'
+_jsonmap[0x22] = '\\"'
+_jsonmap[0x5c] = '\\\\'
+_jsonmap[0x08] = '\\b'
+_jsonmap[0x0c] = '\\f'
+_jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
+_paranoidjsonmap[0x3e] = '\\u003e' # '>'
+_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
+
+def jsonescapeu8fast(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (fast path)
+
+    Raises ValueError if non-ASCII characters have to be escaped.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars))
+    except IndexError:
+        raise ValueError
+
+def jsonescapeu8fallback(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (slow path)
+
+    Escapes all non-ASCII characters no matter if paranoid is False.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0) # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)