mercurial-scm/hg: comparison mercurial/encoding.py

equal deleted inserted replaced

-:69a02b1e947c
+:9ece901f7a19
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 from __future__ import absolute_import
+import array
 import locale
 import os
 import unicodedata
 from . import (
 upper = 1
 other = 0
 _jsonmap = []
 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
-_jsonmap.extend(chr(x) for x in xrange(32, 256))
+_jsonmap.extend(chr(x) for x in xrange(32, 127))
-_jsonmap[0x7f] = '\\u007f'
+_jsonmap.append('\\u007f')
 _jsonmap[0x09] = '\\t'
 _jsonmap[0x0a] = '\\n'
 _jsonmap[0x22] = '\\"'
 _jsonmap[0x5c] = '\\\\'
 _jsonmap[0x08] = '\\b'
 _jsonmap[0x0c] = '\\f'
 _jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
-def jsonescape(s):
+_jsonmap.extend(chr(x) for x in xrange(128, 256))
+def jsonescape(s, paranoid=False):
 '''returns a string suitable for JSON
 JSON is problematic for us because it doesn't support non-Unicode
 bytes. To deal with this, we take the following approach:
 'a weird byte: \\xed\\xb3\\x9d'
 >>> jsonescape('utf-8: caf\\xc3\\xa9')
 'utf-8: caf\\xc3\\xa9'
 >>> jsonescape('')
 ''
+If paranoid, non-ASCII characters are also escaped, so the result is pure
+ASCII. This is suitable for web output.
+>>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+'escape boundary: ~ \\\\u007f \\\\u0080'
+>>> jsonescape('a weird byte: \\xdd', paranoid=True)
+'a weird byte: \\\\udcdd'
+>>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+'utf-8: caf\\\\u00e9'
+>>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+'non-BMP: \\\\ud834\\\\udd1e'
 '''
-return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
+if paranoid:
+jm = _paranoidjsonmap
+else:
+jm = _jsonmap
+u8chars = toutf8b(s)
+try:
+return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
+except IndexError:
+pass
+# slow path: as UTF-16 code units, a non-BMP char becomes a surrogate pair
+u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+u16codes.pop(0)  # drop BOM
+return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 def getutf8char(s, pos):
 '''get the next full utf-8 character in the given string, starting at pos

changeset 28068	9ece901f7a19
parent 28067	69a02b1e947c
child 28069	b2d24c2898f9