comparison mercurial/encoding.py @ 33924:b9101467d88b

encoding: extract stub for fast JSON escape. This moves JSON character maps to pure/charencode.py because they will be used only when the fast-path fails.
author Yuya Nishihara <yuya@tcha.org>
date Sun, 23 Apr 2017 16:10:51 +0900
parents f18b11534274
children 2c37f9dabc32
comparison
equal deleted inserted replaced
33923:e6d421566906 33924:b9101467d88b
5 # This software may be used and distributed according to the terms of the 5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version. 6 # GNU General Public License version 2 or any later version.
7 7
8 from __future__ import absolute_import 8 from __future__ import absolute_import
9 9
10 import array
11 import io 10 import io
12 import locale 11 import locale
13 import os 12 import os
14 import unicodedata 13 import unicodedata
15 14
17 error, 16 error,
18 policy, 17 policy,
19 pycompat, 18 pycompat,
20 ) 19 )
21 20
21 from .pure import (
22 charencode as charencodepure,
23 )
24
22 charencode = policy.importmod(r'charencode') 25 charencode = policy.importmod(r'charencode')
23 26
24 asciilower = charencode.asciilower 27 asciilower = charencode.asciilower
25 asciiupper = charencode.asciiupper 28 asciiupper = charencode.asciiupper
29 _jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure"
26 30
27 _sysstr = pycompat.sysstr 31 _sysstr = pycompat.sysstr
28 32
29 if pycompat.ispy3: 33 if pycompat.ispy3:
30 unichr = chr 34 unichr = chr
381 This should be kept in sync with normcase_spec in util.h.''' 385 This should be kept in sync with normcase_spec in util.h.'''
382 lower = -1 386 lower = -1
383 upper = 1 387 upper = 1
384 other = 0 388 other = 0
385 389
386 _jsonmap = []
387 _jsonmap.extend("\\u%04x" % x for x in range(32))
388 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
389 _jsonmap.append('\\u007f')
390 _jsonmap[0x09] = '\\t'
391 _jsonmap[0x0a] = '\\n'
392 _jsonmap[0x22] = '\\"'
393 _jsonmap[0x5c] = '\\\\'
394 _jsonmap[0x08] = '\\b'
395 _jsonmap[0x0c] = '\\f'
396 _jsonmap[0x0d] = '\\r'
397 _paranoidjsonmap = _jsonmap[:]
398 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
399 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
400 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
401
402 def jsonescape(s, paranoid=False): 390 def jsonescape(s, paranoid=False):
403 '''returns a string suitable for JSON 391 '''returns a string suitable for JSON
404 392
405 JSON is problematic for us because it doesn't support non-Unicode 393 JSON is problematic for us because it doesn't support non-Unicode
406 bytes. To deal with this, we take the following approach: 394 bytes. To deal with this, we take the following approach:
438 'non-BMP: \\\\ud834\\\\udd1e' 426 'non-BMP: \\\\ud834\\\\udd1e'
439 >>> jsonescape('<foo@example.org>', paranoid=True) 427 >>> jsonescape('<foo@example.org>', paranoid=True)
440 '\\\\u003cfoo@example.org\\\\u003e' 428 '\\\\u003cfoo@example.org\\\\u003e'
441 ''' 429 '''
442 430
443 if paranoid:
444 jm = _paranoidjsonmap
445 else:
446 jm = _jsonmap
447
448 u8chars = toutf8b(s) 431 u8chars = toutf8b(s)
449 try: 432 try:
450 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path 433 return _jsonescapeu8fast(u8chars, paranoid)
451 except IndexError: 434 except ValueError:
452 pass 435 pass
453 # non-BMP char is represented as UTF-16 surrogate pair 436 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
454 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
455 u16codes.pop(0) # drop BOM
456 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
457 437
458 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] 438 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
459 439
460 def getutf8char(s, pos): 440 def getutf8char(s, pos):
461 '''get the next full utf-8 character in the given string, starting at pos 441 '''get the next full utf-8 character in the given string, starting at pos