Mercurial > public > mercurial-scm > hg
comparison mercurial/encoding.py @ 33924:b9101467d88b
encoding: extract stub for fast JSON escape
This moves the JSON character maps to pure/charencode.py because they will be
used only when the fast path fails.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 16:10:51 +0900 |
parents | f18b11534274 |
children | 2c37f9dabc32 |
comparison legend:
equal
deleted
inserted
replaced
33923:e6d421566906 | 33924:b9101467d88b |
---|---|
5 # This software may be used and distributed according to the terms of the | 5 # This software may be used and distributed according to the terms of the |
6 # GNU General Public License version 2 or any later version. | 6 # GNU General Public License version 2 or any later version. |
7 | 7 |
8 from __future__ import absolute_import | 8 from __future__ import absolute_import |
9 | 9 |
10 import array | |
11 import io | 10 import io |
12 import locale | 11 import locale |
13 import os | 12 import os |
14 import unicodedata | 13 import unicodedata |
15 | 14 |
17 error, | 16 error, |
18 policy, | 17 policy, |
19 pycompat, | 18 pycompat, |
20 ) | 19 ) |
21 | 20 |
21 from .pure import ( | |
22 charencode as charencodepure, | |
23 ) | |
24 | |
22 charencode = policy.importmod(r'charencode') | 25 charencode = policy.importmod(r'charencode') |
23 | 26 |
24 asciilower = charencode.asciilower | 27 asciilower = charencode.asciilower |
25 asciiupper = charencode.asciiupper | 28 asciiupper = charencode.asciiupper |
29 _jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure" | |
26 | 30 |
27 _sysstr = pycompat.sysstr | 31 _sysstr = pycompat.sysstr |
28 | 32 |
29 if pycompat.ispy3: | 33 if pycompat.ispy3: |
30 unichr = chr | 34 unichr = chr |
381 This should be kept in sync with normcase_spec in util.h.''' | 385 This should be kept in sync with normcase_spec in util.h.''' |
382 lower = -1 | 386 lower = -1 |
383 upper = 1 | 387 upper = 1 |
384 other = 0 | 388 other = 0 |
385 | 389 |
386 _jsonmap = [] | |
387 _jsonmap.extend("\\u%04x" % x for x in range(32)) | |
388 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127)) | |
389 _jsonmap.append('\\u007f') | |
390 _jsonmap[0x09] = '\\t' | |
391 _jsonmap[0x0a] = '\\n' | |
392 _jsonmap[0x22] = '\\"' | |
393 _jsonmap[0x5c] = '\\\\' | |
394 _jsonmap[0x08] = '\\b' | |
395 _jsonmap[0x0c] = '\\f' | |
396 _jsonmap[0x0d] = '\\r' | |
397 _paranoidjsonmap = _jsonmap[:] | |
398 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>") | |
399 _paranoidjsonmap[0x3e] = '\\u003e' # '>' | |
400 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256)) | |
401 | |
402 def jsonescape(s, paranoid=False): | 390 def jsonescape(s, paranoid=False): |
403 '''returns a string suitable for JSON | 391 '''returns a string suitable for JSON |
404 | 392 |
405 JSON is problematic for us because it doesn't support non-Unicode | 393 JSON is problematic for us because it doesn't support non-Unicode |
406 bytes. To deal with this, we take the following approach: | 394 bytes. To deal with this, we take the following approach: |
438 'non-BMP: \\\\ud834\\\\udd1e' | 426 'non-BMP: \\\\ud834\\\\udd1e' |
439 >>> jsonescape('<foo@example.org>', paranoid=True) | 427 >>> jsonescape('<foo@example.org>', paranoid=True) |
440 '\\\\u003cfoo@example.org\\\\u003e' | 428 '\\\\u003cfoo@example.org\\\\u003e' |
441 ''' | 429 ''' |
442 | 430 |
443 if paranoid: | |
444 jm = _paranoidjsonmap | |
445 else: | |
446 jm = _jsonmap | |
447 | |
448 u8chars = toutf8b(s) | 431 u8chars = toutf8b(s) |
449 try: | 432 try: |
450 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path | 433 return _jsonescapeu8fast(u8chars, paranoid) |
451 except IndexError: | 434 except ValueError: |
452 pass | 435 pass |
453 # non-BMP char is represented as UTF-16 surrogate pair | 436 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
454 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16')) | |
455 u16codes.pop(0) # drop BOM | |
456 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes) | |
457 | 437 |
458 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] | 438 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
459 | 439 |
460 def getutf8char(s, pos): | 440 def getutf8char(s, pos): |
461 '''get the next full utf-8 character in the given string, starting at pos | 441 '''get the next full utf-8 character in the given string, starting at pos |