mercurial/encoding.py
changeset 33924 b9101467d88b
parent 33852 f18b11534274
child 33925 2c37f9dabc32
equal deleted inserted replaced
33923:e6d421566906 33924:b9101467d88b
     5 # This software may be used and distributed according to the terms of the
     5 # This software may be used and distributed according to the terms of the
     6 # GNU General Public License version 2 or any later version.
     6 # GNU General Public License version 2 or any later version.
     7 
     7 
     8 from __future__ import absolute_import
     8 from __future__ import absolute_import
     9 
     9 
    10 import array
       
    11 import io
    10 import io
    12 import locale
    11 import locale
    13 import os
    12 import os
    14 import unicodedata
    13 import unicodedata
    15 
    14 
    17     error,
    16     error,
    18     policy,
    17     policy,
    19     pycompat,
    18     pycompat,
    20 )
    19 )
    21 
    20 
       
    21 from .pure import (
       
    22     charencode as charencodepure,
       
    23 )
       
    24 
    22 charencode = policy.importmod(r'charencode')
    25 charencode = policy.importmod(r'charencode')
    23 
    26 
    24 asciilower = charencode.asciilower
    27 asciilower = charencode.asciilower
    25 asciiupper = charencode.asciiupper
    28 asciiupper = charencode.asciiupper
       
    29 _jsonescapeu8fast = charencodepure.jsonescapeu8fast  # TODO: no "pure"
    26 
    30 
    27 _sysstr = pycompat.sysstr
    31 _sysstr = pycompat.sysstr
    28 
    32 
    29 if pycompat.ispy3:
    33 if pycompat.ispy3:
    30     unichr = chr
    34     unichr = chr
   381     This should be kept in sync with normcase_spec in util.h.'''
   385     This should be kept in sync with normcase_spec in util.h.'''
   382     lower = -1
   386     lower = -1
   383     upper = 1
   387     upper = 1
   384     other = 0
   388     other = 0
   385 
   389 
   386 _jsonmap = []
       
   387 _jsonmap.extend("\\u%04x" % x for x in range(32))
       
   388 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
       
   389 _jsonmap.append('\\u007f')
       
   390 _jsonmap[0x09] = '\\t'
       
   391 _jsonmap[0x0a] = '\\n'
       
   392 _jsonmap[0x22] = '\\"'
       
   393 _jsonmap[0x5c] = '\\\\'
       
   394 _jsonmap[0x08] = '\\b'
       
   395 _jsonmap[0x0c] = '\\f'
       
   396 _jsonmap[0x0d] = '\\r'
       
   397 _paranoidjsonmap = _jsonmap[:]
       
   398 _paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
       
   399 _paranoidjsonmap[0x3e] = '\\u003e'  # '>'
       
   400 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
       
   401 
       
   402 def jsonescape(s, paranoid=False):
   390 def jsonescape(s, paranoid=False):
   403     '''returns a string suitable for JSON
   391     '''returns a string suitable for JSON
   404 
   392 
   405     JSON is problematic for us because it doesn't support non-Unicode
   393     JSON is problematic for us because it doesn't support non-Unicode
   406     bytes. To deal with this, we take the following approach:
   394     bytes. To deal with this, we take the following approach:
   438     'non-BMP: \\\\ud834\\\\udd1e'
   426     'non-BMP: \\\\ud834\\\\udd1e'
   439     >>> jsonescape('<foo@example.org>', paranoid=True)
   427     >>> jsonescape('<foo@example.org>', paranoid=True)
   440     '\\\\u003cfoo@example.org\\\\u003e'
   428     '\\\\u003cfoo@example.org\\\\u003e'
   441     '''
   429     '''
   442 
   430 
   443     if paranoid:
       
   444         jm = _paranoidjsonmap
       
   445     else:
       
   446         jm = _jsonmap
       
   447 
       
   448     u8chars = toutf8b(s)
   431     u8chars = toutf8b(s)
   449     try:
   432     try:
   450         return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
   433         return _jsonescapeu8fast(u8chars, paranoid)
   451     except IndexError:
   434     except ValueError:
   452         pass
   435         pass
   453     # non-BMP char is represented as UTF-16 surrogate pair
   436     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
   454     u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
       
   455     u16codes.pop(0)  # drop BOM
       
   456     return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
       
   457 
   437 
   458 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   438 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   459 
   439 
   460 def getutf8char(s, pos):
   440 def getutf8char(s, pos):
   461     '''get the next full utf-8 character in the given string, starting at pos
   441     '''get the next full utf-8 character in the given string, starting at pos