mercurial/encoding.py
changeset 28068 9ece901f7a19
parent 28067 69a02b1e947c
child 28069 b2d24c2898f9
equal deleted inserted replaced
28067:69a02b1e947c 28068:9ece901f7a19
     5 # This software may be used and distributed according to the terms of the
     5 # This software may be used and distributed according to the terms of the
     6 # GNU General Public License version 2 or any later version.
     6 # GNU General Public License version 2 or any later version.
     7 
     7 
     8 from __future__ import absolute_import
     8 from __future__ import absolute_import
     9 
     9 
       
    10 import array
    10 import locale
    11 import locale
    11 import os
    12 import os
    12 import unicodedata
    13 import unicodedata
    13 
    14 
    14 from . import (
    15 from . import (
   378     upper = 1
   379     upper = 1
   379     other = 0
   380     other = 0
   380 
   381 
   381 _jsonmap = []
   382 _jsonmap = []
   382 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
   383 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
   383 _jsonmap.extend(chr(x) for x in xrange(32, 256))
   384 _jsonmap.extend(chr(x) for x in xrange(32, 127))
   384 _jsonmap[0x7f] = '\\u007f'
   385 _jsonmap.append('\\u007f')
   385 _jsonmap[0x09] = '\\t'
   386 _jsonmap[0x09] = '\\t'
   386 _jsonmap[0x0a] = '\\n'
   387 _jsonmap[0x0a] = '\\n'
   387 _jsonmap[0x22] = '\\"'
   388 _jsonmap[0x22] = '\\"'
   388 _jsonmap[0x5c] = '\\\\'
   389 _jsonmap[0x5c] = '\\\\'
   389 _jsonmap[0x08] = '\\b'
   390 _jsonmap[0x08] = '\\b'
   390 _jsonmap[0x0c] = '\\f'
   391 _jsonmap[0x0c] = '\\f'
   391 _jsonmap[0x0d] = '\\r'
   392 _jsonmap[0x0d] = '\\r'
   392 
   393 _paranoidjsonmap = _jsonmap[:]
   393 def jsonescape(s):
   394 _jsonmap.extend(chr(x) for x in xrange(128, 256))
       
   395 
       
   396 def jsonescape(s, paranoid=False):
   394     '''returns a string suitable for JSON
   397     '''returns a string suitable for JSON
   395 
   398 
   396     JSON is problematic for us because it doesn't support non-Unicode
   399     JSON is problematic for us because it doesn't support non-Unicode
   397     bytes. To deal with this, we take the following approach:
   400     bytes. To deal with this, we take the following approach:
   398 
   401 
   413     'a weird byte: \\xed\\xb3\\x9d'
   416     'a weird byte: \\xed\\xb3\\x9d'
   414     >>> jsonescape('utf-8: caf\\xc3\\xa9')
   417     >>> jsonescape('utf-8: caf\\xc3\\xa9')
   415     'utf-8: caf\\xc3\\xa9'
   418     'utf-8: caf\\xc3\\xa9'
   416     >>> jsonescape('')
   419     >>> jsonescape('')
   417     ''
   420     ''
       
   421 
       
   422     If paranoid, non-ascii characters are also escaped. This is suitable for
       
   423     web output.
       
   424 
       
   425     >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
       
   426     'escape boundary: ~ \\\\u007f \\\\u0080'
       
   427     >>> jsonescape('a weird byte: \\xdd', paranoid=True)
       
   428     'a weird byte: \\\\udcdd'
       
   429     >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
       
   430     'utf-8: caf\\\\u00e9'
       
   431     >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
       
   432     'non-BMP: \\\\ud834\\\\udd1e'
   418     '''
   433     '''
   419 
   434 
   420     return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
   435     if paranoid:
       
   436         jm = _paranoidjsonmap
       
   437     else:
       
   438         jm = _jsonmap
       
   439 
       
   440     u8chars = toutf8b(s)
       
   441     try:
       
   442         return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
       
   443     except IndexError:
       
   444         pass
       
   445     # non-BMP char is represented as UTF-16 surrogate pair
       
   446     u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
       
   447     u16codes.pop(0)  # drop BOM
       
   448     return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
   421 
   449 
   422 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   450 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   423 
   451 
   424 def getutf8char(s, pos):
   452 def getutf8char(s, pos):
   425     '''get the next full utf-8 character in the given string, starting at pos
   453     '''get the next full utf-8 character in the given string, starting at pos