mercurial/encoding.py
changeset 37947 3ea3c96ada54
parent 37946 57b0c7221dba
child 38739 7acec9408e1c
equal deleted inserted replaced
37946:57b0c7221dba 37947:3ea3c96ada54
    90         s = bytes.__new__(cls, l)
    90         s = bytes.__new__(cls, l)
    91         s._utf8 = u
    91         s._utf8 = u
    92         return s
    92         return s
    93     def __hash__(self):
    93     def __hash__(self):
    94         return hash(self._utf8) # avoid collisions in local string space
    94         return hash(self._utf8) # avoid collisions in local string space
       
    95 
       
    96 class safelocalstr(bytes):
       
    97     """Tagged string denoting it was previously an internal UTF-8 string,
       
    98     and can be converted back to UTF-8 losslessly
       
    99 
       
   100     >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
       
   101     >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
       
   102     >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
       
   103     >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
       
   104     """
    95 
   105 
    96 def tolocal(s):
   106 def tolocal(s):
    97     """
   107     """
    98     Convert a string from internal UTF-8 to local encoding
   108     Convert a string from internal UTF-8 to local encoding
    99 
   109 
   138                 # fast path
   148                 # fast path
   139                 return s
   149                 return s
   140             r = u.encode(_sysstr(encoding), u"replace")
   150             r = u.encode(_sysstr(encoding), u"replace")
   141             if u == r.decode(_sysstr(encoding)):
   151             if u == r.decode(_sysstr(encoding)):
   142                 # r is a safe, non-lossy encoding of s
   152                 # r is a safe, non-lossy encoding of s
   143                 return r
   153                 return safelocalstr(r)
   144             return localstr(s, r)
   154             return localstr(s, r)
   145         except UnicodeDecodeError:
   155         except UnicodeDecodeError:
   146             # we should only get here if we're looking at an ancient changeset
   156             # we should only get here if we're looking at an ancient changeset
   147             try:
   157             try:
   148                 u = s.decode(_sysstr(fallbackencoding))
   158                 u = s.decode(_sysstr(fallbackencoding))
   149                 r = u.encode(_sysstr(encoding), u"replace")
   159                 r = u.encode(_sysstr(encoding), u"replace")
   150                 if u == r.decode(_sysstr(encoding)):
   160                 if u == r.decode(_sysstr(encoding)):
   151                     # r is a safe, non-lossy encoding of s
   161                     # r is a safe, non-lossy encoding of s
   152                     return r
   162                     return safelocalstr(r)
   153                 return localstr(u.encode('UTF-8'), r)
   163                 return localstr(u.encode('UTF-8'), r)
   154             except UnicodeDecodeError:
   164             except UnicodeDecodeError:
   155                 u = s.decode("utf-8", "replace") # last ditch
   165                 u = s.decode("utf-8", "replace") # last ditch
   156                 # can't round-trip
   166                 # can't round-trip
   157                 return u.encode(_sysstr(encoding), u"replace")
   167                 return u.encode(_sysstr(encoding), u"replace")
   400     '''returns a string suitable for JSON
   410     '''returns a string suitable for JSON
   401 
   411 
   402     JSON is problematic for us because it doesn't support non-Unicode
   412     JSON is problematic for us because it doesn't support non-Unicode
   403     bytes. To deal with this, we take the following approach:
   413     bytes. To deal with this, we take the following approach:
   404 
   414 
   405     - localstr objects are converted back to UTF-8
   415     - localstr/safelocalstr objects are converted back to UTF-8
   406     - valid UTF-8/ASCII strings are passed as-is
   416     - valid UTF-8/ASCII strings are passed as-is
   407     - other strings are converted to UTF-8b surrogate encoding
   417     - other strings are converted to UTF-8b surrogate encoding
   408     - apply JSON-specified string escaping
   418     - apply JSON-specified string escaping
   409 
   419 
   410     (escapes are doubled in these tests)
   420     (escapes are doubled in these tests)
   493     - filenames and file contents in arbitrary other encodings can
   503     - filenames and file contents in arbitrary other encodings can
   494       be round-tripped or recovered by clueful clients
   504       be round-tripped or recovered by clueful clients
   495     - local strings that have a cached known UTF-8 encoding (aka
   505     - local strings that have a cached known UTF-8 encoding (aka
   496       localstr) get sent as UTF-8 so Unicode-oriented clients get the
   506       localstr) get sent as UTF-8 so Unicode-oriented clients get the
   497       Unicode data they want
   507       Unicode data they want
       
   508     - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
   498     - because we must preserve UTF-8 bytestring in places such as
   509     - because we must preserve UTF-8 bytestring in places such as
   499       filenames, metadata can't be roundtripped without help
   510       filenames, metadata can't be roundtripped without help
   500 
   511 
   501     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
   512     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
   502     arbitrary bytes into an internal Unicode format that can be
   513     arbitrary bytes into an internal Unicode format that can be
   506 
   517 
   507     if isinstance(s, localstr):
   518     if isinstance(s, localstr):
   508         # assume that the original UTF-8 sequence would never contain
   519         # assume that the original UTF-8 sequence would never contain
   509         # invalid characters in U+DCxx range
   520         # invalid characters in U+DCxx range
   510         return s._utf8
   521         return s._utf8
       
   522     elif isinstance(s, safelocalstr):
       
   523         # already verified that s is non-lossy in legacy encoding, which
       
   524         # shouldn't contain characters in U+DCxx range
       
   525         return fromlocal(s)
   511     elif isasciistr(s):
   526     elif isasciistr(s):
   512         return s
   527         return s
   513     if "\xed" not in s:
   528     if "\xed" not in s:
   514         try:
   529         try:
   515             s.decode('utf-8', _utf8strict)
   530             s.decode('utf-8', _utf8strict)