mercurial/encoding.py
changeset 16133 84c58da3a1f8
parent 15769 afdf4f5bac61
child 16274 5d75eb8568d1
equal deleted inserted replaced
16132:41fc1e078d68 16133:84c58da3a1f8
   188         return uu.encode(encoding)
   188         return uu.encode(encoding)
   189     except UnicodeError:
   189     except UnicodeError:
   190         return s.upper() # we don't know how to fold this except in ASCII
   190         return s.upper() # we don't know how to fold this except in ASCII
   191     except LookupError, k:
   191     except LookupError, k:
   192         raise error.Abort(k, hint="please check your locale settings")
   192         raise error.Abort(k, hint="please check your locale settings")
       
   193 
       
   194 def toutf8b(s):
       
   195     '''convert a local, possibly-binary string into UTF-8b
       
   196 
       
   197     This is intended as a generic method to preserve data when working
       
   198     with schemes like JSON and XML that have no provision for
       
   199     arbitrary byte strings. As Mercurial often doesn't know
       
   200     what encoding data is in, we use so-called UTF-8b.
       
   201 
       
   202     If a string is already valid UTF-8 (or ASCII), it passes unmodified.
       
   203     Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
       
   204     uDC00-uDCFF.
       
   205 
       
   206     Principles of operation:
       
   207 
       
   208     - ASCII and UTF-8 data sucessfully round-trips and is understood
       
   209       by Unicode-oriented clients
       
   210     - filenames and file contents in arbitrary other encodings can have
       
   211       be round-tripped or recovered by clueful clients
       
   212     - local strings that have a cached known UTF-8 encoding (aka
       
   213       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       
   214       Unicode data they want
       
   215     - because we must preserve UTF-8 bytestring in places such as
       
   216       filenames, metadata can't be roundtripped without help
       
   217 
       
   218     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
       
   219     arbitrary bytes into an internal Unicode format that can be
       
   220     re-encoded back into the original. Here we are exposing the
       
   221     internal surrogate encoding as a UTF-8 string.)
       
   222     '''
       
   223 
       
   224     if isinstance(s, localstr):
       
   225         return s._utf8
       
   226 
       
   227     try:
       
   228         if s.decode('utf-8'):
       
   229             return s
       
   230     except UnicodeDecodeError:
       
   231         # surrogate-encode any characters that don't round-trip
       
   232         s2 = s.decode('utf-8', 'ignore').encode('utf-8')
       
   233         r = ""
       
   234         pos = 0
       
   235         for c in s:
       
   236             if s2[pos:pos + 1] == c:
       
   237                 r += c
       
   238                 pos += 1
       
   239             else:
       
   240                 r += unichr(0xdc00 + ord(c)).encode('utf-8')
       
   241         return r
       
   242 
       
   243 def fromutf8b(s):
       
   244     '''Given a UTF-8b string, return a local, possibly-binary string.
       
   245 
       
   246     return the original binary string. This
       
   247     is a round-trip process for strings like filenames, but metadata
       
   248     that's was passed through tolocal will remain in UTF-8.
       
   249 
       
   250     >>> m = "\\xc3\\xa9\\x99abcd"
       
   251     >>> n = toutf8b(m)
       
   252     >>> n
       
   253     '\\xc3\\xa9\\xed\\xb2\\x99abcd'
       
   254     >>> fromutf8b(n) == m
       
   255     True
       
   256     '''
       
   257 
       
   258     # fast path - look for uDxxx prefixes in s
       
   259     if "\xed" not in s:
       
   260         return s
       
   261 
       
   262     u = s.decode("utf-8")
       
   263     r = ""
       
   264     for c in u:
       
   265         if ord(c) & 0xff00 == 0xdc00:
       
   266             r += chr(ord(c) & 0xff)
       
   267         else:
       
   268             r += c.encode("utf-8")
       
   269     return r