188 return uu.encode(encoding) |
188 return uu.encode(encoding) |
189 except UnicodeError: |
189 except UnicodeError: |
190 return s.upper() # we don't know how to fold this except in ASCII |
190 return s.upper() # we don't know how to fold this except in ASCII |
191 except LookupError, k: |
191 except LookupError, k: |
192 raise error.Abort(k, hint="please check your locale settings") |
192 raise error.Abort(k, hint="please check your locale settings") |
|
193 |
|
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)

    s is a byte string (or a localstr). The return value is always a
    byte string: the cached UTF-8 form for a localstr, s itself when it
    already decodes as UTF-8 (including the empty string), and otherwise
    a copy of s in which every byte that does not belong to a valid
    UTF-8 character is replaced by the UTF-8 encoding of uDC00 + byte.
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        # only the success of the decode matters, not the truth value of
        # its result: an empty string decodes to a false value but is
        # still valid and must be returned unchanged
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # Surrogate-encode every byte that is not part of a valid character.
    # Each character is validated on its own rather than by comparing s
    # with the output of a lossy decode, so an invalid byte that happens
    # to equal the following valid byte cannot be mistaken for it.
    out = []
    pos = 0
    end = len(s)
    while pos < end:
        c = s[pos]
        width = 1
        if c >= "\x80":
            # A multi-byte UTF-8 character is 2 to 4 bytes long and no
            # proper prefix of one decodes on its own, so the first
            # window that decodes is exactly the character at pos.
            for width in (2, 3, 4):
                c = s[pos:pos + width]
                try:
                    c.decode('utf-8')
                    break
                except UnicodeDecodeError:
                    pass
            else:
                # no valid character starts here: escape this one byte
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                width = 1
        out.append(c)
        pos += width
    return "".join(out)
|
242 |
|
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Every uDCxx escape found in s is turned back into the single byte
    it stands for; all other characters are kept as UTF-8. This is a
    round-trip process for strings like filenames, but metadata that
    was passed through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path: an encoded uDxxx character always starts with byte 0xED,
    # so a string without that byte holds no escapes
    if "\xed" not in s:
        return s

    # NOTE(review): characters are examined one at a time; on narrow
    # Unicode builds non-BMP characters would presumably show up here as
    # surrogate pairs - confirm that case is handled as intended.
    pieces = []
    for char in s.decode("utf-8"):
        code = ord(char)
        if code & 0xff00 == 0xdc00:
            # escaped byte: the low 8 bits carry the original value
            pieces.append(chr(code & 0xff))
        else:
            pieces.append(char.encode("utf-8"))
    return "".join(pieces)