Mercurial > public > mercurial-scm > hg
comparison mercurial/encoding.py @ 16133:84c58da3a1f8
encoding: introduce utf8-b helpers
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Mon, 20 Feb 2012 16:42:45 -0600 |
parents | afdf4f5bac61 |
children | 5d75eb8568d1 |
comparison
equal
deleted
inserted
replaced
16132:41fc1e078d68 | 16133:84c58da3a1f8 |
---|---|
188 return uu.encode(encoding) | 188 return uu.encode(encoding) |
189 except UnicodeError: | 189 except UnicodeError: |
190 return s.upper() # we don't know how to fold this except in ASCII | 190 return s.upper() # we don't know how to fold this except in ASCII |
191 except LookupError, k: | 191 except LookupError, k: |
192 raise error.Abort(k, hint="please check your locale settings") | 192 raise error.Abort(k, hint="please check your locale settings") |
193 | |
def _utf8charlen(s, pos):
    '''return the byte length of the valid UTF-8 character starting at
    s[pos], or 0 if no valid character starts there'''
    for width in (1, 2, 3, 4):
        chunk = s[pos:pos + width]
        if len(chunk) < width:
            # ran off the end of the string: no longer sequence can fit
            break
        try:
            chunk.decode('utf-8')
        except UnicodeDecodeError:
            continue
        # widths are tried shortest first, so the first chunk that
        # decodes is exactly one character
        return width
    return 0

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)

    The empty string is valid UTF-8 and is returned as is:

    >>> toutf8b("")
    ''

    An invalid byte is escaped even when it equals the lead byte of the
    valid character that follows it:

    >>> toutf8b("\\xc3\\xc3\\xa9")
    '\\xed\\xb3\\x83\\xc3\\xa9'
    '''

    if isinstance(s, localstr):
        return s._utf8

    # Only the success of the decode matters. Testing the decoded value
    # for truth would send the (valid, but false) empty string down the
    # slow path.
    try:
        s.decode('utf-8')
    except UnicodeDecodeError:
        pass
    else:
        return s

    # Slow path: copy valid UTF-8 characters through untouched and
    # surrogate-encode every byte that does not start one. Walking the
    # string character by character keeps an invalid byte from being
    # confused with an identical byte belonging to a later valid
    # character.
    out = []
    pos = 0
    end = len(s)
    while pos < end:
        width = _utf8charlen(s, pos)
        if width:
            out.append(s[pos:pos + width])
            pos += width
        else:
            out.append(unichr(0xdc00 + ord(s[pos])).encode('utf-8'))
            pos += 1
    return "".join(out)
242 | |
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Characters in the range U+DC00-U+DCFF are turned back into the
    single bytes 0x00-0xFF they stand for, which gives back the original
    binary string. This is a round-trip process for strings like
    filenames, but metadata that was passed through tolocal will remain
    in UTF-8.

    Raises UnicodeDecodeError if s contains a 0xED byte but cannot be
    decoded as UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s; every U+Dxxx character
    # starts with an "\xed" byte in UTF-8, so without one there is
    # nothing to unescape
    if "\xed" not in s:
        return s

    # NOTE(review): on narrow (UTF-16) Python 2 builds a non-BMP
    # character decodes to a surrogate pair whose low half can land in
    # U+DC00-U+DCFF and would then be unescaped as a byte below --
    # confirm whether such input can reach this function.
    u = s.decode("utf-8")
    pieces = []
    for c in u:
        if ord(c) & 0xff00 == 0xdc00:
            # escaped byte: the low 8 bits are the original byte value
            pieces.append(chr(ord(c) & 0xff))
        else:
            pieces.append(c.encode("utf-8"))
    return "".join(pieces)