unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e "
    "206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
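
# Illustration (not from the module itself): the assert above holds because
# every codepoint listed in _ignore encodes to a UTF-8 sequence whose lead
# byte is 0xe2 or 0xef, which is also what lets hfsignoreclean() below
# short-circuit on a cheap substring test.
#
#   >>> u'\u200c'.encode('utf-8')   # ZERO WIDTH NON-JOINER
#   '\xe2\x80\x8c'
#   >>> u'\ufeff'.encode('utf-8')   # ZERO WIDTH NO-BREAK SPACE (BOM)
#   '\xef\xbb\xbf'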


def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    """
    if "\xe2" in s or "\xef" in s:
        for c in _ignore:
            s = s.replace(c, '')
    return s


# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

_encodingrewrites = {
    '646': 'ascii',
    'ANSI_X3.4-1968': 'ascii',
}
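
# For illustration: the table above maps codeset names that some platforms
# report for the C/POSIX locale onto the canonical spelling 'ascii',
# presumably so later comparisons against the encoding name behave the same
# everywhere.  Names that are not listed pass through unchanged:
#
#   >>> _encodingrewrites.get('ANSI_X3.4-1968', 'ANSI_X3.4-1968')
#   'ascii'
#   >>> _encodingrewrites.get('UTF-8', 'UTF-8')
#   'UTF-8'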
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'


class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''

    def __new__(cls, u, l):
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
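
# For illustration (assuming a latin-1 locale): a localstr carries both the
# local-encoding bytes it behaves as and the original UTF-8 bytes, so a value
# converted by tolocal() can later be restored exactly.  Hypothetical session:
#
#   >>> s = localstr(u'caf\xe9'.encode('utf-8'), u'caf\xe9'.encode('latin-1'))
#   >>> s                  # behaves as the local-encoding bytes
#   'caf\xe9'
#   >>> s._utf8            # the lossless UTF-8 form rides along
#   'caf\xc3\xa9'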


class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """


def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(u.encode('UTF-8'), r)
        except UnicodeDecodeError:
            u = s.decode("utf-8", "replace")  # last ditch
            # can't round-trip
            return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
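
# For illustration (assuming a latin-1 locale): tolocal() returns a
# safelocalstr when the local encoding can hold the UTF-8 input losslessly,
# and otherwise a localstr that caches the original UTF-8 bytes next to the
# lossy local form, so the original can still be recovered later.
#
#   >>> tolocal(u'caf\xe9'.encode('utf-8'))    # representable: safelocalstr
#   'caf\xe9'
#   >>> tolocal(u'\u3042'.encode('utf-8'))     # not representable: localstr
#   '?'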


def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")


def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    return tolocal(u.encode('utf-8'))


def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    return fromlocal(s).decode('utf-8')


def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))

    return unifunc
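
# For illustration (hypothetical class, not part of this module): a class
# that already knows how to render itself as bytes can reuse that logic for
# its text representation on Python 3:
#
#   class example(object):
#       def __bytes__(self):
#           return b'an example'
#       __str__ = unimethod(__bytes__)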


# converter functions between native str and byte string. use these if the
# character encoding is not known (e.g. exception messages) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF"
)


def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    return ucolwidth(s.decode(_sysstr(encoding), r'replace'))


def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
    return len(d)
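
# For illustration: _wide is "WF" by default, so only characters whose
# east_asian_width() is Wide ('W') or Fullwidth ('F') count as two columns;
# setting HGENCODINGAMBIGUOUS=wide adds Ambiguous ('A') characters as well.
#
#   >>> ucolwidth(u'ab')          # two narrow characters
#   2
#   >>> ucolwidth(u'\u3042b')     # HIRAGANA LETTER A is 'W': 2 + 1
#   3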


def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    for x in pycompat.xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
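
# For illustration: with pure ASCII input every byte is one column, so the
# scan succeeds on the first candidate slice it tries.
#
#   >>> getcols(b'abcdef', 1, 3)
#   'bcd'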


def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.

    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for the ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for the ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # not enough room for multi-column characters
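
# Worked example (illustrative; ASCII input, so each character is one
# column): one column is reserved for the ellipsis, and the loop keeps
# shortening the string until what remains fits in the rest.
#
#   >>> trim(b'abcdef', 4, ellipsis=b'+')
#   'abc+'
#   >>> trim(b'abcdef', 4, ellipsis=b'+', leftside=True)
#   '+def'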


def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")


def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)


def upperfallback(s):
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")


class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
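
# For illustration (hypothetical platform module, not part of this file): a
# platform whose normcase() lowercases ASCII paths would advertise that as
#
#   normcase = lambda path: path.lower()
#   normcasespec = normcasespecs.lower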


def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)


# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]


def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c
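
# For illustration: the high nibble of the lead byte indexes _utf8len above.
# Nibbles 0x0-0x7 (ASCII) give 0, 0xc-0xd give 2 bytes, 0xe gives 3 and 0xf
# gives 4; stray continuation bytes (0x8-0xb) give 1, so the validating
# decode in getutf8char() raises for them.
#
#   >>> getutf8char(b'caf\xc3\xa9', 3)   # 0xc3 >> 4 == 0xc, a 2-byte sequence
#   '\xc3\xa9'
#   >>> getutf8char(b'caf\xc3\xa9', 0)   # plain ASCII
#   'c'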


def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
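
# For illustration: bytes that do not form valid UTF-8 are smuggled into the
# result one byte at a time as U+DCxx codepoints, which is what lets
# fromutf8b() below undo the transformation exactly.
#
#   >>> toutf8b(b'caf\xe9')               # the lone 0xe9 is not valid UTF-8
#   'caf\xed\xb3\xa9'
#   >>> fromutf8b(toutf8b(b'caf\xe9')) == b'caf\xe9'
#   True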


def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This